diff options
| author | Mike Vink <mike1994vink@gmail.com> | 2020-09-16 02:35:13 +0200 |
|---|---|---|
| committer | Mike Vink <mike1994vink@gmail.com> | 2020-09-16 02:35:13 +0200 |
| commit | 9277a0790d4925f1fd50c789cc83a27096f543e7 (patch) | |
| tree | 7b57dd5eababd794e6387fdb72ed7878fddcdfc5 /gettingStarted.py | |
| parent | 220afa88dc69c1eb59a1ba3c75a6936f40bc156a (diff) | |
Major update: obj oriented tree
Diffstat (limited to 'gettingStarted.py')
| -rw-r--r-- | gettingStarted.py | 100 |
1 files changed, 100 insertions, 0 deletions
diff --git a/gettingStarted.py b/gettingStarted.py new file mode 100644 index 0000000..3a8d907 --- /dev/null +++ b/gettingStarted.py @@ -0,0 +1,100 @@ +import numpy as np
+import random
+import math
+from copy import deepcopy
+
+# Load the comma-separated credit data into a NumPy array; skip_header=True
+# (i.e. 1) drops the first line of the file before parsing.
+# NOTE(review): the path is absolute and specific to one user's machine, so the
+# script fails anywhere else -- consider a path relative to this file.
+credit_data = np.genfromtxt('/Users/mikevink/Documents/python/2020_data_mining_assignments/credit_score.txt', delimiter=',', skip_header=True)
+
+# NumPy warm-up snippets, kept commented out for reference.
+#print(credit_data)
+#print(credit_data[0])
+#print(credit_data[:,3])
+#print(credit_data[4,0])
+#print(np.sort(np.unique(credit_data[:,3]))) #Give the distinct values of income, sorted from low to high
+#print(np.sum(credit_data[:,5]))
+#print(credit_data.sum(axis=0)) #Add the entries of each column of credit_data
+#print(credit_data.sum(axis=1)) #Add the entries of each row
+#print(credit_data[credit_data[:,0] > 27]) # Select all rows where the first column is bigger than 27
+#
+#x = np.array([2, 5, 10])
+#print(x)
+#print(np.arange(0, 10))
+#
+#print(np.arange(0, 10)[credit_data[:,0] > 27]) #Select the *row numbers* of the rows where the first column of credit_data is bigger than 27
+#
+#index = np.random.choice(np.arange(0, 10), size=5, replace=False) #Draw a random sample of size 5 from the numbers 0 through 9 (without replacement)
+#print(index)
+#train = credit_data[index,]
+#print(train)
+#test = np.delete(credit_data, index, axis=0) #Select all rows with row number not in "index"
+#print(test)
+#
+#print(random.choice(train))
+
+
+### Practice exercise 1 ###
+def impurity(vector):  # vector = list of 0s and 1s
+    """Return the impurity p0 * p1 of a vector of binary class labels.
+
+    p0 and p1 are the fractions of 0-labels and 1-labels in ``vector``. Their
+    product is half the usual two-class Gini index ``1 - p0**2 - p1**2``.
+
+    Args:
+        vector: Sequence (list or NumPy array) of class labels, each 0 or 1.
+
+    Returns:
+        0.0 for a pure vector, rising to 0.25 for an even mix. An empty
+        vector has nothing to mix and yields 0.0 instead of dividing by zero.
+    """
+    num_of_class_labels = len(vector)
+    if num_of_class_labels == 0:
+        # Guard: the fractions below are undefined for an empty vector.
+        return 0.0
+    # Labels are 0/1, so their sum is the number of 1-labels.
+    num_of_class_1 = sum(vector)
+    num_of_class_0 = num_of_class_labels - num_of_class_1
+    return (num_of_class_0 / num_of_class_labels) * (num_of_class_1 / num_of_class_labels)
+
+# Sanity check: 7 ones and 4 zeros out of 11 labels -> (4/11) * (7/11) ~= 0.2314.
+array = np.array([1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1])
+print(impurity(array))
+
+
+### Practice exercise 2 ###
+def bestsplit(x, y):  # x = numeric values; y = class labels
+    """Find the threshold on a numeric attribute that gives the purest split.
+
+    Candidate thresholds are the midpoints between consecutive distinct
+    values of ``x``. Each candidate divides the rows into ``x <= split`` and
+    ``x > split`` and is scored by the impurity of the two label groups, each
+    weighted by the fraction of rows it receives. The candidate with the
+    lowest score is returned; on ties the smallest threshold wins.
+
+    Args:
+        x: 1-D array-like of numeric attribute values.
+        y: Array-like of 0/1 class labels, aligned element-wise with ``x``.
+
+    Returns:
+        The best threshold, or ``None`` when ``x`` has fewer than two
+        distinct values and therefore cannot be split.
+    """
+    x = np.asarray(x)
+    y = np.asarray(y)
+
+    # np.unique already returns its result sorted, so no extra sort is needed.
+    x_sorted = np.unique(x)
+    # Midpoints between neighbours guarantee both sides of a split are non-empty.
+    split_points = (x_sorted[:-1] + x_sorted[1:]) / 2
+
+    best_split = None  # stays None when there are no candidate thresholds
+    best_impurity_after_split = math.inf
+    for split in split_points:
+        y_left = y[x <= split]
+        y_right = y[x > split]
+        # Weight each child by its share of the rows; an unweighted sum would
+        # let a tiny pure child outscore a much better balanced split.
+        impurity_after_split = (
+            len(y_left) * impurity(y_left) + len(y_right) * impurity(y_right)
+        ) / len(y)
+        if impurity_after_split < best_impurity_after_split:
+            best_split = split
+            best_impurity_after_split = impurity_after_split
+
+    return best_split
+
+# Demo: best threshold on column 3 (income) using column 5 as the class labels.
+print(bestsplit(credit_data[:,3], credit_data[:,5]))
+
+
+
+class Node:
+    """Internal node of a binary decision tree on one numeric attribute.
+
+    Attributes are created empty and filled in by the code that grows the
+    tree:
+        left: Subtree followed when a value is <= ``split_value``.
+        right: Subtree followed when a value is > ``split_value``.
+        split_value: Threshold the value is compared against.
+    """
+
+    # The constructor must be spelled __init__ (double underscores); the
+    # original single-underscore name was never called by Node().
+    def __init__(self):
+        self.left = None
+        self.right = None
+        self.split_value = None
+
+class Leaf:
+    """Terminal node of the decision tree; it carries only a class label."""
+
+    def __init__(self, predicted_class: int):
+        # Label returned for every value that ends up in this leaf.
+        self.predicted_class = predicted_class
+
+
+def tree_grow(x, y):  # x = numeric values; y = class labels
+    """Grow a one-split decision tree on a single numeric attribute.
+
+    The root splits at the threshold chosen by ``bestsplit``. Each of the two
+    leaves predicts the majority class of the training labels that fall on
+    its side, rather than a fixed 0 on the left and 1 on the right.
+
+    Args:
+        x: 1-D array-like of numeric attribute values.
+        y: Array-like of 0/1 class labels, aligned element-wise with ``x``.
+
+    Returns:
+        A ``Node`` whose ``left`` and ``right`` are ``Leaf`` objects. When
+        ``x`` has fewer than two distinct values no split exists, and a
+        single ``Leaf`` holding the overall majority class is returned.
+    """
+    x = np.asarray(x)
+    y = np.asarray(y)
+
+    def majority_class(labels):
+        # 1 only when strictly more than half of the labels are 1; ties
+        # (and an empty group) fall back to 0.
+        return int(2 * np.sum(labels) > len(labels))
+
+    # Without two distinct values there is no threshold to split on.
+    if len(np.unique(x)) < 2:
+        return Leaf(majority_class(y))
+
+    root = Node()
+    root.split_value = bestsplit(x, y)
+    root.left = Leaf(majority_class(y[x <= root.split_value]))
+    root.right = Leaf(majority_class(y[x > root.split_value]))
+    return root
+
+def tree_pred(x, tr):
+    """Predict a class label for every value in ``x`` using the tree ``tr``.
+
+    Args:
+        x: Iterable of attribute values to classify.
+        tr: Root of the decision tree (a ``Node`` or a ``Leaf``).
+
+    Returns:
+        A list with one predicted class per value, in input order.
+    """
+    return [single_value_pred(sample, tr) for sample in x]
+
+def single_value_pred(value, current_tree):
+    """Classify one value by walking the tree from ``current_tree`` to a leaf.
+
+    Args:
+        value: Attribute value to classify.
+        current_tree: ``Node`` or ``Leaf`` at which the descent starts.
+
+    Returns:
+        The ``predicted_class`` of the leaf that the value reaches.
+    """
+    position = current_tree
+    # Go left when value <= threshold, right otherwise, until a leaf is hit.
+    while not isinstance(position, Leaf):
+        if value <= position.split_value:
+            position = position.left
+        else:
+            position = position.right
+    return position.predicted_class
+
+# Demo: grow a tree on column 3 (income) against the labels in column 5, then
+# classify four sample values with it.
+tree = tree_grow(credit_data[:,3], credit_data[:,5])
+print(tree_pred([32, 38, 3, 40], tree))
+
+
+
|
