diff options
| author | Mike Vink <mike1994vink@gmail.com> | 2020-09-16 02:35:13 +0200 |
|---|---|---|
| committer | Mike Vink <mike1994vink@gmail.com> | 2020-09-16 02:35:13 +0200 |
| commit | 9277a0790d4925f1fd50c789cc83a27096f543e7 (patch) | |
| tree | 7b57dd5eababd794e6387fdb72ed7878fddcdfc5 | |
| parent | 220afa88dc69c1eb59a1ba3c75a6936f40bc156a (diff) | |
Major update: obj oriented tree
| -rw-r--r-- | assignment1.py | 306 | ||||
| -rw-r--r-- | gettingStarted.py | 100 | ||||
| -rw-r--r-- | message | 10 | ||||
| -rw-r--r-- | recursive_node.py | 58 |
4 files changed, 314 insertions, 160 deletions
diff --git a/assignment1.py b/assignment1.py index 4134270..a442f55 100644 --- a/assignment1.py +++ b/assignment1.py @@ -4,8 +4,7 @@ credit_data = np.genfromtxt('./credit_score.txt', delimiter=',', skip_header=True) -# "credit_data" is now a 2d NumPy array. Each rows represent a record and the -# columns represent the data attributes. +# age,married,house,income,gender,class # [(22, 0, 0, 28, 1, 0) # (46, 0, 1, 32, 0, 0) # (24, 1, 1, 24, 1, 0) @@ -18,137 +17,57 @@ credit_data = np.genfromtxt('./credit_score.txt', # (50, 1, 1, 28, 0, 1)] -class Tree(): +class Node(): """ - @todo: docstring for Tree + @todo: docstring for Node """ - def __init__(self, d_structure): + def __init__(self, value=None): """@todo: Docstring for init method. - /d_structure/ @todo + /value=None/ @todo """ - self.d_structure = d_structure + self.value = value - def __repr__(self): - return str(self.d_structure) - - def drop_left(self) -> dict: + def add_split(self, left, right): """ - @todo: Docstring for drop_left + @todo: Docstring for add_split """ - # Need to change - print("Dropping left:\n", self.prediction["left"]) - return self.prediction["left"] - - def drop_right(self) -> dict: - """ - @todo: Docstring for drop_right - """ - # Need to change - print("Dropping right:\n", self.prediction["right"]) - return self.prediction["right"] + self.left = left + self.right = right -tree_grow_defaults = { - 'x': credit_data[:, :5], - 'y': credit_data[:, 5], - 'n_min': 0, - 'min_leaf': 0, - 'n_feat': 5 -} +class Leaf: + def __init__(self, predicted_class: int): + self.predicted_class = predicted_class -def tree_grow(x=None, - y=None, - n_min=None, - min_leaf=None, - n_feat=None, - **defaults) -> Tree: - """ - @todo: Docstring for tree_grow - """ - print( - "All attribute columns in credit data (to be exhaustively searched per split):\n", - x, "\n") - print("Class label column in credit data:\n", y, "\n") - print( - f'Current node will be leaf node if (( (number of data "tuples" in child node) < 
{n_min=} )) \n' - ) - print( - f'Split will not happen if (( (number of data "tuples" potential split) < {min_leaf=} ))\n' - ) - print( - f"Number of features/attributes to be randomly drawn from {x=} to be considered for a split, should only be lower than {len(x[0,:])=} for random forest growing, {n_feat=}" - ) - d_structure = {"root": ((0,30), {"left": (None, 1), "right": (None, 0)})} # dummy data - return Tree(d_structure) - - -# Calling the function, unpacking default as argument -# print(tree_grow(**tree_grow_defaults)) - -tree_pred_defaults = { - 'x': credit_data[:, :5], - 'tr': tree_grow(**tree_grow_defaults) -} - - -def tree_pred(x=None, tr=None, **defaults) -> np.array: +class Tree(): """ - @todo: Docstring for tree_pred + @todo: docstring for Tree """ - print("\n\n#########Tree_pred output start:\n") - print(f"Drop a row in {x=} down the tree {tr.__repr__()}") - # x should be a set of rows that represent cases to be dropped in the tree, - # we iterate over the numpy rows - y = np.array([]) - for case in x: - # Assumes that tr is a data structure with the following form - # - # {"root":(split, {"left":(...), "right":(...)}) - # - # Where the (...) is recursion of the pattern tuple(tuple, dict) - - # Copy the data structure of the tree, not very efficient? - tr.prediction = tr.d_structure - - split, tr.prediction = tr.prediction["root"] - # Unpack the split info we need to do the first split - # c is the split value for the drop, and col is the attribute/feature - # we are dropping on - col, c = split - while isinstance(tr.prediction, dict): - # The drop methods return a tuple by giving the current - # d_structure a key that is either "left" or right": - # def drop_left(self): - # ... - # return tr.prediction["left"] - # - # Don't even need to check for leaf node, since the while loop - # should do that. 
- col, c = split - if case[col] > c: - split, tr.prediction = tr.drop_right() - elif case[col] <= c: - split, tr.prediction = tr.drop_left() - # Assumes that the leaf nodes in tr.d_structure is the leaf node - # that is just the majority(!) class label, which is just the integer 1 or 0. - # - # This is also the reason we break out of the while loop, when we - # arrive to a leaf node with the drop methods, the tr.d_structure - # type is no longer a dict, but an int. - # print(tr.prediction) - y = np.append(y, tr.prediction) - print("Predictions from the tree:\n", y) - return y + def __init__(self, root_node_obj): + """@todo: Docstring for init method. + /root_node_obj/ @todo -tree_pred(**tree_pred_defaults) + """ + self.tree = root_node_obj -# -# -# Put all helper functions below this comment! + def __repr__(self): + nodelist = [self.tree] + tree_str = '' + while nodelist: + current_node = nodelist.pop() + # print(current_node.value) + try: + childs = [current_node.right, current_node.left] + nodelist += childs + except AttributeError: + pass + col, c = current_node.value + tree_str += f"{col=}, {c=}" + return tree_str def impurity(array) -> int: @@ -180,11 +99,6 @@ def impurity(array) -> int: return gini_index -# array=np.array([1,0,1,1,1,0,0,1,1,0,1]) -# print(impurity(array)) -# Should give 0.23.... 
- - def bestsplit(x, y) -> int: """ x = vector of num values @@ -203,43 +117,115 @@ def bestsplit(x, y) -> int: >>> bestsplit(credit_data[:,3],credit_data[:,5]) 36 """ - # Sort all unique attribute values - num_attr_sorted = np.sort(np.unique(x)) - - # Use python vector addition to add all corresponding consecutive column - # elements and take their average - consec_avg_attr_splitpoints = (num_attr_sorted[0:7] + - num_attr_sorted[1:8]) / 2 - - # Convert array to list - split_points = list(consec_avg_attr_splitpoints) - - # Prepare the constants for the delta impurity equation - impurity_parent_node = impurity(y) - n_obs_parent_node = len(y) - - # Init return list - split_points_delta_impurities = [] - while split_points: - # compute child nodes class vectors for the split value - split_point = split_points.pop() - child_node = {"l": y[x > split_point], "r": y[x <= split_point]} - - # Take the weighted average of child node impurities - w_avg_child_impurities = ( - impurity(child_node["l"]) * len(child_node["l"]) + impurity( - child_node["r"]) * len(child_node["r"])) / n_obs_parent_node - - # Add the used split point and delta impurity to the return list - split_points_delta_impurities += [ - (split_point, impurity_parent_node - w_avg_child_impurities) - ] - - # Take the maximum of the list, and unpack - best_split, best_delta_impurity = max(split_points_delta_impurities, - key=lambda x: x[1]) - return best_split - - -# print(bestsplit(credit_data[:,3],credit_data[:,5])) -# Should give 36 + x_sorted = np.sort(np.unique(x)) + if len(x_sorted) <= 2: + # Turns binary attributes from [0,1] to [0, 0.5], allows us to use + # normal slices + split_points = x_sorted / 2 + else: + split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2 + + best_dict = None + for split in split_points: + # Returns boolean matrix which can be used for slicing x + x_slices = { + "left": np.index_exp[x > split], + "right": np.index_exp[x <= split] + } + + # delta_i formule + delta_i = 
impurity(y) - (len(y[x_slices["left"]]) * impurity( + y[x_slices["left"]]) + len(y[x_slices["right"]]) * + impurity(y[x_slices["right"]])) / len(y) + + print(f"{split=}, {delta_i=}") + if best_dict is not None: + if delta_i > best_dict["delta_i"]: + best_dict = {"x_slices": x_slices, "split": split, "delta_i":delta_i} + else: + best_dict = {"slices": x_slices, "split": split, "delta_i":delta_i} + return best_dict + +# +# +# Put all helper functions above this comment! + +def tree_grow(x=None, + y=None, + n_min=None, + min_leaf=None, + n_feat=None, + **defaults) -> Tree: + """ + @todo: Docstring for tree_grow + """ + all_rows = np.index_exp[:] + # Initiate the nodelist with tuples of slice and class labels + nodelist = [Node(value={"slices": all_rows})] + tree = Tree(nodelist[0]) + while nodelist: + current_node = nodelist.pop() + slices = current_node.value["slices"] + print(f"Exhaustive split search says, new node will check these rows for potential spliterinos:\n{x[slices]}") + + if impurity(y[slices]) > 0: + + # bestsplit(col, node_labels) -> + # {"x_slices": (boolean matrix, boolean matrix), "split": numpyfloat, "best_delta_i": numpyfloat} + + # slices used for knowing where to split on the next node. 
+ # best_split saved in current_node.value + # best_delta_i used to find best split among x_columns + best_dict = None + for x_col in x[slices].transpose(): + print("\nExhaustive split search says; \"Entering new column\":") + col_split_dict = bestsplit(x_col, y[slices]) + + if best_dict is not None: + if col_split_dict["delta_i"] > best_dict["delta_i"]: + best_dict = col_split_dict + else: + best_dict = col_split_dict + print("\nThe best split for current node:", best_dict) + + return tree + + + +def tree_pred(x=None, tr=None, **defaults) -> np.array: + """ + @todo: Docstring for tree_pred + """ + y = np.array([]) + return y + + +if __name__ == '__main__': + #### IMPURITY TEST + # array=np.array([1,0,1,1,1,0,0,1,1,0,1]) + # print(impurity(array)) + # Should give 0.23.... + + #### BESTSPLIT TEST + # print(bestsplit(credit_data[:, 3], credit_data[:, 5])) + # Should give 36 + + #### TREE_GROW TEST + tree_grow_defaults = { + 'x': credit_data[:, :5], + 'y': credit_data[:, 5], + 'n_min': 2, + 'min_leaf': 1, + 'n_feat': 5 + } + + # Calling the tree grow, unpacking default as argument + tree_grow(**tree_grow_defaults) + + # tree_pred_defaults = { + # 'x': credit_data[:, :5], + # 'tr': tree_grow(**tree_grow_defaults) + # } + + # tree_pred(**tree_pred_defaults) + diff --git a/gettingStarted.py b/gettingStarted.py new file mode 100644 index 0000000..3a8d907 --- /dev/null +++ b/gettingStarted.py @@ -0,0 +1,100 @@ +import numpy as np
+import random
+import math
+from copy import deepcopy
+
+credit_data = np.genfromtxt('/Users/mikevink/Documents/python/2020_data_mining_assignments/credit_score.txt', delimiter=',', skip_header=True)
+
+#print(credit_data)
+#print(credit_data[0])
+#print(credit_data[:,3])
+#print(credit_data[4,0])
+#print(np.sort(np.unique(credit_data[:,3]))) #Give the distinct values of income, sorted from low to high
+#print(np.sum(credit_data[:,5]))
+#print(credit_data.sum(axis=0)) #Add the entries of each column of credit_data
+#print(credit_data.sum(axis=1)) #Add the entries of each row
+#print(credit_data[credit_data[:,0] > 27]) # Select all rows where the first column is bigger than 27
+#
+#x = np.array([2, 5, 10])
+#print(x)
+#print(np.arange(0, 10))
+#
+#print(np.arange(0, 10)[credit_data[:,0] > 27]) #Select the *row numbers* of the rows where the first column of credit_data is bigger than 27
+#
+#index = np.random.choice(np.arange(0, 10), size=5, replace=False) #Draw a random sample of size 5 from the numbers 1 through 10 (without replacement)
+#print(index)
+#train = credit_data[index,]
+#print(train)
+#test = np.delete(credit_data, index, axis=0) #Select all rows with row number not in "index"
+#print(test)
+#
+#print(random.choice(train))
+
+
+### Practice exercise 1 ###
def impurity(vector):
    """Return the impurity p0 * p1 of a binary class-label vector.

    vector -- sequence (or 1-D numpy array) of 0/1 class labels.

    For labels [1,0,1,1,1,0,0,1,1,0,1] this gives 28/121 ~= 0.2314,
    matching the practice-exercise example.

    Fix: return 0.0 for an empty vector (a node with no observations is
    pure); previously this raised ZeroDivisionError.
    """
    n = len(vector)
    if n == 0:
        return 0.0
    ones = sum(vector)   # count of class-1 labels
    zeros = n - ones     # count of class-0 labels
    return (zeros / n) * (ones / n)
+
+array=np.array([1,0,1,1,1,0,0,1,1,0,1])
+print(impurity(array))
+
+
+### Practice exercise 2 ###
def bestsplit(x, y):
    """Return the split value on numeric attribute x that best reduces
    the impurity of class-label vector y.

    Candidate splits are the midpoints between consecutive distinct
    values of x.  A split's quality is the *size-weighted* average of
    the two child impurities, per the standard impurity-reduction
    criterion.  (The previous version summed the raw child impurities,
    which over-penalises unbalanced splits — flagged in the repo's
    `message` notes.)

    x -- 1-D numpy array of numeric attribute values
    y -- 1-D numpy array of 0/1 class labels, same length as x

    Returns None when x has fewer than two distinct values (no split is
    possible); previously `best_split` could be referenced unbound.
    """
    x_sorted = np.sort(np.unique(x))
    split_points = (x_sorted[:-1] + x_sorted[1:]) / 2

    best_split = None
    best_quality = math.inf
    n = len(y)
    for split in split_points:
        left = y[x <= split]
        right = y[x > split]
        # Weighted-average impurity of the children: lower is better.
        quality = (len(left) * impurity(left)
                   + len(right) * impurity(right)) / n
        if quality < best_quality:
            best_split = split
            best_quality = quality

    return best_split
+
+print(bestsplit(credit_data[:,3], credit_data[:,5]))
+
+
+
class Node:
    """Internal decision node: routes values <= split_value to the left
    child and values > split_value to the right child."""

    def __init__(self):
        # Bug fix: this was spelled `_init_` (single underscores), so
        # Python never called it and instances started with no
        # attributes at all; callers only worked because they assigned
        # all three attributes manually after construction.
        self.left = None         # subtree for values <= split_value
        self.right = None        # subtree for values > split_value
        self.split_value = None  # numeric threshold, set by tree_grow
+
class Leaf:
    """Terminal node of the tree: always predicts one class label."""

    def __init__(self, predicted_class: int):
        """Remember the class label (0 or 1) this leaf predicts."""
        self.predicted_class = predicted_class
+
+
def tree_grow(x, y):
    """Grow a one-split decision stump.

    x -- numeric attribute values; y -- 0/1 class labels.

    The root splits at bestsplit(x, y); the left child always predicts
    class 0 and the right child class 1 (toy placeholder behaviour).
    """
    stump = Node()
    stump.split_value = bestsplit(x, y)
    stump.left, stump.right = Leaf(0), Leaf(1)
    return stump
+
def tree_pred(x, tr):
    """Drop every value in x down tree tr; return the list of predicted
    class labels, in the same order as x."""
    return [single_value_pred(value, tr) for value in x]
+
def single_value_pred(value, current_tree):
    """Predict the class for a single value by walking the tree.

    Iterative equivalent of the original recursive descent: at each
    internal node go left when value <= split_value, right otherwise,
    until a Leaf is reached.
    """
    node = current_tree
    while not isinstance(node, Leaf):
        node = node.left if value <= node.split_value else node.right
    return node.predicted_class
+
+tree = tree_grow(credit_data[:,3], credit_data[:,5])
+print(tree_pred([32, 38, 3, 40], tree))
+
+
+
@@ -0,0 +1,10 @@ +heb twee punten: + +1. Impurity_after_split (line 52) is som over child impurities, in plaats +daarvan moet het gewogen gemiddelde tot de len(new_y) / len(old_y) zijn denk ik +(misschien? want het output hetzelfde als het voorbeeld). + +2. Ik denk dat je op deze manier steeds een sub-tree/ een node-recursie aan +single_value_pred geeft. Wat is de beste manier om een node uit de tree aan te +spreken? + diff --git a/recursive_node.py b/recursive_node.py new file mode 100644 index 0000000..22bc75d --- /dev/null +++ b/recursive_node.py @@ -0,0 +1,58 @@ +import numpy as np + + +class Node(): + """ + @todo: docstring for Node + """ + def __init__(self, value=None): + """@todo: Docstring for init method. + + /value=None/ @todo + + """ + self.value = value + + def add_split(self, left, right): + """ + @todo: Docstring for add_split + """ + self.left = left + self.right = right + + +class Tree(): + """ + @todo: docstring for Tree + """ + def __init__(self, root_node_obj): + """@todo: Docstring for init method. + + /root_node_obj/ @todo + + """ + self.tree = root_node_obj + + def __repr__(self): + nodelist = [self.tree] + tree_str = '' + while nodelist: + current_node = nodelist.pop() + # print(current_node.value) + try: + childs = [current_node.right, current_node.left] + nodelist += childs + except AttributeError: + pass + tree_str += current_node.value + return tree_str + + +n1 = Node(value="root\n") +n2 = Node(value="left child of n1, ") +n3 = Node(value="right child of n1") + +n1.add_split(n2, n3) + +my_tree = Tree(n1) +print(my_tree) |
