diff options
Diffstat (limited to 'assignment1.py')
| -rw-r--r-- | assignment1.py | 306 |
1 files changed, 118 insertions, 188 deletions
diff --git a/assignment1.py b/assignment1.py index 2a670c2..00432ec 100644 --- a/assignment1.py +++ b/assignment1.py @@ -1,4 +1,3 @@ -import time import numpy as np credit_data = np.genfromtxt('./credit_score.txt', @@ -60,23 +59,11 @@ class Node: np.bincount(node_classes.astype(int))) -# class Leaf: -# """ -# Simple class that contains only the majority class in the leaf node. -# """ -# def __init__(self, maj_class): -# """Initialises the majority vote. - -# /maj_class/ @todo - -# """ - - class Tree: """ Tree object that points towards the root node. """ - def __init__(self, root_node_obj): + def __init__(self, root_node_obj, hyper_params): """Initialises only by pointing to a Node object. /root_node_obj/ is a node object that is made before entering the main @@ -84,31 +71,64 @@ class Tree: """ self.tree = root_node_obj - def __repr__(self): - tree_string = '' - node = self.tree - depth = 0 - nodelist = [node] - while nodelist: - node = nodelist.pop() - depth += 1 - if node.col is not None: - left, right = node.left, node.right - nodelist += [left, right] - else: + self.hyper_params = hyper_params + + def predict(self, x): + """ + Makes a list of root nodes, and drops one row of x through the tree per + loop + """ + # Maak een lijst van nodes, wiens indexes overeen komen met de rows in + # x die we willen droppen + nodes = [self.tree] * len(x) + + # De index van de row van x die we in de boom willen droppen + drop = 0 + while not set(nodes).issubset({0, 1}): + # Als de col None is dan is het een leaf node, dus dan is de row + # van x hier beland + if nodes[drop].col is None: + nodes[drop] = nodes[drop].split_value_or_rows + drop += 1 continue - tree_string += '\n' + depth * ' ' - tree_string += (depth + 4) * ' ' + '/' + ' ' * 2 + '\\' - tree_string += '\n' + ' ' * 2 * depth - for direc in left, right: - if not direc.split_value_or_rows%10: - tree_string += ' ' * 4 - else: - tree_string += ' ' * 3 - tree_string += str(int(direc.split_value_or_rows)) - - tree_string = depth * ' ' + str(int(self.tree.split_value_or_rows)) + tree_string - return tree_string + + # Vergelijk de x col (age,married,house,income,gender,class), in de + # gedropte row met het split value van de node. Op basis hiervan + # drop naar links of rechts + if x[drop, nodes[drop].col] > nodes[drop].split_value_or_rows: + nodes[drop] = nodes[drop].left + else: + nodes[drop] = nodes[drop].right + return np.array(nodes) + + # Work in progress tree printer + # + # def __repr__(self): + # tree_string = '' + # node = self.tree + # depth = 0 + # nodelist = [node] + # while nodelist: + # node = nodelist.pop() + # depth += 1 + # if node.col is not None: + # left, right = node.left, node.right + # nodelist += [left, right] + # else: + # continue + # tree_string += '\n' + depth * ' ' + # tree_string += (depth + 4) * ' ' + '/' + ' ' * 2 + '\\' + # tree_string += '\n' + ' ' * 2 * depth + # for direc in left, right: + # if not direc.split_value_or_rows%10: + # tree_string += ' ' * 4 + # else: + # tree_string += ' ' * 3 + # tree_string += str(int(direc.split_value_or_rows)) + + # tree_string = depth * ' ' + str(int(self.tree.split_value_or_rows)) + tree_string + # return tree_string + def impurity(array) -> int: """ @@ -208,14 +228,15 @@ def bestsplit(x, y, min_leaf) -> None: best_list.append((delta_i, mask_left, mask_right, split_value)) # Haal de huidige split_point uit split_points split_points = split_points[:-1] - # Bereken de best split voor deze x col + + # Bereken de best split voor deze x col, als er ten minste 1 bestaat die + # voldoet aan min leaf if best_list: return max(best_list, key=lambda x: x[0]) else: return False - def exhaustive_split_search(rows, classes, min_leaf): """ @todo: Docstring for exhaustive_split_search @@ -224,7 +245,7 @@ def exhaustive_split_search(rows, classes, min_leaf): # We hebben enumerate nodig, want we willen weten op welke col # (age,married,house,income,gender) we een split doen exhaustive_best_list = [] - # print(f"{rows=}, {classes=}") + print(f"Rows:\n{rows},\n Classes:\n{classes}") for i, col in enumerate(rows.transpose()): # calculate the best split for the col that satisfies the min_leaf # constraint @@ -232,12 +253,13 @@ def exhaustive_split_search(rows, classes, min_leaf): # Check if there was a split fullfilling the min leaf rule if col_best_split: # add for which row we calculated the best split - col_best_split += (i,) + col_best_split += (i, ) exhaustive_best_list.append(col_best_split) print("\t\t->returning from exhaustive split search") return exhaustive_best_list -def add_children(node, x, best_split): + +def add_children(node, best_split): """ @todo: Docstring for add_children """ @@ -247,54 +269,49 @@ def add_children(node, x, best_split): current_mask = node.split_value_or_rows # Unpacking the best_split tuple - mask_left, mask_right, node_split_value, node_col = best_split[1:] - # print(f"{mask_left=}, {mask_right=}, {node_split_value=}, {node_col=}") + mask_left, mask_right, node_split_value, node_col = best_split[1:] + # print(f"{mask_left}, {mask_right}, {node_split_value}, {node_col}") # Give the current node the split_value and col it needs for predictions node.split_value_or_rows, node.col = node_split_value, node_col # Updating the row masks to give it to children, keeping numpy dimension consistent - mask_left, mask_right = update_mask(mask_left, current_mask), update_mask(mask_right, current_mask) + mask_left, mask_right = update_mask(mask_left, current_mask), update_mask( + mask_right, current_mask) - # Adding the link between parent and children - node.left, node.right = Node(split_value_or_rows=mask_left), Node(split_value_or_rows=mask_right) + # Adding the pointer between parent and children + node.add_split(Node(split_value_or_rows=mask_left), Node(split_value_or_rows=mask_right)) print("\t\t\t->children added to node and node list\n") return [node.left, node.right] + def update_mask(mask, current_mask): """ - @todo: Docstring for update_mask + Updates the spit bool array from any dimension to an array with length + equal to the total number of rows in dataset x. """ - print("\t\t\t\t->entering update mask to calculate which rows belong to child") - # current_mask = np.array([ True, True, True, True, True, True, False, False, False, True]) - # print(f"Parent mask: {current_mask=}") - # print(f"The result of bestsplit: {mask=}") - # mask_left=np.array([False, True, False, False, False, True, True]) - # mask_right=np.array([ True, False, True, True, True, False, False]) + print( + "\t\t\t\t->entering update mask to update mask that calculates which rows belong to child" + ) + # Copy heb ik gedaan omdat we een slice assignment doen. + # + # Het punt van deze functie is dat tijdens het tree growen het aantal rows + # in een node afneemt. Deze functie update de bool array die gebruikt is om + # te splitten in bestsplit naar een array met als lengte het totale aantal + # rows in de data set x. Op deze manier kunnen we in tree grow de totale x + # splitten. + # + # Een andere optie zou zijn om de rows letterlijk tijdelijk op te slaan in + # nodes... weet niet wat beter is, het kan best veel geheugen in beslag + # nemen voor grote dataset neem ik aan... Op de manier hier zijn er altijd + # maar 1D bool arrays, die dus een lagere dimensionaliteit hebben als x veel columnen/rows heeft. + print(f"Before update:\n{mask}") copy = np.array(current_mask, copy=True) copy[np.where(current_mask)] = mask - # print(f"Child mask:{current_mask=}") - - # print(f"{current_mask=}") - # print(f"{mask_right=}") - # current_mask[np.where(current_mask)] = mask_left - # print(f"{current_mask=}") - # mask_right = np.where(current_mask, False, True) - # print(f"{mask_right=}") - - # current_row_indexs = np.where(current_mask, mask_left, mask_right) - # print(f"{current_mask[current_row_indexs]=}") - # current_mask[current_row_indexs] = mask_left - - - # print(f"{current_row_indexs=}") - # print(f"{mask_left=}") - # print(f"{len(mask_left)=}") - # print(f"{mask_right=}") - # print(f"{len(mask_right)=}") + print(f"After update:\n{copy}") print("\t\t\t\t->updated row mask for child node") return copy - + # # @@ -322,7 +339,7 @@ def tree_grow(x=None, root = Node(split_value_or_rows=mask) # We instantieren ook gelijk het Tree object - tr = Tree(root) + tr = Tree(root, (n_min, min_leaf, n_feat)) # De eerste nodelist heeft alleen de root node, daarna zijn twee childs, # etc. totdat alle splits gemaakt zijn en de lijst leeg is. @@ -338,12 +355,14 @@ def tree_grow(x=None, # Gebruik de boolean mask van de node om de classes in de node uit y te halen node_classes = y[node.split_value_or_rows] - # print(f"{node_classes=}, {node_rows=}") + # print(f"{node_classes}, {node_rows}") # Test of de node een leaf node is met n_min if len(node_rows) < n_min: node.is_leaf_node(node_classes) - print("\t->Node has less rows than n_min, it is a leaf node, continueing to next node") + print( + "\t->Node has less rows than n_min, it is a leaf node, continueing to next node" + ) continue print("\t->Node has more rows than n_min, it is not a leaf node") @@ -358,129 +377,43 @@ def tree_grow(x=None, # "If the algorithm performs a split, it should be the best split that meets the minleaf constrain" # # We krijgen een exhaustive lijst terug met splits - exhaustive_best_list = exhaustive_split_search(node_rows, node_classes, min_leaf) + exhaustive_best_list = exhaustive_split_search( + node_rows, node_classes, min_leaf) # print(exhaustive_best_list) # Als de lijst leeg is zijn er geen potentieele splits die voldoen # an de min leaf constraint if not exhaustive_best_list: node.is_leaf_node(node_classes) - print("\t\t->No split that fullfils min_leaf was found continueing to next node") + print( + "\t\t->No split that fullfils min_leaf was found continueing to next node" + ) continue - print("\t->Exhaustive search found a split fulfilling the min_leaf rule!") + print( + "\t->Exhaustive search found a split fulfilling the min_leaf rule!" + ) # Hier halen we de beste split, en rows voor de child/split nodes # uit de exhaustive best list best_split = max(exhaustive_best_list, key=lambda z: z[0]) + print(f"\n######################best split tuple (delta_i, bool arrays voor child rows, split value, col)############################\n\n{best_split}\n\n###########################################################################################################################\n") # Hier voegen we de twee nieuwe nodes toe aan de gesplitte "parent" - nodelist += add_children(node, x, best_split) + nodelist += add_children(node, best_split) # node.add_split(left_child_node, right_child_node) else: node.is_leaf_node(node_classes) print("\t\t->The node is already pure, it is necessarily a leaf!") continue - print(tr) + # print(tr) return tr - # Initiate the nodelist with tuples of slice and class labels - # nodelist = [Node(value=slices)] - # tree = Tree(nodelist[0]) - # while nodelist: - # current_node = nodelist.pop() - # slices = current_node.value - # node_classes = y[slices] - # # print(node_classes) - - # # f'Current node will be leaf node if (( (number of data "tuples" in child node) < {n_min=} )) \n' - # # put stopping rules here before making a split - # if len(node_classes) < n_min: - # current_node.value = Leaf( - # np.argmax(np.bincount(node_classes.astype(int)))) - # print(f"leaf node has majority clas:\n{current_node.value.value=}") - # continue - - # if impurity(node_classes) > 0: - # # print( - # # f"Exhaustive split search says, new node will check these rows for potential spliterinos:\n{x[slices]}" - # # ) - - # # If we arrive here ever we are splitting - # # bestsplit(col, node_labels) -> - # # {"slices": list[int], "split": numpyfloat, "best_delta_i": numpyfloat} - - # # slices (list) used for knowing which rows (int) to consider in a node - # # best_split saved in current_node.value - # # best_delta_i used to find best split among x_columns - # best_dict = None - # for i, x_col in enumerate(x[slices].transpose()): - # print( - # "\nExhaustive split search says; \"Entering new column\":") - # col_split_dict = bestsplit(x_col, node_classes, slices) - - # if best_dict is not None: - # if col_split_dict["delta_i"] > best_dict["delta_i"]: - # best_dict = col_split_dict - # best_dict["col"] = i - # else: - # best_dict = col_split_dict - # best_dict["col"] = i - # print("\nThe best split for current node:", best_dict) - - # # Here we store the splitted data into Node objects - # current_node.value = best_dict["split"] - # current_node.col = best_dict["col"] - # # Split will not happen if (( (number of data "tuples" potential split) < {min_leaf=} ))\n' - # if min([len(x) for x in best_dict["slices"].values()]) < min_leaf: - # continue - # else: - # # Invert left and right because we want left to pop() first - # children = [ - # Node(value=best_dict["slices"]["right"]), - # Node(value=best_dict["slices"]["left"]) - # ] - # current_node.add_split(children[1], children[0]) - # nodelist += children - # else: - # current_node.value = Leaf( - # np.argmax(np.bincount(node_classes.astype(int)))) - # print( - # f"\n\nLEAF NODE has majority clas:\n{current_node.value.value=}" - # ) - # continue - - -def predict(x, nodes) -> list: - """ - @todo: Docstring for predict - """ - # which row to drop - # print(x) - drop = 0 - while not set(nodes).issubset({0, 1}): - print(nodes) - # print(x[drop]) - if isinstance(nodes[drop].value, Leaf): - nodes[drop] = nodes[drop].value.value - drop += 1 - continue - - print(nodes[drop].value) - print(nodes[drop].col) - # print(nodes[drop].col) - if x[drop, nodes[drop].col] > nodes[drop].value: - nodes[drop] = nodes[drop].left - else: - nodes[drop] = nodes[drop].right - return np.array(nodes) - def tree_pred(x=None, tr=None, **defaults) -> np.array: """ @todo: Docstring for tree_pred """ - nodes = [tr.tree] * len(x) - # y = np.linspace(0, len(x), 0) - # y = np.array(ele) - y = predict(x, nodes) - print(f"\n\nPredicted classes for {x=}\n\n are: {y=}") + y = tr.predict(x) + n_min, min_leaf, n_feat = tr.hyper_params + print("-"*80+f"\nPrediction parameters:\nn_min={n_min}\nmin_leaf={min_leaf}\nn_feat={n_feat}") + print(f"\nFor x rows\n{x}\n\n predicted classes are:\n {y}") return y @@ -500,19 +433,16 @@ if __name__ == '__main__': 'y': credit_data[:, 5], 'n_min': 2, 'min_leaf': 1, - 'n_feat': 5 + 'n_feat': None # 5, should be 5 for bootstrapping folds } # Calling the tree grow, unpacking default as argument tree_grow(**tree_grow_defaults) #### TREE_PRED TEST - # tree_pred_defaults = { - # 'x': credit_data[:, :5], - # 'tr': tree_grow(**tree_grow_defaults) - # } - - # tree_pred(**tree_pred_defaults) + tree_pred_defaults = { + 'x': credit_data[:, :5], + 'tr': tree_grow(**tree_grow_defaults) + } -# start_time = time.time() -# print("--- %s seconds ---" % (time.time() - start_time)) + tree_pred(**tree_pred_defaults) |
