From e1ad1dcd2ea5c7663ae0a867cc53b7cb13f3c91a Mon Sep 17 00:00:00 2001 From: Mike Vink Date: Thu, 24 Sep 2020 17:04:11 +0200 Subject: Improv: numpy exhaustive split and better update --- assignment1.py | 448 ------------------------------------------------------- tree.py | 459 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 459 insertions(+), 448 deletions(-) delete mode 100644 assignment1.py create mode 100644 tree.py diff --git a/assignment1.py b/assignment1.py deleted file mode 100644 index 4b264d7..0000000 --- a/assignment1.py +++ /dev/null @@ -1,448 +0,0 @@ -import numpy as np - -credit_data = np.genfromtxt('./credit_score.txt', - delimiter=',', - skip_header=True) - -# age,married,house,income,gender,class -# [(22, 0, 0, 28, 1, 0) -# (46, 0, 1, 32, 0, 0) -# (24, 1, 1, 24, 1, 0) -# (25, 0, 0, 27, 1, 0) -# (29, 1, 1, 32, 0, 0) -# (45, 1, 1, 30, 0, 1) -# (63, 1, 1, 58, 1, 1) -# (36, 1, 0, 52, 1, 1) -# (23, 0, 1, 40, 0, 1) -# (50, 1, 1, 28, 0, 1)] - -# In the program data points are called rows - -# In the program categorical or numerical attributes are called cols for columns - -# The last column are the classes and will be called as classes in the program - - -class Node: - """ - The node object points to two other Node objects. - """ - def __init__(self, split_value_or_rows=None, col=None): - """Initialises the column and split value for the node. - - /split_value_or_rows=None/ can either be the best split value of - a col, or a boolean mask for x that selects the rows to consider for - calculating the split_value - - /col=None/ if the node object has a split_value, then it also has a col - that belongs to this value - - """ - self.split_value_or_rows = split_value_or_rows - self.col = col - - def add_split(self, left, right): - """ - Lets the node object point to two other objects that can be either Leaf - or Node. - """ - self.left = left - self.right = right - - def is_leaf_node(self, node_classes): - """ - is_leaf_node is used to change the col attribute to None to indicate a - leaf node - """ - self.col = None - # This weird numpy line gives the majority vote, which is 1 or 0 - self.split_value_or_rows = np.argmax( - np.bincount(node_classes.astype(int))) - - -class Tree: - """ - Tree object that points towards the root node. - """ - def __init__(self, root_node_obj, hyper_params): - """Initialises only by pointing to a Node object. - - /root_node_obj/ is a node object that is made before entering the main - loop of tree grow. - - """ - self.tree = root_node_obj - self.hyper_params = hyper_params - - def predict(self, x): - """ - Makes a list of root nodes, and drops one row of x through the tree per - loop - """ - # Maak een lijst van nodes, wiens indexes overeen komen met de rows in - # x die we willen droppen - nodes = [self.tree] * len(x) - - # De index van de row van x die we in de boom willen droppen - drop = 0 - while not all(pred_class in {0,1} for pred_class in nodes): - # Als de col None is dan is het een leaf node, dus dan is de row - # van x hier beland - if nodes[drop].col is None: - nodes[drop] = nodes[drop].split_value_or_rows - drop += 1 - continue - - # Vergelijk de x col (age,married,house,income,gender,class), in de - # gedropte row met het split value van de node. Op basis hiervan - # drop naar links of rechts - if x[drop, nodes[drop].col] > nodes[drop].split_value_or_rows: - nodes[drop] = nodes[drop].left - else: - nodes[drop] = nodes[drop].right - return np.array(nodes) - - # Work in progress tree printer - # - # def __repr__(self): - # tree_string = '' - # node = self.tree - # depth = 0 - # nodelist = [node] - # while nodelist: - # node = nodelist.pop() - # depth += 1 - # if node.col is not None: - # left, right = node.left, node.right - # nodelist += [left, right] - # else: - # continue - # tree_string += '\n' + depth * ' ' - # tree_string += (depth + 4) * ' ' + '/' + ' ' * 2 + '\\' - # tree_string += '\n' + ' ' * 2 * depth - # for direc in left, right: - # if not direc.split_value_or_rows%10: - # tree_string += ' ' * 4 - # else: - # tree_string += ' ' * 3 - # tree_string += str(int(direc.split_value_or_rows)) - - # tree_string = depth * ' ' + str(int(self.tree.split_value_or_rows)) + tree_string - # return tree_string - - -def impurity(array) -> int: - """ - Assumes the argument array is a one dimensional vector of zeroes and ones. - Computes the gini index impurity based on the relative frequency of ones in - the vector. - - Example: - - >>> array=np.array([1,0,1,1,1,0,0,1,1,0,1]) - >>> array - array([1,0,1,1,1,0,0,1,1,0,1]) - - >>> impurity(array) - 0.23140495867768596 - """ - # Total labels - n_labels = len(array) - if n_labels == 0: - print( - "division by zero will happen, child node is pure, doesnt contain anything" - ) - n_labels = 1 - # Number of tuples labeled 1 - n_labels_1 = array.sum() - # Calculate the relative frequency of ones with respect to the total labels - rel_freq_1 = n_labels_1 / n_labels - # Use the symmetry around the median property to also calculate the - # relative frequency of zeroes - rel_freq_0 = 1 - rel_freq_1 - # Multiply the frequencies to get the gini index - gini_index = rel_freq_1 * rel_freq_0 - return gini_index - - -def bestsplit(x, y, min_leaf) -> None: - """ - x = vector of single col - y = vector of classes (last col in x) - - Consider splits of type "x <= c" where "c" is the average of two consecutive - values of x in the sorted order. - - x and y must be of the same length - - y[i] must be the class label of the i-th observation, and x[i] is the - correspnding value of attribute x - - Example (best split on income): - - >>> bestsplit(credit_data[:,3],credit_data[:,5]) - 36 - """ - x_sorted = np.sort(np.unique(x)) - if len(x_sorted) <= 2: - # Allows splitting on categorical (0 or 1) cols - split_points = x_sorted / 2 - else: - # Take average between consecutive numerical rows in the x col - split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2 - - # De toepassing van bestsplit verdeelt de x col vector in tweeen, twee - # arrays van "x rows". Deze moeten we terug krijgen om de in de child nodes - # bestsplit toe te passen. - # - # Deze lus berekent de best split value, en op basis daarvan weten we welke - # twee "x rows" arrays we moeten returnen, en welke split value het beste - # was natuurlijk. - - # Hieren stoppen we (delta_i, split_value, rows_left, rows_right) - best_list = [] - # Stop wanneer de array met split points leeg is - while split_points.size != 0: - # Huidige split value - split_value = split_points[-1] - # boolean masks om x rows mee te masken/slicen - mask_left, mask_right = x > split_value, x <= split_value - - # class voor beide split kanten - classes_left, classes_right = y[mask_left], y[mask_right] - - # Kijk of er genoeg rows in de gesplitte nodes terechtkomen, anders - # mogen we de split niet toelaten vanwege de min_leaf constraint - if len(classes_left) < min_leaf or len(classes_right) < min_leaf: - # Haal de huidige split_point uit split_points - split_points = split_points[:-1] - continue - - # delta_i formule, improved by not taking the parent impurity into - # account, and not making the weighted average but the weigthed sum - # only (thanks Lonnie) - delta_i = ( - impurity(classes_left) * len(classes_left) + - impurity(classes_right) * len(classes_right)) - # stop huidige splits in de lijst om best split te berekenen - best_list.append((delta_i, mask_left, mask_right, split_value)) - # Haal de huidige split_point uit split_points - split_points = split_points[:-1] - - # Bereken de best split voor deze x col, als er ten minste 1 bestaat die - # voldoet aan min leaf - if best_list: - return min(best_list, key=lambda x: x[0]) - else: - return False - - -def exhaustive_split_search(rows, classes, min_leaf): - """ - @todo: Docstring for exhaustive_split_search - """ - print("\t\t->entering exhaustive split search") - # We hebben enumerate nodig, want we willen weten op welke col - # (age,married,house,income,gender) we een split doen - exhaustive_best_list = [] - print(f"Rows:\n{rows},\n Classes:\n{classes}") - for i, col in enumerate(rows.transpose()): - # calculate the best split for the col that satisfies the min_leaf - # constraint - col_best_split = bestsplit(col, classes, min_leaf) - # Check if there was a split fullfilling the min leaf rule - if col_best_split: - # add for which row we calculated the best split - col_best_split += (i, ) - exhaustive_best_list.append(col_best_split) - print("\t\t->returning from exhaustive split search") - return exhaustive_best_list - - -def add_children(node, best_split): - """ - @todo: Docstring for add_children - """ - print("\t\t\t->entering add children") - # The mask that was used to get the rows for the current node from x, we - # need this to update the rows for the children - current_mask = node.split_value_or_rows - - # Unpacking the best_split tuple - mask_left, mask_right, node_split_value, node_col = best_split[1:] - # print(f"{mask_left}, {mask_right}, {node_split_value}, {node_col}") - - # Give the current node the split_value and col it needs for predictions - node.split_value_or_rows, node.col = node_split_value, node_col - - # Updating the row masks to give it to children, keeping numpy dimension consistent - mask_left, mask_right = update_mask(mask_left, current_mask), update_mask( - mask_right, current_mask) - - # Adding the pointer between parent and children - node.add_split(Node(split_value_or_rows=mask_left), Node(split_value_or_rows=mask_right)) - print("\t\t\t->children added to node and node list\n") - return [node.left, node.right] - - -def update_mask(mask, current_mask): - """ - Updates the spit bool array from any dimension to an array with length - equal to the total number of rows in dataset x. - """ - print( - "\t\t\t\t->entering update mask to update mask that calculates which rows belong to child" - ) - # Copy heb ik gedaan omdat we een slice assignment doen. - # - # Het punt van deze functie is dat tijdens het tree growen het aantal rows - # in een node afneemt. Deze functie update de bool array die gebruikt is om - # te splitten in bestsplit naar een array met als lengte het totale aantal - # rows in de data set x. Op deze manier kunnen we in tree grow de totale x - # splitten. - # - # Een andere optie zou zijn om de rows letterlijk tijdelijk op te slaan in - # nodes... weet niet wat beter is, het kan best veel geheugen in beslag - # nemen voor grote dataset neem ik aan... Op de manier hier zijn er altijd - # maar 1D bool arrays, die dus een lagere dimensionaliteit hebben als x veel columnen/rows heeft. - print(f"Before update:\n{mask}") - copy = np.array(current_mask, copy=True) - copy[np.where(current_mask)] = mask - print(f"After update:\n{copy}") - print("\t\t\t\t->updated row mask for child node") - return copy - - -# -# -# Put all helper functions above this comment! - - -def tree_grow(x=None, - y=None, - n_min=None, - min_leaf=None, - n_feat=None, - **defaults) -> Tree: - """ - @todo: Docstring for tree_grow - """ - # De nodelist heeft in het begin alleen een root node met alle rows van x, - # omdat alle rows in de root in acht worden genomen voor bestsplit berekening. - # - # Dit representeren we met een boolean mask over x, met lengte het aantal rows - # in x en elementen True. Deze boolean mask zullen we repeatedly gebruiken als een - # mask over x om de rows voor bestsplit op te halen. - mask = np.full(len(x), True) - - # Het eerste node object moet nu geinstantieerd worden - root = Node(split_value_or_rows=mask) - - # We instantieren ook gelijk het Tree object - tr = Tree(root, (n_min, min_leaf, n_feat)) - - # De eerste nodelist heeft alleen de root node, daarna zijn twee childs, - # etc. totdat alle splits gemaakt zijn en de lijst leeg is. - nodelist = [root] - - while nodelist: - print("->Taking new node from the node list") - # Pop de current node uit de nodelist - node = nodelist.pop() - # Gebruik de boolean mask van de node om de rows in de node uit x te halen - node_rows = x[node.split_value_or_rows] - # print(node_rows) - # Gebruik de boolean mask van de node om de classes in de node uit y te halen - node_classes = y[node.split_value_or_rows] - - # print(f"{node_classes}, {node_rows}") - - # Test of de node een leaf node is met n_min - if len(node_rows) < n_min: - node.is_leaf_node(node_classes) - print( - "\t->Node has less rows than n_min, it is a leaf node, continueing to next node" - ) - continue - print("\t->Node has more rows than n_min, it is not a leaf node") - - # Als de node niet puur is, probeer dan te splitten - if impurity(node_classes) > 0: - print("\t->Node is not pure yet starting exhaustive split search") - # We gaan exhaustively voor de rows in de node over de cols - # (age,married,house,income,gender) om de bestsplit te - # bepalen - # - # We geven min_leaf als argument omdat: - # "If the algorithm performs a split, it should be the best split that meets the minleaf constrain" - # - # We krijgen een exhaustive lijst terug met splits - exhaustive_best_list = exhaustive_split_search( - node_rows, node_classes, min_leaf) - # print(exhaustive_best_list) - # Als de lijst leeg is zijn er geen potentieele splits die voldoen - # an de min leaf constraint - if not exhaustive_best_list: - node.is_leaf_node(node_classes) - print( - "\t\t->No split that fullfils min_leaf was found continueing to next node" - ) - continue - print( - "\t->Exhaustive search found a split fulfilling the min_leaf rule!" - ) - # Hier halen we de beste split, en rows voor de child/split nodes - # uit de exhaustive best list - best_split = min(exhaustive_best_list, key=lambda z: z[0]) - print(f"\n######################best split tuple (delta_i, bool arrays voor child rows, split value, col)############################\n\n{best_split}\n\n###########################################################################################################################\n") - # Hier voegen we de twee nieuwe nodes toe aan de gesplitte "parent" - nodelist += add_children(node, best_split) - # node.add_split(left_child_node, right_child_node) - else: - node.is_leaf_node(node_classes) - print("\t\t->The node is already pure, it is necessarily a leaf!") - continue - # print(tr) - return tr - - -def tree_pred(x=None, tr=None, **defaults) -> np.array: - """ - @todo: Docstring for tree_pred - """ - y = tr.predict(x) - n_min, min_leaf, n_feat = tr.hyper_params - print("-"*80+f"\nPrediction parameters:\nn_min={n_min}\nmin_leaf={min_leaf}\nn_feat={n_feat}") - print(f"\nFor x rows\n{x}\n\n predicted classes are:\n {y}") - return y - - -if __name__ == '__main__': - #### IMPURITY TEST - # array=np.array([1,0,1,1,1,0,0,1,1,0,1]) - # print(impurity(array)) - # Should give 0.23.... - - #### BESTSPLIT TEST - # print(bestsplit(credit_data[:, 3], credit_data[:, 5])) - # Should give 36 - - #### TREE_GROW TEST - tree_grow_defaults = { - 'x': credit_data[:, :5], - 'y': credit_data[:, 5], - 'n_min': 2, - 'min_leaf': 1, - 'n_feat': None # 5, should be 5 for bootstrapping folds - } - - # Calling the tree grow, unpacking default as argument - # tree_grow(**tree_grow_defaults) - - #### TREE_PRED TEST - tree_pred_defaults = { - 'x': credit_data[:, :5], - 'tr': tree_grow(**tree_grow_defaults) - } - - tree_pred(**tree_pred_defaults) diff --git a/tree.py b/tree.py new file mode 100644 index 0000000..64e9d21 --- /dev/null +++ b/tree.py @@ -0,0 +1,459 @@ +import numpy as np + +credit_data = np.genfromtxt('./data/credit_score.txt', + delimiter=',', + skip_header=True) + +# age,married,house,income,gender,class +# [(22, 0, 0, 28, 1, 0) +# (46, 0, 1, 32, 0, 0) +# (24, 1, 1, 24, 1, 0) +# (25, 0, 0, 27, 1, 0) +# (29, 1, 1, 32, 0, 0) +# (45, 1, 1, 30, 0, 1) +# (63, 1, 1, 58, 1, 1) +# (36, 1, 0, 52, 1, 1) +# (23, 0, 1, 40, 0, 1) +# (50, 1, 1, 28, 0, 1)] + +# In the program data points are called rows + +# In the program categorical or numerical attributes are called cols for columns + +# The last column are the classes and will be called as classes in the program + + +class Node: + """ + The node object points to two other Node objects. + """ + def __init__(self, split_value_or_rows=None, col=None): + """Initialises the column and split value for the node. + + /split_value_or_rows=None/ can either be the best split value of + a col, or a boolean mask for x that selects the rows to consider for + calculating the split_value + + /col=None/ if the node object has a split_value, then it also has a col + that belongs to this value + + """ + self.split_value_or_rows = split_value_or_rows + self.col = col + + def add_split(self, left, right): + """ + Lets the node object point to two other objects that can be either Leaf + or Node. + """ + self.left = left + self.right = right + + def is_leaf_node(self, node_classes): + """ + is_leaf_node is used to change the col attribute to None to indicate a + leaf node + """ + self.col = None + # This weird numpy line gives the majority vote, which is 1 or 0 + self.split_value_or_rows = np.argmax( + np.bincount(node_classes.astype(int))) + + +class Tree: + """ + Tree object that points towards the root node. + """ + def __init__(self, root_node_obj, hyper_params): + """Initialises only by pointing to a Node object. + + /root_node_obj/ is a node object that is made before entering the main + loop of tree grow. + + """ + self.tree = root_node_obj + self.hyper_params = hyper_params + + def predict(self, x): + """ + Makes a list of root nodes, and drops one row of x through the tree per + loop + """ + # Maak een lijst van nodes, wiens indexes overeen komen met de rows in + # x die we willen droppen + nodes = [self.tree] * len(x) + + # De index van de row van x die we in de boom willen droppen + drop = 0 + while not all(pred_class in {0,1} for pred_class in nodes): + # Als de col None is dan is het een leaf node, dus dan is de row + # van x hier beland + # print(nodes[drop].col, nodes[drop].split_value_or_rows) + if nodes[drop].col is None: + nodes[drop] = nodes[drop].split_value_or_rows + drop += 1 + continue + + # Vergelijk de x col (age,married,house,income,gender,class), in de + # gedropte row met het split value van de node. Op basis hiervan + # drop naar links of rechts + if x[drop, nodes[drop].col] > nodes[drop].split_value_or_rows: + nodes[drop] = nodes[drop].left + else: + nodes[drop] = nodes[drop].right + return np.array(nodes) + + # Work in progress tree printer + # + # def __repr__(self): + # tree_string = '' + # node = self.tree + # depth = 0 + # nodelist = [node] + # while nodelist: + # node = nodelist.pop() + # depth += 1 + # if node.col is not None: + # left, right = node.left, node.right + # nodelist += [left, right] + # else: + # continue + # tree_string += '\n' + depth * ' ' + # tree_string += (depth + 4) * ' ' + '/' + ' ' * 2 + '\\' + # tree_string += '\n' + ' ' * 2 * depth + # for direc in left, right: + # if not direc.split_value_or_rows%10: + # tree_string += ' ' * 4 + # else: + # tree_string += ' ' * 3 + # tree_string += str(int(direc.split_value_or_rows)) + + # tree_string = depth * ' ' + str(int(self.tree.split_value_or_rows)) + tree_string + # return tree_string + + +def impurity(array) -> int: + """ + Assumes the argument array is a one dimensional vector of zeroes and ones. + Computes the gini index impurity based on the relative frequency of ones in + the vector. + + Example: + + >>> array=np.array([1,0,1,1,1,0,0,1,1,0,1]) + >>> array + array([1,0,1,1,1,0,0,1,1,0,1]) + + >>> impurity(array) + 0.23140495867768596 + """ + # Total labels + n_labels = len(array) + if n_labels == 0: + print( + "division by zero will happen, child node is pure, doesnt contain anything" + ) + n_labels = 1 + # Number of tuples labeled 1 + n_labels_1 = array.sum() + # Calculate the relative frequency of ones with respect to the total labels + rel_freq_1 = n_labels_1 / n_labels + # Use the symmetry around the median property to also calculate the + # relative frequency of zeroes + rel_freq_0 = 1 - rel_freq_1 + # Multiply the frequencies to get the gini index + gini_index = rel_freq_1 * rel_freq_0 + return gini_index + + +def bestsplit(x, y, min_leaf) -> None: + """ + x = vector of single col + y = vector of classes (last col in x) + + Consider splits of type "x <= c" where "c" is the average of two consecutive + values of x in the sorted order. + + x and y must be of the same length + + y[i] must be the class label of the i-th observation, and x[i] is the + correspnding value of attribute x + + Example (best split on income): + + >>> bestsplit(credit_data[:,3],credit_data[:,5]) + 36 + """ + x_sorted = np.sort(np.unique(x)) + # if len(x_sorted) <= 2: + # # Allows splitting on categorical (0 or 1) cols + # split_points = x_sorted / 2 + # else: + # # Take average between consecutive numerical rows in the x col + # split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2 + + split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2 + + # De toepassing van bestsplit verdeelt de x col vector in tweeen, twee + # arrays van "x rows". Deze moeten we terug krijgen om de in de child nodes + # bestsplit toe te passen. + # + # Deze lus berekent de best split value, en op basis daarvan weten we welke + # twee "x rows" arrays we moeten returnen, en welke split value het beste + # was natuurlijk. + + # Hieren stoppen we (rows for children, delta_i, split value, col will be added later) + best_array = np.zeros((split_points.shape[0], x.shape[0] + 3), dtype='float64') + # print(f"{best_array=}") + # Stop wanneer de array met split points leeg is + while split_points.size != 0: + # Huidige split value + split_value = split_points[-1] + # print(split_value) + # boolean masks om x rows mee te masken/slicen + # print(best_array.shape) + current_row_of_best_array = split_points.shape[0]-1 + best_array[current_row_of_best_array, :x.shape[0]] = x > split_value + # print(best_array) + # print(f"{best_array[current_row_of_best_array,:-2]=}") + + # class voor beide split kanten + classes_left, classes_right = y[best_array[current_row_of_best_array,:-3].astype(bool)], y[np.invert(best_array[current_row_of_best_array,:-3].astype(bool))] + # print(f"{classes_left=} {classes_right=}") + + # Kijk of er genoeg rows in de gesplitte nodes terechtkomen, anders + # mogen we de split niet toelaten vanwege de min_leaf constraint + # print(f"{classes_left.shape[0]=}") + if classes_left.shape[0] < min_leaf or classes_right.shape[0] < min_leaf: + # Haal de huidige split_point uit split_points + split_points = split_points[:-1] + continue + + # delta_i formule, improved by not taking the parent impurity into + # account, and not making the weighted average but the weigthed sum + # only (thanks Lonnie) + delta_i = ( + impurity(classes_left) * classes_left.shape[0] + + impurity(classes_right) * classes_right.shape[0]) + # stop huidige splits in de lijst om best split te berekenen + # print(f"{np.array((delta_i, mask_left, mask_right, split_value))=}") + # print(f"{best_array[:, classes_left.shape[0]-1]=}") + best_array[current_row_of_best_array,-3:-1] = delta_i, split_value + # print(best_array) + # Haal de huidige split_point uit split_points + split_points = split_points[:-1] + + # Bereken de best split voor deze x col, als er ten minste 1 bestaat die + # voldoet aan min leaf + return best_array + + +def exhaustive_split_search(rows, classes, min_leaf): + """ + @todo: Docstring for exhaustive_split_search + """ + print("\t\t->entering exhaustive split search") + # We hebben enumerate nodig, want we willen weten op welke col + # (age,married,house,income,gender) we een split doen + exhaustive_best_array = np.zeros((1, rows.shape[0] + 3), dtype='float64') + print(f"Rows:\n{rows},\n Classes:\n{classes}") + for i, col in enumerate(rows.transpose()): + # calculate the best split for the col that satisfies the min_leaf + # constraint + best_array_for_col = bestsplit(col, classes, min_leaf) + # add col number to rows + best_array_for_col[:,-1] = i + exhaustive_best_array = np.vstack((exhaustive_best_array, best_array_for_col)) + print("The array with exhaustive splits is\n", exhaustive_best_array) + print("\t\t->returning from exhaustive split search") + return exhaustive_best_array + + +def add_children(node, best_split): + """ + @todo: Docstring for add_children + """ + print("\t\t\t->entering add children") + # The mask that was used to get the rows for the current node from x, we + # need this to update the rows for the children + current_mask = node.split_value_or_rows + + # Give the current node the split_value and col it needs for predictions + node.split_value_or_rows, node.col = best_split[-2], int(best_split[-1]) + + # Updating the row masks to give it to children, keeping numpy dimension consistent + mask_left, mask_right = update_mask(best_split[:-3], current_mask) + + # Adding the pointer between parent and children + node.add_split(Node(split_value_or_rows=mask_left), Node(split_value_or_rows=mask_right)) + print("\t\t\t->children added to node and node list\n") + return [node.left, node.right] + + +def update_mask(mask, current_mask): + """ + Updates the spit bool array from any dimension to an array with length + equal to the total number of rows in dataset x. + """ + print( + "\t\t\t\t->entering update mask to update mask that calculates which rows belong to child" + ) + # Copy heb ik gedaan omdat we een slice assignment doen. + # + # Het punt van deze functie is dat tijdens het tree growen het aantal rows + # in een node afneemt. Deze functie update de bool array die gebruikt is om + # te splitten in bestsplit naar een array met als lengte het totale aantal + # rows in de data set x. Op deze manier kunnen we in tree grow de totale x + # splitten. + # + # Een andere optie zou zijn om de rows letterlijk tijdelijk op te slaan in + # nodes... weet niet wat beter is, het kan best veel geheugen in beslag + # nemen voor grote dataset neem ik aan... Op de manier hier zijn er altijd + # maar 1D bool arrays, die dus een lagere dimensionaliteit hebben als x veel columnen/rows heeft. + current_mask = np.vstack((current_mask, current_mask)) + print(f"\nBefore update (upper rows goes to the left child, bottom to the right):\n{current_mask}") + # print(mask) + # print(mask.astype(bool)) + # print(current_mask[current_mask == True]) + current_mask[0][current_mask[0] == True] = mask.astype(bool) + current_mask[1][current_mask[1] == True] = np.invert(mask.astype(bool)) + # copy = np.array(current_mask, copy=True) + # copy[np.where(current_mask)] = mask + print(f"After update:\n{current_mask}") + print("\t\t\t\t->updated row mask for child node") + return current_mask[0], current_mask[1] + + +# +# +# Put all helper functions above this comment! + + +def tree_grow(x=None, + y=None, + n_min=None, + min_leaf=None, + n_feat=None, + **defaults) -> Tree: + """ + @todo: Docstring for tree_grow + """ + # De nodelist heeft in het begin alleen een root node met alle rows van x, + # omdat alle rows in de root in acht worden genomen voor bestsplit berekening. + # + # Dit representeren we met een boolean mask over x, met lengte het aantal rows + # in x en elementen True. Deze boolean mask zullen we repeatedly gebruiken als een + # mask over x om de rows voor bestsplit op te halen. + mask = np.full(len(x), True) + + # Het eerste node object moet nu geinstantieerd worden + root = Node(split_value_or_rows=mask) + + # We instantieren ook gelijk het Tree object + tr = Tree(root, (n_min, min_leaf, n_feat)) + + # De eerste nodelist heeft alleen de root node, daarna zijn twee childs, + # etc. totdat alle splits gemaakt zijn en de lijst leeg is. + nodelist = [root] + + while nodelist: + print("->Taking new node from the node list") + # Pop de current node uit de nodelist + node = nodelist.pop() + # Gebruik de boolean mask van de node om de rows in de node uit x te halen + node_rows = x[node.split_value_or_rows] + # print(node_rows) + # Gebruik de boolean mask van de node om de classes in de node uit y te halen + node_classes = y[node.split_value_or_rows] + + # print(f"{node_classes}, {node_rows}") + + # Test of de node een leaf node is met n_min + if len(node_rows) < n_min: + node.is_leaf_node(node_classes) + print( + "\t->Node has less rows than n_min, it is a leaf node, continueing to next node" + ) + continue + print("\t->Node has more rows than n_min, it is not a leaf node") + + # Als de node niet puur is, probeer dan te splitten + if impurity(node_classes) > 0: + print("\t->Node is not pure yet starting exhaustive split search") + # We gaan exhaustively voor de rows in de node over de cols + # (age,married,house,income,gender) om de bestsplit te + # bepalen + # + # We geven min_leaf als argument omdat: + # "If the algorithm performs a split, it should be the best split that meets the minleaf constrain" + # + # We krijgen een exhaustive lijst terug met splits + exhaustive_best_array = exhaustive_split_search( + node_rows, node_classes, min_leaf) + # print(exhaustive_best_list) + # Als de lijst leeg is zijn er geen potentieele splits die voldoen + # an de min leaf constraint + if exhaustive_best_array.shape[0] == 1: + node.is_leaf_node(node_classes) + print( + "\t\t->No split that fullfils min_leaf was found continueing to next node" + ) + continue + print( + "\t->Exhaustive search found a split fulfilling the min_leaf rule!" + ) + # Hier halen we de beste split, en rows voor de child/split nodes + # uit de exhaustive best list + best_split = exhaustive_best_array[np.argmin(exhaustive_best_array[1:,-3]) + 1] + print(f"\n######################best split array, the last three element=[delta_i, splitvalue, col/attribute], the rest is boolean include or not include row in child node############################\n\n{best_split}\n\n###############################################################################\n") + # Hier voegen we de twee nieuwe nodes toe aan de gesplitte "parent" + nodelist += add_children(node, best_split) + # node.add_split(left_child_node, right_child_node) + else: + node.is_leaf_node(node_classes) + print("\t\t->The node is already pure, it is necessarily a leaf!") + continue + # print(tr) + return tr + + +def tree_pred(x=None, tr=None, **defaults) -> np.array: + """ + @todo: Docstring for tree_pred + """ + y = tr.predict(x) + n_min, min_leaf, n_feat = tr.hyper_params + print("-"*80+f"\nPrediction parameters:\nn_min={n_min}\nmin_leaf={min_leaf}\nn_feat={n_feat}") + print(f"\nFor x rows\n{x}\n\n predicted classes are:\n {y}") + return y + + +if __name__ == '__main__': + #### IMPURITY TEST + # array=np.array([1,0,1,1,1,0,0,1,1,0,1]) + # print(impurity(array)) + # Should give 0.23.... + + #### BESTSPLIT TEST + # print(bestsplit(credit_data[:, 3], credit_data[:, 5])) + # Should give 36 + + #### TREE_GROW TEST + tree_grow_defaults = { + 'x': credit_data[:, :5], + 'y': credit_data[:, 5], + 'n_min': 2, + 'min_leaf': 1, + 'n_feat': None # 5, should be 5 for bootstrapping folds + } + + # Calling the tree grow, unpacking default as argument + # tree_grow(**tree_grow_defaults) + + #### TREE_PRED TEST + tree_pred_defaults = { + 'x': credit_data[:, :5], + 'tr': tree_grow(**tree_grow_defaults) + } + + tree_pred(**tree_pred_defaults) -- cgit v1.2.3