#- Names and student no.: # Hunter Sterk 6981046 # Lonnie Bregman 123456789 # Mike Vink 5585791 #- Main functions: # def tree_grow(x,y,nmin,minleaf,nfeat): # """ # /x/ numpy.ndarray, 2D numpy array containing data rows and feature columns # /y/ numpy.ndarray, 1D numpy array containing binary x-row labels # /nmin/ int, number of x-rows that a parent must contain before splitting # /minleaf/ int, number of x-rows a child must have before splitting # /nfeat/ int, number of x-columns randomly considered before splitting # Returns -> Tree object # tree_grow returns a tree object that stores a classification tree in a # data structure that is similar to a linked list. To build # the tree, it exhaustively considers the splits possible using nfeat # random x-columns. The gini-index is used to determine the best split. # Stopping rules constraining the number of x-rows in parent and child # nodes are used as complexity parameters. # EXAMPLE: # >>> x # array([[22., 0., 0., 28., 1.], # [46., 0., 1., 32., 0.], # [24., 1., 1., 24., 1.], # [25., 0., 0., 27., 1.], # [29., 1., 1., 32., 0.], # [45., 1., 1., 30., 0.], # [63., 1., 1., 58., 1.], # [36., 1., 0., 52., 1.], # [23., 0., 1., 40., 0.], # [50., 1., 1., 28., 0.]]) # >>> y # array([0., 0., 0., 0., 0., 1., 1., 1., 1., 1.]) # >>> tree_grow(x=x,y=y,nmin=2,minleaf=1,nfeat=5) # <__main__.Tree object at 0x10d752ee0> # """ # def tree_pred(x,tr,true): # """ # /x/ numpy.ndarray, 2D numpy array containing data rows and feature columns # /tr/ Tree object, tree to predict a binary label for each row in x # /true/ numpy.ndarray, 1D numpy array containing "true" labels # # Returns -> numpy.ndarray, 1D numpy array containing predicted binary # labels for each row in x # tree_pred uses a tree object to predict binary labels on a given 2D # data array x. The "true" argument should only be given if predictions # metrics are to be calculated and printed, which gives an immediate idea # if the tree is erronous by seeing low prediction performance on the # training data for example. # EXAMPLE: # >>> x # array([[22., 0., 0., 28., 1.], # [46., 0., 1., 32., 0.], # [24., 1., 1., 24., 1.], # [25., 0., 0., 27., 1.], # [29., 1., 1., 32., 0.], # [45., 1., 1., 30., 0.], # [63., 1., 1., 58., 1.], # [36., 1., 0., 52., 1.], # [23., 0., 1., 40., 0.], # [50., 1., 1., 28., 0.]]) # >>> tr = tree_grow(x=x,y=y,nmin=2,minleaf=1,nfeat=5) # >>> tree_pred(x, tr) # array([0., 0., 0., 0., 0., 1., 1., 1., 1., 1.]) # """ # def tree_grow_b(x,y,nmin,minleaf,nfeat,m): # """ # /x/ numpy.ndarray, 2D numpy array containing data rows and feature columns # /y/ numpy.ndarray, 1D numpy array containing binary x-row labels # /nmin/ int, number of x-rows that a parent must contain before splitting # /minleaf/ int, number of x-rows a child must have before splitting # /nfeat/ int, number of x-columns randomly considered before splitting # /m/ int, number of bootstrap samples to draw # Returns -> list of m Tree objects # tree_grow_b returns a list of Tree objects that store a classification # tree in a data structure that is similar to a linked list. To build the # tree, it exhaustively considers the splits possible using nfeat random # x-columns. In case of a random forest nfeat should be lower than the # number of columns in x. The gini-index is used to determine the best # split. Stopping rules constraining the number of x-rows in parent and # child nodes are used as complexity parameters. # EXAMPLE: # >>> x # array([[22., 0., 0., 28., 1.], # [46., 0., 1., 32., 0.], # [24., 1., 1., 24., 1.], # [25., 0., 0., 27., 1.], # [29., 1., 1., 32., 0.], # [45., 1., 1., 30., 0.], # [63., 1., 1., 58., 1.], # [36., 1., 0., 52., 1.], # [23., 0., 1., 40., 0.], # [50., 1., 1., 28., 0.]]) # >>> y # array([0., 0., 0., 0., 0., 1., 1., 1., 1., 1.]) # >>> trees = tree_grow_b(x=x,y=y,nmin=2,minleaf=1,nfeat=4,m=50) # >>> type(trees), type(trees[0]), len(trees) == m # (, , True) # """ # def tree_pred_b(x,tr,true): # """ # /x/ numpy.ndarray, 2D numpy array containing data rows and feature columns # /tr/ list of Tree objects, trees predict a binary label for each row in # x, which are used to make a majority vote on the # final predicted labels # /true/ numpy.ndarray, 1D numpy array containing "true" labels # # Returns -> numpy.ndarray, 1D numpy array containing predicted binary # labels for each row in x # tree_pred_b uses a list of tree objects to predict binary labels on a # given 2D data array x. The main difference with tree_pred is that now we # get for each tree a 1D numpy array of predicted binary labels for each # row in x. A 2D numpy array is constructed where each column corresponds # to all predictions for one tree, and a row corresponds to a row in x. # Therefore we take the majority vote of rows in this array to return a 1D # numpy array with the final predicted labels of the trees. # The "true" argument should only be given if predictions metrics are to be # calculated and printed, which gives an immediate idea if the tree is # erronous by seeing low prediction performance on the training data for # example. # EXAMPLE: # >>> x # array([[22., 0., 0., 28., 1.], # [46., 0., 1., 32., 0.], # [24., 1., 1., 24., 1.], # [25., 0., 0., 27., 1.], # [29., 1., 1., 32., 0.], # [45., 1., 1., 30., 0.], # [63., 1., 1., 58., 1.], # [36., 1., 0., 52., 1.], # [23., 0., 1., 40., 0.], # [50., 1., 1., 28., 0.]]) # >>> type(trees), type(trees[0]), len(trees) == 50 # (, , True) # >>> tree_pred_b(x=x,tr=trees) # array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) # """ #- Miscellaneous functions: # def major_vote(classes): # """ # /classes/ numpy.array, 1D numpy array of zeroes and ones # Returns -> int # Uses numpy methods to calculate if 1 or 0 elements are the majority in # the classes vector. Note that when the number of 1 and 0 elements are # equal, it returns 0. # EXAMPLE: # >>> y # array([0., 0., 0., 0., 0., 1., 1., 1., 1., 1.]) # >>> major_vote(y) # 0 # """ # def impurity(array): # """ # /array/ numpy.array, 1D numpy array of zeroes and ones # Returns -> float # Computes the gini index impurity based on the relative frequency of ones # in the vector. # # EXAMPLE: # >>> array=np.array([1,0,1,1,1,0,0,1,1,0,1]) # >>> array # array([1,0,1,1,1,0,0,1,1,0,1]) # >>> impurity(array) # 0.23140495867768596 # """ # def bestsplit(x,y,minleaf): # """ # /x/ numpy.array, 1D numpy array corresponding to a feature column of the # 2D data array corresponding to some node # /y/ numpy.array, 1D numpy array of binary labels corresponding to the # rows in the 2D data array corresponding to some node # /minleaf/ int, number of x-rows a child must have before splitting # Returns -> tuple # Computes the best split based on the given features using the impurity # function. # EXAMPLE: # >>> x # array([28., 32., 24., 27., 32., 30., 58., 52., 40., 28.]) # >>> y # array([0., 0., 0., 0., 0., 1., 1., 1., 1., 1.]) # >>> bestsplit(x,y,minleaf=1) # (1.4285714285714286, array([False, False, False, False, False, False, True, True, True, # False]), array([ True, True, True, True, True, True, False, False, False, # True]), 36.0) # # """ # def exhaustive_split_search(rows, classes, minleaf): # """ # /rows/ numpy.array, 2D data array corresponding to some node # /classes/ numpy.array, 1D numpy array of binary labels corresponding to the # rows in the 2D data array corresponding to some node # /minleaf/ int, number of x-rows a child must have before splitting # Returns -> List # Stores the best splits computed with the bestsplit function for the # considered columns in a list, if the list is empty then there are no # splits and the node becomes a leaf node. # """ # def add_children(node, best_split): # """ # /node/ Node object, the current node in the main tree growing loop # /best_split/ tuple, tuple containing (delta_i, rows_left, rows_right, splitvalue) # Processes the splits into the tree data structure and returns children yet # to be splitted to the main nodelist in tree_grow. # """ # def update_mask(mask, current_mask): # """ # /mask/ np.array, 1D boolean vector corresponding to the rows in the new # child node that might have a length that is incompatible with the rows in # the main 2D data array x of tree_grow # /current_mask/ np.array, 1D boolean vector corresponding to the rows in the current node # Updates the spit bool array from any dimension to an array with length # equal to the total number of rows in dataset x of tree_grow. # """ import numpy as np class Node: """ The node object points to two other Node objects. """ def __init__(self, split_value_or_rows=None, col=None): """Initialises the column and split value for the node. /split_value_or_rows=None/ can either be the best split value of a col, or a boolean mask for x that selects the rows to consider for calculating the split_value /col=None/ if the node object has a split_value, then it also has a col that belongs to this value """ self.split_value_or_rows = split_value_or_rows self.col = col def add_split(self, left, right): """ Lets the node object point to two other objects that can be either Leaf or Node. """ self.left = left self.right = right def is_leaf_node(self, node_classes): """ is_leaf_node is used to change the col attribute to None to indicate a leaf node """ self.col = None self.split_value_or_rows = major_vote(node_classes) class Tree: """ Tree object that points towards the root node. """ def __init__(self, root_node_obj, hyper_params): """Initialises only by pointing to a Node object. /root_node_obj/ is a node object that is made before entering the main loop of tree grow. """ self.tree = root_node_obj self.hyper_params = hyper_params def predict(self, x): """ Makes a list of root nodes, and drops one row of x through the tree per loop """ # Maak een lijst van nodes, wiens indexes overeen komen met de rows in # x die we willen droppen rows_to_predict = len(x) nodes = np.array([self.tree] * rows_to_predict) predictions = np.zeros(rows_to_predict) # # De index van de row van x die we in de boom willen droppen drop = 0 while nodes.size != 0: node = nodes[0] if node.col is None: node = node.split_value_or_rows predictions[drop] = node nodes = nodes[1:] drop += 1 continue elif x[drop, node.col] > node.split_value_or_rows: nodes[0] = node.left else: nodes[0] = node.right return predictions def tree_grow(x=None, y=None, nmin=None, minleaf=None, nfeat=None) -> Tree: """ Builds a classification tree given training data and labels using stopping rule complexity parameters. """ mask = np.full(len(x), True) root = Node(split_value_or_rows=mask) tr = Tree(root, (nmin, minleaf, nfeat)) nodelist = [root] while nodelist: node = nodelist.pop() node_classes = y[node.split_value_or_rows] if len(node_classes) < nmin: node.is_leaf_node(node_classes) continue if impurity(node_classes) > 0: node_rows = x[node.split_value_or_rows] exhaustive_best_list = exhaustive_split_search( node_rows, node_classes, minleaf) if not exhaustive_best_list: node.is_leaf_node(node_classes) continue best_split = min(exhaustive_best_list, key=lambda z: z[0]) nodelist += add_children(node, best_split) else: # impurity 0 node.is_leaf_node(node_classes) continue return tr def tree_pred(x=None, tr=None, true=None) -> np.array: """ Predicts a binary label for each row in x using tr.predict. """ y = tr.predict(x).astype(float) nmin, minleaf, nfeat = tr.hyper_params if true is not None: print( f'Results from: prediction single tree({nmin=}, {minleaf=}, {nfeat=})' ) print(f'\t->Confusion matrix:\n{metrics.confusion_matrix(y, true)}') print(f'\t->Accuracy:\n\t\t{metrics.accuracy_score(y, true)}') print(f'\t->Precission:\n\t\t{metrics.precision_score(y, true)}') print(f'\t->Recall:\n\t\t{metrics.recall_score(y, true)}') return y def tree_grow_b(x=None, y=None, nmin=None, minleaf=None, nfeat=None, m=None) -> Tree: """ The m times repeated application of tree_grow with bagged data. """ forest = [] for i in range(m): choice = np.random.randint(len(x), size=len(x)) x_bag, y_bag = x[choice], y[choice] forest.append( tree_grow(x=x_bag, y=y_bag, nmin=nmin, minleaf=minleaf, nfeat=nfeat)) return forest def tree_pred_b(x=None, tr=None, true=None) -> np.array: """ The repeated application of tree.predict to construct a 2D array which is used to make a majority vote label prediction for the rows in x. """ y_bag = np.zeros((len(x), len(tr))) for i, tree in enumerate(tr): y_bag[:, i] = tree.predict(x).astype(float) nmin, minleaf, nfeat = tr[0].hyper_params y = np.array([major_vote(y_bag[i]) for i in range(len(y_bag))]) if true is not None: if nfeat == x.shape[1]: print( f'Results from: prediction bagged tree({nmin=}, {minleaf=}, {nfeat=}, trees={len(tr)})' ) else: print( f'Results from: prediction random forest({nmin=}, {minleaf=}, {nfeat=}, trees={len(tr)})' ) print(f'\t->Confusion matrix:\n{metrics.confusion_matrix(y, true)}') print(f'\t->Accuracy:\n\t\t{metrics.accuracy_score(y, true)}') print(f'\t->Precission:\n\t\t{metrics.precision_score(y, true)}') print(f'\t->Recall:\n\t\t{metrics.recall_score(y, true)}') return y # # # Put all helper functions below this comment! def major_vote(classes): """ Returns a zero or one based on the highest occurence in the classes vector. """ return np.argmax(np.bincount(classes.astype(int))) def impurity(array) -> int: """ Calculates the impurity of the labels in a node. """ n_labels = len(array) n_labels_1 = array.sum() rel_freq_1 = n_labels_1 / n_labels rel_freq_0 = 1 - rel_freq_1 gini_index = rel_freq_1 * rel_freq_0 return gini_index def bestsplit(x, y, minleaf) -> None: """ x = vector of single col y = vector of classes (last col in x) Consider splits of type "x <= c" where "c" is the average of two consecutive values of x in the sorted order. x and y must be of the same length y[i] must be the class label of the i-th observation, and x[i] is the correspnding value of attribute x Example (best split on income): >>> bestsplit(credit_data[:,3],credit_data[:,5]) 36 """ x_sorted = np.sort(np.unique(x)) split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2 # Hieren stoppen we (delta_i, split_value, rows_left, rows_right) best_list = [] while split_points.size != 0: split_value = split_points[-1] mask_left, mask_right = x > split_value, x <= split_value classes_left, classes_right = y[mask_left], y[mask_right] if len(classes_left) < minleaf or len(classes_right) < minleaf: split_points = split_points[:-1] continue delta_i = (impurity(classes_left) * len(classes_left) + impurity(classes_right) * len(classes_right)) best_list.append((delta_i, mask_left, mask_right, split_value)) split_points = split_points[:-1] # Bereken de best split voor deze x col, als er ten minste 1 bestaat die # voldoet aan min leaf if best_list: return min(best_list, key=lambda x: x[0]) else: return False def exhaustive_split_search(rows, classes, minleaf): """ The nfeat repeated application of bestsplit. """ # We hebben enumerate nodig, want we willen weten op welke col (i) # (age,married,house,income,gender) we een split doen exhaustive_best_list = [] for i, col in enumerate(rows.transpose()): col_best_split = bestsplit(col, classes, minleaf) if col_best_split: # add for which row we calculated the best split col_best_split += (i, ) exhaustive_best_list.append(col_best_split) return exhaustive_best_list def add_children(node, best_split): """ Processes the splits into the tree data structure and returns children yet to be splitted to the nodelist in tree_grow. """ current_mask = node.split_value_or_rows mask_left, mask_right, node_split_value, node_col = best_split[1:] # Give the current node the split_value and col it needs for predictions node.split_value_or_rows, node.col = node_split_value, node_col # Updating the row masks to give it to children, keeping numpy dimension consistent mask_left, mask_right = update_mask(mask_left, current_mask), update_mask( mask_right, current_mask) # Adding the pointer between parent and children node.add_split(Node(split_value_or_rows=mask_left), Node(split_value_or_rows=mask_right)) return [node.left, node.right] def update_mask(mask, current_mask): """ Updates the spit bool array from any dimension to an array with length equal to the total number of rows in dataset x. """ copy = np.array(current_mask, copy=True) copy[current_mask == True] = mask return copy