From aeda214879ee4858eb93169257aa7b87c0973789 Mon Sep 17 00:00:00 2001
From: Mike Vink
Date: Sun, 4 Oct 2020 12:38:14 +0200
Subject: Documenting before the hand-in

---
 tree.py | 509 ++++++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 287 insertions(+), 222 deletions(-)

diff --git a/tree.py b/tree.py
index babd570..db40611 100644
--- a/tree.py
+++ b/tree.py
@@ -1,29 +1,185 @@
-import numpy as np
-import cProfile
-import pstats
-# import tqdm
-
-# from tqdm import trange
-from pstats import SortKey
-from sklearn import metrics
+#- Names and student no.:
+
+# Hunter Sterk 123456789
+# Lonnie Bregman 123456789
+# Mike Vink 5585791
+
+#- Main functions:
+
+# def tree_grow(x,y,nmin,minleaf,nfeat):
+#     """
+#     /x/ numpy.ndarray, 2D numpy array containing data rows and feature columns
+#     /y/ numpy.ndarray, 1D numpy array containing binary x-row labels
+#     /nmin/ int, minimum number of x-rows a node must contain to be split
+#     /minleaf/ int, minimum number of x-rows each child of a split must contain
+#     /nfeat/ int, number of x-columns randomly drawn and considered for a split
+
+#     Returns -> Tree object
+
+#     tree_grow returns a Tree object that stores a classification tree in a
+#     data structure that is similar to a linked list. To build the tree, it
+#     exhaustively considers the splits possible using nfeat random
+#     x-columns. The gini-index is used to determine the best split.
+#     Stopping rules constraining the number of x-rows in parent and child
+#     nodes are used as complexity parameters.
+
+#     EXAMPLE:
+#     >>> x
+#     array([[22.,  0.,  0., 28.,  1.],
+#            [46.,  0.,  1., 32.,  0.],
+#            [24.,  1.,  1., 24.,  1.],
+#            [25.,  0.,  0., 27.,  1.],
+#            [29.,  1.,  1., 32.,  0.],
+#            [45.,  1.,  1., 30.,  0.],
+#            [63.,  1.,  1., 58.,  1.],
+#            [36.,  1.,  0., 52.,  1.],
+#            [23.,  0.,  1., 40.,  0.],
+#            [50.,  1.,  1., 28.,  0.]])
+#     >>> y
+#     array([0., 0., 0., 0., 0., 1., 1., 1., 1., 1.])
+#     >>> tree_grow(x=x,y=y,nmin=2,minleaf=1,nfeat=5)
+#     <__main__.Tree object at 0x10d752ee0>
+
+#     """
+
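+# A minimal usage sketch (hypothetical interpreter session, assuming the x
+# and y arrays above): besides the root node, the grown Tree records the
+# settings it was grown with in the hyper_params attribute, which
+# tree_pred reads back when printing metrics.
+#     >>> tr = tree_grow(x=x, y=y, nmin=2, minleaf=1, nfeat=5)
+#     >>> tr.hyper_params
+#     (2, 1, 5)
+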
+# def tree_pred(x,tr,true):
+#     """
+#     /x/ numpy.ndarray, 2D numpy array containing data rows and feature columns
+#     /tr/ Tree object, tree to predict a binary label for each row in x
+#     /true/ numpy.ndarray, 1D numpy array containing "true" labels
+#
-# age,married,house,income,gender,class
-# [(22, 0, 0, 28, 1, 0)
-#  (46, 0, 1, 32, 0, 0)
-#  (24, 1, 1, 24, 1, 0)
-#  (25, 0, 0, 27, 1, 0)
-#  (29, 1, 1, 32, 0, 0)
-#  (45, 1, 1, 30, 0, 1)
-#  (63, 1, 1, 58, 1, 1)
-#  (36, 1, 0, 52, 1, 1)
-#  (23, 0, 1, 40, 0, 1)
-#  (50, 1, 1, 28, 0, 1)]
+#     Returns -> numpy.ndarray, 1D numpy array containing predicted binary
+#                labels for each row in x
+
+#     tree_pred uses a Tree object to predict binary labels on a given 2D
+#     data array x. The "true" argument should only be given if prediction
+#     metrics are to be calculated and printed; this gives an immediate
+#     indication that the tree is erroneous, for example through low
+#     prediction performance on the training data.
+
+#     EXAMPLE:
+#     >>> x
+#     array([[22.,  0.,  0., 28.,  1.],
+#            [46.,  0.,  1., 32.,  0.],
+#            [24.,  1.,  1., 24.,  1.],
+#            [25.,  0.,  0., 27.,  1.],
+#            [29.,  1.,  1., 32.,  0.],
+#            [45.,  1.,  1., 30.,  0.],
+#            [63.,  1.,  1., 58.,  1.],
+#            [36.,  1.,  0., 52.,  1.],
+#            [23.,  0.,  1., 40.,  0.],
+#            [50.,  1.,  1., 28.,  0.]])
+#     >>> tr = tree_grow(x=x,y=y,nmin=2,minleaf=1,nfeat=5)
+#     >>> tree_pred(x, tr)
+#     array([0., 0., 0., 0., 0., 1., 1., 1., 1., 1.])
+
+#     """
+
+# def tree_grow_b(x,y,nmin,minleaf,nfeat,m):
+#     """
+#     /x/ numpy.ndarray, 2D numpy array containing data rows and feature columns
+#     /y/ numpy.ndarray, 1D numpy array containing binary x-row labels
+#     /nmin/ int, minimum number of x-rows a node must contain to be split
+#     /minleaf/ int, minimum number of x-rows each child of a split must contain
+#     /nfeat/ int, number of x-columns randomly drawn and considered for a split
+#     /m/ int, number of bootstrap samples to draw
+
+#     Returns -> list of m Tree objects
+
+#     tree_grow_b returns a list of Tree objects that each store a
+#     classification tree in a data structure that is similar to a linked
+#     list. To build a tree, it exhaustively considers the splits possible
+#     using nfeat random x-columns. In the case of a random forest, nfeat
+#     should be lower than the number of columns in x. The gini-index is
+#     used to determine the best split. Stopping rules constraining the
+#     number of x-rows in parent and child nodes are used as complexity
+#     parameters.
+
+#     EXAMPLE:
+#     >>> x
+#     array([[22.,  0.,  0., 28.,  1.],
+#            [46.,  0.,  1., 32.,  0.],
+#            [24.,  1.,  1., 24.,  1.],
+#            [25.,  0.,  0., 27.,  1.],
+#            [29.,  1.,  1., 32.,  0.],
+#            [45.,  1.,  1., 30.,  0.],
+#            [63.,  1.,  1., 58.,  1.],
+#            [36.,  1.,  0., 52.,  1.],
+#            [23.,  0.,  1., 40.,  0.],
+#            [50.,  1.,  1., 28.,  0.]])
+#     >>> y
+#     array([0., 0., 0., 0., 0., 1., 1., 1., 1., 1.])
+#     >>> trees = tree_grow_b(x=x,y=y,nmin=2,minleaf=1,nfeat=4,m=50)
+#     >>> type(trees), type(trees[0]), len(trees) == 50
+#     (<class 'list'>, <class '__main__.Tree'>, True)
+
+#     """
+
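+# A sketch of the bootstrap draw that tree_grow_b performs internally for
+# each of the m trees: plain sampling with replacement via numpy (the
+# index values shown here are made up and differ per run).
+#     >>> choice = np.random.randint(len(x), size=len(x))
+#     >>> choice
+#     array([3, 3, 0, 9, 1, 4, 4, 7, 2, 0])
+#     >>> x_bag, y_bag = x[choice], y[choice]
+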
+# def tree_pred_b(x,tr,true):
+#     """
+#     /x/ numpy.ndarray, 2D numpy array containing data rows and feature columns
+#     /tr/ list of Tree objects, each tree predicts a binary label for every
+#          row in x; these predictions are combined by majority vote into
+#          the final predicted labels
+#     /true/ numpy.ndarray, 1D numpy array containing "true" labels
+#
-# In the program data points are called rows
+#     Returns -> numpy.ndarray, 1D numpy array containing predicted binary
+#                labels for each row in x
+
+#     tree_pred_b uses a list of Tree objects to predict binary labels on a
+#     given 2D data array x. The main difference with tree_pred is that we
+#     now get one 1D numpy array of predicted binary labels per tree. A 2D
+#     numpy array is constructed in which each column holds all predictions
+#     of one tree and each row corresponds to a row in x. The majority vote
+#     over the rows of this array yields the 1D numpy array with the final
+#     predicted labels of the trees.
+
+#     The "true" argument should only be given if prediction metrics are to
+#     be calculated and printed; this gives an immediate indication that
+#     the trees are erroneous, for example through low prediction
+#     performance on the training data.
+
+#     EXAMPLE:
+#     >>> x
+#     array([[22.,  0.,  0., 28.,  1.],
+#            [46.,  0.,  1., 32.,  0.],
+#            [24.,  1.,  1., 24.,  1.],
+#            [25.,  0.,  0., 27.,  1.],
+#            [29.,  1.,  1., 32.,  0.],
+#            [45.,  1.,  1., 30.,  0.],
+#            [63.,  1.,  1., 58.,  1.],
+#            [36.,  1.,  0., 52.,  1.],
+#            [23.,  0.,  1., 40.,  0.],
+#            [50.,  1.,  1., 28.,  0.]])
+#     >>> trees = tree_grow_b(x=x,y=y,nmin=2,minleaf=1,nfeat=4,m=50)
+#     >>> type(trees), type(trees[0]), len(trees) == 50
+#     (<class 'list'>, <class '__main__.Tree'>, True)
+#     >>> tree_pred_b(x=x,tr=trees)
+#     array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
+
+#     """
+
+#- Miscellaneous functions:
+
+# def major_vote(classes):
+#     """
+#     /classes/ numpy.ndarray, 1D numpy array of zeroes and ones
+
+#     Returns -> int
+
+#     Uses numpy methods to calculate whether the 1 or the 0 elements are
+#     the majority in the classes vector. Note that when the numbers of 1
+#     and 0 elements are equal, it returns 0.
+
+#     >>> y
+#     array([0., 0., 0., 0., 0., 1., 1., 1., 1., 1.])
+#     >>> major_vote(y)
+#     0
+
+#     """
-# In the program categorical or numerical attributes are called cols for columns
-# The last column are the classes and will be called as classes in the program
+import numpy as np
+from sklearn import metrics
 
 
 class Node:
@@ -58,9 +214,9 @@ class Node:
         leaf node
         """
         self.col = None
-        # This weird numpy line gives the majority vote, which is 1 or 0
         self.split_value_or_rows = major_vote(node_classes)
 
+
 class Tree:
     """
     Tree object that points towards the root node.
     """
@@ -102,40 +258,117 @@ class Tree:
                 nodes[0] = node.right
         return predictions
 
-    # Work in progress tree printer
-    #
-    # def __repr__(self):
-    #     tree_string = ''
-    #     node = self.tree
-    #     depth = 0
-    #     nodelist = [node]
-    #     while nodelist:
-    #         node = nodelist.pop()
-    #         depth += 1
-    #         if node.col is not None:
-    #             left, right = node.left, node.right
-    #             nodelist += [left, right]
-    #         else:
-    #             continue
-    #         tree_string += '\n' + depth * ' '
-    #         tree_string += (depth + 4) * ' ' + '/' + ' ' * 2 + '\\'
-    #         tree_string += '\n' + ' ' * 2 * depth
-    #         for direc in left, right:
-    #             if not direc.split_value_or_rows%10:
-    #                 tree_string += ' ' * 4
-    #             else:
-    #                 tree_string += ' ' * 3
-    #             tree_string += str(int(direc.split_value_or_rows))
-
-    #         tree_string = depth * ' ' + str(int(self.tree.split_value_or_rows)) + tree_string
-    #         return tree_string
+def tree_grow(x=None, y=None, nmin=None, minleaf=None, nfeat=None) -> Tree:
+    """
+    Builds a classification tree given training data and labels, using
+    stopping rules as complexity parameters.
+    """
+    mask = np.full(len(x), True)
+    root = Node(split_value_or_rows=mask)
+    tr = Tree(root, (nmin, minleaf, nfeat))
+
+    nodelist = [root]
+    while nodelist:
+        node = nodelist.pop()
+        node_classes = y[node.split_value_or_rows]
+
+        if len(node_classes) < nmin:
+            node.is_leaf_node(node_classes)
+            continue
+
+        if impurity(node_classes) > 0:
+            node_rows = x[node.split_value_or_rows]
+            exhaustive_best_list = exhaustive_split_search(
+                node_rows, node_classes, minleaf)
+            if not exhaustive_best_list:
+                node.is_leaf_node(node_classes)
+                continue
+            best_split = min(exhaustive_best_list, key=lambda z: z[0])
+            nodelist += add_children(node, best_split)
+
+        else:
+            # impurity 0
+            node.is_leaf_node(node_classes)
+            continue
+    return tr
+
+
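+# A hypothetical peek at the structure tree_grow builds (attribute names
+# as defined in Node and Tree above): internal nodes keep the chosen
+# column in col and the split value in split_value_or_rows, while leaves
+# get col = None and the majority-vote label instead (see is_leaf_node).
+# On the credit data example the root has impurity > 0 and is split:
+#     >>> tr = tree_grow(x=x, y=y, nmin=2, minleaf=1, nfeat=5)
+#     >>> tr.tree.col is None
+#     False
+
+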
+def tree_pred(x=None, tr=None, true=None) -> np.ndarray:
+    """
+    Predicts a binary label for each row in x using tr.predict.
+    """
+    y = tr.predict(x).astype(float)
+    nmin, minleaf, nfeat = tr.hyper_params
+    if true is not None:
+        print(
+            f'Results from: prediction single tree({nmin=}, {minleaf=}, {nfeat=})'
+        )
+        print(f'\t->Confusion matrix:\n{metrics.confusion_matrix(y, true)}')
+        print(f'\t->Accuracy:\n\t\t{metrics.accuracy_score(y, true)}')
+        print(f'\t->Precision:\n\t\t{metrics.precision_score(y, true)}')
+        print(f'\t->Recall:\n\t\t{metrics.recall_score(y, true)}')
+    return y
+
+
+def tree_grow_b(x=None,
+                y=None,
+                nmin=None,
+                minleaf=None,
+                nfeat=None,
+                m=None) -> list:
+    """
+    The m times repeated application of tree_grow with bagged data.
+    """
+    forest = []
+    for i in range(m):
+        choice = np.random.randint(len(x), size=len(x))
+        x_bag, y_bag = x[choice], y[choice]
+        forest.append(
+            tree_grow(x=x_bag,
+                      y=y_bag,
+                      nmin=nmin,
+                      minleaf=minleaf,
+                      nfeat=nfeat))
+    return forest
+
+
+def tree_pred_b(x=None, tr=None, true=None) -> np.ndarray:
+    """
+    The repeated application of tree.predict to construct a 2D array, which
+    is used to make a majority-vote label prediction for the rows in x.
+    """
+    y_bag = np.zeros((len(x), len(tr)))
+    for i, tree in enumerate(tr):
+        y_bag[:, i] = tree.predict(x).astype(float)
+    nmin, minleaf, nfeat = tr[0].hyper_params
+    y = np.array([major_vote(y_bag[i]) for i in range(len(y_bag))])
+    if true is not None:
+        if nfeat == x.shape[1]:
+            print(
+                f'Results from: prediction bagged tree({nmin=}, {minleaf=}, {nfeat=}, trees={len(tr)})'
+            )
+        else:
+            print(
+                f'Results from: prediction random forest({nmin=}, {minleaf=}, {nfeat=}, trees={len(tr)})'
+            )
+        print(f'\t->Confusion matrix:\n{metrics.confusion_matrix(y, true)}')
+        print(f'\t->Accuracy:\n\t\t{metrics.accuracy_score(y, true)}')
+        print(f'\t->Precision:\n\t\t{metrics.precision_score(y, true)}')
+        print(f'\t->Recall:\n\t\t{metrics.recall_score(y, true)}')
+    return y
+
+
+#
+#
+# Put all helper functions below this comment!
 
 
 def major_vote(classes):
     """
-    @todo: Docstring for major_vote(classes
+    Returns a zero or one based on the highest occurrence in the classes vector.
     """
     return np.argmax(np.bincount(classes.astype(int)))
+
 
 def impurity(array) -> int:
     """
     Assumes the argument array is a one dimensional vector of zeroes and ones.
@@ -211,7 +444,7 @@ def bestsplit(x, y, minleaf) -> None:
 
 def exhaustive_split_search(rows, classes, minleaf):
     """
-    @todo: Docstring for exhaustive_split_search
+    The nfeat times repeated application of bestsplit.
    """
     # We need enumerate, because we want to know on which col (i)
     # (age,married,house,income,gender) we are splitting
@@ -227,7 +460,8 @@
 
 def add_children(node, best_split):
     """
-    @todo: Docstring for add_children
+    Processes the splits into the tree data structure and returns children
+    yet to be split to the nodelist in tree_grow.
     """
     current_mask = node.split_value_or_rows
     mask_left, mask_right, node_split_value, node_col = best_split[1:]
@@ -253,172 +487,3 @@ def update_mask(mask, current_mask):
     copy = np.array(current_mask, copy=True)
     copy[current_mask == True] = mask
     return copy
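+
+# A worked sketch of update_mask (toy values chosen for illustration):
+# with a parent mask over 5 rows of which rows 1, 2 and 4 sit in the
+# node, a child mask [True, False, True] over those 3 rows lifts back to
+# a full-length mask that selects only rows 1 and 4:
+#     >>> update_mask(np.array([True, False, True]),
+#     ...             np.array([False, True, True, False, True]))
+#     array([False,  True, False, False,  True])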
-
-
-#
-#
-# Put all helper functions above this comment!
-
-
-def tree_grow(x=None,
-              y=None,
-              nmin=None,
-              minleaf=None,
-              nfeat=None,
-              **defaults) -> Tree:
-    """
-    @todo: Docstring for tree_grow
-    """
-    mask = np.full(len(x), True)
-    root = Node(split_value_or_rows=mask)
-    tr = Tree(root, (nmin, minleaf, nfeat))
-
-    nodelist = [root]
-    while nodelist:
-        node = nodelist.pop()
-        node_classes = y[node.split_value_or_rows]
-
-        if len(node_classes) < nmin:
-            node.is_leaf_node(node_classes)
-            continue
-
-        if impurity(node_classes) > 0:
-            node_rows = x[node.split_value_or_rows]
-            exhaustive_best_list = exhaustive_split_search(
-                node_rows, node_classes, minleaf)
-            if not exhaustive_best_list:
-                node.is_leaf_node(node_classes)
-                continue
-            best_split = min(exhaustive_best_list, key=lambda z: z[0])
-            nodelist += add_children(node, best_split)
-
-        else:
-            # impurity 0
-            node.is_leaf_node(node_classes)
-            continue
-    return tr
-
-
-def tree_grow_b(x=None,
-                y=None,
-                nmin=None,
-                minleaf=None,
-                nfeat=None,
-                m=None,
-                **defaults) -> Tree:
-    forest = []
-    for i in range(m):  # ,desc=f'planting a forest, growing {m} trees'):
-        choice = np.random.randint(len(x),size=len(x))
-        x_bag, y_bag = x[choice], y[choice]
-        forest.append(tree_grow(x=x_bag,y=y_bag,nmin=nmin,minleaf=minleaf,nfeat=nfeat))
-    return forest
-
-
-
-def tree_pred(x=None, tr=None, training=None, **defaults) -> np.array:
-    """
-    @todo: Docstring for tree_pred
-    """
-    y = tr.predict(x).astype(float)
-    nmin, minleaf, nfeat = tr.hyper_params
-    if training is not None:
-        # print(np.mean(training == y))
-        print(
-            f'Results from: prediction single tree({nmin=}, {minleaf=}, {nfeat=})'
-        )
-        print(
-            f'\t->Confusion matrix:\n{metrics.confusion_matrix(y, training)}')
-        print(f'\t->Accuracy:\n\t\t{metrics.accuracy_score(y, training)}')
-        print(f'\t->Precission:\n\t\t{metrics.precision_score(y, training)}')
-        print(f'\t->Recall:\n\t\t{metrics.recall_score(y, training)}')
-    return y
-
-
-def tree_pred_b(x=None, tr=None, training=None, **defaults) -> np.array:
-    y_bag = np.zeros((len(x), len(tr)))
-    for i, tree in enumerate(tr):  # , total=len(tr),desc=f'making also {len(tr)} predictions!'):
-        y_bag[:,i] = tree.predict(x).astype(float)
-    nmin, minleaf, nfeat = tr[0].hyper_params
-    y = np.array([major_vote(y_bag[i]) for i in range(len(y_bag))])
-    if training is not None:
-        # print(np.mean(training == y))
-        if nfeat == x.shape[1]:
-            print(
-                f'Results from: prediction bagged tree({nmin=}, {minleaf=}, {nfeat=}, trees={len(tr)})'
-            )
-        else:
-            print(
-                f'Results from: prediction random forest({nmin=}, {minleaf=}, {nfeat=}, trees={len(tr)})'
-            )
-        print(
-            f'\t->Confusion matrix:\n{metrics.confusion_matrix(y, training)}')
-        print(f'\t->Accuracy:\n\t\t{metrics.accuracy_score(y, training)}')
-        print(f'\t->Precission:\n\t\t{metrics.precision_score(y, training)}')
-        print(f'\t->Recall:\n\t\t{metrics.recall_score(y, training)}')
-    return y
-
-
-if __name__ == '__main__':
-    credit_data = np.genfromtxt('./data/credit_score.txt',
-                                delimiter=',',
-                                skip_header=True)
-
-    pima_indians = np.genfromtxt('./data/pima_indians.csv',
-                                 delimiter=',',
-                                 skip_header=True)
-
-    # print("\nDataset: credit data")
-    # tree_pred(x=credit_data[:, :5],
-    #           tr=tree_grow(x=credit_data[:, 0:5],
-    #                        y=credit_data[:, 5],
-    #                        nmin=2,
-    #                        minleaf=1,
-    #                        nfeat=5),
-    #           training=credit_data[:, 5])
-
-    # print("\nDataset: credit data")
-    # tree_pred_b(x=credit_data[:, :5],
-    #             tr=tree_grow_b(x=credit_data[:, 0:5],
-    #                            y=credit_data[:, 5],
-    #                            nmin=2,
-    #                            minleaf=1,
-    #                            nfeat=4,
-    #                            m=50),
-    #             training=credit_data[:, 5])
-
-    # print('\nDataset: pima indians')
-    # tree_pred(x=pima_indians[:, :8],
-    #           tr=tree_grow(x=pima_indians[:, :8],
-    #                        y=pima_indians[:, 8],
-    #                        nmin=20,
-    #                        minleaf=5,
-    #                        nfeat=pima_indians.shape[1] - 1),
-    #           training=pima_indians[:, 8])
-
-
-    # print('\nDataset: pima indians')
-    # tree_pred_b(x=pima_indians[:, :8],
-    #             tr=tree_grow_b(x=pima_indians[:, :8],
-    #                            y=pima_indians[:, 8],
-    #                            nmin=20,
-    #                            minleaf=5,
-    #                            nfeat=4,
-    #                            m=5),
-    #             training=pima_indians[:, 8])
-
-
-
-    # Time profiles: see what functions take what time! :)
-
-    # print("prediction metrics single tree pima indians:")
-    # cProfile.run("tree_pred(x=credit_data[:,:5], tr=tree_grow(x=credit_data[:,0:5], y=credit_data[:,5], nmin=2, minleaf=1, nfeat=5), dataset='credit score')", 'restats')
-
-    # Time profile of pima indians data prediction with single tree
-    # print("prediction metrics single tree pima indians:")
-    cProfile.run(
-        "tree_pred_b(x=pima_indians[:, :8], tr=tree_grow_b(x=pima_indians[:, :8], y=pima_indians[:, 8], nmin=20, minleaf=5, nfeat=4, m=5), training=pima_indians[:, 8])",
-        'restats')
-
-    p = pstats.Stats('restats')
-    p.sort_stats(SortKey.TIME)
-    p.print_stats()
-- 
cgit v1.2.3