Diffstat (limited to 'tree.py')
-rw-r--r--  tree.py  509
1 file changed, 287 insertions, 222 deletions
diff --git a/tree.py b/tree.py
index babd570..db40611 100644
--- a/tree.py
+++ b/tree.py
@@ -1,29 +1,185 @@
-import numpy as np
-import cProfile
-import pstats
-# import tqdm
-
-# from tqdm import trange
-from pstats import SortKey
-from sklearn import metrics
+#- Names and student numbers:
+
+# Hunter Sterk 123456789
+# Lonnie Bregman 123456789
+# Mike Vink 5585791
+
+#- Main functions:
+
+# def tree_grow(x,y,nmin,minleaf,nfeat):
+# """
+# /x/ numpy.ndarray, 2D numpy array containing data rows and feature columns
+# /y/ numpy.ndarray, 1D numpy array containing binary x-row labels
+# /nmin/ int, minimum number of x-rows a node must contain to be split
+# /minleaf/ int, minimum number of x-rows each child of a split must contain
+# /nfeat/ int, number of x-columns randomly considered for each split
+
+# Returns -> Tree object
+
+# tree_grow returns a Tree object that stores a classification tree in a
+# data structure similar to a linked list. To build the tree, it
+# exhaustively considers all possible splits on nfeat randomly chosen
+# x-columns. The gini-index is used to determine the best split.
+# Stopping rules constraining the number of x-rows in parent and child
+# nodes serve as complexity parameters.
+
+# EXAMPLE:
+# >>> x
+# array([[22., 0., 0., 28., 1.],
+# [46., 0., 1., 32., 0.],
+# [24., 1., 1., 24., 1.],
+# [25., 0., 0., 27., 1.],
+# [29., 1., 1., 32., 0.],
+# [45., 1., 1., 30., 0.],
+# [63., 1., 1., 58., 1.],
+# [36., 1., 0., 52., 1.],
+# [23., 0., 1., 40., 0.],
+# [50., 1., 1., 28., 0.]])
+# >>> y
+# array([0., 0., 0., 0., 0., 1., 1., 1., 1., 1.])
+# >>> tree_grow(x=x,y=y,nmin=2,minleaf=1,nfeat=5)
+# <__main__.Tree object at 0x10d752ee0>
+
+# """
+
+# def tree_pred(x,tr,true):
+# """
+# /x/ numpy.ndarray, 2D numpy array containing data rows and feature columns
+# /tr/ Tree object, tree used to predict a binary label for each row in x
+# /true/ numpy.ndarray, optional 1D numpy array containing "true" labels
+#
-# age,married,house,income,gender,class
-# [(22, 0, 0, 28, 1, 0)
-# (46, 0, 1, 32, 0, 0)
-# (24, 1, 1, 24, 1, 0)
-# (25, 0, 0, 27, 1, 0)
-# (29, 1, 1, 32, 0, 0)
-# (45, 1, 1, 30, 0, 1)
-# (63, 1, 1, 58, 1, 1)
-# (36, 1, 0, 52, 1, 1)
-# (23, 0, 1, 40, 0, 1)
-# (50, 1, 1, 28, 0, 1)]
+# Returns -> numpy.ndarray, 1D numpy array containing predicted binary
+# labels for each row in x
+
+# tree_pred uses a Tree object to predict binary labels for a given 2D
+# data array x. The "true" argument should only be given if prediction
+# metrics are to be calculated and printed; low prediction performance
+# on the training data, for example, immediately suggests the tree is
+# erroneous.
+
+# EXAMPLE:
+# >>> x
+# array([[22., 0., 0., 28., 1.],
+# [46., 0., 1., 32., 0.],
+# [24., 1., 1., 24., 1.],
+# [25., 0., 0., 27., 1.],
+# [29., 1., 1., 32., 0.],
+# [45., 1., 1., 30., 0.],
+# [63., 1., 1., 58., 1.],
+# [36., 1., 0., 52., 1.],
+# [23., 0., 1., 40., 0.],
+# [50., 1., 1., 28., 0.]])
+# >>> tr = tree_grow(x=x,y=y,nmin=2,minleaf=1,nfeat=5)
+# >>> tree_pred(x, tr)
+# array([0., 0., 0., 0., 0., 1., 1., 1., 1., 1.])
+
+# """
+
+# def tree_grow_b(x,y,nmin,minleaf,nfeat,m):
+# """
+# /x/ numpy.ndarray, 2D numpy array containing data rows and feature columns
+# /y/ numpy.ndarray, 1D numpy array containing binary x-row labels
+# /nmin/ int, minimum number of x-rows a node must contain to be split
+# /minleaf/ int, minimum number of x-rows each child of a split must contain
+# /nfeat/ int, number of x-columns randomly considered for each split
+# /m/ int, number of bootstrap samples to draw
+
+# Returns -> list of m Tree objects
+
+# tree_grow_b returns a list of m Tree objects, each storing a
+# classification tree in a data structure similar to a linked list. To
+# build each tree, it exhaustively considers all possible splits on nfeat
+# randomly chosen x-columns. For a random forest, nfeat should be smaller
+# than the number of columns in x. The gini-index is used to determine
+# the best split. Stopping rules constraining the number of x-rows in
+# parent and child nodes serve as complexity parameters.
+
+# EXAMPLE:
+# >>> x
+# array([[22., 0., 0., 28., 1.],
+# [46., 0., 1., 32., 0.],
+# [24., 1., 1., 24., 1.],
+# [25., 0., 0., 27., 1.],
+# [29., 1., 1., 32., 0.],
+# [45., 1., 1., 30., 0.],
+# [63., 1., 1., 58., 1.],
+# [36., 1., 0., 52., 1.],
+# [23., 0., 1., 40., 0.],
+# [50., 1., 1., 28., 0.]])
+# >>> y
+# array([0., 0., 0., 0., 0., 1., 1., 1., 1., 1.])
+# >>> trees = tree_grow_b(x=x,y=y,nmin=2,minleaf=1,nfeat=4,m=50)
+# >>> type(trees), type(trees[0]), len(trees) == 50
+# (<class 'list'>, <class '__main__.Tree'>, True)
+
+# """
+
+# def tree_pred_b(x,tr,true):
+# """
+# /x/ numpy.ndarray, 2D numpy array containing data rows and feature columns
+# /tr/ list of Tree objects, each tree predicts a binary label for each
+#      row in x, and the final predicted labels are decided by a
+#      majority vote
+# /true/ numpy.ndarray, optional 1D numpy array containing "true" labels
+#
-# In the program data points are called rows
+# Returns -> numpy.ndarray, 1D numpy array containing predicted binary
+# labels for each row in x
+
+# tree_pred_b uses a list of Tree objects to predict binary labels for a
+# given 2D data array x. The main difference with tree_pred is that each
+# tree now yields a 1D numpy array of predicted binary labels for the
+# rows in x. These are collected in a 2D numpy array in which each column
+# holds the predictions of one tree and each row corresponds to a row in
+# x; the majority vote over each row then gives a 1D numpy array with the
+# final predicted labels.
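+
+# For illustration (a hypothetical y_bag from three trees on two x-rows),
+# each column holds one tree's predictions and the vote is taken per row:
+
+# >>> y_bag
+# array([[0., 1., 0.],
+#        [1., 1., 0.]])
+# >>> np.array([major_vote(row) for row in y_bag])
+# array([0, 1])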
+
+# The "true" argument should only be given if predictions metrics are to be
+# calculated and printed, which gives an immediate idea if the tree is
+# erronous by seeing low prediction performance on the training data for
+# example.
+
+# EXAMPLE:
+# >>> x
+# array([[22., 0., 0., 28., 1.],
+# [46., 0., 1., 32., 0.],
+# [24., 1., 1., 24., 1.],
+# [25., 0., 0., 27., 1.],
+# [29., 1., 1., 32., 0.],
+# [45., 1., 1., 30., 0.],
+# [63., 1., 1., 58., 1.],
+# [36., 1., 0., 52., 1.],
+# [23., 0., 1., 40., 0.],
+# [50., 1., 1., 28., 0.]])
+# >>> trees = tree_grow_b(x=x,y=y,nmin=2,minleaf=1,nfeat=4,m=50)
+# >>> type(trees), type(trees[0]), len(trees) == 50
+# (<class 'list'>, <class '__main__.Tree'>, True)
+# >>> tree_pred_b(x=x,tr=trees)
+# array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
+
+# """
+
+#- Miscellaneous functions:
+
+# def major_vote(classes):
+# """
+# /classes/ numpy.ndarray, 1D numpy array of zeroes and ones
+
+# Returns -> int
+
+# Uses numpy methods to determine whether the 1 or 0 elements form the
+# majority in the classes vector. Note that when the numbers of 1 and 0
+# elements are equal, it returns 0.
+
+# >>> y
+# array([0., 0., 0., 0., 0., 1., 1., 1., 1., 1.])
+# >>> major_vote(y)
+# 0
+
+# """
-# In the program categorical or numerical attributes are called cols for columns
-# The last column are the classes and will be called as classes in the program
+import numpy as np
+from sklearn import metrics
class Node:
@@ -58,9 +214,9 @@ class Node:
leaf node
"""
self.col = None
- # This weird numpy line gives the majority vote, which is 1 or 0
self.split_value_or_rows = major_vote(node_classes)
+
class Tree:
"""
Tree object that points towards the root node.
@@ -102,40 +258,117 @@ class Tree:
nodes[0] = node.right
return predictions
- # Work in progress tree printer
- #
- # def __repr__(self):
- # tree_string = ''
- # node = self.tree
- # depth = 0
- # nodelist = [node]
- # while nodelist:
- # node = nodelist.pop()
- # depth += 1
- # if node.col is not None:
- # left, right = node.left, node.right
- # nodelist += [left, right]
- # else:
- # continue
- # tree_string += '\n' + depth * ' '
- # tree_string += (depth + 4) * ' ' + '/' + ' ' * 2 + '\\'
- # tree_string += '\n' + ' ' * 2 * depth
- # for direc in left, right:
- # if not direc.split_value_or_rows%10:
- # tree_string += ' ' * 4
- # else:
- # tree_string += ' ' * 3
- # tree_string += str(int(direc.split_value_or_rows))
-
- # tree_string = depth * ' ' + str(int(self.tree.split_value_or_rows)) + tree_string
- # return tree_string
+def tree_grow(x=None, y=None, nmin=None, minleaf=None, nfeat=None) -> Tree:
+ """
+    Builds a classification tree from training data and labels, using
+    stopping rules as complexity parameters.
+ """
+ mask = np.full(len(x), True)
+ root = Node(split_value_or_rows=mask)
+ tr = Tree(root, (nmin, minleaf, nfeat))
+
+ nodelist = [root]
+ while nodelist:
+ node = nodelist.pop()
+ node_classes = y[node.split_value_or_rows]
+
+ if len(node_classes) < nmin:
+ node.is_leaf_node(node_classes)
+ continue
+
+ if impurity(node_classes) > 0:
+ node_rows = x[node.split_value_or_rows]
+ exhaustive_best_list = exhaustive_split_search(
+ node_rows, node_classes, minleaf)
+ if not exhaustive_best_list:
+ node.is_leaf_node(node_classes)
+ continue
+ best_split = min(exhaustive_best_list, key=lambda z: z[0])
+ nodelist += add_children(node, best_split)
+
+ else:
+ # impurity 0
+ node.is_leaf_node(node_classes)
+ continue
+ return tr
+
+
+def tree_pred(x=None, tr=None, true=None) -> np.array:
+ """
+ Predicts a binary label for each row in x using tr.predict.
+ """
+ y = tr.predict(x).astype(float)
+ nmin, minleaf, nfeat = tr.hyper_params
+ if true is not None:
+ print(
+ f'Results from: prediction single tree({nmin=}, {minleaf=}, {nfeat=})'
+ )
+        print(f'\t->Confusion matrix:\n{metrics.confusion_matrix(true, y)}')
+        print(f'\t->Accuracy:\n\t\t{metrics.accuracy_score(true, y)}')
+        print(f'\t->Precision:\n\t\t{metrics.precision_score(true, y)}')
+        print(f'\t->Recall:\n\t\t{metrics.recall_score(true, y)}')
+ return y
+
+
+def tree_grow_b(x=None,
+ y=None,
+ nmin=None,
+ minleaf=None,
+ nfeat=None,
+                m=None) -> list:
+ """
+    The m-times repeated application of tree_grow to bootstrap samples of
+    the data.
+ """
+ forest = []
+ for i in range(m):
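+        # Draw a bootstrap sample: len(x) row indices, drawn with replacement.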
+ choice = np.random.randint(len(x), size=len(x))
+ x_bag, y_bag = x[choice], y[choice]
+ forest.append(
+ tree_grow(x=x_bag,
+ y=y_bag,
+ nmin=nmin,
+ minleaf=minleaf,
+ nfeat=nfeat))
+ return forest
+
+
+def tree_pred_b(x=None, tr=None, true=None) -> np.array:
+ """
+    The repeated application of Tree.predict to construct a 2D array from
+    which a majority-vote label prediction is made for each row in x.
+ """
+ y_bag = np.zeros((len(x), len(tr)))
+ for i, tree in enumerate(tr):
+ y_bag[:, i] = tree.predict(x).astype(float)
+ nmin, minleaf, nfeat = tr[0].hyper_params
+ y = np.array([major_vote(y_bag[i]) for i in range(len(y_bag))])
+ if true is not None:
+ if nfeat == x.shape[1]:
+ print(
+ f'Results from: prediction bagged tree({nmin=}, {minleaf=}, {nfeat=}, trees={len(tr)})'
+ )
+ else:
+ print(
+ f'Results from: prediction random forest({nmin=}, {minleaf=}, {nfeat=}, trees={len(tr)})'
+ )
+        print(f'\t->Confusion matrix:\n{metrics.confusion_matrix(true, y)}')
+        print(f'\t->Accuracy:\n\t\t{metrics.accuracy_score(true, y)}')
+        print(f'\t->Precision:\n\t\t{metrics.precision_score(true, y)}')
+        print(f'\t->Recall:\n\t\t{metrics.recall_score(true, y)}')
+ return y
+
+
+#
+#
+# Put all helper functions below this comment!
def major_vote(classes):
"""
- @todo: Docstring for major_vote(classes
+    Returns zero or one, whichever occurs most often in the classes vector.
"""
return np.argmax(np.bincount(classes.astype(int)))
+
def impurity(array) -> float:
"""
Assumes the argument array is a one dimensional vector of zeroes and ones.
@@ -211,7 +444,7 @@ def bestsplit(x, y, minleaf) -> None:
def exhaustive_split_search(rows, classes, minleaf):
"""
- @todo: Docstring for exhaustive_split_search
+    Applies bestsplit to each of the nfeat candidate columns.
"""
    # We need enumerate because we want to know on which col (i)
    # (age,married,house,income,gender) the split is made
@@ -227,7 +460,8 @@ def exhaustive_split_search(rows, classes, minleaf):
def add_children(node, best_split):
"""
- @todo: Docstring for add_children
+    Processes the split into the tree data structure and returns the
+    children that remain to be split, to be appended to the nodelist in
+    tree_grow.
"""
current_mask = node.split_value_or_rows
mask_left, mask_right, node_split_value, node_col = best_split[1:]
@@ -253,172 +487,3 @@ def update_mask(mask, current_mask):
copy = np.array(current_mask, copy=True)
copy[current_mask == True] = mask
return copy
-
-
-#
-#
-# Put all helper functions above this comment!
-
-
-def tree_grow(x=None,
- y=None,
- nmin=None,
- minleaf=None,
- nfeat=None,
- **defaults) -> Tree:
- """
- @todo: Docstring for tree_grow
- """
- mask = np.full(len(x), True)
- root = Node(split_value_or_rows=mask)
- tr = Tree(root, (nmin, minleaf, nfeat))
-
- nodelist = [root]
- while nodelist:
- node = nodelist.pop()
- node_classes = y[node.split_value_or_rows]
-
- if len(node_classes) < nmin:
- node.is_leaf_node(node_classes)
- continue
-
- if impurity(node_classes) > 0:
- node_rows = x[node.split_value_or_rows]
- exhaustive_best_list = exhaustive_split_search(
- node_rows, node_classes, minleaf)
- if not exhaustive_best_list:
- node.is_leaf_node(node_classes)
- continue
- best_split = min(exhaustive_best_list, key=lambda z: z[0])
- nodelist += add_children(node, best_split)
-
- else:
- # impurity 0
- node.is_leaf_node(node_classes)
- continue
- return tr
-
-
-def tree_grow_b(x=None,
- y=None,
- nmin=None,
- minleaf=None,
- nfeat=None,
- m=None,
- **defaults) -> Tree:
- forest = []
- for i in range(m):# ,desc=f'planting a forest, growing {m} trees'):
- choice = np.random.randint(len(x),size=len(x))
- x_bag, y_bag = x[choice], y[choice]
- forest.append(tree_grow(x=x_bag,y=y_bag,nmin=nmin,minleaf=minleaf,nfeat=nfeat))
- return forest
-
-
-
-def tree_pred(x=None, tr=None, training=None, **defaults) -> np.array:
- """
- @todo: Docstring for tree_pred
- """
- y = tr.predict(x).astype(float)
- nmin, minleaf, nfeat = tr.hyper_params
- if training is not None:
- # print(np.mean(training == y))
- print(
- f'Results from: prediction single tree({nmin=}, {minleaf=}, {nfeat=})'
- )
- print(
- f'\t->Confusion matrix:\n{metrics.confusion_matrix(y, training)}')
- print(f'\t->Accuracy:\n\t\t{metrics.accuracy_score(y, training)}')
- print(f'\t->Precission:\n\t\t{metrics.precision_score(y, training)}')
- print(f'\t->Recall:\n\t\t{metrics.recall_score(y, training)}')
- return y
-
-
-def tree_pred_b(x=None, tr=None, training=None, **defaults) -> np.array:
- y_bag = np.zeros((len(x), len(tr)))
- for i, tree in enumerate(tr): # , total=len(tr),desc=f'making also {len(tr)} predictions!'):
- y_bag[:,i] = tree.predict(x).astype(float)
- nmin, minleaf, nfeat = tr[0].hyper_params
- y = np.array([major_vote(y_bag[i]) for i in range(len(y_bag))])
- if training is not None:
- # print(np.mean(training == y))
- if nfeat == x.shape[1]:
- print(
- f'Results from: prediction bagged tree({nmin=}, {minleaf=}, {nfeat=}, trees={len(tr)})'
- )
- else:
- print(
- f'Results from: prediction random forest({nmin=}, {minleaf=}, {nfeat=}, trees={len(tr)})'
- )
- print(
- f'\t->Confusion matrix:\n{metrics.confusion_matrix(y, training)}')
- print(f'\t->Accuracy:\n\t\t{metrics.accuracy_score(y, training)}')
- print(f'\t->Precission:\n\t\t{metrics.precision_score(y, training)}')
- print(f'\t->Recall:\n\t\t{metrics.recall_score(y, training)}')
- return y
-
-
-if __name__ == '__main__':
- credit_data = np.genfromtxt('./data/credit_score.txt',
- delimiter=',',
- skip_header=True)
-
- pima_indians = np.genfromtxt('./data/pima_indians.csv',
- delimiter=',',
- skip_header=True)
-
- # print("\nDataset: credit data")
- # tree_pred(x=credit_data[:, :5],
- # tr=tree_grow(x=credit_data[:, 0:5],
- # y=credit_data[:, 5],
- # nmin=2,
- # minleaf=1,
- # nfeat=5),
- # training=credit_data[:, 5])
-
- # print("\nDataset: credit data")
- # tree_pred_b(x=credit_data[:, :5],
- # tr=tree_grow_b(x=credit_data[:, 0:5],
- # y=credit_data[:, 5],
- # nmin=2,
- # minleaf=1,
- # nfeat=4,
- # m=50),
- # training=credit_data[:, 5])
-
- # print('\nDataset: pima indians')
- # tree_pred(x=pima_indians[:, :8],
- # tr=tree_grow(x=pima_indians[:, :8],
- # y=pima_indians[:, 8],
- # nmin=20,
- # minleaf=5,
- # nfeat=pima_indians.shape[1] - 1),
- # training=pima_indians[:, 8])
-
-
- # print('\nDataset: pima indians')
- # tree_pred_b(x=pima_indians[:, :8],
- # tr=tree_grow_b(x=pima_indians[:, :8],
- # y=pima_indians[:, 8],
- # nmin=20,
- # minleaf=5,
- # nfeat=4,
- # m=5),
- # training=pima_indians[:, 8])
-
-
-
- # Time profiles: see what functions take what time! :)
-
- # print("prediction metrics single tree pima indians:")
- # cProfile.run("tree_pred(x=credit_data[:,:5], tr=tree_grow(x=credit_data[:,0:5], y=credit_data[:,5], nmin=2, minleaf=1, nfeat=5), dataset='credit score')", 'restats')
-
- # Time profile of pima indians data prediction with single tree
- # print("prediction metrics single tree pima indians:")
- cProfile.run(
- "tree_pred_b(x=pima_indians[:, :8], tr=tree_grow_b(x=pima_indians[:, :8], y=pima_indians[:, 8], nmin=20, minleaf=5, nfeat=4, m=5), training=pima_indians[:, 8])",
- 'restats')
-
- p = pstats.Stats('restats')
- p.sort_stats(SortKey.TIME)
- p.print_stats()