summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Mike Vink <mike1994vink@gmail.com>, 2020-09-25 08:03:37 +0200
committer: Mike Vink <mike1994vink@gmail.com>, 2020-09-25 08:03:37 +0200
commit: cc3cb48854d4575e5b533473d5f8a0c164c07658 (patch)
tree: 6605b89d3897132158c48a8059e7c738e5af24b7
parent: dd8c5300b721d62a983add45ae38e0ef4425921a (diff)
Update: this tree is fast and gets good results
-rw-r--r-- tree.py 67
1 file changed, 61 insertions, 6 deletions
diff --git a/tree.py b/tree.py
index 1dee59c..e971bd4 100644
--- a/tree.py
+++ b/tree.py
@@ -1,7 +1,9 @@
import numpy as np
import cProfile
import pstats
+# import tqdm
+# from tqdm import trange
from pstats import SortKey
from sklearn import metrics
@@ -57,9 +59,7 @@ class Node:
"""
self.col = None
# This weird numpy line gives the majority vote, which is 1 or 0
- self.split_value_or_rows = np.argmax(
- np.bincount(node_classes.astype(int)))
-
+ self.split_value_or_rows = major_vote(node_classes)
class Tree:
"""
@@ -131,6 +131,11 @@ class Tree:
# tree_string = depth * ' ' + str(int(self.tree.split_value_or_rows)) + tree_string
# return tree_string
+def major_vote(classes):
+ """
+ Return the most frequent label in `classes` (cast to int; ties go to the smallest label).
+ """
+ return np.argmax(np.bincount(classes.astype(int)))
def impurity(array) -> int:
"""
@@ -302,7 +307,13 @@ def tree_grow_b(x=None,
nfeat=None,
m=None,
**defaults) -> Tree:
- pass
+ forest = []
+ for i in range(m):# ,desc=f'planting a forest, growing {m} trees'):
+ choice = np.random.randint(len(x),size=len(x))
+ x_bag, y_bag = x[choice], y[choice]
+ forest.append(tree_grow(x=x_bag,y=y_bag,nmin=nmin,minleaf=minleaf,nfeat=nfeat))
+ return forest
+
def tree_pred(x=None, tr=None, training=None, **defaults) -> np.array:
@@ -323,8 +334,29 @@ def tree_pred(x=None, tr=None, training=None, **defaults) -> np.array:
print(f'\t->Recall:\n\t\t{metrics.recall_score(y, training)}')
return y
+
def tree_pred_b(x=None, tr=None, training=None, **defaults) -> np.array:
- pass
+ y_bag = np.zeros((len(x), len(tr)))
+ for i, tree in enumerate(tr): # , total=len(tr),desc=f'making also {len(tr)} predictions!'):
+ y_bag[:,i] = tree.predict(x).astype(float)
+ nmin, minleaf, nfeat = tr[0].hyper_params
+ y = np.array([major_vote(y_bag[i]) for i in range(len(y_bag))])
+ if training is not None:
+ # print(np.mean(training == y))
+ if nfeat == x.shape[1]:
+ print(
+ f'Results from: prediction bagged tree({nmin=}, {minleaf=}, {nfeat=})'
+ )
+ else:
+ print(
+ f'Results from: prediction random forest tree({nmin=}, {minleaf=}, {nfeat=})'
+ )
+ print(
+ f'\t->Confusion matrix:\n{metrics.confusion_matrix(y, training)}')
+ print(f'\t->Accuracy:\n\t\t{metrics.accuracy_score(y, training)}')
+ print(f'\t->Precission:\n\t\t{metrics.precision_score(y, training)}')
+ print(f'\t->Recall:\n\t\t{metrics.recall_score(y, training)}')
+ return y
if __name__ == '__main__':
@@ -345,6 +377,16 @@ if __name__ == '__main__':
nfeat=5),
training=credit_data[:, 5])
+ print("Dataset: credit data")
+ tree_pred_b(x=credit_data[:, :5],
+ tr=tree_grow_b(x=credit_data[:, 0:5],
+ y=credit_data[:, 5],
+ nmin=2,
+ minleaf=1,
+ nfeat=4,
+ m=50),
+ training=credit_data[:, 5])
+
print('\nDataset: pima indians')
tree_pred(x=pima_indians[:, :8],
tr=tree_grow(x=pima_indians[:, :8],
@@ -354,6 +396,19 @@ if __name__ == '__main__':
nfeat=pima_indians.shape[1] - 1),
training=pima_indians[:, 8])
+
+ print('\nDataset: pima indians (takes 2 min max, big data bootstrap)')
+ tree_pred_b(x=pima_indians[:, :8],
+ tr=tree_grow_b(x=pima_indians[:, :8],
+ y=pima_indians[:, 8],
+ nmin=20,
+ minleaf=5,
+ nfeat=4,
+ m=5),
+ training=pima_indians[:, 8])
+
+
+
# Time profiles: see what functions take what time! :)
# print("prediction metrics single tree pima indians:")
@@ -362,7 +417,7 @@ if __name__ == '__main__':
# Time profile of pima indians data prediction with single tree
# print("prediction metrics single tree pima indians:")
# cProfile.run(
- # "tree_pred(x=pima_indians[:,:8], tr=tree_grow(x=pima_indians[:,:8], y=pima_indians[:,8], nmin=20, minleaf=5, nfeat=pima_indians.shape[1]-1), training=pima_indians[:,8])",
+ # "tree_pred_b(x=pima_indians[:, :8], tr=tree_grow_b(x=pima_indians[:, :8], y=pima_indians[:, 8], nmin=20, minleaf=5, nfeat=pima_indians.shape[1] - 1, m=50), training=pima_indians[:, 8])",
# 'restats')
# p = pstats.Stats('restats')