summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--tree.py (renamed from assignment1.py)321
1 files changed, 149 insertions, 172 deletions
diff --git a/assignment1.py b/tree.py
index 4b264d7..e971bd4 100644
--- a/assignment1.py
+++ b/tree.py
@@ -1,8 +1,11 @@
import numpy as np
+import cProfile
+import pstats
+# import tqdm
-credit_data = np.genfromtxt('./credit_score.txt',
- delimiter=',',
- skip_header=True)
+# from tqdm import trange
+from pstats import SortKey
+from sklearn import metrics
# age,married,house,income,gender,class
# [(22, 0, 0, 28, 1, 0)
@@ -56,9 +59,7 @@ class Node:
"""
self.col = None
# This weird numpy line gives the majority vote, which is 1 or 0
- self.split_value_or_rows = np.argmax(
- np.bincount(node_classes.astype(int)))
-
+ self.split_value_or_rows = major_vote(node_classes)
class Tree:
"""
@@ -85,7 +86,7 @@ class Tree:
# De index van de row van x die we in de boom willen droppen
drop = 0
- while not all(pred_class in {0,1} for pred_class in nodes):
+ while not all(pred_class in {0, 1} for pred_class in nodes):
# Als de col None is dan is het een leaf node, dus dan is de row
# van x hier beland
if nodes[drop].col is None:
@@ -130,6 +131,11 @@ class Tree:
# tree_string = depth * ' ' + str(int(self.tree.split_value_or_rows)) + tree_string
# return tree_string
+def major_vote(classes):
+ """
+    @todo: Docstring for major_vote(classes)
+ """
+ return np.argmax(np.bincount(classes.astype(int)))
def impurity(array) -> int:
"""
@@ -146,26 +152,15 @@ def impurity(array) -> int:
>>> impurity(array)
0.23140495867768596
"""
- # Total labels
n_labels = len(array)
- if n_labels == 0:
- print(
- "division by zero will happen, child node is pure, doesnt contain anything"
- )
- n_labels = 1
- # Number of tuples labeled 1
n_labels_1 = array.sum()
- # Calculate the relative frequency of ones with respect to the total labels
rel_freq_1 = n_labels_1 / n_labels
- # Use the symmetry around the median property to also calculate the
- # relative frequency of zeroes
rel_freq_0 = 1 - rel_freq_1
- # Multiply the frequencies to get the gini index
gini_index = rel_freq_1 * rel_freq_0
return gini_index
-def bestsplit(x, y, min_leaf) -> None:
+def bestsplit(x, y, minleaf) -> None:
"""
x = vector of single col
y = vector of classes (last col in x)
@@ -184,46 +179,24 @@ def bestsplit(x, y, min_leaf) -> None:
36
"""
x_sorted = np.sort(np.unique(x))
- if len(x_sorted) <= 2:
- # Allows splitting on categorical (0 or 1) cols
- split_points = x_sorted / 2
- else:
- # Take average between consecutive numerical rows in the x col
- split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2
-
- # De toepassing van bestsplit verdeelt de x col vector in tweeen, twee
- # arrays van "x rows". Deze moeten we terug krijgen om de in de child nodes
- # bestsplit toe te passen.
- #
- # Deze lus berekent de best split value, en op basis daarvan weten we welke
- # twee "x rows" arrays we moeten returnen, en welke split value het beste
- # was natuurlijk.
+ split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2
# Hieren stoppen we (delta_i, split_value, rows_left, rows_right)
best_list = []
# Stop wanneer de array met split points leeg is
while split_points.size != 0:
- # Huidige split value
split_value = split_points[-1]
- # boolean masks om x rows mee te masken/slicen
mask_left, mask_right = x > split_value, x <= split_value
-
- # class voor beide split kanten
classes_left, classes_right = y[mask_left], y[mask_right]
# Kijk of er genoeg rows in de gesplitte nodes terechtkomen, anders
- # mogen we de split niet toelaten vanwege de min_leaf constraint
- if len(classes_left) < min_leaf or len(classes_right) < min_leaf:
- # Haal de huidige split_point uit split_points
+ # mogen we de split niet toelaten vanwege de minleaf constraint
+ if len(classes_left) < minleaf or len(classes_right) < minleaf:
split_points = split_points[:-1]
continue
- # delta_i formule, improved by not taking the parent impurity into
- # account, and not making the weighted average but the weigthed sum
- # only (thanks Lonnie)
- delta_i = (
- impurity(classes_left) * len(classes_left) +
- impurity(classes_right) * len(classes_right))
+ delta_i = (impurity(classes_left) * len(classes_left) +
+ impurity(classes_right) * len(classes_right))
# stop huidige splits in de lijst om best split te berekenen
best_list.append((delta_i, mask_left, mask_right, split_value))
# Haal de huidige split_point uit split_points
@@ -237,25 +210,19 @@ def bestsplit(x, y, min_leaf) -> None:
return False
-def exhaustive_split_search(rows, classes, min_leaf):
+def exhaustive_split_search(rows, classes, minleaf):
"""
@todo: Docstring for exhaustive_split_search
"""
- print("\t\t->entering exhaustive split search")
- # We hebben enumerate nodig, want we willen weten op welke col
+ # We hebben enumerate nodig, want we willen weten op welke col (i)
# (age,married,house,income,gender) we een split doen
exhaustive_best_list = []
- print(f"Rows:\n{rows},\n Classes:\n{classes}")
for i, col in enumerate(rows.transpose()):
- # calculate the best split for the col that satisfies the min_leaf
- # constraint
- col_best_split = bestsplit(col, classes, min_leaf)
- # Check if there was a split fullfilling the min leaf rule
+ col_best_split = bestsplit(col, classes, minleaf)
if col_best_split:
# add for which row we calculated the best split
col_best_split += (i, )
exhaustive_best_list.append(col_best_split)
- print("\t\t->returning from exhaustive split search")
return exhaustive_best_list
@@ -263,14 +230,8 @@ def add_children(node, best_split):
"""
@todo: Docstring for add_children
"""
- print("\t\t\t->entering add children")
- # The mask that was used to get the rows for the current node from x, we
- # need this to update the rows for the children
current_mask = node.split_value_or_rows
-
- # Unpacking the best_split tuple
mask_left, mask_right, node_split_value, node_col = best_split[1:]
- # print(f"{mask_left}, {mask_right}, {node_split_value}, {node_col}")
# Give the current node the split_value and col it needs for predictions
node.split_value_or_rows, node.col = node_split_value, node_col
@@ -280,8 +241,8 @@ def add_children(node, best_split):
mask_right, current_mask)
# Adding the pointer between parent and children
- node.add_split(Node(split_value_or_rows=mask_left), Node(split_value_or_rows=mask_right))
- print("\t\t\t->children added to node and node list\n")
+ node.add_split(Node(split_value_or_rows=mask_left),
+ Node(split_value_or_rows=mask_right))
return [node.left, node.right]
@@ -290,26 +251,8 @@ def update_mask(mask, current_mask):
Updates the spit bool array from any dimension to an array with length
equal to the total number of rows in dataset x.
"""
- print(
- "\t\t\t\t->entering update mask to update mask that calculates which rows belong to child"
- )
- # Copy heb ik gedaan omdat we een slice assignment doen.
- #
- # Het punt van deze functie is dat tijdens het tree growen het aantal rows
- # in een node afneemt. Deze functie update de bool array die gebruikt is om
- # te splitten in bestsplit naar een array met als lengte het totale aantal
- # rows in de data set x. Op deze manier kunnen we in tree grow de totale x
- # splitten.
- #
- # Een andere optie zou zijn om de rows letterlijk tijdelijk op te slaan in
- # nodes... weet niet wat beter is, het kan best veel geheugen in beslag
- # nemen voor grote dataset neem ik aan... Op de manier hier zijn er altijd
- # maar 1D bool arrays, die dus een lagere dimensionaliteit hebben als x veel columnen/rows heeft.
- print(f"Before update:\n{mask}")
copy = np.array(current_mask, copy=True)
copy[np.where(current_mask)] = mask
- print(f"After update:\n{copy}")
- print("\t\t\t\t->updated row mask for child node")
return copy
@@ -320,129 +263,163 @@ def update_mask(mask, current_mask):
def tree_grow(x=None,
y=None,
- n_min=None,
- min_leaf=None,
- n_feat=None,
+ nmin=None,
+ minleaf=None,
+ nfeat=None,
**defaults) -> Tree:
"""
@todo: Docstring for tree_grow
"""
- # De nodelist heeft in het begin alleen een root node met alle rows van x,
- # omdat alle rows in de root in acht worden genomen voor bestsplit berekening.
- #
- # Dit representeren we met een boolean mask over x, met lengte het aantal rows
- # in x en elementen True. Deze boolean mask zullen we repeatedly gebruiken als een
- # mask over x om de rows voor bestsplit op te halen.
mask = np.full(len(x), True)
-
- # Het eerste node object moet nu geinstantieerd worden
root = Node(split_value_or_rows=mask)
+ tr = Tree(root, (nmin, minleaf, nfeat))
- # We instantieren ook gelijk het Tree object
- tr = Tree(root, (n_min, min_leaf, n_feat))
-
- # De eerste nodelist heeft alleen de root node, daarna zijn twee childs,
- # etc. totdat alle splits gemaakt zijn en de lijst leeg is.
nodelist = [root]
-
while nodelist:
- print("->Taking new node from the node list")
- # Pop de current node uit de nodelist
node = nodelist.pop()
- # Gebruik de boolean mask van de node om de rows in de node uit x te halen
- node_rows = x[node.split_value_or_rows]
- # print(node_rows)
- # Gebruik de boolean mask van de node om de classes in de node uit y te halen
node_classes = y[node.split_value_or_rows]
- # print(f"{node_classes}, {node_rows}")
-
- # Test of de node een leaf node is met n_min
- if len(node_rows) < n_min:
+ if len(node_classes) < nmin:
node.is_leaf_node(node_classes)
- print(
- "\t->Node has less rows than n_min, it is a leaf node, continueing to next node"
- )
continue
- print("\t->Node has more rows than n_min, it is not a leaf node")
- # Als de node niet puur is, probeer dan te splitten
if impurity(node_classes) > 0:
- print("\t->Node is not pure yet starting exhaustive split search")
- # We gaan exhaustively voor de rows in de node over de cols
- # (age,married,house,income,gender) om de bestsplit te
- # bepalen
- #
- # We geven min_leaf als argument omdat:
- # "If the algorithm performs a split, it should be the best split that meets the minleaf constrain"
- #
- # We krijgen een exhaustive lijst terug met splits
+ node_rows = x[node.split_value_or_rows]
exhaustive_best_list = exhaustive_split_search(
- node_rows, node_classes, min_leaf)
- # print(exhaustive_best_list)
- # Als de lijst leeg is zijn er geen potentieele splits die voldoen
- # an de min leaf constraint
+ node_rows, node_classes, minleaf)
if not exhaustive_best_list:
node.is_leaf_node(node_classes)
- print(
- "\t\t->No split that fullfils min_leaf was found continueing to next node"
- )
continue
- print(
- "\t->Exhaustive search found a split fulfilling the min_leaf rule!"
- )
- # Hier halen we de beste split, en rows voor de child/split nodes
- # uit de exhaustive best list
best_split = min(exhaustive_best_list, key=lambda z: z[0])
- print(f"\n######################best split tuple (delta_i, bool arrays voor child rows, split value, col)############################\n\n{best_split}\n\n###########################################################################################################################\n")
- # Hier voegen we de twee nieuwe nodes toe aan de gesplitte "parent"
nodelist += add_children(node, best_split)
- # node.add_split(left_child_node, right_child_node)
+
else:
+ # impurity 0
node.is_leaf_node(node_classes)
- print("\t\t->The node is already pure, it is necessarily a leaf!")
continue
- # print(tr)
return tr
-def tree_pred(x=None, tr=None, **defaults) -> np.array:
+def tree_grow_b(x=None,
+ y=None,
+ nmin=None,
+ minleaf=None,
+ nfeat=None,
+ m=None,
+ **defaults) -> Tree:
+ forest = []
+ for i in range(m):# ,desc=f'planting a forest, growing {m} trees'):
+ choice = np.random.randint(len(x),size=len(x))
+ x_bag, y_bag = x[choice], y[choice]
+ forest.append(tree_grow(x=x_bag,y=y_bag,nmin=nmin,minleaf=minleaf,nfeat=nfeat))
+ return forest
+
+
+
+def tree_pred(x=None, tr=None, training=None, **defaults) -> np.array:
"""
@todo: Docstring for tree_pred
"""
- y = tr.predict(x)
- n_min, min_leaf, n_feat = tr.hyper_params
- print("-"*80+f"\nPrediction parameters:\nn_min={n_min}\nmin_leaf={min_leaf}\nn_feat={n_feat}")
- print(f"\nFor x rows\n{x}\n\n predicted classes are:\n {y}")
+ y = tr.predict(x).astype(float)
+ nmin, minleaf, nfeat = tr.hyper_params
+ if training is not None:
+ # print(np.mean(training == y))
+ print(
+ f'Results from: prediction single tree({nmin=}, {minleaf=}, {nfeat=})'
+ )
+ print(
+ f'\t->Confusion matrix:\n{metrics.confusion_matrix(y, training)}')
+ print(f'\t->Accuracy:\n\t\t{metrics.accuracy_score(y, training)}')
+        print(f'\t->Precision:\n\t\t{metrics.precision_score(y, training)}')
+ print(f'\t->Recall:\n\t\t{metrics.recall_score(y, training)}')
+ return y
+
+
+def tree_pred_b(x=None, tr=None, training=None, **defaults) -> np.array:
+ y_bag = np.zeros((len(x), len(tr)))
+ for i, tree in enumerate(tr): # , total=len(tr),desc=f'making also {len(tr)} predictions!'):
+ y_bag[:,i] = tree.predict(x).astype(float)
+ nmin, minleaf, nfeat = tr[0].hyper_params
+ y = np.array([major_vote(y_bag[i]) for i in range(len(y_bag))])
+ if training is not None:
+ # print(np.mean(training == y))
+ if nfeat == x.shape[1]:
+ print(
+ f'Results from: prediction bagged tree({nmin=}, {minleaf=}, {nfeat=})'
+ )
+ else:
+ print(
+ f'Results from: prediction random forest tree({nmin=}, {minleaf=}, {nfeat=})'
+ )
+ print(
+ f'\t->Confusion matrix:\n{metrics.confusion_matrix(y, training)}')
+ print(f'\t->Accuracy:\n\t\t{metrics.accuracy_score(y, training)}')
+        print(f'\t->Precision:\n\t\t{metrics.precision_score(y, training)}')
+ print(f'\t->Recall:\n\t\t{metrics.recall_score(y, training)}')
return y
if __name__ == '__main__':
- #### IMPURITY TEST
- # array=np.array([1,0,1,1,1,0,0,1,1,0,1])
- # print(impurity(array))
- # Should give 0.23....
-
- #### BESTSPLIT TEST
- # print(bestsplit(credit_data[:, 3], credit_data[:, 5]))
- # Should give 36
-
- #### TREE_GROW TEST
- tree_grow_defaults = {
- 'x': credit_data[:, :5],
- 'y': credit_data[:, 5],
- 'n_min': 2,
- 'min_leaf': 1,
- 'n_feat': None # 5, should be 5 for bootstrapping folds
- }
-
- # Calling the tree grow, unpacking default as argument
- # tree_grow(**tree_grow_defaults)
-
- #### TREE_PRED TEST
- tree_pred_defaults = {
- 'x': credit_data[:, :5],
- 'tr': tree_grow(**tree_grow_defaults)
- }
-
- tree_pred(**tree_pred_defaults)
+ credit_data = np.genfromtxt('./data/credit_score.txt',
+ delimiter=',',
+ skip_header=True)
+
+ pima_indians = np.genfromtxt('./data/pima_indians.csv',
+ delimiter=',',
+ skip_header=True)
+
+ print("Dataset: credit data")
+ tree_pred(x=credit_data[:, :5],
+ tr=tree_grow(x=credit_data[:, 0:5],
+ y=credit_data[:, 5],
+ nmin=2,
+ minleaf=1,
+ nfeat=5),
+ training=credit_data[:, 5])
+
+ print("Dataset: credit data")
+ tree_pred_b(x=credit_data[:, :5],
+ tr=tree_grow_b(x=credit_data[:, 0:5],
+ y=credit_data[:, 5],
+ nmin=2,
+ minleaf=1,
+ nfeat=4,
+ m=50),
+ training=credit_data[:, 5])
+
+ print('\nDataset: pima indians')
+ tree_pred(x=pima_indians[:, :8],
+ tr=tree_grow(x=pima_indians[:, :8],
+ y=pima_indians[:, 8],
+ nmin=20,
+ minleaf=5,
+ nfeat=pima_indians.shape[1] - 1),
+ training=pima_indians[:, 8])
+
+
+ print('\nDataset: pima indians (takes 2 min max, big data bootstrap)')
+ tree_pred_b(x=pima_indians[:, :8],
+ tr=tree_grow_b(x=pima_indians[:, :8],
+ y=pima_indians[:, 8],
+ nmin=20,
+ minleaf=5,
+ nfeat=4,
+ m=5),
+ training=pima_indians[:, 8])
+
+
+
+ # Time profiles: see what functions take what time! :)
+
+ # print("prediction metrics single tree pima indians:")
+ # cProfile.run("tree_pred(x=credit_data[:,:5], tr=tree_grow(x=credit_data[:,0:5], y=credit_data[:,5], nmin=2, minleaf=1, nfeat=5), dataset='credit score')", 'restats')
+
+ # Time profile of pima indians data prediction with single tree
+ # print("prediction metrics single tree pima indians:")
+ # cProfile.run(
+ # "tree_pred_b(x=pima_indians[:, :8], tr=tree_grow_b(x=pima_indians[:, :8], y=pima_indians[:, 8], nmin=20, minleaf=5, nfeat=pima_indians.shape[1] - 1, m=50), training=pima_indians[:, 8])",
+ # 'restats')
+
+ # p = pstats.Stats('restats')
+ # p.sort_stats(SortKey.TIME)
+ # p.print_stats()