summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--assignment1.py306
1 files changed, 118 insertions, 188 deletions
diff --git a/assignment1.py b/assignment1.py
index 2a670c2..00432ec 100644
--- a/assignment1.py
+++ b/assignment1.py
@@ -1,4 +1,3 @@
-import time
import numpy as np
credit_data = np.genfromtxt('./credit_score.txt',
@@ -60,23 +59,11 @@ class Node:
np.bincount(node_classes.astype(int)))
-# class Leaf:
-# """
-# Simple class that contains only the majority class in the leaf node.
-# """
-# def __init__(self, maj_class):
-# """Initialises the majority vote.
-
-# /maj_class/ @todo
-
-# """
-
-
class Tree:
"""
Tree object that points towards the root node.
"""
- def __init__(self, root_node_obj):
+ def __init__(self, root_node_obj, hyper_params):
"""Initialises only by pointing to a Node object.
/root_node_obj/ is a node object that is made before entering the main
@@ -84,31 +71,64 @@ class Tree:
"""
self.tree = root_node_obj
- def __repr__(self):
- tree_string = ''
- node = self.tree
- depth = 0
- nodelist = [node]
- while nodelist:
- node = nodelist.pop()
- depth += 1
- if node.col is not None:
- left, right = node.left, node.right
- nodelist += [left, right]
- else:
+ self.hyper_params = hyper_params
+
+ def predict(self, x):
+ """
+ Makes a list of root nodes, and drops one row of x through the tree per
+ loop
+ """
+ # Maak een lijst van nodes, wiens indexes overeen komen met de rows in
+ # x die we willen droppen
+ nodes = [self.tree] * len(x)
+
+ # De index van de row van x die we in de boom willen droppen
+ drop = 0
+ while not set(nodes).issubset({0, 1}):
+ # Als de col None is dan is het een leaf node, dus dan is de row
+ # van x hier beland
+ if nodes[drop].col is None:
+ nodes[drop] = nodes[drop].split_value_or_rows
+ drop += 1
continue
- tree_string += '\n' + depth * ' '
- tree_string += (depth + 4) * ' ' + '/' + ' ' * 2 + '\\'
- tree_string += '\n' + ' ' * 2 * depth
- for direc in left, right:
- if not direc.split_value_or_rows%10:
- tree_string += ' ' * 4
- else:
- tree_string += ' ' * 3
- tree_string += str(int(direc.split_value_or_rows))
-
- tree_string = depth * ' ' + str(int(self.tree.split_value_or_rows)) + tree_string
- return tree_string
+
+ # Vergelijk de x col (age,married,house,income,gender,class), in de
+ # gedropte row met het split value van de node. Op basis hiervan
+ # drop naar links of rechts
+ if x[drop, nodes[drop].col] > nodes[drop].split_value_or_rows:
+ nodes[drop] = nodes[drop].left
+ else:
+ nodes[drop] = nodes[drop].right
+ return np.array(nodes)
+
+ # Work in progress tree printer
+ #
+ # def __repr__(self):
+ # tree_string = ''
+ # node = self.tree
+ # depth = 0
+ # nodelist = [node]
+ # while nodelist:
+ # node = nodelist.pop()
+ # depth += 1
+ # if node.col is not None:
+ # left, right = node.left, node.right
+ # nodelist += [left, right]
+ # else:
+ # continue
+ # tree_string += '\n' + depth * ' '
+ # tree_string += (depth + 4) * ' ' + '/' + ' ' * 2 + '\\'
+ # tree_string += '\n' + ' ' * 2 * depth
+ # for direc in left, right:
+ # if not direc.split_value_or_rows%10:
+ # tree_string += ' ' * 4
+ # else:
+ # tree_string += ' ' * 3
+ # tree_string += str(int(direc.split_value_or_rows))
+
+ # tree_string = depth * ' ' + str(int(self.tree.split_value_or_rows)) + tree_string
+ # return tree_string
+
def impurity(array) -> int:
"""
@@ -208,14 +228,15 @@ def bestsplit(x, y, min_leaf) -> None:
best_list.append((delta_i, mask_left, mask_right, split_value))
# Haal de huidige split_point uit split_points
split_points = split_points[:-1]
- # Bereken de best split voor deze x col
+
+ # Bereken de best split voor deze x col, als er ten minste 1 bestaat die
+ # voldoet aan min leaf
if best_list:
return max(best_list, key=lambda x: x[0])
else:
return False
-
def exhaustive_split_search(rows, classes, min_leaf):
"""
@todo: Docstring for exhaustive_split_search
@@ -224,7 +245,7 @@ def exhaustive_split_search(rows, classes, min_leaf):
# We hebben enumerate nodig, want we willen weten op welke col
# (age,married,house,income,gender) we een split doen
exhaustive_best_list = []
- # print(f"{rows=}, {classes=}")
+ print(f"Rows:\n{rows},\n Classes:\n{classes}")
for i, col in enumerate(rows.transpose()):
# calculate the best split for the col that satisfies the min_leaf
# constraint
@@ -232,12 +253,13 @@ def exhaustive_split_search(rows, classes, min_leaf):
# Check if there was a split fullfilling the min leaf rule
if col_best_split:
# add for which row we calculated the best split
- col_best_split += (i,)
+ col_best_split += (i, )
exhaustive_best_list.append(col_best_split)
print("\t\t->returning from exhaustive split search")
return exhaustive_best_list
-def add_children(node, x, best_split):
+
+def add_children(node, best_split):
"""
@todo: Docstring for add_children
"""
@@ -247,54 +269,49 @@ def add_children(node, x, best_split):
current_mask = node.split_value_or_rows
# Unpacking the best_split tuple
- mask_left, mask_right, node_split_value, node_col = best_split[1:]
- # print(f"{mask_left=}, {mask_right=}, {node_split_value=}, {node_col=}")
+ mask_left, mask_right, node_split_value, node_col = best_split[1:]
+ # print(f"{mask_left}, {mask_right}, {node_split_value}, {node_col}")
# Give the current node the split_value and col it needs for predictions
node.split_value_or_rows, node.col = node_split_value, node_col
# Updating the row masks to give it to children, keeping numpy dimension consistent
- mask_left, mask_right = update_mask(mask_left, current_mask), update_mask(mask_right, current_mask)
+ mask_left, mask_right = update_mask(mask_left, current_mask), update_mask(
+ mask_right, current_mask)
- # Adding the link between parent and children
- node.left, node.right = Node(split_value_or_rows=mask_left), Node(split_value_or_rows=mask_right)
+ # Adding the pointer between parent and children
+ node.add_split(Node(split_value_or_rows=mask_left), Node(split_value_or_rows=mask_right))
print("\t\t\t->children added to node and node list\n")
return [node.left, node.right]
+
def update_mask(mask, current_mask):
"""
- @todo: Docstring for update_mask
+ Updates the spit bool array from any dimension to an array with length
+ equal to the total number of rows in dataset x.
"""
- print("\t\t\t\t->entering update mask to calculate which rows belong to child")
- # current_mask = np.array([ True, True, True, True, True, True, False, False, False, True])
- # print(f"Parent mask: {current_mask=}")
- # print(f"The result of bestsplit: {mask=}")
- # mask_left=np.array([False, True, False, False, False, True, True])
- # mask_right=np.array([ True, False, True, True, True, False, False])
+ print(
+ "\t\t\t\t->entering update mask to update mask that calculates which rows belong to child"
+ )
+ # Copy heb ik gedaan omdat we een slice assignment doen.
+ #
+ # Het punt van deze functie is dat tijdens het tree growen het aantal rows
+ # in een node afneemt. Deze functie update de bool array die gebruikt is om
+ # te splitten in bestsplit naar een array met als lengte het totale aantal
+ # rows in de data set x. Op deze manier kunnen we in tree grow de totale x
+ # splitten.
+ #
+ # Een andere optie zou zijn om de rows letterlijk tijdelijk op te slaan in
+ # nodes... weet niet wat beter is, het kan best veel geheugen in beslag
+ # nemen voor grote dataset neem ik aan... Op de manier hier zijn er altijd
+ # maar 1D bool arrays, die dus een lagere dimensionaliteit hebben als x veel columnen/rows heeft.
+ print(f"Before update:\n{mask}")
copy = np.array(current_mask, copy=True)
copy[np.where(current_mask)] = mask
- # print(f"Child mask:{current_mask=}")
-
- # print(f"{current_mask=}")
- # print(f"{mask_right=}")
- # current_mask[np.where(current_mask)] = mask_left
- # print(f"{current_mask=}")
- # mask_right = np.where(current_mask, False, True)
- # print(f"{mask_right=}")
-
- # current_row_indexs = np.where(current_mask, mask_left, mask_right)
- # print(f"{current_mask[current_row_indexs]=}")
- # current_mask[current_row_indexs] = mask_left
-
-
- # print(f"{current_row_indexs=}")
- # print(f"{mask_left=}")
- # print(f"{len(mask_left)=}")
- # print(f"{mask_right=}")
- # print(f"{len(mask_right)=}")
+ print(f"After update:\n{copy}")
print("\t\t\t\t->updated row mask for child node")
return copy
-
+
#
#
@@ -322,7 +339,7 @@ def tree_grow(x=None,
root = Node(split_value_or_rows=mask)
# We instantieren ook gelijk het Tree object
- tr = Tree(root)
+ tr = Tree(root, (n_min, min_leaf, n_feat))
# De eerste nodelist heeft alleen de root node, daarna zijn twee childs,
# etc. totdat alle splits gemaakt zijn en de lijst leeg is.
@@ -338,12 +355,14 @@ def tree_grow(x=None,
# Gebruik de boolean mask van de node om de classes in de node uit y te halen
node_classes = y[node.split_value_or_rows]
- # print(f"{node_classes=}, {node_rows=}")
+ # print(f"{node_classes}, {node_rows}")
# Test of de node een leaf node is met n_min
if len(node_rows) < n_min:
node.is_leaf_node(node_classes)
- print("\t->Node has less rows than n_min, it is a leaf node, continueing to next node")
+ print(
+ "\t->Node has less rows than n_min, it is a leaf node, continueing to next node"
+ )
continue
print("\t->Node has more rows than n_min, it is not a leaf node")
@@ -358,129 +377,43 @@ def tree_grow(x=None,
# "If the algorithm performs a split, it should be the best split that meets the minleaf constrain"
#
# We krijgen een exhaustive lijst terug met splits
- exhaustive_best_list = exhaustive_split_search(node_rows, node_classes, min_leaf)
+ exhaustive_best_list = exhaustive_split_search(
+ node_rows, node_classes, min_leaf)
# print(exhaustive_best_list)
# Als de lijst leeg is zijn er geen potentieele splits die voldoen
# an de min leaf constraint
if not exhaustive_best_list:
node.is_leaf_node(node_classes)
- print("\t\t->No split that fullfils min_leaf was found continueing to next node")
+ print(
+ "\t\t->No split that fullfils min_leaf was found continueing to next node"
+ )
continue
- print("\t->Exhaustive search found a split fulfilling the min_leaf rule!")
+ print(
+ "\t->Exhaustive search found a split fulfilling the min_leaf rule!"
+ )
# Hier halen we de beste split, en rows voor de child/split nodes
# uit de exhaustive best list
best_split = max(exhaustive_best_list, key=lambda z: z[0])
+ print(f"\n######################best split tuple (delta_i, bool arrays voor child rows, split value, col)############################\n\n{best_split}\n\n###########################################################################################################################\n")
# Hier voegen we de twee nieuwe nodes toe aan de gesplitte "parent"
- nodelist += add_children(node, x, best_split)
+ nodelist += add_children(node, best_split)
# node.add_split(left_child_node, right_child_node)
else:
node.is_leaf_node(node_classes)
print("\t\t->The node is already pure, it is necessarily a leaf!")
continue
- print(tr)
+ # print(tr)
return tr
- # Initiate the nodelist with tuples of slice and class labels
- # nodelist = [Node(value=slices)]
- # tree = Tree(nodelist[0])
- # while nodelist:
- # current_node = nodelist.pop()
- # slices = current_node.value
- # node_classes = y[slices]
- # # print(node_classes)
-
- # # f'Current node will be leaf node if (( (number of data "tuples" in child node) < {n_min=} )) \n'
- # # put stopping rules here before making a split
- # if len(node_classes) < n_min:
- # current_node.value = Leaf(
- # np.argmax(np.bincount(node_classes.astype(int))))
- # print(f"leaf node has majority clas:\n{current_node.value.value=}")
- # continue
-
- # if impurity(node_classes) > 0:
- # # print(
- # # f"Exhaustive split search says, new node will check these rows for potential spliterinos:\n{x[slices]}"
- # # )
-
- # # If we arrive here ever we are splitting
- # # bestsplit(col, node_labels) ->
- # # {"slices": list[int], "split": numpyfloat, "best_delta_i": numpyfloat}
-
- # # slices (list) used for knowing which rows (int) to consider in a node
- # # best_split saved in current_node.value
- # # best_delta_i used to find best split among x_columns
- # best_dict = None
- # for i, x_col in enumerate(x[slices].transpose()):
- # print(
- # "\nExhaustive split search says; \"Entering new column\":")
- # col_split_dict = bestsplit(x_col, node_classes, slices)
-
- # if best_dict is not None:
- # if col_split_dict["delta_i"] > best_dict["delta_i"]:
- # best_dict = col_split_dict
- # best_dict["col"] = i
- # else:
- # best_dict = col_split_dict
- # best_dict["col"] = i
- # print("\nThe best split for current node:", best_dict)
-
- # # Here we store the splitted data into Node objects
- # current_node.value = best_dict["split"]
- # current_node.col = best_dict["col"]
- # # Split will not happen if (( (number of data "tuples" potential split) < {min_leaf=} ))\n'
- # if min([len(x) for x in best_dict["slices"].values()]) < min_leaf:
- # continue
- # else:
- # # Invert left and right because we want left to pop() first
- # children = [
- # Node(value=best_dict["slices"]["right"]),
- # Node(value=best_dict["slices"]["left"])
- # ]
- # current_node.add_split(children[1], children[0])
- # nodelist += children
- # else:
- # current_node.value = Leaf(
- # np.argmax(np.bincount(node_classes.astype(int))))
- # print(
- # f"\n\nLEAF NODE has majority clas:\n{current_node.value.value=}"
- # )
- # continue
-
-
-def predict(x, nodes) -> list:
- """
- @todo: Docstring for predict
- """
- # which row to drop
- # print(x)
- drop = 0
- while not set(nodes).issubset({0, 1}):
- print(nodes)
- # print(x[drop])
- if isinstance(nodes[drop].value, Leaf):
- nodes[drop] = nodes[drop].value.value
- drop += 1
- continue
-
- print(nodes[drop].value)
- print(nodes[drop].col)
- # print(nodes[drop].col)
- if x[drop, nodes[drop].col] > nodes[drop].value:
- nodes[drop] = nodes[drop].left
- else:
- nodes[drop] = nodes[drop].right
- return np.array(nodes)
-
def tree_pred(x=None, tr=None, **defaults) -> np.array:
"""
@todo: Docstring for tree_pred
"""
- nodes = [tr.tree] * len(x)
- # y = np.linspace(0, len(x), 0)
- # y = np.array(ele)
- y = predict(x, nodes)
- print(f"\n\nPredicted classes for {x=}\n\n are: {y=}")
+ y = tr.predict(x)
+ n_min, min_leaf, n_feat = tr.hyper_params
+ print("-"*80+f"\nPrediction parameters:\nn_min={n_min}\nmin_leaf={min_leaf}\nn_feat={n_feat}")
+ print(f"\nFor x rows\n{x}\n\n predicted classes are:\n {y}")
return y
@@ -500,19 +433,16 @@ if __name__ == '__main__':
'y': credit_data[:, 5],
'n_min': 2,
'min_leaf': 1,
- 'n_feat': 5
+ 'n_feat': None # 5, should be 5 for bootstrapping folds
}
# Calling the tree grow, unpacking default as argument
tree_grow(**tree_grow_defaults)
#### TREE_PRED TEST
- # tree_pred_defaults = {
- # 'x': credit_data[:, :5],
- # 'tr': tree_grow(**tree_grow_defaults)
- # }
-
- # tree_pred(**tree_pred_defaults)
+ tree_pred_defaults = {
+ 'x': credit_data[:, :5],
+ 'tr': tree_grow(**tree_grow_defaults)
+ }
-# start_time = time.time()
-# print("--- %s seconds ---" % (time.time() - start_time))
+ tree_pred(**tree_pred_defaults)