From e1ad1dcd2ea5c7663ae0a867cc53b7cb13f3c91a Mon Sep 17 00:00:00 2001
From: Mike Vink <mike1994vink@gmail.com>
Date: Thu, 24 Sep 2020 17:04:11 +0200
Subject: Improv: numpy exhaustive split and better update

---
 assignment1.py | 448 -------------------------------------------------------
 tree.py        | 459 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 459 insertions(+), 448 deletions(-)
 delete mode 100644 assignment1.py
 create mode 100644 tree.py

diff --git a/assignment1.py b/assignment1.py
deleted file mode 100644
index 4b264d7..0000000
--- a/assignment1.py
+++ /dev/null
@@ -1,448 +0,0 @@
-import numpy as np
-
-credit_data = np.genfromtxt('./credit_score.txt',
-                            delimiter=',',
-                            skip_header=True)
-
-# age,married,house,income,gender,class
-# [(22, 0, 0, 28, 1, 0)
-#  (46, 0, 1, 32, 0, 0)
-#  (24, 1, 1, 24, 1, 0)
-#  (25, 0, 0, 27, 1, 0)
-#  (29, 1, 1, 32, 0, 0)
-#  (45, 1, 1, 30, 0, 1)
-#  (63, 1, 1, 58, 1, 1)
-#  (36, 1, 0, 52, 1, 1)
-#  (23, 0, 1, 40, 0, 1)
-#  (50, 1, 1, 28, 0, 1)]
-
-# In the program data points are called rows
-
-# In the program categorical or numerical attributes are called cols for columns
-
-# The last column are the classes and will be called as classes in the program
-
-
-class Node:
-    """
-    The node object points to two other Node objects.
-    """
-    def __init__(self, split_value_or_rows=None, col=None):
-        """Initialises the column and split value for the node.
-
-        /split_value_or_rows=None/ can either be the best split value of
-        a col, or a boolean mask for x that selects the rows to consider for
-        calculating the split_value
-
-        /col=None/ if the node object has a split_value, then it also has a col
-        that belongs to this value
-
-        """
-        self.split_value_or_rows = split_value_or_rows
-        self.col = col
-
-    def add_split(self, left, right):
-        """
-        Lets the node object point to two other objects that can be either Leaf
-        or Node.
-        """
-        self.left = left
-        self.right = right
-
-    def is_leaf_node(self, node_classes):
-        """
-        is_leaf_node is used to change the col attribute to None to indicate a
-        leaf node
-        """
-        self.col = None
-        # This weird numpy line gives the majority vote, which is 1 or 0
-        self.split_value_or_rows = np.argmax(
-            np.bincount(node_classes.astype(int)))
-
-
-class Tree:
-    """
-    Tree object that points towards the root node.
-    """
-    def __init__(self, root_node_obj, hyper_params):
-        """Initialises only by pointing to a Node object.
-
-        /root_node_obj/ is a node object that is made before entering the main
-        loop of tree grow.
-
-        """
-        self.tree = root_node_obj
-        self.hyper_params = hyper_params
-
-    def predict(self, x):
-        """
-        Makes a list of root nodes, and drops one row of x through the tree per
-        loop
-        """
-        # Maak een lijst van nodes, wiens indexes overeen komen met de rows in
-        # x die we willen droppen
-        nodes = [self.tree] * len(x)
-
-        # De index van de row van x die we in de boom willen droppen
-        drop = 0
-        while not all(pred_class in {0,1} for pred_class in nodes):
-            # Als de col None is dan is het een leaf node, dus dan is de row
-            # van x hier beland
-            if nodes[drop].col is None:
-                nodes[drop] = nodes[drop].split_value_or_rows
-                drop += 1
-                continue
-
-            # Vergelijk de x col (age,married,house,income,gender,class), in de
-            # gedropte row met het split value van de node. Op basis hiervan
-            # drop naar links of rechts
-            if x[drop, nodes[drop].col] > nodes[drop].split_value_or_rows:
-                nodes[drop] = nodes[drop].left
-            else:
-                nodes[drop] = nodes[drop].right
-        return np.array(nodes)
-
-    # Work in progress tree printer
-    #
-    # def __repr__(self):
-    #     tree_string = ''
-    #     node = self.tree
-    #     depth = 0
-    #     nodelist = [node]
-    #     while nodelist:
-    #         node = nodelist.pop()
-    #         depth += 1
-    #         if node.col is not None:
-    #             left, right = node.left, node.right
-    #             nodelist += [left, right]
-    #         else:
-    #             continue
-    #         tree_string += '\n' + depth * ' '
-    #         tree_string += (depth + 4) * ' ' + '/' + ' ' * 2 + '\\'
-    #         tree_string += '\n' + ' ' * 2 * depth
-    #         for direc in left, right:
-    #             if not direc.split_value_or_rows%10:
-    #                 tree_string += ' ' * 4
-    #             else:
-    #                 tree_string += ' ' * 3
-    #             tree_string += str(int(direc.split_value_or_rows))
-
-    #     tree_string = depth * ' ' + str(int(self.tree.split_value_or_rows)) + tree_string
-    #     return tree_string
-
-
-def impurity(array) -> int:
-    """
-    Assumes the argument array is a one dimensional vector of zeroes and ones.
-    Computes the gini index impurity based on the relative frequency of ones in
-    the vector.
-
-    Example:
-
-    >>> array=np.array([1,0,1,1,1,0,0,1,1,0,1])
-    >>> array
-    array([1,0,1,1,1,0,0,1,1,0,1])
-
-    >>> impurity(array)
-    0.23140495867768596
-    """
-    # Total labels
-    n_labels = len(array)
-    if n_labels == 0:
-        print(
-            "division by zero will happen, child node is pure, doesnt contain anything"
-        )
-        n_labels = 1
-    # Number of tuples labeled 1
-    n_labels_1 = array.sum()
-    # Calculate the relative frequency of ones with respect to the total labels
-    rel_freq_1 = n_labels_1 / n_labels
-    # Use the symmetry around the median property to also calculate the
-    # relative frequency of zeroes
-    rel_freq_0 = 1 - rel_freq_1
-    # Multiply the frequencies to get the gini index
-    gini_index = rel_freq_1 * rel_freq_0
-    return gini_index
-
-
-def bestsplit(x, y, min_leaf) -> None:
-    """
-    x = vector of single col
-    y = vector of classes (last col in x)
-
-    Consider splits of type "x <= c" where "c" is the average of two consecutive
-    values of x in the sorted order.
-
-    x and y must be of the same length
-
-    y[i] must be the class label of the i-th observation, and x[i] is the
-    correspnding value of attribute x
-
-    Example (best split on income):
-
-    >>> bestsplit(credit_data[:,3],credit_data[:,5])
-     36
-    """
-    x_sorted = np.sort(np.unique(x))
-    if len(x_sorted) <= 2:
-        # Allows splitting on categorical (0 or 1) cols
-        split_points = x_sorted / 2
-    else:
-        # Take average between consecutive numerical rows in the x col
-        split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2
-
-    # De toepassing van bestsplit verdeelt de x col vector in tweeen, twee
-    # arrays van "x rows". Deze moeten we terug krijgen om de in de child nodes
-    # bestsplit toe te passen.
-    #
-    # Deze lus berekent de best split value, en op basis daarvan weten we welke
-    # twee "x rows" arrays we moeten returnen, en welke split value het beste
-    # was natuurlijk.
-
-    # Hieren stoppen we (delta_i, split_value, rows_left, rows_right)
-    best_list = []
-    # Stop wanneer de array met split points leeg is
-    while split_points.size != 0:
-        # Huidige split value
-        split_value = split_points[-1]
-        # boolean masks om x rows mee te masken/slicen
-        mask_left, mask_right = x > split_value, x <= split_value
-
-        # class voor beide split kanten
-        classes_left, classes_right = y[mask_left], y[mask_right]
-
-        # Kijk of er genoeg rows in de gesplitte nodes terechtkomen, anders
-        # mogen we de split niet toelaten vanwege de min_leaf constraint
-        if len(classes_left) < min_leaf or len(classes_right) < min_leaf:
-            # Haal de huidige split_point uit split_points
-            split_points = split_points[:-1]
-            continue
-
-        # delta_i formule, improved by not taking the parent impurity into
-        # account, and not making the weighted average but the weigthed sum
-        # only (thanks Lonnie)
-        delta_i = (
-            impurity(classes_left) * len(classes_left) +
-            impurity(classes_right) * len(classes_right))
-        # stop huidige splits in de lijst om best split te berekenen
-        best_list.append((delta_i, mask_left, mask_right, split_value))
-        # Haal de huidige split_point uit split_points
-        split_points = split_points[:-1]
-
-    # Bereken de best split voor deze x col, als er ten minste 1 bestaat die
-    # voldoet aan min leaf
-    if best_list:
-        return min(best_list, key=lambda x: x[0])
-    else:
-        return False
-
-
-def exhaustive_split_search(rows, classes, min_leaf):
-    """
-    @todo: Docstring for exhaustive_split_search
-    """
-    print("\t\t->entering exhaustive split search")
-    # We hebben enumerate nodig, want we willen weten op welke col
-    # (age,married,house,income,gender) we een split doen
-    exhaustive_best_list = []
-    print(f"Rows:\n{rows},\n Classes:\n{classes}")
-    for i, col in enumerate(rows.transpose()):
-        # calculate the best split for the col that satisfies the min_leaf
-        # constraint
-        col_best_split = bestsplit(col, classes, min_leaf)
-        # Check if there was a split fullfilling the min leaf rule
-        if col_best_split:
-            # add for which row we calculated the best split
-            col_best_split += (i, )
-            exhaustive_best_list.append(col_best_split)
-    print("\t\t->returning from exhaustive split search")
-    return exhaustive_best_list
-
-
-def add_children(node, best_split):
-    """
-    @todo: Docstring for add_children
-    """
-    print("\t\t\t->entering add children")
-    # The mask that was used to get the rows for the current node from x, we
-    # need this to update the rows for the children
-    current_mask = node.split_value_or_rows
-
-    # Unpacking the best_split tuple
-    mask_left, mask_right, node_split_value, node_col = best_split[1:]
-    # print(f"{mask_left}, {mask_right}, {node_split_value}, {node_col}")
-
-    # Give the current node the split_value and col it needs for predictions
-    node.split_value_or_rows, node.col = node_split_value, node_col
-
-    # Updating the row masks to give it to children, keeping numpy dimension consistent
-    mask_left, mask_right = update_mask(mask_left, current_mask), update_mask(
-        mask_right, current_mask)
-
-    # Adding the pointer between parent and children
-    node.add_split(Node(split_value_or_rows=mask_left), Node(split_value_or_rows=mask_right))
-    print("\t\t\t->children added to node and node list\n")
-    return [node.left, node.right]
-
-
-def update_mask(mask, current_mask):
-    """
-    Updates the spit bool array from any dimension to an array with length
-    equal to the total number of rows in dataset x.
-    """
-    print(
-        "\t\t\t\t->entering update mask to update mask that calculates which rows belong to child"
-    )
-    # Copy heb ik gedaan omdat we een slice assignment doen.
-    #
-    # Het punt van deze functie is dat tijdens het tree growen het aantal rows
-    # in een node afneemt. Deze functie update de bool array die gebruikt is om
-    # te splitten in bestsplit naar een array met als lengte het totale aantal
-    # rows in de data set x. Op deze manier kunnen we in tree grow de totale x
-    # splitten. 
-    #
-    # Een andere optie zou zijn om de rows letterlijk tijdelijk op te slaan in
-    # nodes... weet niet wat beter is, het kan best veel geheugen in beslag
-    # nemen voor grote dataset neem ik aan... Op de manier hier zijn er altijd
-    # maar 1D bool arrays, die dus een lagere dimensionaliteit hebben als x veel columnen/rows heeft.
-    print(f"Before update:\n{mask}")
-    copy = np.array(current_mask, copy=True)
-    copy[np.where(current_mask)] = mask
-    print(f"After update:\n{copy}")
-    print("\t\t\t\t->updated row mask for child node")
-    return copy
-
-
-#
-#
-# Put all helper functions above this comment!
-
-
-def tree_grow(x=None,
-              y=None,
-              n_min=None,
-              min_leaf=None,
-              n_feat=None,
-              **defaults) -> Tree:
-    """
-    @todo: Docstring for tree_grow
-    """
-    # De nodelist heeft in het begin alleen een root node met alle rows van x,
-    # omdat alle rows in de root in acht worden genomen voor bestsplit berekening.
-    #
-    # Dit representeren we met een boolean mask over x, met lengte het aantal rows
-    # in x en elementen True. Deze boolean mask zullen we repeatedly gebruiken als een
-    # mask over x om de rows voor bestsplit op te halen.
-    mask = np.full(len(x), True)
-
-    # Het eerste node object moet nu geinstantieerd worden
-    root = Node(split_value_or_rows=mask)
-
-    # We instantieren ook gelijk het Tree object
-    tr = Tree(root, (n_min, min_leaf, n_feat))
-
-    # De eerste nodelist heeft alleen de root node, daarna zijn twee childs,
-    # etc. totdat alle splits gemaakt zijn en de lijst leeg is.
-    nodelist = [root]
-
-    while nodelist:
-        print("->Taking new node from the node list")
-        # Pop de current node uit de nodelist
-        node = nodelist.pop()
-        # Gebruik de boolean mask van de node om de rows in de node uit x te halen
-        node_rows = x[node.split_value_or_rows]
-        # print(node_rows)
-        # Gebruik de boolean mask van de node om de classes in de node uit y te halen
-        node_classes = y[node.split_value_or_rows]
-
-        # print(f"{node_classes}, {node_rows}")
-
-        # Test of de node een leaf node is met n_min
-        if len(node_rows) < n_min:
-            node.is_leaf_node(node_classes)
-            print(
-                "\t->Node has less rows than n_min, it is a leaf node, continueing to next node"
-            )
-            continue
-        print("\t->Node has more rows than n_min, it is not a leaf node")
-
-        # Als de node niet puur is, probeer dan te splitten
-        if impurity(node_classes) > 0:
-            print("\t->Node is not pure yet starting exhaustive split search")
-            # We gaan exhaustively voor de rows in de node over de cols
-            # (age,married,house,income,gender) om de bestsplit te
-            # bepalen
-            #
-            # We geven min_leaf als argument omdat:
-            # "If the algorithm performs a split, it should be the best split that meets the minleaf constrain"
-            #
-            # We krijgen een exhaustive lijst terug met splits
-            exhaustive_best_list = exhaustive_split_search(
-                node_rows, node_classes, min_leaf)
-            # print(exhaustive_best_list)
-            # Als de lijst leeg is zijn er geen potentieele splits die voldoen
-            # an de min leaf constraint
-            if not exhaustive_best_list:
-                node.is_leaf_node(node_classes)
-                print(
-                    "\t\t->No split that fullfils min_leaf was found continueing to next node"
-                )
-                continue
-            print(
-                "\t->Exhaustive search found a split fulfilling the min_leaf rule!"
-            )
-            # Hier halen we de beste split, en rows voor de child/split nodes
-            # uit de exhaustive best list
-            best_split = min(exhaustive_best_list, key=lambda z: z[0])
-            print(f"\n######################best split tuple (delta_i, bool arrays voor child rows, split value, col)############################\n\n{best_split}\n\n###########################################################################################################################\n")
-            # Hier voegen we de twee nieuwe nodes toe aan de gesplitte "parent"
-            nodelist += add_children(node, best_split)
-            # node.add_split(left_child_node, right_child_node)
-        else:
-            node.is_leaf_node(node_classes)
-            print("\t\t->The node is already pure, it is necessarily a leaf!")
-            continue
-    # print(tr)
-    return tr
-
-
-def tree_pred(x=None, tr=None, **defaults) -> np.array:
-    """
-    @todo: Docstring for tree_pred
-    """
-    y = tr.predict(x)
-    n_min, min_leaf, n_feat = tr.hyper_params
-    print("-"*80+f"\nPrediction parameters:\nn_min={n_min}\nmin_leaf={min_leaf}\nn_feat={n_feat}")
-    print(f"\nFor x rows\n{x}\n\n predicted classes are:\n {y}")
-    return y
-
-
-if __name__ == '__main__':
-    #### IMPURITY TEST
-    # array=np.array([1,0,1,1,1,0,0,1,1,0,1])
-    # print(impurity(array))
-    # Should give 0.23....
-
-    #### BESTSPLIT TEST
-    # print(bestsplit(credit_data[:, 3], credit_data[:, 5]))
-    # Should give 36
-
-    #### TREE_GROW TEST
-    tree_grow_defaults = {
-        'x': credit_data[:, :5],
-        'y': credit_data[:, 5],
-        'n_min': 2,
-        'min_leaf': 1,
-        'n_feat': None # 5, should be 5 for bootstrapping folds
-    }
-
-    # Calling the tree grow, unpacking default as argument
-    # tree_grow(**tree_grow_defaults)
-
-    #### TREE_PRED TEST
-    tree_pred_defaults = {
-        'x': credit_data[:, :5],
-        'tr': tree_grow(**tree_grow_defaults)
-    }
-
-    tree_pred(**tree_pred_defaults)
diff --git a/tree.py b/tree.py
new file mode 100644
index 0000000..64e9d21
--- /dev/null
+++ b/tree.py
@@ -0,0 +1,459 @@
+import numpy as np
+
+credit_data = np.genfromtxt('./data/credit_score.txt',
+                            delimiter=',',
+                            skip_header=True)
+
+# age,married,house,income,gender,class
+# [(22, 0, 0, 28, 1, 0)
+#  (46, 0, 1, 32, 0, 0)
+#  (24, 1, 1, 24, 1, 0)
+#  (25, 0, 0, 27, 1, 0)
+#  (29, 1, 1, 32, 0, 0)
+#  (45, 1, 1, 30, 0, 1)
+#  (63, 1, 1, 58, 1, 1)
+#  (36, 1, 0, 52, 1, 1)
+#  (23, 0, 1, 40, 0, 1)
+#  (50, 1, 1, 28, 0, 1)]
+
+# In the program data points are called rows
+
+# In the program categorical or numerical attributes are called cols for columns
+
+# The last column are the classes and will be called as classes in the program
+
+
+class Node:
+    """
+    The node object points to two other Node objects.
+    """
+    def __init__(self, split_value_or_rows=None, col=None):
+        """Initialises the column and split value for the node.
+
+        /split_value_or_rows=None/ can either be the best split value of
+        a col, or a boolean mask for x that selects the rows to consider for
+        calculating the split_value
+
+        /col=None/ if the node object has a split_value, then it also has a col
+        that belongs to this value
+
+        """
+        self.split_value_or_rows = split_value_or_rows
+        self.col = col
+
+    def add_split(self, left, right):
+        """
+        Lets the node object point to two other objects that can be either Leaf
+        or Node.
+        """
+        self.left = left
+        self.right = right
+
+    def is_leaf_node(self, node_classes):
+        """
+        is_leaf_node is used to change the col attribute to None to indicate a
+        leaf node
+        """
+        self.col = None
+        # This weird numpy line gives the majority vote, which is 1 or 0
+        self.split_value_or_rows = np.argmax(
+            np.bincount(node_classes.astype(int)))
+
+
+class Tree:
+    """
+    Tree object that points towards the root node.
+    """
+    def __init__(self, root_node_obj, hyper_params):
+        """Initialises only by pointing to a Node object.
+
+        /root_node_obj/ is a node object that is made before entering the main
+        loop of tree grow.
+
+        """
+        self.tree = root_node_obj
+        self.hyper_params = hyper_params
+
+    def predict(self, x):
+        """
+        Makes a list of root nodes, and drops one row of x through the tree per
+        loop
+        """
+        # Maak een lijst van nodes, wiens indexes overeen komen met de rows in
+        # x die we willen droppen
+        nodes = [self.tree] * len(x)
+
+        # De index van de row van x die we in de boom willen droppen
+        drop = 0
+        while not all(pred_class in {0,1} for pred_class in nodes):
+            # Als de col None is dan is het een leaf node, dus dan is de row
+            # van x hier beland
+            # print(nodes[drop].col, nodes[drop].split_value_or_rows)
+            if nodes[drop].col is None:
+                nodes[drop] = nodes[drop].split_value_or_rows
+                drop += 1
+                continue
+
+            # Vergelijk de x col (age,married,house,income,gender,class), in de
+            # gedropte row met het split value van de node. Op basis hiervan
+            # drop naar links of rechts
+            if x[drop, nodes[drop].col] > nodes[drop].split_value_or_rows:
+                nodes[drop] = nodes[drop].left
+            else:
+                nodes[drop] = nodes[drop].right
+        return np.array(nodes)
+
+    # Work in progress tree printer
+    #
+    # def __repr__(self):
+    #     tree_string = ''
+    #     node = self.tree
+    #     depth = 0
+    #     nodelist = [node]
+    #     while nodelist:
+    #         node = nodelist.pop()
+    #         depth += 1
+    #         if node.col is not None:
+    #             left, right = node.left, node.right
+    #             nodelist += [left, right]
+    #         else:
+    #             continue
+    #         tree_string += '\n' + depth * ' '
+    #         tree_string += (depth + 4) * ' ' + '/' + ' ' * 2 + '\\'
+    #         tree_string += '\n' + ' ' * 2 * depth
+    #         for direc in left, right:
+    #             if not direc.split_value_or_rows%10:
+    #                 tree_string += ' ' * 4
+    #             else:
+    #                 tree_string += ' ' * 3
+    #             tree_string += str(int(direc.split_value_or_rows))
+
+    #     tree_string = depth * ' ' + str(int(self.tree.split_value_or_rows)) + tree_string
+    #     return tree_string
+
+
+def impurity(array) -> int:
+    """
+    Assumes the argument array is a one dimensional vector of zeroes and ones.
+    Computes the gini index impurity based on the relative frequency of ones in
+    the vector.
+
+    Example:
+
+    >>> array=np.array([1,0,1,1,1,0,0,1,1,0,1])
+    >>> array
+    array([1,0,1,1,1,0,0,1,1,0,1])
+
+    >>> impurity(array)
+    0.23140495867768596
+    """
+    # Total labels
+    n_labels = len(array)
+    if n_labels == 0:
+        print(
+            "division by zero will happen, child node is pure, doesnt contain anything"
+        )
+        n_labels = 1
+    # Number of tuples labeled 1
+    n_labels_1 = array.sum()
+    # Calculate the relative frequency of ones with respect to the total labels
+    rel_freq_1 = n_labels_1 / n_labels
+    # Use the symmetry around the median property to also calculate the
+    # relative frequency of zeroes
+    rel_freq_0 = 1 - rel_freq_1
+    # Multiply the frequencies to get the gini index
+    gini_index = rel_freq_1 * rel_freq_0
+    return gini_index
+
+
+def bestsplit(x, y, min_leaf) -> None:
+    """
+    x = vector of single col
+    y = vector of classes (last col in x)
+
+    Consider splits of type "x <= c" where "c" is the average of two consecutive
+    values of x in the sorted order.
+
+    x and y must be of the same length
+
+    y[i] must be the class label of the i-th observation, and x[i] is the
+    correspnding value of attribute x
+
+    Example (best split on income):
+
+    >>> bestsplit(credit_data[:,3],credit_data[:,5])
+     36
+    """
+    x_sorted = np.sort(np.unique(x))
+    # if len(x_sorted) <= 2:
+    #     # Allows splitting on categorical (0 or 1) cols
+    #     split_points = x_sorted / 2
+    # else:
+    #     # Take average between consecutive numerical rows in the x col
+    #     split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2
+
+    split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2
+
+    # De toepassing van bestsplit verdeelt de x col vector in tweeen, twee
+    # arrays van "x rows". Deze moeten we terug krijgen om de in de child nodes
+    # bestsplit toe te passen.
+    #
+    # Deze lus berekent de best split value, en op basis daarvan weten we welke
+    # twee "x rows" arrays we moeten returnen, en welke split value het beste
+    # was natuurlijk.
+
+    # Hieren stoppen we (rows for children, delta_i, split value, col will be added later)
+    best_array = np.zeros((split_points.shape[0], x.shape[0] + 3), dtype='float64')
+    # print(f"{best_array=}")
+    # Stop wanneer de array met split points leeg is
+    while split_points.size != 0:
+        # Huidige split value
+        split_value = split_points[-1]
+        # print(split_value)
+        # boolean masks om x rows mee te masken/slicen
+        # print(best_array.shape)
+        current_row_of_best_array = split_points.shape[0]-1
+        best_array[current_row_of_best_array, :x.shape[0]] = x > split_value
+        # print(best_array)
+        # print(f"{best_array[current_row_of_best_array,:-2]=}")
+
+        # class voor beide split kanten
+        classes_left, classes_right = y[best_array[current_row_of_best_array,:-3].astype(bool)], y[np.invert(best_array[current_row_of_best_array,:-3].astype(bool))]
+        # print(f"{classes_left=} {classes_right=}")
+
+        # Kijk of er genoeg rows in de gesplitte nodes terechtkomen, anders
+        # mogen we de split niet toelaten vanwege de min_leaf constraint
+        # print(f"{classes_left.shape[0]=}")
+        if classes_left.shape[0] < min_leaf or classes_right.shape[0] < min_leaf:
+            # Haal de huidige split_point uit split_points
+            split_points = split_points[:-1]
+            continue
+
+        # delta_i formule, improved by not taking the parent impurity into
+        # account, and not making the weighted average but the weigthed sum
+        # only (thanks Lonnie)
+        delta_i = (
+            impurity(classes_left) * classes_left.shape[0] +
+            impurity(classes_right) * classes_right.shape[0])
+        # stop huidige splits in de lijst om best split te berekenen
+        # print(f"{np.array((delta_i, mask_left, mask_right, split_value))=}")
+        # print(f"{best_array[:, classes_left.shape[0]-1]=}")  
+        best_array[current_row_of_best_array,-3:-1] = delta_i, split_value
+        # print(best_array)
+        # Haal de huidige split_point uit split_points
+        split_points = split_points[:-1]
+
+    # Bereken de best split voor deze x col, als er ten minste 1 bestaat die
+    # voldoet aan min leaf
+    return best_array
+
+
+def exhaustive_split_search(rows, classes, min_leaf):
+    """
+    @todo: Docstring for exhaustive_split_search
+    """
+    print("\t\t->entering exhaustive split search")
+    # We hebben enumerate nodig, want we willen weten op welke col
+    # (age,married,house,income,gender) we een split doen
+    exhaustive_best_array = np.zeros((1, rows.shape[0] + 3), dtype='float64')
+    print(f"Rows:\n{rows},\n Classes:\n{classes}")
+    for i, col in enumerate(rows.transpose()):
+        # calculate the best split for the col that satisfies the min_leaf
+        # constraint
+        best_array_for_col = bestsplit(col, classes, min_leaf)
+        # add col number to rows
+        best_array_for_col[:,-1] = i
+        exhaustive_best_array = np.vstack((exhaustive_best_array, best_array_for_col))
+    print("The array with exhaustive splits is\n", exhaustive_best_array)
+    print("\t\t->returning from exhaustive split search")
+    return exhaustive_best_array
+
+
+def add_children(node, best_split):
+    """
+    @todo: Docstring for add_children
+    """
+    print("\t\t\t->entering add children")
+    # The mask that was used to get the rows for the current node from x, we
+    # need this to update the rows for the children
+    current_mask = node.split_value_or_rows
+
+    # Give the current node the split_value and col it needs for predictions
+    node.split_value_or_rows, node.col = best_split[-2], int(best_split[-1])
+
+    # Updating the row masks to give it to children, keeping numpy dimension consistent
+    mask_left, mask_right = update_mask(best_split[:-3], current_mask)
+
+    # Adding the pointer between parent and children
+    node.add_split(Node(split_value_or_rows=mask_left), Node(split_value_or_rows=mask_right))
+    print("\t\t\t->children added to node and node list\n")
+    return [node.left, node.right]
+
+
+def update_mask(mask, current_mask):
+    """
+    Updates the spit bool array from any dimension to an array with length
+    equal to the total number of rows in dataset x.
+    """
+    print(
+        "\t\t\t\t->entering update mask to update mask that calculates which rows belong to child"
+    )
+    # Copy heb ik gedaan omdat we een slice assignment doen.
+    #
+    # Het punt van deze functie is dat tijdens het tree growen het aantal rows
+    # in een node afneemt. Deze functie update de bool array die gebruikt is om
+    # te splitten in bestsplit naar een array met als lengte het totale aantal
+    # rows in de data set x. Op deze manier kunnen we in tree grow de totale x
+    # splitten. 
+    #
+    # Een andere optie zou zijn om de rows letterlijk tijdelijk op te slaan in
+    # nodes... weet niet wat beter is, het kan best veel geheugen in beslag
+    # nemen voor grote dataset neem ik aan... Op de manier hier zijn er altijd
+    # maar 1D bool arrays, die dus een lagere dimensionaliteit hebben als x veel columnen/rows heeft.
+    current_mask = np.vstack((current_mask, current_mask))
+    print(f"\nBefore update (upper rows goes to the left child, bottom to the right):\n{current_mask}")
+    # print(mask)
+    # print(mask.astype(bool))
+    # print(current_mask[current_mask == True])
+    current_mask[0][current_mask[0] == True] = mask.astype(bool)
+    current_mask[1][current_mask[1] == True] = np.invert(mask.astype(bool))
+    # copy = np.array(current_mask, copy=True)
+    # copy[np.where(current_mask)] = mask
+    print(f"After update:\n{current_mask}")
+    print("\t\t\t\t->updated row mask for child node")
+    return current_mask[0], current_mask[1]
+
+
+#
+#
+# Put all helper functions above this comment!
+
+
+def tree_grow(x=None,
+              y=None,
+              n_min=None,
+              min_leaf=None,
+              n_feat=None,
+              **defaults) -> Tree:
+    """
+    @todo: Docstring for tree_grow
+    """
+    # De nodelist heeft in het begin alleen een root node met alle rows van x,
+    # omdat alle rows in de root in acht worden genomen voor bestsplit berekening.
+    #
+    # Dit representeren we met een boolean mask over x, met lengte het aantal rows
+    # in x en elementen True. Deze boolean mask zullen we repeatedly gebruiken als een
+    # mask over x om de rows voor bestsplit op te halen.
+    mask = np.full(len(x), True)
+
+    # Het eerste node object moet nu geinstantieerd worden
+    root = Node(split_value_or_rows=mask)
+
+    # We instantieren ook gelijk het Tree object
+    tr = Tree(root, (n_min, min_leaf, n_feat))
+
+    # De eerste nodelist heeft alleen de root node, daarna zijn twee childs,
+    # etc. totdat alle splits gemaakt zijn en de lijst leeg is.
+    nodelist = [root]
+
+    while nodelist:
+        print("->Taking new node from the node list")
+        # Pop de current node uit de nodelist
+        node = nodelist.pop()
+        # Gebruik de boolean mask van de node om de rows in de node uit x te halen
+        node_rows = x[node.split_value_or_rows]
+        # print(node_rows)
+        # Gebruik de boolean mask van de node om de classes in de node uit y te halen
+        node_classes = y[node.split_value_or_rows]
+
+        # print(f"{node_classes}, {node_rows}")
+
+        # Test of de node een leaf node is met n_min
+        if len(node_rows) < n_min:
+            node.is_leaf_node(node_classes)
+            print(
+                "\t->Node has less rows than n_min, it is a leaf node, continueing to next node"
+            )
+            continue
+        print("\t->Node has more rows than n_min, it is not a leaf node")
+
+        # Als de node niet puur is, probeer dan te splitten
+        if impurity(node_classes) > 0:
+            print("\t->Node is not pure yet starting exhaustive split search")
+            # We gaan exhaustively voor de rows in de node over de cols
+            # (age,married,house,income,gender) om de bestsplit te
+            # bepalen
+            #
+            # We geven min_leaf als argument omdat:
+            # "If the algorithm performs a split, it should be the best split that meets the minleaf constrain"
+            #
+            # We krijgen een exhaustive lijst terug met splits
+            exhaustive_best_array = exhaustive_split_search(
+                node_rows, node_classes, min_leaf)
+            # print(exhaustive_best_list)
+            # Als de lijst leeg is zijn er geen potentieele splits die voldoen
+            # an de min leaf constraint
+            if exhaustive_best_array.shape[0] == 1:
+                node.is_leaf_node(node_classes)
+                print(
+                    "\t\t->No split that fullfils min_leaf was found continueing to next node"
+                )
+                continue
+            print(
+                "\t->Exhaustive search found a split fulfilling the min_leaf rule!"
+            )
+            # Hier halen we de beste split, en rows voor de child/split nodes
+            # uit de exhaustive best list
+            best_split = exhaustive_best_array[np.argmin(exhaustive_best_array[1:,-3]) + 1]
+            print(f"\n######################best split array, the last three element=[delta_i, splitvalue, col/attribute], the rest is boolean include or not include row in child node############################\n\n{best_split}\n\n###############################################################################\n")
+            # Hier voegen we de twee nieuwe nodes toe aan de gesplitte "parent"
+            nodelist += add_children(node, best_split)
+            # node.add_split(left_child_node, right_child_node)
+        else:
+            node.is_leaf_node(node_classes)
+            print("\t\t->The node is already pure, it is necessarily a leaf!")
+            continue
+    # print(tr)
+    return tr
+
+
+def tree_pred(x=None, tr=None, **defaults) -> np.array:
+    """
+    @todo: Docstring for tree_pred
+    """
+    y = tr.predict(x)
+    n_min, min_leaf, n_feat = tr.hyper_params
+    print("-"*80+f"\nPrediction parameters:\nn_min={n_min}\nmin_leaf={min_leaf}\nn_feat={n_feat}")
+    print(f"\nFor x rows\n{x}\n\n predicted classes are:\n {y}")
+    return y
+
+
+if __name__ == '__main__':
+    #### IMPURITY TEST
+    # array=np.array([1,0,1,1,1,0,0,1,1,0,1])
+    # print(impurity(array))
+    # Should give 0.23....
+
+    #### BESTSPLIT TEST
+    # print(bestsplit(credit_data[:, 3], credit_data[:, 5]))
+    # Should give 36
+
+    #### TREE_GROW TEST
+    tree_grow_defaults = {
+        'x': credit_data[:, :5],
+        'y': credit_data[:, 5],
+        'n_min': 2,
+        'min_leaf': 1,
+        'n_feat': None # 5, should be 5 for bootstrapping folds
+    }
+
+    # Calling the tree grow, unpacking default as argument
+    # tree_grow(**tree_grow_defaults)
+
+    #### TREE_PRED TEST
+    tree_pred_defaults = {
+        'x': credit_data[:, :5],
+        'tr': tree_grow(**tree_grow_defaults)
+    }
+
+    tree_pred(**tree_pred_defaults)
-- 
cgit v1.2.3