Improv: numpy exhaustive split and better update

author: Mike Vink <mike1994vink@gmail.com> 2020-09-24 17:04:11 +0200
committer: Mike Vink <mike1994vink@gmail.com> 2020-09-24 17:04:11 +0200
commit: e1ad1dcd2ea5c7663ae0a867cc53b7cb13f3c91a (patch)
tree: 0a71bbad78e932ec00d2ab287ce3f6238c3773b7
parent: e650ad27d13b392f5b6535906e36176cb0777650 (diff)
1 files changed, 54 insertions, 43 deletions
diff --git a/assignment1.py b/tree.py
index 4b264d7..64e9d21 100644
--- a/assignment1.py
+++ b/tree.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-credit_data = np.genfromtxt('./credit_score.txt',
+credit_data = np.genfromtxt('./data/credit_score.txt',
                             delimiter=',',
                             skip_header=True)
 
@@ -88,6 +88,7 @@ class Tree:
         while not all(pred_class in {0,1} for pred_class in nodes):
             # Als de col None is dan is het een leaf node, dus dan is de row
             # van x hier beland
+            # print(nodes[drop].col, nodes[drop].split_value_or_rows)
             if nodes[drop].col is None:
                 nodes[drop] = nodes[drop].split_value_or_rows
                 drop += 1
@@ -184,12 +185,14 @@ def bestsplit(x, y, min_leaf) -> None:
      36
     """
     x_sorted = np.sort(np.unique(x))
-    if len(x_sorted) <= 2:
-        # Allows splitting on categorical (0 or 1) cols
-        split_points = x_sorted / 2
-    else:
-        # Take average between consecutive numerical rows in the x col
-        split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2
+    # if len(x_sorted) <= 2:
+    #     # Allows splitting on categorical (0 or 1) cols
+    #     split_points = x_sorted / 2
+    # else:
+    #     # Take average between consecutive numerical rows in the x col
+    #     split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2
+
+    split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2
 
     # De toepassing van bestsplit verdeelt de x col vector in tweeen, twee
     # arrays van "x rows". Deze moeten we terug krijgen om de in de child nodes
@@ -199,21 +202,29 @@ def bestsplit(x, y, min_leaf) -> None:
     # twee "x rows" arrays we moeten returnen, en welke split value het beste
     # was natuurlijk.
 
-    # Hieren stoppen we (delta_i, split_value, rows_left, rows_right)
-    best_list = []
+    # Hieren stoppen we (rows for children, delta_i, split value, col will be added later)
+    best_array = np.zeros((split_points.shape[0], x.shape[0] + 3), dtype='float64')
+    # print(f"{best_array=}")
     # Stop wanneer de array met split points leeg is
     while split_points.size != 0:
         # Huidige split value
         split_value = split_points[-1]
+        # print(split_value)
         # boolean masks om x rows mee te masken/slicen
-        mask_left, mask_right = x > split_value, x <= split_value
+        # print(best_array.shape)
+        current_row_of_best_array = split_points.shape[0]-1
+        best_array[current_row_of_best_array, :x.shape[0]] = x > split_value
+        # print(best_array)
+        # print(f"{best_array[current_row_of_best_array,:-2]=}")
 
         # class voor beide split kanten
-        classes_left, classes_right = y[mask_left], y[mask_right]
+        classes_left, classes_right = y[best_array[current_row_of_best_array,:-3].astype(bool)], y[np.invert(best_array[current_row_of_best_array,:-3].astype(bool))]
+        # print(f"{classes_left=} {classes_right=}")
 
         # Kijk of er genoeg rows in de gesplitte nodes terechtkomen, anders
         # mogen we de split niet toelaten vanwege de min_leaf constraint
-        if len(classes_left) < min_leaf or len(classes_right) < min_leaf:
+        # print(f"{classes_left.shape[0]=}")
+        if classes_left.shape[0] < min_leaf or classes_right.shape[0] < min_leaf:
             # Haal de huidige split_point uit split_points
             split_points = split_points[:-1]
             continue
@@ -222,19 +233,19 @@ def bestsplit(x, y, min_leaf) -> None:
         # account, and not making the weighted average but the weigthed sum
         # only (thanks Lonnie)
         delta_i = (
-            impurity(classes_left) * len(classes_left) +
-            impurity(classes_right) * len(classes_right))
+            impurity(classes_left) * classes_left.shape[0] +
+            impurity(classes_right) * classes_right.shape[0])
         # stop huidige splits in de lijst om best split te berekenen
-        best_list.append((delta_i, mask_left, mask_right, split_value))
+        # print(f"{np.array((delta_i, mask_left, mask_right, split_value))=}")
+        # print(f"{best_array[:, classes_left.shape[0]-1]=}")  
+        best_array[current_row_of_best_array,-3:-1] = delta_i, split_value
+        # print(best_array)
         # Haal de huidige split_point uit split_points
         split_points = split_points[:-1]
 
     # Bereken de best split voor deze x col, als er ten minste 1 bestaat die
     # voldoet aan min leaf
-    if best_list:
-        return min(best_list, key=lambda x: x[0])
-    else:
-        return False
+    return best_array
 
 
 def exhaustive_split_search(rows, classes, min_leaf):
@@ -244,19 +255,18 @@ def exhaustive_split_search(rows, classes, min_leaf):
     print("\t\t->entering exhaustive split search")
     # We hebben enumerate nodig, want we willen weten op welke col
     # (age,married,house,income,gender) we een split doen
-    exhaustive_best_list = []
+    exhaustive_best_array = np.zeros((1, rows.shape[0] + 3), dtype='float64')
     print(f"Rows:\n{rows},\n Classes:\n{classes}")
     for i, col in enumerate(rows.transpose()):
         # calculate the best split for the col that satisfies the min_leaf
         # constraint
-        col_best_split = bestsplit(col, classes, min_leaf)
-        # Check if there was a split fullfilling the min leaf rule
-        if col_best_split:
-            # add for which row we calculated the best split
-            col_best_split += (i, )
-            exhaustive_best_list.append(col_best_split)
+        best_array_for_col = bestsplit(col, classes, min_leaf)
+        # add col number to rows
+        best_array_for_col[:,-1] = i
+        exhaustive_best_array = np.vstack((exhaustive_best_array, best_array_for_col))
+    print("The array with exhaustive splits is\n", exhaustive_best_array)
     print("\t\t->returning from exhaustive split search")
-    return exhaustive_best_list
+    return exhaustive_best_array
 
 
 def add_children(node, best_split):
@@ -268,16 +278,11 @@ def add_children(node, best_split):
     # need this to update the rows for the children
     current_mask = node.split_value_or_rows
 
-    # Unpacking the best_split tuple
-    mask_left, mask_right, node_split_value, node_col = best_split[1:]
-    # print(f"{mask_left}, {mask_right}, {node_split_value}, {node_col}")
-
     # Give the current node the split_value and col it needs for predictions
-    node.split_value_or_rows, node.col = node_split_value, node_col
+    node.split_value_or_rows, node.col = best_split[-2], int(best_split[-1])
 
     # Updating the row masks to give it to children, keeping numpy dimension consistent
-    mask_left, mask_right = update_mask(mask_left, current_mask), update_mask(
-        mask_right, current_mask)
+    mask_left, mask_right = update_mask(best_split[:-3], current_mask)
 
     # Adding the pointer between parent and children
     node.add_split(Node(split_value_or_rows=mask_left), Node(split_value_or_rows=mask_right))
@@ -305,12 +310,18 @@ def update_mask(mask, current_mask):
     # nodes... weet niet wat beter is, het kan best veel geheugen in beslag
     # nemen voor grote dataset neem ik aan... Op de manier hier zijn er altijd
     # maar 1D bool arrays, die dus een lagere dimensionaliteit hebben als x veel columnen/rows heeft.
-    print(f"Before update:\n{mask}")
-    copy = np.array(current_mask, copy=True)
-    copy[np.where(current_mask)] = mask
-    print(f"After update:\n{copy}")
+    current_mask = np.vstack((current_mask, current_mask))
+    print(f"\nBefore update (upper rows goes to the left child, bottom to the right):\n{current_mask}")
+    # print(mask)
+    # print(mask.astype(bool))
+    # print(current_mask[current_mask == True])
+    current_mask[0][current_mask[0] == True] = mask.astype(bool)
+    current_mask[1][current_mask[1] == True] = np.invert(mask.astype(bool))
+    # copy = np.array(current_mask, copy=True)
+    # copy[np.where(current_mask)] = mask
+    print(f"After update:\n{current_mask}")
     print("\t\t\t\t->updated row mask for child node")
-    return copy
+    return current_mask[0], current_mask[1]
 
 
 #
@@ -377,12 +388,12 @@ def tree_grow(x=None,
             # "If the algorithm performs a split, it should be the best split that meets the minleaf constrain"
             #
             # We krijgen een exhaustive lijst terug met splits
-            exhaustive_best_list = exhaustive_split_search(
+            exhaustive_best_array = exhaustive_split_search(
                 node_rows, node_classes, min_leaf)
             # print(exhaustive_best_list)
             # Als de lijst leeg is zijn er geen potentieele splits die voldoen
             # an de min leaf constraint
-            if not exhaustive_best_list:
+            if exhaustive_best_array.shape[0] == 1:
                 node.is_leaf_node(node_classes)
                 print(
                     "\t\t->No split that fullfils min_leaf was found continueing to next node"
@@ -393,8 +404,8 @@ def tree_grow(x=None,
             )
             # Hier halen we de beste split, en rows voor de child/split nodes
             # uit de exhaustive best list
-            best_split = min(exhaustive_best_list, key=lambda z: z[0])
-            print(f"\n######################best split tuple (delta_i, bool arrays voor child rows, split value, col)############################\n\n{best_split}\n\n###########################################################################################################################\n")
+            best_split = exhaustive_best_array[np.argmin(exhaustive_best_array[1:,-3]) + 1]
+            print(f"\n######################best split array, the last three element=[delta_i, splitvalue, col/attribute], the rest is boolean include or not include row in child node############################\n\n{best_split}\n\n###############################################################################\n")
             # Hier voegen we de twee nieuwe nodes toe aan de gesplitte "parent"
             nodelist += add_children(node, best_split)
             # node.add_split(left_child_node, right_child_node)
author	Mike Vink <mike1994vink@gmail.com>	2020-09-24 17:04:11 +0200
committer	Mike Vink <mike1994vink@gmail.com>	2020-09-24 17:04:11 +0200
commit	e1ad1dcd2ea5c7663ae0a867cc53b7cb13f3c91a (patch)
tree	0a71bbad78e932ec00d2ab287ce3f6238c3773b7
parent	e650ad27d13b392f5b6535906e36176cb0777650 (diff)