Fix: pima indians data prediction working now

author: Mike Vink <mike1994vink@gmail.com> 2020-09-24 23:08:40 +0200
committer: Mike Vink <mike1994vink@gmail.com> 2020-09-24 23:08:40 +0200
commit: 53f6101594a410b6fa6f924d4c256f5f0001c0e5 (patch)
tree: ac76678997913e8bb60eca34137d25665dd93b64
parent: 35c613a547bb209cc6e9a289019c1ad3feff5fd4 (diff)
1 files changed, 110 insertions, 64 deletions
diff --git a/tree.py b/tree.py
index 9f67b0b..f1fa548 100644
--- a/tree.py
+++ b/tree.py
@@ -1,8 +1,9 @@
 import numpy as np
+from sklearn import metrics
+import pstats
+from pstats import SortKey
+import cProfile
 
-credit_data = np.genfromtxt('./data/credit_score.txt',
-                            delimiter=',',
-                            skip_header=True)
 
 # age,married,house,income,gender,class
 # [(22, 0, 0, 28, 1, 0)
@@ -166,7 +167,7 @@ def impurity(array) -> int:
     return gini_index
 
 
-def bestsplit(x, y, min_leaf) -> None:
+def bestsplit(x, y, minleaf) -> None:
     """
     x = vector of single col
     y = vector of classes (last col in x)
@@ -193,6 +194,7 @@ def bestsplit(x, y, min_leaf) -> None:
     #     split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2
 
     split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2
+    # print(split_points)
 
     # De toepassing van bestsplit verdeelt de x col vector in tweeen, twee
     # arrays van "x rows". Deze moeten we terug krijgen om de in de child nodes
@@ -212,19 +214,17 @@ def bestsplit(x, y, min_leaf) -> None:
         # print(split_value)
         # boolean masks om x rows mee te masken/slicen
         # print(best_array.shape)
-        current_row_of_best_array = split_points.shape[0]-1
-        best_array[current_row_of_best_array, :x.shape[0]] = x > split_value
         # print(best_array)
         # print(f"{best_array[current_row_of_best_array,:-2]=}")
 
         # class voor beide split kanten
-        classes_left, classes_right = y[best_array[current_row_of_best_array,:-3].astype(bool)], y[np.invert(best_array[current_row_of_best_array,:-3].astype(bool))]
+        classes_left, classes_right = y[x > split_value], y[x <= split_value]
         # print(f"{classes_left=} {classes_right=}")
 
         # Kijk of er genoeg rows in de gesplitte nodes terechtkomen, anders
-        # mogen we de split niet toelaten vanwege de min_leaf constraint
+        # mogen we de split niet toelaten vanwege de minleaf constraint
         # print(f"{classes_left.shape[0]=}")
-        if classes_left.shape[0] < min_leaf or classes_right.shape[0] < min_leaf:
+        if classes_left.shape[0] < minleaf or classes_right.shape[0] < minleaf:
             # Haal de huidige split_point uit split_points
             split_points = split_points[:-1]
             continue
@@ -238,6 +238,8 @@ def bestsplit(x, y, min_leaf) -> None:
         # stop huidige splits in de lijst om best split te berekenen
         # print(f"{np.array((delta_i, mask_left, mask_right, split_value))=}")
         # print(f"{best_array[:, classes_left.shape[0]-1]=}")  
+        current_row_of_best_array = split_points.shape[0]-1
+        best_array[current_row_of_best_array, :x.shape[0]] = x > split_value
         best_array[current_row_of_best_array,-3:-1] = delta_i, split_value
         # print(best_array)
         # Haal de huidige split_point uit split_points
@@ -248,25 +250,25 @@ def bestsplit(x, y, min_leaf) -> None:
     return best_array
 
 
-def exhaustive_split_search(rows, classes, min_leaf):
+def exhaustive_split_search(rows, classes, minleaf):
     """
     @todo: Docstring for exhaustive_split_search
     """
-    print("\t\t->entering exhaustive split search")
+    # print("\t\t->entering exhaustive split search")
     # We hebben enumerate nodig, want we willen weten op welke col
     # (age,married,house,income,gender) we een split doen
     exhaustive_best_array = []
-    print(f"Rows:\n{rows},\n Classes:\n{classes}")
+    # print(f"Rows:\n{rows},\n Classes:\n{classes}")
     for i, col in enumerate(rows.transpose()):
-        # calculate the best split for the col that satisfies the min_leaf
+        # calculate the best split for the col that satisfies the minleaf
         # constraint
-        best_array_for_col = bestsplit(col, classes, min_leaf)
+        best_array_for_col = bestsplit(col, classes, minleaf)
         # add col number to rows
         best_array_for_col[:,-1] = i
         exhaustive_best_array.append(best_array_for_col)
     exhaustive_best_array = np.concatenate(exhaustive_best_array)
-    print("The array with exhaustive splits is\n", exhaustive_best_array)
-    print("\t\t->returning from exhaustive split search")
+    # print("The array with exhaustive splits is\n", exhaustive_best_array)
+    # print("\t\t->returning from exhaustive split search")
     return exhaustive_best_array
 
 
@@ -274,7 +276,7 @@ def add_children(node, best_split):
     """
     @todo: Docstring for add_children
     """
-    print("\t\t\t->entering add children")
+    # print("\t\t\t->entering add children")
     # The mask that was used to get the rows for the current node from x, we
     # need this to update the rows for the children
     current_mask = node.split_value_or_rows
@@ -287,7 +289,7 @@ def add_children(node, best_split):
 
     # Adding the pointer between parent and children
     node.add_split(Node(split_value_or_rows=mask_left), Node(split_value_or_rows=mask_right))
-    print("\t\t\t->children added to node and node list\n")
+    # print("\t\t\t->children added to node and node list\n")
     return [node.left, node.right]
 
 
@@ -296,9 +298,9 @@ def update_mask(mask, current_mask):
     Updates the spit bool array from any dimension to an array with length
     equal to the total number of rows in dataset x.
     """
-    print(
-        "\t\t\t\t->entering update mask to update mask that calculates which rows belong to child"
-    )
+    # print(
+    #     "\t\t\t\t->entering update mask to update mask that calculates which rows belong to child"
+    # )
     # Copy heb ik gedaan omdat we een slice assignment doen.
     #
     # Het punt van deze functie is dat tijdens het tree growen het aantal rows
@@ -312,7 +314,7 @@ def update_mask(mask, current_mask):
     # nemen voor grote dataset neem ik aan... Op de manier hier zijn er altijd
     # maar 1D bool arrays, die dus een lagere dimensionaliteit hebben als x veel columnen/rows heeft.
     current_mask = np.vstack((current_mask, current_mask))
-    print(f"\nBefore update (upper rows goes to the left child, bottom to the right):\n{current_mask}")
+    # print(f"\nBefore update (upper rows goes to the left child, bottom to the right):\n{current_mask}")
     # print(mask)
     # print(mask.astype(bool))
     # print(current_mask[current_mask == True])
@@ -320,8 +322,8 @@ def update_mask(mask, current_mask):
     current_mask[1][current_mask[1] == True] = np.invert(mask.astype(bool))
     # copy = np.array(current_mask, copy=True)
     # copy[np.where(current_mask)] = mask
-    print(f"After update:\n{current_mask}")
-    print("\t\t\t\t->updated row mask for child node")
+    # print(f"After update:\n{current_mask}")
+    # print("\t\t\t\t->updated row mask for child node")
     return current_mask[0], current_mask[1]
 
 
@@ -332,9 +334,9 @@ def update_mask(mask, current_mask):
 
 def tree_grow(x=None,
               y=None,
-              n_min=None,
-              min_leaf=None,
-              n_feat=None,
+              nmin=None,
+              minleaf=None,
+              nfeat=None,
               **defaults) -> Tree:
     """
     @todo: Docstring for tree_grow
@@ -351,68 +353,89 @@ def tree_grow(x=None,
     root = Node(split_value_or_rows=mask)
 
     # We instantieren ook gelijk het Tree object
-    tr = Tree(root, (n_min, min_leaf, n_feat))
+    tr = Tree(root, (nmin, minleaf, nfeat))
 
     # De eerste nodelist heeft alleen de root node, daarna zijn twee childs,
     # etc. totdat alle splits gemaakt zijn en de lijst leeg is.
     nodelist = [root]
 
     while nodelist:
-        print("->Taking new node from the node list")
+        # print("->Taking new node from the node list")
         # Pop de current node uit de nodelist
         node = nodelist.pop()
         # Gebruik de boolean mask van de node om de rows in de node uit x te halen
-        node_rows = x[node.split_value_or_rows]
         # print(node_rows)
         # Gebruik de boolean mask van de node om de classes in de node uit y te halen
         node_classes = y[node.split_value_or_rows]
+        # print(node_classes)
 
         # print(f"{node_classes}, {node_rows}")
 
-        # Test of de node een leaf node is met n_min
-        if len(node_rows) < n_min:
+        # Test of de node een leaf node is met nmin
+        if len(node_classes) < nmin:
             node.is_leaf_node(node_classes)
             print(
-                "\t->Node has less rows than n_min, it is a leaf node, continueing to next node"
+                "\t->Leafnode! Node has less rows than nmin, it is a leaf node, continueing to next node"
             )
             continue
-        print("\t->Node has more rows than n_min, it is not a leaf node")
+        # print("\t->Node has more rows than nmin, it is not a leaf node")
 
         # Als de node niet puur is, probeer dan te splitten
         if impurity(node_classes) > 0:
-            print("\t->Node is not pure yet starting exhaustive split search")
+            # print("\t->Node is not pure yet starting exhaustive split search")
+
+            # Pick a random set of cols from rows (nfeat, important for forests)
+            nfeat_col_choice = np.random.choice(range(x.shape[1]), nfeat, replace=False)
+            node_rows = x[node.split_value_or_rows][:,np.sort(nfeat_col_choice)]
+
             # We gaan exhaustively voor de rows in de node over de cols
             # (age,married,house,income,gender) om de bestsplit te
             # bepalen
             #
-            # We geven min_leaf als argument omdat:
+            # We geven minleaf als argument omdat:
             # "If the algorithm performs a split, it should be the best split that meets the minleaf constrain"
             #
             # We krijgen een exhaustive lijst terug met splits
             exhaustive_best_array = exhaustive_split_search(
-                node_rows, node_classes, min_leaf)
-            # print(exhaustive_best_list)
-            # Als de lijst leeg is zijn er geen potentieele splits die voldoen
-            # an de min leaf constraint
-            if exhaustive_best_array.shape[0] == 1:
+                node_rows, node_classes, minleaf)
+            
+            # Als de array alleen maar zeroes heeft -> no splits for the minleaf leaf node
+            # print(exhaustive_best_array[:,:-1].any())
+            # raise SystemExit
+            if not exhaustive_best_array[:,:-1].any():
                 node.is_leaf_node(node_classes)
                 print(
-                    "\t\t->No split that fullfils min_leaf was found continueing to next node"
+                    "\t\t->Leaf node! No split that fullfils minleaf was found continueing to next node"
                 )
                 continue
-            print(
-                "\t->Exhaustive search found a split fulfilling the min_leaf rule!"
-            )
+            # print(
+            #     "\t->Exhaustive search found a split fulfilling the minleaf rule!"
+            # )
             # Hier halen we de beste split, en rows voor de child/split nodes
             # uit de exhaustive best list
-            best_split = exhaustive_best_array[np.argmin(exhaustive_best_array[1:,-3]) + 1]
-            print(f"\n######################best split array, the last three element=[delta_i, splitvalue, col/attribute], the rest is boolean include or not include row in child node############################\n\n{best_split}\n\n###############################################################################\n")
+            # [(exhaustive_best_array[:,:-3] == 0).sum(axis=1)]
+            best_split = exhaustive_best_array[exhaustive_best_array[:,:-3].sum(axis=1) != 0][np.argmin(exhaustive_best_array[:,-3])]
+            # print((exhaustive_best_array[:,:-3] == 0).sum(axis=1))
+            # print(exhaustive_best_array.shape)
+            # print(exhaustive_best_array)
+            # print(exhaustive_best_array[:,:-3])
+            # print(exhaustive_best_array[:,:-3] == 0)
+            # print(exhaustive_best_array[:,:-3].sum(axis=1) != 0)
+            # print(exhaustive_best_array[exhaustive_best_array[:,:-3]])
+            # print(exhaustive_best_array[exhaustive_best_array[:,:-3].sum(axis=0) == 0].shape)
+            # print(exhaustive_best_array[exhaustive_best_array[:,:-3].sum(axis=0) == 0][:,-3].shape)
+            # print(best_split)
+            # raise SystemExit
+            # print(exhaustive_best_array[:,:-1])
+            # raise SystemExit
+            # print(f"\n######################best split array, the last three element=[delta_i, splitvalue, col/attribute], the rest is boolean include or not include row in child node############################\n\n{best_split}\n\n###############################################################################\n")
             # Hier voegen we de twee nieuwe nodes toe aan de gesplitte "parent"
             nodelist += add_children(node, best_split)
+            # print(exhaustive_best_array.shape)
             # node.add_split(left_child_node, right_child_node)
         else:
             node.is_leaf_node(node_classes)
-            print("\t\t->The node is already pure, it is necessarily a leaf!")
+            print("\t\t->Leafnode! The node is already pure, it is necessarily a leaf!")
             continue
     # print(tr)
     return tr
@@ -422,14 +445,22 @@ def tree_pred(x=None, tr=None, **defaults) -> np.array:
     """
     @todo: Docstring for tree_pred
     """
-    y = tr.predict(x)
-    n_min, min_leaf, n_feat = tr.hyper_params
-    print("-"*80+f"\nPrediction parameters:\nn_min={n_min}\nmin_leaf={min_leaf}\nn_feat={n_feat}")
-    print(f"\nFor x rows\n{x}\n\n predicted classes are:\n {y}")
+    y = tr.predict(x).astype(float)
+    nmin, minleaf, nfeat = tr.hyper_params
+    print(np.mean(pima_indians[:,-1] == y))
+    # print("-"*80+f"\nPrediction parameters:\nnmin={nmin}\nminleaf={minleaf}\nnfeat={nfeat}")
+    # print(f"\nFor x rows\n{x}\n\n predicted classes are:\n {y}")
     return y
 
 
 if __name__ == '__main__':
+    # credit_data = np.genfromtxt('./data/credit_score.txt',
+    #                             delimiter=',',
+    #                             skip_header=True)
+
+    pima_indians = np.genfromtxt('./data/pima_indians.csv',
+                                delimiter=',',
+                                skip_header=True)
     #### IMPURITY TEST
     # array=np.array([1,0,1,1,1,0,0,1,1,0,1])
     # print(impurity(array))
@@ -440,21 +471,36 @@ if __name__ == '__main__':
     # Should give 36
 
     #### TREE_GROW TEST
-    tree_grow_defaults = {
-        'x': credit_data[:, :5],
-        'y': credit_data[:, 5],
-        'n_min': 2,
-        'min_leaf': 1,
-        'n_feat': None # 5, should be 5 for bootstrapping folds
-    }
+    # tree_grow_defaults = {
+    #     'x': credit_data[:, :5],
+    #     'y': credit_data[:, 5],
+    #     'nmin': 2,
+    #     'minleaf': 1,
+    #     'nfeat': None # 5, should be 5 for bootstrapping folds
+    # }
 
     # Calling the tree grow, unpacking default as argument
     # tree_grow(**tree_grow_defaults)
 
     #### TREE_PRED TEST
-    tree_pred_defaults = {
-        'x': credit_data[:, :5],
-        'tr': tree_grow(**tree_grow_defaults)
-    }
+    # tree_pred_defaults = {
+    #     'x': credit_data[:, :5],
+    #     'tr': tree_grow(**tree_grow_defaults)
+    # }
+
+    # y_pred = tree_pred(x=credit_data[:,:5], tr=tree_grow(x=credit_data[:,0:5], y=credit_data[:,5], nmin=2, minleaf=1, nfeat=5), dataset='credit score')
+    # print(credit_data[:,5] == y_pred)
+
+    # predictions = tree_pred(x=pima_indians[:,:8], tr=tree_grow(x=pima_indians[:,:8], y=pima_indians[:,8], nmin=20, minleaf=5, nfeat=pima_indians.shape[1]-1), dataset='pima indians')
+    # print(np.mean(pima_indians[:,8] == y_pred))
+
+    # Time profile of credit data prediction with single tree
+    # cProfile.run("tree_pred(x=credit_data[:,:5], tr=tree_grow(x=credit_data[:,0:5], y=credit_data[:,5], nmin=2, minleaf=1, nfeat=5), dataset='credit score')", 'restats')
+
+    # Time profile of pima indians data prediction with single tree
+    cProfile.run("tree_pred(x=pima_indians[:,:8], tr=tree_grow(x=pima_indians[:,:8], y=pima_indians[:,8], nmin=20, minleaf=5, nfeat=pima_indians.shape[1]-1), dataset='pima indians')", 'restats')
+
 
-    tree_pred(**tree_pred_defaults)
+    p = pstats.Stats('restats')
+    p.sort_stats(SortKey.TIME)
+    p.print_stats()
author	Mike Vink <mike1994vink@gmail.com>	2020-09-24 23:08:40 +0200
committer	Mike Vink <mike1994vink@gmail.com>	2020-09-24 23:08:40 +0200
commit	53f6101594a410b6fa6f924d4c256f5f0001c0e5 (patch)
tree	ac76678997913e8bb60eca34137d25665dd93b64
parent	35c613a547bb209cc6e9a289019c1ad3feff5fd4 (diff)