diff options
| author | Mike Vink <mike1994vink@gmail.com> | 2020-09-16 02:35:13 +0200 |
|---|---|---|
| committer | Mike Vink <mike1994vink@gmail.com> | 2020-09-16 02:35:13 +0200 |
| commit | 9277a0790d4925f1fd50c789cc83a27096f543e7 (patch) | |
| tree | 7b57dd5eababd794e6387fdb72ed7878fddcdfc5 | |
| parent | 220afa88dc69c1eb59a1ba3c75a6936f40bc156a (diff) | |
Major update: obj oriented tree
| -rw-r--r-- | assignment1.py | 306 | ||||
| -rw-r--r-- | gettingStarted.py | 100 | ||||
| -rw-r--r-- | message | 10 | ||||
| -rw-r--r-- | recursive_node.py | 58 |
4 files changed, 314 insertions, 160 deletions
diff --git a/assignment1.py b/assignment1.py index 4134270..a442f55 100644 --- a/assignment1.py +++ b/assignment1.py @@ -4,8 +4,7 @@ credit_data = np.genfromtxt('./credit_score.txt', delimiter=',', skip_header=True) -# "credit_data" is now a 2d NumPy array. Each rows represent a record and the -# columns represent the data attributes. +# age,married,house,income,gender,class # [(22, 0, 0, 28, 1, 0) # (46, 0, 1, 32, 0, 0) # (24, 1, 1, 24, 1, 0) @@ -18,137 +17,57 @@ credit_data = np.genfromtxt('./credit_score.txt', # (50, 1, 1, 28, 0, 1)] -class Tree(): +class Node(): """ - @todo: docstring for Tree + @todo: docstring for Node """ - def __init__(self, d_structure): + def __init__(self, value=None): """@todo: Docstring for init method. - /d_structure/ @todo + /value=None/ @todo """ - self.d_structure = d_structure + self.value = value - def __repr__(self): - return str(self.d_structure) - - def drop_left(self) -> dict: + def add_split(self, left, right): """ - @todo: Docstring for drop_left + @todo: Docstring for add_split """ - # Need to change - print("Dropping left:\n", self.prediction["left"]) - return self.prediction["left"] - - def drop_right(self) -> dict: - """ - @todo: Docstring for drop_right - """ - # Need to change - print("Dropping right:\n", self.prediction["right"]) - return self.prediction["right"] + self.left = left + self.right = right -tree_grow_defaults = { - 'x': credit_data[:, :5], - 'y': credit_data[:, 5], - 'n_min': 0, - 'min_leaf': 0, - 'n_feat': 5 -} +class Leaf: + def __init__(self, predicted_class: int): + self.predicted_class = predicted_class -def tree_grow(x=None, - y=None, - n_min=None, - min_leaf=None, - n_feat=None, - **defaults) -> Tree: - """ - @todo: Docstring for tree_grow - """ - print( - "All attribute columns in credit data (to be exhaustively searched per split):\n", - x, "\n") - print("Class label column in credit data:\n", y, "\n") - print( - f'Current node will be leaf node if (( (number of data "tuples" in child node) < 
{n_min=} )) \n' - ) - print( - f'Split will not happen if (( (number of data "tuples" potential split) < {min_leaf=} ))\n' - ) - print( - f"Number of features/attributes to be randomly drawn from {x=} to be considered for a split, should only be lower than {len(x[0,:])=} for random forest growing, {n_feat=}" - ) - d_structure = {"root": ((0,30), {"left": (None, 1), "right": (None, 0)})} # dummy data - return Tree(d_structure) - - -# Calling the function, unpacking default as argument -# print(tree_grow(**tree_grow_defaults)) - -tree_pred_defaults = { - 'x': credit_data[:, :5], - 'tr': tree_grow(**tree_grow_defaults) -} - - -def tree_pred(x=None, tr=None, **defaults) -> np.array: +class Tree(): """ - @todo: Docstring for tree_pred + @todo: docstring for Tree """ - print("\n\n#########Tree_pred output start:\n") - print(f"Drop a row in {x=} down the tree {tr.__repr__()}") - # x should be a set of rows that represent cases to be dropped in the tree, - # we iterate over the numpy rows - y = np.array([]) - for case in x: - # Assumes that tr is a data structure with the following form - # - # {"root":(split, {"left":(...), "right":(...)}) - # - # Where the (...) is recursion of the pattern tuple(tuple, dict) - - # Copy the data structure of the tree, not very efficient? - tr.prediction = tr.d_structure - - split, tr.prediction = tr.prediction["root"] - # Unpack the split info we need to do the first split - # c is the split value for the drop, and col is the attribute/feature - # we are dropping on - col, c = split - while isinstance(tr.prediction, dict): - # The drop methods return a tuple by giving the current - # d_structure a key that is either "left" or right": - # def drop_left(self): - # ... - # return tr.prediction["left"] - # - # Don't even need to check for leaf node, since the while loop - # should do that. 
- col, c = split - if case[col] > c: - split, tr.prediction = tr.drop_right() - elif case[col] <= c: - split, tr.prediction = tr.drop_left() - # Assumes that the leaf nodes in tr.d_structure is the leaf node - # that is just the majority(!) class label, which is just the integer 1 or 0. - # - # This is also the reason we break out of the while loop, when we - # arrive to a leaf node with the drop methods, the tr.d_structure - # type is no longer a dict, but an int. - # print(tr.prediction) - y = np.append(y, tr.prediction) - print("Predictions from the tree:\n", y) - return y + def __init__(self, root_node_obj): + """@todo: Docstring for init method. + /root_node_obj/ @todo -tree_pred(**tree_pred_defaults) + """ + self.tree = root_node_obj -# -# -# Put all helper functions below this comment! + def __repr__(self): + nodelist = [self.tree] + tree_str = '' + while nodelist: + current_node = nodelist.pop() + # print(current_node.value) + try: + childs = [current_node.right, current_node.left] + nodelist += childs + except AttributeError: + pass + col, c = current_node.value + tree_str += f"{col=}, {c=}" + return tree_str def impurity(array) -> int: @@ -180,11 +99,6 @@ def impurity(array) -> int: return gini_index -# array=np.array([1,0,1,1,1,0,0,1,1,0,1]) -# print(impurity(array)) -# Should give 0.23.... 
- - def bestsplit(x, y) -> int: """ x = vector of num values @@ -203,43 +117,115 @@ def bestsplit(x, y) -> int: >>> bestsplit(credit_data[:,3],credit_data[:,5]) 36 """ - # Sort all unique attribute values - num_attr_sorted = np.sort(np.unique(x)) - - # Use python vector addition to add all corresponding consecutive column - # elements and take their average - consec_avg_attr_splitpoints = (num_attr_sorted[0:7] + - num_attr_sorted[1:8]) / 2 - - # Convert array to list - split_points = list(consec_avg_attr_splitpoints) - - # Prepare the constants for the delta impurity equation - impurity_parent_node = impurity(y) - n_obs_parent_node = len(y) - - # Init return list - split_points_delta_impurities = [] - while split_points: - # compute child nodes class vectors for the split value - split_point = split_points.pop() - child_node = {"l": y[x > split_point], "r": y[x <= split_point]} - - # Take the weighted average of child node impurities - w_avg_child_impurities = ( - impurity(child_node["l"]) * len(child_node["l"]) + impurity( - child_node["r"]) * len(child_node["r"])) / n_obs_parent_node - - # Add the used split point and delta impurity to the return list - split_points_delta_impurities += [ - (split_point, impurity_parent_node - w_avg_child_impurities) - ] - - # Take the maximum of the list, and unpack - best_split, best_delta_impurity = max(split_points_delta_impurities, - key=lambda x: x[1]) - return best_split - - -# print(bestsplit(credit_data[:,3],credit_data[:,5])) -# Should give 36 + x_sorted = np.sort(np.unique(x)) + if len(x_sorted) <= 2: + # Turns binary attributes from [0,1] to [0, 0.5], allows us to use + # normal slices + split_points = x_sorted / 2 + else: + split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2 + + best_dict = None + for split in split_points: + # Returns boolean matrix which can be used for slicing x + x_slices = { + "left": np.index_exp[x > split], + "right": np.index_exp[x <= split] + } + + # delta_i formule + delta_i = 
impurity(y) - (len(y[x_slices["left"]]) * impurity( + y[x_slices["left"]]) + len(y[x_slices["right"]]) * + impurity(y[x_slices["right"]])) / len(y) + + print(f"{split=}, {delta_i=}") + if best_dict is not None: + if delta_i > best_dict["delta_i"]: + best_dict = {"x_slices": x_slices, "split": split, "delta_i":delta_i} + else: + best_dict = {"slices": x_slices, "split": split, "delta_i":delta_i} + return best_dict + +# +# +# Put all helper functions above this comment! + +def tree_grow(x=None, + y=None, + n_min=None, + min_leaf=None, + n_feat=None, + **defaults) -> Tree: + """ + @todo: Docstring for tree_grow + """ + all_rows = np.index_exp[:] + # Initiate the nodelist with tuples of slice and class labels + nodelist = [Node(value={"slices": all_rows})] + tree = Tree(nodelist[0]) + while nodelist: + current_node = nodelist.pop() + slices = current_node.value["slices"] + print(f"Exhaustive split search says, new node will check these rows for potential spliterinos:\n{x[slices]}") + + if impurity(y[slices]) > 0: + + # bestsplit(col, node_labels) -> + # {"x_slices": (boolean matrix, boolean matrix), "split": numpyfloat, "best_delta_i": numpyfloat} + + # slices used for knowing where to split on the next node. 
+ # best_split saved in current_node.value + # best_delta_i used to find best split among x_columns + best_dict = None + for x_col in x[slices].transpose(): + print("\nExhaustive split search says; \"Entering new column\":") + col_split_dict = bestsplit(x_col, y[slices]) + + if best_dict is not None: + if col_split_dict["delta_i"] > best_dict["delta_i"]: + best_dict = col_split_dict + else: + best_dict = col_split_dict + print("\nThe best split for current node:", best_dict) + + return tree + + + +def tree_pred(x=None, tr=None, **defaults) -> np.array: + """ + @todo: Docstring for tree_pred + """ + y = np.array([]) + return y + + +if __name__ == '__main__': + #### IMPURITY TEST + # array=np.array([1,0,1,1,1,0,0,1,1,0,1]) + # print(impurity(array)) + # Should give 0.23.... + + #### BESTSPLIT TEST + # print(bestsplit(credit_data[:, 3], credit_data[:, 5])) + # Should give 36 + + #### TREE_GROW TEST + tree_grow_defaults = { + 'x': credit_data[:, :5], + 'y': credit_data[:, 5], + 'n_min': 2, + 'min_leaf': 1, + 'n_feat': 5 + } + + # Calling the tree grow, unpacking default as argument + tree_grow(**tree_grow_defaults) + + # tree_pred_defaults = { + # 'x': credit_data[:, :5], + # 'tr': tree_grow(**tree_grow_defaults) + # } + + # tree_pred(**tree_pred_defaults) + diff --git a/gettingStarted.py b/gettingStarted.py new file mode 100644 index 0000000..3a8d907 --- /dev/null +++ b/gettingStarted.py @@ -0,0 +1,100 @@ +import numpy as np
+import random
+import math
+from copy import deepcopy
+
+credit_data = np.genfromtxt('/Users/mikevink/Documents/python/2020_data_mining_assignments/credit_score.txt', delimiter=',', skip_header=True)
+
+#print(credit_data)
+#print(credit_data[0])
+#print(credit_data[:,3])
+#print(credit_data[4,0])
+#print(np.sort(np.unique(credit_data[:,3]))) #Give the distinct values of income, sorted from low to high
+#print(np.sum(credit_data[:,5]))
+#print(credit_data.sum(axis=0)) #Add the entries of each column of credit_data
+#print(credit_data.sum(axis=1)) #Add the entries of each row
+#print(credit_data[credit_data[:,0] > 27]) # Select all rows where the first column is bigger than 27
+#
+#x = np.array([2, 5, 10])
+#print(x)
+#print(np.arange(0, 10))
+#
+#print(np.arange(0, 10)[credit_data[:,0] > 27]) #Select the *row numbers* of the rows where the first column of credit_data is bigger than 27
+#
+#index = np.random.choice(np.arange(0, 10), size=5, replace=False) #Draw a random sample of size 5 from the numbers 1 through 10 (without replacement)
+#print(index)
+#train = credit_data[index,]
+#print(train)
+#test = np.delete(credit_data, index, axis=0) #Select all rows with row number not in "index"
+#print(test)
+#
+#print(random.choice(train))
+
+
+### Practice exercise 1 ###
def impurity(vector):
    """Return the impurity p0 * p1 of a binary class-label vector.

    vector -- sequence (or 1-D numpy array) of 0/1 class labels.

    For labels [1,0,1,1,1,0,0,1,1,0,1] this gives 28/121 ~= 0.2314,
    matching the practice-exercise example.

    Fix: return 0.0 for an empty vector (a node with no observations is
    pure); previously this raised ZeroDivisionError.
    """
    n = len(vector)
    if n == 0:
        return 0.0
    ones = sum(vector)   # count of class-1 labels
    zeros = n - ones     # count of class-0 labels
    return (zeros / n) * (ones / n)
+
+array=np.array([1,0,1,1,1,0,0,1,1,0,1])
+print(impurity(array))
+
+
+### Practice exercise 2 ###
def bestsplit(x, y):
    """Return the split value on numeric attribute x that best reduces
    the impurity of class-label vector y.

    Candidate splits are the midpoints between consecutive distinct
    values of x.  A split's quality is the *size-weighted* average of
    the two child impurities, per the standard impurity-reduction
    criterion.  (The previous version summed the raw child impurities,
    which over-penalises unbalanced splits — flagged in the repo's
    `message` notes.)

    x -- 1-D numpy array of numeric attribute values
    y -- 1-D numpy array of 0/1 class labels, same length as x

    Returns None when x has fewer than two distinct values (no split is
    possible); previously `best_split` could be referenced unbound.
    """
    x_sorted = np.sort(np.unique(x))
    split_points = (x_sorted[:-1] + x_sorted[1:]) / 2

    best_split = None
    best_quality = math.inf
    n = len(y)
    for split in split_points:
        left = y[x <= split]
        right = y[x > split]
        # Weighted-average impurity of the children: lower is better.
        quality = (len(left) * impurity(left)
                   + len(right) * impurity(right)) / n
        if quality < best_quality:
            best_split = split
            best_quality = quality

    return best_split
+
+print(bestsplit(credit_data[:,3], credit_data[:,5]))
+
+
+
class Node:
    """Internal decision node: routes values <= split_value to the left
    child and values > split_value to the right child."""

    def __init__(self):
        # Bug fix: this was spelled `_init_` (single underscores), so
        # Python never called it and instances started with no
        # attributes at all; callers only worked because they assigned
        # all three attributes manually after construction.
        self.left = None         # subtree for values <= split_value
        self.right = None        # subtree for values > split_value
        self.split_value = None  # numeric threshold, set by tree_grow
+
class Leaf:
    """Terminal node of the tree: always predicts one class label."""

    def __init__(self, predicted_class: int):
        """Remember the class label (0 or 1) this leaf predicts."""
        self.predicted_class = predicted_class
+
+
def tree_grow(x, y):
    """Grow a one-split decision stump.

    x -- numeric attribute values; y -- 0/1 class labels.

    The root splits at bestsplit(x, y); the left child always predicts
    class 0 and the right child class 1 (toy placeholder behaviour).
    """
    stump = Node()
    stump.split_value = bestsplit(x, y)
    stump.left, stump.right = Leaf(0), Leaf(1)
    return stump
+
def tree_pred(x, tr):
    """Drop every value in x down tree tr; return the list of predicted
    class labels, in the same order as x."""
    return [single_value_pred(value, tr) for value in x]
+
def single_value_pred(value, current_tree):
    """Predict the class for a single value by walking the tree.

    Iterative equivalent of the original recursive descent: at each
    internal node go left when value <= split_value, right otherwise,
    until a Leaf is reached.
    """
    node = current_tree
    while not isinstance(node, Leaf):
        node = node.left if value <= node.split_value else node.right
    return node.predicted_class
+
+tree = tree_grow(credit_data[:,3], credit_data[:,5])
+print(tree_pred([32, 38, 3, 40], tree))
+
+
+
@@ -0,0 +1,10 @@ +heb twee punten: + +1. Impurity_after_split (line 52) is som over child impurities, in plaats +daarvan moet het gewogen gemiddelde tot de len(new_y) / len(old_y) zijn denk ik +(misschien? want het output hetzelfde als het voorbeeld). + +2. Ik denk dat je op deze manier steeds een sub-tree/ een node-recursie aan +single_value_pred geeft. Wat is de beste manier om een node uit de tree aan te +spreken? + diff --git a/recursive_node.py b/recursive_node.py new file mode 100644 index 0000000..22bc75d --- /dev/null +++ b/recursive_node.py @@ -0,0 +1,58 @@ +import numpy as np + + +class Node(): + """ + @todo: docstring for Node + """ + def __init__(self, value=None): + """@todo: Docstring for init method. + + /value=None/ @todo + + """ + self.value = value + + def add_split(self, left, right): + """ + @todo: Docstring for add_split + """ + self.left = left + self.right = right + + +class Tree(): + """ + @todo: docstring for Tree + """ + def __init__(self, root_node_obj): + """@todo: Docstring for init method. + + /root_node_obj/ @todo + + """ + self.tree = root_node_obj + + def __repr__(self): + nodelist = [self.tree] + tree_str = '' + while nodelist: + current_node = nodelist.pop() + # print(current_node.value) + try: + childs = [current_node.right, current_node.left] + nodelist += childs + except AttributeError: + pass + tree_str += current_node.value + return tree_str + + +n1 = Node(value="root\n") +n2 = Node(value="left child of n1, ") +n3 = Node(value="right child of n1") + +n1.add_split(n2, n3) + +my_tree = Tree(n1) +print(my_tree) |
