summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMike Vink <mike1994vink@gmail.com>2020-09-16 02:35:13 +0200
committerMike Vink <mike1994vink@gmail.com>2020-09-16 02:35:13 +0200
commit9277a0790d4925f1fd50c789cc83a27096f543e7 (patch)
tree7b57dd5eababd794e6387fdb72ed7878fddcdfc5
parent220afa88dc69c1eb59a1ba3c75a6936f40bc156a (diff)
Major update: obj oriented tree
-rw-r--r--assignment1.py306
-rw-r--r--gettingStarted.py100
-rw-r--r--message10
-rw-r--r--recursive_node.py58
4 files changed, 314 insertions, 160 deletions
diff --git a/assignment1.py b/assignment1.py
index 4134270..a442f55 100644
--- a/assignment1.py
+++ b/assignment1.py
@@ -4,8 +4,7 @@ credit_data = np.genfromtxt('./credit_score.txt',
delimiter=',',
skip_header=True)
-# "credit_data" is now a 2d NumPy array. Each rows represent a record and the
-# columns represent the data attributes.
+# age,married,house,income,gender,class
# [(22, 0, 0, 28, 1, 0)
# (46, 0, 1, 32, 0, 0)
# (24, 1, 1, 24, 1, 0)
@@ -18,137 +17,57 @@ credit_data = np.genfromtxt('./credit_score.txt',
# (50, 1, 1, 28, 0, 1)]
-class Tree():
+class Node():
"""
- @todo: docstring for Tree
+ @todo: docstring for Node
"""
- def __init__(self, d_structure):
+ def __init__(self, value=None):
"""@todo: Docstring for init method.
- /d_structure/ @todo
+ /value=None/ @todo
"""
- self.d_structure = d_structure
+ self.value = value
- def __repr__(self):
- return str(self.d_structure)
-
- def drop_left(self) -> dict:
+ def add_split(self, left, right):
"""
- @todo: Docstring for drop_left
+ @todo: Docstring for add_split
"""
- # Need to change
- print("Dropping left:\n", self.prediction["left"])
- return self.prediction["left"]
-
- def drop_right(self) -> dict:
- """
- @todo: Docstring for drop_right
- """
- # Need to change
- print("Dropping right:\n", self.prediction["right"])
- return self.prediction["right"]
+ self.left = left
+ self.right = right
-tree_grow_defaults = {
- 'x': credit_data[:, :5],
- 'y': credit_data[:, 5],
- 'n_min': 0,
- 'min_leaf': 0,
- 'n_feat': 5
-}
+class Leaf:
+ def __init__(self, predicted_class: int):
+ self.predicted_class = predicted_class
-def tree_grow(x=None,
- y=None,
- n_min=None,
- min_leaf=None,
- n_feat=None,
- **defaults) -> Tree:
- """
- @todo: Docstring for tree_grow
- """
- print(
- "All attribute columns in credit data (to be exhaustively searched per split):\n",
- x, "\n")
- print("Class label column in credit data:\n", y, "\n")
- print(
- f'Current node will be leaf node if (( (number of data "tuples" in child node) < {n_min=} )) \n'
- )
- print(
- f'Split will not happen if (( (number of data "tuples" potential split) < {min_leaf=} ))\n'
- )
- print(
- f"Number of features/attributes to be randomly drawn from {x=} to be considered for a split, should only be lower than {len(x[0,:])=} for random forest growing, {n_feat=}"
- )
- d_structure = {"root": ((0,30), {"left": (None, 1), "right": (None, 0)})} # dummy data
- return Tree(d_structure)
-
-
-# Calling the function, unpacking default as argument
-# print(tree_grow(**tree_grow_defaults))
-
-tree_pred_defaults = {
- 'x': credit_data[:, :5],
- 'tr': tree_grow(**tree_grow_defaults)
-}
-
-
-def tree_pred(x=None, tr=None, **defaults) -> np.array:
+class Tree():
"""
- @todo: Docstring for tree_pred
+ @todo: docstring for Tree
"""
- print("\n\n#########Tree_pred output start:\n")
- print(f"Drop a row in {x=} down the tree {tr.__repr__()}")
- # x should be a set of rows that represent cases to be dropped in the tree,
- # we iterate over the numpy rows
- y = np.array([])
- for case in x:
- # Assumes that tr is a data structure with the following form
- #
- # {"root":(split, {"left":(...), "right":(...)})
- #
- # Where the (...) is recursion of the pattern tuple(tuple, dict)
-
- # Copy the data structure of the tree, not very efficient?
- tr.prediction = tr.d_structure
-
- split, tr.prediction = tr.prediction["root"]
- # Unpack the split info we need to do the first split
- # c is the split value for the drop, and col is the attribute/feature
- # we are dropping on
- col, c = split
- while isinstance(tr.prediction, dict):
- # The drop methods return a tuple by giving the current
- # d_structure a key that is either "left" or right":
- # def drop_left(self):
- # ...
- # return tr.prediction["left"]
- #
- # Don't even need to check for leaf node, since the while loop
- # should do that.
- col, c = split
- if case[col] > c:
- split, tr.prediction = tr.drop_right()
- elif case[col] <= c:
- split, tr.prediction = tr.drop_left()
- # Assumes that the leaf nodes in tr.d_structure is the leaf node
- # that is just the majority(!) class label, which is just the integer 1 or 0.
- #
- # This is also the reason we break out of the while loop, when we
- # arrive to a leaf node with the drop methods, the tr.d_structure
- # type is no longer a dict, but an int.
- # print(tr.prediction)
- y = np.append(y, tr.prediction)
- print("Predictions from the tree:\n", y)
- return y
+ def __init__(self, root_node_obj):
+ """@todo: Docstring for init method.
+ /root_node_obj/ @todo
-tree_pred(**tree_pred_defaults)
+ """
+ self.tree = root_node_obj
-#
-#
-# Put all helper functions below this comment!
+ def __repr__(self):
+ nodelist = [self.tree]
+ tree_str = ''
+ while nodelist:
+ current_node = nodelist.pop()
+ # print(current_node.value)
+ try:
+ childs = [current_node.right, current_node.left]
+ nodelist += childs
+ except AttributeError:
+ pass
+ col, c = current_node.value
+ tree_str += f"{col=}, {c=}"
+ return tree_str
def impurity(array) -> int:
@@ -180,11 +99,6 @@ def impurity(array) -> int:
return gini_index
-# array=np.array([1,0,1,1,1,0,0,1,1,0,1])
-# print(impurity(array))
-# Should give 0.23....
-
-
def bestsplit(x, y) -> int:
"""
x = vector of num values
@@ -203,43 +117,115 @@ def bestsplit(x, y) -> int:
>>> bestsplit(credit_data[:,3],credit_data[:,5])
36
"""
- # Sort all unique attribute values
- num_attr_sorted = np.sort(np.unique(x))
-
- # Use python vector addition to add all corresponding consecutive column
- # elements and take their average
- consec_avg_attr_splitpoints = (num_attr_sorted[0:7] +
- num_attr_sorted[1:8]) / 2
-
- # Convert array to list
- split_points = list(consec_avg_attr_splitpoints)
-
- # Prepare the constants for the delta impurity equation
- impurity_parent_node = impurity(y)
- n_obs_parent_node = len(y)
-
- # Init return list
- split_points_delta_impurities = []
- while split_points:
- # compute child nodes class vectors for the split value
- split_point = split_points.pop()
- child_node = {"l": y[x > split_point], "r": y[x <= split_point]}
-
- # Take the weighted average of child node impurities
- w_avg_child_impurities = (
- impurity(child_node["l"]) * len(child_node["l"]) + impurity(
- child_node["r"]) * len(child_node["r"])) / n_obs_parent_node
-
- # Add the used split point and delta impurity to the return list
- split_points_delta_impurities += [
- (split_point, impurity_parent_node - w_avg_child_impurities)
- ]
-
- # Take the maximum of the list, and unpack
- best_split, best_delta_impurity = max(split_points_delta_impurities,
- key=lambda x: x[1])
- return best_split
-
-
-# print(bestsplit(credit_data[:,3],credit_data[:,5]))
-# Should give 36
+ x_sorted = np.sort(np.unique(x))
+ if len(x_sorted) <= 2:
+ # Turns binary attributes from [0,1] to [0, 0.5], allows us to use
+ # normal slices
+ split_points = x_sorted / 2
+ else:
+ split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2
+
+ best_dict = None
+ for split in split_points:
+ # Returns boolean matrix which can be used for slicing x
+ x_slices = {
+ "left": np.index_exp[x > split],
+ "right": np.index_exp[x <= split]
+ }
+
+ # delta_i formule
+ delta_i = impurity(y) - (len(y[x_slices["left"]]) * impurity(
+ y[x_slices["left"]]) + len(y[x_slices["right"]]) *
+ impurity(y[x_slices["right"]])) / len(y)
+
+ print(f"{split=}, {delta_i=}")
+ if best_dict is not None:
+ if delta_i > best_dict["delta_i"]:
+ best_dict = {"x_slices": x_slices, "split": split, "delta_i":delta_i}
+ else:
+        best_dict = {"x_slices": x_slices, "split": split, "delta_i":delta_i}
+ return best_dict
+
+#
+#
+# Put all helper functions above this comment!
+
+def tree_grow(x=None,
+ y=None,
+ n_min=None,
+ min_leaf=None,
+ n_feat=None,
+ **defaults) -> Tree:
+ """
+ @todo: Docstring for tree_grow
+ """
+ all_rows = np.index_exp[:]
+ # Initiate the nodelist with tuples of slice and class labels
+ nodelist = [Node(value={"slices": all_rows})]
+ tree = Tree(nodelist[0])
+ while nodelist:
+ current_node = nodelist.pop()
+ slices = current_node.value["slices"]
+ print(f"Exhaustive split search says, new node will check these rows for potential spliterinos:\n{x[slices]}")
+
+ if impurity(y[slices]) > 0:
+
+ # bestsplit(col, node_labels) ->
+ # {"x_slices": (boolean matrix, boolean matrix), "split": numpyfloat, "best_delta_i": numpyfloat}
+
+ # slices used for knowing where to split on the next node.
+ # best_split saved in current_node.value
+ # best_delta_i used to find best split among x_columns
+ best_dict = None
+ for x_col in x[slices].transpose():
+ print("\nExhaustive split search says; \"Entering new column\":")
+ col_split_dict = bestsplit(x_col, y[slices])
+
+ if best_dict is not None:
+ if col_split_dict["delta_i"] > best_dict["delta_i"]:
+ best_dict = col_split_dict
+ else:
+ best_dict = col_split_dict
+ print("\nThe best split for current node:", best_dict)
+
+ return tree
+
+
+
+def tree_pred(x=None, tr=None, **defaults) -> np.array:
+ """
+ @todo: Docstring for tree_pred
+ """
+ y = np.array([])
+ return y
+
+
+if __name__ == '__main__':
+ #### IMPURITY TEST
+ # array=np.array([1,0,1,1,1,0,0,1,1,0,1])
+ # print(impurity(array))
+ # Should give 0.23....
+
+ #### BESTSPLIT TEST
+ # print(bestsplit(credit_data[:, 3], credit_data[:, 5]))
+ # Should give 36
+
+ #### TREE_GROW TEST
+ tree_grow_defaults = {
+ 'x': credit_data[:, :5],
+ 'y': credit_data[:, 5],
+ 'n_min': 2,
+ 'min_leaf': 1,
+ 'n_feat': 5
+ }
+
+ # Calling the tree grow, unpacking default as argument
+ tree_grow(**tree_grow_defaults)
+
+ # tree_pred_defaults = {
+ # 'x': credit_data[:, :5],
+ # 'tr': tree_grow(**tree_grow_defaults)
+ # }
+
+ # tree_pred(**tree_pred_defaults)
+
diff --git a/gettingStarted.py b/gettingStarted.py
new file mode 100644
index 0000000..3a8d907
--- /dev/null
+++ b/gettingStarted.py
@@ -0,0 +1,100 @@
+import numpy as np
+import random
+import math
+from copy import deepcopy
+
+credit_data = np.genfromtxt('/Users/mikevink/Documents/python/2020_data_mining_assignments/credit_score.txt', delimiter=',', skip_header=True)
+
+#print(credit_data)
+#print(credit_data[0])
+#print(credit_data[:,3])
+#print(credit_data[4,0])
+#print(np.sort(np.unique(credit_data[:,3]))) #Give the distinct values of income, sorted from low to high
+#print(np.sum(credit_data[:,5]))
+#print(credit_data.sum(axis=0)) #Add the entries of each column of credit_data
+#print(credit_data.sum(axis=1)) #Add the entries of each row
+#print(credit_data[credit_data[:,0] > 27]) # Select all rows where the first column is bigger than 27
+#
+#x = np.array([2, 5, 10])
+#print(x)
+#print(np.arange(0, 10))
+#
+#print(np.arange(0, 10)[credit_data[:,0] > 27]) #Select the *row numbers* of the rows where the first column of credit_data is bigger than 27
+#
+#index = np.random.choice(np.arange(0, 10), size=5, replace=False) #Draw a random sample of size 5 from the numbers 1 through 10 (without replacement)
+#print(index)
+#train = credit_data[index,]
+#print(train)
+#test = np.delete(credit_data, index, axis=0) #Select all rows with row number not in "index"
+#print(test)
+#
+#print(random.choice(train))
+
+
+### Practice exercise 1 ###
+def impurity(vector): # vector = list of 0s and 1s
+ num_of_class_labels = len(vector)
+ num_of_class_1 = sum(vector)
+ num_of_class_0 = num_of_class_labels - num_of_class_1
+ return (num_of_class_0 / num_of_class_labels) * (num_of_class_1 / num_of_class_labels)
+
+array=np.array([1,0,1,1,1,0,0,1,1,0,1])
+print(impurity(array))
+
+
+### Practice exercise 2 ###
+def bestsplit(x, y): # x = numeric values; y = class labels
+ x_sorted = np.sort(np.unique(x))
+ split_points = (x_sorted[:len(x_sorted)-1] + x_sorted[1:]) / 2
+
+ best_impurity_after_split = math.inf
+ for split in split_points:
+ impurity_after_split = impurity(y[x <= split]) + impurity(y[x > split])
+ if impurity_after_split < best_impurity_after_split:
+ best_split = split
+ best_impurity_after_split = impurity_after_split
+
+ return best_split
+
+print(bestsplit(credit_data[:,3], credit_data[:,5]))
+
+
+
+class Node:
+    def __init__(self):
+ self.left = None
+ self.right = None
+ self.split_value = None
+
+class Leaf:
+ def __init__(self, predicted_class: int):
+ self.predicted_class = predicted_class
+
+
+def tree_grow(x, y): # x = numeric values; y = class labels
+ root = Node()
+ root.split_value = bestsplit(x, y)
+ root.left = Leaf(0)
+ root.right = Leaf(1)
+ return root
+
+def tree_pred(x, tr):
+ y = []
+ for value in x:
+ y.append(single_value_pred(value, tr))
+ return y
+
+def single_value_pred(value, current_tree):
+ if isinstance(current_tree, Leaf):
+ return current_tree.predicted_class
+ else:
+ if value <= current_tree.split_value:
+ return single_value_pred(value, current_tree.left)
+ else:
+ return single_value_pred(value, current_tree.right)
+
+tree = tree_grow(credit_data[:,3], credit_data[:,5])
+print(tree_pred([32, 38, 3, 40], tree))
+
+
+
diff --git a/message b/message
new file mode 100644
index 0000000..bc70ce8
--- /dev/null
+++ b/message
@@ -0,0 +1,10 @@
+heb twee punten:
+
+1. Impurity_after_split (line 52) is som over child impurities, in plaats
+daarvan moet het gewogen gemiddelde tot de len(new_y) / len(old_y) zijn denk ik
+(misschien? want het output hetzelfde als het voorbeeld).
+
+2. Ik denk dat je op deze manier steeds een sub-tree/ een node-recursie aan
+single_value_pred geeft. Wat is de beste manier om een node uit de tree aan te
+spreken?
+
diff --git a/recursive_node.py b/recursive_node.py
new file mode 100644
index 0000000..22bc75d
--- /dev/null
+++ b/recursive_node.py
@@ -0,0 +1,58 @@
+import numpy as np
+
+
+class Node():
+ """
+ @todo: docstring for Node
+ """
+ def __init__(self, value=None):
+ """@todo: Docstring for init method.
+
+ /value=None/ @todo
+
+ """
+ self.value = value
+
+ def add_split(self, left, right):
+ """
+ @todo: Docstring for add_split
+ """
+ self.left = left
+ self.right = right
+
+
+class Tree():
+ """
+ @todo: docstring for Tree
+ """
+ def __init__(self, root_node_obj):
+ """@todo: Docstring for init method.
+
+ /root_node_obj/ @todo
+
+ """
+ self.tree = root_node_obj
+
+ def __repr__(self):
+ nodelist = [self.tree]
+ tree_str = ''
+ while nodelist:
+ current_node = nodelist.pop()
+ # print(current_node.value)
+ try:
+ childs = [current_node.right, current_node.left]
+ nodelist += childs
+ except AttributeError:
+ pass
+ tree_str += current_node.value
+ return tree_str
+
+
+n1 = Node(value="root\n")
+n2 = Node(value="left child of n1, ")
+n3 = Node(value="right child of n1")
+
+n1.add_split(n2, n3)
+
+my_tree = Tree(n1)
+print(my_tree)