summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--assignment1.py186
-rw-r--r--getting_started.py112
2 files changed, 186 insertions, 112 deletions
diff --git a/assignment1.py b/assignment1.py
new file mode 100644
index 0000000..9acd43e
--- /dev/null
+++ b/assignment1.py
@@ -0,0 +1,186 @@
+import numpy as np
+
+credit_data = np.genfromtxt('./credit_score.txt',
+ delimiter=',',
+ skip_header=True)
+
+# "credit_data" is now a 2d NumPy array. Each rows represent a record and the
+# columns represent the data attributes.
+# [(22, 0, 0, 28, 1, 0)
+# (46, 0, 1, 32, 0, 0)
+# (24, 1, 1, 24, 1, 0)
+# (25, 0, 0, 27, 1, 0)
+# (29, 1, 1, 32, 0, 0)
+# (45, 1, 1, 30, 0, 1)
+# (63, 1, 1, 58, 1, 1)
+# (36, 1, 0, 52, 1, 1)
+# (23, 0, 1, 40, 0, 1)
+# (50, 1, 1, 28, 0, 1)]
+
+
+class Tree():
+ """
+ @todo: docstring for Tree
+ """
+ def __init__(self, tr_d_structure):
+ """@todo: Docstring for init method.
+
+ /tr_d_structure/ @todo
+
+ """
+ self.tr_d_structure = tr_d_structure
+
+ def __repr__(self):
+ return str(self.tr_d_structure)
+
+
+tree_grow_defaults = {
+ 'x': credit_data[:, :5],
+ 'y': credit_data[:, 5],
+ 'n_min': 0,
+ 'min_leaf': 0,
+ 'n_feat': 5
+}
+
+
+def tree_grow(x=None,
+ y=None,
+ n_min=None,
+ min_leaf=None,
+ n_feat=None,
+ **defaults) -> Tree:
+ """
+ @todo: Docstring for tree_grow
+ """
+ print(
+ "All attribute columns in credit data (to be exhaustively searched per split):\n",
+ x, "\n")
+ print("Class label column in credit data:\n", y, "\n")
+ print(
+ f'Current node will be leaf node if (( (number of data "tuples" in child node) < {n_min=} )) \n'
+ )
+ print(
+ f'Split will not happen if (( (number of data "tuples" potential split) < {min_leaf=} ))\n'
+ )
+ print(
+ f"Number of features/attributes to be randomly drawn from {x=} to be considered for a split, should only be lower than {len(x[0,:])=} for random forest growing, {n_feat=}"
+ )
+ tr_d_structure = {}
+ return Tree(tr_d_structure)
+
+
+# Calling the function, unpacking default as argument
+# print(tree_grow(**tree_grow_defaults))
+
+tree_pred_defaults = {
+ 'x': credit_data[:, :5],
+ 'tr': tree_grow(**tree_grow_defaults)
+}
+
+
+def tree_pred(x=None, tr=None, **defaults) -> np.array:
+ """
+ @todo: Docstring for tree_pred
+ """
+ print("\n\n#########Tree_pred output start:\n")
+ print(f"Drop a row in {x=} down the tree {tr.__repr__()}")
+
+
+tree_pred(**tree_pred_defaults)
+
+#
+#
+# Put all helper functions below this comment!
+
+def impurity(array) -> int:
+ """
+ Assumes the argument array is a one dimensional vector of zeroes and ones.
+ Computes the gini index impurity based on the relative frequency of ones in
+ the vector.
+
+ Example:
+
+ >>> array=np.array([1,0,1,1,1,0,0,1,1,0,1])
+ >>> array
+ array([1,0,1,1,1,0,0,1,1,0,1])
+
+ >>> impurity(array)
+ 0.23140495867768596
+ """
+ # Total labels
+ n_labels = len(array)
+ # Number of tuples labeled 1
+ n_labels_1 = array.sum()
+ # Calculate the relative frequency of ones with respect to the total labels
+ rel_freq_1 = n_labels_1 / n_labels
+ # Use the symmetry around the median property to also calculate the
+ # relative frequency of zeroes
+ rel_freq_0 = 1 - rel_freq_1
+ # Multiply the frequencies to get the gini index
+ gini_index = rel_freq_1 * rel_freq_0
+ return gini_index
+
+
+# array=np.array([1,0,1,1,1,0,0,1,1,0,1])
+# print(impurity(array))
+# Should give 0.23....
+
+
+def bestsplit(x, y) -> int:
+ """
+ x = vector of num values
+ y = vector of class labels ... array([{x: x is 0 or 1}]) ??
+
+ Consider splits of type "x <= c" where "c" is the average of two consecutive
+ values of x in the sorted order.
+
+ x and y must be of the same length
+
+ y[i] must be the class label of the i-th observation, and x[i] is the
+ correspnding value of attribute x
+
+ Example (best split on income):
+
+ >>> bestsplit(credit_data[:,3],credit_data[:,5])
+ 36
+ """
+ # Sort all unique attribute values
+ num_attr_sorted = np.sort(np.unique(x))
+
+ # Use python vector addition to add all corresponding consecutive column
+ # elements and take their average
+ consec_avg_attr_splitpoints = (num_attr_sorted[0:7] +
+ num_attr_sorted[1:8]) / 2
+
+ # Convert array to list
+ split_points = list(consec_avg_attr_splitpoints)
+
+ # Prepare the constants for the delta impurity equation
+ impurity_parent_node = impurity(y)
+ n_obs_parent_node = len(y)
+
+ # Init return list
+ split_points_delta_impurities = []
+ while split_points:
+ # compute child nodes class vectors for the split value
+ split_point = split_points.pop()
+ child_node = {"l": y[x > split_point], "r": y[x <= split_point]}
+
+ # Take the weighted average of child node impurities
+ w_avg_child_impurities = (
+ impurity(child_node["l"]) * len(child_node["l"]) + impurity(
+ child_node["r"]) * len(child_node["r"])) / n_obs_parent_node
+
+ # Add the used split point and delta impurity to the return list
+ split_points_delta_impurities += [
+ (split_point, impurity_parent_node - w_avg_child_impurities)
+ ]
+
+ # Take the maximum of the list, and unpack
+ best_split, best_delta_impurity = max(split_points_delta_impurities,
+ key=lambda x: x[1])
+ return best_split
+
+
+# print(bestsplit(credit_data[:,3],credit_data[:,5]))
+# Should give 36
diff --git a/getting_started.py b/getting_started.py
deleted file mode 100644
index 34d042d..0000000
--- a/getting_started.py
+++ /dev/null
@@ -1,112 +0,0 @@
-import numpy as np
-
-credit_data = np.genfromtxt('./credit_score.txt',
- delimiter=',',
- skip_header=True)
-
-# "credit_data" is now a 2d NumPy array. Each rows represent a record and the
-# columns represent the data attributes.
-# [(22, 0, 0, 28, 1, 0)
-# (46, 0, 1, 32, 0, 0)
-# (24, 1, 1, 24, 1, 0)
-# (25, 0, 0, 27, 1, 0)
-# (29, 1, 1, 32, 0, 0)
-# (45, 1, 1, 30, 0, 1)
-# (63, 1, 1, 58, 1, 1)
-# (36, 1, 0, 52, 1, 1)
-# (23, 0, 1, 40, 0, 1)
-# (50, 1, 1, 28, 0, 1)]
-
-def impurity(array) -> None:
- """
- @todo: Docstring for
- """
- # Assumes array is a 1 dimensional array, the slice is actually arbitrary i think
- n_observations = len(array[0:])
- # print('Total observations is the cardinality of the vector containing all class labels:', n_observations)
-
- n_labels_1 = array[0:].sum()
- # print('Since we are working with binary class labels the amount of "1" labels equals the sum of the class label vector:', n_labels_1)
-
- # Calculate the relative frequency of label 1 with respect to the total sample size
- rel_freq_1 = n_labels_1 / n_observations
-
- # Use the symmetry property to also calculate the relative frequency of zeroes
- rel_freq_0 = 1 - rel_freq_1
- # print('\nThe rel. freq. of 1: ', rel_freq_1)
- # print('\nThe rel. freq. of 0: ', rel_freq_0)
- gini_index = rel_freq_1 * rel_freq_0
- # print('\nThe gini index: ', gini_index)
- # pass
- return gini_index
-
-
-# impurity(test_array)
-
-# x = vector of num values
-# y = vector of class labels ... array([0,1]) ??
-#
-# x and y must be of the same length
-#
-# y[i] must be the class label of the i-th observation, and x[i] is the
-# correspnding value of attribute x
-#
-# Consider splits of type "x <= c" where "c" is the average of two consecutive
-# values of x in the sorted order.
-#
-# So one child contains all elements with
-# "x <= c" and the other child contains all elements with "x > c". This should
-# be considered depending on the modality and skew of the attribute value
-# distribution I think, in an undesirable edge case you might for example
-# consider a child split without observations in it. Here we prevent this
-# putting the condition that the split value has to be in the middle of two
-# attribute values, meaning that there is at least one observation in each
-# child node.
-#
-# We are given already the class labels from the credit_data array
-y = credit_data[:, 5]
-# print(y)
-# And in the example the splits are done based on the income
-#
-# Now we can choose some attribute from the array to make a split on.
-x = credit_data[:, 3]
-
-
-def bestsplit(x, y) -> None:
- """
- @todo: Docstring for bestsplit
- """
- # Make it unique since we don't want two the same split points
- num_attr_sorted = np.sort(np.unique(x))
- # print(num_attr_sorted)
- # print(type(num_attr_sorted))
-
- # Use python vector addition to add all corresponding elements and take
- # their average
- consec_avg_attr_splitpoints = (num_attr_sorted[0:7] +
- num_attr_sorted[1:8]) / 2
-
- split_points = list(consec_avg_attr_splitpoints)
- # print(consec_avg_attr_splitpoints)
- # print(type(consec_avg_attr_splitpoints))
-
- impurity_parent_node = impurity(y)
- n_obs_parent_node = len(y)
- split_points_delta_impurities = []
- while split_points:
- split_point = split_points.pop()
- # print(split_points)
- # print('Popped:', split_point)
- child_node = {"l": y[x > split_point], "r": y[x <= split_point]}
- w_avg_child_impurities = (
- impurity(child_node["l"]) * len(child_node["l"]) + impurity(
- child_node["r"]) * len(child_node["r"])) / n_obs_parent_node
- split_points_delta_impurities += [(split_point,
- impurity_parent_node - w_avg_child_impurities)]
-
- # print(split_points_delta_impurities)
- best_split, best_delta_impurity = max(split_points_delta_impurities, key=lambda x: x[1])
- print(f"{best_split=}, {best_delta_impurity=}")
- # print('reached the end')
-
-bestsplit(x,y)