diff options
Diffstat (limited to 'assignment1.py')
| -rw-r--r-- | assignment1.py | 186 |
1 files changed, 186 insertions, 0 deletions
diff --git a/assignment1.py b/assignment1.py new file mode 100644 index 0000000..9acd43e --- /dev/null +++ b/assignment1.py @@ -0,0 +1,186 @@ +import numpy as np + +credit_data = np.genfromtxt('./credit_score.txt', + delimiter=',', + skip_header=True) + +# "credit_data" is now a 2d NumPy array. Each rows represent a record and the +# columns represent the data attributes. +# [(22, 0, 0, 28, 1, 0) +# (46, 0, 1, 32, 0, 0) +# (24, 1, 1, 24, 1, 0) +# (25, 0, 0, 27, 1, 0) +# (29, 1, 1, 32, 0, 0) +# (45, 1, 1, 30, 0, 1) +# (63, 1, 1, 58, 1, 1) +# (36, 1, 0, 52, 1, 1) +# (23, 0, 1, 40, 0, 1) +# (50, 1, 1, 28, 0, 1)] + + +class Tree(): + """ + @todo: docstring for Tree + """ + def __init__(self, tr_d_structure): + """@todo: Docstring for init method. + + /tr_d_structure/ @todo + + """ + self.tr_d_structure = tr_d_structure + + def __repr__(self): + return str(self.tr_d_structure) + + +tree_grow_defaults = { + 'x': credit_data[:, :5], + 'y': credit_data[:, 5], + 'n_min': 0, + 'min_leaf': 0, + 'n_feat': 5 +} + + +def tree_grow(x=None, + y=None, + n_min=None, + min_leaf=None, + n_feat=None, + **defaults) -> Tree: + """ + @todo: Docstring for tree_grow + """ + print( + "All attribute columns in credit data (to be exhaustively searched per split):\n", + x, "\n") + print("Class label column in credit data:\n", y, "\n") + print( + f'Current node will be leaf node if (( (number of data "tuples" in child node) < {n_min=} )) \n' + ) + print( + f'Split will not happen if (( (number of data "tuples" potential split) < {min_leaf=} ))\n' + ) + print( + f"Number of features/attributes to be randomly drawn from {x=} to be considered for a split, should only be lower than {len(x[0,:])=} for random forest growing, {n_feat=}" + ) + tr_d_structure = {} + return Tree(tr_d_structure) + + +# Calling the function, unpacking default as argument +# print(tree_grow(**tree_grow_defaults)) + +tree_pred_defaults = { + 'x': credit_data[:, :5], + 'tr': tree_grow(**tree_grow_defaults) +} + + +def tree_pred(x=None, tr=None, **defaults) -> np.array: + """ + @todo: Docstring for tree_pred + """ + print("\n\n#########Tree_pred output start:\n") + print(f"Drop a row in {x=} down the tree {tr.__repr__()}") + + +tree_pred(**tree_pred_defaults) + +# +# +# Put all helper functions below this comment! + +def impurity(array) -> int: + """ + Assumes the argument array is a one dimensional vector of zeroes and ones. + Computes the gini index impurity based on the relative frequency of ones in + the vector. + + Example: + + >>> array=np.array([1,0,1,1,1,0,0,1,1,0,1]) + >>> array + array([1,0,1,1,1,0,0,1,1,0,1]) + + >>> impurity(array) + 0.23140495867768596 + """ + # Total labels + n_labels = len(array) + # Number of tuples labeled 1 + n_labels_1 = array.sum() + # Calculate the relative frequency of ones with respect to the total labels + rel_freq_1 = n_labels_1 / n_labels + # Use the symmetry around the median property to also calculate the + # relative frequency of zeroes + rel_freq_0 = 1 - rel_freq_1 + # Multiply the frequencies to get the gini index + gini_index = rel_freq_1 * rel_freq_0 + return gini_index + + +# array=np.array([1,0,1,1,1,0,0,1,1,0,1]) +# print(impurity(array)) +# Should give 0.23.... + + +def bestsplit(x, y) -> int: + """ + x = vector of num values + y = vector of class labels ... array([{x: x is 0 or 1}]) ?? + + Consider splits of type "x <= c" where "c" is the average of two consecutive + values of x in the sorted order. + + x and y must be of the same length + + y[i] must be the class label of the i-th observation, and x[i] is the + correspnding value of attribute x + + Example (best split on income): + + >>> bestsplit(credit_data[:,3],credit_data[:,5]) + 36 + """ + # Sort all unique attribute values + num_attr_sorted = np.sort(np.unique(x)) + + # Use python vector addition to add all corresponding consecutive column + # elements and take their average + consec_avg_attr_splitpoints = (num_attr_sorted[0:7] + + num_attr_sorted[1:8]) / 2 + + # Convert array to list + split_points = list(consec_avg_attr_splitpoints) + + # Prepare the constants for the delta impurity equation + impurity_parent_node = impurity(y) + n_obs_parent_node = len(y) + + # Init return list + split_points_delta_impurities = [] + while split_points: + # compute child nodes class vectors for the split value + split_point = split_points.pop() + child_node = {"l": y[x > split_point], "r": y[x <= split_point]} + + # Take the weighted average of child node impurities + w_avg_child_impurities = ( + impurity(child_node["l"]) * len(child_node["l"]) + impurity( + child_node["r"]) * len(child_node["r"])) / n_obs_parent_node + + # Add the used split point and delta impurity to the return list + split_points_delta_impurities += [ + (split_point, impurity_parent_node - w_avg_child_impurities) + ] + + # Take the maximum of the list, and unpack + best_split, best_delta_impurity = max(split_points_delta_impurities, + key=lambda x: x[1]) + return best_split + + +# print(bestsplit(credit_data[:,3],credit_data[:,5])) +# Should give 36 |
