From ee067768dc96a78cbfd65e941ab8d706fa95127e Mon Sep 17 00:00:00 2001 From: Mike Vink Date: Tue, 15 Sep 2020 10:44:12 +0200 Subject: made the outline of the assignment --- assignment1.py | 186 +++++++++++++++++++++++++++++++++++++++++++++++++++++ getting_started.py | 112 -------------------------------- 2 files changed, 186 insertions(+), 112 deletions(-) create mode 100644 assignment1.py delete mode 100644 getting_started.py diff --git a/assignment1.py b/assignment1.py new file mode 100644 index 0000000..9acd43e --- /dev/null +++ b/assignment1.py @@ -0,0 +1,186 @@ +import numpy as np + +credit_data = np.genfromtxt('./credit_score.txt', + delimiter=',', + skip_header=True) + +# "credit_data" is now a 2d NumPy array. Each rows represent a record and the +# columns represent the data attributes. +# [(22, 0, 0, 28, 1, 0) +# (46, 0, 1, 32, 0, 0) +# (24, 1, 1, 24, 1, 0) +# (25, 0, 0, 27, 1, 0) +# (29, 1, 1, 32, 0, 0) +# (45, 1, 1, 30, 0, 1) +# (63, 1, 1, 58, 1, 1) +# (36, 1, 0, 52, 1, 1) +# (23, 0, 1, 40, 0, 1) +# (50, 1, 1, 28, 0, 1)] + + +class Tree(): + """ + @todo: docstring for Tree + """ + def __init__(self, tr_d_structure): + """@todo: Docstring for init method. + + /tr_d_structure/ @todo + + """ + self.tr_d_structure = tr_d_structure + + def __repr__(self): + return str(self.tr_d_structure) + + +tree_grow_defaults = { + 'x': credit_data[:, :5], + 'y': credit_data[:, 5], + 'n_min': 0, + 'min_leaf': 0, + 'n_feat': 5 +} + + +def tree_grow(x=None, + y=None, + n_min=None, + min_leaf=None, + n_feat=None, + **defaults) -> Tree: + """ + @todo: Docstring for tree_grow + """ + print( + "All attribute columns in credit data (to be exhaustively searched per split):\n", + x, "\n") + print("Class label column in credit data:\n", y, "\n") + print( + f'Current node will be leaf node if (( (number of data "tuples" in child node) < {n_min=} )) \n' + ) + print( + f'Split will not happen if (( (number of data "tuples" potential split) < {min_leaf=} ))\n' + ) + print( + f"Number of features/attributes to be randomly drawn from {x=} to be considered for a split, should only be lower than {len(x[0,:])=} for random forest growing, {n_feat=}" + ) + tr_d_structure = {} + return Tree(tr_d_structure) + + +# Calling the function, unpacking default as argument +# print(tree_grow(**tree_grow_defaults)) + +tree_pred_defaults = { + 'x': credit_data[:, :5], + 'tr': tree_grow(**tree_grow_defaults) +} + + +def tree_pred(x=None, tr=None, **defaults) -> np.array: + """ + @todo: Docstring for tree_pred + """ + print("\n\n#########Tree_pred output start:\n") + print(f"Drop a row in {x=} down the tree {tr.__repr__()}") + + +tree_pred(**tree_pred_defaults) + +# +# +# Put all helper functions below this comment! + +def impurity(array) -> int: + """ + Assumes the argument array is a one dimensional vector of zeroes and ones. + Computes the gini index impurity based on the relative frequency of ones in + the vector. + + Example: + + >>> array=np.array([1,0,1,1,1,0,0,1,1,0,1]) + >>> array + array([1,0,1,1,1,0,0,1,1,0,1]) + + >>> impurity(array) + 0.23140495867768596 + """ + # Total labels + n_labels = len(array) + # Number of tuples labeled 1 + n_labels_1 = array.sum() + # Calculate the relative frequency of ones with respect to the total labels + rel_freq_1 = n_labels_1 / n_labels + # Use the symmetry around the median property to also calculate the + # relative frequency of zeroes + rel_freq_0 = 1 - rel_freq_1 + # Multiply the frequencies to get the gini index + gini_index = rel_freq_1 * rel_freq_0 + return gini_index + + +# array=np.array([1,0,1,1,1,0,0,1,1,0,1]) +# print(impurity(array)) +# Should give 0.23.... + + +def bestsplit(x, y) -> int: + """ + x = vector of num values + y = vector of class labels ... array([{x: x is 0 or 1}]) ?? + + Consider splits of type "x <= c" where "c" is the average of two consecutive + values of x in the sorted order. + + x and y must be of the same length + + y[i] must be the class label of the i-th observation, and x[i] is the + correspnding value of attribute x + + Example (best split on income): + + >>> bestsplit(credit_data[:,3],credit_data[:,5]) + 36 + """ + # Sort all unique attribute values + num_attr_sorted = np.sort(np.unique(x)) + + # Use python vector addition to add all corresponding consecutive column + # elements and take their average + consec_avg_attr_splitpoints = (num_attr_sorted[0:7] + + num_attr_sorted[1:8]) / 2 + + # Convert array to list + split_points = list(consec_avg_attr_splitpoints) + + # Prepare the constants for the delta impurity equation + impurity_parent_node = impurity(y) + n_obs_parent_node = len(y) + + # Init return list + split_points_delta_impurities = [] + while split_points: + # compute child nodes class vectors for the split value + split_point = split_points.pop() + child_node = {"l": y[x > split_point], "r": y[x <= split_point]} + + # Take the weighted average of child node impurities + w_avg_child_impurities = ( + impurity(child_node["l"]) * len(child_node["l"]) + impurity( + child_node["r"]) * len(child_node["r"])) / n_obs_parent_node + + # Add the used split point and delta impurity to the return list + split_points_delta_impurities += [ + (split_point, impurity_parent_node - w_avg_child_impurities) + ] + + # Take the maximum of the list, and unpack + best_split, best_delta_impurity = max(split_points_delta_impurities, + key=lambda x: x[1]) + return best_split + + +# print(bestsplit(credit_data[:,3],credit_data[:,5])) +# Should give 36 diff --git a/getting_started.py b/getting_started.py deleted file mode 100644 index 34d042d..0000000 --- a/getting_started.py +++ /dev/null @@ -1,112 +0,0 @@ -import numpy as np - -credit_data = np.genfromtxt('./credit_score.txt', - delimiter=',', - skip_header=True) - -# "credit_data" is now a 2d NumPy array. Each rows represent a record and the -# columns represent the data attributes. -# [(22, 0, 0, 28, 1, 0) -# (46, 0, 1, 32, 0, 0) -# (24, 1, 1, 24, 1, 0) -# (25, 0, 0, 27, 1, 0) -# (29, 1, 1, 32, 0, 0) -# (45, 1, 1, 30, 0, 1) -# (63, 1, 1, 58, 1, 1) -# (36, 1, 0, 52, 1, 1) -# (23, 0, 1, 40, 0, 1) -# (50, 1, 1, 28, 0, 1)] - -def impurity(array) -> None: - """ - @todo: Docstring for - """ - # Assumes array is a 1 dimensional array, the slice is actually arbitrary i think - n_observations = len(array[0:]) - # print('Total observations is the cardinality of the vector containing all class labels:', n_observations) - - n_labels_1 = array[0:].sum() - # print('Since we are working with binary class labels the amount of "1" labels equals the sum of the class label vector:', n_labels_1) - - # Calculate the relative frequency of label 1 with respect to the total sample size - rel_freq_1 = n_labels_1 / n_observations - - # Use the symmetry property to also calculate the relative frequency of zeroes - rel_freq_0 = 1 - rel_freq_1 - # print('\nThe rel. freq. of 1: ', rel_freq_1) - # print('\nThe rel. freq. of 0: ', rel_freq_0) - gini_index = rel_freq_1 * rel_freq_0 - # print('\nThe gini index: ', gini_index) - # pass - return gini_index - - -# impurity(test_array) - -# x = vector of num values -# y = vector of class labels ... array([0,1]) ?? -# -# x and y must be of the same length -# -# y[i] must be the class label of the i-th observation, and x[i] is the -# correspnding value of attribute x -# -# Consider splits of type "x <= c" where "c" is the average of two consecutive -# values of x in the sorted order. -# -# So one child contains all elements with -# "x <= c" and the other child contains all elements with "x > c". This should -# be considered depending on the modality and skew of the attribute value -# distribution I think, in an undesirable edge case you might for example -# consider a child split without observations in it. Here we prevent this -# putting the condition that the split value has to be in the middle of two -# attribute values, meaning that there is at least one observation in each -# child node. -# -# We are given already the class labels from the credit_data array -y = credit_data[:, 5] -# print(y) -# And in the example the splits are done based on the income -# -# Now we can choose some attribute from the array to make a split on. -x = credit_data[:, 3] - - -def bestsplit(x, y) -> None: - """ - @todo: Docstring for bestsplit - """ - # Make it unique since we don't want two the same split points - num_attr_sorted = np.sort(np.unique(x)) - # print(num_attr_sorted) - # print(type(num_attr_sorted)) - - # Use python vector addition to add all corresponding elements and take - # their average - consec_avg_attr_splitpoints = (num_attr_sorted[0:7] + - num_attr_sorted[1:8]) / 2 - - split_points = list(consec_avg_attr_splitpoints) - # print(consec_avg_attr_splitpoints) - # print(type(consec_avg_attr_splitpoints)) - - impurity_parent_node = impurity(y) - n_obs_parent_node = len(y) - split_points_delta_impurities = [] - while split_points: - split_point = split_points.pop() - # print(split_points) - # print('Popped:', split_point) - child_node = {"l": y[x > split_point], "r": y[x <= split_point]} - w_avg_child_impurities = ( - impurity(child_node["l"]) * len(child_node["l"]) + impurity( - child_node["r"]) * len(child_node["r"])) / n_obs_parent_node - split_points_delta_impurities += [(split_point, - impurity_parent_node - w_avg_child_impurities)] - - # print(split_points_delta_impurities) - best_split, best_delta_impurity = max(split_points_delta_impurities, key=lambda x: x[1]) - print(f"{best_split=}, {best_delta_impurity=}") - # print('reached the end') - -bestsplit(x,y) -- cgit v1.2.3