import numpy as np

credit_data = np.genfromtxt('./credit_score.txt', delimiter=',', skip_header=1)

# "credit_data" is now a 2d NumPy array. Each row represents a record and the
# columns represent the data attributes.
# [(22, 0, 0, 28, 1, 0)
#  (46, 0, 1, 32, 0, 0)
#  (24, 1, 1, 24, 1, 0)
#  (25, 0, 0, 27, 1, 0)
#  (29, 1, 1, 32, 0, 0)
#  (45, 1, 1, 30, 0, 1)
#  (63, 1, 1, 58, 1, 1)
#  (36, 1, 0, 52, 1, 1)
#  (23, 0, 1, 40, 0, 1)
#  (50, 1, 1, 28, 0, 1)]


class Tree:
    """
    Container for the data structure describing a grown tree.

    @todo: extend docstring for Tree
    """

    def __init__(self, tr_d_structure):
        """
        @todo: Docstring for init method.

        /tr_d_structure/ @todo
        """
        self.tr_d_structure = tr_d_structure

    def __repr__(self):
        return str(self.tr_d_structure)


tree_grow_defaults = {
    'x': credit_data[:, :5],
    'y': credit_data[:, 5],
    'n_min': 0,
    'min_leaf': 0,
    'n_feat': 5
}


def tree_grow(x=None, y=None, n_min=None, min_leaf=None, n_feat=None,
              **defaults) -> Tree:
    """
    @todo: Docstring for tree_grow
    """
    print(
        "All attribute columns in credit data (to be exhaustively searched per split):\n",
        x, "\n")
    print("Class label column in credit data:\n", y, "\n")
    print(
        f'The current node becomes a leaf node if (( (number of observations in the node) < {n_min=} ))\n'
    )
    print(
        f'A split is not performed if (( (number of observations in a resulting child node) < {min_leaf=} ))\n'
    )
    print(
        f"Number of features/attributes randomly drawn from {x=} to be considered for a split; "
        f"should only be lower than {len(x[0, :])=} when growing a random forest, {n_feat=}"
    )
    tr_d_structure = {}
    return Tree(tr_d_structure)


# Calling the function, unpacking the defaults as arguments
# print(tree_grow(**tree_grow_defaults))

tree_pred_defaults = {
    'x': credit_data[:, :5],
    'tr': tree_grow(**tree_grow_defaults)
}


def tree_pred(x=None, tr=None, **defaults) -> np.ndarray:
    """
    @todo: Docstring for tree_pred
    """
    print("\n\n#########Tree_pred output start:\n")
    print(f"Drop each row of {x=} down the tree {tr!r}")


tree_pred(**tree_pred_defaults)


# Put all helper functions below this comment!
def impurity(array) -> float:
    """
    Assumes the argument array is a one-dimensional vector of zeroes and ones.
    Computes the gini-index impurity based on the relative frequency of ones
    in the vector.

    Example:
    >>> array = np.array([1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1])
    >>> array
    array([1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1])
    >>> impurity(array)
    0.23140495867768596
    """
    # Total number of labels
    n_labels = len(array)
    # Number of observations labeled 1
    n_labels_1 = array.sum()
    # Relative frequency of ones with respect to the total number of labels
    rel_freq_1 = n_labels_1 / n_labels
    # The relative frequencies of ones and zeroes sum to one
    rel_freq_0 = 1 - rel_freq_1
    # Multiply the frequencies to get the gini index
    gini_index = rel_freq_1 * rel_freq_0
    return gini_index


# array = np.array([1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1])
# print(impurity(array))  # Should give 0.23...
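# Worked version of the docstring example above (illustration only): the
# example vector contains 7 ones out of 11 labels, so the relative frequencies
# are 7/11 and 4/11, and the gini index is (7/11) * (4/11) = 28/121 ≈ 0.2314,
# the value impurity() returns.
# print(impurity(np.array([1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1])), (7 / 11) * (4 / 11))  # Both ≈ 0.2314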
def bestsplit(x, y) -> float:
    """
    x = vector of numeric attribute values
    y = vector of class labels (zeroes and ones)

    Considers splits of the type "x <= c", where "c" is the average of two
    consecutive values of x in the sorted order.

    x and y must be of the same length; y[i] must be the class label of the
    i-th observation, and x[i] is the corresponding value of attribute x.

    Example (best split on income):
    >>> bestsplit(credit_data[:, 3], credit_data[:, 5])
    36.0
    """
    # Sort all unique attribute values
    num_attr_sorted = np.sort(np.unique(x))
    # Average each pair of consecutive values to obtain the candidate split points
    consec_avg_attr_splitpoints = (num_attr_sorted[:-1] + num_attr_sorted[1:]) / 2
    # Convert the array to a list
    split_points = list(consec_avg_attr_splitpoints)
    # Prepare the constants for the delta-impurity equation
    impurity_parent_node = impurity(y)
    n_obs_parent_node = len(y)
    # Init return list
    split_points_delta_impurities = []
    for split_point in split_points:
        # Compute the child-node class vectors for this split value
        child_node = {"l": y[x <= split_point], "r": y[x > split_point]}
        # Take the weighted average of the child-node impurities
        w_avg_child_impurities = (
            impurity(child_node["l"]) * len(child_node["l"]) +
            impurity(child_node["r"]) * len(child_node["r"])) / n_obs_parent_node
        # Add the split point and its impurity reduction to the return list
        split_points_delta_impurities += [
            (split_point, impurity_parent_node - w_avg_child_impurities)
        ]
    # Take the split point with the largest impurity reduction, and unpack
    best_split, best_delta_impurity = max(split_points_delta_impurities,
                                          key=lambda pair: pair[1])
    return best_split


# print(bestsplit(credit_data[:, 3], credit_data[:, 5]))  # Should give 36
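# The tree_grow stub above still has to fill in tr_d_structure. Below is a
# minimal sketch (an assumption for illustration only, not the assignment's
# implementation) of how impurity() and bestsplit() could be combined to grow
# nodes recursively. The nested-dict node layout and the name grow_node_sketch()
# are hypothetical; for simplicity it always splits on the income column
# (index 3), whereas a full version would search all columns (or n_feat
# randomly drawn ones) and compare their impurity reductions.
def grow_node_sketch(x, y, n_min, min_leaf):
    node = {"n_obs": len(y), "majority": int(y.mean() >= 0.5)}
    # Stop if the node is too small, already pure, or has no candidate split points
    if len(y) < n_min or impurity(y) == 0 or len(np.unique(x[:, 3])) < 2:
        return node
    split_value = bestsplit(x[:, 3], y)
    left, right = x[:, 3] <= split_value, x[:, 3] > split_value
    # Respect the min_leaf constraint before committing to the split
    if left.sum() < min_leaf or right.sum() < min_leaf:
        return node
    node.update({
        "split_col": 3,
        "split_value": split_value,
        "l": grow_node_sketch(x[left], y[left], n_min, min_leaf),
        "r": grow_node_sketch(x[right], y[right], n_min, min_leaf),
    })
    return node


# print(Tree(grow_node_sketch(credit_data[:, :5], credit_data[:, 5], n_min=2, min_leaf=1)))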