diff options
Diffstat (limited to 'mike_getting_started.py')
| -rw-r--r-- | mike_getting_started.py | 158 |
1 files changed, 92 insertions, 66 deletions
import numpy as np

# Load the credit-scoring records from a comma-separated text file in the
# current working directory. ``skip_header`` is the *number* of leading lines
# to skip, so pass the integer 1 (one header line) rather than a boolean.
credit_data = np.genfromtxt('./credit_score.txt',
                            delimiter=',',
                            skip_header=1)

# "credit_data" is now a 2d NumPy array. Each row represents a record and the
# columns represent the data attributes.
# [(22, 0, 0, 28, 1, 0)
#  (46, 0, 1, 32, 0, 0)
#  (24, 1, 1, 24, 1, 0)
#  (25, 0, 0, 27, 1, 0)
#  (29, 1, 1, 32, 0, 0)
#  (45, 1, 1, 30, 0, 1)
#  (63, 1, 1, 58, 1, 1)
#  (36, 1, 0, 52, 1, 1)
#  (23, 0, 1, 40, 0, 1)
#  (50, 1, 1, 28, 0, 1)]

def impurity(array) -> float:
    """Return the Gini impurity of a vector of binary class labels.

    With labels coded as 0/1, the number of ones equals the sum of the
    vector, so the relative frequency of class 1 is ``sum / n`` and the
    impurity is ``p1 * (1 - p1)``.

    Args:
        array: 1-d sequence or NumPy array of 0/1 class labels.

    Returns:
        The Gini index ``p1 * p0``: 0 for a pure node, 0.25 for an even mix.
        An empty vector is treated as pure and yields ``0.0``.
    """
    labels = np.asarray(array)
    n_observations = len(labels)
    if n_observations == 0:
        # Avoid 0/0 (NaN plus a RuntimeWarning) for a node with no records.
        return 0.0

    # Binary labels: the count of "1" labels is simply the sum of the vector.
    n_labels_1 = labels.sum()

    # Relative frequency of label 1 with respect to the total sample size.
    rel_freq_1 = n_labels_1 / n_observations

    # The two frequencies sum to one, so the share of zeroes follows directly.
    rel_freq_0 = 1 - rel_freq_1

    return rel_freq_1 * rel_freq_0


# Example: impurity(np.array([1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1]))

# x = vector of num values
# y = vector of class labels ... array([0,1]) ??
#
# x and y must be of the same length
#
# y[i] must be the class label of the i-th observation, and x[i] is the
# corresponding value of attribute x
#
# Consider splits of type "x <= c" where "c" is the average of two consecutive
# values of x in the sorted order.
#
# So one child contains all elements with
# "x <= c" and the other child contains all elements with "x > c".
# This should be considered depending on the modality and skew of the
# attribute value distribution I think; in an undesirable edge case you might
# for example consider a child split without observations in it. Here we
# prevent this by requiring the split value to lie in the middle of two
# attribute values, meaning that there is at least one observation in each
# child node.
#
# We are given already the class labels from the credit_data array
y = credit_data[:, 5]
# print(y)
# And in the example the splits are done based on the income
#
# Now we can choose some attribute from the array to make a split on.
x = credit_data[:, 3]


def bestsplit(x, y) -> "tuple[float, float] | None":
    """Find the threshold on ``x`` with the largest Gini impurity reduction.

    Candidate thresholds ``c`` are the midpoints between consecutive distinct
    values of ``x``. A split sends observations with ``x <= c`` to one child
    and those with ``x > c`` to the other, so neither child is ever empty.
    The quality of a split is the parent impurity minus the size-weighted
    average impurity of the two children. The best split is printed and
    also returned.

    Args:
        x: 1-d numeric attribute values, one per observation.
        y: 1-d binary (0/1) class labels of the same length; ``y[i]`` is the
            label of the observation whose attribute value is ``x[i]``.

    Returns:
        ``(best_split, best_delta_impurity)``, or ``None`` (nothing printed)
        when ``x`` has fewer than two distinct values and cannot be split.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # np.unique returns the distinct values already sorted; duplicates are
    # dropped because they would only produce repeated split points.
    num_attr_sorted = np.unique(x)
    if len(num_attr_sorted) < 2:
        return None

    # Midpoint of every consecutive pair, for any number of distinct values
    # (the slices used to be hard-coded for exactly 8 of them).
    split_points = (num_attr_sorted[:-1] + num_attr_sorted[1:]) / 2

    impurity_parent_node = impurity(y)
    n_obs_parent_node = len(y)

    split_points_delta_impurities = []
    # Walk from the largest threshold down: max() keeps the first maximum it
    # sees, so ties keep resolving to the largest split point as before.
    for split_point in split_points[::-1]:
        left = y[x <= split_point]
        right = y[x > split_point]
        w_avg_child_impurities = (
            impurity(left) * len(left) + impurity(right) * len(right)
        ) / n_obs_parent_node
        split_points_delta_impurities.append(
            (split_point, impurity_parent_node - w_avg_child_impurities))

    best_split, best_delta_impurity = max(split_points_delta_impurities,
                                          key=lambda pair: pair[1])
    print(f"{best_split=}, {best_delta_impurity=}")
    return best_split, best_delta_impurity


bestsplit(x, y)
