diff options
Diffstat (limited to 'getting_started.py')
| -rw-r--r-- | getting_started.py | 112 |
1 files changed, 0 insertions, 112 deletions
diff --git a/getting_started.py b/getting_started.py deleted file mode 100644 index 34d042d..0000000 --- a/getting_started.py +++ /dev/null @@ -1,112 +0,0 @@ -import numpy as np - -credit_data = np.genfromtxt('./credit_score.txt', - delimiter=',', - skip_header=True) - -# "credit_data" is now a 2d NumPy array. Each rows represent a record and the -# columns represent the data attributes. -# [(22, 0, 0, 28, 1, 0) -# (46, 0, 1, 32, 0, 0) -# (24, 1, 1, 24, 1, 0) -# (25, 0, 0, 27, 1, 0) -# (29, 1, 1, 32, 0, 0) -# (45, 1, 1, 30, 0, 1) -# (63, 1, 1, 58, 1, 1) -# (36, 1, 0, 52, 1, 1) -# (23, 0, 1, 40, 0, 1) -# (50, 1, 1, 28, 0, 1)] - -def impurity(array) -> None: - """ - @todo: Docstring for - """ - # Assumes array is a 1 dimensional array, the slice is actually arbitrary i think - n_observations = len(array[0:]) - # print('Total observations is the cardinality of the vector containing all class labels:', n_observations) - - n_labels_1 = array[0:].sum() - # print('Since we are working with binary class labels the amount of "1" labels equals the sum of the class label vector:', n_labels_1) - - # Calculate the relative frequency of label 1 with respect to the total sample size - rel_freq_1 = n_labels_1 / n_observations - - # Use the symmetry property to also calculate the relative frequency of zeroes - rel_freq_0 = 1 - rel_freq_1 - # print('\nThe rel. freq. of 1: ', rel_freq_1) - # print('\nThe rel. freq. of 0: ', rel_freq_0) - gini_index = rel_freq_1 * rel_freq_0 - # print('\nThe gini index: ', gini_index) - # pass - return gini_index - - -# impurity(test_array) - -# x = vector of num values -# y = vector of class labels ... array([0,1]) ?? -# -# x and y must be of the same length -# -# y[i] must be the class label of the i-th observation, and x[i] is the -# correspnding value of attribute x -# -# Consider splits of type "x <= c" where "c" is the average of two consecutive -# values of x in the sorted order. -# -# So one child contains all elements with -# "x <= c" and the other child contains all elements with "x > c". This should -# be considered depending on the modality and skew of the attribute value -# distribution I think, in an undesirable edge case you might for example -# consider a child split without observations in it. Here we prevent this -# putting the condition that the split value has to be in the middle of two -# attribute values, meaning that there is at least one observation in each -# child node. -# -# We are given already the class labels from the credit_data array -y = credit_data[:, 5] -# print(y) -# And in the example the splits are done based on the income -# -# Now we can choose some attribute from the array to make a split on. -x = credit_data[:, 3] - - -def bestsplit(x, y) -> None: - """ - @todo: Docstring for bestsplit - """ - # Make it unique since we don't want two the same split points - num_attr_sorted = np.sort(np.unique(x)) - # print(num_attr_sorted) - # print(type(num_attr_sorted)) - - # Use python vector addition to add all corresponding elements and take - # their average - consec_avg_attr_splitpoints = (num_attr_sorted[0:7] + - num_attr_sorted[1:8]) / 2 - - split_points = list(consec_avg_attr_splitpoints) - # print(consec_avg_attr_splitpoints) - # print(type(consec_avg_attr_splitpoints)) - - impurity_parent_node = impurity(y) - n_obs_parent_node = len(y) - split_points_delta_impurities = [] - while split_points: - split_point = split_points.pop() - # print(split_points) - # print('Popped:', split_point) - child_node = {"l": y[x > split_point], "r": y[x <= split_point]} - w_avg_child_impurities = ( - impurity(child_node["l"]) * len(child_node["l"]) + impurity( - child_node["r"]) * len(child_node["r"])) / n_obs_parent_node - split_points_delta_impurities += [(split_point, - impurity_parent_node - w_avg_child_impurities)] - - # print(split_points_delta_impurities) - best_split, best_delta_impurity = max(split_points_delta_impurities, key=lambda x: x[1]) - print(f"{best_split=}, {best_delta_impurity=}") - # print('reached the end') - -bestsplit(x,y) |
