1 files changed, 0 insertions, 112 deletions
diff --git a/getting_started.py b/getting_started.py
deleted file mode 100644
index 34d042d..0000000
--- a/getting_started.py
+++ /dev/null
@@ -1,112 +0,0 @@
-import numpy as np
-
-credit_data = np.genfromtxt('./credit_score.txt',
-                            delimiter=',',
-                            skip_header=1)  # a line count: skip one header row
-
-# "credit_data" is now a 2d NumPy array. Each row represents a record and
-# the columns represent the data attributes.
-# [(22, 0, 0, 28, 1, 0)
-#  (46, 0, 1, 32, 0, 0)
-#  (24, 1, 1, 24, 1, 0)
-#  (25, 0, 0, 27, 1, 0)
-#  (29, 1, 1, 32, 0, 0)
-#  (45, 1, 1, 30, 0, 1)
-#  (63, 1, 1, 58, 1, 1)
-#  (36, 1, 0, 52, 1, 1)
-#  (23, 0, 1, 40, 0, 1)
-#  (50, 1, 1, 28, 0, 1)]
-
-def impurity(array) -> float:
-    """Return the Gini index p1 * (1 - p1) of a binary class-label vector.
-
-    Args:
-        array: 1-d sequence of class labels coded as 0/1.
-
-    Returns:
-        The product of the relative frequencies of the labels 1 and 0, or
-        0.0 for an empty vector instead of dividing by zero.
-    """
-    labels = np.asarray(array)
-    n_observations = labels.size
-    if n_observations == 0:
-        # An empty node holds no observations, so treat it as pure.
-        return 0.0
-
-    # With 0/1 labels the number of "1" labels equals the sum of the vector.
-    rel_freq_1 = labels.sum() / n_observations
-
-    # The relative frequencies sum to one, which gives the share of zeroes.
-    rel_freq_0 = 1 - rel_freq_1
-    return rel_freq_1 * rel_freq_0
-
-
-# impurity(test_array)
-
-# x = vector of num values
-# y = vector of class labels ... array([0,1]) ??
-#
-# x and y must be of the same length
-#
-# y[i] must be the class label of the i-th observation, and x[i] is the
-# corresponding value of attribute x
-#
-# Consider splits of type "x <= c" where "c" is the average of two consecutive
-# values of x in the sorted order.
-#
-# So one child contains all elements with "x <= c" and the other child
-# contains all elements with "x > c". Whether this is a sensible choice
-# depends, I think, on the modality and skew of the attribute value
-# distribution; in an undesirable edge case a split could leave a child
-# without any observations. Here we prevent this by requiring that the
-# split value lies midway between two consecutive distinct attribute
-# values, which guarantees at least one observation in each child
-# node.
-#
-# The class labels are already given: column index 5 of credit_data.
-y = credit_data[:, 5]
-# In the worked example the splits are made on income, so the attribute
-# taken here is column index 3 (presumably the income column).
-#
-# Any other attribute column of the array could be chosen to split on.
-x = credit_data[:, 3]
-
-
-def bestsplit(x, y) -> tuple:
-    """Find the split "x <= c" that gives the largest impurity reduction.
-
-    Candidate values for c are the midpoints between consecutive distinct
-    values of x. For each candidate the reduction is the impurity of y minus
-    the size-weighted average impurity of the two children.
-
-    Args:
-        x: 1-d numeric attribute values; x[i] belongs to observation i.
-        y: 1-d binary (0/1) class labels of the same length as x.
-
-    Returns:
-        The pair (best_split, best_delta_impurity), which is also printed.
-
-    Raises:
-        ValueError: if x has fewer than two distinct values (no split exists).
-    """
-    x, y = np.asarray(x), np.asarray(y)
-    # np.unique already returns the distinct values in sorted order.
-    distinct = np.unique(x)
-    if distinct.size < 2:
-        raise ValueError("bestsplit needs at least two distinct values in x")
-    # Midpoints of all neighbouring pairs; fixed slices such as [0:7]/[1:8]
-    # would only be right for exactly eight distinct values.
-    split_points = (distinct[:-1] + distinct[1:]) / 2
-    impurity_parent_node, n_obs = impurity(y), len(y)
-    candidates = []
-    # Visit the largest candidate first so that ties resolve to it.
-    for split_point in split_points[::-1]:
-        above, below = y[x > split_point], y[x <= split_point]
-        w_avg = (impurity(above) * len(above) +
-                 impurity(below) * len(below)) / n_obs
-        candidates.append((split_point, impurity_parent_node - w_avg))
-    best_split, best_delta_impurity = max(candidates, key=lambda pair: pair[1])
-    print(f"{best_split=}, {best_delta_impurity=}")
-    return best_split, best_delta_impurity
-
-bestsplit(x, y)  # search the chosen attribute for its best split and print it