1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
|
import numpy as np
credit_data = np.genfromtxt('./credit_score.txt',
delimiter=',',
skip_header=True)
# "credit_data" is now a 2d NumPy array. Each rows represent a record and the
# columns represent the data attributes.
# [(22, 0, 0, 28, 1, 0)
# (46, 0, 1, 32, 0, 0)
# (24, 1, 1, 24, 1, 0)
# (25, 0, 0, 27, 1, 0)
# (29, 1, 1, 32, 0, 0)
# (45, 1, 1, 30, 0, 1)
# (63, 1, 1, 58, 1, 1)
# (36, 1, 0, 52, 1, 1)
# (23, 0, 1, 40, 0, 1)
# (50, 1, 1, 28, 0, 1)]
def impurity(array) -> None:
"""
@todo: Docstring for
"""
# Assumes array is a 1 dimensional array, the slice is actually arbitrary i think
n_observations = len(array[0:])
# print('Total observations is the cardinality of the vector containing all class labels:', n_observations)
n_labels_1 = array[0:].sum()
# print('Since we are working with binary class labels the amount of "1" labels equals the sum of the class label vector:', n_labels_1)
# Calculate the relative frequency of label 1 with respect to the total sample size
rel_freq_1 = n_labels_1 / n_observations
# Use the symmetry property to also calculate the relative frequency of zeroes
rel_freq_0 = 1 - rel_freq_1
# print('\nThe rel. freq. of 1: ', rel_freq_1)
# print('\nThe rel. freq. of 0: ', rel_freq_0)
gini_index = rel_freq_1 * rel_freq_0
# print('\nThe gini index: ', gini_index)
# pass
return gini_index
# impurity(test_array)
# x = vector of num values
# y = vector of class labels ... array([0,1]) ??
#
# x and y must be of the same length
#
# y[i] must be the class label of the i-th observation, and x[i] is the
# correspnding value of attribute x
#
# Consider splits of type "x <= c" where "c" is the average of two consecutive
# values of x in the sorted order.
#
# So one child contains all elements with
# "x <= c" and the other child contains all elements with "x > c". This should
# be considered depending on the modality and skew of the attribute value
# distribution I think, in an undesirable edge case you might for example
# consider a child split without observations in it. Here we prevent this
# putting the condition that the split value has to be in the middle of two
# attribute values, meaning that there is at least one observation in each
# child node.
#
# We are given already the class labels from the credit_data array
y = credit_data[:, 5]
# print(y)
# And in the example the splits are done based on the income
#
# Now we can choose some attribute from the array to make a split on.
x = credit_data[:, 3]
def bestsplit(x, y) -> None:
"""
@todo: Docstring for bestsplit
"""
# Make it unique since we don't want two the same split points
num_attr_sorted = np.sort(np.unique(x))
# print(num_attr_sorted)
# print(type(num_attr_sorted))
# Use python vector addition to add all corresponding elements and take
# their average
consec_avg_attr_splitpoints = (num_attr_sorted[0:7] +
num_attr_sorted[1:8]) / 2
split_points = list(consec_avg_attr_splitpoints)
# print(consec_avg_attr_splitpoints)
# print(type(consec_avg_attr_splitpoints))
impurity_parent_node = impurity(y)
n_obs_parent_node = len(y)
split_points_delta_impurities = []
while split_points:
split_point = split_points.pop()
# print(split_points)
# print('Popped:', split_point)
child_node = {"l": y[x > split_point], "r": y[x <= split_point]}
w_avg_child_impurities = (
impurity(child_node["l"]) * len(child_node["l"]) + impurity(
child_node["r"]) * len(child_node["r"])) / n_obs_parent_node
split_points_delta_impurities += [(split_point,
impurity_parent_node - w_avg_child_impurities)]
# print(split_points_delta_impurities)
best_split, best_delta_impurity = max(split_points_delta_impurities, key=lambda x: x[1])
print(f"{best_split=}, {best_delta_impurity=}")
# print('reached the end')
bestsplit(x,y)
|