summaryrefslogtreecommitdiff
path: root/assignment1.py
blob: 9acd43eeddc57b02ac1c2008e350e3032250692a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import numpy as np

# Load the credit-score data set; skip_header takes an int (number of
# header lines to skip), not a bool — True only worked because True == 1.
credit_data = np.genfromtxt('./credit_score.txt',
                            delimiter=',',
                            skip_header=1)

# "credit_data" is now a 2d NumPy array. Each row represents a record and
# the columns represent the data attributes.
# [(22, 0, 0, 28, 1, 0)
#  (46, 0, 1, 32, 0, 0)
#  (24, 1, 1, 24, 1, 0)
#  (25, 0, 0, 27, 1, 0)
#  (29, 1, 1, 32, 0, 0)
#  (45, 1, 1, 30, 0, 1)
#  (63, 1, 1, 58, 1, 1)
#  (36, 1, 0, 52, 1, 1)
#  (23, 0, 1, 40, 0, 1)
#  (50, 1, 1, 28, 0, 1)]


class Tree():
    """
    Minimal wrapper around the data structure produced by ``tree_grow``.

    Holds the grown classification tree so it can be passed to
    ``tree_pred`` and printed for inspection.
    """
    def __init__(self, tr_d_structure):
        """Store the underlying tree representation.

        tr_d_structure -- the (currently dict-based) representation of
        the grown tree; an empty dict for a not-yet-implemented tree.
        """
        self.tr_d_structure = tr_d_structure

    def __repr__(self):
        # Delegate to the wrapped structure so printing a Tree shows
        # its contents directly.
        return str(self.tr_d_structure)


# Default arguments for tree_grow: the first 5 columns of the credit
# data are the attribute/feature columns, the 6th is the 0/1 class
# label. n_min = min_leaf = 0 imposes no size constraints; n_feat = 5
# considers every feature at each split (plain tree, not random forest).
tree_grow_defaults = {
    'x': credit_data[:, :5],
    'y': credit_data[:, 5],
    'n_min': 0,
    'min_leaf': 0,
    'n_feat': 5
}


def tree_grow(x=None,
              y=None,
              n_min=None,
              min_leaf=None,
              n_feat=None,
              **defaults) -> Tree:
    """Grow a classification tree (stub).

    x        -- 2d array of attribute columns, one row per observation
    y        -- 1d array of 0/1 class labels, aligned with the rows of x
    n_min    -- a node with fewer than n_min observations becomes a leaf
    min_leaf -- a split is rejected when it would create a child node
                with fewer than min_leaf observations
    n_feat   -- number of features randomly drawn as split candidates;
                only lower than the number of columns of x when growing
                a random forest
    defaults -- absorbs any extra keyword arguments (unused)

    Currently only prints its inputs and returns a Tree wrapping an
    empty dict; the actual growing algorithm is still TODO.
    """
    print(
        "All attribute columns in credit data (to be exhaustively searched per split):\n",
        x, "\n")
    print("Class label column in credit data:\n", y, "\n")
    print(
        f'Current node will be leaf node if (( (number of data "tuples" in child node) < {n_min=} )) \n'
    )
    print(
        f'Split will not happen if (( (number of data "tuples" potential split) < {min_leaf=} ))\n'
    )
    print(
        f"Number of features/attributes to be randomly drawn from {x=} to be considered for a split, should only be lower than {len(x[0,:])=} for random forest growing, {n_feat=}"
    )
    # TODO: implement the actual tree-growing algorithm; for now the
    # returned Tree wraps an empty structure.
    tr_d_structure = {}
    return Tree(tr_d_structure)


# Calling the function, unpacking default as argument
# print(tree_grow(**tree_grow_defaults))

# Default arguments for tree_pred: predict for the same 5 attribute
# columns the tree was grown on. NOTE(review): tree_grow executes here
# at module load, so its diagnostic prints run as a side effect of
# building this dict.
tree_pred_defaults = {
    'x': credit_data[:, :5],
    'tr': tree_grow(**tree_grow_defaults)
}


def tree_pred(x=None, tr=None, **defaults) -> np.ndarray:
    """Predict class labels for the rows of x using tree tr (stub).

    x        -- 2d array of attribute columns, one row per observation
    tr       -- a Tree instance produced by tree_grow
    defaults -- absorbs any extra keyword arguments (unused)

    Annotated to return an np.ndarray of predicted labels, but the
    prediction logic is still TODO: currently it only prints its inputs
    and implicitly returns None. (Original annotation was ``np.array``,
    which is a function, not a type; ``np.ndarray`` is the type.)
    """
    print("\n\n#########Tree_pred output start:\n")
    # repr(tr) is the idiomatic spelling of tr.__repr__()
    print(f"Drop a row in {x=} down the tree {repr(tr)}")


# Run the (stub) prediction once with the default arguments.
tree_pred(**tree_pred_defaults)

#
#
# Put all helper functions below this comment!

def impurity(array) -> float:
    """
    Compute the Gini-index impurity of a 1d vector of 0/1 class labels.

    The Gini index of a binary label vector is p * (1 - p), where p is
    the relative frequency of ones. An empty vector is defined to have
    impurity 0.0 (a trivially pure node); the original raised
    ZeroDivisionError in that case.

    Example:

    >>> array = np.array([1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1])
    >>> impurity(array)
    0.23140495867768596
    """
    # Total number of labels
    n_labels = len(array)
    # Guard: an empty node is pure by definition (avoids division by zero)
    if n_labels == 0:
        return 0.0
    # Relative frequency of ones; labels are 0/1, so the sum counts them
    rel_freq_1 = array.sum() / n_labels
    # The two class frequencies are complementary
    rel_freq_0 = 1 - rel_freq_1
    # Gini index: product of the two class frequencies
    gini_index = rel_freq_1 * rel_freq_0
    return gini_index


# array=np.array([1,0,1,1,1,0,0,1,1,0,1])
# print(impurity(array))
# Should give 0.23....


def bestsplit(x, y) -> float:
    """
    Find the best split value on numeric attribute x for class labels y.

    Candidate splits are of the form "x <= c", where c is the average of
    two consecutive distinct values of x in sorted order. The candidate
    with the largest Gini impurity reduction is returned.

    x -- 1d vector of numeric attribute values
    y -- 1d vector of 0/1 class labels; y[i] is the class label of the
         observation whose attribute value is x[i] (same length as x)

    Returns the best split value c as a float.

    Example (best split on income):

    >>> bestsplit(credit_data[:, 3], credit_data[:, 5])
    36.0
    """
    # Sort all unique attribute values
    num_attr_sorted = np.sort(np.unique(x))

    # Midpoints between every pair of consecutive distinct values.
    # (The original hard-coded slices [0:7]/[1:8] only worked when the
    # attribute had exactly 8 distinct values; [:-1]/[1:] handles any.)
    split_points = list((num_attr_sorted[:-1] + num_attr_sorted[1:]) / 2)

    # Constants of the impurity-reduction formula
    impurity_parent_node = impurity(y)
    n_obs_parent_node = len(y)

    split_points_delta_impurities = []
    # Process largest candidate first, matching the original pop() order
    # so tie-breaking between equal reductions is unchanged.
    for split_point in reversed(split_points):
        # Class-label vectors of the two child nodes for this split
        child_node = {"l": y[x > split_point], "r": y[x <= split_point]}

        # Weighted average of the child-node impurities
        w_avg_child_impurities = (
            impurity(child_node["l"]) * len(child_node["l"]) + impurity(
                child_node["r"]) * len(child_node["r"])) / n_obs_parent_node

        # Record the split point together with its impurity reduction
        split_points_delta_impurities.append(
            (split_point, impurity_parent_node - w_avg_child_impurities))

    # The split with the largest impurity reduction wins
    best_split, _best_delta = max(split_points_delta_impurities,
                                  key=lambda pair: pair[1])
    return best_split


# print(bestsplit(credit_data[:,3],credit_data[:,5]))
# Should give 36