diff options
| -rw-r--r-- | ass1_vectorized.py | 369 | ||||
| -rw-r--r-- | assignment1.py | 274 |
2 files changed, 516 insertions, 127 deletions
diff --git a/ass1_vectorized.py b/ass1_vectorized.py new file mode 100644 index 0000000..08a4f8e --- /dev/null +++ b/ass1_vectorized.py @@ -0,0 +1,369 @@ +import time +import numpy as np + +credit_data = np.genfromtxt('./credit_score.txt', + delimiter=',', + skip_header=True) + +# age,married,house,income,gender,class +# [(22, 0, 0, 28, 1, 0) +# (46, 0, 1, 32, 0, 0) +# (24, 1, 1, 24, 1, 0) +# (25, 0, 0, 27, 1, 0) +# (29, 1, 1, 32, 0, 0) +# (45, 1, 1, 30, 0, 1) +# (63, 1, 1, 58, 1, 1) +# (36, 1, 0, 52, 1, 1) +# (23, 0, 1, 40, 0, 1) +# (50, 1, 1, 28, 0, 1)] + +# In the program data points are called rows + +# In the program categorical or numerical attributes are called cols for columns + +# The last column are the classes and will be called as classes in the program + + +class Tree(): + """ + @todo: docstring for Tree + """ + def __init__(self, tree_vec_of_tuples): + """@todo: Docstring for init method. + + /root_node_obj/ @todo + + """ + self.tree_vec_of_tuples = tree_vec_of_tuples + self.leaf_nodes = np.where(tree_vec_of_tuples[:,1] == -1) + self.classes = [(1, -1), (1, -1)] + + def drop(self, y): + """ + @todo: Docstring for drop + """ + return y * 100 + + def leaf(self, y): + """ + @todo: Docstring for drop + """ + return y + + + + + +def impurity(array) -> int: + """ + Assumes the argument array is a one dimensional vector of zeroes and ones. + Computes the gini index impurity based on the relative frequency of ones in + the vector. + + Example: + + >>> array=np.array([1,0,1,1,1,0,0,1,1,0,1]) + >>> array + array([1,0,1,1,1,0,0,1,1,0,1]) + + >>> impurity(array) + 0.23140495867768596 + """ + # Total labels + n_labels = len(array) + if n_labels == 0: + # Prevents division by zero, when the potential split does not have any rows + n_labels = 1 + # Number of tuples labeled 1 + n_labels_1 = array.sum() + # Calculate the relative frequency of ones with respect to the total labels + rel_freq_1 = n_labels_1 / n_labels + # Use the symmetry around the median property to also calculate the + # relative frequency of zeroes + rel_freq_0 = 1 - rel_freq_1 + # Multiply the frequencies to get the gini index + gini_index = rel_freq_1 * rel_freq_0 + return gini_index + + +def bestsplit(x, y) -> int: + """ + x = vector of single col + y = vector of classes (last col in x) + + Consider splits of type "x <= c" where "c" is the average of two consecutive + values of x in the sorted order. + + x and y must be of the same length + + y[i] must be the class label of the i-th observation, and x[i] is the + correspnding value of attribute x + + Example (best split on income): + + >>> bestsplit(credit_data[:,3],credit_data[:,5]) + 36 + """ + x_sorted = np.sort(np.unique(x)) + if len(x_sorted) <= 2: + # Allows splitting on categorical (0 or 1) cols + split_points = [0.5] + else: + # Take average between consecutive numerical rows in the x col + split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2 + + # De toepassing van bestsplit verdeelt de x col vector in tweeen, twee + # arrays van "x rows". Deze moeten we terug krijgen om de in de child nodes + # bestsplit toe te passen. + # + # Deze lus berekent de best split value, en op basis daarvan weten we welke + # twee "x rows" arrays we moeten returnen, en welke split value het beste + # was natuurlijk. + best_delta_i = None + for split in split_points: + # np.index_exp maakt een boolean vector die zegt welke elementen in de + # col van x hoger of lager zijn dan split + col_slice_boolean_matrices = { + "left": np.index_exp[x > split], + "right": np.index_exp[x <= split] + } + + # delta_i formule met de boolean vector van hierboven + delta_i = impurity( + y) - (len(y[col_slice_boolean_matrices["left"]]) * + impurity(y[col_slice_boolean_matrices["left"]]) + + len(y[col_slice_boolean_matrices["right"]]) * + impurity(y[col_slice_boolean_matrices["right"]])) / len(y) + + print(f"{split=}, {delta_i=}") + # + if best_delta_i is not None: + if delta_i > best_delta_i: + best_delta_i, best_split, best_col_slice_boolean_matrices = delta_i, split, col_slice_boolean_matrices + else: + best_delta_i, best_split, best_col_slice_boolean_matrices = delta_i, split, col_slice_boolean_matrices + return best_delta_i, best_split, best_col_slice_boolean_matrices + +def tree_example(x=None, tr=None, **defaults) -> None: + """ + @todo: Docstring for tree_example + """ + tree_vec = [] + print(tree_vec) + print(type(tree_vec)) + tree_vec.append((36,3)) + print(tree_vec) + tree_vec.append((0,3)) + print(tree_vec) + tree_vec.append((1, -1)) + print(tree_vec) + print(tree_vec[0]) + print(type(tree_vec[0])) + tree_vec = np.array(tree_vec) # , dtype=(int, 2)) + print(tree_vec) + print(type(tree_vec[0])) + + tree = Tree(tree_vec) + + # Let's show how to predict + # 1. maak een vector met root node voor elke row in x waarvoor je een class + # wil predicten. + y = np.ones(len(x), dtype=int) + print(y) + print(type(y)) + print(y.shape) + # 2. Herinner recurrence relatie van whatsapp, en pas hem toe met + # tree.drop(x) op nodes die geen leaf node zijn + # + # Returns indices where not leaf node + print(tree.leaf_nodes) + # print(tree.classes) + # leafs = np.searchsorted( + # print(leafs) + y = np.where(np.searchsorted(tree.leaf_nodes, y)) + + # y = y[:,0] + # print(y) + + + +# +# +# Put all helper functions above this comment! + + +def tree_grow(x=None, + y=None, + n_min=None, + min_leaf=None, + n_feat=None, + **defaults) -> Tree: + """ + @todo: Docstring for tree_grow + """ + # Voor de lus die onze tree growt instantieren we een list die tuples als + # elementen zal hebben, het grote voordeel is dat we op deze manier vector + # operaties meerdere parallele + # prediction drops kunnen doen. (Je kan bijvoorbeeld geen object methodes + # broadcasten als je een numpy array van node objecten hebt) + # + # De tuple moet uiteindelijk de informatie bevatten om voor een row in x + # een class te voorspellen. Hier hebben we voor nodig: + # + # 1. Het split value, voor lager of hoger test + # 2. Het col nummer waar de split bij hoort, anders weten we niet waar we op testen + # + # (split, col) + # + # De enige uitzondering hierop zijn leaf nodes. Om de tree data structure + # een numpy array te maken moeten dit ook tuples zijn. Dit lossen we op + # door een negatieve col aan te duiden. Dit zorgt ervoor dat de prediction + # functie hier eindigt. + # + # (class, negative_int: -1) + # + # Checkout tree example function for more info + tree_vec = [] + # De nodelist heeft in het begin alleen de alle rows van x, omdat alle rows + # altijd in de root in acht worden genomen. + # + # Dit representeren we met een boolean vector, met lengte het aantal rows in x en elementen True. + rows = np.full((1,len(x)), True) + nodelist = [rows] + + # tree_array = np.empty + # while nodelist: + # current_node = nodelist.pop() + # slices = current_node.value + # node_classes = y[slices] + # # print(node_classes) + + # # f'Current node will be leaf node if (( (number of data "tuples" in child node) < {n_min=} )) \n' + # # put stopping rules here before making a split + # if len(node_classes) < n_min: + # current_node.value = Leaf( + # np.argmax(np.bincount(node_classes.astype(int)))) + # print(f"leaf node has majority clas:\n{current_node.value.value=}") + # continue + + # if impurity(node_classes) > 0: + # # print( + # # f"Exhaustive split search says, new node will check these rows for potential spliterinos:\n{x[slices]}" + # # ) + + # # If we arrive here ever we are splitting + # # bestsplit(col, node_labels) -> + # # {"slices": list[int], "split": numpyfloat, "best_delta_i": numpyfloat} + + # # slices (list) used for knowing which rows (int) to consider in a node + # # best_split saved in current_node.value + # # best_delta_i used to find best split among x_columns + # best_dict = None + # for i, x_col in enumerate(x[slices].transpose()): + # print( + # "\nExhaustive split search says; \"Entering new column\":") + # col_split_dict = bestsplit(x_col, node_classes, slices) + + # if best_dict is not None: + # if col_split_dict["delta_i"] > best_dict["delta_i"]: + # best_dict = col_split_dict + # best_dict["col"] = i + # else: + # best_dict = col_split_dict + # best_dict["col"] = i + # print("\nThe best split for current node:", best_dict) + + # # Here we store the splitted data into Node objects + # current_node.value = best_dict["split"] + # current_node.col = best_dict["col"] + # # Split will not happen if (( (number of data "tuples" potential split) < {min_leaf=} ))\n' + # if min([len(x) for x in best_dict["slices"].values()]) < min_leaf: + # continue + # else: + # # Invert left and right because we want left to pop() first + # children = [ + # Node(value=best_dict["slices"]["right"]), + # Node(value=best_dict["slices"]["left"]) + # ] + # current_node.add_split(children[1], children[0]) + # nodelist += children + # else: + # current_node.value = Leaf( + # np.argmax(np.bincount(node_classes.astype(int)))) + # print( + # f"\n\nLEAF NODE has majority clas:\n{current_node.value.value=}" + # ) + # continue + # return tree + + +def predict(x, nodes) -> list: + """ + @todo: Docstring for predict + """ + # which row to drop + # print(x) + drop = 0 + while not set(nodes).issubset({0, 1}): + print(nodes) + # print(x[drop]) + if isinstance(nodes[drop].value, Leaf): + nodes[drop] = nodes[drop].value.value + drop += 1 + continue + + print(nodes[drop].value) + print(nodes[drop].col) + # print(nodes[drop].col) + if x[drop, nodes[drop].col] > nodes[drop].value: + nodes[drop] = nodes[drop].left + else: + nodes[drop] = nodes[drop].right + return np.array(nodes) + + +def tree_pred(x=None, tr=None, **defaults) -> np.array: + """ + @todo: Docstring for tree_pred + """ + nodes = [tr.tree] * len(x) + # y = np.linspace(0, len(x), 0) + # y = np.array(ele) + y = predict(x, nodes) + print(f"\n\nPredicted classes for {x=}\n\n are: {y=}") + return y + + +if __name__ == '__main__': + #### IMPURITY TEST + # array=np.array([1,0,1,1,1,0,0,1,1,0,1]) + # print(impurity(array)) + # Should give 0.23.... + + #### BESTSPLIT TEST + # print(bestsplit(credit_data[:, 3], credit_data[:, 5])) + # Should give 36 + + #### TREE_GROW TEST + tree_grow_defaults = { + 'x': credit_data[:, :5], + 'y': credit_data[:, 5], + 'n_min': 2, + 'min_leaf': 1, + 'n_feat': 5 + } + + # Calling the tree grow, unpacking default as argument + # tree_grow(**tree_grow_defaults) + + #### TREE_PRED TEST + tree_pred_defaults = { + 'x': credit_data[:, :5], + # 'tr': tree_grow(**tree_grow_defaults) + } + + tree_example(**tree_pred_defaults) + # tree_pred(**tree_pred_defaults) + +# start_time = time.time() +# print("--- %s seconds ---" % (time.time() - start_time)) diff --git a/assignment1.py b/assignment1.py index 64b865d..e0fb29b 100644 --- a/assignment1.py +++ b/assignment1.py @@ -17,40 +17,63 @@ credit_data = np.genfromtxt('./credit_score.txt', # (23, 0, 1, 40, 0, 1) # (50, 1, 1, 28, 0, 1)] +# In the program data points are called rows -class Node(): +# In the program categorical or numerical attributes are called cols for columns + +# The last column are the classes and will be called as classes in the program + + +class Node: """ - @todo: docstring for Node + The node object points to two other Node objects. """ - def __init__(self, value=None): - """@todo: Docstring for init method. + def __init__(self, split_value_or_rows=None, col=None): + """Initialises the column and split value for the node. + + /split_value_or_rows=None/ can either be the best split value of + a col, or a boolean mask for x that selects the rows to consider for + calculating the split_value - /value=None/ @todo + /col=None/ if the node object has a split_value, then it also has a col + that belongs to this value """ - self.value = value + self.split_value_or_rows = split_value_or_rows + self.col = col def add_split(self, left, right): """ - @todo: Docstring for add_split + Method that is called in the main loop of tree_grow. + + Lets the node object point to two other objects that can be either Leaf + or Node. """ self.left = left self.right = right -class Leaf: - def __init__(self, value: int): - self.value = value +# class Leaf: +# """ +# Simple class that contains only the majority class in the leaf node. +# """ +# def __init__(self, maj_class): +# """Initialises the majority vote. + +# /maj_class/ @todo + +# """ -class Tree(): +class Tree: """ - @todo: docstring for Tree + Tree object that points towards the root node. """ def __init__(self, root_node_obj): - """@todo: Docstring for init method. + """Initialises only by pointing to a Node object. - /root_node_obj/ @todo + /root_node_obj/ is a node object that is made before entering the main + loop of tree grow. """ self.tree = root_node_obj @@ -105,10 +128,10 @@ def impurity(array) -> int: return gini_index -def bestsplit(x, y, slices) -> int: +def bestsplit(x, y) -> int: """ - x = vector of num values - y = vector of class labels ... array([{x: x is 0 or 1}]) ?? + x = vector of single col + y = vector of classes (last col in x) Consider splits of type "x <= c" where "c" is the average of two consecutive values of x in the sorted order. @@ -125,58 +148,43 @@ def bestsplit(x, y, slices) -> int: """ x_sorted = np.sort(np.unique(x)) if len(x_sorted) <= 2: - # Allows for normal cat classes slicing + # Allows splitting on categorical (0 or 1) cols split_points = [0.5] else: + # Take average between consecutive numerical rows in the x col split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2 - best_dict = None + # De toepassing van bestsplit verdeelt de x col vector in tweeen, twee + # arrays van "x rows". Deze moeten we terug krijgen om de in de child nodes + # bestsplit toe te passen. + # + # Deze lus berekent de best split value, en op basis daarvan weten we welke + # twee "x rows" arrays we moeten returnen, en welke split value het beste + # was natuurlijk. + best_delta_i = None for split in split_points: - x_slices = { - # "left": [row for row in range(len(x)) if x[row] > split], - # "right": [row for row in range(len(x)) if x[row] <= split] + # np.index_exp maakt een boolean vector die zegt welke elementen in de + # col van x hoger of lager zijn dan split + col_slice_boolean_matrices = { "left": np.index_exp[x > split], "right": np.index_exp[x <= split] } - # delta_i formule - delta_i = impurity(y) - (len(y[x_slices["left"]]) * impurity( - y[x_slices["left"]]) + len(y[x_slices["right"]]) * - impurity(y[x_slices["right"]])) / len(y) - - # this part is pretty bad - if isinstance(slices, dict): - x_slices = { - "left": slices["left"][x_slices["left"]], - "right": slices["right"][x_slices["right"]] - # "left": np.index_exp[x > split], - # "right": np.index_exp[x <= split] - } - else: - x_slices = { - "left": slices[x_slices["left"]], - "right": slices[x_slices["right"]] - # "left": np.index_exp[x > split], - # "right": np.index_exp[x <= split] - } + # delta_i formule met de boolean vector van hierboven + delta_i = impurity( + y) - (len(y[col_slice_boolean_matrices["left"]]) * + impurity(y[col_slice_boolean_matrices["left"]]) + + len(y[col_slice_boolean_matrices["right"]]) * + impurity(y[col_slice_boolean_matrices["right"]])) / len(y) print(f"{split=}, {delta_i=}") - # slices = bool_array_2_row_number(x_slices, slices) - if best_dict is not None: - if delta_i > best_dict["delta_i"]: - best_dict = { - # Make slices work regardless of np array dimensions with this list comprehension - "slices": x_slices, - "split": split, - "delta_i": delta_i - } + # + if best_delta_i is not None: + if delta_i > best_delta_i: + best_delta_i, best_split, best_col_slice_boolean_matrices = delta_i, split, col_slice_boolean_matrices else: - best_dict = { - "slices": x_slices, - "split": split, - "delta_i": delta_i - } - return best_dict + best_delta_i, best_split, best_col_slice_boolean_matrices = delta_i, split, col_slice_boolean_matrices + return best_delta_i, best_split, best_col_slice_boolean_matrices # @@ -193,72 +201,84 @@ def tree_grow(x=None, """ @todo: Docstring for tree_grow """ - # store slice as variable - slices = np.array([row for row in range(len(y))]) - # Initiate the nodelist with tuples of slice and class labels - nodelist = [Node(value=slices)] - tree = Tree(nodelist[0]) - while nodelist: - current_node = nodelist.pop() - slices = current_node.value - node_classes = y[slices] - # print(node_classes) - - # f'Current node will be leaf node if (( (number of data "tuples" in child node) < {n_min=} )) \n' - # put stopping rules here before making a split - if len(node_classes) < n_min: - current_node.value = Leaf( - np.argmax(np.bincount(node_classes.astype(int)))) - print(f"leaf node has majority clas:\n{current_node.value.value=}") - continue + # De nodelist heeft in het begin alleen een lijst met alle rows van x, + # omdat alle rows in de root in acht worden genomen voor bestsplit berekening. + # + # Dit representeren we met een boolean vector, met lengte het aantal rows + # in x en elementen True. Deze boolean vector zullen we repeatedly gebruiken als een + # mask over x om de rows voor bestsplit op te halen. + rows = np.full((1,len(x)), True) + + # Het eerste node object moet nu geinstantieerd worden + root = Node(split_value_or_rows=rows) + nodelist = [rows] - if impurity(node_classes) > 0: - # print( - # f"Exhaustive split search says, new node will check these rows for potential spliterinos:\n{x[slices]}" - # ) - - # If we arrive here ever we are splitting - # bestsplit(col, node_labels) -> - # {"slices": list[int], "split": numpyfloat, "best_delta_i": numpyfloat} - - # slices (list) used for knowing which rows (int) to consider in a node - # best_split saved in current_node.value - # best_delta_i used to find best split among x_columns - best_dict = None - for i, x_col in enumerate(x[slices].transpose()): - print( - "\nExhaustive split search says; \"Entering new column\":") - col_split_dict = bestsplit(x_col, node_classes, slices) - - if best_dict is not None: - if col_split_dict["delta_i"] > best_dict["delta_i"]: - best_dict = col_split_dict - best_dict["col"] = i - else: - best_dict = col_split_dict - best_dict["col"] = i - print("\nThe best split for current node:", best_dict) - - # Here we store the splitted data into Node objects - current_node.value = best_dict["split"] - current_node.col = best_dict["col"] - # Split will not happen if (( (number of data "tuples" potential split) < {min_leaf=} ))\n' - if min([len(x) for x in best_dict["slices"].values()]) < min_leaf: - continue - else: - # Invert left and right because we want left to pop() first - children = [ - Node(value=best_dict["slices"]["right"]), - Node(value=best_dict["slices"]["left"]) - ] - current_node.add_split(children[1], children[0]) - nodelist += children - else: - current_node.value = Leaf( - np.argmax(np.bincount(node_classes.astype(int)))) - print(f"\n\nLEAF NODE has majority clas:\n{current_node.value.value=}") - continue - return tree + # Initiate the nodelist with tuples of slice and class labels + # nodelist = [Node(value=slices)] + # tree = Tree(nodelist[0]) + # while nodelist: + # current_node = nodelist.pop() + # slices = current_node.value + # node_classes = y[slices] + # # print(node_classes) + + # # f'Current node will be leaf node if (( (number of data "tuples" in child node) < {n_min=} )) \n' + # # put stopping rules here before making a split + # if len(node_classes) < n_min: + # current_node.value = Leaf( + # np.argmax(np.bincount(node_classes.astype(int)))) + # print(f"leaf node has majority clas:\n{current_node.value.value=}") + # continue + + # if impurity(node_classes) > 0: + # # print( + # # f"Exhaustive split search says, new node will check these rows for potential spliterinos:\n{x[slices]}" + # # ) + + # # If we arrive here ever we are splitting + # # bestsplit(col, node_labels) -> + # # {"slices": list[int], "split": numpyfloat, "best_delta_i": numpyfloat} + + # # slices (list) used for knowing which rows (int) to consider in a node + # # best_split saved in current_node.value + # # best_delta_i used to find best split among x_columns + # best_dict = None + # for i, x_col in enumerate(x[slices].transpose()): + # print( + # "\nExhaustive split search says; \"Entering new column\":") + # col_split_dict = bestsplit(x_col, node_classes, slices) + + # if best_dict is not None: + # if col_split_dict["delta_i"] > best_dict["delta_i"]: + # best_dict = col_split_dict + # best_dict["col"] = i + # else: + # best_dict = col_split_dict + # best_dict["col"] = i + # print("\nThe best split for current node:", best_dict) + + # # Here we store the splitted data into Node objects + # current_node.value = best_dict["split"] + # current_node.col = best_dict["col"] + # # Split will not happen if (( (number of data "tuples" potential split) < {min_leaf=} ))\n' + # if min([len(x) for x in best_dict["slices"].values()]) < min_leaf: + # continue + # else: + # # Invert left and right because we want left to pop() first + # children = [ + # Node(value=best_dict["slices"]["right"]), + # Node(value=best_dict["slices"]["left"]) + # ] + # current_node.add_split(children[1], children[0]) + # nodelist += children + # else: + # current_node.value = Leaf( + # np.argmax(np.bincount(node_classes.astype(int)))) + # print( + # f"\n\nLEAF NODE has majority clas:\n{current_node.value.value=}" + # ) + # continue + # return tree def predict(x, nodes) -> list: @@ -268,7 +288,7 @@ def predict(x, nodes) -> list: # which row to drop # print(x) drop = 0 - while not set(nodes).issubset({0,1}): + while not set(nodes).issubset({0, 1}): print(nodes) # print(x[drop]) if isinstance(nodes[drop].value, Leaf): @@ -318,7 +338,7 @@ if __name__ == '__main__': } # Calling the tree grow, unpacking default as argument - # tree_grow(**tree_grow_defaults) + tree_grow(**tree_grow_defaults) #### TREE_PRED TEST tree_pred_defaults = { @@ -326,7 +346,7 @@ if __name__ == '__main__': 'tr': tree_grow(**tree_grow_defaults) } - tree_pred(**tree_pred_defaults) + # tree_pred(**tree_pred_defaults) -start_time = time.time() -print("--- %s seconds ---" % (time.time() - start_time)) +# start_time = time.time() +# print("--- %s seconds ---" % (time.time() - start_time)) |
