diff options
Diffstat (limited to 'assignment1.py')
| -rw-r--r-- | assignment1.py | 274 |
1 files changed, 147 insertions, 127 deletions
diff --git a/assignment1.py b/assignment1.py index 64b865d..e0fb29b 100644 --- a/assignment1.py +++ b/assignment1.py @@ -17,40 +17,63 @@ credit_data = np.genfromtxt('./credit_score.txt', # (23, 0, 1, 40, 0, 1) # (50, 1, 1, 28, 0, 1)] +# In the program data points are called rows -class Node(): +# In the program categorical or numerical attributes are called cols for columns + +# The last column are the classes and will be called as classes in the program + + +class Node: """ - @todo: docstring for Node + The node object points to two other Node objects. """ - def __init__(self, value=None): - """@todo: Docstring for init method. + def __init__(self, split_value_or_rows=None, col=None): + """Initialises the column and split value for the node. + + /split_value_or_rows=None/ can either be the best split value of + a col, or a boolean mask for x that selects the rows to consider for + calculating the split_value - /value=None/ @todo + /col=None/ if the node object has a split_value, then it also has a col + that belongs to this value """ - self.value = value + self.split_value_or_rows = split_value_or_rows + self.col = col def add_split(self, left, right): """ - @todo: Docstring for add_split + Method that is called in the main loop of tree_grow. + + Lets the node object point to two other objects that can be either Leaf + or Node. """ self.left = left self.right = right -class Leaf: - def __init__(self, value: int): - self.value = value +# class Leaf: +# """ +# Simple class that contains only the majority class in the leaf node. +# """ +# def __init__(self, maj_class): +# """Initialises the majority vote. + +# /maj_class/ @todo + +# """ -class Tree(): +class Tree: """ - @todo: docstring for Tree + Tree object that points towards the root node. """ def __init__(self, root_node_obj): - """@todo: Docstring for init method. + """Initialises only by pointing to a Node object. - /root_node_obj/ @todo + /root_node_obj/ is a node object that is made before entering the main + loop of tree grow. """ self.tree = root_node_obj @@ -105,10 +128,10 @@ def impurity(array) -> int: return gini_index -def bestsplit(x, y, slices) -> int: +def bestsplit(x, y) -> int: """ - x = vector of num values - y = vector of class labels ... array([{x: x is 0 or 1}]) ?? + x = vector of single col + y = vector of classes (last col in x) Consider splits of type "x <= c" where "c" is the average of two consecutive values of x in the sorted order. @@ -125,58 +148,43 @@ def bestsplit(x, y, slices) -> int: """ x_sorted = np.sort(np.unique(x)) if len(x_sorted) <= 2: - # Allows for normal cat classes slicing + # Allows splitting on categorical (0 or 1) cols split_points = [0.5] else: + # Take average between consecutive numerical rows in the x col split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2 - best_dict = None + # De toepassing van bestsplit verdeelt de x col vector in tweeen, twee + # arrays van "x rows". Deze moeten we terug krijgen om de in de child nodes + # bestsplit toe te passen. + # + # Deze lus berekent de best split value, en op basis daarvan weten we welke + # twee "x rows" arrays we moeten returnen, en welke split value het beste + # was natuurlijk. + best_delta_i = None for split in split_points: - x_slices = { - # "left": [row for row in range(len(x)) if x[row] > split], - # "right": [row for row in range(len(x)) if x[row] <= split] + # np.index_exp maakt een boolean vector die zegt welke elementen in de + # col van x hoger of lager zijn dan split + col_slice_boolean_matrices = { "left": np.index_exp[x > split], "right": np.index_exp[x <= split] } - # delta_i formule - delta_i = impurity(y) - (len(y[x_slices["left"]]) * impurity( - y[x_slices["left"]]) + len(y[x_slices["right"]]) * - impurity(y[x_slices["right"]])) / len(y) - - # this part is pretty bad - if isinstance(slices, dict): - x_slices = { - "left": slices["left"][x_slices["left"]], - "right": slices["right"][x_slices["right"]] - # "left": np.index_exp[x > split], - # "right": np.index_exp[x <= split] - } - else: - x_slices = { - "left": slices[x_slices["left"]], - "right": slices[x_slices["right"]] - # "left": np.index_exp[x > split], - # "right": np.index_exp[x <= split] - } + # delta_i formule met de boolean vector van hierboven + delta_i = impurity( + y) - (len(y[col_slice_boolean_matrices["left"]]) * + impurity(y[col_slice_boolean_matrices["left"]]) + + len(y[col_slice_boolean_matrices["right"]]) * + impurity(y[col_slice_boolean_matrices["right"]])) / len(y) print(f"{split=}, {delta_i=}") - # slices = bool_array_2_row_number(x_slices, slices) - if best_dict is not None: - if delta_i > best_dict["delta_i"]: - best_dict = { - # Make slices work regardless of np array dimensions with this list comprehension - "slices": x_slices, - "split": split, - "delta_i": delta_i - } + # + if best_delta_i is not None: + if delta_i > best_delta_i: + best_delta_i, best_split, best_col_slice_boolean_matrices = delta_i, split, col_slice_boolean_matrices else: - best_dict = { - "slices": x_slices, - "split": split, - "delta_i": delta_i - } - return best_dict + best_delta_i, best_split, best_col_slice_boolean_matrices = delta_i, split, col_slice_boolean_matrices + return best_delta_i, best_split, best_col_slice_boolean_matrices # @@ -193,72 +201,84 @@ def tree_grow(x=None, """ @todo: Docstring for tree_grow """ - # store slice as variable - slices = np.array([row for row in range(len(y))]) - # Initiate the nodelist with tuples of slice and class labels - nodelist = [Node(value=slices)] - tree = Tree(nodelist[0]) - while nodelist: - current_node = nodelist.pop() - slices = current_node.value - node_classes = y[slices] - # print(node_classes) - - # f'Current node will be leaf node if (( (number of data "tuples" in child node) < {n_min=} )) \n' - # put stopping rules here before making a split - if len(node_classes) < n_min: - current_node.value = Leaf( - np.argmax(np.bincount(node_classes.astype(int)))) - print(f"leaf node has majority clas:\n{current_node.value.value=}") - continue + # De nodelist heeft in het begin alleen een lijst met alle rows van x, + # omdat alle rows in de root in acht worden genomen voor bestsplit berekening. + # + # Dit representeren we met een boolean vector, met lengte het aantal rows + # in x en elementen True. Deze boolean vector zullen we repeatedly gebruiken als een + # mask over x om de rows voor bestsplit op te halen. + rows = np.full((1,len(x)), True) + + # Het eerste node object moet nu geinstantieerd worden + root = Node(split_value_or_rows=rows) + nodelist = [rows] - if impurity(node_classes) > 0: - # print( - # f"Exhaustive split search says, new node will check these rows for potential spliterinos:\n{x[slices]}" - # ) - - # If we arrive here ever we are splitting - # bestsplit(col, node_labels) -> - # {"slices": list[int], "split": numpyfloat, "best_delta_i": numpyfloat} - - # slices (list) used for knowing which rows (int) to consider in a node - # best_split saved in current_node.value - # best_delta_i used to find best split among x_columns - best_dict = None - for i, x_col in enumerate(x[slices].transpose()): - print( - "\nExhaustive split search says; \"Entering new column\":") - col_split_dict = bestsplit(x_col, node_classes, slices) - - if best_dict is not None: - if col_split_dict["delta_i"] > best_dict["delta_i"]: - best_dict = col_split_dict - best_dict["col"] = i - else: - best_dict = col_split_dict - best_dict["col"] = i - print("\nThe best split for current node:", best_dict) - - # Here we store the splitted data into Node objects - current_node.value = best_dict["split"] - current_node.col = best_dict["col"] - # Split will not happen if (( (number of data "tuples" potential split) < {min_leaf=} ))\n' - if min([len(x) for x in best_dict["slices"].values()]) < min_leaf: - continue - else: - # Invert left and right because we want left to pop() first - children = [ - Node(value=best_dict["slices"]["right"]), - Node(value=best_dict["slices"]["left"]) - ] - current_node.add_split(children[1], children[0]) - nodelist += children - else: - current_node.value = Leaf( - np.argmax(np.bincount(node_classes.astype(int)))) - print(f"\n\nLEAF NODE has majority clas:\n{current_node.value.value=}") - continue - return tree + # Initiate the nodelist with tuples of slice and class labels + # nodelist = [Node(value=slices)] + # tree = Tree(nodelist[0]) + # while nodelist: + # current_node = nodelist.pop() + # slices = current_node.value + # node_classes = y[slices] + # # print(node_classes) + + # # f'Current node will be leaf node if (( (number of data "tuples" in child node) < {n_min=} )) \n' + # # put stopping rules here before making a split + # if len(node_classes) < n_min: + # current_node.value = Leaf( + # np.argmax(np.bincount(node_classes.astype(int)))) + # print(f"leaf node has majority clas:\n{current_node.value.value=}") + # continue + + # if impurity(node_classes) > 0: + # # print( + # # f"Exhaustive split search says, new node will check these rows for potential spliterinos:\n{x[slices]}" + # # ) + + # # If we arrive here ever we are splitting + # # bestsplit(col, node_labels) -> + # # {"slices": list[int], "split": numpyfloat, "best_delta_i": numpyfloat} + + # # slices (list) used for knowing which rows (int) to consider in a node + # # best_split saved in current_node.value + # # best_delta_i used to find best split among x_columns + # best_dict = None + # for i, x_col in enumerate(x[slices].transpose()): + # print( + # "\nExhaustive split search says; \"Entering new column\":") + # col_split_dict = bestsplit(x_col, node_classes, slices) + + # if best_dict is not None: + # if col_split_dict["delta_i"] > best_dict["delta_i"]: + # best_dict = col_split_dict + # best_dict["col"] = i + # else: + # best_dict = col_split_dict + # best_dict["col"] = i + # print("\nThe best split for current node:", best_dict) + + # # Here we store the splitted data into Node objects + # current_node.value = best_dict["split"] + # current_node.col = best_dict["col"] + # # Split will not happen if (( (number of data "tuples" potential split) < {min_leaf=} ))\n' + # if min([len(x) for x in best_dict["slices"].values()]) < min_leaf: + # continue + # else: + # # Invert left and right because we want left to pop() first + # children = [ + # Node(value=best_dict["slices"]["right"]), + # Node(value=best_dict["slices"]["left"]) + # ] + # current_node.add_split(children[1], children[0]) + # nodelist += children + # else: + # current_node.value = Leaf( + # np.argmax(np.bincount(node_classes.astype(int)))) + # print( + # f"\n\nLEAF NODE has majority clas:\n{current_node.value.value=}" + # ) + # continue + # return tree def predict(x, nodes) -> list: @@ -268,7 +288,7 @@ def predict(x, nodes) -> list: # which row to drop # print(x) drop = 0 - while not set(nodes).issubset({0,1}): + while not set(nodes).issubset({0, 1}): print(nodes) # print(x[drop]) if isinstance(nodes[drop].value, Leaf): @@ -318,7 +338,7 @@ if __name__ == '__main__': } # Calling the tree grow, unpacking default as argument - # tree_grow(**tree_grow_defaults) + tree_grow(**tree_grow_defaults) #### TREE_PRED TEST tree_pred_defaults = { @@ -326,7 +346,7 @@ if __name__ == '__main__': 'tr': tree_grow(**tree_grow_defaults) } - tree_pred(**tree_pred_defaults) + # tree_pred(**tree_pred_defaults) -start_time = time.time() -print("--- %s seconds ---" % (time.time() - start_time)) +# start_time = time.time() +# print("--- %s seconds ---" % (time.time() - start_time)) |
