diff options
| author | Mike Vink <mike1994vink@gmail.com> | 2020-09-18 03:29:16 +0200 |
|---|---|---|
| committer | Mike Vink <mike1994vink@gmail.com> | 2020-09-18 03:29:16 +0200 |
| commit | 736ef1bb41a821fbaa0fae2a07fed188897580b4 (patch) | |
| tree | 38905652c189461d6973a1f0cf81aee58175cee6 /ass1_vectorized.py | |
| parent | 937468b3c2c3c59af9e80cf84323686fb28f5078 (diff) | |
nvm
Diffstat (limited to 'ass1_vectorized.py')
| -rw-r--r-- | ass1_vectorized.py | 369 |
1 files changed, 0 insertions, 369 deletions
diff --git a/ass1_vectorized.py b/ass1_vectorized.py deleted file mode 100644 index 08a4f8e..0000000 --- a/ass1_vectorized.py +++ /dev/null @@ -1,369 +0,0 @@ -import time -import numpy as np - -credit_data = np.genfromtxt('./credit_score.txt', - delimiter=',', - skip_header=True) - -# age,married,house,income,gender,class -# [(22, 0, 0, 28, 1, 0) -# (46, 0, 1, 32, 0, 0) -# (24, 1, 1, 24, 1, 0) -# (25, 0, 0, 27, 1, 0) -# (29, 1, 1, 32, 0, 0) -# (45, 1, 1, 30, 0, 1) -# (63, 1, 1, 58, 1, 1) -# (36, 1, 0, 52, 1, 1) -# (23, 0, 1, 40, 0, 1) -# (50, 1, 1, 28, 0, 1)] - -# In the program data points are called rows - -# In the program categorical or numerical attributes are called cols for columns - -# The last column are the classes and will be called as classes in the program - - -class Tree(): - """ - @todo: docstring for Tree - """ - def __init__(self, tree_vec_of_tuples): - """@todo: Docstring for init method. - - /root_node_obj/ @todo - - """ - self.tree_vec_of_tuples = tree_vec_of_tuples - self.leaf_nodes = np.where(tree_vec_of_tuples[:,1] == -1) - self.classes = [(1, -1), (1, -1)] - - def drop(self, y): - """ - @todo: Docstring for drop - """ - return y * 100 - - def leaf(self, y): - """ - @todo: Docstring for drop - """ - return y - - - - - -def impurity(array) -> int: - """ - Assumes the argument array is a one dimensional vector of zeroes and ones. - Computes the gini index impurity based on the relative frequency of ones in - the vector. - - Example: - - >>> array=np.array([1,0,1,1,1,0,0,1,1,0,1]) - >>> array - array([1,0,1,1,1,0,0,1,1,0,1]) - - >>> impurity(array) - 0.23140495867768596 - """ - # Total labels - n_labels = len(array) - if n_labels == 0: - # Prevents division by zero, when the potential split does not have any rows - n_labels = 1 - # Number of tuples labeled 1 - n_labels_1 = array.sum() - # Calculate the relative frequency of ones with respect to the total labels - rel_freq_1 = n_labels_1 / n_labels - # Use the symmetry around the median property to also calculate the - # relative frequency of zeroes - rel_freq_0 = 1 - rel_freq_1 - # Multiply the frequencies to get the gini index - gini_index = rel_freq_1 * rel_freq_0 - return gini_index - - -def bestsplit(x, y) -> int: - """ - x = vector of single col - y = vector of classes (last col in x) - - Consider splits of type "x <= c" where "c" is the average of two consecutive - values of x in the sorted order. - - x and y must be of the same length - - y[i] must be the class label of the i-th observation, and x[i] is the - correspnding value of attribute x - - Example (best split on income): - - >>> bestsplit(credit_data[:,3],credit_data[:,5]) - 36 - """ - x_sorted = np.sort(np.unique(x)) - if len(x_sorted) <= 2: - # Allows splitting on categorical (0 or 1) cols - split_points = [0.5] - else: - # Take average between consecutive numerical rows in the x col - split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2 - - # De toepassing van bestsplit verdeelt de x col vector in tweeen, twee - # arrays van "x rows". Deze moeten we terug krijgen om de in de child nodes - # bestsplit toe te passen. - # - # Deze lus berekent de best split value, en op basis daarvan weten we welke - # twee "x rows" arrays we moeten returnen, en welke split value het beste - # was natuurlijk. - best_delta_i = None - for split in split_points: - # np.index_exp maakt een boolean vector die zegt welke elementen in de - # col van x hoger of lager zijn dan split - col_slice_boolean_matrices = { - "left": np.index_exp[x > split], - "right": np.index_exp[x <= split] - } - - # delta_i formule met de boolean vector van hierboven - delta_i = impurity( - y) - (len(y[col_slice_boolean_matrices["left"]]) * - impurity(y[col_slice_boolean_matrices["left"]]) + - len(y[col_slice_boolean_matrices["right"]]) * - impurity(y[col_slice_boolean_matrices["right"]])) / len(y) - - print(f"{split=}, {delta_i=}") - # - if best_delta_i is not None: - if delta_i > best_delta_i: - best_delta_i, best_split, best_col_slice_boolean_matrices = delta_i, split, col_slice_boolean_matrices - else: - best_delta_i, best_split, best_col_slice_boolean_matrices = delta_i, split, col_slice_boolean_matrices - return best_delta_i, best_split, best_col_slice_boolean_matrices - -def tree_example(x=None, tr=None, **defaults) -> None: - """ - @todo: Docstring for tree_example - """ - tree_vec = [] - print(tree_vec) - print(type(tree_vec)) - tree_vec.append((36,3)) - print(tree_vec) - tree_vec.append((0,3)) - print(tree_vec) - tree_vec.append((1, -1)) - print(tree_vec) - print(tree_vec[0]) - print(type(tree_vec[0])) - tree_vec = np.array(tree_vec) # , dtype=(int, 2)) - print(tree_vec) - print(type(tree_vec[0])) - - tree = Tree(tree_vec) - - # Let's show how to predict - # 1. maak een vector met root node voor elke row in x waarvoor je een class - # wil predicten. - y = np.ones(len(x), dtype=int) - print(y) - print(type(y)) - print(y.shape) - # 2. Herinner recurrence relatie van whatsapp, en pas hem toe met - # tree.drop(x) op nodes die geen leaf node zijn - # - # Returns indices where not leaf node - print(tree.leaf_nodes) - # print(tree.classes) - # leafs = np.searchsorted( - # print(leafs) - y = np.where(np.searchsorted(tree.leaf_nodes, y)) - - # y = y[:,0] - # print(y) - - - -# -# -# Put all helper functions above this comment! - - -def tree_grow(x=None, - y=None, - n_min=None, - min_leaf=None, - n_feat=None, - **defaults) -> Tree: - """ - @todo: Docstring for tree_grow - """ - # Voor de lus die onze tree growt instantieren we een list die tuples als - # elementen zal hebben, het grote voordeel is dat we op deze manier vector - # operaties meerdere parallele - # prediction drops kunnen doen. (Je kan bijvoorbeeld geen object methodes - # broadcasten als je een numpy array van node objecten hebt) - # - # De tuple moet uiteindelijk de informatie bevatten om voor een row in x - # een class te voorspellen. Hier hebben we voor nodig: - # - # 1. Het split value, voor lager of hoger test - # 2. Het col nummer waar de split bij hoort, anders weten we niet waar we op testen - # - # (split, col) - # - # De enige uitzondering hierop zijn leaf nodes. Om de tree data structure - # een numpy array te maken moeten dit ook tuples zijn. Dit lossen we op - # door een negatieve col aan te duiden. Dit zorgt ervoor dat de prediction - # functie hier eindigt. - # - # (class, negative_int: -1) - # - # Checkout tree example function for more info - tree_vec = [] - # De nodelist heeft in het begin alleen de alle rows van x, omdat alle rows - # altijd in de root in acht worden genomen. - # - # Dit representeren we met een boolean vector, met lengte het aantal rows in x en elementen True. - rows = np.full((1,len(x)), True) - nodelist = [rows] - - # tree_array = np.empty - # while nodelist: - # current_node = nodelist.pop() - # slices = current_node.value - # node_classes = y[slices] - # # print(node_classes) - - # # f'Current node will be leaf node if (( (number of data "tuples" in child node) < {n_min=} )) \n' - # # put stopping rules here before making a split - # if len(node_classes) < n_min: - # current_node.value = Leaf( - # np.argmax(np.bincount(node_classes.astype(int)))) - # print(f"leaf node has majority clas:\n{current_node.value.value=}") - # continue - - # if impurity(node_classes) > 0: - # # print( - # # f"Exhaustive split search says, new node will check these rows for potential spliterinos:\n{x[slices]}" - # # ) - - # # If we arrive here ever we are splitting - # # bestsplit(col, node_labels) -> - # # {"slices": list[int], "split": numpyfloat, "best_delta_i": numpyfloat} - - # # slices (list) used for knowing which rows (int) to consider in a node - # # best_split saved in current_node.value - # # best_delta_i used to find best split among x_columns - # best_dict = None - # for i, x_col in enumerate(x[slices].transpose()): - # print( - # "\nExhaustive split search says; \"Entering new column\":") - # col_split_dict = bestsplit(x_col, node_classes, slices) - - # if best_dict is not None: - # if col_split_dict["delta_i"] > best_dict["delta_i"]: - # best_dict = col_split_dict - # best_dict["col"] = i - # else: - # best_dict = col_split_dict - # best_dict["col"] = i - # print("\nThe best split for current node:", best_dict) - - # # Here we store the splitted data into Node objects - # current_node.value = best_dict["split"] - # current_node.col = best_dict["col"] - # # Split will not happen if (( (number of data "tuples" potential split) < {min_leaf=} ))\n' - # if min([len(x) for x in best_dict["slices"].values()]) < min_leaf: - # continue - # else: - # # Invert left and right because we want left to pop() first - # children = [ - # Node(value=best_dict["slices"]["right"]), - # Node(value=best_dict["slices"]["left"]) - # ] - # current_node.add_split(children[1], children[0]) - # nodelist += children - # else: - # current_node.value = Leaf( - # np.argmax(np.bincount(node_classes.astype(int)))) - # print( - # f"\n\nLEAF NODE has majority clas:\n{current_node.value.value=}" - # ) - # continue - # return tree - - -def predict(x, nodes) -> list: - """ - @todo: Docstring for predict - """ - # which row to drop - # print(x) - drop = 0 - while not set(nodes).issubset({0, 1}): - print(nodes) - # print(x[drop]) - if isinstance(nodes[drop].value, Leaf): - nodes[drop] = nodes[drop].value.value - drop += 1 - continue - - print(nodes[drop].value) - print(nodes[drop].col) - # print(nodes[drop].col) - if x[drop, nodes[drop].col] > nodes[drop].value: - nodes[drop] = nodes[drop].left - else: - nodes[drop] = nodes[drop].right - return np.array(nodes) - - -def tree_pred(x=None, tr=None, **defaults) -> np.array: - """ - @todo: Docstring for tree_pred - """ - nodes = [tr.tree] * len(x) - # y = np.linspace(0, len(x), 0) - # y = np.array(ele) - y = predict(x, nodes) - print(f"\n\nPredicted classes for {x=}\n\n are: {y=}") - return y - - -if __name__ == '__main__': - #### IMPURITY TEST - # array=np.array([1,0,1,1,1,0,0,1,1,0,1]) - # print(impurity(array)) - # Should give 0.23.... - - #### BESTSPLIT TEST - # print(bestsplit(credit_data[:, 3], credit_data[:, 5])) - # Should give 36 - - #### TREE_GROW TEST - tree_grow_defaults = { - 'x': credit_data[:, :5], - 'y': credit_data[:, 5], - 'n_min': 2, - 'min_leaf': 1, - 'n_feat': 5 - } - - # Calling the tree grow, unpacking default as argument - # tree_grow(**tree_grow_defaults) - - #### TREE_PRED TEST - tree_pred_defaults = { - 'x': credit_data[:, :5], - # 'tr': tree_grow(**tree_grow_defaults) - } - - tree_example(**tree_pred_defaults) - # tree_pred(**tree_pred_defaults) - -# start_time = time.time() -# print("--- %s seconds ---" % (time.time() - start_time)) |
