diff options
| author | Mike Vink <mike1994vink@gmail.com> | 2020-09-25 00:47:27 +0200 |
|---|---|---|
| committer | Mike Vink <mike1994vink@gmail.com> | 2020-09-25 00:47:27 +0200 |
| commit | dd8c5300b721d62a983add45ae38e0ef4425921a (patch) | |
| tree | d10e0b049a2f4a26bb1cf03028436622953be44f /tree.py | |
| parent | 53f6101594a410b6fa6f924d4c256f5f0001c0e5 (diff) | |
Fix: compared time profile and went back
Diffstat (limited to 'tree.py')
| -rw-r--r-- | tree.py | 332 |
1 file changed, 98 insertions, 234 deletions
@@ -1,9 +1,9 @@ import numpy as np -from sklearn import metrics -import pstats -from pstats import SortKey import cProfile +import pstats +from pstats import SortKey +from sklearn import metrics # age,married,house,income,gender,class # [(22, 0, 0, 28, 1, 0) @@ -86,10 +86,9 @@ class Tree: # De index van de row van x die we in de boom willen droppen drop = 0 - while not all(pred_class in {0,1} for pred_class in nodes): + while not all(pred_class in {0, 1} for pred_class in nodes): # Als de col None is dan is het een leaf node, dus dan is de row # van x hier beland - # print(nodes[drop].col, nodes[drop].split_value_or_rows) if nodes[drop].col is None: nodes[drop] = nodes[drop].split_value_or_rows drop += 1 @@ -148,21 +147,10 @@ def impurity(array) -> int: >>> impurity(array) 0.23140495867768596 """ - # Total labels n_labels = len(array) - if n_labels == 0: - print( - "division by zero will happen, child node is pure, doesnt contain anything" - ) - n_labels = 1 - # Number of tuples labeled 1 n_labels_1 = array.sum() - # Calculate the relative frequency of ones with respect to the total labels rel_freq_1 = n_labels_1 / n_labels - # Use the symmetry around the median property to also calculate the - # relative frequency of zeroes rel_freq_0 = 1 - rel_freq_1 - # Multiply the frequencies to get the gini index gini_index = rel_freq_1 * rel_freq_0 return gini_index @@ -186,110 +174,70 @@ def bestsplit(x, y, minleaf) -> None: 36 """ x_sorted = np.sort(np.unique(x)) - # if len(x_sorted) <= 2: - # # Allows splitting on categorical (0 or 1) cols - # split_points = x_sorted / 2 - # else: - # # Take average between consecutive numerical rows in the x col - # split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2 - split_points = (x_sorted[:len(x_sorted) - 1] + x_sorted[1:]) / 2 - # print(split_points) - - # De toepassing van bestsplit verdeelt de x col vector in tweeen, twee - # arrays van "x rows". 
Deze moeten we terug krijgen om de in de child nodes - # bestsplit toe te passen. - # - # Deze lus berekent de best split value, en op basis daarvan weten we welke - # twee "x rows" arrays we moeten returnen, en welke split value het beste - # was natuurlijk. - # Hieren stacken we arrays onder (rows for children, delta_i, split value, col will be added later) - best_array = np.zeros((split_points.shape[0], x.shape[0] + 3), dtype='float64') - # print(f"{best_array=}") + # Hieren stoppen we (delta_i, split_value, rows_left, rows_right) + best_list = [] # Stop wanneer de array met split points leeg is while split_points.size != 0: - # Huidige split value split_value = split_points[-1] - # print(split_value) - # boolean masks om x rows mee te masken/slicen - # print(best_array.shape) - # print(best_array) - # print(f"{best_array[current_row_of_best_array,:-2]=}") - - # class voor beide split kanten - classes_left, classes_right = y[x > split_value], y[x <= split_value] - # print(f"{classes_left=} {classes_right=}") + mask_left, mask_right = x > split_value, x <= split_value + classes_left, classes_right = y[mask_left], y[mask_right] # Kijk of er genoeg rows in de gesplitte nodes terechtkomen, anders # mogen we de split niet toelaten vanwege de minleaf constraint - # print(f"{classes_left.shape[0]=}") - if classes_left.shape[0] < minleaf or classes_right.shape[0] < minleaf: - # Haal de huidige split_point uit split_points + if len(classes_left) < minleaf or len(classes_right) < minleaf: split_points = split_points[:-1] continue - # delta_i formule, improved by not taking the parent impurity into - # account, and not making the weighted average but the weigthed sum - # only (thanks Lonnie) - delta_i = ( - impurity(classes_left) * classes_left.shape[0] + - impurity(classes_right) * classes_right.shape[0]) + delta_i = (impurity(classes_left) * len(classes_left) + + impurity(classes_right) * len(classes_right)) # stop huidige splits in de lijst om best split te berekenen - 
# print(f"{np.array((delta_i, mask_left, mask_right, split_value))=}") - # print(f"{best_array[:, classes_left.shape[0]-1]=}") - current_row_of_best_array = split_points.shape[0]-1 - best_array[current_row_of_best_array, :x.shape[0]] = x > split_value - best_array[current_row_of_best_array,-3:-1] = delta_i, split_value - # print(best_array) + best_list.append((delta_i, mask_left, mask_right, split_value)) # Haal de huidige split_point uit split_points split_points = split_points[:-1] # Bereken de best split voor deze x col, als er ten minste 1 bestaat die # voldoet aan min leaf - return best_array + if best_list: + return min(best_list, key=lambda x: x[0]) + else: + return False def exhaustive_split_search(rows, classes, minleaf): """ @todo: Docstring for exhaustive_split_search """ - # print("\t\t->entering exhaustive split search") - # We hebben enumerate nodig, want we willen weten op welke col + # We hebben enumerate nodig, want we willen weten op welke col (i) # (age,married,house,income,gender) we een split doen - exhaustive_best_array = [] - # print(f"Rows:\n{rows},\n Classes:\n{classes}") + exhaustive_best_list = [] for i, col in enumerate(rows.transpose()): - # calculate the best split for the col that satisfies the minleaf - # constraint - best_array_for_col = bestsplit(col, classes, minleaf) - # add col number to rows - best_array_for_col[:,-1] = i - exhaustive_best_array.append(best_array_for_col) - exhaustive_best_array = np.concatenate(exhaustive_best_array) - # print("The array with exhaustive splits is\n", exhaustive_best_array) - # print("\t\t->returning from exhaustive split search") - return exhaustive_best_array + col_best_split = bestsplit(col, classes, minleaf) + if col_best_split: + # add for which row we calculated the best split + col_best_split += (i, ) + exhaustive_best_list.append(col_best_split) + return exhaustive_best_list def add_children(node, best_split): """ @todo: Docstring for add_children """ - # print("\t\t\t->entering add 
children") - # The mask that was used to get the rows for the current node from x, we - # need this to update the rows for the children current_mask = node.split_value_or_rows + mask_left, mask_right, node_split_value, node_col = best_split[1:] # Give the current node the split_value and col it needs for predictions - node.split_value_or_rows, node.col = best_split[-2], int(best_split[-1]) + node.split_value_or_rows, node.col = node_split_value, node_col # Updating the row masks to give it to children, keeping numpy dimension consistent - mask_left, mask_right = update_mask(best_split[:-3], current_mask) + mask_left, mask_right = update_mask(mask_left, current_mask), update_mask( + mask_right, current_mask) # Adding the pointer between parent and children - node.add_split(Node(split_value_or_rows=mask_left), Node(split_value_or_rows=mask_right)) - # print("\t\t\t->children added to node and node list\n") + node.add_split(Node(split_value_or_rows=mask_left), + Node(split_value_or_rows=mask_right)) return [node.left, node.right] @@ -298,33 +246,9 @@ def update_mask(mask, current_mask): Updates the spit bool array from any dimension to an array with length equal to the total number of rows in dataset x. """ - # print( - # "\t\t\t\t->entering update mask to update mask that calculates which rows belong to child" - # ) - # Copy heb ik gedaan omdat we een slice assignment doen. - # - # Het punt van deze functie is dat tijdens het tree growen het aantal rows - # in een node afneemt. Deze functie update de bool array die gebruikt is om - # te splitten in bestsplit naar een array met als lengte het totale aantal - # rows in de data set x. Op deze manier kunnen we in tree grow de totale x - # splitten. - # - # Een andere optie zou zijn om de rows letterlijk tijdelijk op te slaan in - # nodes... weet niet wat beter is, het kan best veel geheugen in beslag - # nemen voor grote dataset neem ik aan... 
Op de manier hier zijn er altijd - # maar 1D bool arrays, die dus een lagere dimensionaliteit hebben als x veel columnen/rows heeft. - current_mask = np.vstack((current_mask, current_mask)) - # print(f"\nBefore update (upper rows goes to the left child, bottom to the right):\n{current_mask}") - # print(mask) - # print(mask.astype(bool)) - # print(current_mask[current_mask == True]) - current_mask[0][current_mask[0] == True] = mask.astype(bool) - current_mask[1][current_mask[1] == True] = np.invert(mask.astype(bool)) - # copy = np.array(current_mask, copy=True) - # copy[np.where(current_mask)] = mask - # print(f"After update:\n{current_mask}") - # print("\t\t\t\t->updated row mask for child node") - return current_mask[0], current_mask[1] + copy = np.array(current_mask, copy=True) + copy[np.where(current_mask)] = mask + return copy # @@ -341,166 +265,106 @@ def tree_grow(x=None, """ @todo: Docstring for tree_grow """ - # De nodelist heeft in het begin alleen een root node met alle rows van x, - # omdat alle rows in de root in acht worden genomen voor bestsplit berekening. - # - # Dit representeren we met een boolean mask over x, met lengte het aantal rows - # in x en elementen True. Deze boolean mask zullen we repeatedly gebruiken als een - # mask over x om de rows voor bestsplit op te halen. mask = np.full(len(x), True) - - # Het eerste node object moet nu geinstantieerd worden root = Node(split_value_or_rows=mask) - - # We instantieren ook gelijk het Tree object tr = Tree(root, (nmin, minleaf, nfeat)) - # De eerste nodelist heeft alleen de root node, daarna zijn twee childs, - # etc. totdat alle splits gemaakt zijn en de lijst leeg is. 
nodelist = [root] - while nodelist: - # print("->Taking new node from the node list") - # Pop de current node uit de nodelist node = nodelist.pop() - # Gebruik de boolean mask van de node om de rows in de node uit x te halen - # print(node_rows) - # Gebruik de boolean mask van de node om de classes in de node uit y te halen node_classes = y[node.split_value_or_rows] - # print(node_classes) - # print(f"{node_classes}, {node_rows}") - - # Test of de node een leaf node is met nmin if len(node_classes) < nmin: node.is_leaf_node(node_classes) - print( - "\t->Leafnode! Node has less rows than nmin, it is a leaf node, continueing to next node" - ) continue - # print("\t->Node has more rows than nmin, it is not a leaf node") - # Als de node niet puur is, probeer dan te splitten if impurity(node_classes) > 0: - # print("\t->Node is not pure yet starting exhaustive split search") - - # Pick a random set of cols from rows (nfeat, important for forests) - nfeat_col_choice = np.random.choice(range(x.shape[1]), nfeat, replace=False) - node_rows = x[node.split_value_or_rows][:,np.sort(nfeat_col_choice)] - - # We gaan exhaustively voor de rows in de node over de cols - # (age,married,house,income,gender) om de bestsplit te - # bepalen - # - # We geven minleaf als argument omdat: - # "If the algorithm performs a split, it should be the best split that meets the minleaf constrain" - # - # We krijgen een exhaustive lijst terug met splits - exhaustive_best_array = exhaustive_split_search( + node_rows = x[node.split_value_or_rows] + exhaustive_best_list = exhaustive_split_search( node_rows, node_classes, minleaf) - - # Als de array alleen maar zeroes heeft -> no splits for the minleaf leaf node - # print(exhaustive_best_array[:,:-1].any()) - # raise SystemExit - if not exhaustive_best_array[:,:-1].any(): + if not exhaustive_best_list: node.is_leaf_node(node_classes) - print( - "\t\t->Leaf node! 
No split that fullfils minleaf was found continueing to next node" - ) continue - # print( - # "\t->Exhaustive search found a split fulfilling the minleaf rule!" - # ) - # Hier halen we de beste split, en rows voor de child/split nodes - # uit de exhaustive best list - # [(exhaustive_best_array[:,:-3] == 0).sum(axis=1)] - best_split = exhaustive_best_array[exhaustive_best_array[:,:-3].sum(axis=1) != 0][np.argmin(exhaustive_best_array[:,-3])] - # print((exhaustive_best_array[:,:-3] == 0).sum(axis=1)) - # print(exhaustive_best_array.shape) - # print(exhaustive_best_array) - # print(exhaustive_best_array[:,:-3]) - # print(exhaustive_best_array[:,:-3] == 0) - # print(exhaustive_best_array[:,:-3].sum(axis=1) != 0) - # print(exhaustive_best_array[exhaustive_best_array[:,:-3]]) - # print(exhaustive_best_array[exhaustive_best_array[:,:-3].sum(axis=0) == 0].shape) - # print(exhaustive_best_array[exhaustive_best_array[:,:-3].sum(axis=0) == 0][:,-3].shape) - # print(best_split) - # raise SystemExit - # print(exhaustive_best_array[:,:-1]) - # raise SystemExit - # print(f"\n######################best split array, the last three element=[delta_i, splitvalue, col/attribute], the rest is boolean include or not include row in child node############################\n\n{best_split}\n\n###############################################################################\n") - # Hier voegen we de twee nieuwe nodes toe aan de gesplitte "parent" + best_split = min(exhaustive_best_list, key=lambda z: z[0]) nodelist += add_children(node, best_split) - # print(exhaustive_best_array.shape) - # node.add_split(left_child_node, right_child_node) + else: + # impurity 0 node.is_leaf_node(node_classes) - print("\t\t->Leafnode! 
The node is already pure, it is necessarily a leaf!") continue - # print(tr) return tr -def tree_pred(x=None, tr=None, **defaults) -> np.array: +def tree_grow_b(x=None, + y=None, + nmin=None, + minleaf=None, + nfeat=None, + m=None, + **defaults) -> Tree: + pass + + +def tree_pred(x=None, tr=None, training=None, **defaults) -> np.array: """ @todo: Docstring for tree_pred """ y = tr.predict(x).astype(float) nmin, minleaf, nfeat = tr.hyper_params - print(np.mean(pima_indians[:,-1] == y)) - # print("-"*80+f"\nPrediction parameters:\nnmin={nmin}\nminleaf={minleaf}\nnfeat={nfeat}") - # print(f"\nFor x rows\n{x}\n\n predicted classes are:\n {y}") + if training is not None: + # print(np.mean(training == y)) + print( + f'Results from: prediction single tree({nmin=}, {minleaf=}, {nfeat=})' + ) + print( + f'\t->Confusion matrix:\n{metrics.confusion_matrix(y, training)}') + print(f'\t->Accuracy:\n\t\t{metrics.accuracy_score(y, training)}') + print(f'\t->Precission:\n\t\t{metrics.precision_score(y, training)}') + print(f'\t->Recall:\n\t\t{metrics.recall_score(y, training)}') return y +def tree_pred_b(x=None, tr=None, training=None, **defaults) -> np.array: + pass -if __name__ == '__main__': - # credit_data = np.genfromtxt('./data/credit_score.txt', - # delimiter=',', - # skip_header=True) - pima_indians = np.genfromtxt('./data/pima_indians.csv', +if __name__ == '__main__': + credit_data = np.genfromtxt('./data/credit_score.txt', delimiter=',', skip_header=True) - #### IMPURITY TEST - # array=np.array([1,0,1,1,1,0,0,1,1,0,1]) - # print(impurity(array)) - # Should give 0.23.... 
- - #### BESTSPLIT TEST - # print(bestsplit(credit_data[:, 3], credit_data[:, 5])) - # Should give 36 - - #### TREE_GROW TEST - # tree_grow_defaults = { - # 'x': credit_data[:, :5], - # 'y': credit_data[:, 5], - # 'nmin': 2, - # 'minleaf': 1, - # 'nfeat': None # 5, should be 5 for bootstrapping folds - # } - - # Calling the tree grow, unpacking default as argument - # tree_grow(**tree_grow_defaults) - - #### TREE_PRED TEST - # tree_pred_defaults = { - # 'x': credit_data[:, :5], - # 'tr': tree_grow(**tree_grow_defaults) - # } - - # y_pred = tree_pred(x=credit_data[:,:5], tr=tree_grow(x=credit_data[:,0:5], y=credit_data[:,5], nmin=2, minleaf=1, nfeat=5), dataset='credit score') - # print(credit_data[:,5] == y_pred) - - # predictions = tree_pred(x=pima_indians[:,:8], tr=tree_grow(x=pima_indians[:,:8], y=pima_indians[:,8], nmin=20, minleaf=5, nfeat=pima_indians.shape[1]-1), dataset='pima indians') - # print(np.mean(pima_indians[:,8] == y_pred)) - - # Time profile of credit data prediction with single tree + + pima_indians = np.genfromtxt('./data/pima_indians.csv', + delimiter=',', + skip_header=True) + + print("Dataset: credit data") + tree_pred(x=credit_data[:, :5], + tr=tree_grow(x=credit_data[:, 0:5], + y=credit_data[:, 5], + nmin=2, + minleaf=1, + nfeat=5), + training=credit_data[:, 5]) + + print('\nDataset: pima indians') + tree_pred(x=pima_indians[:, :8], + tr=tree_grow(x=pima_indians[:, :8], + y=pima_indians[:, 8], + nmin=20, + minleaf=5, + nfeat=pima_indians.shape[1] - 1), + training=pima_indians[:, 8]) + + # Time profiles: see what functions take what time! 
:) + + # print("prediction metrics single tree pima indians:") # cProfile.run("tree_pred(x=credit_data[:,:5], tr=tree_grow(x=credit_data[:,0:5], y=credit_data[:,5], nmin=2, minleaf=1, nfeat=5), dataset='credit score')", 'restats') # Time profile of pima indians data prediction with single tree - cProfile.run("tree_pred(x=pima_indians[:,:8], tr=tree_grow(x=pima_indians[:,:8], y=pima_indians[:,8], nmin=20, minleaf=5, nfeat=pima_indians.shape[1]-1), dataset='pima indians')", 'restats') - - - p = pstats.Stats('restats') - p.sort_stats(SortKey.TIME) - p.print_stats() + # print("prediction metrics single tree pima indians:") + # cProfile.run( + # "tree_pred(x=pima_indians[:,:8], tr=tree_grow(x=pima_indians[:,:8], y=pima_indians[:,8], nmin=20, minleaf=5, nfeat=pima_indians.shape[1]-1), training=pima_indians[:,8])", + # 'restats') + + # p = pstats.Stats('restats') + # p.sort_stats(SortKey.TIME) + # p.print_stats() |
