"""Getting-started exercises: impurity, best split and a one-split decision tree.

Recovered from a whitespace-collapsed git patch that created
``gettingStarted.py`` (commit 9277a0790d4925f1fd50c789cc83a27096f543e7,
Mike Vink, Wed, 16 Sep 2020 02:35:13 +0200, subject "Major update: obj
oriented tree", served by cgit v1.2.3).

Running the file as a script loads the credit-score data and prints the three
demo results; importing it only defines the functions and classes.
"""

import math
import random
from copy import deepcopy

import numpy as np

# Location of the credit-score CSV used by the demo at the bottom of the file.
CREDIT_DATA_PATH = '/Users/mikevink/Documents/python/2020_data_mining_assignments/credit_score.txt'

# Scratch exploration kept from the original file. It refers to the
# ``credit_data`` array that the demo at the bottom of the file loads.
#print(credit_data)
#print(credit_data[0])
#print(credit_data[:,3])
#print(credit_data[4,0])
#print(np.sort(np.unique(credit_data[:,3]))) #Give the distinct values of income, sorted from low to high
#print(np.sum(credit_data[:,5]))
#print(credit_data.sum(axis=0)) #Add the entries of each column of credit_data
#print(credit_data.sum(axis=1)) #Add the entries of each row
#print(credit_data[credit_data[:,0] > 27]) # Select all rows where the first column is bigger than 27
#
#x = np.array([2, 5, 10])
#print(x)
#print(np.arange(0, 10))
#
#print(np.arange(0, 10)[credit_data[:,0] > 27]) #Select the *row numbers* of the rows where the first column of credit_data is bigger than 27
#
#index = np.random.choice(np.arange(0, 10), size=5, replace=False) #Draw a random sample of size 5 from the numbers 1 through 10 (without replacement)
#print(index)
#train = credit_data[index,]
#print(train)
#test = np.delete(credit_data, index, axis=0) #Select all rows with row number not in "index"
#print(test)
#
#print(random.choice(train))


### Practice exercise 1 ###
def impurity(vector):  # vector = list of 0s and 1s
    """Return the product of the class-0 and class-1 fractions in *vector*.

    Args:
        vector: Sized iterable of 0/1 class labels (list or NumPy array).

    Returns:
        ``(n0 / n) * (n1 / n)`` where ``n1`` is the sum of the labels and
        ``n0 = n - n1``. An empty *vector* yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    num_of_class_labels = len(vector)
    if num_of_class_labels == 0:
        # No labels means nothing is mixed; avoid dividing by zero.
        return 0.0
    num_of_class_1 = sum(vector)
    num_of_class_0 = num_of_class_labels - num_of_class_1
    return (num_of_class_0 / num_of_class_labels) * (num_of_class_1 / num_of_class_labels)


### Practice exercise 2 ###
def bestsplit(x, y):  # x = numeric values; y = class labels
    """Return the threshold on *x* with the lowest summed child impurity.

    Candidate thresholds are the midpoints between consecutive distinct values
    of *x*. For each candidate the labels are divided into ``x <= split`` and
    ``x > split`` and the two ``impurity`` values are added; the first
    candidate with the smallest sum wins.

    Args:
        x: Numeric feature values (list or NumPy array).
        y: 0/1 class labels, same length as *x*.

    Returns:
        The chosen split value.

    Raises:
        ValueError: If *x* has fewer than two distinct values, or no candidate
            produced a comparable impurity (for example when the labels
            contain NaN).
    """
    # Accept plain lists as well as arrays: the boolean masks below need
    # NumPy semantics on both operands.
    x = np.asarray(x)
    y = np.asarray(y)

    x_sorted = np.unique(x)  # np.unique already returns its result sorted
    if len(x_sorted) < 2:
        raise ValueError("bestsplit needs at least two distinct values in x")
    split_points = (x_sorted[:-1] + x_sorted[1:]) / 2

    # NOTE(review): the two child impurities are added without weighting by
    # child size; split-quality measures are commonly weighted by the share of
    # rows in each child. Left as in the original -- confirm what is intended.
    best_split = None
    best_impurity_after_split = math.inf
    for split in split_points:
        impurity_after_split = impurity(y[x <= split]) + impurity(y[x > split])
        if impurity_after_split < best_impurity_after_split:
            best_split = split
            best_impurity_after_split = impurity_after_split

    if best_split is None:
        raise ValueError("bestsplit found no candidate with a comparable impurity")
    return best_split


class Node:
    """Internal tree node: values ``<= split_value`` go left, the rest right."""

    def __init__(self):
        # Children are Node or Leaf instances; all three fields are filled in
        # by the code that builds the tree.
        self.left = None
        self.right = None
        self.split_value = None


class Leaf:
    """Terminal tree node carrying the class it predicts."""

    def __init__(self, predicted_class: int):
        self.predicted_class = predicted_class


def tree_grow(x, y):  # x = numeric values; y = class labels
    """Build a tree with a single split on *x* chosen by ``bestsplit``.

    The left child is always ``Leaf(0)`` and the right child ``Leaf(1)``; the
    labels *y* only influence where the split is placed.

    Args:
        x: Numeric feature values.
        y: 0/1 class labels, same length as *x*.

    Returns:
        The root ``Node`` of the tree.
    """
    root = Node()
    root.split_value = bestsplit(x, y)
    root.left = Leaf(0)
    root.right = Leaf(1)
    return root


def tree_pred(x, tr):
    """Return the list of predicted classes for every value in *x* using tree *tr*."""
    return [single_value_pred(value, tr) for value in x]


def single_value_pred(value, current_tree):
    """Return the class predicted for one *value* by *current_tree*.

    Descends from *current_tree*, going left when ``value <= split_value`` and
    right otherwise, until a ``Leaf`` is reached.
    """
    node = current_tree
    while not isinstance(node, Leaf):
        node = node.left if value <= node.split_value else node.right
    return node.predicted_class


if __name__ == "__main__":
    # Demo run, guarded so that importing the module neither needs the data
    # file nor prints anything.
    credit_data = np.genfromtxt(CREDIT_DATA_PATH, delimiter=',', skip_header=True)

    array = np.array([1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1])
    print(impurity(array))

    print(bestsplit(credit_data[:, 3], credit_data[:, 5]))

    tree = tree_grow(credit_data[:, 3], credit_data[:, 5])
    print(tree_pred([32, 38, 3, 40], tree))