# Provenance: contents of gettingStarted.py as removed by commit
# da8ca975fb9d11d3801fef66344736e675734c42 ("Deleting some misc files",
# Mike Vink, Wed, 23 Sep 2020 08:45:49 +0200). The patch framing and the
# per-line "-" diff markers have been dropped so the module is valid Python.
"""Getting-started exercises: binary impurity, best split, one-level tree.

The helpers work on a single numeric feature ``x`` and binary (0/1) class
labels ``y``. Running the file as a script reproduces the original demo
output; importing it has no side effects.
"""
import math
import random
from copy import deepcopy

import numpy as np

# Location of the demo data set read by main(). The code below uses column 3
# as the numeric feature and column 5 as the class label.
CREDIT_DATA_PATH = '/Users/mikevink/Documents/python/2020_data_mining_assignments/credit_score.txt'

# --- numpy exploration notes (kept from the original file) -------------------
# `credit_data` below refers to the array loaded in main().
#
#print(credit_data)
#print(credit_data[0])
#print(credit_data[:,3])
#print(credit_data[4,0])
#print(np.sort(np.unique(credit_data[:,3]))) #Give the distinct values of income, sorted from low to high
#print(np.sum(credit_data[:,5]))
#print(credit_data.sum(axis=0)) #Add the entries of each column of credit_data
#print(credit_data.sum(axis=1)) #Add the entries of each row
#print(credit_data[credit_data[:,0] > 27]) # Select all rows where the first column is bigger than 27
#
#x = np.array([2, 5, 10])
#print(x)
#print(np.arange(0, 10))
#
#print(np.arange(0, 10)[credit_data[:,0] > 27]) #Select the *row numbers* of the rows where the first column of credit_data is bigger than 27
#
#index = np.random.choice(np.arange(0, 10), size=5, replace=False) #Draw a random sample of size 5 from the numbers 0 through 9 (without replacement)
#print(index)
#train = credit_data[index,]
#print(train)
#test = np.delete(credit_data, index, axis=0) #Select all rows with row number not in "index"
#print(test)
#
#print(random.choice(train))


### Practice exercise 1 ###
def impurity(vector):  # vector = list of 0s and 1s
    """Return p0 * p1, the product of the two class proportions.

    Args:
        vector: sequence or array of 0/1 class labels.

    Returns:
        0 for a pure node, 0.25 for a perfectly mixed one. An empty vector
        is treated as pure and yields 0.0 instead of dividing by zero.
    """
    num_of_class_labels = len(vector)
    if num_of_class_labels == 0:
        return 0.0
    num_of_class_1 = np.sum(vector)
    num_of_class_0 = num_of_class_labels - num_of_class_1
    return (num_of_class_0 / num_of_class_labels) * (num_of_class_1 / num_of_class_labels)


### Practice exercise 2 ###
def bestsplit(x, y):  # x = numeric values; y = class labels
    """Return the threshold on ``x`` that minimises the impurity after splitting.

    Candidate thresholds are the midpoints between consecutive distinct
    values of ``x``. Each candidate is scored by the impurity of the two
    children, weighted by the share of observations each child receives, so
    that splitting off a tiny pure fragment is not favoured over a balanced
    split. The first candidate with the lowest score wins.

    Args:
        x: numeric feature values (array-like).
        y: 0/1 class labels, same length as ``x`` (array-like).

    Returns:
        The best threshold, or ``None`` when ``x`` has fewer than two
        distinct values and therefore cannot be split.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    x_sorted = np.unique(x)  # np.unique already returns sorted values
    if len(x_sorted) < 2:
        return None
    split_points = (x_sorted[:-1] + x_sorted[1:]) / 2

    num_of_labels = len(y)
    best_split = None
    best_impurity_after_split = math.inf
    for split in split_points:
        goes_left = x <= split
        left = y[goes_left]
        right = y[~goes_left]
        impurity_after_split = (
            len(left) * impurity(left) + len(right) * impurity(right)
        ) / num_of_labels
        if impurity_after_split < best_impurity_after_split:
            best_split = split
            best_impurity_after_split = impurity_after_split

    return best_split


class Node:
    """Internal tree node: values <= split_value go left, the rest go right."""

    def __init__(self):
        self.left = None
        self.right = None
        self.split_value = None


class Leaf:
    """Terminal tree node carrying the class it predicts."""

    def __init__(self, predicted_class: int):
        self.predicted_class = predicted_class


def _majority_class(labels, tie_break):
    """Return the more frequent of the 0/1 labels, or tie_break on a draw."""
    num_of_class_1 = int(np.count_nonzero(labels))
    num_of_class_0 = len(labels) - num_of_class_1
    if num_of_class_1 == num_of_class_0:
        return tie_break
    return int(num_of_class_1 > num_of_class_0)


def tree_grow(x, y):  # x = numeric values; y = class labels
    """Grow a one-level tree (a single split) on ``x`` for labels ``y``.

    Each leaf predicts the majority class of the training labels that fall
    on its side of the split. On a tie the left leaf predicts 0 and the
    right leaf predicts 1.

    Args:
        x: numeric feature values (array-like).
        y: 0/1 class labels, same length as ``x`` (array-like).

    Returns:
        A ``Node`` with two ``Leaf`` children, or a single ``Leaf`` holding
        the overall majority class when ``x`` cannot be split.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    split_value = bestsplit(x, y)
    if split_value is None:
        return Leaf(_majority_class(y, 0))

    goes_left = x <= split_value
    root = Node()
    root.split_value = split_value
    root.left = Leaf(_majority_class(y[goes_left], 0))
    root.right = Leaf(_majority_class(y[~goes_left], 1))
    return root


def tree_pred(x, tr):
    """Return the list of classes predicted by tree ``tr`` for each value in ``x``."""
    return [single_value_pred(value, tr) for value in x]


def single_value_pred(value, current_tree):
    """Walk ``current_tree`` from its root to a leaf and return that leaf's class."""
    node = current_tree
    while not isinstance(node, Leaf):
        node = node.left if value <= node.split_value else node.right
    return node.predicted_class


def main():
    """Run the original demo: load the credit data and print the exercise results."""
    credit_data = np.genfromtxt(CREDIT_DATA_PATH, delimiter=',', skip_header=True)

    array = np.array([1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1])
    print(impurity(array))

    print(bestsplit(credit_data[:, 3], credit_data[:, 5]))

    tree = tree_grow(credit_data[:, 3], credit_data[:, 5])
    print(tree_pred([32, 38, 3, 40], tree))


if __name__ == "__main__":
    main()