# gettingStarted.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100

import numpy as np
import random
import math
from copy import deepcopy

# Load the comma-separated credit data set into a NumPy array; skip_header=True
# is treated as 1, so the first line of the file is skipped as a header.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file exists.
credit_data = np.genfromtxt('/Users/mikevink/Documents/python/2020_data_mining_assignments/credit_score.txt', delimiter=',', skip_header=True)

#print(credit_data)
#print(credit_data[0])
#print(credit_data[:,3])
#print(credit_data[4,0])
#print(np.sort(np.unique(credit_data[:,3]))) #Give the distinct values of income, sorted from low to high
#print(np.sum(credit_data[:,5]))
#print(credit_data.sum(axis=0)) #Add the entries of each column of credit_data
#print(credit_data.sum(axis=1)) #Add the entries of each row
#print(credit_data[credit_data[:,0] > 27]) # Select all rows where the first column is bigger than 27
#
#x = np.array([2, 5, 10])
#print(x)
#print(np.arange(0, 10))
#
#print(np.arange(0, 10)[credit_data[:,0] > 27]) #Select the *row numbers* of the rows where the first column of credit_data is bigger than 27
#
#index = np.random.choice(np.arange(0, 10), size=5, replace=False) #Draw a random sample of size 5 from the numbers 0 through 9 (without replacement)
#print(index)
#train = credit_data[index,]
#print(train)
#test = np.delete(credit_data, index, axis=0) #Select all rows with row number not in "index"
#print(test)
#
#print(random.choice(train))


### Practice exercise 1 ###
def impurity(vector): # vector = list of 0s and 1s
    """Return the impurity p0 * p1 of a collection of binary class labels.

    p0 and p1 are the fractions of 0-labels and 1-labels in ``vector``.
    The result is 0.0 for a pure collection and peaks at 0.25 for a 50/50 mix.

    Args:
        vector: Sequence (list or NumPy array) containing only 0s and 1s.

    Returns:
        The product of the two class proportions. An empty ``vector`` is
        treated as pure and yields 0.0 instead of dividing by zero.
    """
    num_of_class_labels = len(vector)
    if num_of_class_labels == 0:
        # No observations: nothing can be mixed, and 0/0 must be avoided.
        return 0.0
    # Labels are 0/1, so their sum is the number of class-1 observations.
    num_of_class_1 = sum(vector)
    num_of_class_0 = num_of_class_labels - num_of_class_1
    return (num_of_class_0 / num_of_class_labels) * (num_of_class_1 / num_of_class_labels)

# Quick check of impurity() on 11 labels (7 ones, 4 zeros); the printed value
# should be (4/11) * (7/11), roughly 0.2314.
array=np.array([1,0,1,1,1,0,0,1,1,0,1])
print(impurity(array))


### Practice exercise 2 ###
def bestsplit(x, y): # x = numeric values; y = class labels
    """Find the threshold on ``x`` that gives the purest binary partition of ``y``.

    Candidate thresholds are the midpoints between consecutive distinct values
    of ``x``. Each candidate ``c`` sends observations with ``x <= c`` to the
    left child and the rest to the right child. The quality of a candidate is
    the size-weighted average of the two child impurities, so a tiny pure
    child cannot outweigh a large mixed one.

    Args:
        x: One-dimensional array-like of numeric feature values.
        y: One-dimensional array-like of 0/1 class labels, aligned with ``x``.

    Returns:
        The candidate threshold with the lowest weighted impurity. On ties the
        smallest such threshold wins.

    Raises:
        ValueError: If ``x`` has fewer than two distinct values, in which case
            no split exists.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # np.unique already returns its result sorted in ascending order.
    x_sorted = np.unique(x)
    if len(x_sorted) < 2:
        raise ValueError("bestsplit requires at least two distinct values in x")
    split_points = (x_sorted[:-1] + x_sorted[1:]) / 2

    num_of_observations = len(y)
    best_split = None
    best_impurity_after_split = math.inf
    for split in split_points:
        left_labels = y[x <= split]
        right_labels = y[x > split]
        # Weight each child's impurity by the share of observations it holds.
        impurity_after_split = (
            len(left_labels) * impurity(left_labels)
            + len(right_labels) * impurity(right_labels)
        ) / num_of_observations
        # Strict '<' keeps the first (smallest) threshold among equals.
        if impurity_after_split < best_impurity_after_split:
            best_split = split
            best_impurity_after_split = impurity_after_split

    return best_split

# Print the best threshold on column 3 of the credit data, using column 5 as
# the class labels.
print(bestsplit(credit_data[:,3], credit_data[:,5]))


class Node:
    """Internal node of a binary classification tree.

    Values less than or equal to ``split_value`` belong to the ``left``
    subtree, all others to the ``right`` subtree.
    """

    def __init__(self):
        # The constructor must be spelled with double underscores; with the
        # single-underscore spelling Python never calls it and the attributes
        # below do not exist on a fresh instance.
        self.left = None          # subtree (Node or Leaf) for value <= split_value
        self.right = None         # subtree (Node or Leaf) for value > split_value
        self.split_value = None   # numeric threshold tested at this node
        
class Leaf:
    """Terminal tree node that always predicts one fixed class."""

    def __init__(self, predicted_class: int) -> None:
        # Class label returned for every value that ends up in this leaf.
        self.predicted_class = predicted_class


def _majority_class(labels):
    """Return the most frequent label in ``labels`` (smallest label on ties)."""
    values, counts = np.unique(labels, return_counts=True)
    # np.unique sorts values, and argmax returns the first maximum, so ties
    # resolve to the smallest label.
    return int(values[np.argmax(counts)])


def tree_grow(x, y): # x = numeric values; y = class labels
    """Grow a one-split classification tree (a decision stump) on one feature.

    The threshold comes from ``bestsplit``. Each leaf predicts the majority
    class of the training labels that fall on its side of the threshold,
    rather than a fixed 0 on the left and 1 on the right, so the stump is also
    correct when larger ``x`` values go with class 0.

    Args:
        x: One-dimensional array-like of numeric feature values.
        y: One-dimensional array-like of integer class labels, aligned with ``x``.

    Returns:
        The root ``Node`` with ``split_value`` set and a ``Leaf`` on each side.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    root = Node()
    root.split_value = bestsplit(x, y)

    # Same routing rule as prediction: value <= split goes left, the rest right.
    goes_left = x <= root.split_value
    root.left = Leaf(_majority_class(y[goes_left]))
    root.right = Leaf(_majority_class(y[~goes_left]))
    return root
    
def tree_pred(x, tr):
    """Predict a class for every value in ``x`` using the tree rooted at ``tr``.

    Args:
        x: Iterable of numeric values to classify.
        tr: Root of the tree (a Node or a Leaf).

    Returns:
        A list with one predicted class per input value, in input order.
    """
    return [single_value_pred(sample, tr) for sample in x]

def single_value_pred(value, current_tree):
    """Route one value down the tree and return the class of the leaf it reaches.

    Args:
        value: Numeric value to classify.
        current_tree: Node or Leaf at which the descent starts.

    Returns:
        The ``predicted_class`` of the Leaf that ``value`` ends up in.
    """
    node = current_tree
    # Walk down until a leaf is reached: left when value <= threshold,
    # right otherwise.
    while not isinstance(node, Leaf):
        node = node.left if value <= node.split_value else node.right
    return node.predicted_class

# Grow a tree on column 3 of the credit data with column 5 as class labels,
# then print the predicted classes for four sample values.
tree = tree_grow(credit_data[:,3], credit_data[:,5])
print(tree_pred([32, 38, 3, 40], tree))