diff options
| author | Hunter <h.sterk@students.uu.nl> | 2020-10-23 13:14:32 +0200 |
|---|---|---|
| committer | Hunter <h.sterk@students.uu.nl> | 2020-10-23 13:14:32 +0200 |
| commit | 34d42d581ee5f5385c823385835182029b9eb2ff (patch) | |
| tree | ecbed0902f0466feb7bd461e7c43619cf1ddbc6f /Assignment 2/main.py | |
| parent | 6bf5c09a9bcb5f9d3f47d795b0539cb8c19fb089 (diff) | |
Training- en testdata toegevoegd, inclusief een functie om deze data in een dataframe te zetten
Diffstat (limited to 'Assignment 2/main.py')
| -rw-r--r-- | Assignment 2/main.py | 54 |
1 files changed, 54 insertions, 0 deletions
# diff --git a/Assignment 2/main.py b/Assignment 2/main.py new file mode 100644 index 0000000..6e06004 --- /dev/null +++ b/Assignment 2/main.py @@ -0,0 +1,54 @@
import fnmatch
import os
import pandas as pd
import regex as re
from nltk.corpus import stopwords


def fetch_reviews(testdata):
    """Load the review .txt files under 'op_spam_v1.4/' into a DataFrame.

    Files located in a directory whose path contains 'fold5' are treated as
    the test set; all other .txt files form the training set.

    Args:
        testdata: Truthy to load only the 'fold5' files, falsy to load
            every .txt file outside 'fold5'.

    Returns:
        A pandas DataFrame with one row per file and the columns
        'Review' (lower-cased file content), 'Label' (the text matched by
        the label pattern in the file path, e.g. 'truth' or 'deceptive')
        and 'Review without stopwords' (the lower-cased review with NLTK
        English stop words removed, tokens joined by single spaces).

    Raises:
        ValueError: If a file path contains no recognisable label.
    """
    root = 'op_spam_v1.4/'
    want_test = bool(testdata)

    # os.walk yields entries in arbitrary order; sorting keeps the row order
    # reproducible between runs and machines.
    file_paths = sorted(
        os.path.join(subdir, name)
        for subdir, _dirs, files in os.walk(root)
        if ('fold5' in subdir) == want_test
        for name in fnmatch.filter(files, '*.txt')
    )

    # Raw string so that \w reaches the regex engine instead of being
    # treated as an (invalid) string escape.
    label_pattern = re.compile(r'(trut|deceptiv)\w')

    # A set gives O(1) membership tests while filtering every word.
    stop = set(stopwords.words('english'))

    labels = []
    reviews = []
    reviews_without_stopwords = []
    for file_path in file_paths:
        match = label_pattern.search(file_path)
        if match is None:
            raise ValueError(
                'Cannot derive a label from file path: ' + file_path)
        labels.append(match.group())

        # Explicit encoding so the result does not depend on the platform
        # default; undecodable bytes are replaced rather than aborting.
        with open(file_path, encoding='utf-8', errors='replace') as f_input:
            text = f_input.read().lower()
        reviews.append(text)
        reviews_without_stopwords.append(
            ' '.join(word for word in text.split() if word not in stop))

    # Insertion order of the dict fixes the column order.
    return pd.DataFrame({
        'Review': reviews,
        'Label': labels,
        'Review without stopwords': reviews_without_stopwords,
    })


training_data = fetch_reviews(testdata=False)
print(training_data)
test_data = fetch_reviews(testdata=True)
print(test_data)
\ No newline at end of file |
