summaryrefslogtreecommitdiff
path: root/Assignment 2/main.py
diff options
context:
space:
mode:
authorHunter <h.sterk@students.uu.nl>2020-10-23 13:14:32 +0200
committerHunter <h.sterk@students.uu.nl>2020-10-23 13:14:32 +0200
commit34d42d581ee5f5385c823385835182029b9eb2ff (patch)
treeecbed0902f0466feb7bd461e7c43619cf1ddbc6f /Assignment 2/main.py
parent6bf5c09a9bcb5f9d3f47d795b0539cb8c19fb089 (diff)
Training- en testdata toegevoegd, inclusief een functie om deze data in een dataframe te zetten
Diffstat (limited to 'Assignment 2/main.py')
-rw-r--r--Assignment 2/main.py54
1 files changed, 54 insertions, 0 deletions
diff --git a/Assignment 2/main.py b/Assignment 2/main.py
new file mode 100644
index 0000000..6e06004
--- /dev/null
+++ b/Assignment 2/main.py
@@ -0,0 +1,54 @@
+import fnmatch
+import os
+import pandas as pd
+import regex as re
+from nltk.corpus import stopwords
+
+
def fetch_reviews(testdata, data_dir='op_spam_v1.4/', stop_words=None):
    """Load one split of the review corpus into a DataFrame.

    Every ``*.txt`` file below ``data_dir`` is a review. A file belongs to
    the test split when the directory it lives in (relative to ``data_dir``)
    contains ``'fold5'``; all other files form the training split. The label
    of a review is the first ``truth``/``deceptive``-style match found in the
    file's path relative to ``data_dir``.

    Args:
        testdata: Truthy to load the ``fold5`` files, falsy to load every
            other file.
        data_dir: Root directory that is walked recursively.
        stop_words: Iterable of words to drop from the lowercased reviews.
            When ``None``, ``stopwords.words('english')`` from nltk is used.

    Returns:
        A DataFrame with one row per file and the columns ``'Review'``
        (lowercased text), ``'Label'`` and ``'Review without stopwords'``.
        Rows are ordered by sorted directory and file name.

    Raises:
        ValueError: If a review file's relative path contains no label.
    """
    label_pattern = re.compile(r'(trut|deceptiv)\w')
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set makes the per-token membership test O(1) instead of a list scan.
    stop = set(stop_words)
    want_test = bool(testdata)

    labels = []
    reviews = []
    for subdir, dirs, files in os.walk(data_dir):
        # Sorting in place makes os.walk descend in a deterministic order
        # instead of whatever order the filesystem happens to return.
        dirs.sort()
        # Match against the path below data_dir only, so the name of the
        # root directory can never influence the split or the label.
        rel_dir = os.path.relpath(subdir, data_dir)
        if ('fold5' in rel_dir) != want_test:
            continue
        for name in sorted(fnmatch.filter(files, '*.txt')):
            rel_path = os.path.join(rel_dir, name)
            match = label_pattern.search(rel_path)
            if match is None:
                raise ValueError(
                    'cannot derive a label from path: ' + rel_path)
            labels.append(match.group())
            # Explicit encoding keeps the result independent of the platform
            # locale; undecodable bytes are replaced rather than aborting.
            with open(os.path.join(subdir, name), encoding='utf-8',
                      errors='replace') as f_input:
                reviews.append(f_input.read().lower())

    without_stopwords = [
        ' '.join(word for word in text.split() if word not in stop)
        for text in reviews
    ]

    return pd.DataFrame({
        'Review': reviews,
        'Label': labels,
        'Review without stopwords': without_stopwords,
    })
+
+
# Training split: every review whose directory path does not contain 'fold5'.
training_data = fetch_reviews(testdata=False)
print(training_data)

# Test split: only the reviews stored under a 'fold5' directory.
test_data = fetch_reviews(testdata=True)
print(test_data)