1 files changed, 54 insertions, 0 deletions
diff --git a/Assignment 2/main.py b/Assignment 2/main.py
new file mode 100644
index 0000000..6e06004
--- /dev/null
+++ b/Assignment 2/main.py
@@ -0,0 +1,54 @@
+import fnmatch
+import os
+import pandas as pd
+import regex as re
+from nltk.corpus import stopwords
+
+
+def fetch_reviews(testdata):
+    path = 'op_spam_v1.4/'
+    label = []
+
+    # Fetch all the file paths of the .txt files and append in a list
+    if testdata:
+        file_paths = [os.path.join(subdir,f)
+            for subdir, dirs, files in os.walk(path)
+                for f in fnmatch.filter(files, '*.txt') if 'fold5' in subdir]
+    else:
+        file_paths = [os.path.join(subdir, f)
+                      for subdir, dirs, files in os.walk(path)
+                      for f in fnmatch.filter(files, '*.txt') if 'fold5' not in subdir]
+
+    # Fetch all the labels and append in a list
+    for path in file_paths:
+        c = re.search('(trut|deceptiv)\w',path)
+        label.append(c.group())
+
+    # Create a dataframe of the label list
+    labels = pd.DataFrame(label, columns=['Label'])
+
+    # Fetch all the reviews and append in a list
+    reviews = []
+    for path in file_paths:
+        with open(path) as f_input:
+            reviews.append(f_input.read())
+
+    # Create a dataframe of the review list
+    reviews = pd.DataFrame(reviews, columns=['Review'])
+
+    # Merge the review dataframe and label dataframe
+    data = pd.merge(reviews, labels, right_index=True, left_index=True)
+    # convert reviews to lowercase
+    data['Review'] = data['Review'].map(lambda x: x.lower())
+    # remove stopwords
+    stop = stopwords.words('english')
+    data['Review without stopwords'] = data['Review'].apply(lambda x: ' '.join(
+        [word for word in x.split() if word not in (stop)]))
+
+    return data
+
+
+training_data = fetch_reviews(testdata=False)
+print(training_data)
+test_data = fetch_reviews(testdata=True)
+print(test_data)
+\ No newline at end of file