summaryrefslogtreecommitdiff
path: root/Assignment 2/main.py
diff options
context:
space:
mode:
authormike <mike1994vink@gmail.com>2021-03-17 17:36:45 +0100
committermike <mike1994vink@gmail.com>2021-03-17 17:36:45 +0100
commit6a37e0c93349bc1ade29a0c3858295d897310856 (patch)
treee41fa86796db496904b428d9dcbac51172f2c49e /Assignment 2/main.py
parent5ab1872bc99282722726c65142a28b87aacaca5c (diff)
remove(not relevant)
Diffstat (limited to 'Assignment 2/main.py')
-rw-r--r--Assignment 2/main.py49
1 files changed, 0 insertions, 49 deletions
diff --git a/Assignment 2/main.py b/Assignment 2/main.py
deleted file mode 100644
index 8a90d02..0000000
--- a/Assignment 2/main.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import fnmatch
-import os
-import pandas as pd
-import regex as re
-
-
def fetch_reviews(testdata, path='op_spam_v1.4/'):
    """Load review texts and their labels into a single DataFrame.

    Every ``*.txt`` file below ``path`` is considered.  Files in a directory
    whose path (relative to ``path``) contains ``'fold5'`` make up the test
    split; all remaining files make up the training split.

    Args:
        testdata: If truthy, return the ``fold5`` files only; otherwise
            return every file that is not in a ``fold5`` directory.
        path: Root directory to walk.  Defaults to the location that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        A ``pandas.DataFrame`` with one row per file and two columns:
        ``'Review'`` (the lower-cased file contents) and ``'Label'`` (the
        text matched by the label pattern in the file's relative path,
        i.e. ``'truth'`` or ``'deceptive'``).

    Raises:
        ValueError: If a selected file's relative path contains neither
            label marker.
    """
    # Raw string: '\w' inside a normal string literal is an invalid escape.
    label_pattern = re.compile(r'(trut|deceptiv)\w')
    want_test = bool(testdata)

    # Sorting makes the row order reproducible; os.walk yields entries in
    # whatever order the file system happens to list them.
    file_paths = sorted(
        os.path.join(subdir, name)
        for subdir, _dirs, files in os.walk(path)
        # Check the path relative to the root so that the name of the root
        # directory itself can never influence the split.
        if ('fold5' in os.path.relpath(subdir, path)) == want_test
        for name in fnmatch.filter(files, '*.txt')
    )

    reviews = []
    labels = []
    for file_path in file_paths:
        # The label is encoded in a directory name below the root.
        match = label_pattern.search(os.path.relpath(file_path, path))
        if match is None:
            raise ValueError(
                f"cannot determine label from path: {file_path!r}"
            )
        labels.append(match.group())

        # Explicit encoding keeps the result independent of the locale.
        with open(file_path, encoding='utf-8') as f_input:
            reviews.append(f_input.read().lower())

    # Both lists are filled in lockstep, so a single constructor call gives
    # the same frame as building two frames and merging them on the index.
    return pd.DataFrame(
        {'Review': reviews, 'Label': labels},
        columns=['Review', 'Label'],
    )
-
-
# Training split: every review that is not stored in a fold5 directory.
training_data = fetch_reviews(False)
print(training_data)

# Test split: only the reviews stored in a fold5 directory.
test_data = fetch_reviews(True)
print(test_data)