diff options
Diffstat (limited to 'Assignment 2/main.py')
| -rw-r--r-- | Assignment 2/main.py | 49 |
1 files changed, 0 insertions, 49 deletions
"""Load review texts and their truth/deceptive labels into DataFrames.

Originally ``Assignment 2/main.py``.
"""
import fnmatch
import os

import pandas as pd

try:
    import regex as re
except ImportError:
    # Only a trivial pattern is used below, so the stdlib engine is an
    # exact substitute when the third-party ``regex`` package is absent.
    import re

# Finds the class marker inside a review's file path. The trailing ``\w``
# consumes one extra character, so a path containing "truthful" yields
# "truth" and a path containing "deceptive" yields "deceptive".
_LABEL_PATTERN = re.compile(r'(trut|deceptiv)\w')


def fetch_reviews(testdata, path='op_spam_v1.4/', test_fold='fold5'):
    """Read every ``*.txt`` review below *path* into a DataFrame.

    Args:
        testdata: If truthy, keep only files whose directory path contains
            *test_fold*; otherwise keep only files whose directory path
            does not contain it.
        path: Root directory that is walked recursively.
        test_fold: Substring of the directory path that marks the
            held-out split.

    Returns:
        A DataFrame with one row per file and two columns: ``Review``
        (the file's text, lower-cased) and ``Label`` (the text matched by
        the label pattern in the file's path). Rows are ordered by file
        path. The frame is empty when no file qualifies.

    Raises:
        ValueError: If a selected file's path contains no label marker.
    """
    want_test = bool(testdata)

    # os.walk yields entries in an OS-dependent order; sorting makes the
    # row order reproducible across runs and machines.
    review_paths = sorted(
        os.path.join(subdir, name)
        for subdir, _dirs, files in os.walk(path)
        if (test_fold in subdir) == want_test
        for name in fnmatch.filter(files, '*.txt')
    )

    # Resolve every label before reading any file so that a mislabelled
    # path fails fast.
    labels = []
    for review_path in review_paths:
        found = _LABEL_PATTERN.search(review_path)
        if found is None:
            raise ValueError(
                'cannot derive a label from path: {!r}'.format(review_path)
            )
        labels.append(found.group())

    reviews = []
    for review_path in review_paths:
        # An explicit encoding keeps the result independent of the
        # platform locale; undecodable bytes are replaced rather than
        # aborting the whole load.
        with open(review_path, encoding='utf-8', errors='replace') as f_input:
            reviews.append(f_input.read().lower())

    return pd.DataFrame(
        {'Review': reviews, 'Label': labels},
        columns=['Review', 'Label'],
    )


# NOTE: these statements deliberately stay at module level (no __main__
# guard) so that `training_data` and `test_data` remain module attributes,
# exactly as in the original script.
training_data = fetch_reviews(testdata=False)
print(training_data)
test_data = fetch_reviews(testdata=True)
print(test_data)
\ No newline at end of file |
