-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathMain.py
More file actions
93 lines (59 loc) · 2.59 KB
/
Main.py
File metadata and controls
93 lines (59 loc) · 2.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import spacy, csv, io
import pandas as pd
import json
import string
from io import StringIO
pd.set_option('display.max_rows',None)
pd.set_option('display.max_colwidth', 5000)
import stanza
# Initialize spacy 'en' model, keeping only tagger component
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
all_stopwords = nlp.Defaults.stop_words
col_list = ["reviewText"]
df_og = pd.read_csv("Export1.csv", usecols=col_list)
# Input and output CSV files
IN_FILE_NAME = "Export1.csv"
OUT_FILE_NAME = "Stop.csv"
# print("Before Edits")
# print(df_og.to_string(columns=col_list, index=False,header=False,max_rows=5))
def stopwordRemover(column):
"""Takes a string as an input and outputs the string without stopwords"""
# Parse the sentence using the loaded 'en' model object `nlp`
# """Takes a string as an input and outputs the string without stopwords"""
# Join words that aren't stopwords
new_words = []
for row in column:
new_words.append(" ".join([word for word in (str(row).split()) if word.lower() not in all_stopwords]))
return new_words
###################
# Removing Punctuation
exclude = set(string.punctuation)
def removePunctuation(sentence):
for ele in sentence:
if (ele in exclude):
sentence = sentence.replace(ele, " ")
return sentence
# reviews_cleaned = df_og.copy()
test = "The game itself worked!! great but the story line videos would never play, the sound was fine but the picture!! would freeze and go black every time."
test = removePunctuation(test)
print(test)
#print("Before: ", df_og['reviewText'].head(10))
df_og['reviewText'] = df_og['reviewText'].str.replace(r'\d+', '') #removing digits
df_og['reviewText'] = stopwordRemover(df_og["reviewText"].values)
df_og['reviewText'] = df_og['reviewText'].str.replace('[^\w\s]','')
#df_og['reviewText'] = removePunctuation(df_og["reviewText"].values)
print(df_og.head(10))
df_og.to_csv("Cleanedre.csv")
#df_og['reviewText'] = stopwordRemover(df_og["reviewText"].head(10))
#print(df_og['reviewText'].to_numpy(dtype=str))
#print("Stopwords")
#print(stopwordRemover(df_og.to_string(columns=col_list, index=False,header=False,max_rows=10)))
#print("After:", df_og['reviewText'].head(10))
# print(testRemove(test))
# print(stopwordRemover(test))
# #Punctuation remove
# print("Remove Punctuation")
# testRemove(df_og.to_string(columns=col_list, index=False,header=False,max_rows=5))
# #Remove Digits
# stopwordRemover(df_og.to_string(columns=col_list, index=False,header=False,max_rows=5))
# print(df_og.to_string(columns=col_list, index=False,header=False,max_rows=5))