
Commit e898223

Added erickha
1 parent 5d978ba commit e898223

File tree

6 files changed

+341 -0 lines changed

.DS_Store

0 Bytes
Binary file not shown.

erickha/README.md

+49
@@ -0,0 +1,49 @@
# CS 41 Final Project: Tweet Sentiment Analysis

#### Contributors: Erick Hernandez and Ruben Sanchez

## Functionality

This project implements a tweet sentiment aggregator tool that can be run as a Python executable. It takes input specifying the hashtag to search for and the keywords to separate tweets by under that hashtag.

From there, the program aggregates the tweets and performs sentiment analysis on them, displaying a graph like the one shown below.

![Sample photo](./sample.png)

This graph was created by searching the hashtag "COVID19" with the keywords China, US, Italy, and Trump.
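
Under the hood, the per-tweet score is TextBlob's polarity measure (`textblob` is pinned in `requirements.txt`); a minimal sketch of the kind of call `sentiment.py` makes, with a made-up tweet:
```
from textblob import TextBlob

# polarity is a float in [-1.0, 1.0]: below 0 reads negative, above 0 positive
tweet = "Relief efforts in Italy are going great"  # hypothetical example text
print(TextBlob(tweet).sentiment.polarity)
```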
## Usage

1) The first step is creating/using the provided config file to configure your search results.

Edit `config.yaml` to look something like:
```
hashtag: COVID19
keywords:
- China
- US
- Italy
- Trump
```
or
```
hashtag: DemDebate
keywords:
- Clinton
- Bernie
- Biden
- Bloomberg
```
2) You can run the script by providing the name of the config file with the `-f` flag, e.g.
```
12:04 PM (pyvenv) $ python sentiment.py -f config.yaml
```
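
On a hashtag that has not been cached yet, the run prints download progress; reconstructed from the script's own print statements (exact counts will vary), it looks roughly like:
```
12:04 PM (pyvenv) $ python sentiment.py -f config.yaml
Tweets not cached. Will download now
Downloading max 10000 tweets
Downloaded 100 tweets
Downloaded 200 tweets
...
Downloaded 10000 tweets, Saved to .tweets.txt
Displaying analysis
```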
## Bugs

We have not found any bugs so far. Our biggest issue is that we are currently limited to a small number of tweets, because the Twitter API only lets us query 18k tweets per 15 minutes. Downloading is very slow, and we are not able to break the work into shorter time intervals.
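
If the rate limit becomes a blocker, tweepy (3.8.0 in `requirements.txt`) can sleep through rate-limit windows on its own; a possible mitigation sketch, not wired into this project:
```
import tweepy

auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")  # placeholder keys
api = tweepy.API(auth, wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True)  # sleeps until the window resets

# Cursor pages through results and respects the rate limit automatically
for tweet in tweepy.Cursor(api.search, q="#COVID19", count=100).items(1000):
    print(tweet.text)
```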
## Contact

Please contact:

- rubensan [at] stanford [dot] edu
- erickha [at] stanford [dot] edu

erickha/config.yaml

+8
@@ -0,0 +1,8 @@
# Gathers tweets from the past seven days (limit: 10000 tweets)
hashtag: DemDebate
keywords:
- Biden
- Bernie
- Trump
- Bloomberg
- Booker
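
For reference, `sentiment.py` loads this file into a plain dict with PyYAML (see `main()`); a minimal sketch:
```
import yaml

# roughly what main() does with the -f argument
with open("config.yaml") as f:
    argDict = yaml.full_load(f)

print(argDict["hashtag"])   # 'DemDebate'
print(argDict["keywords"])  # ['Biden', 'Bernie', 'Trump', 'Bloomberg', 'Booker']
```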

erickha/requirements.txt

+97
@@ -0,0 +1,97 @@
alabaster==0.7.12
appnope==0.1.0
attrs==19.3.0
autopep8==1.4.4
Babel==2.8.0
backcall==0.1.0
bleach==3.1.0
certifi==2019.11.28
chardet==3.0.4
Click==7.0
cycler==0.10.0
decorator==4.4.1
defusedxml==0.6.0
docutils==0.15.2
entrypoints==0.3
Flask==1.1.1
get==2019.4.13
idna==2.8
imagesize==1.2.0
ipykernel==5.1.3
ipyparallel==6.2.4
ipython==7.11.1
ipython-genutils==0.2.0
ipywidgets==7.5.1
itsdangerous==1.1.0
jedi==0.15.2
Jinja2==2.10.3
joblib==0.14.1
json5==0.8.5
jsonpickle==1.3
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==5.3.4
jupyter-console==6.0.0
jupyter-core==4.6.1
jupyterlab==1.2.4
jupyterlab-server==1.0.6
kiwisolver==1.1.0
MarkupSafe==1.1.1
matplotlib==3.1.2
mistune==0.8.4
nbconvert==5.6.1
nbformat==5.0.3
nltk==3.4.5
nose==1.3.7
notebook==6.0.2
numpy==1.18.1
oauthlib==3.1.0
packaging==20.0
pandocfilters==1.4.2
parso==0.5.2
pexpect==4.7.0
pickleshare==0.7.5
Pillow==7.0.0
post==2019.4.13
prometheus-client==0.7.1
prompt-toolkit==2.0.10
ptyprocess==0.6.0
public==2019.4.13
pycodestyle==2.5.0
Pygments==2.5.2
pyparsing==2.4.6
pyrsistent==0.15.7
PySocks==1.7.1
python-dateutil==2.8.1
pytz==2019.3
PyYAML==5.3
pyzmq==18.1.1
qtconsole==4.6.0
query-string==2019.4.13
request==2019.4.13
requests==2.22.0
requests-oauthlib==1.3.0
scikit-learn==0.22.1
scipy==1.4.1
Send2Trash==1.5.0
six==1.13.0
snowballstemmer==2.0.0
Sphinx==2.3.1
sphinxcontrib-applehelp==1.0.1
sphinxcontrib-devhelp==1.0.1
sphinxcontrib-htmlhelp==1.0.2
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.2
sphinxcontrib-serializinghtml==1.1.3
terminado==0.8.3
testpath==0.4.4
textblob==0.15.3
tornado==6.0.3
traitlets==4.3.3
tweepy==3.8.0
urllib3==1.25.7
utils==1.0.0
wcwidth==0.1.8
webencodings==0.5.1
Werkzeug==0.16.0
widgetsnbextension==3.5.1

erickha/sample.png

210 KB

erickha/sentiment.py

+187
@@ -0,0 +1,187 @@
"""
CS 41 Final Project: Tweet Sentiment Analysis
Authors: Ruben Sanchez and Erick Hernandez
SUNet: rubensan & erickha

This script aggregates tweets under a given hashtag, along with a set of keywords to track
within those tweets. From there, it runs sentiment analysis on the tweets that contain each
keyword to see how the sentiment changes over time.
"""
import sys
import argparse
import yaml
from os import path
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper
import tweepy
import subprocess
import jsonpickle
import json
from textblob import TextBlob
import matplotlib.pyplot as plt
import numpy as np


def displayAnalysis(argDict, sentiments):
    # uses matplotlib to graph the sentiment as grouped bars
    print("Displaying analysis")
    n_groups = len(sentiments) - 2  # sentiments also holds the two datetime entries
    start = []
    end = []
    for keyword in argDict["keywords"]:
        scores = sentiments[keyword]
        start.append(scores[0])
        end.append(scores[1])

    fig, ax = plt.subplots()
    index = np.arange(n_groups)
    bar_width = 0.25
    opacity = 0.75

    rects1 = plt.bar(index, start, bar_width,
                     alpha=opacity,
                     color='b',
                     label="Start: " + sentiments["startDateTime"])

    rects2 = plt.bar(index + bar_width, end, bar_width,
                     alpha=opacity,
                     color='r',
                     label="End: " + sentiments["endDateTime"])

    # plot labels
    plt.xlabel('Keywords')
    plt.ylabel('Sentiment Score')
    plt.title('Tweet Sentiment Analysis')
    plt.xticks(index + bar_width / 2, argDict["keywords"])
    plt.legend()

    # displays graph
    plt.tight_layout()
    plt.show()


def analyzeTweets(argDict):
    # using the textblob library, we can traverse parsed tweets and categorize them
    sentiments = {}
    fName = '.parsed-tweets.txt'
    tweets = []
    with open(fName, 'r') as f:
        for line in f:
            tweet = json.loads(line)
            tweets.append(tweet)
    tweets = list(reversed(tweets))  # oldest tweet first
    half = len(tweets) / 2
    # loops over keywords
    for keyword in argDict["keywords"]:
        start = 0
        end = 0
        count = 0
        # loops over all tweets searching for the keyword; the first half of the
        # timeline accumulates into the "start" score, the second half into "end"
        for t in tweets:
            count += 1
            if keyword.lower() in t["content"].lower():
                score = TextBlob(t["content"])
                if count < half:
                    start += score.sentiment.polarity
                else:
                    end += score.sentiment.polarity
        sentiments[keyword] = (start / half, end / half)
    sentiments["startDateTime"] = tweets[0]["timestamp"]
    sentiments["endDateTime"] = tweets[-1]["timestamp"]
    return sentiments


def loadTweets(argDict):
    # auth tokens provided by Twitter for developers
    auth = tweepy.OAuthHandler("1OY1cxVgUXZFoMgvxq1eEKpjx", "LukFltoFXodtADUHCjZ0IaN1C7ous7uo2ZgqQ01HLmTWatXgMw")
    auth.set_access_token("1147601573993803776-UmfCvlhkRxogSmDyIU6iF7k58yoD1c", "jmL3TMCfI6NPZrEoH0rMvtv2l7fRU3Ueyxj6Z8dFVNLQQ")

    api = tweepy.API(auth)

    searchQuery = '#' + argDict["hashtag"]  # this is what we're searching for
    maxTweets = 10000  # max number of tweets (Twitter only allows 18k tweets/15 min)
    tweetsPerQry = 100  # this is the max the API permits
    fName = '.tweets.txt'  # we'll store the tweets in a text file

    # If only results below a specific ID are wanted, set max_id to that ID;
    # otherwise default to no upper limit and start from the most recent tweet
    # matching the search query.
    max_id = -1

    tweetCount = 0
    print("Downloading max {0} tweets".format(maxTweets))
    with open(fName, 'w') as f:
        while tweetCount < maxTweets:
            try:
                if max_id <= 0:
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry)
                else:
                    # pages backwards so each query returns older tweets
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                            max_id=str(max_id - 1))
                if not new_tweets:
                    print("No more tweets found")
                    break
                for tweet in new_tweets:
                    f.write(jsonpickle.encode(tweet._json, unpicklable=False) +
                            '\n')
                tweetCount += len(new_tweets)
                print("Downloaded {0} tweets".format(tweetCount))
                max_id = new_tweets[-1].id
            except tweepy.TweepError as e:
                # just exit on any error
                print("some error : " + str(e))
                break

    print("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))
    subprocess.call(["rm", "-f", ".cached.txt"])
    with open(".cached.txt", 'w') as file:
        file.write(argDict["hashtag"])


def parseTweets():
    # goes through the downloaded tweets file to clean up tweets and extract the
    # relevant info (timestamp and content)
    fReadName = '.tweets.txt'
    fWriteName = '.parsed-tweets.txt'
    with open(fWriteName, 'w') as parsed:
        with open(fReadName, 'r') as f:
            for line in f:
                # creates one JSON object per line containing timestamp and content
                tweetDict = {}
                tweet = jsonpickle.decode(line)
                tweetDict["timestamp"] = tweet["created_at"]
                text = tweet["text"]
                index = text.find(':')
                if index != -1:
                    # drops everything through the first ':' (e.g. an "RT @user:" prefix)
                    text = text[index + 2:]
                tweetDict["content"] = text
                json.dump(tweetDict, parsed)
                parsed.write('\n')


def main(argv):
    # parser to read the name of the config file we are passed
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--file', help="relative path to yaml file with config")
    args = parser.parse_args()
    filename = args.file.strip()

    # opens the config file and loads the arguments into a dict
    with open(filename) as file:
        argDict = yaml.full_load(file)

    # gets the hashtag of the most recently cached tweets
    cachedFile = '.cached.txt'
    cachedHashtag = ''
    if path.exists(cachedFile):
        with open(cachedFile, 'r') as file:
            for line in file:
                cachedHashtag = line

    # does not load tweets if the hashtag has already been cached
    if cachedHashtag != argDict["hashtag"]:
        print("Tweets not cached. Will download now")
        loadTweets(argDict)  # skipped when the tweets were already saved
        parseTweets()
    else:
        print("Tweets for this hashtag have been cached. Fast-forwarding to rendering them")

    # parses and analyzes the tweets
    sentiments = analyzeTweets(argDict)
    displayAnalysis(argDict, sentiments)


if __name__ == "__main__":
    main(sys.argv)
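
For anyone poking at the intermediate files: `.parsed-tweets.txt` is newline-delimited JSON, one object per tweet with just the two fields `analyzeTweets()` reads. A sketch with a made-up line:
```
import json

# hypothetical line, in the shape parseTweets() emits
line = '{"timestamp": "Mon Mar 09 19:00:01 +0000 2020", "content": "Debate tonight!"}'
tweet = json.loads(line)
print(tweet["timestamp"], tweet["content"])
```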

0 commit comments
