
Commit e898223

Added erickha
1 parent 5d978ba commit e898223

File tree

6 files changed

+341 -0 lines changed

.DS_Store

0 Bytes
Binary file not shown.

erickha/README.md

+49
@@ -0,0 +1,49 @@
# CS 41 Final Project: Tweet Sentiment Analysis

#### Contributors: Erick Hernandez and Ruben Sanchez

## Functionality

This project implements a tweet sentiment aggregator tool that can be run as a Python executable. It takes input specifying the hashtag to search for and the keywords to separate tweets by under that hashtag.

From there, the program aggregates the tweets and performs sentiment analysis on them, displaying a graph like the one shown below.

![Sample photo](./sample.png)

This graph was created by searching the hashtag "COVID19" with the keywords China, US, Italy, and Trump.
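
Under the hood, the per-tweet score is TextBlob's polarity measure (`textblob` is pinned in `requirements.txt`); a minimal sketch of the kind of call `sentiment.py` makes, with a made-up tweet:
```
from textblob import TextBlob

# polarity is a float in [-1.0, 1.0]: below 0 reads negative, above 0 positive
tweet = "Relief efforts in Italy are going great"  # hypothetical example text
print(TextBlob(tweet).sentiment.polarity)
```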
## Usage

1) The first step is creating/using the provided config file to configure your search results.

Edit `config.yaml` to look something like:
```
hashtag: COVID19
keywords:
- China
- US
- Italy
- Trump
```
or
```
hashtag: DemDebate
keywords:
- Clinton
- Bernie
- Biden
- Bloomberg
```
2) You can run the script by providing the name of the config file with the `-f` flag, e.g.
```
12:04 PM (pyvenv) $ python sentiment.py -f config.yaml
```
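
On a hashtag that has not been cached yet, the run prints download progress; reconstructed from the script's own print statements (exact counts will vary), it looks roughly like:
```
12:04 PM (pyvenv) $ python sentiment.py -f config.yaml
Tweets not cached. Will download now
Downloading max 10000 tweets
Downloaded 100 tweets
Downloaded 200 tweets
...
Downloaded 10000 tweets, Saved to .tweets.txt
Displaying analysis
```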
## Bugs

We have not found any bugs so far. Our biggest issue is that we are currently limited to a small number of tweets, because the Twitter API only lets us query 18k tweets per 15 minutes. Downloading is very slow, and we are not able to break the work into shorter time intervals.
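
If the rate limit becomes a blocker, tweepy (3.8.0 in `requirements.txt`) can sleep through rate-limit windows on its own; a possible mitigation sketch, not wired into this project:
```
import tweepy

auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")  # placeholder keys
api = tweepy.API(auth, wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True)  # sleeps until the window resets

# Cursor pages through results and respects the rate limit automatically
for tweet in tweepy.Cursor(api.search, q="#COVID19", count=100).items(1000):
    print(tweet.text)
```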
## Contact

Please contact:

- rubensan [at] stanford [dot] edu
- erickha [at] stanford [dot] edu

erickha/config.yaml

+8
@@ -0,0 +1,8 @@
# Gathers tweets from the past seven days (limit: 10000 tweets)
hashtag: DemDebate
keywords:
- Biden
- Bernie
- Trump
- Bloomberg
- Booker
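
For reference, `sentiment.py` loads this file into a plain dict with PyYAML (see `main()`); a minimal sketch:
```
import yaml

# roughly what main() does with the -f argument
with open("config.yaml") as f:
    argDict = yaml.full_load(f)

print(argDict["hashtag"])   # 'DemDebate'
print(argDict["keywords"])  # ['Biden', 'Bernie', 'Trump', 'Bloomberg', 'Booker']
```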

erickha/requirements.txt

+97
@@ -0,0 +1,97 @@
alabaster==0.7.12
appnope==0.1.0
attrs==19.3.0
autopep8==1.4.4
Babel==2.8.0
backcall==0.1.0
bleach==3.1.0
certifi==2019.11.28
chardet==3.0.4
Click==7.0
cycler==0.10.0
decorator==4.4.1
defusedxml==0.6.0
docutils==0.15.2
entrypoints==0.3
Flask==1.1.1
get==2019.4.13
idna==2.8
imagesize==1.2.0
ipykernel==5.1.3
ipyparallel==6.2.4
ipython==7.11.1
ipython-genutils==0.2.0
ipywidgets==7.5.1
itsdangerous==1.1.0
jedi==0.15.2
Jinja2==2.10.3
joblib==0.14.1
json5==0.8.5
jsonpickle==1.3
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==5.3.4
jupyter-console==6.0.0
jupyter-core==4.6.1
jupyterlab==1.2.4
jupyterlab-server==1.0.6
kiwisolver==1.1.0
MarkupSafe==1.1.1
matplotlib==3.1.2
mistune==0.8.4
nbconvert==5.6.1
nbformat==5.0.3
nltk==3.4.5
nose==1.3.7
notebook==6.0.2
numpy==1.18.1
oauthlib==3.1.0
packaging==20.0
pandocfilters==1.4.2
parso==0.5.2
pexpect==4.7.0
pickleshare==0.7.5
Pillow==7.0.0
post==2019.4.13
prometheus-client==0.7.1
prompt-toolkit==2.0.10
ptyprocess==0.6.0
public==2019.4.13
pycodestyle==2.5.0
Pygments==2.5.2
pyparsing==2.4.6
pyrsistent==0.15.7
PySocks==1.7.1
python-dateutil==2.8.1
pytz==2019.3
PyYAML==5.3
pyzmq==18.1.1
qtconsole==4.6.0
query-string==2019.4.13
request==2019.4.13
requests==2.22.0
requests-oauthlib==1.3.0
scikit-learn==0.22.1
scipy==1.4.1
Send2Trash==1.5.0
six==1.13.0
snowballstemmer==2.0.0
Sphinx==2.3.1
sphinxcontrib-applehelp==1.0.1
sphinxcontrib-devhelp==1.0.1
sphinxcontrib-htmlhelp==1.0.2
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.2
sphinxcontrib-serializinghtml==1.1.3
terminado==0.8.3
testpath==0.4.4
textblob==0.15.3
tornado==6.0.3
traitlets==4.3.3
tweepy==3.8.0
urllib3==1.25.7
utils==1.0.0
wcwidth==0.1.8
webencodings==0.5.1
Werkzeug==0.16.0
widgetsnbextension==3.5.1

erickha/sample.png

210 KB

erickha/sentiment.py

+187
@@ -0,0 +1,187 @@
"""
CS 41 Final Project: Tweet Sentiment Analysis
Authors: Ruben Sanchez and Erick Hernandez
SUNet: rubensan & erickha

This script aggregates tweets under a given hashtag, along with a set of keywords to track
within those tweets. From there, it runs sentiment analysis on the tweets that contain each
keyword to see how the sentiment changes over time.
"""
import sys
import argparse
import yaml
from os import path
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper
import tweepy
import subprocess
import jsonpickle
import json
from textblob import TextBlob
import matplotlib.pyplot as plt
import numpy as np


def displayAnalysis(argDict, sentiments):
    # uses matplotlib to graph the sentiment as grouped bars
    print("Displaying analysis")
    n_groups = len(sentiments) - 2  # sentiments also holds the two datetime entries
    start = []
    end = []
    for keyword in argDict["keywords"]:
        scores = sentiments[keyword]
        start.append(scores[0])
        end.append(scores[1])

    fig, ax = plt.subplots()
    index = np.arange(n_groups)
    bar_width = 0.25
    opacity = 0.75

    rects1 = plt.bar(index, start, bar_width,
                     alpha=opacity,
                     color='b',
                     label="Start: " + sentiments["startDateTime"])

    rects2 = plt.bar(index + bar_width, end, bar_width,
                     alpha=opacity,
                     color='r',
                     label="End: " + sentiments["endDateTime"])

    # plot labels
    plt.xlabel('Keywords')
    plt.ylabel('Sentiment Score')
    plt.title('Tweet Sentiment Analysis')
    plt.xticks(index + bar_width / 2, argDict["keywords"])
    plt.legend()

    # displays graph
    plt.tight_layout()
    plt.show()


def analyzeTweets(argDict):
    # using the textblob library, we can traverse parsed tweets and categorize them
    sentiments = {}
    fName = '.parsed-tweets.txt'
    tweets = []
    with open(fName, 'r') as f:
        for line in f:
            tweet = json.loads(line)
            tweets.append(tweet)
    tweets = list(reversed(tweets))  # oldest tweet first
    half = len(tweets) / 2
    # loops over keywords
    for keyword in argDict["keywords"]:
        start = 0
        end = 0
        count = 0
        # loops over all tweets searching for the keyword; the first half of the
        # timeline accumulates into the "start" score, the second half into "end"
        for t in tweets:
            count += 1
            if keyword.lower() in t["content"].lower():
                score = TextBlob(t["content"])
                if count < half:
                    start += score.sentiment.polarity
                else:
                    end += score.sentiment.polarity
        sentiments[keyword] = (start / half, end / half)
    sentiments["startDateTime"] = tweets[0]["timestamp"]
    sentiments["endDateTime"] = tweets[-1]["timestamp"]
    return sentiments


def loadTweets(argDict):
    # auth tokens provided by Twitter for developers
    auth = tweepy.OAuthHandler("1OY1cxVgUXZFoMgvxq1eEKpjx", "LukFltoFXodtADUHCjZ0IaN1C7ous7uo2ZgqQ01HLmTWatXgMw")
    auth.set_access_token("1147601573993803776-UmfCvlhkRxogSmDyIU6iF7k58yoD1c", "jmL3TMCfI6NPZrEoH0rMvtv2l7fRU3Ueyxj6Z8dFVNLQQ")

    api = tweepy.API(auth)

    searchQuery = '#' + argDict["hashtag"]  # this is what we're searching for
    maxTweets = 10000  # max number of tweets (Twitter only allows 18k tweets/15 min)
    tweetsPerQry = 100  # this is the max the API permits
    fName = '.tweets.txt'  # we'll store the tweets in a text file

    # If only results below a specific ID are wanted, set max_id to that ID;
    # otherwise default to no upper limit and start from the most recent tweet
    # matching the search query.
    max_id = -1

    tweetCount = 0
    print("Downloading max {0} tweets".format(maxTweets))
    with open(fName, 'w') as f:
        while tweetCount < maxTweets:
            try:
                if max_id <= 0:
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry)
                else:
                    # pages backwards so each query returns older tweets
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                            max_id=str(max_id - 1))
                if not new_tweets:
                    print("No more tweets found")
                    break
                for tweet in new_tweets:
                    f.write(jsonpickle.encode(tweet._json, unpicklable=False) +
                            '\n')
                tweetCount += len(new_tweets)
                print("Downloaded {0} tweets".format(tweetCount))
                max_id = new_tweets[-1].id
            except tweepy.TweepError as e:
                # just exit on any error
                print("some error : " + str(e))
                break

    print("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))
    subprocess.call(["rm", "-f", ".cached.txt"])
    with open(".cached.txt", 'w') as file:
        file.write(argDict["hashtag"])


def parseTweets():
    # goes through the downloaded tweets file to clean up tweets and extract the
    # relevant info (timestamp and content)
    fReadName = '.tweets.txt'
    fWriteName = '.parsed-tweets.txt'
    with open(fWriteName, 'w') as parsed:
        with open(fReadName, 'r') as f:
            for line in f:
                # creates one JSON object per line containing timestamp and content
                tweetDict = {}
                tweet = jsonpickle.decode(line)
                tweetDict["timestamp"] = tweet["created_at"]
                text = tweet["text"]
                index = text.find(':')
                if index != -1:
                    # drops everything through the first ':' (e.g. an "RT @user:" prefix)
                    text = text[index + 2:]
                tweetDict["content"] = text
                json.dump(tweetDict, parsed)
                parsed.write('\n')


def main(argv):
    # parser to read the name of the config file we are passed
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--file', help="relative path to yaml file with config")
    args = parser.parse_args()
    filename = args.file.strip()

    # opens the config file and loads the arguments into a dict
    with open(filename) as file:
        argDict = yaml.full_load(file)

    # gets the hashtag of the most recently cached tweets
    cachedFile = '.cached.txt'
    cachedHashtag = ''
    if path.exists(cachedFile):
        with open(cachedFile, 'r') as file:
            for line in file:
                cachedHashtag = line

    # does not load tweets if the hashtag has already been cached
    if cachedHashtag != argDict["hashtag"]:
        print("Tweets not cached. Will download now")
        loadTweets(argDict)  # skipped when the tweets were already saved
        parseTweets()
    else:
        print("Tweets for this hashtag have been cached. Fast-forwarding to rendering them")

    # parses and analyzes the tweets
    sentiments = analyzeTweets(argDict)
    displayAnalysis(argDict, sentiments)


if __name__ == "__main__":
    main(sys.argv)
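
For anyone poking at the intermediate files: `.parsed-tweets.txt` is newline-delimited JSON, one object per tweet with just the two fields `analyzeTweets()` reads. A sketch with a made-up line:
```
import json

# hypothetical line, in the shape parseTweets() emits
line = '{"timestamp": "Mon Mar 09 19:00:01 +0000 2020", "content": "Debate tonight!"}'
tweet = json.loads(line)
print(tweet["timestamp"], tweet["content"])
```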

0 commit comments
