sd17fall · sokuno222 · Oct 12, 2017 · Oct 20, 2017 · Oct 20, 2017 · matthewruehle
diff --git a/GetTweets.py b/GetTweets.py
@@ -0,0 +1,24 @@
+import twitter
+from TwitterKeys import *
+from Pickling import PickleBuddy, unPickleBuddy
+
+def cleanTweet(unsorted):
+    """Extract the quote text from a list of tweet objects.
+
+    For each tweet, a text starting with 'R' (a retweet such as
+    "RT @user: quote") is reduced to the segment between its first and
+    second colon; anything from the first '#' onward is then dropped, and
+    empty results are skipped.
+
+    Args:
+        unsorted: iterable of objects exposing a ``text`` attribute.
+
+    Returns:
+        List of cleaned strings, in the same order as the input tweets.
+    """
+    cleanList = []
+    for tweet in unsorted:
+        text = tweet.text
+        # Start from the raw text so tweets that are not retweets are kept
+        # as they are, instead of reusing (or lacking) a value left over
+        # from a previous iteration.
+        cleaned = text
+        # startswith() is safe on empty text; the colon check avoids an
+        # IndexError on 'R...' text that has no "user:" prefix.
+        if text.startswith('R') and ':' in text:
+            cleaned = text.split(':')[1]
+        if '#' in cleaned:
+            cleaned = cleaned.split("#")[0]
+        if cleaned != '':
+            cleanList.append(cleaned)
+    return cleanList
+
+
+def getTweets():
+    """Return the cleaned text of the cached tweets.
+
+    Calls PickleBuddy() first (which creates the cache when it is missing),
+    then loads the cached tweets with unPickleBuddy() and passes them
+    through cleanTweet().
+    """
+    PickleBuddy()
+    return cleanTweet(unPickleBuddy())
diff --git a/Pickling.py b/Pickling.py
@@ -0,0 +1,25 @@
+# Directory that holds the tweet cache: PickleBuddy() checks it for
+# "cachedData.pickle" and unPickleBuddy() reads the cache from it.
+# NOTE(review): hard-coded absolute path -- adjust before running on another machine.
+filePath = "C:/Users/sokuno/TextMining"
+import os
+import pickle
+import twitter
+from TwitterKeys import *
+
+
+def PickleBuddy():
+    """Download and cache tweets unless the cache file already exists.
+
+    If "cachedData.pickle" is not present in ``filePath``, searches Twitter
+    for recent '#inspirationalquotes' tweets (count=100) and pickles the
+    search result into that directory. Does nothing when the cache exists.
+    """
+    # Use the same location for the existence check, the write and the later
+    # read in unPickleBuddy(); writing relative to the working directory
+    # only worked when the script happened to be started inside filePath.
+    cachePath = os.path.join(filePath, "cachedData.pickle")
+    if os.path.isfile(cachePath):
+        return
+    # setting up api using twitter key info from TwitterKeys.py
+    api = twitter.Api(consumer_key=CONSUMER_KEY,
+                      consumer_secret=CONSUMER_SECRET,
+                      access_token_key=ACCESS_TOKEN_KEY,
+                      access_token_secret=ACCESS_TOKEN_SECRET)
+    fullTweet = api.GetSearch(term='#inspirationalquotes', raw_query=None,
+                              geocode=None, since_id=None, max_id=None,
+                              until=None, since=None, count=100, lang=None,
+                              locale=None, result_type='recent',
+                              include_entities=None)
+    # The with-block guarantees the file is flushed and closed (the previous
+    # "pickles.close" lacked parentheses and never closed it).
+    with open(cachePath, 'wb') as pickles:
+        pickle.dump(fullTweet, pickles)
+
+def unPickleBuddy():
+    """Load and return the object stored in the cached pickle file.
+
+    Reads ``filePath + '/cachedData.pickle'``. Errors from open() or
+    pickle.load() (for example a missing file) propagate to the caller.
+    """
+    # NOTE: pickle.load can execute arbitrary code; only load cache files
+    # that this program wrote itself.
+    # The with-block closes the file handle once the data is loaded.
+    with open(filePath+'/cachedData.pickle', 'rb') as unpickles:
+        return pickle.load(unpickles)
diff --git a/README.md b/README.md
@@ -1,3 +1,13 @@
 # TextMining
 
 This is the base repo for the text mining and analysis project for Software Design at Olin College.
+
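+First, install/import the following (only `twitter` is a third-party package that has to be installed; the others are part of Python's standard library):
+- re
+- random
+- sys
+- twitter
+- os
+- pickle
+
+Then run the TwitterQuoteBot.py file.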
diff --git a/TextMining_reflection.pdf b/TextMining_reflection.pdf
diff --git a/TwitterQuoteBot.py b/TwitterQuoteBot.py
@@ -0,0 +1,6 @@
+import re
+from GetTweets import getTweets
+from sentence_generator import buildMapping, genSentence, main
+# Join the cleaned tweets with a space so the last word of one tweet is not
+# fused with the first word of the next one.
+words = " ".join(getTweets())
+# Turn line breaks into spaces (again to keep neighbouring words apart) and
+# strip the literal fragments 'T.' and 'D.'.
+words = words.replace('\r', ' ').replace('\n', ' ').replace('T.','').replace('D.','')
+# Build a Markov chain of length 1 from the text and print one sentence.
+main(words,1)
diff --git a/cachedData.pickle b/cachedData.pickle
diff --git a/cachedData1.pickle b/cachedData1.pickle
diff --git a/cachedData2.pickle b/cachedData2.pickle
diff --git a/sentence_generator.py b/sentence_generator.py
@@ -0,0 +1,128 @@
+#!/usr/bin/python
+
+import re
+import random
+import sys
+
+# Module-level state shared by the functions below. These mappings can get
+# fairly large -- they're stored globally to save copying time.
+
+# Raw counts, filled by addItemToTempMapping:
+# (tuple of words) -> {dict: word -> number of times the word appears following the tuple}
+# Example entry:
+#    ('eyes', 'turned') => {'to': 2.0, 'from': 1.0}
+# Used while constructing the normalized mapping in buildMapping.
+tempMapping = {}
+
+# Normalized version of tempMapping, filled by buildMapping and read by next:
+# (tuple of words) -> {dict: word -> *normalized* number of times the word appears following the tuple}
+# Example entry:
+#    ('eyes', 'turned') => {'to': 0.66666666, 'from': 0.33333333}
+mapping = {}
+
+# List of words that can start sentences. buildMapping appends to it without
+# removing duplicates, and genSentence picks from it with random.choice.
+starts = []
+
+# Normalizes the capitalization of a word so that words can be compared
+# independent of how they were capitalized in the text:
+#   "FOO"   -> "foo"    (all caps, except the pronoun "I")
+#   "LaTeX" -> "Latex"  (leading capital is kept, the rest is lowered)
+#   "wOOt"  -> "woot"   (everything else is lowered)
+def fixCaps(word):
+    lowered = word.lower()
+    # All-caps words lose their capitals entirely; "I" is deliberately
+    # excluded here so that it falls through to the leading-capital case.
+    if word.isupper() and word != "I":
+        return lowered
+    if word[0].isupper():
+        return lowered.capitalize()
+    return lowered
+
+# Converts a list of words into a tuple with the same items, in order.
+# Lists can't be hashed but tuples can, and dict keys must be hashable.
+def toHashKey(lst):
+    return (*lst,)
+
+# Splits the text "words" into a list of tokens: runs of word characters and
+# apostrophes, plus the single punctuation marks . , ! ? ;
+# Every token has its capitalization normalized by fixCaps.
+def wordlist(words):
+    tokens = re.findall(r"[\w']+|[.,!?;]", words)
+    return list(map(fixCaps, tokens))
+
+# Records in "tempMapping" that "word" followed "history".
+# The count for "word" is increased by 1.0 under the full history and under
+# every shorter suffix of it. Given history = ["the", "rain", "in"] and
+# word = "Spain", the entries for ("the", "rain", "in"), ("rain", "in") and
+# ("in",) each get one more occurrence of "Spain".
+def addItemToTempMapping(history, word):
+    for start in range(len(history)):
+        key = toHashKey(history[start:])
+        # Create the follower table for this key on first use.
+        followers = tempMapping.setdefault(key, {})
+        followers[word] = followers.get(word, 0.0) + 1.0
+
+# Builds and normalizes the Markov mapping from a list of tokens.
+#   wordlist     -- list of word / punctuation tokens, in text order
+#   markovLength -- maximum number of preceding tokens used as history
+# Side effects: appends sentence-starting words to "starts", accumulates raw
+# counts in "tempMapping" and writes the normalized entries into "mapping".
+def buildMapping(wordlist, markovLength):
+    # Nothing to learn from an empty token list (wordlist[0] would fail).
+    if not wordlist:
+        return
+    starts.append(wordlist[0])
+    # Start at 0 so the transition out of the very first token is recorded
+    # too; otherwise a first word that never recurs has no mapping entry.
+    for i in range(len(wordlist) - 1):
+        # The last (at most) markovLength tokens ending at position i.
+        history = wordlist[max(0, i - markovLength + 1) : i + 1]
+        follow = wordlist[i + 1]
+        # if the last elt was a period, add the next word to the start list
+        if history[-1] == "." and follow not in ".,!?;":
+            starts.append(follow)
+        addItemToTempMapping(history, follow)
+    # Normalize the counts in tempMapping into probabilities in mapping
+    for first, followset in tempMapping.items():
+        total = sum(followset.values())
+        mapping[first] = {k: v / total for k, v in followset.items()}
+
+# Returns the next word in the sentence (chosen randomly, weighted by the
+# probabilities stored in "mapping"), given the previous ones.
+# Note: prevList is shortened in place (from the front) until it matches a
+# key of "mapping". If no suffix of prevList is known, "." is returned so
+# the caller's sentence ends instead of failing on an empty list.
+def next(prevList):
+    threshold = random.random()
+    # Shorten prevList until it's in mapping
+    while prevList and toHashKey(prevList) not in mapping:
+        prevList.pop(0)
+    if not prevList:
+        return "."
+    cumulative = 0.0
+    choice = ""
+    # Walk the cumulative distribution and stop at the first word whose
+    # running total reaches the random threshold.
+    for choice, weight in mapping[toHashKey(prevList)].items():
+        cumulative += weight
+        if cumulative >= threshold:
+            break
+    # If rounding kept the total just below the threshold, the loop ended on
+    # the last word, which is returned rather than an empty string.
+    return choice
+
+# Generates one sentence: starts from a random entry of "starts" and keeps
+# asking next() for words until a period is produced. "markovLength" caps
+# how many previous words are passed to next() as history.
+def genSentence(markovLength):
+    word = random.choice(starts)
+    pieces = [word.capitalize()]
+    window = [word]
+    while True:
+        # Stop once the current word is the sentence-ending period
+        if word in ".":
+            break
+        word = next(window)
+        window.append(word)
+        # Drop the oldest word when the history has grown too long
+        if len(window) > markovLength:
+            del window[0]
+        # Words are preceded by a space; punctuation is attached directly
+        pieces.append(word if word in ".,!?;" else " " + word)
+    return "".join(pieces)
+
+# Builds the Markov mapping from the text "words" using chains of length
+# "markov", then prints one generated sentence.
+def main(words,markov):
+    buildMapping(wordlist(words), markov)
+    print(genSentence(markov))
+
+# Command-line use: sentence_generator.py <text file> [chain length]
+# main() requires both the text and the chain length, so they are taken
+# from the command line (the chain length defaults to 1).
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        sys.stderr.write("Usage: sentence_generator.py <text file> [chain length]\n")
+        sys.exit(1)
+    with open(sys.argv[1], "r") as f:
+        text = f.read()
+    main(text, int(sys.argv[2]) if len(sys.argv) > 2 else 1)