diff --git a/README.md b/README.md
index 8cce527..0172a01 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,6 @@
 # TextMining
-This is the base repo for the text mining and analysis project for Software Design at Olin College.
+Before using this script, install its dependencies
+(e.g. from an Anaconda prompt):
+    pip install wikipedia
+    pip install matplotlib
diff --git a/WriteUp.docx b/WriteUp.docx
new file mode 100644
index 0000000..eb673f2
Binary files /dev/null and b/WriteUp.docx differ
diff --git a/WriteUp.pdf b/WriteUp.pdf
new file mode 100644
index 0000000..1582952
Binary files /dev/null and b/WriteUp.pdf differ
diff --git a/dist_vs_words.png b/dist_vs_words.png
new file mode 100644
index 0000000..380b845
Binary files /dev/null and b/dist_vs_words.png differ
diff --git a/text_mining.py b/text_mining.py
new file mode 100644
index 0000000..aeeef24
--- /dev/null
+++ b/text_mining.py
@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+"""
+Text mining project: compares Wikipedia articles on U.S. states by the
+words that are distinctive to each article.
+
+@author: ISA BLANCETT
+"""
+
+import wikipedia
+from collections import Counter
+
+
+def get_content(title):
+    """ Grabs the content of a Wikipedia page.
+
+    title: name of the Wikipedia page
+    returns: content of the page in plain text
+
+    >>> get_content('Olin College')
+    'Olin College of Engineering (also known as Olin College or simply Olin) is a private....'
+    """
+    article = wikipedia.page(title)
+    return article.content
+
+
+def find_freq_words(title):
+    """ Builds a list of the page's words ordered by frequency, with
+    surrounding punctuation stripped and repeated words removed.
+
+    title: name of the Wikipedia page
+    calls: get_content(title)
+    returns: list of unique words ordered from most to least frequent
+    """
+    words = get_content(title).split()
+
+    # Strip punctuation marks attached to words
+    for i in range(len(words)):
+        if words[i].endswith(('.', ',', ')', ':')):
+            words[i] = words[i][:-1]
+        if words[i].startswith("("):
+            words[i] = words[i][1:]
+        if words[i].endswith("'s"):
+            words[i] = words[i][:-2]
+
+    # Order by frequency (words.count makes this O(n^2) but keeps the code simple)
+    ordered = sorted(words, key=words.count, reverse=True)
+
+    # Keep the frequency order while dropping repeated terms
+    seen = {}
+    ordered_set = [seen.setdefault(x, x) for x in ordered if x not in seen]
+    return ordered_set
+
+
+def remove_matches(numStates, freq_words):
+    """ Removes words that appear in 10 or more state articles (e.g. 'the').
+
+    numStates: number of state word lists in freq_words
+    freq_words: list of per-state word lists
+    returns: the updated freq_words list
+    """
+    # Make a histogram of all the words
+    all_words = []
+    for entry in freq_words:
+        all_words.extend(entry)
+    hist = Counter(all_words)
+
+    # Delete words that appear in 10 or more articles
+    for word in hist.keys():
+        if hist[word] >= 10:
+            for i in range(numStates):
+                if word in freq_words[i]:
+                    freq_words[i].remove(word)
+
+    return freq_words
+
+
+def find_common_words(states):
+    """ Finds the words that are distinctive to each state's article.
+
+    states: list of Wikipedia page titles
+    returns: list of per-state word lists with widely shared words removed
+    """
+    freq_words = []
+    numStates = len(states)
+
+    # Make a frequency-ordered word list for each state (no repeats per list)
+    for name in states:
+        ordered_set = find_freq_words(name)
+        freq_words.append(ordered_set)
+        print(name)
+
+    # Get rid of words shared across many state articles
+    freq_words = remove_matches(numStates, freq_words)
+    return freq_words
+
+
+def compare_states(state1, state2):
+    """ Compares the distinctive words of two states.
+
+    state1, state2: names of the two states to compare
+    returns: the list of words shared by the two states' word lists
+    """
+    states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
+              'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia (U.S. state)',
+              'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
+              'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
+              'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
+              'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
+              'New Mexico', 'New York (state)', 'North Carolina', 'North Dakota', 'Ohio',
+              'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
+              'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
+              'Virginia', 'Washington (state)', 'West Virginia', 'Wisconsin', 'Wyoming']
+
+    # Get all info for the comparison
+    freq_words = find_common_words(states)
+    common_words = []
+    index1 = states.index(state1)
+    index2 = states.index(state2)
+
+    # Compare the two states' word lists
+    for item in freq_words[index1]:
+        if item in freq_words[index2]:
+            common_words.append(item)
+
+    print(len(common_words))
+    return common_words
+
+
+if __name__ == "__main__":
+    # example run
+    print(compare_states('Alabama', 'Vermont'))
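
For reference, here is a minimal standalone sketch (not part of the committed script) of the same pipeline that text_mining.py runs against live Wikipedia pages: order each article's words by frequency, drop repeats, then remove words shared by most articles. It uses small hypothetical in-memory articles so it runs offline; the names sample_articles, frequency_ordered, and distinctive_words are illustrative only and do not appear in the repo.

# Standalone sketch of the text_mining.py pipeline, run on hypothetical
# in-memory articles instead of live Wikipedia fetches.
from collections import Counter

# Hypothetical stand-ins for wikipedia.page(title).content
sample_articles = {
    'Alabama': "the state of Alabama is in the south the the",
    'Vermont': "the state of Vermont is in the northeast",
    'Arizona': "the state of Arizona is in the southwest",
}

def frequency_ordered(text):
    """Unique words of text, ordered from most to least frequent
    (mirrors find_freq_words, minus the punctuation stripping)."""
    words = text.split()
    counts = Counter(words)
    seen = {}
    return [seen.setdefault(w, w) for w in
            sorted(words, key=lambda w: counts[w], reverse=True)
            if w not in seen]

def distinctive_words(articles, shared_threshold):
    """Per-article word lists with words that appear in at least
    shared_threshold articles removed (mirrors remove_matches)."""
    freq_words = {name: frequency_ordered(text) for name, text in articles.items()}
    hist = Counter(w for words in freq_words.values() for w in words)
    return {name: [w for w in words if hist[w] < shared_threshold]
            for name, words in freq_words.items()}

if __name__ == "__main__":
    result = distinctive_words(sample_articles, shared_threshold=3)
    # Words left for each article once widely shared words are gone
    for name, words in result.items():
        print(name, words)

The setdefault-based deduplication mirrors the trick used in find_freq_words: the membership test in the comprehension runs before setdefault inserts the word, so each word is kept only at its first (highest-frequency) position.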