"""Mines text from Project Gutenburg. Performs TF-IDF and sentiment analysis
on text and generates a word cloud from most important words.
@author: Vivien Chen
"""
import string
import math

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer  # pip install vaderSentiment
from wordcloud import WordCloud  # pip install wordcloud
from textblob import TextBlob as tb  # pip install textblob

from text_mining import get_cache, filter_PG_text, get_histogram


def get_word_list(text):
    """Takes a string of text as input. Strips away punctuation and whitespace
    and converts all words to lowercase. Returns a list of the words used in
    the string.

    Args:
        text: a string of text, such as the filtered text from a Project Gutenberg book

    Returns:
        a list of words, all lowercase, from the string
    """
    words = [word.strip(string.punctuation) for word in text.lower().split()]
    # drop tokens that were pure punctuation (e.g. '--') and stripped to nothing
    return [word for word in words if word]
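
# A minimal sketch of what get_word_list returns (hypothetical input, not from
# any of the books below):
#
#     get_word_list('The quick, brown fox jumps over -- the lazy dog.')
#     # -> ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']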

def get_top_n_words(word_list, n):
    """Takes a list of words as input and returns a string of the n most
    frequently occurring words, each repeated as many times as it occurs,
    ordered from most to least frequent.

    Args:
        word_list: a list of words (assumed to all be lowercase with no punctuation)
        n: the number of words to consider

    Returns:
        a string of the n most frequently occurring words, each repeated as
        many times as it occurs, ordered from most to least frequent
    """
    word_counts = get_histogram(word_list)
    ordered_by_frequency = sorted(word_counts, key=word_counts.get, reverse=True)[:n]
    # repeat each of the top n words by its count so the resulting string
    # preserves the original term frequencies
    repeated = [' '.join([word] * word_counts[word]) for word in ordered_by_frequency]
    return ' '.join(repeated)
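
# A small illustration of the repeated-word output (hypothetical input): given
# ['the', 'the', 'the', 'fox', 'fox', 'dog'] and n=2, this returns
# 'the the the fox fox', since 'the' occurs three times and 'fox' twice.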

def sentiment_analyzer(text):
    """Takes a string of text as input and returns the VADER sentiment scores
    for the text. Joins the text into a single string if it is not already a
    string.

    Args:
        text: a string of text (or a list of words) to be analyzed

    Returns:
        a dictionary of sentiment scores for the text
    """
    # make sure the text is a string before running the sentiment analyzer
    if not isinstance(text, str):
        text = ' '.join(text)
    analyzer = SentimentIntensityAnalyzer()
    return analyzer.polarity_scores(text)
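
# polarity_scores returns a dict with the keys 'neg', 'neu', 'pos', and
# 'compound', where 'compound' is a normalized score in [-1, 1]; for example,
# sentiment_analyzer('a wonderful, happy story') yields a dict with a positive
# 'compound' value (exact numbers depend on the VADER version installed).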

def word_cloud(text, title):
    """Takes a string of text and a title as input and saves the generated
    word cloud as a PNG with the title as the base file name. Joins the text
    into a single string if it is not already a string.

    Args:
        text: a string of text (or a list of words) used to generate a word cloud
        title: the base file name of the generated word cloud

    Returns:
        nothing; saves the word cloud to <title>_tfidf.png
    """
    # make sure the text is a string before running the word cloud generator
    if not isinstance(text, str):
        text = ' '.join(text)
    wordcloud = WordCloud(width=1000, height=500, background_color='black').generate(text)
    wordcloud.to_file('%s_tfidf.png' % title)
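
# Example usage (hypothetical words and title): renders the words onto a
# 1000x500 black canvas and writes the image to Frankenstein_tfidf.png:
#
#     word_cloud('monster monster creature life', 'Frankenstein')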

def tfidf(word, text, text_list):
    """Calculates and returns the TF-IDF score of a word in a text: the term
    frequency (TF) comes from the given text, and the inverse document
    frequency (IDF) from the whole collection of texts.

    Args:
        word: a given word from the text
        text: a TextBlob of the text from which the word's TF is calculated
        text_list: a list of TextBlobs of all the texts to compare against

    Returns:
        the TF-IDF score of the word in the text
    """
    # term frequency = times the word appears in the text / total words in the text
    tf = text.words.count(word) / len(text.words)
    # number of texts in the collection that contain the word
    n_containing = sum(1 for blob in text_list if word in blob.words)
    # inverse document frequency = log(number of texts / (1 + number of texts
    # containing the word)); the +1 guards against division by zero
    idf = math.log(len(text_list) / (1 + n_containing))
    return tf * idf
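
# A worked example of the arithmetic (hypothetical numbers): if a word accounts
# for 10 of a text's 500 words and appears in 1 of the 3 texts, then
#     tf     = 10 / 500 = 0.02
#     idf    = log(3 / (1 + 1)) = log(1.5) ≈ 0.405
#     tf-idf ≈ 0.02 * 0.405 ≈ 0.0081
# A word that appears in all 3 texts gets idf = log(3 / 4) < 0, which pushes
# words common to every text to the bottom of the ranking.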

if __name__ == "__main__":
    titles = ('Frankenstein',
              'Paradise_Lost',
              'The_Romance_of_Lust',
              )
    urls = {'Frankenstein': 'http://www.gutenberg.org/cache/epub/84/pg84.txt',
            'Paradise_Lost': 'http://www.gutenberg.org/cache/epub/20/pg20.txt',
            'The_Romance_of_Lust': 'http://www.gutenberg.org/cache/epub/30254/pg30254.txt',
            }

    # build a TextBlob of each title's most frequent words
    text_list = []
    for title in titles:
        # fetch the text (cached locally) and strip the Project Gutenberg header and footer
        text = filter_PG_text(get_cache(urls[title], '%s.txt' % title))
        # keep only the top 500 words (frequencies preserved) before converting
        # to a TextBlob; this saves time by computing TF-IDF scores over 500
        # words per text instead of the whole book
        text_list.append(tb(get_top_n_words(get_word_list(text), 500)))

    # one list per title to collect its top n words
    top_words = [[] for _ in range(len(titles))]
    n = 50
    for i, text in enumerate(text_list):
        print('Top %d Words in %s with TF-IDF Scores:' % (n, titles[i]))
        # map each word in the text to its TF-IDF score
        scores = {word: tfidf(word, text, text_list) for word in text.words}
        # sort the words by score from highest to lowest
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words[:n]:
            # print the word and its TF-IDF score
            print('\tWord: {}, TF-IDF: {}'.format(word, round(score, 10)))
            # collect the word for this title's summary below
            top_words[i].append(word)
        print('')
    print('')
    for i, title in enumerate(titles):
        # print the top n words for the title
        print('Top %d Words in %s:' % (n, title))
        print(top_words[i], '\n')
        # print the sentiment of the top n words
        print('Sentiment of Top %d Words in %s:' % (n, title))
        print(sentiment_analyzer(top_words[i]), '\n')
        # generate a word cloud from the top n words
        word_cloud(top_words[i], title)
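
# To run this script (assuming text_mining.py from this repo is importable and
# the pip packages noted above are installed):
#
#     python text_mining_tfidf.py
#
# It prints the top 50 TF-IDF-ranked words and their VADER sentiment for each
# book, and writes a <title>_tfidf.png word cloud to the working directory.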