-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
134 lines (114 loc) · 5.04 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import csv
import os
import re
import tempfile
import zipfile
from collections import Counter
from io import StringIO
from os import listdir
from os.path import isfile, join

import fitz
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import pdfplumber
import pytesseract
import streamlit as st
from nltk.corpus import stopwords
from tqdm import tqdm
from wordcloud import WordCloud
# --- Page setup -------------------------------------------------------------
# Everything below runs at module level, i.e. each time the script is run.

# Set the browser-tab title.
# NOTE(review): Streamlit documents that set_page_config must be the first
# st.* command on a page -- keep it above the other st.* calls; confirm
# against the installed Streamlit version.
st.set_page_config(page_title="Multi-WordCloud Generator")

# Fetch the NLTK stop-word corpus that create_word_cloud reads through
# stopwords.words('english').
nltk.download('stopwords')

# Intended to hold files and file info after uploading.
# NOTE(review): not referenced anywhere else in the visible code.
uploaded = {}

# Configuration flags.
# NOTE(review): none of these three flags is read anywhere in the visible
# code; the comments below describe their intended meaning only.
INDIVIDUAL_WORD_CLOUDS = False
# Set to True if you know your data contains scanned documents, but you
# cannot specify which are scanned.
CONTAINS_SCANNED_PDFS = False
# Only set to True if you have named ALL scanned PDFs correctly, i.e. the
# file name ends with _scanned.pdf.
SCANNED_PDFS_TAGGED = False

# Page heading.
st.title("Multi-WordCloud Generator")
# Sub-heading describing the tool.
st.subheader("An OCR-enabled research tool for generating wordclouds from multiple files at a go.")
# Note on supported formats.
st.info("Currently supports PDFs, Word Documents, and raw text files.", icon='ℹ️')

# Multi-file uploader; its return value is what the Generate handler passes
# to get_combined_cloud / get_individual_clouds.
uploaded_files = st.file_uploader(
    "Add your reference PDF files:", accept_multiple_files=True)
# When ticked, the Generate handler draws one cloud for all files instead of
# one cloud per file.
combine_wordcloud = st.checkbox('Combine Wordclouds')
def create_word_cloud(text, title):
    """Render a word cloud of *text* in the Streamlit page.

    Punctuation is stripped, the text is lower-cased, English stop words
    are dropped, and the remaining words are sized by how often they occur.

    Args:
        text: Raw text to visualise.
        title: Caption drawn above the cloud.

    Returns:
        None. The figure is written to the page with ``st.pyplot``; when no
        countable word remains, a warning is shown instead.
    """
    # Drop everything that is neither a word character nor whitespace.
    re_pattern = re.compile(r'[^\w\s]', re.UNICODE)
    cleaned_text = re_pattern.sub('', text)

    # NLTK's stop-word list is lower-case, so the text has to be lower-cased
    # before filtering; this also merges "Word" and "word" into one entry.
    stop_word_set = set(stopwords.words('english'))
    words_to_count = [
        word for word in cleaned_text.lower().split()
        if word not in stop_word_set
    ]

    # WordCloud refuses to draw from an empty frequency table, so report the
    # situation to the user instead of letting the app crash.
    if not words_to_count:
        st.warning(
            f"No words left to plot for '{title}' after removing "
            "punctuation and stop words."
        )
        return

    word_cloud_dict = Counter(words_to_count)
    wordcloud = WordCloud(
        max_font_size=40,
        background_color="white",
    ).generate_from_frequencies(word_cloud_dict)

    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    ax.set_title(title)
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed; release
    # it so repeated runs do not accumulate figures in memory.
    plt.close(fig)
# End of a paragraph, a line break or a tab inside WordprocessingML; these
# separate words even though no whitespace character appears in the XML.
_DOCX_BREAK_RE = re.compile(r'</w:p>|<w:(?:br|tab)\b[^>]*>')
# Any remaining XML tag.
_XML_TAG_RE = re.compile(r'<[^>]*>')


def _ocr_page(page):
    """Return the lower-cased OCR text of one PyMuPDF page."""
    # A private temporary directory keeps concurrent runs from overwriting
    # each other's image and guarantees cleanup even if OCR raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        image_path = os.path.join(tmp_dir, "page.png")
        page.get_pixmap().save(image_path)
        return pytesseract.image_to_string(image_path).lower()


def _extract_pdf_text(uploaded_file):
    """Return the text of an uploaded PDF, running OCR on text-less pages."""
    page_texts = []
    # getvalue() returns the whole buffer regardless of the current stream
    # position, so the file can be processed more than once.
    with fitz.open(stream=uploaded_file.getvalue(), filetype="pdf") as doc:
        for page in doc.pages():
            extracted_text = page.get_text()
            # A page with no text (or only whitespace) is assumed to be a
            # scan and is treated as a picture.
            if extracted_text.strip():
                page_texts.append(extracted_text)
            else:
                page_texts.append(_ocr_page(page))
    return " ".join(page_texts)


def _extract_docx_text(uploaded_file):
    """Return the lower-cased body text of an uploaded .docx file.

    Raises:
        zipfile.BadZipFile: The upload is not a zip archive.
        KeyError: The archive has no ``word/document.xml`` member.
    """
    with zipfile.ZipFile(uploaded_file) as archive:
        document_xml = archive.read('word/document.xml').decode('utf-8')
    # Turn paragraph ends, line breaks and tabs into spaces first; otherwise
    # stripping the tags would glue the last word of one paragraph to the
    # first word of the next.
    document_xml = _DOCX_BREAK_RE.sub(' ', document_xml)
    return _XML_TAG_RE.sub('', document_xml).lower()


def _extract_text(uploaded_file):
    """Return the text of one uploaded file, chosen by its file extension.

    Returns:
        The extracted text, or ``None`` when a Word file could not be read
        (a warning has then already been shown to the user).
    """
    file_name = uploaded_file.name.lower()
    # PDFs
    if file_name.endswith('.pdf'):
        return _extract_pdf_text(uploaded_file)
    # WORD DOCS
    if file_name.endswith(('.doc', '.docx')):
        try:
            return _extract_docx_text(uploaded_file)
        except (zipfile.BadZipFile, KeyError):
            # Only zip-based Word files can be unpacked here; a legacy
            # binary .doc file lands in this branch.
            st.warning(
                f"Could not read '{uploaded_file.name}': only .docx Word "
                "documents are supported. The file was skipped."
            )
            return None
    # PLAIN TEXT
    # Undecodable bytes become U+FFFD instead of aborting the whole run.
    return uploaded_file.getvalue().decode("utf-8", errors="replace")


def get_combined_cloud(uploaded_files):
    """Draw a single word cloud built from the text of all uploaded files.

    Args:
        uploaded_files: Uploaded file objects exposing ``name`` and
            ``getvalue()`` and usable as a binary file object.

    Returns:
        None. The cloud is rendered through ``create_word_cloud``.
    """
    if not uploaded_files:
        st.warning("Please upload at least one file first.")
        return

    extracted = [_extract_text(uploaded_file) for uploaded_file in uploaded_files]
    readable_texts = [text for text in extracted if text is not None]
    # Every file was skipped; the warnings shown for them say why.
    if not readable_texts:
        return

    files_text = " ".join(readable_texts)
    create_word_cloud(files_text, "combined Word Cloud")


def get_individual_clouds(uploaded_files):
    """Draw one word cloud per uploaded file, titled with the file name.

    Args:
        uploaded_files: Uploaded file objects exposing ``name`` and
            ``getvalue()`` and usable as a binary file object.

    Returns:
        None. Each cloud is rendered through ``create_word_cloud``.
    """
    if not uploaded_files:
        st.warning("Please upload at least one file first.")
        return

    for uploaded_file in uploaded_files:
        single_file_text = _extract_text(uploaded_file)
        # None means the file was unreadable and a warning was already shown.
        if single_file_text is not None:
            create_word_cloud(single_file_text, uploaded_file.name)
# Nothing is rendered until the user presses the button.
if st.button('Generate'):
    # The "Combine Wordclouds" checkbox decides between one cloud for all
    # files and one cloud per file.
    render_clouds = get_combined_cloud if combine_wordcloud else get_individual_clouds
    render_clouds(uploaded_files)