
Commit b9216e7

Merge pull request #145 from Police-Data-Accessibility-Project/main-holder
Merge PDAP-Scraper-Setup-GUI with branch scraper-setup-script
2 parents 491d358 + efec110 commit b9216e7

File tree

15 files changed: +4997 -0 lines changed

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
import sys
import os
import requests
import json
from pathlib import Path
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time

# This is a hack that loads the root common folder like a module (without you expressly needing to install it).
# I'm going to be honest, I have no clue why it uses parents[1] while the list_pdf scrapers use parents[3].
p = Path(__file__).resolve().parents[1]
sys.path.insert(1, str(p))

# import hash_comparer, page_hasher, and page_update from common/utils/website_hasher/page_update.py
from common.utils import hash_comparer, page_hasher, page_update

# import data_parser from common/crimegraphics/utils/data_parser.py
from crimegraphics.utils import data_parser


# This function is used for gathering time stats.
def function_timer(stats):
    if stats:
        return time.perf_counter()


# This function simply calculates and prints the difference between the end and start times.
def time_dif(stats, string, start, end):
    if stats:
        print(f"{string}: {end - start} seconds")


# configs = {
#     "url": "",
#     "department_code": "",
# }

# Stats default to False.
def crimegraphics_bulletin(configs, save_dir, stats=False, configs_file=False):
    if not configs_file:  # Default setting: configs is a dictionary
        department_code = configs["department_code"]
        url = configs["url"]
    else:  # Backwards compatibility: configs is an object with attributes
        department_code = configs.department_code
        url = configs.url

    # Automatically have the BulletinMenu clicked for daily crime data.
    payload = {
        "MYAGCODE": department_code,
        "__EVENTTARGET": "MainMenu$BulletinMenu",
        "__EVENTARGUMENT": "BulletinMenu",
    }

    # Initialize the "data" table (a plain list called data, not a DataFrame).
    data = []

    print("Receiving Data... Please wait...")
    request_start = function_timer(stats)

    # Send a POST request to the url with our payload.
    response = requests.post(url, data=payload)
    request_end = function_timer(stats)
    time_dif(stats, "Request Time", request_start, request_end)

    print("Data received.")
    parse_start = function_timer(stats)

    # Parse the response using bs4.
    soup = BeautifulSoup(response.text, "html.parser")
    # with open("html.html", "wb") as output:
    #     output.write(str(soup).encode("utf-8"))
    parse_end = function_timer(stats)
    time_dif(stats, "Parse time", parse_start, parse_end)

    search_start = function_timer(stats)

    table = soup.find("span", id="Bull")
    # Send "table" to page_update to be hashed and compared against the previous run.
    page_update(table)
    search_end = function_timer(stats)
    time_dif(stats, "Search time", search_start, search_end)

    # Hand the table off to the parser.
    data_parser(configs, save_dir, table)
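A minimal usage sketch for this bulletin scraper, assuming the default dictionary-style configs. The URL, department code, and save directory below are placeholders, not real agency settings:

# Hypothetical usage sketch: all values are placeholders.
configs = {
    "url": "https://example.crimegraphics.com/2020/default.aspx",  # placeholder URL
    "department_code": "EXAMPLE_PD",                               # placeholder agency code
}

# Fetch the bulletin table, hash it for change detection, and hand it to data_parser.
crimegraphics_bulletin(configs, save_dir="./data/", stats=True)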
Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
import sys
import os
import requests
import json
from pathlib import Path
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
from datetime import date

# This is a hack that loads the root common folder like a module (without you expressly needing to install it).
# I'm going to be honest, I have no clue why it uses parents[1] while the list_pdf scrapers use parents[3].
p = Path(__file__).resolve().parents[1]
sys.path.insert(1, str(p))

# import hash_comparer, page_hasher, and page_update from common/utils/website_hasher/page_update.py
from common.utils import hash_comparer, page_hasher, page_update


# This function is used for gathering time stats.
def function_timer(stats):
    if stats:
        return time.perf_counter()


# This function simply calculates and prints the difference between the end and start times.
def time_dif(stats, string, start, end):
    if stats:
        print(f"{string}: {end - start} seconds")


# Stats default to False.
def crimegraphics_clery(configs, save_dir, stats=False, configs_file=False):
    if not configs_file:  # Default setting: configs is a dictionary
        department_code = configs["department_code"]
        url = configs["url"]
        list_header = configs["list_header"]
    else:  # Backwards compatibility: configs is an object with attributes
        department_code = configs.department_code
        url = configs.url
        list_header = configs.list_header

    # Automatically have the CLERYMenu clicked for daily crime data.
    payload = {
        "MYAGCODE": department_code,
        "__EVENTTARGET": "MainMenu$CLERYMenu",
        "__EVENTARGUMENT": "CLERYMenu",
    }

    # Initialize the "data" table (a plain list of rows called data, not a DataFrame).
    data = []

    print("Receiving Data... Please wait...")

    # Used for stats; mark the beginning of the request.
    request_start = function_timer(stats)

    # Send a POST request to the url with our payload.
    response = requests.post(url, data=payload)
    request_end = function_timer(stats)
    time_dif(stats, "Request Time", request_start, request_end)

    print("Data received.")
    parse_start = function_timer(stats)

    # Parse the response using bs4.
    soup = BeautifulSoup(response.text, "html.parser")
    parse_end = function_timer(stats)
    time_dif(stats, "Parse time", parse_start, parse_end)

    search_start = function_timer(stats)
    # This website has a bunch of empty tables with the same class name;
    # the table at index 6 has the data we need.
    table = soup.find_all("table", {"class": "ob_gBody"})[6]
    search_end = function_timer(stats)
    time_dif(stats, "Search time", search_start, search_end)

    hash_start = function_timer(stats)
    # Check if the page has been updated.
    page_update(table)
    hash_end = function_timer(stats)
    time_dif(stats, "Hash time", hash_start, hash_end)

    # Use BeautifulSoup4 (bs4)'s find_all method to find all html table rows (tr).
    rows = table.find_all("tr")
    for row in tqdm(rows):
        # Use find_all again to grab every table data cell (td) in the row.
        td = row.find_all("td")
        table_data = []
        for actual_data in td:
            table_data.append(actual_data.get_text())
        data.append(table_data)

    date_name = date.today()
    file_name = "_" + str(date_name).replace("-", "_")  # + "_"

    dataframe = pd.DataFrame(data=data, columns=list_header)

    dataframe.to_csv(save_dir + department_code + file_name + "_daily_bulletin")
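A hedged usage sketch for the CLERY scraper with placeholder values. The column names in list_header are illustrative only; they become the CSV header, so they should match the number of cells in each table row on the target page:

# Hypothetical usage sketch: all values are placeholders.
configs = {
    "url": "https://example.crimegraphics.com/2020/default.aspx",  # placeholder URL
    "department_code": "EXAMPLE_PD",                               # placeholder agency code
    "list_header": ["Case", "Date", "Offense", "Location", "Disposition"],  # placeholder columns
}

# Scrape the CLERY table and write <save_dir><department_code>_<YYYY_MM_DD>_daily_bulletin as CSV.
crimegraphics_clery(configs, save_dir="./data/", stats=True)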
Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
import requests
import os
from bs4 import BeautifulSoup
import urllib
import re
import time
import sys
from pathlib import Path

# This is a hack that basically loads the root common folder like a module (without you expressly needing to install it).
p = Path(__file__).resolve().parents[3]
sys.path.insert(1, str(p))
from common.utils import get_files
from common.utils import extract_info

"""
configs = {
    "webpage": "",
    "web_path": "",
    "domain_included": "",
    "domain": "",
    "sleep_time": "",
    "non_important": "",
    "debug": "",
    "csv_dir": "",
}
"""


def list_pdf_v2(
    configs,
    save_dir,
    name_in_url=True,
    extract_name=False,
    add_date=False,
    try_overwrite=False,
    no_overwrite=False,
    debug=False,
    flavor="stream",
    extract_tables=False,
    configs_file=False,
):
    # If save_dir does not exist, make the directory.
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Check added for backwards compatibility.
    if not configs_file:  # Default setting: configs is a dictionary
        webpage = configs["webpage"]
        sleep_time = configs["sleep_time"]
        if extract_tables:
            try:
                csv_dir = configs["csv_dir"]
            except KeyError:
                pass
    else:  # configs is an object with attributes
        webpage = configs.webpage
        sleep_time = configs.sleep_time
        if extract_tables:
            try:
                csv_dir = configs.csv_dir
            except AttributeError:
                pass

    # Use python's requests module to fetch the webpage as plain html.
    html_page = requests.get(webpage).text

    # Use BeautifulSoup4 (bs4) to parse the returned html_page with its html parser (html.parser).
    soup = BeautifulSoup(html_page, "html.parser")

    # Initialize the url_name table.
    url_name = []

    # Attempt to remove any residual url_name.txt file, as we will want to create a new clean version.
    try:
        os.remove("url_name.txt")
    except FileNotFoundError:
        # If os.remove raises FileNotFoundError, handle the error by continuing.
        pass

    print(" [*] Extracting info...")

    # The following two functions are imported from ./common/utils/list_pdf_utils/
    # Send soup, the configs, and the setting of extract_name to the extract_info module.
    extract_info(
        soup, configs, extract_name=extract_name, configs_file=configs_file, debug=debug
    )

    # Pass save_dir, the sleep_time from the configs, and the name_in_url and add_date settings.
    get_files(save_dir, sleep_time, name_in_url=name_in_url, add_date=add_date)

    # This imports etl for Eric to do his magic.
    import etl

    # This section of code only runs if extract_tables is True.
    if extract_tables:
        # Import the pdf_extract module from ./common/etl/data_extraction.py
        from common.etl import pdf_extract

        try:
            # Pass save_dir to pdf_extract's pdf_directory param, and retrieve csv_dir from the configs.
            pdf_extract(save_dir, csv_dir)

        except NameError:
            # This will happen if csv_dir was not defined in the configs.
            if debug:
                # Because I hate having tons of stuff printed in my terminal, this will only print if debug=True (set when calling list_pdf_v2).
                print(" [INFO] csv_dir is not defined in the configs.")
                print("        If you want to save in a different location for some reason,")
                print('        define it in the configs as `csv_dir="<folder>"`')
            # Call pdf_extract again, this time without passing csv_dir to it.
            pdf_extract(pdf_directory=save_dir, flavor=flavor)

# Honestly not sure why this is down here, but there is probably a reason.
# import etl.py
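A hedged usage sketch under the dictionary-style configs shown in the docstring template above. Every value below is a placeholder, and the exact keys extract_info expects may differ, so treat this as illustration only:

# Hypothetical usage sketch: all values are placeholders.
configs = {
    "webpage": "https://example.gov/police/reports",  # placeholder page listing the PDFs
    "web_path": "",                                    # placeholder
    "domain_included": "",                             # placeholder
    "domain": "",                                      # placeholder
    "sleep_time": 2,                                   # seconds to wait between downloads
    "non_important": "",                               # placeholder
    "debug": "",                                       # placeholder
    "csv_dir": "./csv/",                               # only used when extract_tables=True
}

# Download the linked PDFs into ./pdfs/ and, with extract_tables=True, pull their tables out as CSVs.
list_pdf_v2(configs, save_dir="./pdfs/", extract_tables=True, debug=True)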
