Commit 47e52f5

Accidentally used the common scripts instead of the base_scripts
1 parent b9216e7 commit 47e52f5
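
Both files previously carried a full inline copy of the CrimeGraphics scraping pipeline (request, parse, hash-compare, export). This commit reduces each one to a thin, config-driven runner that imports the shared functions from common.base_scrapers; hedged sketches of the resulting usage follow each diff below.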

File tree

2 files changed: +36 -174 lines
Lines changed: 12 additions & 78 deletions

@@ -1,87 +1,21 @@
 import sys
 import os
-import requests
-import json
-from pathlib import Path
-from bs4 import BeautifulSoup
-import pandas as pd
-from tqdm import tqdm
-import time
+import CG_configs as configs
 from pathlib import Path
 
-# This is a hack that loads that root common folder like a module (without you expressly needing to install it).
-# I'm going to be honest, I have no clue why it uses parents[1] while the list_pdf scrapesr use parents[3]
-p = Path(__file__).resolve().parents[1]
+p = Path(__file__).resolve().parents[5]
 sys.path.insert(1, str(p))
+from common.base_scrapers import crimegraphics_bulletin
 
-# import hash_comparer, page_hasher, and page_update from common/utils/website_hasher/page_update.py
-from common.utils import hash_comparer, page_hasher, page_update
-
-# import data_parser from common/crimegraphics/utils/data_parser.py
-from crimegraphics.utils import data_parser
-
-# this function is used for gathering time stats
-def function_timer(stats):
-    if stats != False:
-        return time.perf_counter()
-
-
-# this function simply calculates and prints the difference between the end and start times
-def time_dif(stats, string, start, end):
-    if stats != False:
-        print(f"{string}: {end - start} seconds")
-
-
-# configs = {
-#     "url": "",
-#     "department_code": "",
-# }
-
-# Stats default to False
-def crimegraphics_bulletin(configs, save_dir, stats=False, configs_file=False):
-    if not configs_file:  # Default setting
-        department_code = configs["department_code"]
-        url = configs["url"]
-    else:
-        department_code = configs.department_code
-        url = configs.url
-
-    # Automatically have the CLERYMenu clicked for daily crime data
-    payload = {
-        "MYAGCODE": configs.department_code,
-        "__EVENTTARGET": "MainMenu$BulletinMenu",
-        "__EVENTARGUMENT": "BulletinMenu",
-    }
-
-    # Initialize "data" table (a table called data, not a datatable)
-    data = []
-
-    print("Receiving Data... Please wait...")
-    request_start = function_timer(stats)
-
-    # Send a POST request to the url with our headers
-    response = requests.request("POST", configs.url, data=payload)
-    request_end = function_timer(stats)
-    time_dif(stats, "Request Time", request_start, request_end)
-
-    print("Data received.")
-    parse_start = function_timer(stats)
-
-    # Parse the response using bs4
-    soup = BeautifulSoup(response.text, "html.parser")
-    # with open("html.html", 'wb') as output:
-    #     output.write(str(soup).encode('utf-8'))
-    #     output.close()
-    parse_end = function_timer(stats)
-    time_dif(stats, "Parse time", parse_start, parse_end)
+configs = {
+    "url": "",
+    "department_code": "",
+}
 
-    search_start = function_timer(stats)
+save_dir = "./data/"
+data = []
 
-    table = soup.find("span", id="Bull")
-    # Send "table" to page_update to be hashed and compared.
-    page_update(table)
-    search_end = function_timer(stats)
-    time_dif(stats, "Search time", search_start, search_end)
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)
 
-    # Import the parser
-    data_parser(configs, save_dir, table)
+crimegraphics_bulletin(configs, save_dir)
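
After this change the bulletin script is just configuration plus a call into the shared scraper. Here is a minimal sketch of the runner with values filled in; the URL and department code are hypothetical placeholders (the committed file leaves both blank), and parents[5] assumes the runner's actual nesting depth in this repo:

import os
import sys
from pathlib import Path

# Put the repository root on sys.path so common.base_scrapers resolves;
# parents[5] reflects how deeply this runner sits in the repo layout.
p = Path(__file__).resolve().parents[5]
sys.path.insert(1, str(p))
from common.base_scrapers import crimegraphics_bulletin

# Hypothetical example values -- the committed file leaves both blank.
configs = {
    "url": "https://example.crimegraphics.com/2013/default.aspx",
    "department_code": "EXPD",
}

# Create the output directory before handing off to the shared scraper.
save_dir = "./data/"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

crimegraphics_bulletin(configs, save_dir)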
Lines changed: 24 additions & 96 deletions

@@ -1,101 +1,29 @@
 import sys
 import os
-import requests
-import json
-from pathlib import Path
-from bs4 import BeautifulSoup
-import pandas as pd
-from tqdm import tqdm
-import time
-from datetime import date
+import CG_configs as configs
 from pathlib import Path
 
-# This is a hack that loads that root common folder like a module (without you expressly needing to install it).
-# I'm going to be honest, I have no clue why it uses parents[1] while the list_pdf scrapesr use parents[3]
-p = Path(__file__).resolve().parents[1]
+p = Path(__file__).resolve().parents[5]
 sys.path.insert(1, str(p))
-
-# import hash_comparer, page_hasher, and page_update from common/utils/website_hasher/page_update.py
-from common.utils import hash_comparer, page_hasher, page_update
-
-# this function is used for gathering time stats
-def function_timer(stats):
-    if stats != False:
-        return time.perf_counter()
-
-
-# this function simply calculates and prints the difference between the end and start times
-def time_dif(stats, string, start, end):
-    if stats != False:
-        print(f"{string}: {end - start} seconds")
-
-
-# stats default to False
-def crimegraphics_clery(configs, save_dir, stats=False, configs_file=False):
-    if not configs_file:  # Default setting
-        department_code = configs["department_code"]
-        url = configs["url"]
-        list_header = configs["list_header"]
-    else:
-        department_code = configs.department_code
-        url = configs.url
-        list_header = configs.list_header
-
-    # automatically have the CLERYMenu clicked for daily crime data
-    payload = {
-        "MYAGCODE": configs.department_code,
-        "__EVENTTARGET": "MainMenu$CLERYMenu",
-        "__EVENTARGUMENT": "CLERYMenu",
-    }
-
-    # initialize "data" table (a table called data, not a datatable)
-    data = []
-
-    print("Receiving Data... Please wait...")
-
-    # used for stats, mark beginning of request
-    request_start = function_timer(stats)
-
-    # Send a POST request to the url with our headers
-    response = requests.request("POST", configs.url, data=payload)
-    request_end = function_timer(stats)
-    time_dif(stats, "Request Time", request_start, request_end)
-
-    print("Data received.")
-    parse_start = function_timer(stats)
-
-    # Parse the response using bs4
-    soup = BeautifulSoup(response.text, "html.parser")
-    parse_end = function_timer(stats)
-    time_dif(stats, "Parse time", parse_start, parse_end)
-
-    search_start = function_timer(stats)
-    # this website has a bunch of empty tables with the same name
-    # the 6th index has the data we need
-    table = soup.find_all("table", {"class": "ob_gBody"})[6]
-    search_end = function_timer(stats)
-    time_dif(stats, "Search time", search_start, search_end)
-
-    hash_start = function_timer(stats)
-    # Checks if the page has been updated
-    page_update(table)
-
-    hash_end = function_timer(stats)
-    time_dif(stats, "Hash time", hash_start, hash_end)
-
-    # Use BeautifulSoup4 (bs4)'s find_all method to find all html table rows (tr)
-    rows = table.find_all("tr")
-    for row in tqdm(rows):
-        # Use BeautifulSoup4 (bs4)'s find_all method to find all html tags for table data (td)
-        td = row.find_all("td")
-        table_data = []
-        for actual_data in td:
-            table_data.append(actual_data.get_text())
-        data.append(table_data)
-
-    date_name = date.today()
-    file_name = "_" + str(date_name).replace("-", "_")  # + "_"
-
-    dataframe = pd.DataFrame(data=data, columns=configs.list_header)
-
-    dataframe.to_csv(save_dir + configs.department_code + file_name + "_daily_bulletin")
+from common.base_scrapers import crimegraphics_scraper
+
+configs = {
+    "url": "",
+    "department_code": "",
+    "list_header": [
+        "ChargeDescription",
+        "CaseNum",
+        "ReportDate",
+        "OffenseDate",
+        "Location",
+        "ChargeDisposition",
+    ],
+}
+
+save_dir = "./data/"
+data = []
+
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)
+
+crimegraphics_scraper(configs, save_dir)
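
The inline code removed here is effectively the body of the shared scraper the runner now calls. For reference, a sketch of what common.base_scrapers.crimegraphics_scraper presumably does, reconstructed from the deleted lines above and adapted to the plain config dict the new runner passes; the actual shared implementation may differ, and the stats/timing plumbing is omitted:

from datetime import date

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from common.utils import page_update  # repo-specific hash-compare helper


def crimegraphics_scraper(configs, save_dir):
    # POST with the CLERYMenu event target so the site responds with the
    # daily crime table instead of the landing page.
    payload = {
        "MYAGCODE": configs["department_code"],
        "__EVENTTARGET": "MainMenu$CLERYMenu",
        "__EVENTARGUMENT": "CLERYMenu",
    }
    response = requests.post(configs["url"], data=payload)

    soup = BeautifulSoup(response.text, "html.parser")
    # The page repeats several empty tables with this class; index 6 holds the data.
    table = soup.find_all("table", {"class": "ob_gBody"})[6]
    page_update(table)  # hash the table and compare against the stored hash

    # Flatten each table row into a list of cell strings.
    data = []
    for row in tqdm(table.find_all("tr")):
        data.append([td.get_text() for td in row.find_all("td")])

    # Write out e.g. ./data/EXPD_2024_01_31_daily_bulletin.
    file_name = "_" + str(date.today()).replace("-", "_")
    dataframe = pd.DataFrame(data=data, columns=configs["list_header"])
    dataframe.to_csv(save_dir + configs["department_code"] + file_name + "_daily_bulletin")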
