
Commit b9216e7

Merge pull request #145 from Police-Data-Accessibility-Project/main-holder
Merge PDAP-Scraper-Setup-GUI with branch scraper-setup-script
2 parents 491d358 + efec110 commit b9216e7

File tree

15 files changed: +4997 -0 lines changed

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
import sys
import os
import requests
import json
from pathlib import Path
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time

# This is a hack that loads the root common folder like a module (without you expressly needing to install it).
# I'm going to be honest, I have no clue why it uses parents[1] while the list_pdf scrapers use parents[3].
p = Path(__file__).resolve().parents[1]
sys.path.insert(1, str(p))

# import hash_comparer, page_hasher, and page_update from common/utils/website_hasher/page_update.py
from common.utils import hash_comparer, page_hasher, page_update

# import data_parser from common/crimegraphics/utils/data_parser.py
from crimegraphics.utils import data_parser


# This function is used for gathering time stats.
def function_timer(stats):
    if stats:
        return time.perf_counter()


# This function simply calculates and prints the difference between the end and start times.
def time_dif(stats, string, start, end):
    if stats:
        print(f"{string}: {end - start} seconds")


# configs = {
#     "url": "",
#     "department_code": "",
# }

# Stats default to False.
def crimegraphics_bulletin(configs, save_dir, stats=False, configs_file=False):
    if not configs_file:  # Default setting: configs is a dictionary
        department_code = configs["department_code"]
        url = configs["url"]
    else:  # Backwards compatibility: configs is an object with attributes
        department_code = configs.department_code
        url = configs.url

    # Automatically have the BulletinMenu clicked for daily crime data.
    payload = {
        "MYAGCODE": department_code,
        "__EVENTTARGET": "MainMenu$BulletinMenu",
        "__EVENTARGUMENT": "BulletinMenu",
    }

    # Initialize the "data" table (a plain list called data, not a DataFrame).
    data = []

    print("Receiving Data... Please wait...")
    request_start = function_timer(stats)

    # Send a POST request to the url with our payload.
    response = requests.post(url, data=payload)
    request_end = function_timer(stats)
    time_dif(stats, "Request Time", request_start, request_end)

    print("Data received.")
    parse_start = function_timer(stats)

    # Parse the response using bs4.
    soup = BeautifulSoup(response.text, "html.parser")
    # with open("html.html", "wb") as output:
    #     output.write(str(soup).encode("utf-8"))
    parse_end = function_timer(stats)
    time_dif(stats, "Parse time", parse_start, parse_end)

    search_start = function_timer(stats)

    table = soup.find("span", id="Bull")
    # Send "table" to page_update to be hashed and compared against the previous run.
    page_update(table)
    search_end = function_timer(stats)
    time_dif(stats, "Search time", search_start, search_end)

    # Hand the table off to the parser.
    data_parser(configs, save_dir, table)
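A minimal usage sketch for this bulletin scraper, assuming the default dictionary-style configs. The URL, department code, and save directory below are placeholders, not real agency settings:

# Hypothetical usage sketch: all values are placeholders.
configs = {
    "url": "https://example.crimegraphics.com/2020/default.aspx",  # placeholder URL
    "department_code": "EXAMPLE_PD",                               # placeholder agency code
}

# Fetch the bulletin table, hash it for change detection, and hand it to data_parser.
crimegraphics_bulletin(configs, save_dir="./data/", stats=True)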
Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
import sys
import os
import requests
import json
from pathlib import Path
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
from datetime import date

# This is a hack that loads the root common folder like a module (without you expressly needing to install it).
# I'm going to be honest, I have no clue why it uses parents[1] while the list_pdf scrapers use parents[3].
p = Path(__file__).resolve().parents[1]
sys.path.insert(1, str(p))

# import hash_comparer, page_hasher, and page_update from common/utils/website_hasher/page_update.py
from common.utils import hash_comparer, page_hasher, page_update


# This function is used for gathering time stats.
def function_timer(stats):
    if stats:
        return time.perf_counter()


# This function simply calculates and prints the difference between the end and start times.
def time_dif(stats, string, start, end):
    if stats:
        print(f"{string}: {end - start} seconds")


# Stats default to False.
def crimegraphics_clery(configs, save_dir, stats=False, configs_file=False):
    if not configs_file:  # Default setting: configs is a dictionary
        department_code = configs["department_code"]
        url = configs["url"]
        list_header = configs["list_header"]
    else:  # Backwards compatibility: configs is an object with attributes
        department_code = configs.department_code
        url = configs.url
        list_header = configs.list_header

    # Automatically have the CLERYMenu clicked for daily crime data.
    payload = {
        "MYAGCODE": department_code,
        "__EVENTTARGET": "MainMenu$CLERYMenu",
        "__EVENTARGUMENT": "CLERYMenu",
    }

    # Initialize the "data" table (a plain list of rows called data, not a DataFrame).
    data = []

    print("Receiving Data... Please wait...")

    # Used for stats; mark the beginning of the request.
    request_start = function_timer(stats)

    # Send a POST request to the url with our payload.
    response = requests.post(url, data=payload)
    request_end = function_timer(stats)
    time_dif(stats, "Request Time", request_start, request_end)

    print("Data received.")
    parse_start = function_timer(stats)

    # Parse the response using bs4.
    soup = BeautifulSoup(response.text, "html.parser")
    parse_end = function_timer(stats)
    time_dif(stats, "Parse time", parse_start, parse_end)

    search_start = function_timer(stats)
    # This website has a bunch of empty tables with the same class name;
    # the table at index 6 has the data we need.
    table = soup.find_all("table", {"class": "ob_gBody"})[6]
    search_end = function_timer(stats)
    time_dif(stats, "Search time", search_start, search_end)

    hash_start = function_timer(stats)
    # Check if the page has been updated.
    page_update(table)
    hash_end = function_timer(stats)
    time_dif(stats, "Hash time", hash_start, hash_end)

    # Use BeautifulSoup4 (bs4)'s find_all method to find all html table rows (tr).
    rows = table.find_all("tr")
    for row in tqdm(rows):
        # Use find_all again to grab every table data cell (td) in the row.
        td = row.find_all("td")
        table_data = []
        for actual_data in td:
            table_data.append(actual_data.get_text())
        data.append(table_data)

    date_name = date.today()
    file_name = "_" + str(date_name).replace("-", "_")  # + "_"

    dataframe = pd.DataFrame(data=data, columns=list_header)

    dataframe.to_csv(save_dir + department_code + file_name + "_daily_bulletin")
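A hedged usage sketch for the CLERY scraper with placeholder values. The column names in list_header are illustrative only; they become the CSV header, so they should match the number of cells in each table row on the target page:

# Hypothetical usage sketch: all values are placeholders.
configs = {
    "url": "https://example.crimegraphics.com/2020/default.aspx",  # placeholder URL
    "department_code": "EXAMPLE_PD",                               # placeholder agency code
    "list_header": ["Case", "Date", "Offense", "Location", "Disposition"],  # placeholder columns
}

# Scrape the CLERY table and write <save_dir><department_code>_<YYYY_MM_DD>_daily_bulletin as CSV.
crimegraphics_clery(configs, save_dir="./data/", stats=True)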
Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
import requests
import os
from bs4 import BeautifulSoup
import urllib
import re
import time
import sys
from pathlib import Path

# This is a hack that basically loads the root common folder like a module (without you expressly needing to install it).
p = Path(__file__).resolve().parents[3]
sys.path.insert(1, str(p))
from common.utils import get_files
from common.utils import extract_info

"""
configs = {
    "webpage": "",
    "web_path": "",
    "domain_included": "",
    "domain": "",
    "sleep_time": "",
    "non_important": "",
    "debug": "",
    "csv_dir": "",
}
"""


def list_pdf_v2(
    configs,
    save_dir,
    name_in_url=True,
    extract_name=False,
    add_date=False,
    try_overwrite=False,
    no_overwrite=False,
    debug=False,
    flavor="stream",
    extract_tables=False,
    configs_file=False,
):
    # If save_dir does not exist, make the directory.
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Check added for backwards compatibility.
    if not configs_file:  # Default setting: configs is a dictionary
        webpage = configs["webpage"]
        sleep_time = configs["sleep_time"]
        if extract_tables:
            try:
                csv_dir = configs["csv_dir"]
            except KeyError:
                pass
    else:  # configs is an object with attributes
        webpage = configs.webpage
        sleep_time = configs.sleep_time
        if extract_tables:
            try:
                csv_dir = configs.csv_dir
            except AttributeError:
                pass

    # Use python's requests module to fetch the webpage as plain html.
    html_page = requests.get(webpage).text

    # Use BeautifulSoup4 (bs4) to parse the returned html_page with its html parser (html.parser).
    soup = BeautifulSoup(html_page, "html.parser")

    # Initialize the url_name table.
    url_name = []

    # Attempt to remove any residual url_name.txt file, as we will want to create a new clean version.
    try:
        os.remove("url_name.txt")
    except FileNotFoundError:
        # If os.remove raises FileNotFoundError, handle the error by continuing.
        pass

    print(" [*] Extracting info...")

    # The following two functions are imported from ./common/utils/list_pdf_utils/
    # Send soup, the configs, and the setting of extract_name to the extract_info module.
    extract_info(
        soup, configs, extract_name=extract_name, configs_file=configs_file, debug=debug
    )

    # Pass save_dir, the sleep_time from the configs, and the name_in_url and add_date settings.
    get_files(save_dir, sleep_time, name_in_url=name_in_url, add_date=add_date)

    # This imports etl for Eric to do his magic.
    import etl

    # This section of code only runs if extract_tables is True.
    if extract_tables:
        # Import the pdf_extract module from ./common/etl/data_extraction.py
        from common.etl import pdf_extract

        try:
            # Pass save_dir to pdf_extract's pdf_directory param, and retrieve csv_dir from the configs.
            pdf_extract(save_dir, csv_dir)

        except NameError:
            # This will happen if csv_dir was not defined in the configs.
            if debug:
                # Because I hate having tons of stuff printed in my terminal, this will only print if debug=True (set when calling list_pdf_v2).
                print(" [INFO] csv_dir is not defined in the configs.")
                print("        If you want to save in a different location for some reason,")
                print('        define it in the configs as `csv_dir="<folder>"`')
            # Call pdf_extract again, this time without passing csv_dir to it.
            pdf_extract(pdf_directory=save_dir, flavor=flavor)

# Honestly not sure why this is down here, but there is probably a reason.
# import etl.py
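A hedged usage sketch under the dictionary-style configs shown in the docstring template above. Every value below is a placeholder, and the exact keys extract_info expects may differ, so treat this as illustration only:

# Hypothetical usage sketch: all values are placeholders.
configs = {
    "webpage": "https://example.gov/police/reports",  # placeholder page listing the PDFs
    "web_path": "",                                    # placeholder
    "domain_included": "",                             # placeholder
    "domain": "",                                      # placeholder
    "sleep_time": 2,                                   # seconds to wait between downloads
    "non_important": "",                               # placeholder
    "debug": "",                                       # placeholder
    "csv_dir": "./csv/",                               # only used when extract_tables=True
}

# Download the linked PDFs into ./pdfs/ and, with extract_tables=True, pull their tables out as CSVs.
list_pdf_v2(configs, save_dir="./pdfs/", extract_tables=True, debug=True)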
