
Commit 47e9836

add main fn with CLI; remove date range params in package frontend fn stubs
1 parent a44ad10

3 files changed: +189 -42 lines changed

src/acquisition/rvdss/constants.py

Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,5 @@
+from datetime import datetime
+
 # The dataset calls the same viruses, provinces, regions (province groups),
 # and country by multiple names. Map each of those to a common abbreviation.
 VIRUSES = {
@@ -104,3 +106,5 @@
 POSITIVE_TESTS_OUTPUT_FILE = "positive_tests.csv"
 
 LAST_WEEK_OF_YEAR = 35
+
+NOW = datetime.now()

src/acquisition/rvdss/database.py

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
+"""
+===============
+=== Purpose ===
+===============
+
+Stores data from RVDSS, which contains flu lab test results.
+See: rvdss.py
+
+
+=======================
+=== Data Dictionary ===
+=======================
+
+`rvdss` is the table where RVDSS data is stored.
++----------+-------------+------+-----+---------+----------------+
+| Field    | Type        | Null | Key | Default | Extra          |
++----------+-------------+------+-----+---------+----------------+
+| id       | int(11)     | NO   | PRI | NULL    | auto_increment |
+| location | varchar(8)  | NO   | MUL | NULL    |                |
+| epiweek  | int(11)     | NO   | MUL | NULL    |                |
+| value    | float       | NO   |     | NULL    |                |
++----------+-------------+------+-----+---------+----------------+
+id: unique identifier for each record
+location: hhs1-10
+epiweek: the epiweek during which the queries were executed
+value: number of total test records per facility, within each epiweek
+
+=================
+=== Changelog ===
+=================
+2017-12-14:
+  * add "need update" check
+
+2017-12-02:
+  * original version
+"""
+
+# standard library
+import argparse
+
+# third party
+import mysql.connector
+
+# first party
+from delphi.epidata.acquisition.rvdss import rvdss
+import delphi.operations.secrets as secrets
+from delphi.utils.epidate import EpiDate
+import delphi.utils.epiweek as flu
+from delphi.utils.geo.locations import Locations
+
+LOCATIONS = Locations.hhs_list
+DATAPATH = "/home/automation/rvdss_data"
+
+
+def update(locations, first=None, last=None, force_update=False, load_email=True):
+    # download and prepare data first
+    qd = rvdss.rvdssData(DATAPATH, load_email)
+    if not qd.need_update and not force_update:
+        print("Data not updated, nothing to do.")
+        return
+
+    qd_data = qd.load_csv()
+    qd_measurements = qd.prepare_measurements(qd_data, start_weekday=4)
+    qd_ts = rvdss.measurement_to_ts(qd_measurements, 7, startweek=first, endweek=last)
+    # connect to the database
+    u, p = secrets.db.epi
+    cnx = mysql.connector.connect(user=u, password=p, database="epidata")
+    cur = cnx.cursor()
+
+    def get_num_rows():
+        cur.execute("SELECT count(1) `num` FROM `rvdss`")
+        for (num,) in cur:
+            pass
+        return num
+
+    # check from 4 weeks preceding the last week with data through this week
+    cur.execute("SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `rvdss`")
+    for (ew0, ew1) in cur:
+        ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4)
+    ew0 = ew0 if first is None else first
+    ew1 = ew1 if last is None else last
+    print(f"Checking epiweeks between {int(ew0)} and {int(ew1)}...")
+
+    # keep track of how many rows were added
+    rows_before = get_num_rows()
+
+    # check rvdss for new and/or revised data
+    sql = """
+    INSERT INTO
+      `rvdss` (`location`, `epiweek`, `value`)
+    VALUES
+      (%s, %s, %s)
+    ON DUPLICATE KEY UPDATE
+      `value` = %s
+    """
+
+    total_rows = 0
+
+    for location in locations:
+        if location not in qd_ts:
+            continue
+        ews = sorted(qd_ts[location].keys())
+        num_missing = 0
+        for ew in ews:
+            v = qd_ts[location][ew]
+            sql_data = (location, ew, v, v)
+            cur.execute(sql, sql_data)
+            total_rows += 1
+            if v == 0:
+                num_missing += 1
+        if num_missing > 0:
+            print(f" [{location}] missing {int(num_missing)}/{len(ews)} value(s)")
+
+    # keep track of how many rows were added
+    rows_after = get_num_rows()
+    print(f"Inserted {int(rows_after - rows_before)}/{int(total_rows)} row(s)")
+
+    # cleanup
+    cur.close()
+    cnx.commit()
+    cnx.close()
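
The upsert in update() only revises an existing row when the inserted (location, epiweek) pair collides with an existing key. The table definition is not part of this commit; below is a minimal sketch of DDL consistent with the data dictionary above, where the composite UNIQUE key is an assumption inferred from the ON DUPLICATE KEY UPDATE clause (without such a key, re-runs would insert duplicate rows instead of updating values).

# Hypothetical schema sketch (not part of this commit), consistent with the
# data dictionary in database.py. The UNIQUE key on (location, epiweek) is
# an assumption inferred from the upsert in update().
CREATE_RVDSS_TABLE = """
  CREATE TABLE `rvdss` (
    `id` int(11) NOT NULL AUTO_INCREMENT,
    `location` varchar(8) NOT NULL,
    `epiweek` int(11) NOT NULL,
    `value` float NOT NULL,
    PRIMARY KEY (`id`),
    UNIQUE KEY `location` (`location`, `epiweek`),
    KEY `epiweek` (`epiweek`)
  )
"""

With such a schema in place, a hypothetical invocation using the names defined above would be update(LOCATIONS, first=202401, last=202435, force_update=True).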

src/acquisition/rvdss/run.py

Lines changed: 64 additions & 42 deletions
@@ -1,75 +1,97 @@
-# TODO: this is pseudocode and may not run or may not run correctly
-
+import argparse
+
 import pandas as pd
 
+
 def fetch_report_urls(season):
     """Get all report URLs from a season's report index page"""
     pass
 
-# TODO: consider how to encode a "season" object, maybe as a tuple of start/end years `(2023, 2024)`, or a string `2023-2024`.
+## TODO: consider how to encode a "season" object, maybe as a tuple of start/end years `(2023, 2024)`, or a string `2023-2024`.
+## TODO: I think there's already a fn for this that includes the loop and seasons
 def fetch_one_season_from_report(season):
     report_urls = fetch_report_urls(season)
     df_list = [fetch_one_report(url) for url in report_urls]
     df = pd.concat(df_list)
 
     return df
 
-def fetch_one_dashboard(url = None):
+def fetch_dashboard_data(url=None):
     """Get data from current or archived dashboard"""
-    # If no url is provided, fetch data from the current dashboard (whose URL is static).
-    if not url:
-        url = DEFAULT_DASHBOARD_URL
-
-    # TODO: put rest of scraping code in here
     pass
 
-def fetch_report_data(start_date, end_date):
-    included_seasons = compute_seasons_in_range(start_date, end_date)
+def fetch_report_data():
+    seasons = [...]
 
-    # Fetch all reports made for each season.
-    # We do this because fetching reports is pretty fast, and it saves us from
-    # having to parse either URLs or text on the webpage. We will drop data
-    # outside the requested range later.
-    df_list = [fetch_one_season_from_report(season) for season in included_seasons]
+    # Fetch all reports made for all seasons.
+    ## TODO: I think there's already a fn for this that includes the loop and seasons
+    df_list = [fetch_one_season_from_report(season) for season in seasons]
     df = pd.concat(df_list)
 
-    # Only keep data that was issued within the requested date range.
-    df = df[start_date <= df.issue <= end_date]
-
     return df
 
-def fetch_historical_dashboard_data(start_date, end_date):
-    included_weeks = compute_weeks_in_range(start_date, end_date)
-    included_report_urls = construct_archived_dashboard_urls(included_weeks)
-
-    df_list = [fetch_one_dashboard(url) for url in included_report_urls]
+def fetch_historical_dashboard_data():
+    included_report_urls = fetch_archived_dashboard_urls()
+    df_list = [fetch_dashboard_data(url) for url in included_report_urls]
     df = pd.concat(df_list)
 
     return df
 
-def fetch_historical_dashboard_data(start_date, end_date):
-    create all historical_dashboard_urls included in date range
-    loop over urls:
-        fetch_dashboard_data(historical_dashboard_url)
-
-    included_seasons = compute_seasons_in_range(start_date, end_date)
-    df_list = [fetch_one_season_from_report(season) for season in included_seasons]
-    df = pd.concat(df_list)
-    df = df[start_date <= df.issue <= end_date]
-
-    return df
+# NOTE: pseudocode sketch of the historical-dashboard flow, commented out so
+# it does not shadow the fetch_historical_dashboard_data definition above:
+#     create/scrape all historical_dashboard_urls
+#     loop over urls:
+#         fetch_dashboard_data(historical_dashboard_url)
+#     df = pd.concat(df_list)
+#     return df
 
 def fetch_current_dashboard_data():
-    fetch_dashboard_data(current_dashboard_url)
-
-def fetch_data(start_date, end_date):
-    if (start_date, end_date) not exist:
-        data = fetch_current_dashboard_data()
-    else:
-        early_range, late_range = split_date_range_by_dashboard_release_date(start_date, end_date)
-        report_data = fetch_report_data(early_range)
-        dashboard_data = fetch_historical_dashboard_data(late_range)
-
-        data = [report_data, dashboard_data].concat()
-
-        return data
+    return fetch_dashboard_data(DEFAULT_DASHBOARD_URL)
+
+def update_current_data():
+    data = fetch_current_dashboard_data()
+    update_database(data)
+
+def update_historical_data():
+    report_data = fetch_report_data()
+    dashboard_data = fetch_historical_dashboard_data()
+
+    data = pd.concat([report_data, dashboard_data])
+
+    update_database(data)
+
+
+def main():
+    # args and usage
+    parser = argparse.ArgumentParser()
+    # fmt: off
+    parser.add_argument(
+        "--current",
+        "-c",
+        action="store_true",
+        help="fetch current data, that is, data for the latest epiweek"
+    )
+    parser.add_argument(
+        "--historical",
+        "-H",
+        action="store_true",
+        help="fetch historical data, that is, data for all available time periods other than the latest epiweek"
+    )
+    # fmt: on
+    args = parser.parse_args()
+
+    current_flag, historical_flag = (
+        args.current,
+        args.historical,
+    )
+    if not current_flag and not historical_flag:
+        raise Exception("no data was requested")
+
+    # Decide what to update
+    if current_flag:
+        update_current_data()
+    if historical_flag:
+        update_historical_data()
+
+
+if __name__ == "__main__":
+    main()
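
The season-encoding TODO above is still open. One possibility, purely as an illustration (none of these names exist in the package), is a small named tuple that covers both suggested forms: it behaves as the tuple of start/end years, and its string form matches the "2023-2024" style.

# Hypothetical helper for the season-encoding TODO; not part of this commit.
from typing import NamedTuple

class Season(NamedTuple):
    start_year: int
    end_year: int

    def __str__(self):
        # the string form suggested in the TODO, e.g. "2023-2024"
        return f"{self.start_year}-{self.end_year}"

# Season(2023, 2024) compares equal to the tuple (2023, 2024),
# and str(Season(2023, 2024)) gives "2023-2024".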

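Assuming the module is invoked directly (the exact entry point is not shown in this commit; the module path below mirrors the delphi.epidata.acquisition.rvdss import used in database.py), usage would look like:

# fetch and store only the latest epiweek
python -m delphi.epidata.acquisition.rvdss.run --current

# fetch and store all earlier time periods
python -m delphi.epidata.acquisition.rvdss.run --historical

Both flags can be combined in one run; passing neither raises "no data was requested".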