
Commit 47e9836

add main fn with CLI; remove date range params in package frontend fn stubs
1 parent a44ad10

3 files changed: +189 -42 lines changed

src/acquisition/rvdss/constants.py

Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,5 @@
+from datetime import datetime
+
 # The dataset calls the same viruses, provinces, regions (province groups),
 # and country by multiple names. Map each of those to a common abbreviation.
 VIRUSES = {
@@ -104,3 +106,5 @@
 POSITIVE_TESTS_OUTPUT_FILE = "positive_tests.csv"
 
 LAST_WEEK_OF_YEAR = 35
+
+NOW = datetime.now()

src/acquisition/rvdss/database.py

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
+"""
+===============
+=== Purpose ===
+===============
+
+Stores data from RVDSS, which contains flu lab test results.
+See: rvdss.py
+
+
+=======================
+=== Data Dictionary ===
+=======================
+
+`rvdss` is the table where RVDSS data is stored.
++----------+-------------+------+-----+---------+----------------+
+| Field    | Type        | Null | Key | Default | Extra          |
++----------+-------------+------+-----+---------+----------------+
+| id       | int(11)     | NO   | PRI | NULL    | auto_increment |
+| location | varchar(8)  | NO   | MUL | NULL    |                |
+| epiweek  | int(11)     | NO   | MUL | NULL    |                |
+| value    | float       | NO   |     | NULL    |                |
++----------+-------------+------+-----+---------+----------------+
+id: unique identifier for each record
+location: hhs1-10
+epiweek: the epiweek during which the queries were executed
+value: number of total test records per facility, within each epiweek
+
+=================
+=== Changelog ===
+=================
+2017-12-14:
+  * add "need update" check
+
+2017-12-02:
+  * original version
+"""
+
+# standard library
+import argparse
+
+# third party
+import mysql.connector
+
+# first party
+from delphi.epidata.acquisition.rvdss import rvdss
+import delphi.operations.secrets as secrets
+from delphi.utils.epidate import EpiDate
+import delphi.utils.epiweek as flu
+from delphi.utils.geo.locations import Locations
+
+LOCATIONS = Locations.hhs_list
+DATAPATH = "/home/automation/rvdss_data"
+
+
+def update(locations, first=None, last=None, force_update=False, load_email=True):
+    # download and prepare data first
+    qd = rvdss.rvdssData(DATAPATH, load_email)
+    if not qd.need_update and not force_update:
+        print("Data not updated, nothing to do.")
+        return
+
+    qd_data = qd.load_csv()
+    qd_measurements = qd.prepare_measurements(qd_data, start_weekday=4)
+    qd_ts = rvdss.measurement_to_ts(qd_measurements, 7, startweek=first, endweek=last)
+    # connect to the database
+    u, p = secrets.db.epi
+    cnx = mysql.connector.connect(user=u, password=p, database="epidata")
+    cur = cnx.cursor()
+
+    def get_num_rows():
+        cur.execute("SELECT count(1) `num` FROM `rvdss`")
+        for (num,) in cur:
+            pass
+        return num
+
+    # check from 4 weeks preceding the last week with data through this week
+    cur.execute("SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `rvdss`")
+    for (ew0, ew1) in cur:
+        ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4)
+    ew0 = ew0 if first is None else first
+    ew1 = ew1 if last is None else last
+    print(f"Checking epiweeks between {int(ew0)} and {int(ew1)}...")
+
+    # keep track of how many rows were added
+    rows_before = get_num_rows()
+
+    # check rvdss for new and/or revised data
+    sql = """
+    INSERT INTO
+      `rvdss` (`location`, `epiweek`, `value`)
+    VALUES
+      (%s, %s, %s)
+    ON DUPLICATE KEY UPDATE
+      `value` = %s
+    """
+
+    total_rows = 0
+
+    for location in locations:
+        if location not in qd_ts:
+            continue
+        ews = sorted(qd_ts[location].keys())
+        num_missing = 0
+        for ew in ews:
+            v = qd_ts[location][ew]
+            sql_data = (location, ew, v, v)
+            cur.execute(sql, sql_data)
+            total_rows += 1
+            if v == 0:
+                num_missing += 1
+        if num_missing > 0:
+            print(f" [{location}] missing {int(num_missing)}/{len(ews)} value(s)")
+
+    # keep track of how many rows were added
+    rows_after = get_num_rows()
+    print(f"Inserted {int(rows_after - rows_before)}/{int(total_rows)} row(s)")
+
+    # cleanup
+    cur.close()
+    cnx.commit()
+    cnx.close()
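
The upsert in update() only revises an existing row when the inserted (location, epiweek) pair collides with an existing key. The table definition is not part of this commit; below is a minimal sketch of DDL consistent with the data dictionary above, where the composite UNIQUE key is an assumption inferred from the ON DUPLICATE KEY UPDATE clause (without such a key, re-runs would insert duplicate rows instead of updating values).

# Hypothetical schema sketch (not part of this commit), consistent with the
# data dictionary in database.py. The UNIQUE key on (location, epiweek) is
# an assumption inferred from the upsert in update().
CREATE_RVDSS_TABLE = """
  CREATE TABLE `rvdss` (
    `id` int(11) NOT NULL AUTO_INCREMENT,
    `location` varchar(8) NOT NULL,
    `epiweek` int(11) NOT NULL,
    `value` float NOT NULL,
    PRIMARY KEY (`id`),
    UNIQUE KEY `location` (`location`, `epiweek`),
    KEY `epiweek` (`epiweek`)
  )
"""

With such a schema in place, a hypothetical invocation using the names defined above would be update(LOCATIONS, first=202401, last=202435, force_update=True).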

src/acquisition/rvdss/run.py

Lines changed: 64 additions & 42 deletions
@@ -1,75 +1,97 @@
-# TODO: this is pseudocode and may not run or may not run correctly
-
+import argparse
+
 import pandas as pd
 
+
 def fetch_report_urls(season):
     """Get all report URLs from a season's report index page"""
     pass
 
-# TODO: consider how to encode a "season" object, maybe as a tuple of start/end years `(2023, 2024)`, or a string `2023-2024`.
+## TODO: consider how to encode a "season" object, maybe as a tuple of start/end years `(2023, 2024)`, or a string `2023-2024`.
+## TODO: I think there's already a fn for this that includes the loop and seasons
 def fetch_one_season_from_report(season):
     report_urls = fetch_report_urls(season)
     df_list = [fetch_one_report(url) for url in report_urls]
     df = pd.concat(df_list)
 
     return df
 
-def fetch_one_dashboard(url = None):
+def fetch_dashboard_data(url=None):
     """Get data from current or archived dashboard"""
-    # If no url is provided, fetch data from the current dashboard (whose URL is static).
-    if not url:
-        url = DEFAULT_DASHBOARD_URL
-
-    # TODO: put rest of scraping code in here
     pass
 
-def fetch_report_data(start_date, end_date):
-    included_seasons = compute_seasons_in_range(start_date, end_date)
+def fetch_report_data():
+    seasons = [...]
 
-    # Fetch all reports made for each season.
-    # We do this because fetching reports is pretty fast, and it saves us from
-    # having to parse either URLs or text on the webpage. We will drop data
-    # outside the requested range later.
-    df_list = [fetch_one_season_from_report(season) for season in included_seasons]
+    # Fetch all reports made for all seasons.
+    ## TODO: I think there's already a fn for this that includes the loop and seasons
+    df_list = [fetch_one_season_from_report(season) for season in seasons]
     df = pd.concat(df_list)
 
-    # Only keep data that was issued within the requested date range.
-    df = df[start_date <= df.issue <= end_date]
-
     return df
 
-def fetch_historical_dashboard_data(start_date, end_date):
-    included_weeks = compute_weeks_in_range(start_date, end_date)
-    included_report_urls = construct_archived_dashboard_urls(included_weeks)
-
-    df_list = [fetch_one_dashboard(url) for url in included_report_urls]
+def fetch_historical_dashboard_data():
+    included_report_urls = fetch_archived_dashboard_urls()
+    df_list = [fetch_dashboard_data(url) for url in included_report_urls]
     df = pd.concat(df_list)
 
     return df
 
-def fetch_historical_dashboard_data(start_date, end_date):
-    create all historical_dashboard_urls included in date range
-    loop over urls:
-        fetch_dashboard_data(historical_dashboard_url)
-
-    included_seasons = compute_seasons_in_range(start_date, end_date)
-    df_list = [fetch_one_season_from_report(season) for season in included_seasons]
-    df = pd.concat(df_list)
-    df = df[start_date <= df.issue <= end_date]
-
-    return df
+# NOTE: pseudocode sketch of the historical-dashboard flow, commented out so
+# it does not shadow the fetch_historical_dashboard_data definition above:
+#     create/scrape all historical_dashboard_urls
+#     loop over urls:
+#         fetch_dashboard_data(historical_dashboard_url)
+#     df = pd.concat(df_list)
+#     return df
 
 def fetch_current_dashboard_data():
-    fetch_dashboard_data(current_dashboard_url)
-
-def fetch_data(start_date, end_date):
-    if (start_date, end_date) not exist:
-        data = fetch_current_dashboard_data()
-    else:
-        early_range, late_range = split_date_range_by_dashboard_release_date(start_date, end_date)
-        report_data = fetch_report_data(early_range)
-        dashboard_data = fetch_historical_dashboard_data(late_range)
-
-        data = [report_data, dashboard_data].concat()
-
-        return data
+    return fetch_dashboard_data(DEFAULT_DASHBOARD_URL)
+
+def update_current_data():
+    data = fetch_current_dashboard_data()
+    update_database(data)
+
+def update_historical_data():
+    report_data = fetch_report_data()
+    dashboard_data = fetch_historical_dashboard_data()
+
+    data = pd.concat([report_data, dashboard_data])
+
+    update_database(data)
+
+
+def main():
+    # args and usage
+    parser = argparse.ArgumentParser()
+    # fmt: off
+    parser.add_argument(
+        "--current",
+        "-c",
+        action="store_true",
+        help="fetch current data, that is, data for the latest epiweek"
+    )
+    parser.add_argument(
+        "--historical",
+        "-H",
+        action="store_true",
+        help="fetch historical data, that is, data for all available time periods other than the latest epiweek"
+    )
+    # fmt: on
+    args = parser.parse_args()
+
+    current_flag, historical_flag = (
+        args.current,
+        args.historical,
+    )
+    if not current_flag and not historical_flag:
+        raise Exception("no data was requested")
+
+    # Decide what to update
+    if current_flag:
+        update_current_data()
+    if historical_flag:
+        update_historical_data()
+
+
+if __name__ == "__main__":
+    main()
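
The season-encoding TODO above is still open. One possibility, purely as an illustration (none of these names exist in the package), is a small named tuple that covers both suggested forms: it behaves as the tuple of start/end years, and its string form matches the "2023-2024" style.

# Hypothetical helper for the season-encoding TODO; not part of this commit.
from typing import NamedTuple

class Season(NamedTuple):
    start_year: int
    end_year: int

    def __str__(self):
        # the string form suggested in the TODO, e.g. "2023-2024"
        return f"{self.start_year}-{self.end_year}"

# Season(2023, 2024) compares equal to the tuple (2023, 2024),
# and str(Season(2023, 2024)) gives "2023-2024".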

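Assuming the module is invoked directly (the exact entry point is not shown in this commit; the module path below mirrors the delphi.epidata.acquisition.rvdss import used in database.py), usage would look like:

# fetch and store only the latest epiweek
python -m delphi.epidata.acquisition.rvdss.run --current

# fetch and store all earlier time periods
python -m delphi.epidata.acquisition.rvdss.run --historical

Both flags can be combined in one run; passing neither raises "no data was requested".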