Commit 3492573

start filling out historical fn stubs
1 parent 47e9836

3 files changed (+51 -81 lines)

src/acquisition/rvdss/constants.py

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@
 # disease data in a dashboard with a static URL. Therefore, this collection
 # of URLs does _NOT_ need to be updated. It is used for fetching historical
 # data (for dates on or before June 8, 2024) only.
-HISTORIC_SEASON_URL = (HISTORIC_SEASON_REPORTS_URL.format(year_range = year_range) for year_range in
+HISTORIC_SEASON_URLS = (HISTORIC_SEASON_REPORTS_URL.format(year_range = year_range) for year_range in
 (
     "2013-2014",
     "2014-2015",

src/acquisition/rvdss/pull_historic.py

Lines changed: 25 additions & 37 deletions
@@ -15,7 +15,7 @@
 import math
 
 from delphi.epidata.acquisition.rvdss.constants import (
-    DASHBOARD_BASE_URLS_2023_2024_SEASON, HISTORIC_SEASON_URL,
+    DASHBOARD_BASE_URLS_2023_2024_SEASON, HISTORIC_SEASON_URLS,
     ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR,
     RESP_COUNTS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE
 )
@@ -367,7 +367,7 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu=
 
     return(table)
 
-def get_season_reports(url):
+def fetch_one_season_from_report(url):
     # From the url, go to the main landing page for a season
     # which contains all the links to each week in the season
     page=requests.get(url)
@@ -382,13 +382,13 @@ def get_season_reports(url):
     # create tables to hold all the data for the season
     all_positive_tables=pd.DataFrame()
     all_number_tables=pd.DataFrame()
-    all_respiratory_detection_table=pd.DataFrame()
+    all_respiratory_detection_tables=pd.DataFrame()
 
     for week_num in range(len(urls)):
         current_week = weeks[week_num]
         current_week_end = end_dates[week_num]
 
-        # In the 2019=2020 season, the webpages for weeks 5 and 47 only have
+        # In the 2019-2020 season, the webpages for weeks 5 and 47 only have
         # the abbreviations table and the headers for the respiratory detections
         # table, so they are effectively empty, and skipped
         if season[0] == '2019':
@@ -532,8 +532,8 @@ def get_season_reports(url):
         # If not, add the weeks tables into the season table
 
         # check for deduplication pandas
-        if not respiratory_detection_table.index.isin(all_respiratory_detection_table.index).any():
-            all_respiratory_detection_table= pd.concat([all_respiratory_detection_table,respiratory_detection_table])
+        if not respiratory_detection_table.index.isin(all_respiratory_detection_tables.index).any():
+            all_respiratory_detection_tables= pd.concat([all_respiratory_detection_tables,respiratory_detection_table])
 
         if not combined_positive_tables.index.isin(all_positive_tables.index).any():
             all_positive_tables=pd.concat([all_positive_tables,combined_positive_tables])
@@ -542,40 +542,28 @@ def get_season_reports(url):
         if not number_detections_table.index.isin(all_number_tables.index).any():
             all_number_tables=pd.concat([all_number_tables,number_detections_table])
 
-    # write files to csvs
-    all_respiratory_detection_table.to_csv(path+"/" + RESP_COUNTS_OUTPUT_FILE, index=True)
-    all_positive_tables.to_csv(path+"/" + POSITIVE_TESTS_OUTPUT_FILE, index=True)
-
-    # Write the number of detections table to csv if it exists (i.e has rows)
-    if len(all_number_tables) != 0:
-        all_number_tables.to_csv(path+"/number_of_detections.csv", index=True)
+    return {
+        "respiratory_detection": all_respiratory_detection_tables,
+        "positive": all_positive_tables,
+        "count": all_number_tables,
+    }
 
-def main():
-    # Scrape each season. Saves data to CSVs as a side effect.
-    [get_season_reports(url) for url in HISTORIC_SEASON_URL]
+def fetch_report_data():
+    # Scrape each season.
+    dict_list = [fetch_one_season_from_report(url) for url in HISTORIC_SEASON_URLS]
 
-    # Update the end of the 2023-2024 season with the dashboard data
+    return dict_list
 
-    # Load old csvs
-    old_detection_data = pd.read_csv('season_2023_2024/' + RESP_COUNTS_OUTPUT_FILE).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-    old_positive_data = pd.read_csv('season_2023_2024/' + POSITIVE_TESTS_OUTPUT_FILE).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+def fetch_historical_dashboard_data():
+    # Update the end of the 2023-2024 season with the dashboard data
+    included_urls = fetch_archived_dashboard_urls()
+    dict_list = [{} for url in included_urls]
 
-    for base_url in DASHBOARD_BASE_URLS_2023_2024_SEASON:
+    for i, base_url in enumerate(included_urls):
         # Get weekly dashboard data
-        weekly_data = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-        positive_data = get_revised_data(base_url)
-
-        # Check if indices are already present in the old data
-        # If not, add the new data
-        if not weekly_data.index.isin(old_detection_data.index).any():
-            old_detection_data= pd.concat([old_detection_data,weekly_data],axis=0)
-
-        if not positive_data.index.isin(old_positive_data.index).any():
-            old_positive_data= pd.concat([old_positive_data,positive_data],axis=0)
-
-        # Overwrite/update csvs
-        old_detection_data.to_csv('season_2023_2024/' + RESP_COUNTS_OUTPUT_FILE,index=True)
-        old_positive_data.to_csv('season_2023_2024/' + POSITIVE_TESTS_OUTPUT_FILE,index=True)
+        ## TODO: what to do with this "2023"? Need to parse the start year of the season from the URL
+        ## TODO: how do "weekly" and "positive" correspond to the dict keys from historical reports?
+        dict_list[i]["weekly"] = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+        dict_list[i]["positive"] = get_revised_data(base_url)
 
-if __name__ == '__main__':
-    main()
+    return dict_list
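The two TODOs in fetch_historical_dashboard_data flag open questions. For the first, one plausible direction (a sketch, not committed code; extract_season_start_year is a hypothetical helper, and the assumption that archived dashboard URLs embed a 4-digit year should be verified against the archive listing):

import re

def extract_season_start_year(url):
    # Hypothetical helper: assumes the archived dashboard URL embeds a
    # 4-digit year (e.g. in a YYYY-MM-DD date fragment). A date early in
    # the calendar year belongs to the season that started the previous
    # year, so the month would also be needed to resolve that case.
    match = re.search(r"(20\d{2})", url)
    return int(match.group(1)) if match else None

For the second TODO, the dashboard's "weekly" table appears to play the role of "respiratory_detection" in the report dicts (in the deleted code both fed RESP_COUNTS_OUTPUT_FILE), so renaming the keys here to "respiratory_detection" and "positive" would let downstream code treat both lists uniformly.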

src/acquisition/rvdss/run.py

Lines changed: 25 additions & 43 deletions
@@ -1,63 +1,45 @@
 import pandas as pd
 
 
-def fetch_report_urls(season):
-    """Get all report URLs from a season's report index page"""
-    pass
-
-## TODO: consider how to encode a "season" object, maybe as a tuple of start/end years `(2023, 2024)`, or a string `2023-2024`.
-## TODO: I think there's already a fn for this that includes the loop and seasons
-def fetch_one_season_from_report(season):
-    report_urls = fetch_report_urls(season)
-    df_list = [fetch_one_report(url) for url in report_urls]
-    df = pd.concat(df_list)
-
-    return df
+def fetch_archived_dashboard_urls():
+    ## TODO: paste in Christine's code for scraping this list https://health-infobase.canada.ca/respiratory-virus-detections/archive.html
 
 def fetch_dashboard_data(url = None):
     """Get data from current or archived dashboard"""
     pass
 
-def fetch_report_data():
-    seasons = [...]
-
-    # Fetch all reports made for all seasons.
-    ## TODO: I think there's already a fn for this that includes the loop and seasons
-    df_list = [fetch_one_season_from_report(season) for season in seasons]
-    df = pd.concat(df_list)
-
-    return df
-
-def fetch_historical_dashboard_data():
-    included_report_urls = fetch_archived_dashboard_urls()
-    df_list = [fetch_dashboard_data(url) for url in included_report_urls]
-    df = pd.concat(df_list)
-
-    return df
-
-def fetch_historical_dashboard_data():
-    create/scrape all historical_dashboard_urls
-    loop over urls:
-        fetch_dashboard_data(historical_dashboard_url)
-
-    df = pd.concat(df_list)
-
-    return df
 
 def fetch_current_dashboard_data():
     return fetch_dashboard_data(DEFAULT_DASHBOARD_URL)
 
 def update_current_data(start_date, end_date):
-   data = fetch_current_dashboard_data()
-   update_database(data)
+    data = fetch_current_dashboard_data()
+    update_database(data)
 
 def update_historical_data():
-    report_data = fetch_report_data()
-    dashboard_data = fetch_historical_dashboard_data()
+    report_dict_list = fetch_report_data()
+    dashboard_dict_list = fetch_historical_dashboard_data()
+
+    table_types = (
+        "respiratory_detection",
+        "positive",
+        "count",
+    )
+    for tt in table_types:
+        ## TODO: need to merge tables together from dashboards and reports. Expect 3 tables out.
+        pass
+    # ??
+    data = [report_data, dashboard_data].concat()
+
+    # Write the three tables to separate csvs
+    all_respiratory_detection_tables.to_csv(path+"/" + RESP_COUNTS_OUTPUT_FILE, index=True)
+    all_positive_tables.to_csv(path+"/" + POSITIVE_TESTS_OUTPUT_FILE, index=True)
 
-    data = [report_data, dashboard_data].concat()
+    # Write the number of detections table to csv if it exists (i.e has rows)
+    if len(all_number_tables) != 0:
+        all_number_tables.to_csv(path+"/number_of_detections.csv", index=True)
 
-    update_database(data)
+    update_database(data)
 
 
 def main():
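Separately, the fetch_archived_dashboard_urls stub defers to Christine's scraper for https://health-infobase.canada.ca/respiratory-virus-detections/archive.html. Until that lands, a hedged sketch of one way such a scraper could look, assuming the archive page lists the weekly dashboards as ordinary anchor tags (the href filter is a guess and would need checking against the page's actual markup):

import requests
from bs4 import BeautifulSoup

ARCHIVE_URL = "https://health-infobase.canada.ca/respiratory-virus-detections/archive.html"

def fetch_archived_dashboard_urls():
    # Sketch only: fetch the archive page and keep links that look like
    # archived dashboard pages. The filter substring is an assumption.
    page = requests.get(ARCHIVE_URL)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "html.parser")
    return [a["href"] for a in soup.find_all("a", href=True)
            if "respiratory-virus-detections" in a["href"]]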