Commit f9b9842

address todos and fix historical fetching
1 parent c1a70a2 commit f9b9842

4 files changed: +207 -159 lines changed


src/acquisition/rvdss/constants.py

Lines changed: 5 additions & 2 deletions
@@ -36,7 +36,7 @@
     "saskatchewan":"sk",
     "alberta": "ab",
     "british columbia" :"bc",
-    "yukon" : "yk",
+    "yukon" : "yt",
     "northwest territories" : "nt",
     "nunavut" : "nu",
     "canada":"ca",
@@ -56,6 +56,8 @@
 # Construct dashboard and data report URLS.
 DASHBOARD_BASE_URL = "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/"
 DASHBOARD_W_DATE_URL = DASHBOARD_BASE_URL + "archive/{date}/"
+
+# May not need this since we write a function for this in pull_historic
 DASHBOARD_BASE_URLS_2023_2024_SEASON = (
     DASHBOARD_W_DATE_URL.format(date = date) for date in
     (
@@ -111,4 +113,5 @@

 FIRST_WEEK_OF_YEAR = 35

-NOW = datetime.datetime.now()
+UPDATE_DATES_FILE = "update_dates.txt"
+NOW = datetime.now()
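For reference, the archive URL template above expands by substituting a date string into the {date} placeholder; a minimal sketch (the date value is illustrative, not taken from the real archive listing):

DASHBOARD_BASE_URL = "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/"
DASHBOARD_W_DATE_URL = DASHBOARD_BASE_URL + "archive/{date}/"

url = DASHBOARD_W_DATE_URL.format(date="2024-06-13")  # illustrative date
# -> "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-13/"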

src/acquisition/rvdss/pull_historic.py

Lines changed: 29 additions & 72 deletions
@@ -13,16 +13,17 @@
 from epiweeks import Week
 from datetime import datetime, timedelta
 import math
-from pandas.io.json import json_normalize

-from delphi.epidata.acquisition.rvdss.constants import (
-    DASHBOARD_BASE_URLS_2023_2024_SEASON, HISTORIC_SEASON_URLS,
+from constants import (
+    HISTORIC_SEASON_URLS,
     ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, FIRST_WEEK_OF_YEAR,
-    RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE,DASHBOARD_ARCHIVED_DATES_URL
+    RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE,DASHBOARD_ARCHIVED_DATES_URL,
+    DASHBOARD_BASE_URL
 )
-from delphi.epidata.acquisition.rvdss.utils import (
+from utils import (
     abbreviate_virus, abbreviate_geo, create_geo_types, check_date_format,
-    get_revised_data, get_weekly_data, fetch_dashboard_data
+    get_positive_data, get_detections_data, fetch_dashboard_data,preprocess_table_columns,
+    make_signal_type_spelling_consistent,add_flu_prefix
 )
 #%% Functions

@@ -138,9 +139,9 @@ def get_modified_dates(soup,week_end_date):
     meta_tags=soup.find_all("meta",title="W3CDTF")
     for tag in meta_tags:
         if tag.get("name", None) == "dcterms.modified" or tag.get("property", None) == "dcterms.modified":
-            modified_date = tag.get("content", None)
+            date_modified = tag.get("content", None)

-    mod_date = datetime.strptime(modified_date, "%Y-%m-%d")
+    mod_date = datetime.strptime(date_modified, "%Y-%m-%d")
     week_date = datetime.strptime(week_end_date, "%Y-%m-%d")

     diff_days = (mod_date-week_date).days
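For context, get_modified_dates reads the page's dcterms.modified meta tag; a minimal sketch of the parsing step (the HTML snippet and date below are illustrative):

from bs4 import BeautifulSoup
from datetime import datetime

html = '<meta name="dcterms.modified" title="W3CDTF" content="2024-06-15">'  # illustrative tag
soup = BeautifulSoup(html, "html.parser")
tag = soup.find("meta", title="W3CDTF")
date_modified = tag.get("content", None)
mod_date = datetime.strptime(date_modified, "%Y-%m-%d")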
@@ -184,65 +185,13 @@ def deduplicate_rows(table):
         new_table=table
     return(new_table)

-def add_flu_prefix(flu_subtype):
-    """ Add the prefix `flu` when only the subtype is reported """
+def drop_ah1_columns(table):
+    h1n1_column_exists = any([re.search("h1n1",c) for c in table.columns])
+    ah1_column_exists = any([re.search(r"ah1\b",c) for c in table.columns])

-    pat1 =r"^ah3"
-    pat2= r"^auns"
-    pat3= r"^ah1pdm09"
-    pat4= r"^ah1n1pdm09"
-    combined_pat = '|'.join((pat1, pat2,pat3,pat4))
-
-    full_fluname = re.sub(combined_pat, r"flu\g<0>",flu_subtype)
-    return(full_fluname)
-
-def make_signal_type_spelling_consistent(signal):
-    """
-    Make the signal type (i.e. percent positive, number tests, total tests) have consistent spelling
-    Also remove total from signal names
-    """
-
-    pat1 = "positive"
-    pat2 = 'pos'
-    combined_pat = '|'.join((pat1, pat2))
-
-    pat3 = r"test\b"
-    pat4 = 'tested'
-    combined_pat2 = '|'.join((pat3, pat4))
-
-    new_signal = re.sub(combined_pat, "positive_tests",signal)
-    new_signal = re.sub(combined_pat2, "tests",new_signal)
-    new_signal =re.sub(" *%", "_pct_positive",new_signal)
-    new_signal = re.sub("total ", "",new_signal)
-    return(new_signal)
-
-def preprocess_table_columns(table):
-    """
-    Remove characters like . or * from columns
-    Abbreviate the viruses in columns
-    Change some naming of signals in columns (i.e order of hpiv and other)
-    Change some naming of locations in columns (i.e at instead of atl)
-    """
-    table.columns = [re.sub("\xa0"," ", col) for col in table.columns] # \xa0 to space
-    table.columns = [re.sub("(.*?)(\.\d+)", "\\1", c) for c in table.columns] # remove .# for duplicated columns
-    table.columns =[re.sub("\.", "", s)for s in table.columns] #remove periods
-    table.columns =[re.sub(r"\((all)\)", "", s)for s in table.columns] # remove (all)
-    table.columns =[re.sub(r"\s*\(|\)", "", s)for s in table.columns]
-    table.columns = [re.sub(' +', ' ', col) for col in table.columns] # Make any muliple spaces into one space
-    table.columns = [re.sub(r'\(|\)', '', col) for col in table.columns] # replace () for _
-    table.columns = [re.sub(r'/', '_', col) for col in table.columns] # replace / with _
-
-    table.columns = [re.sub(r"^at\b","atl ",t) for t in table.columns]
-    table.columns = [re.sub("canada","can",t) for t in table.columns]
-
-    table.columns =[re.sub(r"h1n1 2009 |h1n12009", "ah1n1pdm09", s)for s in table.columns]
-    table.columns =[abbreviate_virus(col) for col in table.columns] # abbreviate viruses
-    table.columns = [re.sub(r"flu a","flua",t) for t in table.columns]
-    table.columns = [re.sub(r"flu b","flub",t) for t in table.columns]
-    table.columns = [re.sub("flutest","flu test", col) for col in table.columns]
-    table.columns = [re.sub(r"other hpiv","hpivother",t) for t in table.columns]
-
-    table.columns=[make_signal_type_spelling_consistent(col) for col in table.columns]
+    if ah1_column_exists and h1n1_column_exists:
+        column_name_to_drop = list(table.filter(regex=r'ah1\b'))
+        table.drop(columns = column_name_to_drop,inplace=True)
     return(table)

 def create_detections_table(table,modified_date,week_number,week_end_date,start_year):
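A quick illustration of the new drop_ah1_columns helper (it relies on the function defined above; the column names in this toy frame are made up, not taken from a real report table):

import pandas as pd

toy = pd.DataFrame({
    "can ah1 tests": [None, None],       # bare ah1 column, empty in practice
    "can ah1n1pdm09 tests": [10, 12],    # matching h1n1 column
    "week": [35, 36],
})
toy = drop_ah1_columns(toy)
print(toy.columns.tolist())  # ['can ah1n1pdm09 tests', 'week'] -- the ah1 column is dropped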
@@ -400,6 +349,7 @@ def fetch_one_season_from_report(url):
         temp_url=urls[week_num]
         temp_page=requests.get(temp_url)
         new_soup = BeautifulSoup(temp_page.text, 'html.parser')
+
         captions = extract_captions_of_interest(new_soup)
         modified_date = get_modified_dates(new_soup,current_week_end)

432382

433383
# Read table, coding all the abbreviations for missing data into NA
434384
# Also use dropna because removing footers causes the html to have an empty row
435-
na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"N.D.","-"]
385+
na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"not available","not tested","N.D.","-"]
436386
table = pd.read_html(tab,na_values=na_values)[0].dropna(how="all")
437387

438388
# Check for multiline headers
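For reference, pd.read_html maps these placeholder strings to missing values; a small self-contained check (the HTML fragment is made up for the example):

from io import StringIO
import pandas as pd

html = "<table><tr><th>week</th><th>flu a tests</th></tr><tr><td>35</td><td>N.R.</td></tr></table>"
na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"not available","not tested","N.D.","-"]
df = pd.read_html(StringIO(html), na_values=na_values)[0].dropna(how="all")
# df["flu a tests"].isna().all() is True -- "N.R." was read as missing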
@@ -469,6 +419,9 @@ def fetch_one_season_from_report(url):
                 # a date is written as 022-09-03, instead of 2022-09-03
                 table.loc[table['week'] == 35, 'week end'] = "2022-09-03"

+            # check if both ah1 and h1n1 are given. If so drop one since they are the same virus and ah1 is always empty
+            table = drop_ah1_columns(table)
+
             # Rename columns
             table= preprocess_table_columns(table)

@@ -549,11 +502,13 @@ def fetch_one_season_from_report(url):
         "count": all_number_tables,
     }

-def fetch_archived_dashboard_urls(archive_url):
+def fetch_archived_dashboard_dates(archive_url):
     r=requests.get(archive_url)
     values=r.json()
-    data=json_normalize(values)
-    archived_dates = data[data["lang"]=="en"]
+    data=pd.json_normalize(values)
+    english_data = data[data["lang"]=="en"]
+
+    archived_dates=english_data['date'].to_list()
     return(archived_dates)

@@ -565,7 +520,9 @@ def fetch_report_data():

 def fetch_historical_dashboard_data():
     # Update the end of the 2023-2024 season with the dashboard data
-    included_urls = fetch_archived_dashboard_urls(DASHBOARD_ARCHIVED_DATES_URL)
-    dict_list = [fetch_dashboard_data(url) for url in included_urls]
+    archived_dates = fetch_archived_dashboard_dates(DASHBOARD_ARCHIVED_DATES_URL)
+
+    archived_urls= [DASHBOARD_BASE_URL + "archive/"+ date+"/" for date in archived_dates]
+    dict_list = [fetch_dashboard_data(url) for url in archived_urls]

     return dict_list
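The archive listing endpoint is assumed here to return JSON records with at least lang and date fields (the field names come from the filter in fetch_archived_dashboard_dates; the sample payload below is hypothetical):

import pandas as pd

values = [
    {"lang": "en", "date": "2024-06-13"},  # hypothetical entries
    {"lang": "fr", "date": "2024-06-13"},
]
data = pd.json_normalize(values)
archived_dates = data[data["lang"] == "en"]["date"].to_list()
# -> ["2024-06-13"], which fetch_historical_dashboard_data turns into archive URLs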

src/acquisition/rvdss/run.py

Lines changed: 62 additions & 40 deletions
@@ -6,49 +6,66 @@

 import pandas as pd
 import os
+import argparse

-from delphi.epidata.acquisition.rvdss.utils import get_weekly_data, get_revised_data, get_dashboard_update_date
-from delphi.epidata.acquisition.rvdss.constants import DASHBOARD_BASE_URL, RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE, COUNTS_OUTPUT_FILE
-
+from utils import fetch_dashboard_data, check_most_recent_update_date,get_dashboard_update_date
+from constants import DASHBOARD_BASE_URL, RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE, COUNTS_OUTPUT_FILE,UPDATE_DATES_FILE
+from pull_historic import fetch_report_data,fetch_historical_dashboard_data

 def update_current_data():
-    ## TODO: what is the base path for these files?
-    base_path = "."
-
-    data_dict = fetch_dashboard_data(DASHBOARD_BASE_URL, 2024)
-
-    table_types = {
-        "respiratory_detection": RESP_DETECTIONS_OUTPUT_FILE,
-        "positive": POSITIVE_TESTS_OUTPUT_FILE,
-        # "count": COUNTS_OUTPUT_FILE, # Dashboards don't contain this data.
+
+    ## Check if data for current update date has already been fetched
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
     }
-    for tt in table_types.keys():
-        data = data_dict[table_types]
-
-        # Write the tables to separate csvs
-        path = base_path + "/" + table_types[tt]
-
-        # Since this function generates new data weekly, we need to combine it with the existing data, if it exists.
-        if not os.path.exists(path):
-            data.to_csv(path,index=True)
-        else:
-            old_data = pd.read_csv(path).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-
-            # If index already exists in the data on disk, don't add the new data -- we may have already run the weekly data fetch.
-            ## TODO: The check on index maybe should be stricter? Although we do deduplication upstream, so this probably won't find true duplicates
-            if not data.index.isin(old_data.index).any():
-                old_data= pd.concat([old_data,data],axis=0)
-                old_data.to_csv(path,index=True)
-
-    # ## TODO
-    # update_database(data)
-

+    update_date = get_dashboard_update_date(DASHBOARD_BASE_URL, headers)
+    already_updated = check_most_recent_update_date(update_date,UPDATE_DATES_FILE)
+
+    if not already_updated:
+        with open(UPDATE_DATES_FILE, 'a') as testfile:
+            testfile.write(update_date+ "\n")
+
+        ## TODO: what is the base path for these files?
+        base_path = "."
+
+        data_dict = fetch_dashboard_data(DASHBOARD_BASE_URL)
+
+        table_types = {
+            "respiratory_detection": RESP_DETECTIONS_OUTPUT_FILE,
+            "positive": POSITIVE_TESTS_OUTPUT_FILE,
+            # "count": COUNTS_OUTPUT_FILE, # Dashboards don't contain this data.
+        }
+        for tt in table_types.keys():
+            data = data_dict[tt]
+
+            # Write the tables to separate csvs
+            path = base_path + "/" + table_types[tt]
+
+            # Since this function generates new data weekly, we need to combine it with the existing data, if it exists.
+            if not os.path.exists(path):
+                data.to_csv(path,index=True)
+            else:
+                old_data = pd.read_csv(path).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+
+                # If index already exists in the data on disk, don't add the new data -- we may have already run the weekly data fetch.
+                ## TODO: The check on index maybe should be stricter? Although we do deduplication upstream, so this probably won't find true duplicates
+                if not data.index.isin(old_data.index).any():
+                    old_data= pd.concat([old_data,data],axis=0)
+                    old_data.to_csv(path,index=True)
+
+        # ## TODO
+        # update_database(data)
+    else:
+        print("Data is already up to date")
+
 def update_historical_data():
     ## TODO: what is the base path for these files?
     base_path = "."

-    report_dict_list = fetch_report_data()
+    report_dict_list = fetch_report_data() # a dict for every season, and every seasonal dict has 2/3 tables inside
+
+    # a dict with an entry for every week that has an archival dashboard, and each entry has 2/3 tables
     dashboard_dict_list = fetch_historical_dashboard_data()

     table_types = {
@@ -58,12 +75,17 @@ def update_historical_data():
     }
     for tt in table_types.keys():
         # Merge tables together from dashboards and reports for each table type.
-        dashboard_data = [elem.get(tt, None) for elem in dashboard_dict_list]
-        report_data = [elem.get(tt, None) for elem in report_dict_list]
-        data = [report_data, dashboard_data].concat()
-
+        dashboard_data = [elem.get(tt, pd.DataFrame()) for elem in dashboard_dict_list] # a list of all the dashboard tables
+        report_data = [elem.get(tt, None) for elem in report_dict_list] # a list of the report table
+
+        all_report_tables = pd.concat(report_data)
+        all_dashboard_tables = pd.concat(dashboard_data)
+
+        data = pd.concat([all_report_tables, all_dashboard_tables])
+
         # Write the tables to separate csvs
-        data.to_csv(base_path +"/" + table_types[tt], index=True)
+        if not data.empty:
+            data.to_csv(base_path +"/" + table_types[tt], index=True)

         # ## TODO
         # update_database(data)
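Side note on the fix above: the old line called .concat() on a plain Python list, which has no such method; pd.concat is the correct call, and the pd.DataFrame() default keeps the merge working when a dashboard week lacks a given table type. A tiny illustration with toy frames:

import pandas as pd

frames = [pd.DataFrame({"x": [1]}), pd.DataFrame()]  # one real table, one missing
combined = pd.concat(frames)  # succeeds; one row of data, no error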
@@ -81,7 +103,7 @@ def main():
     )
     parser.add_argument(
         "--historical",
-        "-h",
+        "-hist",
         action="store_true",
         help="fetch historical data, that is, data for all available time periods other than the latest epiweek"
     )
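On the "-h" to "-hist" change: argparse registers -h/--help automatically, so a custom "-h" short flag conflicts with the built-in help option, while a multi-character short flag avoids the collision. A minimal standalone check (not the script's actual parser setup):

import argparse

parser = argparse.ArgumentParser()  # -h/--help is added automatically
parser.add_argument("--historical", "-hist", action="store_true",
                    help="fetch historical data")
args = parser.parse_args(["-hist"])
print(args.historical)  # True; -h still prints the usage text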

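Elsewhere in this commit, update_current_data gates fetching on check_most_recent_update_date, which lives in utils.py and is not shown in this diff; a rough, hypothetical sketch of the behaviour it is assumed to implement (reporting whether the dashboard's update date is already recorded in update_dates.txt):

import os

def check_most_recent_update_date(update_date, update_dates_file):
    # Hypothetical sketch, not the actual utils.py implementation.
    if not os.path.exists(update_dates_file):
        return False
    with open(update_dates_file) as f:
        seen_dates = {line.strip() for line in f}
    return update_date in seen_dates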