
Commit c1a70a2

Add in archived dashboards, and calculate start year from data
1 parent f7b40da commit c1a70a2

File tree

src/acquisition/rvdss/constants.py
src/acquisition/rvdss/pull_historic.py
src/acquisition/rvdss/utils.py

3 files changed: +26 -16 lines changed


src/acquisition/rvdss/constants.py

Lines changed: 3 additions & 1 deletion
@@ -76,6 +76,7 @@
 SEASON_BASE_URL = "https://www.canada.ca"
 ALTERNATIVE_SEASON_BASE_URL = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"
 HISTORIC_SEASON_REPORTS_URL = SEASON_BASE_URL+"/en/public-health/services/surveillance/respiratory-virus-detections-canada/{year_range}.html"
+DASHBOARD_ARCHIVED_DATES_URL= "https://health-infobase.canada.ca/src/js/respiratory-virus-detections/ArchiveData.json"
 
 # Each URL created here points to a list of all data reports made during that
 # season, e.g.
@@ -103,10 +104,11 @@
 DASHBOARD_UPDATE_DATE_FILE = "RVD_UpdateDate.csv"
 DASHBOARD_DATA_FILE = "RVD_WeeklyData.csv"
 
+
 RESP_DETECTIONS_OUTPUT_FILE = "respiratory_detections.csv"
 POSITIVE_TESTS_OUTPUT_FILE = "positive_tests.csv"
 COUNTS_OUTPUT_FILE = "number_of_detections.csv"
 
-LAST_WEEK_OF_YEAR = 35
+FIRST_WEEK_OF_YEAR = 35
 
 NOW = datetime.datetime.now()
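
Note: the renamed FIRST_WEEK_OF_YEAR = 35 marks the epidemiological week at which a new surveillance season starts, so weeks earlier than it fall in the calendar year after the season's start year. A minimal sketch of that mapping, mirroring get_report_date in pull_historic.py (the helper name week_to_calendar_year is hypothetical):

FIRST_WEEK_OF_YEAR = 35

def week_to_calendar_year(week, start_year):
    # Weeks 1..34 belong to the second calendar year of the season,
    # e.g. week 5 of the 2023-2024 season falls in 2024.
    return int(start_year) + 1 if week < FIRST_WEEK_OF_YEAR else int(start_year)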

src/acquisition/rvdss/pull_historic.py

Lines changed: 15 additions & 10 deletions
@@ -13,15 +13,16 @@
 from epiweeks import Week
 from datetime import datetime, timedelta
 import math
+from pandas.io.json import json_normalize
 
 from delphi.epidata.acquisition.rvdss.constants import (
     DASHBOARD_BASE_URLS_2023_2024_SEASON, HISTORIC_SEASON_URLS,
-    ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR,
-    RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE
+    ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, FIRST_WEEK_OF_YEAR,
+    RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE,DASHBOARD_ARCHIVED_DATES_URL
 )
 from delphi.epidata.acquisition.rvdss.utils import (
     abbreviate_virus, abbreviate_geo, create_geo_types, check_date_format,
-    get_revised_data, get_weekly_data
+    get_revised_data, get_weekly_data, fetch_dashboard_data
 )
 #%% Functions
 
@@ -78,7 +79,7 @@ def get_report_date(week,start_year,epi=False):
     epi - if True, return the date in cdc format (yearweek)
 
     """
-    if week < LAST_WEEK_OF_YEAR:
+    if week < FIRST_WEEK_OF_YEAR:
        year=int(start_year)+1
     else:
        year=int(start_year)
@@ -523,7 +524,7 @@ def fetch_one_season_from_report(url):
        positive_tables.append(pos_table)
 
    # create path to save files
-   path = "season_" + season[0]+"_"+season[1]
+   #path = "season_" + season[0]+"_"+season[1]
 
    # combine all the positive tables
    combined_positive_tables=pd.concat(positive_tables,axis=1)
@@ -548,9 +549,13 @@ def fetch_one_season_from_report(url):
        "count": all_number_tables,
    }
 
-def fetch_archived_dashboard_urls():
-    ## TODO: paste in Christine's code for scraping this list https://health-infobase.canada.ca/respiratory-virus-detections/archive.html
-    pass
+def fetch_archived_dashboard_urls(archive_url):
+    r=requests.get(archive_url)
+    values=r.json()
+    data=json_normalize(values)
+    archived_dates = data[data["lang"]=="en"]
+    return(archived_dates)
+
 
 def fetch_report_data():
     # Scrape each season.
@@ -560,7 +565,7 @@ def fetch_report_data():
 
 def fetch_historical_dashboard_data():
     # Update the end of the 2023-2024 season with the dashboard data
-    included_urls = fetch_archived_dashboard_urls()
-    dict_list = [fetch_dashboard_data(url, 2023) for url in included_urls]
+    included_urls = fetch_archived_dashboard_urls(DASHBOARD_ARCHIVED_DATES_URL)
+    dict_list = [fetch_dashboard_data(url) for url in included_urls]
 
     return dict_list
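
Note: fetch_archived_dashboard_urls now downloads the dashboard's archive index (ArchiveData.json), flattens the JSON with json_normalize, and keeps only the English-language entries; fetch_historical_dashboard_data then passes those entries on to fetch_dashboard_data. A hedged usage sketch, assuming the archive JSON exposes each snapshot's address in a column such as "url" (that column name is an assumption, not confirmed by this diff):

import requests
from pandas import json_normalize  # equivalent to the pandas.io.json import used above

from delphi.epidata.acquisition.rvdss.constants import DASHBOARD_ARCHIVED_DATES_URL

# Fetch and flatten the archive index, then keep English-language entries,
# as in fetch_archived_dashboard_urls.
archive_index = requests.get(DASHBOARD_ARCHIVED_DATES_URL).json()
archived_dates = json_normalize(archive_index)
english_entries = archived_dates[archived_dates["lang"] == "en"]

# Hypothetical: extract the per-snapshot URLs before handing each one to
# fetch_dashboard_data(url).
snapshot_urls = english_entries["url"].tolist()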

src/acquisition/rvdss/utils.py

Lines changed: 8 additions & 5 deletions
@@ -9,7 +9,7 @@
 import string
 
 from delphi.epidata.acquisition.rvdss.constants import (
-    VIRUSES, GEOS, REGIONS, NATION, LAST_WEEK_OF_YEAR,
+    VIRUSES, GEOS, REGIONS, NATION, FIRST_WEEK_OF_YEAR,
     DASHBOARD_UPDATE_DATE_FILE, DASHBOARD_DATA_FILE
 )
 
@@ -105,7 +105,7 @@ def get_revised_data(base_url,headers,update_date):
     return(df)
 
 ## TODO: the `start_year` arg is making calling this complicated. If we know that LAST_WEEK_OF_YEAR (really, of the season) is always 35, then we should be able to derive `start_year` from `update_date`.
-def get_weekly_data(base_url,start_year,headers,update_date):
+def get_weekly_data(base_url,headers,update_date):
     # Get current week and year
     summary_url = base_url + "RVD_SummaryText.csv"
     summary_url_response = requests.get(summary_url, headers=headers)
@@ -114,11 +114,14 @@ def get_weekly_data(base_url,start_year,headers,update_date):
     week_df = summary_df[(summary_df['Section'] == "summary") & (summary_df['Type']=="title")]
     week_string = week_df.iloc[0]['Text'].lower()
     current_week = int(re.search("week (.+?) ", week_string).group(1))
+    current_year= int(re.search("20\d{2}", week_string).group(0))
 
-    if current_week < LAST_WEEK_OF_YEAR:
+    '''
+    if current_week < FIRST_WEEK_OF_YEAR:
         current_year = start_year+1
     else:
         current_year = start_year
+    '''
 
     current_epiweek= Week(current_year,current_week)
 
@@ -150,15 +153,15 @@ def get_weekly_data(base_url,start_year,headers,update_date):
 
     return(df_weekly.set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']))
 
-def fetch_dashboard_data(url, start_year):
+def fetch_dashboard_data(url):
     """Get data from current or archived dashboard"""
     headers = {
         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
     }
 
     update_date = get_dashboard_update_date(url, headers)
 
-    weekly_data = get_weekly_data(url,start_year,headers,update_date)
+    weekly_data = get_weekly_data(url,headers,update_date)
     positive_data = get_revised_data(url,headers,update_date)
 
     ## TODO: how to "weekly" and "positive" correspond to the dict keys ("respiratory_detection", "positive", "count") from historical reports? Need to make sure keys used here are from the same set.
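
Note: the start_year parameter is dropped here because the calendar year can now be parsed directly from the dashboard's summary title text, alongside the week number. A small illustration of the extraction added in get_weekly_data (the title string below is illustrative only; the real text comes from RVD_SummaryText.csv):

import re
from epiweeks import Week

# Illustrative title text; the real string is read from the dashboard summary CSV.
week_string = "respiratory virus report, week 45 ending november 9 2024".lower()
current_week = int(re.search("week (.+?) ", week_string).group(1))   # -> 45
current_year = int(re.search(r"20\d{2}", week_string).group(0))      # -> 2024
current_epiweek = Week(current_year, current_week)                   # -> epiweek 2024-45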
