 import math

 from delphi.epidata.acquisition.rvdss.constants import (
-    DASHBOARD_BASE_URLS_2023_2024_SEASON, HISTORIC_SEASON_URL,
+    DASHBOARD_BASE_URLS_2023_2024_SEASON, HISTORIC_SEASON_URLS,
     ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR,
     RESP_COUNTS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE
 )
@@ -367,7 +367,7 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu=

     return(table)

-def get_season_reports(url):
+def fetch_one_season_from_report(url):
     # From the url, go to the main landing page for a season
     # which contains all the links to each week in the season
     page = requests.get(url)
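The renamed helper now returns data rather than writing it. For orientation, here is a minimal sketch of the landing-page scrape it builds on; the BeautifulSoup parsing and the link collection are assumptions, since the hunk only shows the `requests.get` call:

```python
# Sketch only: how the per-week report links might be collected from the
# season landing page. The actual selector logic is not shown in this diff.
import requests
from bs4 import BeautifulSoup

def get_weekly_report_links(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    # Keep every anchor href; the real code presumably filters these down
    # to the weekly report pages and derives weeks/end_dates from them.
    return [a["href"] for a in soup.find_all("a", href=True)]
```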
@@ -382,13 +382,13 @@ def get_season_reports(url):
     # create tables to hold all the data for the season
     all_positive_tables = pd.DataFrame()
     all_number_tables = pd.DataFrame()
-    all_respiratory_detection_table = pd.DataFrame()
+    all_respiratory_detection_tables = pd.DataFrame()

     for week_num in range(len(urls)):
         current_week = weeks[week_num]
         current_week_end = end_dates[week_num]

-        # In the 2019= 2020 season, the webpages for weeks 5 and 47 only have
+        # In the 2019-2020 season, the webpages for weeks 5 and 47 only have
         # the abbreviations table and the headers for the respiratory detections
         # table, so they are effectively empty, and skipped
         if season[0] == '2019':
@@ -532,8 +532,8 @@ def get_season_reports(url):
         # If not, add the weeks tables into the season table

         # check for deduplication pandas
-        if not respiratory_detection_table.index.isin(all_respiratory_detection_table.index).any():
-            all_respiratory_detection_table = pd.concat([all_respiratory_detection_table,respiratory_detection_table])
+        if not respiratory_detection_table.index.isin(all_respiratory_detection_tables.index).any():
+            all_respiratory_detection_tables = pd.concat([all_respiratory_detection_tables,respiratory_detection_table])

         if not combined_positive_tables.index.isin(all_positive_tables.index).any():
             all_positive_tables = pd.concat([all_positive_tables,combined_positive_tables])
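The deduplication guard above is worth a second look: a week's table is appended only when none of its index labels already appear in the season-level table. A toy illustration of the pattern, using a hypothetical `epiweek` index:

```python
import pandas as pd

# Season-level accumulator and a freshly scraped weekly table (toy data).
season = pd.DataFrame({"val": [1]}, index=pd.Index(["2019W01"], name="epiweek"))
week = pd.DataFrame({"val": [2]}, index=pd.Index(["2019W02"], name="epiweek"))

# Append only if no label from the weekly index is already present.
if not week.index.isin(season.index).any():
    season = pd.concat([season, week])
```

Note that `.any()` makes this all-or-nothing: if even one row of the weekly table was seen before, the entire week is skipped rather than just the duplicate rows.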
@@ -542,40 +542,28 @@ def get_season_reports(url):
         if not number_detections_table.index.isin(all_number_tables.index).any():
             all_number_tables = pd.concat([all_number_tables,number_detections_table])

-    # write files to csvs
-    all_respiratory_detection_table.to_csv(path+"/"+RESP_COUNTS_OUTPUT_FILE, index=True)
-    all_positive_tables.to_csv(path+"/"+POSITIVE_TESTS_OUTPUT_FILE, index=True)
-
-    # Write the number of detections table to csv if it exists (i.e has rows)
-    if len(all_number_tables) != 0:
-        all_number_tables.to_csv(path+"/number_of_detections.csv", index=True)
+    return {
+        "respiratory_detection": all_respiratory_detection_tables,
+        "positive": all_positive_tables,
+        "count": all_number_tables,
+    }

-def main():
-    # Scrape each season. Saves data to CSVs as a side effect.
-    [get_season_reports(url) for url in HISTORIC_SEASON_URL]
+def fetch_report_data():
+    # Scrape each season.
+    dict_list = [fetch_one_season_from_report(url) for url in HISTORIC_SEASON_URLS]

-    # Update the end of the 2023-2024 season with the dashboard data
+    return dict_list

-    # Load old csvs
-    old_detection_data = pd.read_csv('season_2023_2024/'+RESP_COUNTS_OUTPUT_FILE).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-    old_positive_data = pd.read_csv('season_2023_2024/'+POSITIVE_TESTS_OUTPUT_FILE).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+def fetch_historical_dashboard_data():
+    # Update the end of the 2023-2024 season with the dashboard data
+    included_urls = fetch_archived_dashboard_urls()
+    dict_list = [{} for url in included_urls]

-    for base_url in DASHBOARD_BASE_URLS_2023_2024_SEASON:
+    for i, base_url in enumerate(included_urls):
         # Get weekly dashboard data
-        weekly_data = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-        positive_data = get_revised_data(base_url)
-
-        # Check if indices are already present in the old data
-        # If not, add the new data
-        if not weekly_data.index.isin(old_detection_data.index).any():
-            old_detection_data = pd.concat([old_detection_data,weekly_data],axis=0)
-
-        if not positive_data.index.isin(old_positive_data.index).any():
-            old_positive_data = pd.concat([old_positive_data,positive_data],axis=0)
-
-    # Overwrite/update csvs
-    old_detection_data.to_csv('season_2023_2024/'+RESP_COUNTS_OUTPUT_FILE,index=True)
-    old_positive_data.to_csv('season_2023_2024/'+POSITIVE_TESTS_OUTPUT_FILE,index=True)
+        ## TODO: what to do with this "2023"? Need to parse the start year of the season from the URL
+        ## TODO: how do "weekly" and "positive" correspond to the dict keys from historical reports?
+        dict_list[i]["weekly"] = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+        dict_list[i]["positive"] = get_revised_data(base_url)

-if __name__ == '__main__':
-    main()
+    return dict_list
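With the CSV side effects gone, persisting the scraped tables becomes the caller's job. A hedged sketch of what that caller might look like, mirroring the `to_csv` calls removed above; `write_season_csvs` and its `path` argument are illustrative names, not part of this diff:

```python
from delphi.epidata.acquisition.rvdss.constants import (
    RESP_COUNTS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE
)

def write_season_csvs(season_dict, path):
    # Mirror the writes that used to live in get_season_reports/main.
    season_dict["respiratory_detection"].to_csv(path+"/"+RESP_COUNTS_OUTPUT_FILE, index=True)
    season_dict["positive"].to_csv(path+"/"+POSITIVE_TESTS_OUTPUT_FILE, index=True)
    # Only write the detection counts table if it has rows.
    if len(season_dict["count"]) != 0:
        season_dict["count"].to_csv(path+"/number_of_detections.csv", index=True)
```

On the first TODO, one plausible resolution is to pull the season's start year out of the archived dashboard URL itself; the regex and the assumption that a 4-digit year appears in the URL are both unverified here:

```python
import re

def start_year_from_url(base_url, default=2023):
    # Assumes the archived dashboard URL embeds the season's start year.
    match = re.search(r"(20\d{2})", base_url)
    return int(match.group(1)) if match else default
```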