@@ -13,16 +13,17 @@
 from epiweeks import Week
 from datetime import datetime, timedelta
 import math
-from pandas.io.json import json_normalize
 
-from delphi.epidata.acquisition.rvdss.constants import (
-    DASHBOARD_BASE_URLS_2023_2024_SEASON, HISTORIC_SEASON_URLS,
+from constants import (
+    HISTORIC_SEASON_URLS,
     ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, FIRST_WEEK_OF_YEAR,
-    RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE, DASHBOARD_ARCHIVED_DATES_URL
+    RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE, DASHBOARD_ARCHIVED_DATES_URL,
+    DASHBOARD_BASE_URL
 )
-from delphi.epidata.acquisition.rvdss.utils import (
+from utils import (
     abbreviate_virus, abbreviate_geo, create_geo_types, check_date_format,
-    get_revised_data, get_weekly_data, fetch_dashboard_data
+    get_positive_data, get_detections_data, fetch_dashboard_data, preprocess_table_columns,
+    make_signal_type_spelling_consistent, add_flu_prefix
 )
 #%% Functions
 
@@ -138,9 +139,9 @@ def get_modified_dates(soup, week_end_date):
     meta_tags = soup.find_all("meta", title="W3CDTF")
     for tag in meta_tags:
         if tag.get("name", None) == "dcterms.modified" or tag.get("property", None) == "dcterms.modified":
-            modified_date = tag.get("content", None)
+            date_modified = tag.get("content", None)
 
-    mod_date = datetime.strptime(modified_date, "%Y-%m-%d")
+    mod_date = datetime.strptime(date_modified, "%Y-%m-%d")
     week_date = datetime.strptime(week_end_date, "%Y-%m-%d")
 
     diff_days = (mod_date - week_date).days
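Note: a minimal, self-contained sketch of what this hunk computes, with an invented `<meta>` snippet standing in for a real weekly report page (only the variable rename changes here, not the behavior):

```python
from datetime import datetime
from bs4 import BeautifulSoup

# Invented fragment mimicking the metadata block of a weekly report page.
html = '<meta name="dcterms.modified" title="W3CDTF" content="2024-06-15">'
soup = BeautifulSoup(html, "html.parser")

date_modified = None
for tag in soup.find_all("meta", title="W3CDTF"):
    if tag.get("name", None) == "dcterms.modified" or tag.get("property", None) == "dcterms.modified":
        date_modified = tag.get("content", None)

mod_date = datetime.strptime(date_modified, "%Y-%m-%d")
week_date = datetime.strptime("2024-06-08", "%Y-%m-%d")  # assumed week_end_date
print((mod_date - week_date).days)  # 7
```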
@@ -184,65 +185,13 @@ def deduplicate_rows(table):
         new_table = table
     return(new_table)
 
-def add_flu_prefix(flu_subtype):
-    """ Add the prefix `flu` when only the subtype is reported """
+def drop_ah1_columns(table):
+    h1n1_column_exists = any([re.search("h1n1", c) for c in table.columns])
+    ah1_column_exists = any([re.search(r"ah1\b", c) for c in table.columns])
 
-    pat1 = r"^ah3"
-    pat2 = r"^auns"
-    pat3 = r"^ah1pdm09"
-    pat4 = r"^ah1n1pdm09"
-    combined_pat = '|'.join((pat1, pat2, pat3, pat4))
-
-    full_fluname = re.sub(combined_pat, r"flu\g<0>", flu_subtype)
-    return(full_fluname)
-
-def make_signal_type_spelling_consistent(signal):
-    """
-    Make the signal type (i.e. percent positive, number tests, total tests) have consistent spelling
-    Also remove total from signal names
-    """
-
-    pat1 = "positive"
-    pat2 = 'pos'
-    combined_pat = '|'.join((pat1, pat2))
-
-    pat3 = r"test\b"
-    pat4 = 'tested'
-    combined_pat2 = '|'.join((pat3, pat4))
-
-    new_signal = re.sub(combined_pat, "positive_tests", signal)
-    new_signal = re.sub(combined_pat2, "tests", new_signal)
-    new_signal = re.sub(" *%", "_pct_positive", new_signal)
-    new_signal = re.sub("total ", "", new_signal)
-    return(new_signal)
-
-def preprocess_table_columns(table):
-    """
-    Remove characters like . or * from columns
-    Abbreviate the viruses in columns
-    Change some naming of signals in columns (i.e. order of hpiv and other)
-    Change some naming of locations in columns (i.e. at instead of atl)
-    """
-    table.columns = [re.sub("\xa0", " ", col) for col in table.columns] # \xa0 to space
-    table.columns = [re.sub("(.*?)(\.\d+)", "\\1", c) for c in table.columns] # remove .# for duplicated columns
-    table.columns = [re.sub("\.", "", s) for s in table.columns] # remove periods
-    table.columns = [re.sub(r"\((all)\)", "", s) for s in table.columns] # remove (all)
-    table.columns = [re.sub(r"\s*\(|\)", "", s) for s in table.columns]
-    table.columns = [re.sub(' +', ' ', col) for col in table.columns] # make any multiple spaces into one space
-    table.columns = [re.sub(r'\(|\)', '', col) for col in table.columns] # remove ( and )
-    table.columns = [re.sub(r'/', '_', col) for col in table.columns] # replace / with _
-
-    table.columns = [re.sub(r"^at\b", "atl", t) for t in table.columns]
-    table.columns = [re.sub("canada", "can", t) for t in table.columns]
-
-    table.columns = [re.sub(r"h1n1 2009|h1n12009", "ah1n1pdm09", s) for s in table.columns]
-    table.columns = [abbreviate_virus(col) for col in table.columns] # abbreviate viruses
-    table.columns = [re.sub(r"flu a", "flua", t) for t in table.columns]
-    table.columns = [re.sub(r"flu b", "flub", t) for t in table.columns]
-    table.columns = [re.sub("flutest", "flu test", col) for col in table.columns]
-    table.columns = [re.sub(r"other hpiv", "hpivother", t) for t in table.columns]
-
-    table.columns = [make_signal_type_spelling_consistent(col) for col in table.columns]
+    if ah1_column_exists and h1n1_column_exists:
+        column_name_to_drop = list(table.filter(regex=r'ah1\b'))
+        table.drop(columns=column_name_to_drop, inplace=True)
     return(table)
 
 def create_detections_table(table, modified_date, week_number, week_end_date, start_year):
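For context, here is the new helper on a toy table (column names are invented; real RVDSS headers differ). The `\b` in the regex is what keeps `ah1n1pdm09` columns from being caught:

```python
import re
import pandas as pd

# Toy columns: "ah1" duplicates "h1n1", while "ah1n1pdm09" must survive.
table = pd.DataFrame(columns=["can ah1 tests", "can ah1n1pdm09 tests", "can h1n1 tests"])

h1n1_column_exists = any([re.search("h1n1", c) for c in table.columns])
ah1_column_exists = any([re.search(r"ah1\b", c) for c in table.columns])

if ah1_column_exists and h1n1_column_exists:
    # filter(regex=...) selects only the bare-"ah1" columns to drop.
    table.drop(columns=list(table.filter(regex=r"ah1\b")), inplace=True)

print(list(table.columns))  # ['can ah1n1pdm09 tests', 'can h1n1 tests']
```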
@@ -400,6 +349,7 @@ def fetch_one_season_from_report(url):
        temp_url = urls[week_num]
        temp_page = requests.get(temp_url)
        new_soup = BeautifulSoup(temp_page.text, 'html.parser')
+
        captions = extract_captions_of_interest(new_soup)
        modified_date = get_modified_dates(new_soup, current_week_end)
 
@@ -432,7 +382,7 @@
 
        # Read table, coding all the abbreviations for missing data into NA
        # Also use dropna because removing footers causes the html to have an empty row
-       na_values = ['N.A.', 'N.A', 'N.C.', 'N.R.', 'Not Available', 'Not Tested', "N.D.", "-"]
+       na_values = ['N.A.', 'N.A', 'N.C.', 'N.R.', 'Not Available', 'Not Tested', "not available", "not tested", " N.D.", "-"]
        table = pd.read_html(tab, na_values=na_values)[0].dropna(how="all")
 
        # Check for multiline headers
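A hedged sketch of what the widened na_values list buys: lowercase variants in scraped tables now also land as NaN (the HTML fragment is made up):

```python
from io import StringIO
import pandas as pd

tab = StringIO("""<table>
<tr><th>week</th><th>flua tests</th></tr>
<tr><td>34</td><td>not tested</td></tr>
<tr><td>35</td><td>N.A.</td></tr>
</table>""")

na_values = ['N.A.', 'N.A', 'N.C.', 'N.R.', 'Not Available', 'Not Tested',
             "not available", "not tested", " N.D.", "-"]
table = pd.read_html(tab, na_values=na_values)[0].dropna(how="all")
print(table["flua tests"].isna().tolist())  # [True, True]
```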
@@ -469,6 +419,9 @@ def fetch_one_season_from_report(url):
        # a date is written as 022-09-03, instead of 2022-09-03
        table.loc[table['week'] == 35, 'week end'] = "2022-09-03"
 
+       # Check if both ah1 and h1n1 are given. If so, drop ah1 since they are the same virus and ah1 is always empty
+       table = drop_ah1_columns(table)
+
        # Rename columns
        table = preprocess_table_columns(table)
 
@@ -549,11 +502,13 @@ def fetch_one_season_from_report(url):
        "count": all_number_tables,
    }
 
-def fetch_archived_dashboard_urls(archive_url):
+def fetch_archived_dashboard_dates(archive_url):
    r = requests.get(archive_url)
    values = r.json()
-   data = json_normalize(values)
-   archived_dates = data[data["lang"] == "en"]
+   data = pd.json_normalize(values)
+   english_data = data[data["lang"] == "en"]
+
+   archived_dates = english_data['date'].to_list()
    return(archived_dates)
 
 
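A sketch of the reworked helper on an invented payload; the real archive endpoint is only assumed (per this code) to return objects with `lang` and `date` fields:

```python
import pandas as pd

# Invented stand-in for the JSON served at DASHBOARD_ARCHIVED_DATES_URL.
values = [
    {"lang": "en", "date": "2024-08-27"},
    {"lang": "fr", "date": "2024-08-27"},
    {"lang": "en", "date": "2024-09-03"},
]

data = pd.json_normalize(values)
english_data = data[data["lang"] == "en"]
print(english_data["date"].to_list())  # ['2024-08-27', '2024-09-03']
```

Returning a plain list of date strings (rather than the old filtered DataFrame) is what lets the caller below build one URL per archive date.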
@@ -565,7 +520,9 @@ def fetch_report_data():
 
 def fetch_historical_dashboard_data():
     # Update the end of the 2023-2024 season with the dashboard data
-    included_urls = fetch_archived_dashboard_urls(DASHBOARD_ARCHIVED_DATES_URL)
-    dict_list = [fetch_dashboard_data(url) for url in included_urls]
+    archived_dates = fetch_archived_dashboard_dates(DASHBOARD_ARCHIVED_DATES_URL)
+
+    archived_urls = [DASHBOARD_BASE_URL + "archive/" + date + "/" for date in archived_dates]
+    dict_list = [fetch_dashboard_data(url) for url in archived_urls]
 
     return dict_list
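And the corresponding URL construction, using a placeholder since DASHBOARD_BASE_URL's actual value lives in constants.py and is not shown in this diff:

```python
# Placeholder; the real DASHBOARD_BASE_URL is imported from constants.py.
DASHBOARD_BASE_URL = "https://example.canada.ca/dashboard/"

archived_dates = ["2024-08-27", "2024-09-03"]
archived_urls = [DASHBOARD_BASE_URL + "archive/" + date + "/" for date in archived_dates]
print(archived_urls)
# ['https://example.canada.ca/dashboard/archive/2024-08-27/',
#  'https://example.canada.ca/dashboard/archive/2024-09-03/']
```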