Commit f9b9842

address todos and fix historical fetching
1 parent c1a70a2 commit f9b9842

4 files changed: +207 -159 lines changed


src/acquisition/rvdss/constants.py

Lines changed: 5 additions & 2 deletions
@@ -36,7 +36,7 @@
     "saskatchewan":"sk",
     "alberta": "ab",
     "british columbia" :"bc",
-    "yukon" : "yk",
+    "yukon" : "yt",
     "northwest territories" : "nt",
     "nunavut" : "nu",
     "canada":"ca",
@@ -56,6 +56,8 @@
 # Construct dashboard and data report URLS.
 DASHBOARD_BASE_URL = "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/"
 DASHBOARD_W_DATE_URL = DASHBOARD_BASE_URL + "archive/{date}/"
+
+# May not need this since we write a function for this in pull_historic
 DASHBOARD_BASE_URLS_2023_2024_SEASON = (
     DASHBOARD_W_DATE_URL.format(date = date) for date in
     (
@@ -111,4 +113,5 @@

 FIRST_WEEK_OF_YEAR = 35

-NOW = datetime.datetime.now()
+UPDATE_DATES_FILE = "update_dates.txt"
+NOW = datetime.now()
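For reference, the archive URL template above expands by substituting a date string into the {date} placeholder; a minimal sketch (the date value is illustrative, not taken from the real archive listing):

DASHBOARD_BASE_URL = "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/"
DASHBOARD_W_DATE_URL = DASHBOARD_BASE_URL + "archive/{date}/"

url = DASHBOARD_W_DATE_URL.format(date="2024-06-13")  # illustrative date
# -> "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-13/"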

src/acquisition/rvdss/pull_historic.py

Lines changed: 29 additions & 72 deletions
@@ -13,16 +13,17 @@
 from epiweeks import Week
 from datetime import datetime, timedelta
 import math
-from pandas.io.json import json_normalize

-from delphi.epidata.acquisition.rvdss.constants import (
-    DASHBOARD_BASE_URLS_2023_2024_SEASON, HISTORIC_SEASON_URLS,
+from constants import (
+    HISTORIC_SEASON_URLS,
     ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, FIRST_WEEK_OF_YEAR,
-    RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE,DASHBOARD_ARCHIVED_DATES_URL
+    RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE,DASHBOARD_ARCHIVED_DATES_URL,
+    DASHBOARD_BASE_URL
 )
-from delphi.epidata.acquisition.rvdss.utils import (
+from utils import (
     abbreviate_virus, abbreviate_geo, create_geo_types, check_date_format,
-    get_revised_data, get_weekly_data, fetch_dashboard_data
+    get_positive_data, get_detections_data, fetch_dashboard_data,preprocess_table_columns,
+    make_signal_type_spelling_consistent,add_flu_prefix
 )
 #%% Functions

@@ -138,9 +139,9 @@ def get_modified_dates(soup,week_end_date):
     meta_tags=soup.find_all("meta",title="W3CDTF")
     for tag in meta_tags:
         if tag.get("name", None) == "dcterms.modified" or tag.get("property", None) == "dcterms.modified":
-            modified_date = tag.get("content", None)
+            date_modified = tag.get("content", None)

-    mod_date = datetime.strptime(modified_date, "%Y-%m-%d")
+    mod_date = datetime.strptime(date_modified, "%Y-%m-%d")
     week_date = datetime.strptime(week_end_date, "%Y-%m-%d")

     diff_days = (mod_date-week_date).days
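For context, get_modified_dates reads the page's dcterms.modified meta tag; a minimal sketch of the parsing step (the HTML snippet and date below are illustrative):

from bs4 import BeautifulSoup
from datetime import datetime

html = '<meta name="dcterms.modified" title="W3CDTF" content="2024-06-15">'  # illustrative tag
soup = BeautifulSoup(html, "html.parser")
tag = soup.find("meta", title="W3CDTF")
date_modified = tag.get("content", None)
mod_date = datetime.strptime(date_modified, "%Y-%m-%d")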
@@ -184,65 +185,13 @@ def deduplicate_rows(table):
         new_table=table
     return(new_table)

-def add_flu_prefix(flu_subtype):
-    """ Add the prefix `flu` when only the subtype is reported """
+def drop_ah1_columns(table):
+    h1n1_column_exists = any([re.search("h1n1",c) for c in table.columns])
+    ah1_column_exists = any([re.search(r"ah1\b",c) for c in table.columns])

-    pat1 =r"^ah3"
-    pat2= r"^auns"
-    pat3= r"^ah1pdm09"
-    pat4= r"^ah1n1pdm09"
-    combined_pat = '|'.join((pat1, pat2,pat3,pat4))
-
-    full_fluname = re.sub(combined_pat, r"flu\g<0>",flu_subtype)
-    return(full_fluname)
-
-def make_signal_type_spelling_consistent(signal):
-    """
-    Make the signal type (i.e. percent positive, number tests, total tests) have consistent spelling
-    Also remove total from signal names
-    """
-
-    pat1 = "positive"
-    pat2 = 'pos'
-    combined_pat = '|'.join((pat1, pat2))
-
-    pat3 = r"test\b"
-    pat4 = 'tested'
-    combined_pat2 = '|'.join((pat3, pat4))
-
-    new_signal = re.sub(combined_pat, "positive_tests",signal)
-    new_signal = re.sub(combined_pat2, "tests",new_signal)
-    new_signal =re.sub(" *%", "_pct_positive",new_signal)
-    new_signal = re.sub("total ", "",new_signal)
-    return(new_signal)
-
-def preprocess_table_columns(table):
-    """
-    Remove characters like . or * from columns
-    Abbreviate the viruses in columns
-    Change some naming of signals in columns (i.e order of hpiv and other)
-    Change some naming of locations in columns (i.e at instead of atl)
-    """
-    table.columns = [re.sub("\xa0"," ", col) for col in table.columns] # \xa0 to space
-    table.columns = [re.sub("(.*?)(\.\d+)", "\\1", c) for c in table.columns] # remove .# for duplicated columns
-    table.columns =[re.sub("\.", "", s)for s in table.columns] #remove periods
-    table.columns =[re.sub(r"\((all)\)", "", s)for s in table.columns] # remove (all)
-    table.columns =[re.sub(r"\s*\(|\)", "", s)for s in table.columns]
-    table.columns = [re.sub(' +', ' ', col) for col in table.columns] # Make any muliple spaces into one space
-    table.columns = [re.sub(r'\(|\)', '', col) for col in table.columns] # replace () for _
-    table.columns = [re.sub(r'/', '_', col) for col in table.columns] # replace / with _
-
-    table.columns = [re.sub(r"^at\b","atl ",t) for t in table.columns]
-    table.columns = [re.sub("canada","can",t) for t in table.columns]
-
-    table.columns =[re.sub(r"h1n1 2009 |h1n12009", "ah1n1pdm09", s)for s in table.columns]
-    table.columns =[abbreviate_virus(col) for col in table.columns] # abbreviate viruses
-    table.columns = [re.sub(r"flu a","flua",t) for t in table.columns]
-    table.columns = [re.sub(r"flu b","flub",t) for t in table.columns]
-    table.columns = [re.sub("flutest","flu test", col) for col in table.columns]
-    table.columns = [re.sub(r"other hpiv","hpivother",t) for t in table.columns]
-
-    table.columns=[make_signal_type_spelling_consistent(col) for col in table.columns]
+    if ah1_column_exists and h1n1_column_exists:
+        column_name_to_drop = list(table.filter(regex=r'ah1\b'))
+        table.drop(columns = column_name_to_drop,inplace=True)
     return(table)

 def create_detections_table(table,modified_date,week_number,week_end_date,start_year):
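A quick illustration of the new drop_ah1_columns helper (it relies on the function defined above; the column names in this toy frame are made up, not taken from a real report table):

import pandas as pd

toy = pd.DataFrame({
    "can ah1 tests": [None, None],       # bare ah1 column, empty in practice
    "can ah1n1pdm09 tests": [10, 12],    # matching h1n1 column
    "week": [35, 36],
})
toy = drop_ah1_columns(toy)
print(toy.columns.tolist())  # ['can ah1n1pdm09 tests', 'week'] -- the ah1 column is dropped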
@@ -400,6 +349,7 @@ def fetch_one_season_from_report(url):
         temp_url=urls[week_num]
         temp_page=requests.get(temp_url)
         new_soup = BeautifulSoup(temp_page.text, 'html.parser')
+
         captions = extract_captions_of_interest(new_soup)
         modified_date = get_modified_dates(new_soup,current_week_end)

432382

433383
# Read table, coding all the abbreviations for missing data into NA
434384
# Also use dropna because removing footers causes the html to have an empty row
435-
na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"N.D.","-"]
385+
na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"not available","not tested","N.D.","-"]
436386
table = pd.read_html(tab,na_values=na_values)[0].dropna(how="all")
437387

438388
# Check for multiline headers
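For reference, pd.read_html maps these placeholder strings to missing values; a small self-contained check (the HTML fragment is made up for the example):

from io import StringIO
import pandas as pd

html = "<table><tr><th>week</th><th>flu a tests</th></tr><tr><td>35</td><td>N.R.</td></tr></table>"
na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"not available","not tested","N.D.","-"]
df = pd.read_html(StringIO(html), na_values=na_values)[0].dropna(how="all")
# df["flu a tests"].isna().all() is True -- "N.R." was read as missing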
@@ -469,6 +419,9 @@ def fetch_one_season_from_report(url):
                 # a date is written as 022-09-03, instead of 2022-09-03
                 table.loc[table['week'] == 35, 'week end'] = "2022-09-03"

+            # check if both ah1 and h1n1 are given. If so drop one since they are the same virus and ah1 is always empty
+            table = drop_ah1_columns(table)
+
             # Rename columns
             table= preprocess_table_columns(table)

@@ -549,11 +502,13 @@ def fetch_one_season_from_report(url):
         "count": all_number_tables,
     }

-def fetch_archived_dashboard_urls(archive_url):
+def fetch_archived_dashboard_dates(archive_url):
     r=requests.get(archive_url)
     values=r.json()
-    data=json_normalize(values)
-    archived_dates = data[data["lang"]=="en"]
+    data=pd.json_normalize(values)
+    english_data = data[data["lang"]=="en"]
+
+    archived_dates=english_data['date'].to_list()
     return(archived_dates)

@@ -565,7 +520,9 @@ def fetch_report_data():

 def fetch_historical_dashboard_data():
     # Update the end of the 2023-2024 season with the dashboard data
-    included_urls = fetch_archived_dashboard_urls(DASHBOARD_ARCHIVED_DATES_URL)
-    dict_list = [fetch_dashboard_data(url) for url in included_urls]
+    archived_dates = fetch_archived_dashboard_dates(DASHBOARD_ARCHIVED_DATES_URL)
+
+    archived_urls= [DASHBOARD_BASE_URL + "archive/"+ date+"/" for date in archived_dates]
+    dict_list = [fetch_dashboard_data(url) for url in archived_urls]

     return dict_list
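The archive listing endpoint is assumed here to return JSON records with at least lang and date fields (the field names come from the filter in fetch_archived_dashboard_dates; the sample payload below is hypothetical):

import pandas as pd

values = [
    {"lang": "en", "date": "2024-06-13"},  # hypothetical entries
    {"lang": "fr", "date": "2024-06-13"},
]
data = pd.json_normalize(values)
archived_dates = data[data["lang"] == "en"]["date"].to_list()
# -> ["2024-06-13"], which fetch_historical_dashboard_data turns into archive URLs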

src/acquisition/rvdss/run.py

Lines changed: 62 additions & 40 deletions
@@ -6,49 +6,66 @@

 import pandas as pd
 import os
+import argparse

-from delphi.epidata.acquisition.rvdss.utils import get_weekly_data, get_revised_data, get_dashboard_update_date
-from delphi.epidata.acquisition.rvdss.constants import DASHBOARD_BASE_URL, RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE, COUNTS_OUTPUT_FILE
-
+from utils import fetch_dashboard_data, check_most_recent_update_date,get_dashboard_update_date
+from constants import DASHBOARD_BASE_URL, RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE, COUNTS_OUTPUT_FILE,UPDATE_DATES_FILE
+from pull_historic import fetch_report_data,fetch_historical_dashboard_data

 def update_current_data():
-    ## TODO: what is the base path for these files?
-    base_path = "."
-
-    data_dict = fetch_dashboard_data(DASHBOARD_BASE_URL, 2024)
-
-    table_types = {
-        "respiratory_detection": RESP_DETECTIONS_OUTPUT_FILE,
-        "positive": POSITIVE_TESTS_OUTPUT_FILE,
-        # "count": COUNTS_OUTPUT_FILE, # Dashboards don't contain this data.
+
+    ## Check if data for current update date has already been fetched
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
     }
-    for tt in table_types.keys():
-        data = data_dict[table_types]
-
-        # Write the tables to separate csvs
-        path = base_path + "/" + table_types[tt]
-
-        # Since this function generates new data weekly, we need to combine it with the existing data, if it exists.
-        if not os.path.exists(path):
-            data.to_csv(path,index=True)
-        else:
-            old_data = pd.read_csv(path).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-
-            # If index already exists in the data on disk, don't add the new data -- we may have already run the weekly data fetch.
-            ## TODO: The check on index maybe should be stricter? Although we do deduplication upstream, so this probably won't find true duplicates
-            if not data.index.isin(old_data.index).any():
-                old_data= pd.concat([old_data,data],axis=0)
-                old_data.to_csv(path,index=True)
-
-    # ## TODO
-    # update_database(data)
-

+    update_date = get_dashboard_update_date(DASHBOARD_BASE_URL, headers)
+    already_updated = check_most_recent_update_date(update_date,UPDATE_DATES_FILE)
+
+    if not already_updated:
+        with open(UPDATE_DATES_FILE, 'a') as testfile:
+            testfile.write(update_date+ "\n")
+
+        ## TODO: what is the base path for these files?
+        base_path = "."
+
+        data_dict = fetch_dashboard_data(DASHBOARD_BASE_URL)
+
+        table_types = {
+            "respiratory_detection": RESP_DETECTIONS_OUTPUT_FILE,
+            "positive": POSITIVE_TESTS_OUTPUT_FILE,
+            # "count": COUNTS_OUTPUT_FILE, # Dashboards don't contain this data.
+        }
+        for tt in table_types.keys():
+            data = data_dict[tt]
+
+            # Write the tables to separate csvs
+            path = base_path + "/" + table_types[tt]
+
+            # Since this function generates new data weekly, we need to combine it with the existing data, if it exists.
+            if not os.path.exists(path):
+                data.to_csv(path,index=True)
+            else:
+                old_data = pd.read_csv(path).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+
+                # If index already exists in the data on disk, don't add the new data -- we may have already run the weekly data fetch.
+                ## TODO: The check on index maybe should be stricter? Although we do deduplication upstream, so this probably won't find true duplicates
+                if not data.index.isin(old_data.index).any():
+                    old_data= pd.concat([old_data,data],axis=0)
+                    old_data.to_csv(path,index=True)
+
+        # ## TODO
+        # update_database(data)
+    else:
+        print("Data is already up to date")
+
 def update_historical_data():
     ## TODO: what is the base path for these files?
     base_path = "."

-    report_dict_list = fetch_report_data()
+    report_dict_list = fetch_report_data() # a dict for every season, and every seasonal dict has 2/3 tables inside
+
+    # a dict with an entry for every week that has an archival dashboard, and each entry has 2/3 tables
     dashboard_dict_list = fetch_historical_dashboard_data()

     table_types = {
@@ -58,12 +75,17 @@ def update_historical_data():
     }
     for tt in table_types.keys():
         # Merge tables together from dashboards and reports for each table type.
-        dashboard_data = [elem.get(tt, None) for elem in dashboard_dict_list]
-        report_data = [elem.get(tt, None) for elem in report_dict_list]
-        data = [report_data, dashboard_data].concat()
-
+        dashboard_data = [elem.get(tt, pd.DataFrame()) for elem in dashboard_dict_list] # a list of all the dashboard tables
+        report_data = [elem.get(tt, None) for elem in report_dict_list] # a list of the report table
+
+        all_report_tables = pd.concat(report_data)
+        all_dashboard_tables = pd.concat(dashboard_data)
+
+        data = pd.concat([all_report_tables, all_dashboard_tables])
+
         # Write the tables to separate csvs
-        data.to_csv(base_path +"/" + table_types[tt], index=True)
+        if not data.empty:
+            data.to_csv(base_path +"/" + table_types[tt], index=True)

         # ## TODO
         # update_database(data)
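Side note on the fix above: the old line called .concat() on a plain Python list, which has no such method; pd.concat is the correct call, and the pd.DataFrame() default keeps the merge working when a dashboard week lacks a given table type. A tiny illustration with toy frames:

import pandas as pd

frames = [pd.DataFrame({"x": [1]}), pd.DataFrame()]  # one real table, one missing
combined = pd.concat(frames)  # succeeds; one row of data, no error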
@@ -81,7 +103,7 @@ def main():
     )
     parser.add_argument(
         "--historical",
-        "-h",
+        "-hist",
         action="store_true",
         help="fetch historical data, that is, data for all available time periods other than the latest epiweek"
     )
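On the "-h" to "-hist" change: argparse registers -h/--help automatically, so a custom "-h" short flag conflicts with the built-in help option, while a multi-character short flag avoids the collision. A minimal standalone check (not the script's actual parser setup):

import argparse

parser = argparse.ArgumentParser()  # -h/--help is added automatically
parser.add_argument("--historical", "-hist", action="store_true",
                    help="fetch historical data")
args = parser.parse_args(["-hist"])
print(args.historical)  # True; -h still prints the usage text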

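Elsewhere in this commit, update_current_data gates fetching on check_most_recent_update_date, which lives in utils.py and is not shown in this diff; a rough, hypothetical sketch of the behaviour it is assumed to implement (reporting whether the dashboard's update date is already recorded in update_dates.txt):

import os

def check_most_recent_update_date(update_date, update_dates_file):
    # Hypothetical sketch, not the actual utils.py implementation.
    if not os.path.exists(update_dates_file):
        return False
    with open(update_dates_file) as f:
        seen_dates = {line.strip() for line in f}
    return update_date in seen_dates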