 import math

 from delphi.epidata.acquisition.rvdss.constants import (
-    DASHBOARD_BASE_URLS_2023_2024_SEASON, HISTORIC_SEASON_URL,
+    DASHBOARD_BASE_URLS_2023_2024_SEASON, HISTORIC_SEASON_URLS,
     ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR,
     RESP_COUNTS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE
 )
@@ -367,7 +367,7 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu=

     return(table)

-def get_season_reports(url):
+def fetch_one_season_from_report(url):
     # From the url, go to the main landing page for a season
     # which contains all the links to each week in the season
     page = requests.get(url)
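The renamed helper now returns data rather than writing it. For orientation, here is a minimal sketch of the landing-page scrape it builds on; the BeautifulSoup parsing and the link collection are assumptions, since the hunk only shows the `requests.get` call:

```python
# Sketch only: how the per-week report links might be collected from the
# season landing page. The actual selector logic is not shown in this diff.
import requests
from bs4 import BeautifulSoup

def get_weekly_report_links(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    # Keep every anchor href; the real code presumably filters these down
    # to the weekly report pages and derives weeks/end_dates from them.
    return [a["href"] for a in soup.find_all("a", href=True)]
```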
@@ -382,13 +382,13 @@ def get_season_reports(url):
     # create tables to hold all the data for the season
     all_positive_tables = pd.DataFrame()
     all_number_tables = pd.DataFrame()
-    all_respiratory_detection_table = pd.DataFrame()
+    all_respiratory_detection_tables = pd.DataFrame()

     for week_num in range(len(urls)):
         current_week = weeks[week_num]
         current_week_end = end_dates[week_num]

-        # In the 2019= 2020 season, the webpages for weeks 5 and 47 only have
+        # In the 2019-2020 season, the webpages for weeks 5 and 47 only have
         # the abbreviations table and the headers for the respiratory detections
         # table, so they are effectively empty, and skipped
         if season[0] == '2019':
@@ -532,8 +532,8 @@ def get_season_reports(url):
         # If not, add the weeks tables into the season table

         # check for deduplication pandas
-        if not respiratory_detection_table.index.isin(all_respiratory_detection_table.index).any():
-            all_respiratory_detection_table = pd.concat([all_respiratory_detection_table,respiratory_detection_table])
+        if not respiratory_detection_table.index.isin(all_respiratory_detection_tables.index).any():
+            all_respiratory_detection_tables = pd.concat([all_respiratory_detection_tables,respiratory_detection_table])

         if not combined_positive_tables.index.isin(all_positive_tables.index).any():
             all_positive_tables = pd.concat([all_positive_tables,combined_positive_tables])
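The deduplication guard above is worth a second look: a week's table is appended only when none of its index labels already appear in the season-level table. A toy illustration of the pattern, using a hypothetical `epiweek` index:

```python
import pandas as pd

# Season-level accumulator and a freshly scraped weekly table (toy data).
season = pd.DataFrame({"val": [1]}, index=pd.Index(["2019W01"], name="epiweek"))
week = pd.DataFrame({"val": [2]}, index=pd.Index(["2019W02"], name="epiweek"))

# Append only if no label from the weekly index is already present.
if not week.index.isin(season.index).any():
    season = pd.concat([season, week])
```

Note that `.any()` makes this all-or-nothing: if even one row of the weekly table was seen before, the entire week is skipped rather than just the duplicate rows.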
@@ -542,40 +542,28 @@ def get_season_reports(url):
         if not number_detections_table.index.isin(all_number_tables.index).any():
             all_number_tables = pd.concat([all_number_tables,number_detections_table])

-    # write files to csvs
-    all_respiratory_detection_table.to_csv(path+"/"+RESP_COUNTS_OUTPUT_FILE, index=True)
-    all_positive_tables.to_csv(path+"/"+POSITIVE_TESTS_OUTPUT_FILE, index=True)
-
-    # Write the number of detections table to csv if it exists (i.e has rows)
-    if len(all_number_tables) != 0:
-        all_number_tables.to_csv(path+"/number_of_detections.csv", index=True)
+    return {
+        "respiratory_detection": all_respiratory_detection_tables,
+        "positive": all_positive_tables,
+        "count": all_number_tables,
+    }

-def main():
-    # Scrape each season. Saves data to CSVs as a side effect.
-    [get_season_reports(url) for url in HISTORIC_SEASON_URL]
+def fetch_report_data():
+    # Scrape each season.
+    dict_list = [fetch_one_season_from_report(url) for url in HISTORIC_SEASON_URLS]

-    # Update the end of the 2023-2024 season with the dashboard data
+    return dict_list

-    # Load old csvs
-    old_detection_data = pd.read_csv('season_2023_2024/'+RESP_COUNTS_OUTPUT_FILE).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-    old_positive_data = pd.read_csv('season_2023_2024/'+POSITIVE_TESTS_OUTPUT_FILE).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+def fetch_historical_dashboard_data():
+    # Update the end of the 2023-2024 season with the dashboard data
+    included_urls = fetch_archived_dashboard_urls()
+    dict_list = [{} for url in included_urls]

-    for base_url in DASHBOARD_BASE_URLS_2023_2024_SEASON:
+    for i, base_url in enumerate(included_urls):
         # Get weekly dashboard data
-        weekly_data = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-        positive_data = get_revised_data(base_url)
-
-        # Check if indices are already present in the old data
-        # If not, add the new data
-        if not weekly_data.index.isin(old_detection_data.index).any():
-            old_detection_data = pd.concat([old_detection_data,weekly_data],axis=0)
-
-        if not positive_data.index.isin(old_positive_data.index).any():
-            old_positive_data = pd.concat([old_positive_data,positive_data],axis=0)
-
-    # Overwrite/update csvs
-    old_detection_data.to_csv('season_2023_2024/'+RESP_COUNTS_OUTPUT_FILE,index=True)
-    old_positive_data.to_csv('season_2023_2024/'+POSITIVE_TESTS_OUTPUT_FILE,index=True)
+        ## TODO: what to do with this "2023"? Need to parse the start year of the season from the URL
+        ## TODO: how do "weekly" and "positive" correspond to the dict keys from historical reports?
+        dict_list[i]["weekly"] = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+        dict_list[i]["positive"] = get_revised_data(base_url)

-if __name__ == '__main__':
-    main()
+    return dict_list
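With the CSV side effects gone, persisting the scraped tables becomes the caller's job. A hedged sketch of what that caller might look like, mirroring the `to_csv` calls removed above; `write_season_csvs` and its `path` argument are illustrative names, not part of this diff:

```python
from delphi.epidata.acquisition.rvdss.constants import (
    RESP_COUNTS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE
)

def write_season_csvs(season_dict, path):
    # Mirror the writes that used to live in get_season_reports/main.
    season_dict["respiratory_detection"].to_csv(path+"/"+RESP_COUNTS_OUTPUT_FILE, index=True)
    season_dict["positive"].to_csv(path+"/"+POSITIVE_TESTS_OUTPUT_FILE, index=True)
    # Only write the detection counts table if it has rows.
    if len(season_dict["count"]) != 0:
        season_dict["count"].to_csv(path+"/number_of_detections.csv", index=True)
```

On the first TODO, one plausible resolution is to pull the season's start year out of the archived dashboard URL itself; the regex and the assumption that a 4-digit year appears in the URL are both unverified here:

```python
import re

def start_year_from_url(base_url, default=2023):
    # Assumes the archived dashboard URL embeds the season's start year.
    match = re.search(r"(20\d{2})", base_url)
    return int(match.group(1)) if match else default
```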