2085 add proportions nhsn #2111
Changes from 14 commits
```diff
@@ -1,30 +1,87 @@
 # -*- coding: utf-8 -*-
 """Functions for pulling NSSP ER data."""
 import logging
+import random
+import time
+from datetime import datetime, timedelta
 from pathlib import Path
 from typing import Optional
+from urllib.error import HTTPError

 import pandas as pd
+from delphi_epidata import Epidata
 from delphi_utils import create_backup_csv
+from epiweeks import Week
 from sodapy import Socrata

 from .constants import MAIN_DATASET_ID, PRELIM_DATASET_ID, PRELIM_SIGNALS_MAP, PRELIM_TYPE_DICT, SIGNALS_MAP, TYPE_DICT


-def pull_data(socrata_token: str, dataset_id: str):
+def get_latest_nhsn_issue_date() -> datetime:
+    """Check the API and return the approximate latest issue date available."""
+    response = Epidata.covidcast_meta()
+    meta_df = pd.DataFrame(response["epidata"])
+    nhsn_df = meta_df[meta_df["data_source"] == "nhsn"]
+    last_updated = datetime.utcfromtimestamp(min(nhsn_df.last_update))
+    max_issue = Week.fromstring(str(min(nhsn_df.max_issue)))
+    # last_updated can be multiple days off from max_issue due to patching / other processes
+    if max_issue.startdate() <= last_updated.date() <= max_issue.enddate():
+        return last_updated
+    # default to the start of the epiweek if last_updated is vastly different from max_issue
+    return datetime.combine(max_issue.startdate(), datetime.min.time())
+
+
+def check_last_updated(client, dataset_id, logger):
+    """Check the last-updated timestamp to determine whether data should be pulled or not."""
+    # retry logic for 503 errors
+    try:
+        response = client.get_metadata(dataset_id)
+    except HTTPError as err:
+        if err.code == 503:
+            time.sleep(2 + random.randint(0, 1000) / 1000.0)
+            response = client.get_metadata(dataset_id)
+        else:
+            raise err
+
+    updated_timestamp = datetime.utcfromtimestamp(int(response["rowsUpdatedAt"]))
+    now = datetime.utcnow()
+    recently_updated_source = (now - updated_timestamp) < timedelta(days=1)
+
+    latest_issue_date = get_latest_nhsn_issue_date()
+    # generally expect one issue per week
+    recently_updated_api = (updated_timestamp - latest_issue_date) < timedelta(days=6)
+
+    prelim_prefix = "Preliminary " if dataset_id == PRELIM_DATASET_ID else ""
+    if recently_updated_source:
+        logger.info(f"{prelim_prefix}NHSN data was recently updated; Pulling data", updated_timestamp=updated_timestamp)
+    elif not recently_updated_api:
+        logger.info(
+            f"{prelim_prefix}NHSN data is missing issue; Pulling data",
+            updated_timestamp=updated_timestamp,
+            issue=latest_issue_date,
+        )
+    else:
+        logger.info(f"{prelim_prefix}NHSN data is stale; Skipping", updated_timestamp=updated_timestamp)
+    return recently_updated_source or not recently_updated_api
```
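For reviewers unfamiliar with `epiweeks`, here is a minimal, runnable sketch of the fallback in `get_latest_nhsn_issue_date`; the timestamp and week number are made up for illustration:

```python
from datetime import datetime

from epiweeks import Week

# Hypothetical stand-ins for min(nhsn_df.last_update) and min(nhsn_df.max_issue)
last_updated = datetime(2024, 11, 20, 14, 30)
max_issue = Week.fromstring("202447")  # CDC epiweek 47: Sun 2024-11-17 .. Sat 2024-11-23

if max_issue.startdate() <= last_updated.date() <= max_issue.enddate():
    # last_updated falls inside the issue week, so it is trusted as-is
    issue_date = last_updated
else:
    # otherwise fall back to midnight at the start of the epiweek
    issue_date = datetime.combine(max_issue.startdate(), datetime.min.time())

print(issue_date)  # 2024-11-20 14:30:00
```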
```diff
+
+
+def pull_data(socrata_token: str, dataset_id: str, logger):
     """Pull data from Socrata API."""
     client = Socrata("data.cdc.gov", socrata_token)
-    results = []
-    offset = 0
-    limit = 50000  # maximum limit allowed by SODA 2.0
-    while True:
-        page = client.get(dataset_id, limit=limit, offset=offset)
-        if not page:
-            break  # exit the loop if no more results
-        results.extend(page)
-        offset += limit
-
-    df = pd.DataFrame.from_records(results)
+    recently_updated = check_last_updated(client, dataset_id, logger)
+    df = pd.DataFrame()
+    if recently_updated:
+        results = []
+        offset = 0
+        limit = 50000  # maximum limit allowed by SODA 2.0
+        while True:
+            page = client.get(dataset_id, limit=limit, offset=offset)
+            if not page:
+                break  # exit the loop if no more results
+            results.extend(page)
+            offset += limit
+
+        df = pd.DataFrame.from_records(results)
     return df
```
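The net effect of `check_last_updated` is that data is pulled when the Socrata source was updated within the last day, or when the Epidata API appears to be missing an issue. A standalone sketch of that decision with explicit inputs (all dates illustrative):

```python
from datetime import datetime, timedelta


def should_pull(now, source_updated, latest_issue):
    """Mirror of the PR's gating logic with explicit inputs (illustrative only)."""
    recently_updated_source = (now - source_updated) < timedelta(days=1)
    # generally one issue per week, so a gap of 6+ days means the API missed one
    recently_updated_api = (source_updated - latest_issue) < timedelta(days=6)
    return recently_updated_source or not recently_updated_api


now = datetime(2024, 11, 22, 12, 0)
# Fresh source data -> pull
print(should_pull(now, datetime(2024, 11, 22, 3, 0), datetime(2024, 11, 20)))  # True
# Stale source but the API already has the latest issue -> skip
print(should_pull(now, datetime(2024, 11, 15), datetime(2024, 11, 13)))        # False
# Stale source and the API is missing an issue -> pull to catch up
print(should_pull(now, datetime(2024, 11, 15), datetime(2024, 11, 1)))         # True
```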
```diff
@@ -89,7 +146,7 @@ def pull_nhsn_data(
     """
     # Pull data from Socrata API
     df = (
-        pull_data(socrata_token, dataset_id=MAIN_DATASET_ID)
+        pull_data(socrata_token, MAIN_DATASET_ID, logger)
         if not custom_run
         else pull_data_from_file(backup_dir, issue_date, logger, prelim_flag=False)
     )
```
```diff
@@ -102,12 +159,20 @@ def pull_nhsn_data(
         df = df.rename(columns={"weekendingdate": "timestamp", "jurisdiction": "geo_id"})

         for signal, col_name in SIGNALS_MAP.items():
-            df[signal] = df[col_name]
+            # older backups don't have certain columns
+            try:
+                df[signal] = df[col_name]
+            except KeyError:
+                logger.info("column not available in data", col_name=col_name)
+                keep_columns.remove(signal)

         df = df[keep_columns]
         df["geo_id"] = df["geo_id"].str.lower()
         df.loc[df["geo_id"] == "usa", "geo_id"] = "us"
-        df = df.astype(TYPE_DICT)
+        try:
+            df = df.astype(TYPE_DICT)
+        except KeyError:
+            pass
     else:
         df = pd.DataFrame(columns=keep_columns)
```
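The try/except around the signal mapping is what makes older backup files (which lack newer source columns) usable. A toy reproduction with a stdlib logger and hypothetical column names (the real code uses the structured `delphi_utils` logger and the indicator's `SIGNALS_MAP`):

```python
import logging

import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("nhsn-example")

# Toy frame standing in for an older backup missing one source column
df = pd.DataFrame({"totalconfc19newadm": [12, 7]})
signals_map = {  # hypothetical subset of SIGNALS_MAP
    "confirmed_admissions_covid": "totalconfc19newadm",
    "confirmed_admissions_flu": "totalconfflunewadm",  # absent from df above
}
keep_columns = list(signals_map.keys())

for signal, col_name in signals_map.items():
    try:
        df[signal] = df[col_name]
    except KeyError:
        logger.info("column not available in data: %s", col_name)
        keep_columns.remove(signal)

print(df[keep_columns])  # only confirmed_admissions_covid survives
```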
```diff
@@ -144,8 +209,9 @@ def pull_preliminary_nhsn_data(
     pd.DataFrame
         Dataframe as described above.
     """
+    # Pull data from Socrata API
     df = (
-        pull_data(socrata_token, dataset_id=PRELIM_DATASET_ID)
+        pull_data(socrata_token, PRELIM_DATASET_ID, logger)
         if not custom_run
         else pull_data_from_file(backup_dir, issue_date, logger, prelim_flag=True)
     )
```

Review thread on this hunk:

> **aysim319** (Contributor, author): I know they're similar; I thought about it and went back and forth, but I kept them separate on the thought that something different might be going on here in the future. I'm not too concerned about this, since we'll be slowly deprecating this codebase.
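The two pull functions do share most of their post-processing; if the duplication ever becomes worth removing, the signal-mapping loop is one obvious candidate to factor out (hypothetical refactor sketch, not part of this PR):

```python
def map_signal_columns(df, signals_map, keep_columns, logger):
    """Copy source columns onto signal columns, dropping signals whose source
    column is absent (e.g. in older backups). Hypothetical shared helper."""
    for signal, col_name in signals_map.items():
        try:
            df[signal] = df[col_name]
        except KeyError:
            logger.info("column not available in data", col_name=col_name, signal=signal)
            keep_columns.remove(signal)
    return df
```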
```diff
@@ -158,10 +224,17 @@ def pull_preliminary_nhsn_data(
         df = df.rename(columns={"weekendingdate": "timestamp", "jurisdiction": "geo_id"})

         for signal, col_name in PRELIM_SIGNALS_MAP.items():
-            df[signal] = df[col_name]
+            try:
+                df[signal] = df[col_name]
+            except KeyError:
+                logger.info("column not available in data", col_name=col_name, signal=signal)
+                keep_columns.remove(signal)

         df = df[keep_columns]
-        df = df.astype(PRELIM_TYPE_DICT)
+        try:
+            df = df.astype(PRELIM_TYPE_DICT)
+        except KeyError:
+            pass
         df["geo_id"] = df["geo_id"].str.lower()
         df.loc[df["geo_id"] == "usa", "geo_id"] = "us"
     else:
```
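The astype guard exists because `DataFrame.astype` raises `KeyError` when its dtype mapping names a column that the loop above removed. A minimal reproduction (toy frame, hypothetical dtype map):

```python
import pandas as pd

df = pd.DataFrame({"geo_id": ["us"]})
type_dict = {"geo_id": str, "confirmed_admissions_flu": float}  # second key missing from df

try:
    df = df.astype(type_dict)
except KeyError:
    # the mapping references a column dropped above; leave dtypes unchanged
    pass
```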

Uh oh!
There was an error while loading. Please reload this page.