Skip to content

Commit fd6250f

Browse files
minhkhul and melange396 authored
Add hsa_nci geo resolution + Adjust csv regex patterns (#1690)
* Add hsa_nci geo resolution + Adjust csv regex patterns * adjust test * replace old geo_type check with different warning * Add validation check for hsa_nci + validation test cases * add warning to input validation code chunk about hsa_nci * remove hsa from GEOGRAPHIC_RESOLUTIONS * remove hsa from test * adjust PATTERN_DAILY PATTERN_WEEKLY string construction to avoid double braces * Update src/server/_params.py Co-authored-by: george <[email protected]> * Update src/acquisition/covidcast/csv_importer.py Co-authored-by: george <[email protected]> * integration test --------- Co-authored-by: george <[email protected]>
1 parent fc450d5 commit fd6250f

File tree

4 files changed

+38
-12
lines changed

4 files changed

+38
-12
lines changed

integrations/server/test_covidcast.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,3 +476,14 @@ def test_week_formats(self):
476476
self.assertEqual(expected, colond)
477477
self.assertEqual(expected, dashed)
478478
self.assertEqual(expected, enumed)
479+
480+
def test_hsa_nci(self):
481+
row = CovidcastTestRow.make_default_row(geo_type='hsa_nci', geo_value='99')
482+
self._insert_rows([row])
483+
response = self.request_based_on_row(row)
484+
expected = [row.as_api_row_dict()]
485+
self.assertEqual(response, {
486+
'result': 1,
487+
'epidata': expected,
488+
'message': 'success',
489+
})

src/acquisition/covidcast/csv_importer.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -52,18 +52,21 @@ class CsvRowValue:
5252
class CsvImporter:
5353
"""Finds and parses covidcast CSV files."""
5454

55+
# set of allowed resolutions (aka "geo_type")
56+
GEOGRAPHIC_RESOLUTIONS = {'county', 'hrr', 'msa', 'dma', 'state', 'hhs', 'nation', 'hsa_nci'}
57+
58+
# regex pattern for matching geo types, note: sort longer string first to avoid wrong substring matches
59+
geo_types_pattern = "|".join(sorted(GEOGRAPHIC_RESOLUTIONS, key=len, reverse=True))
60+
5561
# .../source/yyyymmdd_geo_signal.csv
56-
PATTERN_DAILY = re.compile(r'^.*/([^/]*)/(\d{8})_(\w+?)_(\w+)\.csv$')
62+
PATTERN_DAILY = re.compile(r'^.*/([^/]*)/(\d{8})_(' + geo_types_pattern + r')_(.+)\.csv$')
5763

5864
# .../source/weekly_yyyyww_geo_signal.csv
59-
PATTERN_WEEKLY = re.compile(r'^.*/([^/]*)/weekly_(\d{6})_(\w+?)_(\w+)\.csv$')
65+
PATTERN_WEEKLY = re.compile(r'^.*/([^/]*)/weekly_(\d{6})_(' + geo_types_pattern + r')_(.+)\.csv$')
6066

6167
# .../issue_yyyymmdd
6268
PATTERN_ISSUE_DIR = re.compile(r'^.*/([^/]*)/issue_(\d{8})$')
6369

64-
# set of allowed resolutions (aka "geo_type")
65-
GEOGRAPHIC_RESOLUTIONS = {'county', 'hrr', 'msa', 'dma', 'state', 'hhs', 'nation'}
66-
6770
# set of required CSV columns
6871
REQUIRED_COLUMNS = {'geo_id', 'val', 'se', 'sample_size'}
6972

@@ -158,7 +161,7 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()
158161
daily_match = CsvImporter.PATTERN_DAILY.match(path.lower())
159162
weekly_match = CsvImporter.PATTERN_WEEKLY.match(path.lower())
160163
if not daily_match and not weekly_match:
161-
logger.warning(event='invalid csv path/filename', detail=path, file=path)
164+
logger.warning(event='invalid csv path/filename or geo_type', detail=path, file=path)
162165
yield (path, None)
163166
continue
164167

@@ -186,12 +189,8 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()
186189
issue_value=issue_epiweek_value
187190
lag_value=delta_epiweeks(time_value_week, issue_epiweek_value)
188191

189-
# # extract and validate geographic resolution
192+
# extract geographic resolution
190193
geo_type = match.group(3).lower()
191-
if geo_type not in CsvImporter.GEOGRAPHIC_RESOLUTIONS:
192-
logger.warning(event='invalid geo_type', detail=geo_type, file=path)
193-
yield (path, None)
194-
continue
195194

196195
# extract additional values, lowercased for consistency
197196
source = match.group(1).lower()
@@ -300,7 +299,7 @@ def extract_and_check_row(row: DataFrameRow, geo_type: str, filepath: Optional[s
300299
# geo_id was `None`
301300
return (None, 'geo_id')
302301

303-
if geo_type in ('hrr', 'msa', 'dma', 'hhs'):
302+
if geo_type in ('hrr', 'msa', 'dma', 'hhs', 'hsa_nci'):
304303
# these particular ids are prone to be written as ints -- and floats
305304
try:
306305
geo_id = str(CsvImporter.floaty_int(geo_id))
@@ -339,6 +338,12 @@ def extract_and_check_row(row: DataFrameRow, geo_type: str, filepath: Optional[s
339338
if len(geo_id) != 2 or not 'aa' <= geo_id <= 'zz':
340339
return (None, 'geo_id')
341340

341+
elif geo_type == 'hsa_nci':
342+
# valid codes should be 1-3 digit numbers, or the special code of "1022" for blank
343+
# https://seer.cancer.gov/seerstat/variables/countyattribs/hsa.html
344+
if not re.match(r'^(1022|\d{1,3})$', geo_id):
345+
return (None, 'geo_id')
346+
342347
else:
343348
return (None, 'geo_type')
344349

src/server/_params.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def __init__(self, geo_type: str, geo_values: Union[bool, Sequence[str]]):
5959
if geo_values == ['']:
6060
raise ValidationFailedException(f"geo_value is empty for the requested geo_type {geo_type}!")
6161
# TODO: keep this translator in sync with CsvImporter.GEOGRAPHIC_RESOLUTIONS in acquisition/covidcast/ and with GeoMapper
62+
# NOTE: We are not including `hsa_nci` here as the geomapper code does not support that version of the HSA definition.
6263
geo_type_translator = {
6364
"county": "fips",
6465
"state": "state_id",

tests/acquisition/covidcast/test_csv_importer.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,10 @@ def test_find_csv_files(self, mock_glob: MagicMock):
9494
path_prefix + 'invalid/weekly_222222_b_c.csv',
9595
# invalid geography
9696
path_prefix + 'invalid/20200418_province_c.csv',
97+
# valid hsa_nci day
98+
path_prefix + 'valid/20200408_hsa_nci_sig.csv',
99+
# valid hsa_nci week
100+
path_prefix + 'valid/weekly_202015_hsa_nci_sig.csv',
97101
# ignored
98102
path_prefix + 'ignored/README.md',
99103
]
@@ -113,6 +117,8 @@ def test_find_csv_files(self, mock_glob: MagicMock):
113117
(glob_paths[5], None),
114118
(glob_paths[6], None),
115119
(glob_paths[7], None),
120+
(glob_paths[8], PathDetails(expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days, 'valid', 'sig', 'day', time_value_day, 'hsa_nci')),
121+
(glob_paths[9], PathDetails(expected_issue_week, delta_epiweeks(202015, expected_issue_week), 'valid', 'sig', 'week', 202015, 'hsa_nci')),
116122
])
117123
self.assertEqual(found, expected)
118124

@@ -182,6 +188,7 @@ def make_row(
182188

183189
# cases to test each failure mode
184190
failure_cases = [
191+
(make_row(geo_type='hsa_nci', geo_id='1111'), 'geo_id'),
185192
(make_row(geo_type='county', geo_id='1234'), 'geo_id'),
186193
(make_row(geo_type='county', geo_id='00000'), 'geo_id'),
187194
(make_row(geo_type='hrr', geo_id='600'), 'geo_id'),
@@ -215,6 +222,8 @@ def make_row(
215222
(make_row(value=None, stderr=np.nan, sample_size='', missing_value=str(float(Nans.DELETED)), missing_stderr=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), CsvRowValue('vi', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)),
216223
(make_row(stderr='', sample_size='NA', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), CsvRowValue('vi', 1.23, None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER)),
217224
(make_row(sample_size=None, missing_value='missing_value', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.NOT_MISSING))), CsvRowValue('vi', 1.23, 4.56, None, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER)),
225+
(make_row(geo_type='hsa_nci', geo_id='1022'), CsvRowValue('1022', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)),
226+
(make_row(geo_type='hsa_nci', geo_id='012'), CsvRowValue('12', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)),
218227
]
219228

220229
for ((geo_type, row), field) in success_cases:

0 commit comments

Comments
 (0)