Skip to content

Commit ff823c6

Browse files
committed
Add hsa_nci geo resolution + Adjust csv regex patterns
1 parent 9450fa8 commit ff823c6

File tree

2 files changed

+20
-5
lines changed

2 files changed

+20
-5
lines changed

src/acquisition/covidcast/csv_importer.py

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -52,18 +52,21 @@ class CsvRowValue:
5252
class CsvImporter:
5353
"""Finds and parses covidcast CSV files."""
5454

55+
# set of allowed resolutions (aka "geo_type")
56+
GEOGRAPHIC_RESOLUTIONS = {'county', 'hrr', 'msa', 'dma', 'state', 'hhs', 'nation', "hsa", "hsa_nci"}
57+
58+
# regex alternation of the allowed geo types; longer names are sorted first so that a shorter name (e.g. "hsa") cannot match as a prefix of a longer one (e.g. "hsa_nci")
59+
geo_types_pattern = "|".join(sorted(GEOGRAPHIC_RESOLUTIONS, key=len, reverse=True))
60+
5561
# .../source/yyyymmdd_geo_signal.csv
56-
PATTERN_DAILY = re.compile(r'^.*/([^/]*)/(\d{8})_(\w+?)_(\w+)\.csv$')
62+
PATTERN_DAILY = re.compile(rf'^.*/([^/]*)/(\d{{8}})_({geo_types_pattern})_(\w+)\.csv$')
5763

5864
# .../source/weekly_yyyyww_geo_signal.csv
59-
PATTERN_WEEKLY = re.compile(r'^.*/([^/]*)/weekly_(\d{6})_(\w+?)_(\w+)\.csv$')
65+
PATTERN_WEEKLY = re.compile(rf'^.*/([^/]*)/weekly_(\d{{6}})_({geo_types_pattern})_(\w+)\.csv$')
6066

6167
# .../issue_yyyymmdd
6268
PATTERN_ISSUE_DIR = re.compile(r'^.*/([^/]*)/issue_(\d{8})$')
6369

64-
# set of allowed resolutions (aka "geo_type")
65-
GEOGRAPHIC_RESOLUTIONS = {'county', 'hrr', 'msa', 'dma', 'state', 'hhs', 'nation'}
66-
6770
# set of required CSV columns
6871
REQUIRED_COLUMNS = {'geo_id', 'val', 'se', 'sample_size'}
6972

tests/acquisition/covidcast/test_csv_importer.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -96,6 +96,14 @@ def test_find_csv_files(self, mock_glob: MagicMock):
9696
path_prefix + 'invalid/20200418_province_c.csv',
9797
# ignored
9898
path_prefix + 'ignored/README.md',
99+
# valid hsa day
100+
path_prefix + 'valid/20200408_hsa_sig.csv',
101+
# valid hsa_nci day
102+
path_prefix + 'valid/20200408_hsa_nci_sig.csv',
103+
# valid hsa_nci week
104+
path_prefix + 'valid/weekly_202015_hsa_nci_sig.csv',
105+
# valid hsa week
106+
path_prefix + 'valid/weekly_202015_hsa_sig.csv',
99107
]
100108
mock_glob.return_value = glob_paths
101109

@@ -113,6 +121,10 @@ def test_find_csv_files(self, mock_glob: MagicMock):
113121
(glob_paths[5], None),
114122
(glob_paths[6], None),
115123
(glob_paths[7], None),
124+
(glob_paths[8], PathDetails(expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days, 'valid', 'sig', 'day', time_value_day, 'hsa')),
125+
(glob_paths[9], PathDetails(expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days, 'valid', 'sig', 'day', time_value_day, 'hsa_nci')),
126+
(glob_paths[10], PathDetails(expected_issue_week, delta_epiweeks(202015, expected_issue_week), 'valid', 'sig', 'week', 202015, 'hsa_nci')),
127+
(glob_paths[11], PathDetails(expected_issue_week, delta_epiweeks(202015, expected_issue_week), 'valid', 'sig', 'week', 202015, 'hsa')),
116128
])
117129
self.assertEqual(found, expected)
118130

0 commit comments

Comments
 (0)