Skip to content

Commit c89e9ff

Browse files
authored
Merge pull request #417 from knaaptime/ltdbreader
update filepaths for new ltdb zips
2 parents a1264fd + b4842a1 commit c89e9ff

File tree

1 file changed

+139
-80
lines changed

1 file changed: +139 additions, −80 deletions

geosnap/io/storage.py

Lines changed: 139 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def store_seda(data_dir="auto", accept_eula=False):
6464
6565
"""
6666
assert accept_eula, (
67-
"You must accept the EULA by passing `accept_eula=True` \n" f"{eula}"
67+
f"You must accept the EULA by passing `accept_eula=True` \n{eula}"
6868
)
6969
pth = pathlib.Path(_make_data_dir(data_dir), "seda")
7070
pathlib.Path(pth).mkdir(parents=True, exist_ok=True)
@@ -242,7 +242,7 @@ def store_nces(years="all", dataset="all", data_dir="auto"):
242242
p = quilt3.Package.browse(f"nces/{d}", "s3://spatial-ucr")
243243
for year in years:
244244
p[f"{d}_{year}.parquet"].fetch(
245-
dest=pathlib.Path(pth, "nces" f"{d}_{year}.parquet")
245+
dest=pathlib.Path(pth, f"nces{d}_{year}.parquet")
246246
)
247247

248248

@@ -282,21 +282,87 @@ def store_acs(years="all", level="tract", data_dir="auto"):
282282
)
283283

284284

285-
def store_ltdb(sample, fullcount, data_dir="auto"):
285+
def _ltdb_reader(path, year, dropcols=None, currency_year=None):
286+
df = pd.read_csv(
287+
path,
288+
na_values=["", " ", 99999, -999],
289+
converters={0: str, "placefp10": str},
290+
low_memory=False,
291+
encoding="latin1",
292+
)
293+
294+
if dropcols:
295+
df.drop(dropcols, axis=1, inplace=True)
296+
df.columns = df.columns.str.lower()
297+
names = df.columns.values.tolist()
298+
names[0] = "geoid"
299+
newlist = []
300+
301+
# ignoring the first 4 columns, remove year suffix from column names
302+
for name in names[4:]:
303+
newlist.append(name[:-2])
304+
colnames = names[:4] + newlist
305+
df.columns = colnames
306+
307+
# prepend a 0 when FIPS is too short
308+
df["geoid"] = df["geoid"].str.rjust(11, "0")
309+
df.set_index("geoid", inplace=True)
310+
311+
df["year"] = year
312+
313+
inflate_cols = [
314+
"mhmval",
315+
"mrent",
316+
"incpc",
317+
"hinc",
318+
"hincw",
319+
"hincb",
320+
"hinch",
321+
"hinca",
322+
]
323+
324+
inflate_available = list(set(df.columns).intersection(set(inflate_cols)))
325+
326+
if len(inflate_available):
327+
df = adjust_inflation(df, inflate_available, year, base_year=currency_year)
328+
return df
329+
330+
331+
def store_ltdb(
332+
sample_zip=None,
333+
fullcount_zip=None,
334+
sample_paths=None,
335+
fullcount_paths=None,
336+
data_dir="auto",
337+
currency_year=2010,
338+
):
286339
"""
287340
Read & store data from Brown's Longitudinal Tract Database (LTDB).
288341
289342
Parameters
290343
----------
291-
sample : str
344+
sample_zip : str
292345
file path of the zip file containing the standard Sample CSV files
293346
downloaded from
294347
https://s4.ad.brown.edu/projects/diversity/Researcher/LTBDDload/Default.aspx
295-
296-
fullcount: str
348+
fullcount_zip: str
297349
file path of the zip file containing the standard Fullcount CSV files
298350
downloaded from
299351
https://s4.ad.brown.edu/projects/diversity/Researcher/LTBDDload/Default.aspx
352+
sample_paths: dict
353+
dictionary of CSV files (e.g. if manually unzipping the archive from LTDB). The
354+
dict should be formatted using `sample_{year}` as the key, with the value storing
355+
the path to the given csv, as in {'sample_1970': 'path/to/sample_1970.csv',}
356+
fullcount_paths: dict
357+
dictionary of CSV files (e.g. if manually unzipping the archive from LTDB). The
358+
dict should be formatted using `fullcount_{year}` as the key, with the value storing
359+
the path to the given csv, as in {'fullcount_1970': 'path/to/fullcount_1970.csv',}
360+
data_dir: str
361+
directory to store the resulting parquet file. If 'auto' (default) the default
362+
geosnap data directory will be used (via the `platformdirs` package)
363+
currency_year : int
364+
year used to express common dollar values. Default is 2010 meaning all currency
365+
variables (e.g. median income) will be expressed in real 2010 values.
300366
301367
Returns
302368
-------
@@ -305,97 +371,94 @@ def store_ltdb(sample, fullcount, data_dir="auto"):
305371
"""
306372
codebook = pd.read_csv(Path(script_dir, "variables.csv"))
307373

308-
sample_zip = zipfile.ZipFile(sample)
309-
fullcount_zip = zipfile.ZipFile(fullcount)
310-
311-
def _ltdb_reader(path, file, year, dropcols=None):
312-
df = pd.read_csv(
313-
path.open(file),
314-
na_values=["", " ", 99999, -999],
315-
converters={0: str, "placefp10": str},
316-
low_memory=False,
317-
encoding="latin1",
374+
if sample_zip is None and sample_paths is None:
375+
raise ValueError(
376+
"No Sample Data Found. You must pass either a zip archive of "
377+
"LTDB data or a dict of csv paths from the unpacked archive "
378+
"using `sample_{year}` format for the dictionary keys"
318379
)
319-
320-
if dropcols:
321-
df.drop(dropcols, axis=1, inplace=True)
322-
df.columns = df.columns.str.lower()
323-
names = df.columns.values.tolist()
324-
names[0] = "geoid"
325-
newlist = []
326-
327-
# ignoring the first 4 columns, remove year suffix from column names
328-
for name in names[4:]:
329-
newlist.append(name[:-2])
330-
colnames = names[:4] + newlist
331-
df.columns = colnames
332-
333-
# prepend a 0 when FIPS is too short
334-
df["geoid"] = df["geoid"].str.rjust(11, "0")
335-
df.set_index("geoid", inplace=True)
336-
337-
df["year"] = year
338-
339-
inflate_cols = [
340-
"mhmval",
341-
"mrent",
342-
"incpc",
343-
"hinc",
344-
"hincw",
345-
"hincb",
346-
"hinch",
347-
"hinca",
348-
]
349-
350-
inflate_available = list(set(df.columns).intersection(set(inflate_cols)))
351-
352-
if len(inflate_available):
353-
df = adjust_inflation(df, inflate_available, year)
354-
return df
380+
elif sample_paths is None:
381+
sample_base = "ltdb_std_all_sample/ltdb_std_all_sample/"
382+
sample_zip = zipfile.ZipFile(sample_zip)
383+
sample_paths = {
384+
"sample_1970": sample_zip.open(f"{sample_base}ltdb_std_1970_sample.csv"),
385+
"sample_1980": sample_zip.open(f"{sample_base}ltdb_std_1980_sample.csv"),
386+
"sample_1990": sample_zip.open(f"{sample_base}ltdb_std_1990_sample.csv"),
387+
"sample_2000": sample_zip.open(f"{sample_base}LTDB_Std_ 2000_ Sample.csv"),
388+
"sample_2010": sample_zip.open(f"{sample_base}LTDB_std_200812_Sample.csv"),
389+
}
390+
391+
if fullcount_zip is None and fullcount_paths is None:
392+
raise ValueError(
393+
"No Fullcount Data Found. You must pass either a zip archive of "
394+
"LTDB data or a dict of csv paths from the unpacked archive "
395+
"using `fullcount_{year}` format for the dictionary keys"
396+
)
397+
elif fullcount_paths is None:
398+
fullcount_base = "ltdb_std_all_fullcount/ltdb_std_all_fullcount/"
399+
fullcount_zip = zipfile.ZipFile(fullcount_zip)
400+
fullcount_paths = {
401+
"fullcount_1970": fullcount_zip.open(
402+
f"{fullcount_base}LTDB_Std_1970_fullcount.csv"
403+
),
404+
"fullcount_1980": fullcount_zip.open(
405+
f"{fullcount_base}LTDB_Std_1980_fullcount.csv"
406+
),
407+
"fullcount_1990": fullcount_zip.open(
408+
f"{fullcount_base}LTDB_Std_1990_fullcount.csv"
409+
),
410+
"fullcount_2000": fullcount_zip.open(
411+
f"{fullcount_base}LTDB_Std_2000_fullcount.csv"
412+
),
413+
}
355414

356415
# read in Brown's LTDB data, both the sample and fullcount files for each
357416
# year population, housing units & occupied housing units appear in both
358417
# "sample" and "fullcount" files-- currently drop sample and keep fullcount
359418

419+
# read all samples
360420
sample70 = _ltdb_reader(
361-
sample_zip,
362-
"ltdb_std_all_sample/ltdb_std_1970_sample.csv",
421+
sample_paths["sample_1970"],
363422
dropcols=["POP70SP1", "HU70SP", "OHU70SP"],
364423
year=1970,
424+
currency_year=currency_year,
365425
)
366-
367-
fullcount70 = _ltdb_reader(fullcount_zip, "LTDB_Std_1970_fullcount.csv", year=1970)
368-
369426
sample80 = _ltdb_reader(
370-
sample_zip,
371-
"ltdb_std_all_sample/ltdb_std_1980_sample.csv",
427+
sample_paths["sample_1980"],
372428
dropcols=["pop80sf3", "pop80sf4", "hu80sp", "ohu80sp"],
373429
year=1980,
430+
currency_year=currency_year,
374431
)
375-
376-
fullcount80 = _ltdb_reader(fullcount_zip, "LTDB_Std_1980_fullcount.csv", year=1980)
377-
378432
sample90 = _ltdb_reader(
379-
sample_zip,
380-
"ltdb_std_all_sample/ltdb_std_1990_sample.csv",
433+
sample_paths["sample_1990"],
381434
dropcols=["POP90SF3", "POP90SF4", "HU90SP", "OHU90SP"],
382435
year=1990,
436+
currency_year=currency_year,
383437
)
384-
385-
fullcount90 = _ltdb_reader(fullcount_zip, "LTDB_Std_1990_fullcount.csv", year=1990)
386-
387438
sample00 = _ltdb_reader(
388-
sample_zip,
389-
"ltdb_std_all_sample/ltdb_std_2000_sample.csv",
439+
sample_paths["sample_2000"],
390440
dropcols=["POP00SF3", "HU00SP", "OHU00SP"],
391441
year=2000,
442+
currency_year=currency_year,
392443
)
393-
394-
fullcount00 = _ltdb_reader(fullcount_zip, "LTDB_Std_2000_fullcount.csv", year=2000)
395-
396444
sample10 = _ltdb_reader(
397-
sample_zip, "ltdb_std_all_sample/ltdb_std_2010_sample.csv", year=2010
445+
sample_paths["sample_2010"], year=2010, currency_year=currency_year
446+
)
447+
448+
# read all fullcount files
449+
fullcount70 = _ltdb_reader(
450+
fullcount_paths["fullcount_1970"], year=1970, currency_year=currency_year
451+
)
452+
fullcount80 = _ltdb_reader(
453+
fullcount_paths["fullcount_1980"], year=1980, currency_year=currency_year
454+
)
455+
fullcount90 = _ltdb_reader(
456+
fullcount_paths["fullcount_1990"], year=1990, currency_year=currency_year
398457
)
458+
fullcount00 = _ltdb_reader(
459+
fullcount_paths["fullcount_2000"], year=2000, currency_year=currency_year
460+
)
461+
399462
# join the sample and fullcount variables into a single df for the year
400463
ltdb_1970 = sample70.drop(columns=["year"]).join(
401464
fullcount70.iloc[:, 7:], how="left"
@@ -489,11 +552,7 @@ def store_ncdb(filepath, data_dir="auto"):
489552

490553
orig = []
491554
for col in cols:
492-
if col.endswith("D"):
493-
orig.append(col)
494-
elif col.endswith("N"):
495-
orig.append(col)
496-
elif col.endswith("1A"):
555+
if col.endswith("D") or col.endswith("N") or col.endswith("1A"):
497556
orig.append(col)
498557

499558
renamer = dict(zip(orig, fixed))
@@ -521,8 +580,8 @@ def store_ncdb(filepath, data_dir="auto"):
521580
for row in codebook["formula"].dropna().tolist():
522581
try:
523582
df.eval(row, inplace=True)
524-
except:
525-
warn("Unable to compute " + str(row))
583+
except Exception as e:
584+
warn(f"Unable to compute {row} with {e}", stacklevel=2)
526585

527586
keeps = df.columns[df.columns.isin(codebook["variable"].tolist() + ["year"])]
528587

Comments: 0 commit comments