@@ -64,7 +64,7 @@ def store_seda(data_dir="auto", accept_eula=False):
64
64
65
65
"""
66
66
assert accept_eula , (
67
- "You must accept the EULA by passing `accept_eula=True` \n " f" { eula } "
67
+ f "You must accept the EULA by passing `accept_eula=True` \n { eula } "
68
68
)
69
69
pth = pathlib .Path (_make_data_dir (data_dir ), "seda" )
70
70
pathlib .Path (pth ).mkdir (parents = True , exist_ok = True )
@@ -242,7 +242,7 @@ def store_nces(years="all", dataset="all", data_dir="auto"):
242
242
p = quilt3 .Package .browse (f"nces/{ d } " , "s3://spatial-ucr" )
243
243
for year in years :
244
244
p [f"{ d } _{ year } .parquet" ].fetch (
245
- dest = pathlib .Path (pth , "nces" f" { d } _{ year } .parquet" )
245
+ dest = pathlib .Path (pth , f"nces { d } _{ year } .parquet" )
246
246
)
247
247
248
248
@@ -282,21 +282,87 @@ def store_acs(years="all", level="tract", data_dir="auto"):
282
282
)
283
283
284
284
285
- def store_ltdb (sample , fullcount , data_dir = "auto" ):
285
+ def _ltdb_reader (path , year , dropcols = None , currency_year = None ):
286
+ df = pd .read_csv (
287
+ path ,
288
+ na_values = ["" , " " , 99999 , - 999 ],
289
+ converters = {0 : str , "placefp10" : str },
290
+ low_memory = False ,
291
+ encoding = "latin1" ,
292
+ )
293
+
294
+ if dropcols :
295
+ df .drop (dropcols , axis = 1 , inplace = True )
296
+ df .columns = df .columns .str .lower ()
297
+ names = df .columns .values .tolist ()
298
+ names [0 ] = "geoid"
299
+ newlist = []
300
+
301
+ # ignoring the first 4 columns, remove year suffix from column names
302
+ for name in names [4 :]:
303
+ newlist .append (name [:- 2 ])
304
+ colnames = names [:4 ] + newlist
305
+ df .columns = colnames
306
+
307
+ # prepend a 0 when FIPS is too short
308
+ df ["geoid" ] = df ["geoid" ].str .rjust (11 , "0" )
309
+ df .set_index ("geoid" , inplace = True )
310
+
311
+ df ["year" ] = year
312
+
313
+ inflate_cols = [
314
+ "mhmval" ,
315
+ "mrent" ,
316
+ "incpc" ,
317
+ "hinc" ,
318
+ "hincw" ,
319
+ "hincb" ,
320
+ "hinch" ,
321
+ "hinca" ,
322
+ ]
323
+
324
+ inflate_available = list (set (df .columns ).intersection (set (inflate_cols )))
325
+
326
+ if len (inflate_available ):
327
+ df = adjust_inflation (df , inflate_available , year , base_year = currency_year )
328
+ return df
329
+
330
+
331
+ def store_ltdb (
332
+ sample_zip = None ,
333
+ fullcount_zip = None ,
334
+ sample_paths = None ,
335
+ fullcount_paths = None ,
336
+ data_dir = "auto" ,
337
+ currency_year = 2010 ,
338
+ ):
286
339
"""
287
340
Read & store data from Brown's Longitudinal Tract Database (LTDB).
288
341
289
342
Parameters
290
343
----------
291
- sample : str
344
+ sample_zip : str
292
345
file path of the zip file containing the standard Sample CSV files
293
346
downloaded from
294
347
https://s4.ad.brown.edu/projects/diversity/Researcher/LTBDDload/Default.aspx
295
-
296
- fullcount: str
348
+ fullcount_zip: str
297
349
file path of the zip file containing the standard Fullcount CSV files
298
350
downloaded from
299
351
https://s4.ad.brown.edu/projects/diversity/Researcher/LTBDDload/Default.aspx
352
+ sample_paths: dict
353
+ dictionary of CSV files (e.g. if manually unzipping the archive from LTDB). The
354
+ dict should be formatted using `sample_{year}` as the key, with the value storing
355
+ the path to the given csv, as in {'sample_1970': 'path/to/sample_1970.csv',}
356
+ fullcount_paths: dict
357
+ dictionary of CSV files (e.g. if manually unzipping the archive from LTDB). The
358
+ dict should be formatted using `fullcount_{year}` as the key, with the value storing
359
+ the path to the given csv, as in {'fullcount_1970': 'path/to/fullcount_1970.csv',}
360
+ data_dir: str
361
+ directory to store the resulting parquet file. If 'auto' (default) the default
362
+ geosnap data directory will be used (via the `platformdirs` package)
363
+ currency_year : int
364
+ year used to express common dollar values. Default is 2010 meaning all currency
365
+ variables (e.g. median income) will be expressed in real 2010 values.
300
366
301
367
Returns
302
368
-------
@@ -305,97 +371,94 @@ def store_ltdb(sample, fullcount, data_dir="auto"):
305
371
"""
306
372
codebook = pd .read_csv (Path (script_dir , "variables.csv" ))
307
373
308
- sample_zip = zipfile .ZipFile (sample )
309
- fullcount_zip = zipfile .ZipFile (fullcount )
310
-
311
- def _ltdb_reader (path , file , year , dropcols = None ):
312
- df = pd .read_csv (
313
- path .open (file ),
314
- na_values = ["" , " " , 99999 , - 999 ],
315
- converters = {0 : str , "placefp10" : str },
316
- low_memory = False ,
317
- encoding = "latin1" ,
374
+ if sample_zip is None and sample_paths is None :
375
+ raise ValueError (
376
+ "No Sample Data Found. You must pass either a zip archive of "
377
+ "LTDB data or a dict of csv paths from the unpacked archive "
378
+ "using `sample_{year}` format for the dictionary keys"
318
379
)
319
-
320
- if dropcols :
321
- df .drop (dropcols , axis = 1 , inplace = True )
322
- df .columns = df .columns .str .lower ()
323
- names = df .columns .values .tolist ()
324
- names [0 ] = "geoid"
325
- newlist = []
326
-
327
- # ignoring the first 4 columns, remove year suffix from column names
328
- for name in names [4 :]:
329
- newlist .append (name [:- 2 ])
330
- colnames = names [:4 ] + newlist
331
- df .columns = colnames
332
-
333
- # prepend a 0 when FIPS is too short
334
- df ["geoid" ] = df ["geoid" ].str .rjust (11 , "0" )
335
- df .set_index ("geoid" , inplace = True )
336
-
337
- df ["year" ] = year
338
-
339
- inflate_cols = [
340
- "mhmval" ,
341
- "mrent" ,
342
- "incpc" ,
343
- "hinc" ,
344
- "hincw" ,
345
- "hincb" ,
346
- "hinch" ,
347
- "hinca" ,
348
- ]
349
-
350
- inflate_available = list (set (df .columns ).intersection (set (inflate_cols )))
351
-
352
- if len (inflate_available ):
353
- df = adjust_inflation (df , inflate_available , year )
354
- return df
380
+ elif sample_paths is None :
381
+ sample_base = "ltdb_std_all_sample/ltdb_std_all_sample/"
382
+ sample_zip = zipfile .ZipFile (sample_zip )
383
+ sample_paths = {
384
+ "sample_1970" : sample_zip .open (f"{ sample_base } ltdb_std_1970_sample.csv" ),
385
+ "sample_1980" : sample_zip .open (f"{ sample_base } ltdb_std_1980_sample.csv" ),
386
+ "sample_1990" : sample_zip .open (f"{ sample_base } ltdb_std_1990_sample.csv" ),
387
+ "sample_2000" : sample_zip .open (f"{ sample_base } LTDB_Std_ 2000_ Sample.csv" ),
388
+ "sample_2010" : sample_zip .open (f"{ sample_base } LTDB_std_200812_Sample.csv" ),
389
+ }
390
+
391
+ if fullcount_zip is None and fullcount_paths is None :
392
+ raise ValueError (
393
+ "No Fullcount Data Found. You must pass either a zip archive of "
394
+ "LTDB data or a dict of csv paths from the unpacked archive "
395
+ "using `fullcount_{year}` format for the dictionary keys"
396
+ )
397
+ elif fullcount_paths is None :
398
+ fullcount_base = "ltdb_std_all_fullcount/ltdb_std_all_fullcount/"
399
+ fullcount_zip = zipfile .ZipFile (fullcount_zip )
400
+ fullcount_paths = {
401
+ "fullcount_1970" : fullcount_zip .open (
402
+ f"{ fullcount_base } LTDB_Std_1970_fullcount.csv"
403
+ ),
404
+ "fullcount_1980" : fullcount_zip .open (
405
+ f"{ fullcount_base } LTDB_Std_1980_fullcount.csv"
406
+ ),
407
+ "fullcount_1990" : fullcount_zip .open (
408
+ f"{ fullcount_base } LTDB_Std_1990_fullcount.csv"
409
+ ),
410
+ "fullcount_2000" : fullcount_zip .open (
411
+ f"{ fullcount_base } LTDB_Std_2000_fullcount.csv"
412
+ ),
413
+ }
355
414
356
415
# read in Brown's LTDB data, both the sample and fullcount files for each
357
416
# year population, housing units & occupied housing units appear in both
358
417
# "sample" and "fullcount" files-- currently drop sample and keep fullcount
359
418
419
+ # read all samples
360
420
sample70 = _ltdb_reader (
361
- sample_zip ,
362
- "ltdb_std_all_sample/ltdb_std_1970_sample.csv" ,
421
+ sample_paths ["sample_1970" ],
363
422
dropcols = ["POP70SP1" , "HU70SP" , "OHU70SP" ],
364
423
year = 1970 ,
424
+ currency_year = currency_year ,
365
425
)
366
-
367
- fullcount70 = _ltdb_reader (fullcount_zip , "LTDB_Std_1970_fullcount.csv" , year = 1970 )
368
-
369
426
sample80 = _ltdb_reader (
370
- sample_zip ,
371
- "ltdb_std_all_sample/ltdb_std_1980_sample.csv" ,
427
+ sample_paths ["sample_1980" ],
372
428
dropcols = ["pop80sf3" , "pop80sf4" , "hu80sp" , "ohu80sp" ],
373
429
year = 1980 ,
430
+ currency_year = currency_year ,
374
431
)
375
-
376
- fullcount80 = _ltdb_reader (fullcount_zip , "LTDB_Std_1980_fullcount.csv" , year = 1980 )
377
-
378
432
sample90 = _ltdb_reader (
379
- sample_zip ,
380
- "ltdb_std_all_sample/ltdb_std_1990_sample.csv" ,
433
+ sample_paths ["sample_1990" ],
381
434
dropcols = ["POP90SF3" , "POP90SF4" , "HU90SP" , "OHU90SP" ],
382
435
year = 1990 ,
436
+ currency_year = currency_year ,
383
437
)
384
-
385
- fullcount90 = _ltdb_reader (fullcount_zip , "LTDB_Std_1990_fullcount.csv" , year = 1990 )
386
-
387
438
sample00 = _ltdb_reader (
388
- sample_zip ,
389
- "ltdb_std_all_sample/ltdb_std_2000_sample.csv" ,
439
+ sample_paths ["sample_2000" ],
390
440
dropcols = ["POP00SF3" , "HU00SP" , "OHU00SP" ],
391
441
year = 2000 ,
442
+ currency_year = currency_year ,
392
443
)
393
-
394
- fullcount00 = _ltdb_reader (fullcount_zip , "LTDB_Std_2000_fullcount.csv" , year = 2000 )
395
-
396
444
sample10 = _ltdb_reader (
397
- sample_zip , "ltdb_std_all_sample/ltdb_std_2010_sample.csv" , year = 2010
445
+ sample_paths ["sample_2010" ], year = 2010 , currency_year = currency_year
446
+ )
447
+
448
+ # read all fullcount files
449
+ fullcount70 = _ltdb_reader (
450
+ fullcount_paths ["fullcount_1970" ], year = 1970 , currency_year = currency_year
451
+ )
452
+ fullcount80 = _ltdb_reader (
453
+ fullcount_paths ["fullcount_1980" ], year = 1980 , currency_year = currency_year
454
+ )
455
+ fullcount90 = _ltdb_reader (
456
+ fullcount_paths ["fullcount_1990" ], year = 1990 , currency_year = currency_year
398
457
)
458
+ fullcount00 = _ltdb_reader (
459
+ fullcount_paths ["fullcount_2000" ], year = 2000 , currency_year = currency_year
460
+ )
461
+
399
462
# join the sample and fullcount variables into a single df for the year
400
463
ltdb_1970 = sample70 .drop (columns = ["year" ]).join (
401
464
fullcount70 .iloc [:, 7 :], how = "left"
@@ -489,11 +552,7 @@ def store_ncdb(filepath, data_dir="auto"):
489
552
490
553
orig = []
491
554
for col in cols :
492
- if col .endswith ("D" ):
493
- orig .append (col )
494
- elif col .endswith ("N" ):
495
- orig .append (col )
496
- elif col .endswith ("1A" ):
555
+ if col .endswith ("D" ) or col .endswith ("N" ) or col .endswith ("1A" ):
497
556
orig .append (col )
498
557
499
558
renamer = dict (zip (orig , fixed ))
@@ -521,8 +580,8 @@ def store_ncdb(filepath, data_dir="auto"):
521
580
for row in codebook ["formula" ].dropna ().tolist ():
522
581
try :
523
582
df .eval (row , inplace = True )
524
- except :
525
- warn ("Unable to compute " + str ( row ) )
583
+ except Exception as e :
584
+ warn (f "Unable to compute { row } with { e } " , stacklevel = 2 )
526
585
527
586
keeps = df .columns [df .columns .isin (codebook ["variable" ].tolist () + ["year" ])]
528
587
0 commit comments