-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: gendata.py
382 lines (285 loc) · 10.2 KB
/
gendata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
import argparse
import sys
from typing import Iterable, Optional
import censusdis.data as ced
import censusdis.maps as cem
import divintseg as dis
import geopandas as gpd
import numpy as np
import pandas as pd
from censusdis.states import ALL_STATES_DC_AND_PR, STATE_NAMES_FROM_IDS
# Module-level verbosity flag; set from the -v/--verbose option in main().
verbose = False
# Census dataset to download from: the decennial redistricting (PL 94-171) data.
DATASET = 'dec/pl'
def state_bounds(year: int, epsg: int) -> gpd.GeoDataFrame:
    """
    Build a geo data frame of state vector boundaries.

    Parameters
    ----------
    year
        Year of census data to use.
    epsg
        The epsg to project to.

    Returns
    -------
    The states in a geo data frame.
    """
    gdf = ced.download(
        DATASET,
        year,
        ['NAME'],
        state=ALL_STATES_DC_AND_PR,
        with_geometry=True,
    )
    # Move AK, HI, and PR for a compact map layout, then project.
    return cem.relocate_ak_hi_pr(gdf).to_crs(epsg=epsg)
def save_state_bounds(year: int, epsg: int, filename: str, rep_filename: Optional[str]):
    """
    Generate and save the state bounds.

    Parameters
    ----------
    year
        Year of census data to use.
    epsg
        The epsg to project to.
    filename
        The path to the output file.
    rep_filename
        Optional path of a second output file containing one
        representative point per state (useful for label placement).
        If None, no such file is written.

    Returns
    -------
    None
    """
    gdf_states = state_bounds(year, epsg)
    gdf_states.to_file(filename)

    if rep_filename is not None:
        # Replace each state's polygon with a point guaranteed to
        # lie inside it, then save that version too.
        gdf_states.geometry = gdf_states.representative_point()
        gdf_states.to_file(rep_filename)
def city_bounds(year: int, epsg: int) -> gpd.GeoDataFrame:
    """
    Build a geo data frame of city (census place) vector boundaries.

    Parameters
    ----------
    year
        Year of census data to use.
    epsg
        The epsg to project to.

    Returns
    -------
    The cities in a geo data frame, sorted by total population,
    largest first.
    """
    group = "P2"
    total_col = f'{group}_001N'

    gdf_cities = ced.download(
        DATASET, year,
        ['NAME', total_col],
        place="*",
        with_geometry=True
    )

    # Largest cities first. NOTE(review): drop=False keeps the pre-sort
    # index as an "index" column in the output — confirm this is intended.
    gdf_cities = gdf_cities.sort_values(
        by=total_col, ascending=False
    ).reset_index(drop=False)

    if verbose:
        print(gdf_cities[['NAME', total_col]].head(10))

    # Move AK, HI, and PR for a compact map layout, then project.
    return cem.relocate_ak_hi_pr(gdf_cities).to_crs(epsg=epsg)
def save_city_bounds(year: int, epsg: int, filename: str, rep_filename: Optional[str]):
    """
    Generate and save city bounds.

    Parameters
    ----------
    year
        Year of census data to use.
    epsg
        The epsg to project to.
    filename
        The path to the output file.
    rep_filename
        Optional path of a second output file containing one
        representative point per city (useful for label placement).
        If None, no such file is written.

    Returns
    -------
    None
    """
    gdf_cities = city_bounds(year, epsg)
    gdf_cities.to_file(filename)

    if rep_filename is not None:
        # Replace each city's polygon with a point guaranteed to
        # lie inside it, then save that version too.
        gdf_cities.geometry = gdf_cities.representative_point()
        gdf_cities.to_file(rep_filename)
def hl_variable(variable: str) -> str:
    """
    Convert to hispanic or latino variable name.

    Parameters
    ----------
    variable
        Original variable name.

    Returns
    -------
    Name of derived variable we will use for non-Hispanic or
    Latino version of the concept measured by this variable.
    """
    if not variable.startswith("P1_"):
        raise ValueError("Must be a P1 variable.")

    # The P2 leaf corresponding to a P1 leaf is numbered two higher.
    # See https://api.census.gov/data/2020/dec/pl/groups/P1.html
    # and https://api.census.gov/data/2020/dec/pl/groups/P2.html.
    suffix_num = 2 + int(variable[-4:-1])
    return f'hl_{suffix_num:03d}N'
def nhl_variable_for_hl_variable(hl_var: str) -> str:
    """
    Convert a derived hl_* variable name to its P2 counterpart.

    Parameters
    ----------
    hl_var
        A derived Hispanic-or-Latino variable name, e.g. "hl_005N",
        as produced by `hl_variable`.

    Returns
    -------
    The corresponding P2 (not Hispanic or Latino) variable name,
    e.g. "P2_005N".

    Raises
    ------
    ValueError
        If `hl_var` does not start with "hl_".
    """
    if not hl_var.startswith('hl_'):
        raise ValueError("Must be an hl variable.")
    return hl_var.replace("hl_", "P2_")
def all_state_data(year: int, group1: str, group2: str, states: Iterable[str], total_col: str):
    """
    Generate all the state data, one data frame per state.

    For each state, download the block-level leaves of `group1`
    (counts for everyone by race) and `group2` (not Hispanic or Latino
    by race), then derive per-race Hispanic or Latino counts as the
    difference of the two.

    Parameters
    ----------
    year
        Year of census data to use.
    group1
        The census group counting everyone by race (e.g. "P1").
    group2
        The census group counting not-Hispanic-or-Latino by race
        (e.g. "P2").
    states
        The state FIPS codes to process.
    total_col
        Name of the total-population column to download along with
        `group2`, used for the consistency check.

    Yields
    ------
    One data frame of block-level counts per state, with the `group1`
    leaves replaced by derived hl_* (Hispanic or Latino by race) columns.

    Raises
    ------
    ValueError
        If the derived HL and downloaded NHL leaves do not sum to the
        total population column for every block.
    """
    for state in states:
        if verbose:
            print(f"Processing {STATE_NAMES_FROM_IDS[state]}")

        # If we load all the variables at once across
        # both groups the server side sometimes has issues.
        # So we download the groups separately and then merge.
        if verbose:
            print(f"Downloading {group1}")
        df_block_for_state_group1 = ced.download(
            DATASET,
            year,
            leaves_of_group=group1,
            state=state,
            block="*",
        )
        if verbose:
            print(f"Downloading {group2}")
        df_block_for_state_group2 = ced.download(
            DATASET,
            year,
            [total_col],
            leaves_of_group=group2,
            state=state,
            block="*",
        )
        df_block_for_state = df_block_for_state_group2.merge(
            df_block_for_state_group1,
            on=["STATE", "COUNTY", "TRACT", "BLOCK"],
        )
        if verbose:
            # Typo fixed: was "Downladed".
            print(f"Downloaded {len(df_block_for_state.index)} rows.")

        group1_leaves = [var for var in df_block_for_state.columns if var.startswith(group1)]

        # Compute the HL version of each leaf: everyone of the race
        # minus the not-Hispanic-or-Latino count of the race.
        for var in group1_leaves:
            hl_var = hl_variable(var)
            nhl_var = nhl_variable_for_hl_variable(hl_var)
            df_block_for_state[hl_var] = df_block_for_state[var] - df_block_for_state[nhl_var]

        df_block_for_state = df_block_for_state.drop(group1_leaves, axis='columns')

        # Sanity check: HL and NHL leaves together should account for
        # the entire population of every block.
        hl_leaves = [var for var in df_block_for_state.columns if var.startswith('hl')]
        nhl_leaves = [nhl_variable_for_hl_variable(var) for var in hl_leaves]
        total_hl_and_non_hl = df_block_for_state[hl_leaves + nhl_leaves].sum(axis='columns')
        if not (df_block_for_state[total_col].equals(total_hl_and_non_hl)):
            raise ValueError(
                "These values should be the same! "
                "Something went wrong in computing race counts for Hispanic or Latino population."
            )

        yield df_block_for_state
# From https://github.com/vengroff/censusdis/blob/main/notebooks/Nationwide%20Diversity%20and%20Integration.ipynb
def tract_bounds(year: int, epsg: int, states: Iterable[str]) -> gpd.GeoDataFrame:
    """
    Compute per-tract diversity/integration metrics with tract geometries.

    Block-level counts are downloaded state by state, diversity and
    integration are computed over the blocks within each tract via
    divintseg, counts are summed up to the tract level, and tract
    geographies and county names are merged in.

    Parameters
    ----------
    year
        Year of census data to use.
    epsg
        The epsg to project to.
    states
        The state FIPS codes to download data for.

    Returns
    -------
    A geo data frame of tracts with DI metrics, population counts,
    county names, and geometry, projected to `epsg`.
    """
    group1 = "P1"
    group2 = "P2"
    total_col = f'{group2}_001N'  # Total population column.
    hl_col_in_group2 = f'{group2}_002N'  # Hispanic-or-Latino total column.
    # Download the data
    # If we try to bulk download all states at once we
    # get a 500 from the API with the error, "There was
    # an error while running your query. We've logged
    # the error and we'll correct it ASAP. Sorry for
    # the inconvenience." So go state by state and
    # concat them.
    df_block = pd.concat(all_state_data(year, group1, group2, states, total_col))
    if verbose:
        print(f"Downloaded a total of {len(df_block.index)} rows.")
    # Leaf columns: P2 race leaves (excluding the two totals) plus the
    # derived hl_* columns. Note that `and` binds tighter than `or`
    # here, so hl_* columns are always included.
    leaf_cols = [
        col for col in df_block.columns
        if col.startswith(group2) and col not in [total_col, hl_col_in_group2] or col.startswith('hl')
    ]
    # Compute diversity and integration
    df_di = dis.di(
        df_block[["STATE", "COUNTY", "TRACT", "BLOCK"] + leaf_cols],
        by=["STATE", "COUNTY", "TRACT"],
        over="BLOCK",
    ).reset_index()
    if verbose:
        print(f"DI over {len(df_di.index)} rows.")
    # Sum up over tracts and merge in.
    df_by_tracts = df_block.groupby(
        ["STATE", "COUNTY", "TRACT"]
    )[[total_col] + leaf_cols].sum().reset_index()
    df_di = df_di.merge(df_by_tracts, on=["STATE", "COUNTY", "TRACT"])
    # Infer the geographies
    gdf_di = ced.add_inferred_geography(df_di, year)
    # Drop tracts whose geometry could not be inferred.
    gdf_di = gdf_di[~gdf_di.geometry.isnull()]
    # Get census county names.
    df_county_names = ced.download(
        DATASET,
        year,
        ['NAME'],
        state="*",
        county="*",
    ).rename({'NAME': 'COUNTY_NAME'}, axis='columns')
    gdf_di = gdf_di.merge(df_county_names, on=['STATE', 'COUNTY'])
    # Relocate AK, HI, and PR for map display, then project.
    gdf_di = cem.relocate_ak_hi_pr(gdf_di)
    return gdf_di.to_crs(epsg=epsg)
def save_tract_bounds(year: int, epsg: int, filename: str, states: Iterable[str], csv: Optional[str] = None):
    """
    Generate and save the tract bounds with diversity/integration data.

    Parameters
    ----------
    year
        Year of census data to use.
    epsg
        The epsg to project to.
    filename
        The path to the geojson output file.
    states
        The state(s) to download data for. If None, all states,
        DC, and PR are used.
    csv
        Optional name of file for csv output.

    Returns
    -------
    None
    """
    if states is None:
        # Warning: the Census API servers are flakey enough that
        # you are not likely to get through all the states without
        # any errors.
        states = ALL_STATES_DC_AND_PR
    gdf_tracts = tract_bounds(year, epsg, states=states)
    if csv:
        # The CSV gets the tabular data only; geometry and the
        # county name column are left out.
        gdf_tracts[
            [col for col in gdf_tracts if col not in ['geometry', 'COUNTY_NAME']]
        ].to_csv(csv, index=False)
    # Replace zeros with NaN so they are left out of the
    # geojson file we write.
    gdf_tracts.replace(0, np.nan, inplace=True)
    with open(filename, 'w') as file:
        file.write(gdf_tracts.to_json(na='drop'))
def main(argv: Iterable[str]):
    """
    Parse command-line arguments and generate the requested map layer.

    Parameters
    ----------
    argv
        The full argument vector, including the program name at index 0.

    Returns
    -------
    None
    """
    parser = argparse.ArgumentParser(prog=argv[0])

    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument('-y', '--year', required=True, type=int)
    parser.add_argument('-e', '--epsg', type=int, default=4326)
    parser.add_argument('-o', '--output', type=str, help="Output file.", required=True)
    parser.add_argument('-r', '--rep-output', type=str, help="Output file of representative points.")
    parser.add_argument(
        '-s', '--states', nargs="*",
        help="States to download. Only used for generating data for tracts. Ignored otherwise"
    )
    parser.add_argument('-c', '--csv', type=str)
    parser.add_argument(
        dest='layer',
        choices=['states', 'tracts', 'cities']
    )

    args = parser.parse_args(argv[1:])

    # Flip the module-level flag so helper functions can log progress.
    global verbose
    verbose = args.verbose
    if verbose:
        print("Verbose mode on.")

    out_path = args.output
    rep_path = args.rep_output

    if args.layer == 'states':
        save_state_bounds(year=args.year, epsg=args.epsg, filename=out_path, rep_filename=rep_path)
    elif args.layer == 'cities':
        save_city_bounds(year=args.year, epsg=args.epsg, filename=out_path, rep_filename=rep_path)
    elif args.layer == 'tracts':
        save_tract_bounds(year=args.year, epsg=args.epsg, filename=out_path, csv=args.csv, states=args.states)
# Script entry point: pass the raw argument vector through to main().
if __name__ == "__main__":
    main(sys.argv)