-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathacsByNTA.py
116 lines (98 loc) · 3.92 KB
/
acsByNTA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pandas as pd
import numpy as np
import geopandas
# Economic ACS columns selected from the NTA economics file.
# Data dictionary for the variable names:
# https://www1.nyc.gov/assets/planning/download/pdf/data-maps/open-data/nta_acs_2014_2018_datadictionary.pdf?r=1
# Commented-out entries are variables that are currently not selected.
columnsToKeepEcon = [
    # --- Identifiers ---
    # "BoroCode",
    # "BoroName",
    "NTACode",
    "NTAName",

    # --- Commuting to work ---
    "cw_pbtrnsP",    # % commuting to work via public transportation (excluding taxicab)
    # "mntrvtmE",    # mean travel time to work (minutes)

    # --- Occupation ---
    # "mgbsciartP",  # % population employed in Management, business, science and arts
    "srvcP",         # % population employed in service occupations
    # "salesoffP",   # % population employed in sales and office occupations
    # "infoP",       # % population employed information

    # --- Industry ---
    # "prfsmgawmP",  # % population employed in Professional, scientific, and management,
    #                #   and administrative and waste management services
    "edhlthcsaP",    # % population employed in Educational services, and health care
                     #   and social assistance
    # "artenrafsP",  # % population employed in Arts, entertainment, and recreation,
    #                #   and accommodation and food services
    "pubadminP",     # % population employed in Public administration
    "gvtwrkrP",      # % population government workers

    # --- Income and benefits ---
    # "mdhhincE",    # median household income ($)
    # "mnhhincE",    # mean household income ($)
    "inc_snapP",     # % with Food Stamps/SNAP benefits in last 12 months
    "percapincE",    # per capita income ($)

    # --- Health insurance ---
    # "hinsP",       # % with health insurance
    # "pvhinsP",     # % with health insurance who have private insurance
    # "pbhinsP",     # % with health insurance who have public insurance
    "nhinsP",        # % with no health insurance coverage
    # "nhinsE",      # population with no health insurance coverage
    # "emnhinsP",    # % employed in labor force with no health insurance
    # "uemnhinsP",   # % unemployed in labor force with no health insurance
    # "nlfnhinsP",   # % not in labor force with no health insurance

    # --- Poverty ---
    "fambwpvP",      # % families below poverty line

    # --- Geometry (kept so the result stays usable for spatial operations) ---
    "geometry",
]
# Demographic ACS columns selected from the NTA demographics file.
# "NTACode" is the key used to join these onto the economic columns;
# commented-out entries are variables that are currently not selected.
columnsToKeepDemo = [
    "NTACode",

    # --- Demographics ---
    # "blnhP",  # % Black or African American Alone
    "wtnhP",    # % White Alone
    # "hsp1P",  # % Hispanic/Latino (of any race) Alone
]
def pullInACSGeoJson():
    """
    Pulls in ACS data by Neighborhood Tabulation Area (NTA).
    Selects subset of columns.

    Reads the economic and the demographic GeoJSON files, keeps only the
    columns named in ``columnsToKeepEcon`` / ``columnsToKeepDemo``, adds a
    derived ``pct_nonwhite`` column (``100 - wtnhP``) to the demographic
    columns and joins both tables on ``NTACode``.

    Returns:
        The economic columns (including ``geometry``) joined with the
        demographic columns, one row per row of the economic file, with
        ``NTACode`` restored as a regular column.
    """
    econPath = "data/Neighborhood Tabulation Areas (NTA)/ACS/nta_with_acs_economics.geojson"
    econDf = geopandas.read_file(econPath)[columnsToKeepEcon]

    demoPath = "data/Neighborhood Tabulation Areas (NTA)/ACS/nta_with_acs.geojson"
    # .copy() turns the column subset into an independent frame, so adding the
    # derived column below does not raise pandas' SettingWithCopyWarning.
    demoDf = geopandas.read_file(demoPath)[columnsToKeepDemo].copy()
    demoDf["pct_nonwhite"] = 100 - demoDf["wtnhP"]

    # DataFrame.join defaults to a left join: every economic row is kept.
    return econDf.set_index("NTACode").join(demoDf.set_index("NTACode")).reset_index()
def pullInStationCSV():
    """
    Pulls in station csv.
    Turns into geopandas so that it can be used in a spatial join.

    Rows that lack either coordinate are dropped first, because a point
    cannot be built from a missing ``lat`` or ``long``.

    Returns:
        A GeoDataFrame with all CSV columns plus a point ``geometry`` column
        built from ``long`` (x) and ``lat`` (y).
    """
    path = "data/Stations/MTA_Station_Mapping_from_turnstile_data.csv"
    df = pd.read_csv(path)

    # Remove empty geometries: both coordinates must be present. dropna also
    # returns a new frame, so the GeoDataFrame is not built on a slice of df.
    stations = df.dropna(subset=["lat", "long"])

    # NOTE(review): no CRS is assigned here; the coordinates are presumably
    # WGS84 degrees like the NTA GeoJSON - confirm before relying on it.
    return geopandas.GeoDataFrame(
        stations, geometry=geopandas.points_from_xy(stations["long"], stations["lat"]))
def handleSpatialJoin(ntas, stations):
    """
    Merges stations into their enclosing ntas.

    Args:
        ntas: GeoDataFrame of NTA polygons; must provide ``NTACode`` and
            ``NTAName`` columns.
        stations: GeoDataFrame of station points.

    Returns:
        The stations with the ``NTACode`` / ``NTAName`` of the intersecting
        NTA attached, restricted to the columns in ``stationColsToKeep``.
        Because the join is a left join, stations that intersect no NTA are
        kept with missing NTA values.
    """
    # "intersects" is sjoin's default predicate, so it is not passed
    # explicitly: the keyword was called `op` in older geopandas releases and
    # `predicate` in newer ones (where `op` is no longer accepted), and
    # relying on the default works with both.
    stationsWithNTA = geopandas.sjoin(stations, ntas, how="left")

    stationColsToKeep = [
        'station_code',
        'station',
        "GTFS_stop_id",
        "C/A",
        "line_name",
        # 'BoroCode',
        # 'BoroName',
        'NTACode',
        'NTAName',
        "geometry",
        "lat",
        "long",
        "unit",
    ]
    return stationsWithNTA[stationColsToKeep]
def main():
    """
    Runs the whole pipeline: loads the NTA table and the stations, assigns
    each station to an NTA and writes both results under ``data/output``.
    """
    ntaFrame = pullInACSGeoJson()
    # Arguments are evaluated left to right, so the station CSV is still read
    # after the NTA data and before the spatial join runs.
    stationsByNta = handleSpatialJoin(ntaFrame, pullInStationCSV())

    ntaFrame.to_file("data/output/acs_nta.geojson", driver='GeoJSON')
    stationsByNta.to_csv("data/output/stations_with_ntas.csv", index=False)


# Only run the pipeline when executed as a script, not on import.
if __name__ == "__main__":
    main()