forked from PolicyEngine/slcsp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmergeLA9.py
223 lines (175 loc) · 7.73 KB
/
mergeLA9.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# Data-handling dependencies.
import pandas as pd
import numpy as np
import logging
# Module-level logger; INFO level so progress messages show when run as a script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def standardize_county_name(county):
    """Standardize a county name to match the rating-area file's spelling.

    Strips 'County'/'Parish' suffixes, normalizes independent-city
    capitalization, removes stray HTML tags, then applies a table of known
    misspellings and formatting variations.

    Parameters
    ----------
    county : str or NaN
        Raw county name. NaN (or None) is returned unchanged.

    Returns
    -------
    str or NaN
        Standardized county name, stripped of surrounding whitespace.
    """
    if pd.isna(county):
        return county
    county = str(county).strip()
    # Remove 'County' and standardize independent-city capitalization.
    county = county.replace(' County', '').replace(' city', ' City')
    county = county.replace(' Parish', '')  # For Louisiana parishes
    county = county.replace('</p>', '')  # Remove stray HTML tags
    # Known misspellings / formatting fixes, applied as ORDERED substring
    # replacements. Insertion order matters: 'Trail' -> 'Traill' corrupts an
    # already-correct 'Traill' into 'Trailll', so the 'Trailll' -> 'Traill'
    # entry immediately after repairs that case.
    replacements = {
        # Indiana fixes
        'Kosclusko': 'Kosciusko',
        'Deleware': 'Delaware',
        'Davless': 'Daviess',
        'Dubols': 'Dubois',
        'Marlon': 'Marion',
        # Illinois fixes
        'Dupage': 'DuPage',
        'De Witt': 'DeWitt',
        # California fixes
        'San Bernadino': 'San Bernardino',
        # Kansas fixes
        'Chautaugua': 'Chautauqua',
        # North Dakota fixes
        'Trail': 'Traill',
        'Trailll': 'Traill',
        # Louisiana fixes
        'Vermillion': 'Vermilion',
        # Texas fixes
        'Culbertson': 'Culberson',
        'Ochittree': 'Ochiltree',
        'Wheiler': 'Wheeler',
        # Wisconsin fixes
        'LaFayette': 'Lafayette',
        # Georgia fixes
        'Heralson': 'Haralson',
        'DeKalb': 'De Kalb',
        # Florida fixes
        'Desoto': 'DeSoto',
        # Minnesota fixes
        'Lac Qui Parle': 'Lac qui Parle',
        # (removed the original no-op 'Lac qui Parle' -> 'Lac qui Parle' entry)
        # Ohio fixes
        'Galia': 'Gallia',
        # South Dakota fixes
        'Mc Cook': 'McCook',
        'Bonn Homme': 'Bon Homme',
        'DeBaca': 'De Baca',
        # Common variations
        'Saint': 'St.',
        'St ': 'St. '
    }
    for old, new in replacements.items():
        county = county.replace(old, new)
    return county.strip()
def merge_rating_areas(zip_county_df, areas_df):
    """
    Merge ZIP codes with insurance rating areas.

    Matching strategy, in priority order:
      1. Territories (GU/VI/PR/AS/MP) are all rating area '1'.
      2. Los Angeles County ZIP3 prefixes map to CA areas 15/16.
      3. ZIP3-based entries from ``areas_df``.
      4. County-name-based entries from ``areas_df``.

    Parameters
    ----------
    zip_county_df : pandas.DataFrame
        Crosswalk with ZIP, COUNTYNAME, STATE, STCOUNTYFP, CLASSFP columns.
    areas_df : pandas.DataFrame
        Rating areas with STATE, COUNTY_ZIP3 (either a county name or a
        3-digit ZIP prefix) and AREA columns.

    Returns
    -------
    pandas.DataFrame
        One row per input record; ``rating_area`` is NaN where unmatched.
        Side effects: prints a summary and, if any records are unmatched,
        writes them to ``unmatched_records_9.csv``.
    """
    # Make copies to avoid modifying original dataframes
    zip_county_df = zip_county_df.copy()
    areas_df = areas_df.copy()
    # Store original county names for the unmatched report
    zip_county_df['ORIGINAL_COUNTYNAME'] = zip_county_df['COUNTYNAME']
    # Initialize result DataFrame
    result = zip_county_df[['ZIP', 'COUNTYNAME', 'ORIGINAL_COUNTYNAME',
                            'STATE', 'STCOUNTYFP', 'CLASSFP']].copy()
    result.columns = ['zip_code', 'county_standardized', 'county_original',
                      'state', 'stcountyfp', 'classfp']
    # ZIPs may arrive as integers (leading zeros lost) -- normalize to
    # 5-character strings before extracting ZIP3.
    result['zip_code'] = result['zip_code'].astype(str).str.zfill(5)
    result['ZIP3'] = result['zip_code'].str[:3]
    result['rating_area'] = np.nan

    # Territories all get rating area 1.
    territories = ['GU', 'VI', 'PR', 'AS', 'MP']
    territory_mask = result['state'].isin(territories)
    result.loc[territory_mask, 'rating_area'] = '1'
    non_territory_mask = ~territory_mask

    # BUG FIX: the original standardized county names on a boolean-mask slice
    # (pandas chained assignment / SettingWithCopyWarning), so the
    # standardized values were not reliably written back into ``result``.
    # Assign through .loc on ``result`` itself instead.
    result.loc[non_territory_mask, 'county_standardized'] = (
        result.loc[non_territory_mask, 'county_standardized']
        .apply(standardize_county_name)
    )
    areas_df['COUNTY_ZIP3'] = areas_df['COUNTY_ZIP3'].apply(standardize_county_name)

    # Split the rating-area table into ZIP3-keyed and county-name-keyed
    # lookups in a single pass (the original iterated areas_df twice).
    zip3_mapping = {}
    county_mapping = {}
    for _, row in areas_df.iterrows():
        key = (row['STATE'], row['COUNTY_ZIP3'])
        if str(row['COUNTY_ZIP3']).isdigit():
            zip3_mapping[key] = row['AREA']
        else:
            county_mapping[key] = row['AREA']

    # Los Angeles County is split across two CA rating areas by ZIP3 prefix.
    la_zip3_area_15 = {'906', '907', '908', '910', '911', '912', '915', '917', '918', '935'}
    la_zip3_area_16 = {'900', '902', '903', '904', '905', '913', '914', '916', '923', '928', '932'}

    # Process non-territory records
    for idx, row in result[non_territory_mask].iterrows():
        if row['state'] == 'CA':
            # BUG FIX: BOTH Los Angeles branches now skip the generic
            # matching below, so an LA assignment is never overwritten.
            if row['ZIP3'] in la_zip3_area_15:
                result.at[idx, 'rating_area'] = '15'
                continue
            if row['ZIP3'] in la_zip3_area_16:
                result.at[idx, 'rating_area'] = '16'
                continue
        # Try ZIP3-based matching first
        key = (row['state'], row['ZIP3'])
        if key in zip3_mapping:
            result.at[idx, 'rating_area'] = zip3_mapping[key]
            continue
        # Fall back to county-based matching
        key = (row['state'], row['county_standardized'])
        if key in county_mapping:
            result.at[idx, 'rating_area'] = county_mapping[key]

    # Generate report
    total_records = len(result)
    matched_records = result['rating_area'].notna().sum()
    print("\nMatching Results:")
    print(f"Total records: {total_records}")
    print(f"Successfully matched: {matched_records}")
    if total_records:  # guard against division by zero on empty input
        print(f"Success rate: {(matched_records / total_records) * 100:.2f}%")

    # Report unmatched records
    unmatched = result[result['rating_area'].isna()].copy()
    if not unmatched.empty:
        print("\nUnmatched records by state:")
        state_counts = unmatched['state'].value_counts()
        print(state_counts)

        unmatched_sorted = unmatched.sort_values(['state', 'county_original', 'zip_code'])

        # Show, per state, which county names exist in the rating-areas file
        # so name mismatches can be diagnosed by eye.
        rating_areas_counties = areas_df[~areas_df['COUNTY_ZIP3'].str.isdigit()].copy()
        rating_areas_counties = rating_areas_counties[['STATE', 'COUNTY_ZIP3']].rename(
            columns={'STATE': 'state', 'COUNTY_ZIP3': 'rating_area_county'})
        unmatched_sorted['available_counties'] = unmatched_sorted['state'].map(
            rating_areas_counties.groupby('state')['rating_area_county'].apply(
                lambda x: ', '.join(sorted(set(x)))))

        # Select and reorder columns for output
        output_columns = [
            'state',
            'county_original',
            'county_standardized',
            'zip_code',
            'stcountyfp',
            'classfp',
            'available_counties',
        ]
        unmatched_path = 'unmatched_records_9.csv'
        unmatched_sorted[output_columns].to_csv(unmatched_path, index=False)
        print(f"\nUnmatched records saved to: {unmatched_path}")

        # Print sample of unmatched records
        print("\nSample of unmatched records:")
        print(unmatched_sorted[output_columns].head())

    return result
def main():
    """Script entry point: load inputs, merge rating areas, save results.

    Reads ``areas.csv`` and ``ZIP-COUNTY-FIPS_2017-06.csv`` from the current
    directory and writes ``merged_results_v9.csv``. Missing input files are
    reported; any other failure is logged with its traceback.
    """
    print("Script starting...")
    try:
        # Load data
        logger.info("Loading data files...")
        areas_df = pd.read_csv('areas.csv')
        print(f"Successfully loaded areas.csv with {len(areas_df)} rows")
        zip_county_df = pd.read_csv('ZIP-COUNTY-FIPS_2017-06.csv')
        print(f"Successfully loaded ZIP-COUNTY file with {len(zip_county_df)} rows")

        logger.info("Processing merge...")
        result_df = merge_rating_areas(zip_county_df, areas_df)

        # Save results
        output_path = 'merged_results_v9.csv'
        result_df.to_csv(output_path, index=False)
        print(f"\nResults saved to: {output_path}")
    except FileNotFoundError as e:
        print(f"Error: Could not find input file: {e}")
    except Exception as e:
        # BUG FIX: the original printed only str(e), discarding the
        # traceback and making failures hard to debug. Log it in full at
        # this top-level boundary before reporting to the user.
        logger.exception("Unexpected error during merge")
        print(f"Error: An unexpected error occurred: {e}")


if __name__ == "__main__":
    main()