-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathmerge_data.py
157 lines (128 loc) · 4.88 KB
/
merge_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import utils
import logging
import glob, sys
# Module-level logger echoing to stdout. At the default WARNING level the
# log.info() calls sprinkled through this script are silent; lower the level
# here to see per-row merge/skip detail.
log = logging.getLogger(__name__)
log.addHandler(logging.StreamHandler(sys.stdout))
log.setLevel(logging.WARNING)
def load_districts(fn):
    """Load a congressional-district CSV and group its rows by tract GEOID.

    Returns a dict mapping the trimmed geoid ('block' key) to the list of
    rows sharing it, for fast lookup during the merge.
    """
    wanted = {
        'GEOID': 'tract',
        'CDFP': 'cd',
    }
    rows = utils.load_csv_columns(fn, wanted)
    # trim the id to the shorter geoid (drop last 4 digits) so a dict keyed
    # on it gives fast lookups during the merge
    for row in rows:
        row['block'] = row['tract'][:-4]
    return utils.list_key_values(rows, 'block')
def load_tracts(fn):
    """Load the pipe-delimited national tract/ZCTA relationship file.

    Returns a dict mapping each ZCTA to its list of tract rows.
    """
    wanted = {
        'GEOID_TRACT_20': 'tract',
        'GEOID_ZCTA5_20': 'zcta',
    }
    rows = utils.load_csv_columns(fn, wanted, delimiter='|')
    return utils.list_key_values(rows, 'zcta')
def load_fips(fn):
    """Load the pipe-delimited state FIPS file.

    Returns a dict mapping state FIPS code -> state abbreviation.
    """
    wanted = {
        'STATE': 'state_fips',
        'STUSAB': 'state',
    }
    rows = utils.load_csv_columns(fn, wanted, delimiter='|')
    return {row['state_fips']: row['state'] for row in rows}
def merge_by_tract(cd_dict, zcta_dict):
    """Join ZCTAs to congressional districts via their shared tract geoid.

    cd_dict maps tract geoid -> district rows; zcta_dict maps ZCTA -> tract
    rows. Returns a list of {'zcta', 'cd', 'state_fips'} dicts, one per
    unique (state_fips, cd) match for each ZCTA.
    """
    merged = []
    for zcta, rows in zcta_dict.items():
        if not zcta:
            # skip initial blanks
            continue
        # collect unique "fips-cd" pairs across every tract in this ZCTA
        pairs = set()
        for tract in set(row['tract'] for row in rows):
            state_fips = tract[:2]  # first two digits of the geoid
            for match in cd_dict[tract]:
                pairs.add(f"{state_fips}-{match['cd']}")
        for pair in pairs:
            state_fips, cd_id = pair.split('-')
            entry = {'zcta': zcta, 'cd': cd_id, 'state_fips': state_fips}
            log.info(entry)
            merged.append(entry)
    return merged
def state_fips_to_name(zccd):
    """Add a 'state_abbr' column to each row, looked up from its FIPS code.

    Mutates the rows in place and returns the same list for chaining.
    """
    for entry in zccd:
        entry['state_abbr'] = FIPS_TO_STATE[entry['state_fips']]
    return zccd
def remove_district_padding(zccd):
    """Drop unusable district rows and strip zero-padding from 'cd'.

    Rows whose district is 'null', empty, or 'ZZ' are removed ('ZZ' marks
    areas that are mostly water). Mutates kept rows in place and returns a
    new filtered list.
    """
    kept = []
    for entry in zccd:
        cd = entry['cd']
        if cd in ('null', '', 'ZZ'):
            # skip empty rows; ZZ means mostly water
            continue
        if cd == '98':
            # non-voting districts are noted as 98 in census, but 0 in other sources
            cd = 0
        # round-trip through int to shed any leading-zero padding
        entry['cd'] = str(int(cd))
        kept.append(entry)
    return kept
def sanity_check(zccd, incorrect_states_dict):
    """Filter out rows whose ZCTA has a known-impossible prefix for its state.

    incorrect_states_dict maps a state abbreviation to a ZCTA prefix that
    should never appear in that state; matching rows are dropped as clearly
    inaccurate. Returns a new list of the surviving rows.
    """
    checked = []
    for entry in zccd:
        state = entry['state_abbr']
        if state in incorrect_states_dict:
            bad_prefix = incorrect_states_dict[state]
            if entry['zcta'].startswith(bad_prefix):
                log.info('zcta %s in %s should not start with %s, SKIPPING' % (entry['zcta'], state, bad_prefix))
                continue
        checked.append(entry)
    return checked
# Lookup tables populated at runtime; state_fips_to_name() reads FIPS_TO_STATE.
FIPS_TO_STATE = {}
STATE_TO_FIPS = {}

if __name__ == "__main__":
    # load state FIPS codes
    FIPS_TO_STATE = load_fips('raw/state_fips.txt')
    STATE_TO_FIPS = {v: k for k, v in FIPS_TO_STATE.items()}

    # load national tract file
    tract_to_zcta = load_tracts('raw/zcta520_tract20_natl.txt')

    zccd_national = []
    zccd_files = sorted(glob.glob('raw/cd119/*.txt'), reverse=True)
    # process in reverse alpha order, so we get national first
    for filename in zccd_files:
        # load district file
        cd_to_tract = load_districts(filename)
        # merge by the tract geoid
        zccd = merge_by_tract(cd_to_tract, tract_to_zcta)
        # clean output
        zccd_cleaned = remove_district_padding(zccd)
        # insert state abbreviation column
        zccd_named = state_fips_to_name(zccd_cleaned)
        print("got %s ZCTA->CD mappings for %s" % (len(zccd_named), filename))
        zccd_national.extend(zccd_named)

    # The national and per-state files overlap, so deduplicate on the
    # (zcta, cd) pair, keeping the first row seen — same result as the old
    # build-a-"zcta-cd"-string-set-then-filter approach, but in a single
    # O(n) pass instead of one full-list scan per unique key, and without
    # re-parsing the pair out of a string key.
    print("deduplicating state and national mappings")
    seen = set()
    zccd_unique = []
    for row in zccd_national:
        key = (row['zcta'], row['cd'])
        if key not in seen:
            seen.add(key)
            zccd_unique.append(row)
    print("got %s ZCTA->CD mappings for %s" % (len(zccd_unique), 'national'))

    # re-sort by state FIPS code
    zccd_sorted = sorted(zccd_unique, key=lambda k: (k['state_fips'], k['zcta'], k['cd']))
    # write output
    utils.csv_writer('zccd.csv', zccd_sorted, ['state_fips', 'state_abbr', 'zcta', 'cd'])