-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcovid_data.py
499 lines (377 loc) · 20.2 KB
/
covid_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
import requests
import json
import os
import sqlite3
import pandas as pd
from sodapy import Socrata
import datetime
from datetime import date, timedelta
from bs4 import BeautifulSoup
import re
import csv
import plotly.express as px
##PART 1: TRANSPORT DATA
client = Socrata("data.bts.gov", None)
def get_resp(level, date, state_code = 'MI'):
'''Takes in three parameters and returns a list of dictionaries with data on trips by distance from the Bureau
of Transportation Statistics.
Values for level: National, State (note cases)
Value for date: Any ISO formated Floating Datetime Object
Value for state_code: Any two-digit state code (ie CA, AK). MI is the default value'''
if level == 'State':
resp = client.get("w96p-f2qv", limit = 25, state_code = state_code, date = date)
else:
url = f"https://data.bts.gov/resource/w96p-f2qv.json?level={level}*date={date}&limit=25"
resp = client.get("w96p-f2qv", level = level, limit=25, date = date)
return resp
def increment_time(start_date):
'''Takes in a DateTime object. Increments it by 7 days to jump ahead to the next week.
Returns the incremented DateTime object'''
next_week = start_date + timedelta(days=7)
return next_week
def build_national_lst(start_date):
'''Takes in a start date. Calls the get_resp function 53 times to get data for a complete year.
Appends results from each call to get_resp to a list and returns that list.'''
national_lst = []
date = start_date
for i in range(53):
data = get_resp(level = 'National', date = date.isoformat())
national_lst.append(data)
date = increment_time(date)
return national_lst
def build_state_lst(start_date, state_code = 'MI'):
'''Takes in a start date and state code (Michigan is the defaul value). Calls the get_resp function 53 times to get
a data dictionary for a complete year. Appends results from each call to get_resp to a list.
Returns a list of lists of dictionaries.'''
state_lst = []
date = start_date
for i in range(53):
data = get_resp(level = 'State', date = date.isoformat(), state_code = state_code)
state_lst.append(data)
date = increment_time(date)
return state_lst
def setUpDatabase(db_name):
'''Takes in the name of a database. Creates the database and returns a cur and conn'''
path = os.path.dirname(os.path.abspath(__file__))
conn = sqlite3.connect(path+'/'+db_name)
cur = conn.cursor()
return cur, conn
def create_transport_database(national_lst, state_lst):
'''Takes in a list of dictionaries of state data and list of dictionaries of national data.
Extracts values from the list, conver them from strings to floating points, inserts values
into a table and commits the changes. Does not return anything'''
cur, conn = setUpDatabase('full_data.db')
cur.execute('''CREATE TABLE IF NOT EXISTS transport (i INTEGER PRIMARY KEY UNIQUE, week_id INTEGER, location_id INTEGER, at_home INTEGER,
not_at_home INTEGER, ratio_at_home INTEGER, one_mile_trips INTEGER, one_to_three_mile_trips INTEGER,
three_to_five_mile_trips INTGER, short_trips INTEGER, hundred_to_250_mile_trips INTEGER,
twohundredfifty_to_500_mile_trips INTEGER, long_trips INTEGER, ratio_short_to_long INTEGER)''')
cur.execute("CREATE TABLE IF NOT EXISTS weeks (id INTEGER PRIMARY KEY, week TEXT UNIQUE)")
cur.execute("CREATE TABLE IF NOT EXISTS locs (id INTEGER PRIMARY KEY, loc TEXT UNIQUE)")
for i in range(len(national_lst)):
week = str(national_lst[i][0]['date'])[:10]
cur.execute('''INSERT OR IGNORE INTO weeks (week) VALUES (?)''', (week,))
cur.execute('''SELECT id FROM weeks where week = (?)''', (week,))
week_id = cur.fetchone()[0]
location = national_lst[i][0]['level']
cur.execute('''INSERT OR IGNORE INTO locs (loc) VALUES (?)''', (location,))
cur.execute('''SELECT id FROM locs where loc = (?)''', (location,))
location_id = cur.fetchone()[0]
at_home = float(national_lst[i][0]['pop_stay_at_home'])
not_at_home = float(national_lst[i][0]['pop_not_stay_at_home'])
ratio_at_home = (at_home) / (not_at_home)
one_mile_trips = float(national_lst[i][0]['trips_1'])
one_to_three_mile_trips = float(national_lst[i][0]['trips_1_3'])
three_to_five_mile_trips = float(national_lst[i][0]['trips_3_5'])
short_trips = float(national_lst[i][0]['trips_10_25']) + float(national_lst[i][0]['trips_25_50'])
hundred_to_250_mile_trips = float(national_lst[i][0]['trips_100_250'])
twohundredfifty_to_500_mile_trips = float(national_lst[i][0]['trips_250_500'])
long_trips = float(national_lst[i][0]['trips_500'])
try:
ratio_short_to_long= short_trips / long_trips
except:
ratio_short_to_long = 0
cur.execute('''INSERT OR IGNORE INTO transport (i, week_id, location_id, at_home, not_at_home, ratio_at_home,
one_mile_trips, one_to_three_mile_trips, three_to_five_mile_trips, short_trips, hundred_to_250_mile_trips,
twohundredfifty_to_500_mile_trips, long_trips, ratio_short_to_long) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
(i, week_id, location_id, at_home, not_at_home, ratio_at_home, one_mile_trips, one_to_three_mile_trips,
three_to_five_mile_trips, short_trips, hundred_to_250_mile_trips, twohundredfifty_to_500_mile_trips,
long_trips, ratio_short_to_long))
conn.commit()
primary_key = len(state_lst)
for i in range(len(state_lst)):
primary_key += 1
week = str(state_lst[i][0]['date'])[:10]
cur.execute('''INSERT OR IGNORE INTO weeks (week) VALUES (?)''', (week,))
cur.execute('''SELECT id FROM weeks where week = (?)''', (week,))
week_id = cur.fetchone()[0]
location = state_lst[i][0]['state_code']
cur.execute('''INSERT OR IGNORE INTO locs (loc) VALUES (?)''', (location,))
cur.execute('''SELECT id FROM locs where loc = (?)''', (location,))
location_id = cur.fetchone()[0]
at_home = float(state_lst[i][0]['pop_stay_at_home'])
not_at_home = float(state_lst[i][0]['pop_not_stay_at_home'])
ratio_at_home = (at_home) / (not_at_home)
one_mile_trips = float(state_lst[i][0]['trips_1'])
one_to_three_mile_trips = float(state_lst[i][0]['trips_1_3'])
three_to_five_mile_trips = float(state_lst[i][0]['trips_3_5'])
short_trips = float(state_lst[i][0]['trips_10_25']) + float(state_lst[i][0]['trips_25_50'])
hundred_to_250_mile_trips = float(state_lst[i][0]['trips_100_250'])
twohundredfifty_to_500_mile_trips = float(state_lst[i][0]['trips_250_500'])
long_trips = float(state_lst[i][0]['trips_500'])
try:
ratio_short_to_long= short_trips / long_trips
except:
ratio_short_to_long = 0
cur.execute('''INSERT OR IGNORE INTO transport (i, week_id, location_id, at_home, not_at_home, ratio_at_home,
one_mile_trips, one_to_three_mile_trips, three_to_five_mile_trips, short_trips, hundred_to_250_mile_trips,
twohundredfifty_to_500_mile_trips, long_trips, ratio_short_to_long) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
(primary_key, week_id, location_id, at_home, not_at_home, ratio_at_home, one_mile_trips, one_to_three_mile_trips,
three_to_five_mile_trips, short_trips, hundred_to_250_mile_trips, twohundredfifty_to_500_mile_trips,
long_trips, ratio_short_to_long))
conn.commit()
##PART 2: COVID 19 DATA
def get_data():
myapp_token = "1OabKlWEiltQ0w4gC7C7XJd6Q"
client = Socrata("data.cdc.gov", myapp_token)
resultsUS1 = client.get("r8kw-7aab", state = "United States",year = 2020, where = "mmwr_week>11", limit=21) #returns 42 weeks data
resultsUS2 = client.get("r8kw-7aab", state = "United States", year = 2020, where = "mmwr_week>31")
resultsUS3 = client.get("r8kw-7aab", state = "United States",year = 2021, where = "mmwr_week<12")
resultsMI1 = client.get("r8kw-7aab",state = "Michigan",year = 2020, where = "mmwr_week>11", limit=21)
resultsMI2 = client.get("r8kw-7aab", state = "Michigan", year = 2020, where = "mmwr_week>31")
resultsMI3=client.get("r8kw-7aab",state = "Michigan",year = 2021, where = "mmwr_week<12")
midict ={}
usdict={}
for result in resultsMI1:
midict[result['start_date'][:10]]=int(result['covid_19_deaths'])
for result in resultsMI2:
midict[result['start_date'][:10]]=int(result['covid_19_deaths'])
for result in resultsMI3:
midict[result['start_date'][:10]]=int(result['covid_19_deaths'])
for result in resultsUS2:
usdict[result['start_date'][:10]]=int(result["covid_19_deaths"])
for result in resultsUS1:
usdict[result['start_date'][:10]]=int(result["covid_19_deaths"])
for result in resultsUS3:
usdict[result['start_date'][:10]]=int(result["covid_19_deaths"])
return [midict, usdict]
def create_db(midata, usdata):
cur, conn = setUpDatabase('full_data.db')
cur.execute("CREATE TABLE IF NOT EXISTS covid_deaths (i INTEGER PRIMARY KEY, wk TEXT, loc INTEGER, deaths NUMBER)")
#cur.execute("CREATE TABLE IF NOT EXISTS weeks (id INTEGER PRIMARY KEY, week TEXT UNIQUE)")
#cur.execute("CREATE TABLE IF NOT EXISTS locs (id INTEGER, loc TEXT UNIQUE)")
for i in range(len(usdata.keys())):
weeks = list(usdata.keys())
week = weeks[i]
deaths = usdata[week]
loc = 'National'
cur.execute('''INSERT OR IGNORE INTO locs (loc) VALUES (?)''', (loc,))
cur.execute('''SELECT id FROM locs where loc = (?)''', (loc,))
loc_id = cur.fetchone()[0]
cur.execute('''INSERT OR IGNORE INTO weeks (week) VALUES (?)''', (week,))
cur.execute('''SELECT id FROM weeks where week = (?)''', (week,))
week_id = cur.fetchone()[0]
cur.execute("INSERT OR IGNORE INTO covid_deaths (i, wk,loc, deaths) VALUES (?,?,?,?)", (i, week_id,loc_id,deaths))
conn.commit()
primary_key = len(usdata.keys())
for i in range(len(midata.keys())):
primary_key += 1
weeks = list(midata.keys())
week = weeks[i]
deaths = midata[week]
loc = 'MI'
cur.execute('''INSERT OR IGNORE INTO locs (loc) VALUES (?)''', (loc,))
cur.execute('''SELECT id FROM locs where loc = (?)''', (loc,))
loc_id = cur.fetchone()[0]
cur.execute('''INSERT OR IGNORE INTO weeks (week) VALUES (?)''', (week,))
cur.execute('''SELECT id FROM weeks where week = (?)''', (week,))
week_id = cur.fetchone()[0]
cur.execute("INSERT OR IGNORE INTO covid_deaths (i, wk,loc,deaths) VALUES (?,?,?,?)", (primary_key, week_id,loc_id,deaths))
conn.commit()
# set primary keys ?? how to do this
#PART 3: UNEMPLOYMENT DATA
with open("unemployment_report.html") as f:
soup = BeautifulSoup(f, "html.parser")
def get_week_dates():
#Gets start dates of weeks and formats into keys that match the rest of the code
#Function will return a list of these weekly start dates
#List returns dates chronologically from March 2020 to may 2020
date_list = []
anchor = soup.find("table")
rows = anchor.find_all('tr')
#rows is sliced to get data from specified dates our group decided upon
for row in rows[13:38]:
header = row.find('td')['headers']
date = header[0]
#broke up the date into year, month, and day for reformatting purposes
year = date[-4:]
day = date[3:5]
month = date[:2]
#this formats the date given into a format our group agreed upon
formatted_date = str(year) + '-' + str(month) + '-' + str(day)
date_list.append(formatted_date)
for row in rows[38:63]:
header = row.find('td')['headers']
date = header[0]
#broke up the date into year, month, and day for reformatting purposes
year = date[-4:]
day = date[3:5]
month = date[:2]
#this formats the date given into a format our group agreed upon
formatted_date = str(year) + '-' + str(month) + '-' + str(day)
date_list.append(formatted_date)
for row in rows[63:-42]:
header = row.find('td')['headers']
date = header[0]
#broke up the date into year, month, and day for reformatting purposes
year = date[-4:]
day = date[3:5]
month = date[:2]
#this formats the date given into a format our group agreed upon
formatted_date = str(year) + '-' + str(month) + '-' + str(day)
date_list.append(formatted_date)
return date_list
def get_national_initial_nsa_claims():
#Gets weekly initial not seasonally adjusted jobless claims from week of March 14, 2020 to March 13, 2021.
#Function will return a list of these numbers per week
#List returns numbers chronologically from March 2020 to may 2020
nsa_list = []
anchor = soup.find("table")
rows = anchor.find_all('tr')
#rows is sliced to get data from specified dates our group decided upon
for row in rows[13:38]:
nsa_initial_claim_num = row.find('td').text
#this gets rid of commas in numbers so I can convert the strings into integers without error
num = nsa_initial_claim_num.replace(',', '')
nsa_list.append(int(num))
for row in rows[38:63]:
nsa_initial_claim_num = row.find('td').text
#this gets rid of commas in numbers so I can convert the strings into integers without error
num = nsa_initial_claim_num.replace(',', '')
nsa_list.append(int(num))
for row in rows[63:-42]:
nsa_initial_claim_num = row.find('td').text
#this gets rid of commas in numbers so I can convert the strings into integers without error
num = nsa_initial_claim_num.replace(',', '')
nsa_list.append(int(num))
return nsa_list
def get_mich_initial_nsa_claims():
#Gets weekly initial not seasonally adjusted jobless claims from week of March 14, 2020 to March 13, 2021.
#Function will return a list of these numbers per week
#List returns numbers chronologically from March 2020 to may 2020
with open("mich_unemployment.html") as f:
soup = BeautifulSoup(f, "html.parser")
nsa_list = []
anchor = soup.find("table")
rows = anchor.find_all('tr')
#rows is sliced to get data from specified dates our group decided upon
for row in rows[12:37]:
nsa_initial_claim_num = row.find('td').text
#this gets rid of commas in numbers so I can convert the strings into integers without error
num = nsa_initial_claim_num.replace(',', '')
nsa_list.append(int(num))
for row in rows[37:62]:
nsa_initial_claim_num = row.find('td').text
#this gets rid of commas in numbers so I can convert the strings into integers without error
num = nsa_initial_claim_num.replace(',', '')
nsa_list.append(int(num))
for row in rows[62:-4]:
nsa_initial_claim_num = row.find('td').text
#this gets rid of commas in numbers so I can convert the strings into integers without error
num = nsa_initial_claim_num.replace(',', '')
nsa_list.append(int(num))
return nsa_list
def get_national_weekly_cumulative_total():
#Function gets a weekly cumulative total of initial NSA unemployment claims as weeks go on
#Function will return a list of the cumulative total per week
#List returns numbers chronologically from March 2020 to may 2020
total = 0
cumulative_totals = []
claim_per_week = get_national_initial_nsa_claims()
for num in claim_per_week:
total += num
cumulative_totals.append(total)
return cumulative_totals
def get_mich_weekly_cumulative_total():
#Function gets a weekly cumulative total of initial Michigan NSA unemployment claims as weeks go on
#Function will return a list of the cumulative total per week
#List returns numbers chronologically from March 2020 to may 2020
total = 0
cumulative_totals = []
claim_per_week = get_mich_initial_nsa_claims()
for num in claim_per_week:
total += num
cumulative_totals.append(total)
return cumulative_totals
def make_national_dict():
#Function calls get_week_dates and get_national_initial_nsa_claims
#Function will turn both lists that are returned into a dictionary
#Dates will be the keys and initial NSA claims for national data will be values
#Function will return the dictionary
dates = get_week_dates()
claim_nums = get_national_initial_nsa_claims()
return dict(zip(dates, claim_nums))
def make_mich_dict():
#Function calls get_week_dates and get_mich_initial_nsa_claims
#Function will turn both lists that are returned into a dictionary
#Dates will be the keys and initial NSA claims for Michigan data will be values
#Function will return the dictionary
dates = get_week_dates()
claim_nums = get_mich_initial_nsa_claims()
return dict(zip(dates, claim_nums))
def fill_database(nat_unemployment_dict, mich_unemployment_dict, nat_totals_list, mich_totals_list, cur, conn):
#creates weeks table
#cur.execute('''CREATE TABLE IF NOT EXISTS weeks (id INTEGER PRIMARY KEY, week TEXT UNIQUE)''')
#creates table of unemployment rates
cur.execute('''CREATE TABLE IF NOT EXISTS unemployment_rates (i INTEGER PRIMARY KEY, week INTEGER, loc INTEGER, initial_nsa_claims INTEGER, total_claims INTEGER)''')
for x in range(len(nat_unemployment_dict.keys())):
weeks = list(nat_unemployment_dict.keys())
#creates variables to insert into table
week_key = weeks[x]
unemployment_claim_num = nat_unemployment_dict[week_key]
nat_total = nat_totals_list[x]
week_num = x + 1
loc = "MI"
#get location id
cur.execute('''SELECT id FROM locs where loc = (?)''', (loc,))
loc_id = cur.fetchone()[0]
#adds information into weeks table
cur.execute('''INSERT OR IGNORE INTO Weeks (id, week) VALUES (?, ?)''', (week_num, week_key))
#adds information into unemployment rates table
cur.execute('''INSERT OR IGNORE INTO unemployment_rates (i, week, loc, initial_nsa_claims, total_claims) VALUES (?, ?, ?, ?, ?)''', (x, week_num, loc_id, unemployment_claim_num, nat_total))
for x in range(len(mich_unemployment_dict.keys())):
weeks = list(mich_unemployment_dict.keys())
#creates variables to insert into table
week_key = weeks[x]
mich_unemployment_claim_num = mich_unemployment_dict[week_key]
mich_total = mich_totals_list[x]
week_num = x + 1
loc = "MI"
cur.execute('''SELECT id FROM locs where loc = (?)''', (loc,))
loc_id = cur.fetchone()[0]
#adds information into unemployment rates table
cur.execute('''INSERT OR IGNORE INTO unemployment_rates (i, week, loc, initial_nsa_claims, total_claims) VALUES (?, ?, ?, ?, ?)''', (x, week_num, loc_id, mich_unemployment_claim_num, mich_total))
print("Finished adding data")
conn.commit()
def main():
'''Calls functions to get data from the CDC, BTS, and Unemployment Bureau.
Function takes in no inputs and does not return anything'''
#PART 1: Collect and insert mobility data
t = datetime.datetime(2020, 3, 15)
nats = build_national_lst(t)
mi = build_state_lst(t)
create_transport_database(nats, mi)
#PART 2: Collect and insert COVID-19 deaths data
midata = get_data()[0]
usdata = get_data()[1]
create_db(midata,usdata)
#PART 3: Collect and insert unemployment claims data
cur, conn = setUpDatabase('full_data.db')
weekly_national_umemployment_claims_dict = make_national_dict()
weekly_mich_umemployment_claims_dict = make_mich_dict()
cumulative_national_claims_list = get_national_weekly_cumulative_total()
cumulative_mich_claims_list = get_mich_weekly_cumulative_total()
fill_database(weekly_national_umemployment_claims_dict, weekly_mich_umemployment_claims_dict, cumulative_national_claims_list, cumulative_mich_claims_list, cur, conn)
#Close connection
conn.close()
if __name__ == "__main__":
main()