#!python3
'''
This file takes a CSV file from National Student Clearinghouse
and converts it to a table of enrollment records suitable for importing
into an alumni database
'''
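# Example invocation (the NSC filename below is illustrative; the other paths
# fall back to the argparse defaults defined at the bottom of this file):
#
#   python import_nsc.py -nsc nsc_detail_report.csv -days 131 -date 08/30/2018
#
# Running without -nsc instead prompts for the filenames via
# tktools.get_filenames.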
from botutils.tkintertools import tktools
from botutils.ADB import AlumniDatabaseInterface as aDBi
from botutils.tabletools import tableclass as tc
from botutils.tabletools import tabletools as tt
from nsc_modules import nsc_names as n
from datetime import date
def check_degrees_table_complete(nsc_data, degree_dict):
    '''Checks that every degree title in the NSC data appears in degree_dict;
    if any are missing, writes missing_degrees.csv and exits.'''
nsc_degrees = set(nsc_data.get_column(n.DEGREE_TITLE)) - set([''])
missing_degrees = nsc_degrees - set(degree_dict)
if missing_degrees:
degree_lookup = { 'A': "Associate's",
'B': "Bachelor's",
'C': "Certificate",
'M': "Master's"}
output_table = []
for miss in missing_degrees:
if miss:
if miss[0] in degree_lookup:
output_table.append([miss, degree_lookup[miss[0]]])
else:
output_table.append([miss, '?'])
print('Some degrees from the NSC file are not included in your\n'+
'degree list (default filename degreelist.csv). Please\n'+
'inspect the file "missing_degrees.csv" and copy the values\n'+
'to the bottom of degreelist.csv before running again.')
tt.table_to_csv('missing_degrees.csv',output_table)
exit()
def check_college_table_complete(nsc_data, college_dict):
    '''Checks that every college code in the NSC data appears in college_dict;
    if any are missing, writes missing_colleges.csv and exits.'''
nsc_colleges = set(nsc_data.get_column(
n.COLLEGE_CODE_BRANCH)) - set([''])
    # The OPEIDs we're comparing follow the format '001650-01' in NSC
    # and 165001 in the IPEDS database. Also, if the two-digit branch ending
    # is not found, '00' is used to match the same school's main branch.
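    # For example, the NSC code '001650-01' splits into ['001650', '01'];
    # int('001650' + '01') gives 165001, and the main-branch fallback
    # int('001650' + '00') gives 165000.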
opeid_missing=[]
for entry in nsc_colleges:
parts = entry.split('-')
opeid1 = str(int(parts[0]+parts[1]))
opeid2 = str(int(parts[0]+'00'))
if opeid1 not in college_dict and opeid2 not in college_dict:
opeid_missing.append(entry)
if opeid_missing:
nsc_dict = nsc_data.create_dict_list(n.COLLEGE_CODE_BRANCH,
[n.COLLEGE_NAME, n.COLLEGE_STATE,
n.PUBLIC_PRIVATE])
output = [['OPEID', 'NCESID', 'Name', 'State', 'Control']]
for college in opeid_missing:
data = nsc_dict[college]
parts = college.split('-')
opeid1 = str(int(parts[0]+parts[1]))
row = [ opeid1, '?', data[0], data[1], data[2]]
output.append(row)
print('Some colleges from the NSC file are not included in your\n'+
'college list (default filename collegelist.csv). Please\n'+
'inspect the file "missing_colleges.csv" and use it to\n'+
'add new colleges to the bottom of collegelist.csv\n'+
'before running again.')
tt.table_to_csv('missing_colleges.csv',output)
exit()
def grab_inputs(nsc_fn, sch_fn, deg_fn):
'''Loads the main CSV inputs and checks to ensure the records in the
college and degree tables cover all those given in the NSC data. (If
not, it creates an output with the missing records and provides
an error message.)'''
nsc_data = tc.Table(nsc_fn)
college_list = tc.Table(sch_fn)
degree_list = tc.Table(deg_fn)
college_dict = college_list.create_dict_list('OPEID',
['NCESID', 'Name'])
degree_dict = degree_list.create_dict('UpdateDegree', 'DegreeType')
# These two functions will abort the script if not complete
check_degrees_table_complete(nsc_data, degree_dict)
check_college_table_complete(nsc_data, college_dict)
return (nsc_data, college_dict, degree_dict)
def add_columns_to_nsc_data(nsc_data, college_dict):
'''Uses data in standard columns to add a few more useful columns to
the main NSC data table'''
sId = []
hs_class = []
NCESid = []
colName = []
startDate = []
endDate = []
gradDate = []
grab_columns = [n.YOUR_UNIQUE_IDENTIFIER,
n.HIGH_SCHOOL_GRAD_DATE,
n.COLLEGE_CODE_BRANCH,
n.ENROLLMENT_BEGIN,
n.ENROLLMENT_END,
n.GRADUATION_DATE,
]
# The following will yield lists with only the above columns
for row in nsc_data.get_columns(grab_columns):
# NSC Unique Identifier is our ID with a '_' appended
sId.append(row[0][:-1])
        # HS Class is the year (YYYY) portion of the high school grad date
hs_class.append(row[1][0:4])
        # We can map the OPEID to our college database in two different ways
opeid_raw = row[2]
if opeid_raw:
parts = opeid_raw.split('-')
opeid1 = str(int(parts[0]+parts[1]))
opeid2 = str(int(parts[0]+'00'))
if opeid1 in college_dict: opeid = opeid1
else: opeid = opeid2
NCESid.append(college_dict[opeid][0])
colName.append(college_dict[opeid][1])
else:
NCESid.append('')
colName.append('')
# These 3 date fields have the form YYYYMMDD
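        # e.g. '20180515' -> date(2018, 5, 15); blank values stay blank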
startD_raw = row[3]
if startD_raw:
startDate.append(date(int(startD_raw[0:4]), int(startD_raw[4:6]),
int(startD_raw[6:])))
else: startDate.append('')
endD_raw = row[4]
if endD_raw:
endDate.append(date(int(endD_raw[0:4]), int(endD_raw[4:6]),
int(endD_raw[6:])))
else: endDate.append('')
gradD_raw = row[5]
if gradD_raw:
gradDate.append(date(int(gradD_raw[0:4]), int(gradD_raw[4:6]),
int(gradD_raw[6:])))
else: gradDate.append('')
# After looping through all raw records, add the results to main table
nsc_data.add_column('sId', sId)
nsc_data.add_column('HS Class', hs_class)
nsc_data.add_column('NCES ID', NCESid)
nsc_data.add_column('College Name', colName)
nsc_data.add_column('Start Date', startDate)
nsc_data.add_column('End Date', endDate)
nsc_data.add_column('Grad Date', gradDate)
def combine_s_c_enrollments(s_c_table, hd, daysgap):
    '''s_c_table is a list of lists containing all the enrollments for
    a single student at a single college (with columns defined by the header
    dictionary hd). This combines enrollments that occur within daysgap
    days of each other'''
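    # Illustrative example of the rule below: with daysgap=131, an enrollment
    # ending 2017-05-15 followed by one starting 2017-08-28 at the same college
    # is a 105-day gap, so the two rows collapse into one enrollment; a row
    # whose Grad Date is set is kept separate from a following non-graduated
    # row even when the dates are close.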
#First, move "Grad Date" into start and end date if blank
for row in s_c_table:
if not row[hd['Start Date']]: #if true, always true for "End Date" also
row[hd['Start Date']] = row[hd['Grad Date']]
row[hd['End Date']] = row[hd['Grad Date']]
    # Second, combine the enrollments that are contiguous, bringing forward
    # (as common) the following fields: GRADUATED, DEGREE_TITLE, MAJOR
if len(s_c_table) > 1:
s_c_table.sort(key=lambda x: x[hd['Start Date']])
# Now create a mapping of which records are the same
enrollments = []
new_enr = [0] # The current enrollment is primed with the first one
for i in range(1,len(s_c_table)):
last_end = s_c_table[new_enr[-1]][hd['End Date']]
this_start = s_c_table[i][hd['Start Date']]
days_diff = (this_start - last_end).days
if (days_diff > daysgap or #too many days apart
( s_c_table[i-1][hd['Grad Date']] and #continuing after
not s_c_table[i][hd['Grad Date']])): #graduating
# New enrollment starting with this one
enrollments.append(new_enr)
new_enr = [i]
else: # same enrollment
new_enr.append(i)
enrollments.append(new_enr)
# Take the mapping of like records and create single enrollment for each
new_s_c_table = []
for enrollment in enrollments:
record = s_c_table[enrollment[0]]
# columns to copy the max value into
for col in [hd['End Date'], hd[n.GRADUATED],
hd[n.DEGREE_TITLE], hd[n.MAJOR]]:
record[col] = max([s_c_table[enr][col] for enr in enrollment])
# columns to copy the final value into
for col in [hd[n.ENROLLMENT_STATUS], ]:
record[col] = s_c_table[enrollment[-1]][col]
new_s_c_table.append(record)
else: # the case if there is only a single entry for the student/college
new_s_c_table = s_c_table
return new_s_c_table
def combine_contiguous_enrollments(nsc_data, daysgap):
    '''Takes the main table and combines contiguous enrollments for the
    same college and student. Enrollments are judged to be contiguous
    if the number of days between them is less than daysgap. (End dates
    falling after effdate are blanked downstream for students still
    marked as Attending.)'''
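    # Note: the default daysgap of 131 days (set in the __main__ block below)
    # appears intended to span a summer break between spring and fall terms.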
# First get a list of students with matches
students_raw = nsc_data.get_columns(['sId', n.RECORD_FOUND_Y_N])
students = {student[0] for student in students_raw if student[1]=='Y'}
hd = nsc_data.get_header_dict() #So we can reference the elements
results_table = []
# Now process the records for each student
print('Beginning to process %d students' % len(students))
sCount = 0 # For screen display
for sId in students:
sCount += 1
student_table = list(nsc_data.get_match_rows('sId',sId))
colleges = {s[hd['NCES ID']] for s in student_table}
for col in colleges:
s_c_table = [row for row in student_table
if row[hd['NCES ID']] == col
                         and row[hd[n.RECORD_FOUND_Y_N]] != 'N']
#Now need to process each college in student_table
s_c_condensed = combine_s_c_enrollments(s_c_table, hd,
daysgap)
results_table.extend(s_c_condensed)
#if not sCount % 10: print('.', end='', flush=True)
if not sCount %100: print('%d contacts processed.' % sCount, flush=True)
tt.add_header(results_table, hd)
#Also need to handle records with no date
return tc.Table(results_table)
def add_status_fields(combined_nsc, degree_dict, daysgap, effdate):
'''Adds a few interpreted status fields to the combined records:
Status = Attending, Graduated, Transferred Out, Withdrew
DegreeType = Bachelor's, Associate's, Certificate, Associate's or
Certificate (TBD), Master's
    DataSource = NSC (the only option)
LastVerified = Date provided by effdate
(Note that combined_nsc is changed in place)
'''
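    # Summary of the first pass below: GRADUATED == 'Y' -> 'Graduated';
    # otherwise a final enrollment status of 'W', or an End Date more than
    # daysgap days before effdate, -> 'T/W' (resolved to 'Transferred out' or
    # 'Withdrew' in the second pass); everything else -> 'Attending'.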
# Will pass through every record to compute this into columns for each
hd = combined_nsc.get_header_dict()
status = []
degree_type = []
data_source = []
last_verified = []
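    # effdate arrives as MM/DD/YYYY, e.g. the default '08/30/2018' -> date(2018, 8, 30)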
dlv = date(int(effdate[-4:]), int(effdate[0:2]), int(effdate[3:5]))
for row in combined_nsc.rows():
last_verified.append(dlv) # Always the same for every record
data_source.append('NSC') # Always the same for every record
# Status will be based off the end date and grad/withdraw indicators
date_gap = (dlv - row[hd['End Date']]).days
end_status = row[hd[n.ENROLLMENT_STATUS]]
if row[hd[n.GRADUATED]] == 'Y':
status.append('Graduated')
elif end_status == 'W' or date_gap > daysgap:
status.append('T/W')
else:
status.append('Attending')
# Degree type is found from "graduated" degree titles or guessed from
# type of college
degree_title = row[hd[n.DEGREE_TITLE]]
if degree_title:
degree_type.append(degree_dict[degree_title])
else:
if row[hd[n.TWO_YEAR_4_YEAR]][0] == '4':
degree_type.append("Bachelor's")
else:
degree_type.append("Associate's or Certificate (TBD)")
# Add new columns to table
combined_nsc.add_column('Status', status)
combined_nsc.add_column('Degree Type', degree_type)
combined_nsc.add_column('Data Source', data_source)
combined_nsc.add_column('Last Verified', last_verified)
hd = combined_nsc.get_header_dict()
print('Now cleaning up T/W statuses.')
# Pass through and determine if T/W fields are T or W
students = set(combined_nsc.get_column('sId'))
for student in students:
s_rows = list(combined_nsc.get_match_rows('sId', student))
if len(s_rows) > 1:
s_rows.sort(key=lambda x: x[hd['Start Date']])
big_s = date(1900, 1,1) # big_s and big_e are for checking if there's
big_e = date(1901, 1,1) # a large enrollment that encompasses small ones
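            # Illustrative case: a multi-year enrollment at college A (the
            # current "big" window) with a short summer enrollment at college B
            # nested inside it; the B row ends before big_e, so a 'T/W' there
            # resolves to 'Transferred out' rather than 'Withdrew'.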
for i in range(len(s_rows)):
sDate = s_rows[i][hd['Start Date']]
eDate = s_rows[i][hd['End Date']]
if sDate > big_e: # this enrollment occurs after the previous
big_s = sDate # "big" enrollment
big_e = eDate
if s_rows[i][hd['Status']] == 'T/W':
if i+1 == len(s_rows): # final row for this student
if eDate >= big_e: # not a summer/catchup class
s_rows[i][hd['Status']] = 'Withdrew'
else: # probably a summer/catchup class
s_rows[i][hd['Status']] = 'Transferred out'
else:
date_gap = (s_rows[i+1][hd['Start Date']] -
s_rows[i][hd['End Date']]).days
if date_gap > daysgap:
if eDate >= big_e: # not a summer/catchup class
s_rows[i][hd['Status']] = 'Withdrew'
else:
s_rows[i][hd['Status']] = 'Transferred out'
else:
s_rows[i][hd['Status']] = 'Transferred out'
def import_ids_from_salesforce():
'''Interfaces with the Salesforce database to grab the translation from
NCES ID->SF ID for colleges and Student Number->SF ID for alumni and
    returns the translations as a tuple of two dictionaries'''
alumni_table, college_table = aDBi.grabAccountContactTranslations()
alumni_dict = tt.create_dict(alumni_table[1:], 0, 1)
college_dict = tt.create_dict(college_table[1:], 0, 1)
return (alumni_dict, college_dict)
def add_Salesforce_indices(combined_nsc):
''' Adds the Salesforce names for alumni and colleges using the student
id "YOUR UNIQUE IDENTIFIER" from NSC and using the NCES code for college.
    Will use "N/A" for any alumnus/alumna or college not found in the lookup'''
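    # When the CSV route is chosen below, each file is read as a two-column
    # lookup with a header row (key in column 0, Salesforce ID in column 1),
    # e.g. for the alumni file (illustrative values):
    #   Your ID,SF ID
    #   12345,003xx0000012345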
# First check whether the user would prefer to add from Salesforce API
# calls directly or via file upload
choices = ['Grab directly from Salesforce',
'Import as CSV files (see README for details)']
response = tktools.get_buttons_answer(choices,
'How would you like to get the SF IDs for alumni and colleges?')
# Once the user responds, generate a dictionary lookup for both fields
if response == choices[0]: # user selected 'Grab directly from Salesforce'
alumni_dict, college_dict = import_ids_from_salesforce()
else: # user wants to enter the info as a set of CSV files
filenames = {
'alum':['alumniNames.csv',
'Alumni names file (two columns: Your ID/SF ID)', 'r'],
'coll':['collegeNames.csv',
'College names file (two columns: NCES ID/ SF ID)', 'r']
}
tktools.get_filenames(filenames)
alumni_table = tt.grab_csv_table(filenames['alum'][0])
alumni_dict = tt.create_dict(alumni_table[1:], 0, 1)
college_table = tt.grab_csv_table(filenames['coll'][0])
college_dict = tt.create_dict(college_table[1:], 0, 1)
# Now that we have a dictionary for both, generate the two new columns
college_ids =[]
alumni_ids =[]
for row in combined_nsc.rows():
        try:
            college_ids.append(college_dict[row[combined_nsc.c('NCES ID')]])
        except KeyError:
            college_ids.append('N/A')
        try:
            alumni_ids.append(alumni_dict[row[combined_nsc.c('sId')]])
        except KeyError:
            alumni_ids.append('N/A')
# Finally, add the new columns back to the master table
combined_nsc.add_column('College__c', college_ids)
combined_nsc.add_column('Student__c', alumni_ids)
def finalize_output_columns(combined_nsc):
    '''Takes the result of all the prior functions and reorders/renames
    columns into a slightly more user-friendly, Salesforce-ready set'''
output_col = [ 'Student ID',
'Last Name',
'First Name',
'HS Class',
'College NCES ID',
'College Name',
'Student__c',
'College__c',
'Start_Date__c',
'End_Date__c',
'Status__c',
'Degree_Type__c',
'Data_Source__c',
'Date_Last_Verified__c',
'Degree_Text__c',
'Major_Text__c',
]
source_col = [ 'sId',
n.LAST_NAME,
n.FIRST_NAME,
'HS Class',
'NCES ID',
'College Name',
'Student__c',
'College__c',
'Start Date',
'End Date',
'Status',
'Degree Type',
'Data Source',
'Last Verified',
n.DEGREE_TITLE,
n.MAJOR,
]
new_table = combined_nsc.new_subtable(source_col, output_col)
new_table.apply_func('Last Name', str.title)
new_table.apply_func('First Name', str.title)
for row in new_table.rows():
if row[new_table.c('Status__c')] == 'Attending':
row[new_table.c('End_Date__c')] = ''
return new_table
def main(nsc, sch, deg, out, daysgap, effdate):
'''Main control flow for processing NSC files'''
print('-'*40)
print('NSC file is %s' % nsc)
print('Schools file is %s' % sch)
print('Degrees file is %s' % deg)
print('Output file will be %s' % out)
print('Allowed days gap is %d' % int(daysgap))
print('NSC report produced on %s' % effdate)
print('-'*40)
# Load input files (aborts with error if college/degree incomplete)
nsc_data, college_dict, degree_dict = grab_inputs(nsc, sch, deg)
# Add a few interpreted columns to the nsc table
print('Beginning to process NSC file.')
add_columns_to_nsc_data(nsc_data, college_dict)
# Combine records that reflect contiguous enrollments
print('Beginning to combine contiguous records.')
combined_nsc = combine_contiguous_enrollments(nsc_data, daysgap)
print('-'*40)
# Add Status, DegreeType, Date Verified, DataSource fields
print('Enrollments combined, now adding status fields.')
# had previously forced daysgap=40 in the below; not sure why?
add_status_fields(combined_nsc, degree_dict, daysgap, effdate)
# Add in Salesforce names for students and for colleges
# ToDo: Pass a "silent" variable to this function based on a to-be-added
    # command line option to suppress the query for SF/files
print('-'*40)
print('Adding Salesforce IDs for alumni and colleges')
add_Salesforce_indices(combined_nsc)
# Throw out the columns we don't need anymore and reorder others
print('-'*40)
print('Creating final output and saving to %s.' % out)
final_output_table = finalize_output_columns(combined_nsc)
#Finally, save the output file
final_output_table.to_csv(out)
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser(description='Process new NSC detail file')
nsc_help='Detailed CSV file from Clearinghouse'
parser.add_argument('-nsc', dest='nsc', action='store', help=nsc_help)
sch_help='Colleges table (provided with this file)'
parser.add_argument('-sch', dest='sch', action='store', help=sch_help,
default='nsc_inputs/collegelist.csv')
deg_help='Degrees table (provided with this file)'
parser.add_argument('-deg', dest='deg', action='store', help=deg_help,
default='nsc_inputs/degreelist.csv')
out_help='Output filename (csv)'
parser.add_argument('-out', dest='out', action='store', help=out_help,
default='import_nsc_output.csv')
days_help='Number of days allowed between enrollments'
parser.add_argument('-days', dest='daysgap', type=int, help=days_help)
effdate_help='Date to use for NSC file run (MM/DD/YYYY)'
parser.add_argument('-date', dest='effdate', help=effdate_help)
args = parser.parse_args()
# The following dict format is expected by tktools.get_filenames
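    # Each entry maps a key to [current value, prompt text, file mode], where
    # mode 'r' marks an input file and 'w' the output file.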
arg_dict = { 'nsc': [args.nsc, nsc_help, 'r'],
'sch': [args.sch, sch_help, 'r'],
'deg': [args.deg, deg_help, 'r'],
'out': [args.out, out_help, 'w'],
}
if args.nsc is None: #Assume the user needs to be queried with a window
tktools.get_filenames(arg_dict)
if args.daysgap is None:
args.daysgap=131
if args.effdate is None:
#args.effdate='08/10/2016'
#args.effdate='11/18/2016'
args.effdate='08/30/2018'
main( arg_dict['nsc'][0],
arg_dict['sch'][0],
arg_dict['deg'][0],
arg_dict['out'][0],
args.daysgap,
args.effdate
)
s = input('----(hit enter to close)----')