From ef2b72aa9b2580c8cffda9853eb0e41ccfa64fbb Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Thu, 3 May 2018 13:43:02 -0700 Subject: [PATCH 01/76] DCF First Pass --- accounts/urls.py | 2 ++ accounts/views.py | 81 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/accounts/urls.py b/accounts/urls.py index 18c17327..3bda2cf6 100755 --- a/accounts/urls.py +++ b/accounts/urls.py @@ -28,6 +28,8 @@ url(r'^logout', views.extended_logout_view, name='account_logout'), url(r'^login/$', google_views.oauth2_login, name='account_login'), # url(r'^nih_login/$', views.nih_login, name='nih_login'), + url(r'^dcf/login/callback/$', views.oauth2_callback, name='dcf_callback'), + url(r'^dcf_login/$', views.oauth2_login, name='dcf_login'), url(r'^unlink_accounts/', views.unlink_accounts, name='unlink_accounts'), # Google Cloud Project related diff --git a/accounts/views.py b/accounts/views.py index ac3bbfcb..c867432d 100755 --- a/accounts/views.py +++ b/accounts/views.py @@ -40,6 +40,19 @@ unregister_all_gcp_sa, unregister_sa_with_id, service_account_dict, \ do_nih_unlink, deactivate_nih_add_to_open + +import requests +import random +import string + +from django.http import HttpResponseRedirect +from django.contrib.auth.decorators import login_required +from allauth.socialaccount.models import SocialApp, SocialToken + +from oauthlib.oauth2 import WebApplicationClient + + + import json logger = logging.getLogger('main_logger') @@ -50,6 +63,10 @@ GOOGLE_ORG_WHITELIST_PATH = settings.GOOGLE_ORG_WHITELIST_PATH MANAGED_SERVICE_ACCOUNTS_PATH = settings.MANAGED_SERVICE_ACCOUNTS_PATH +DCF_AUTH_URL = settings.DCF_AUTH_URL +DCF_TOKEN_URL = settings.DCF_TOKEN_URL +DCF_USER_URL = settings.DCF_USER_URL + @login_required def extended_logout_view(request): response = None @@ -727,3 +744,67 @@ def get_user_datasets(request,user_id): status='500' return JsonResponse(result, status=status) + +@login_required +def oauth2_login(request): + callback_url = reverse('dcf_callback') + print 'li request user: {}'.format(str(request.user), callback_url) + + social_account = SocialApp.objects.get(provider='dcf') + + rando = ''.join(random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(10)) + + wac = WebApplicationClient(social_account.client_id) + + ruri = wac.prepare_request_uri(DCF_AUTH_URL, + redirect_uri='http://localhost:8100/accounts/dcf/login/callback/', + state=rando, scope=['openid', 'user']) + return HttpResponseRedirect(ruri) + +@login_required +def oauth2_callback(request): + callback_url = reverse('dcf_callback') + print 'cb request user: {} : {}'.format(str(request.user.id), callback_url) + data = { + 'redirect_uri': 'http://localhost:8100/accounts/dcf/login/callback/', + 'grant_type': 'authorization_code', + 'code': request.GET['code']} + + social_app = SocialApp.objects.get(provider='dcf') + + auth = requests.auth.HTTPBasicAuth(social_app.client_id, social_app.secret) + + resp = requests.request('POST', DCF_TOKEN_URL, data=data, auth=auth) + token_data = json.loads(resp.text) + expiration_time = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta(seconds=token_data["expires_in"])) + print expiration_time + + print "AT {} : {} : {} : {}".format(resp.status_code, resp.request, resp.text, str(token_data)); + + headers = {'Authorization': 'Bearer {}'.format(token_data['access_token'])} + resp = requests.get(DCF_USER_URL, headers=headers) + user_data = json.loads(resp.text) + + # Note we also get back an "id_token" which is an encrypted 
JWT. + # Note we also get back a "token_type" which had better be "Bearer". + + print "AT {} : {} : {}".format(resp.status_code, resp.request, resp.text); + + # Note the logic here. If a different User shows up and logs in to get back the same uid from dcf that + # another User had associated, the uid now becomes linked to the new user: + social_account, created = SocialAccount.objects.update_or_create(uid = user_data['user_id'], + provider = 'dcf', + defaults={ + 'user_id' : request.user.id, + 'extra_data': resp.text.rstrip() + }) + + " seeing: Duplicate entry '2-2' for key 'socialaccount_socialtoken_app_id_account_id_fca4e0ac_uniq'" + social_token, created = SocialToken.objects.update_or_create(account_id=social_account.id, + app_id=social_app.id, + defaults= { + 'token' : token_data['access_token'], + 'token_secret' : token_data['refresh_token'], + 'expires_at' : expiration_time + }) + return redirect('dashboard') \ No newline at end of file From 969a4a8a439bd216bfa67d5dcb05c249a4794657 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Fri, 4 May 2018 10:43:12 -0700 Subject: [PATCH 02/76] DCF Second Pass --- accounts/views.py | 63 +++++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/accounts/views.py b/accounts/views.py index c867432d..d716bad3 100755 --- a/accounts/views.py +++ b/accounts/views.py @@ -50,6 +50,9 @@ from allauth.socialaccount.models import SocialApp, SocialToken from oauthlib.oauth2 import WebApplicationClient +from requests_oauthlib.oauth2_session import OAuth2Session +import os +os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1' @@ -745,50 +748,68 @@ def get_user_datasets(request,user_id): return JsonResponse(result, status=status) +hackie_state = None + @login_required def oauth2_login(request): - callback_url = reverse('dcf_callback') - print 'li request user: {}'.format(str(request.user), callback_url) + full_callback = request.build_absolute_uri(reverse('dcf_callback')) social_account = SocialApp.objects.get(provider='dcf') rando = ''.join(random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(10)) - wac = WebApplicationClient(social_account.client_id) + # Should provide a state string that gets stashed in sessions! + oauth = OAuth2Session(social_account.client_id, redirect_uri=full_callback, scope=['openid', 'user']) + authorization_url, state = oauth.authorization_url(DCF_AUTH_URL) + # stash the state string in the session! 
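+    # A sketch of the session-based approach (which a later pass below adopts):
+    #     request.session['dcfOAuth2State'] = state
+    # ...then compare it with the 'state' query parameter handed back to the callback. For now: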
+ hackie_state = state + return HttpResponseRedirect(authorization_url) - ruri = wac.prepare_request_uri(DCF_AUTH_URL, - redirect_uri='http://localhost:8100/accounts/dcf/login/callback/', - state=rando, scope=['openid', 'user']) - return HttpResponseRedirect(ruri) + # wac = WebApplicationClient(social_account.client_id) + # + # ruri = wac.prepare_request_uri(DCF_AUTH_URL, + # redirect_uri=full_callback, + # state=rando, scope=['openid', 'user']) + # return HttpResponseRedirect(ruri) @login_required def oauth2_callback(request): - callback_url = reverse('dcf_callback') - print 'cb request user: {} : {}'.format(str(request.user.id), callback_url) + full_callback = request.build_absolute_uri(reverse('dcf_callback')) + print 'cb request user: {} : {}'.format(str(request.user.id), full_callback) data = { - 'redirect_uri': 'http://localhost:8100/accounts/dcf/login/callback/', + 'redirect_uri': full_callback, 'grant_type': 'authorization_code', 'code': request.GET['code']} social_app = SocialApp.objects.get(provider='dcf') - auth = requests.auth.HTTPBasicAuth(social_app.client_id, social_app.secret) + # You MUST provide the callback here to get it into the fetch request + dcf = OAuth2Session(social_app.client_id, state=hackie_state, redirect_uri=full_callback) + + # You MUST provide the client_id here in order to get this to do basic auth! Plus we need to provide + # the authorization_response argument intead of a parsed-out code argument: + token_data = dcf.fetch_token(DCF_TOKEN_URL, client_secret=social_app.secret, client_id=social_app.client_id, + authorization_response=request.get_full_path()) + +# auth = requests.auth.HTTPBasicAuth(social_app.client_id, social_app.secret) - resp = requests.request('POST', DCF_TOKEN_URL, data=data, auth=auth) - token_data = json.loads(resp.text) + # resp = requests.request('POST', DCF_TOKEN_URL, data=data, auth=auth) + # token_data = json.loads(resp.text) expiration_time = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta(seconds=token_data["expires_in"])) - print expiration_time + # print expiration_time - print "AT {} : {} : {} : {}".format(resp.status_code, resp.request, resp.text, str(token_data)); + # print "AT {} : {} : {} : {}".format(resp.status_code, resp.request, resp.text, str(token_data)); - headers = {'Authorization': 'Bearer {}'.format(token_data['access_token'])} - resp = requests.get(DCF_USER_URL, headers=headers) + # headers = {'Authorization': 'Bearer {}'.format(token_data['access_token'])} + # resp = requests.get(DCF_USER_URL, headers=headers) + + resp = dcf.get(DCF_USER_URL) user_data = json.loads(resp.text) # Note we also get back an "id_token" which is an encrypted JWT. # Note we also get back a "token_type" which had better be "Bearer". - print "AT {} : {} : {}".format(resp.status_code, resp.request, resp.text); + print "AT {} : {} : {}".format(resp.status_code, resp.request, " ".join(resp.text.split())) # Note the logic here. If a different User shows up and logs in to get back the same uid from dcf that # another User had associated, the uid now becomes linked to the new user: @@ -796,10 +817,10 @@ def oauth2_callback(request): provider = 'dcf', defaults={ 'user_id' : request.user.id, - 'extra_data': resp.text.rstrip() + 'extra_data': json.loads(" ".join(resp.text.split())) }) - - " seeing: Duplicate entry '2-2' for key 'socialaccount_socialtoken_app_id_account_id_fca4e0ac_uniq'" + # Note how update_or_create works. You match on the named parameter values, and update with the values in the defaults. 
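+    # For example, update_or_create(account_id=1, app_id=2, defaults={'token': 'abc'}) filters on
+    # (account_id=1, app_id=2) and, when a row matches, only the fields listed in defaults are rewritten.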
+ # If it doesn't exist, the named parameter values are used to create the row. social_token, created = SocialToken.objects.update_or_create(account_id=social_account.id, app_id=social_app.id, defaults= { From b3bbccd1c322b3d08fc202dd80c4ee1b823d8f46 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Wed, 9 May 2018 09:57:46 -0700 Subject: [PATCH 03/76] More stuff... --- accounts/migrations/0016_dcftoken.py | 27 ++ accounts/models.py | 8 + accounts/sa_utils.py | 424 +++++++++++++++------------ accounts/urls.py | 2 + accounts/views.py | 345 ++++++++++++++++++---- 5 files changed, 555 insertions(+), 251 deletions(-) create mode 100644 accounts/migrations/0016_dcftoken.py diff --git a/accounts/migrations/0016_dcftoken.py b/accounts/migrations/0016_dcftoken.py new file mode 100644 index 00000000..aeeb2506 --- /dev/null +++ b/accounts/migrations/0016_dcftoken.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.10 on 2018-05-05 01:18 +from __future__ import unicode_literals + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('accounts', '0015_googleproject_active'), + ] + + operations = [ + migrations.CreateModel( + name='DCFToken', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('dcf_user', models.CharField(max_length=128)), + ('access_token', models.TextField()), + ('refresh_token', models.TextField()), + ('expires_at', models.DateTimeField()), + ('nih_user', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to='accounts.NIH_User')), + ], + ), + ] diff --git a/accounts/models.py b/accounts/models.py index 3fe548e1..d4484b98 100755 --- a/accounts/models.py +++ b/accounts/models.py @@ -157,3 +157,11 @@ class ServiceAccountAuthorizedDatasets(models.Model): service_account = models.ForeignKey(ServiceAccount, null=False) authorized_dataset = models.ForeignKey(AuthorizedDataset, null=False) authorized_date = models.DateTimeField(auto_now=True) + + +class DCFToken(models.Model): + nih_user = models.OneToOneField(NIH_User, null=False) + dcf_user = models.CharField(max_length=128, null=False) + access_token = models.TextField(null=False) + refresh_token = models.TextField(null=False) + expires_at = models.DateTimeField(null=False) diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index 06b64021..a017c916 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -820,6 +820,64 @@ def __str__(self): def __repr_(self): return self.__str__() + +def found_linking_problems(NIH_username, user_id, user_email, my_st_logger, results): + # 1. check if this google identity is currently linked to other NIH usernames + # note: the NIH username exclusion is case-insensitive so this will not return a false positive + # e.g. if this google identity is linked to 'NIHUSERNAME1' but just authenticated with 'nihusername1', + # it will still pass this test + nih_usernames_already_linked_to_this_google_identity = NIH_User.objects.filter( + user_id=user_id, linked=True).exclude(NIH_username__iexact=NIH_username) + for nih_user in nih_usernames_already_linked_to_this_google_identity: + if nih_user.NIH_username.lower() != NIH_username.lower(): + logger.warn( + "User {} is already linked to the eRA commons identity {} and attempted authentication" + " with the eRA commons identity {}." 
+ .format(user_email, nih_user.NIH_username, NIH_username)) + my_st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, "[STATUS] {}".format( + "User {} is already linked to the eRA commons identity {} and attempted authentication" + " with the eRA commons identity {}." + .format(user_email, nih_user.NIH_username, NIH_username))) + + results.messages.append("User {} is already linked to the eRA commons identity {}. " + "Please unlink these before authenticating with the eRA commons " + "identity {}.".format(user_email, nih_user.NIH_username, + NIH_username)) + return True + + # 2. check if there are other google identities that are still linked to this NIH_username + # note: the NIH username match is case-insensitive so this will not return a false negative. + # e.g. if a different google identity is linked to 'NIHUSERNAME1' and this google identity just authenticated with 'nihusername1', + # this will fail the test and return to the /users/ url with a warning message + preexisting_nih_users = NIH_User.objects.filter( + NIH_username__iexact=NIH_username, linked=True).exclude(user_id=user_id) + + if len(preexisting_nih_users) > 0: + preexisting_nih_user_user_ids = [preexisting_nih_user.user_id for preexisting_nih_user in + preexisting_nih_users] + prelinked_user_email_list = [user.email for user in + User.objects.filter(id__in=preexisting_nih_user_user_ids)] + prelinked_user_emails = ', '.join(prelinked_user_email_list) + + logger.warn( + "User {} tried to log into the NIH account {} that is already linked to user(s) {}".format( + user_email, + NIH_username, + prelinked_user_emails + '.' + )) + my_st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, + "User {} tried to log into the NIH account {} that is already linked to user(s) {}".format( + user_email, + NIH_username, + prelinked_user_emails + '.' + )) + + results.messages.append( + "You tried to link your email address to NIH account {}, but it is already linked to {}.".format( + NIH_username, prelinked_user_emails)) + return True + return False + def demo_process_success(auth, user_id, saml_response): retval = DemoLoginResults() st_logger = StackDriverLogger.build_from_django_settings() @@ -857,58 +915,7 @@ def demo_process_success(auth, user_id, saml_response): user_email = User.objects.get(id=user_id).email - # 1. check if this google identity is currently linked to other NIH usernames - # note: the NIH username exclusion is case-insensitive so this will not return a false positive - # e.g. if this google identity is linked to 'NIHUSERNAME1' but just authenticated with 'nihusername1', - # it will still pass this test - nih_usernames_already_linked_to_this_google_identity = NIH_User.objects.filter( - user_id=user_id, linked=True).exclude(NIH_username__iexact=NIH_username) - for nih_user in nih_usernames_already_linked_to_this_google_identity: - if nih_user.NIH_username.lower() != NIH_username.lower(): - logger.warn( - "User {} is already linked to the eRA commons identity {} and attempted authentication" - " with the eRA commons identity {}." - .format(user_email, nih_user.NIH_username, NIH_username)) - st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, "[STATUS] {}".format( - "User {} is already linked to the eRA commons identity {} and attempted authentication" - " with the eRA commons identity {}." - .format(user_email, nih_user.NIH_username, NIH_username))) - - retval.messages.append("User {} is already linked to the eRA commons identity {}. 
" - "Please unlink these before authenticating with the eRA commons " - "identity {}.".format(user_email, nih_user.NIH_username, - NIH_username)) - return retval - - # 2. check if there are other google identities that are still linked to this NIH_username - # note: the NIH username match is case-insensitive so this will not return a false negative. - # e.g. if a different google identity is linked to 'NIHUSERNAME1' and this google identity just authenticated with 'nihusername1', - # this will fail the test and return to the /users/ url with a warning message - preexisting_nih_users = NIH_User.objects.filter( - NIH_username__iexact=NIH_username, linked=True).exclude(user_id=user_id) - - if len(preexisting_nih_users) > 0: - preexisting_nih_user_user_ids = [preexisting_nih_user.user_id for preexisting_nih_user in - preexisting_nih_users] - prelinked_user_email_list = [user.email for user in - User.objects.filter(id__in=preexisting_nih_user_user_ids)] - prelinked_user_emails = ', '.join(prelinked_user_email_list) - - logger.warn( - "User {} tried to log into the NIH account {} that is already linked to user(s) {}".format( - user_email, - NIH_username, - prelinked_user_emails + '.' - )) - st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, - "User {} tried to log into the NIH account {} that is already linked to user(s) {}".format( - user_email, - NIH_username, - prelinked_user_emails + '.' - )) - - retval.messages.append("You tried to link your email address to NIH account {}, but it is already linked to {}.".format( - NIH_username, prelinked_user_emails)) + if found_linking_problems(NIH_username, user_id, user_email, st_logger, retval): return retval except Exception as e: @@ -916,49 +923,14 @@ def demo_process_success(auth, user_id, saml_response): "[ERROR] Exception while finding user email: {}".format(str(e))) logger.exception(e) + # This stuff used to live sprinkled into the Django update code that is now in + # handle_user_db_entry. But it is not useful for us with DCF, so break it out, but + # handle exception as before: + no_exception = True try: - st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, "[STATUS] Updating Django model") - authorized_datasets = das.get_datasets_for_era_login(NIH_username) - - #saml_response = None if 'SAMLResponse' not in req['post_data'] else req['post_data']['SAMLResponse'] - saml_response = saml_response.replace('\r\n', '') - - # AppEngine Flex appears to return a datetime.datetime.now() of the server's local timezone, and not - # UTC as on AppEngine Standard; use utcnow() to ensure UTC. 
- NIH_assertion_expiration = datetime.datetime.utcnow() + datetime.timedelta( - seconds=login_expiration_seconds) - - updated_values = { - 'NIH_assertion': saml_response, - 'NIH_assertion_expiration': pytz.utc.localize(NIH_assertion_expiration), - 'user_id': user_id, - 'active': 1, - 'linked': True - } - - nih_user, created = NIH_User.objects.update_or_create(NIH_username=NIH_username, - user_id=user_id, - defaults=updated_values) - - logger.info("[STATUS] NIH_User.objects.update_or_create() returned nih_user: {} and created: {}".format( - str(nih_user.NIH_username), str(created))) - st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, - "[STATUS] NIH_User.objects.update_or_create() returned nih_user: {} and created: {}".format( - str(nih_user.NIH_username), str(created))) - st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, - "[STATUS] NIH_User {} associated with email {} and logged in with assertion: {}".format( - str(nih_user.NIH_username), str(user_email), str(saml_response))) - # add or remove user from ACL_GOOGLE_GROUP if they are or are not dbGaP authorized directory_client, http_auth = get_directory_resource() - # default warn message is for eRA Commons users who are not dbGaP authorized - warn_message = ''' -

WARNING NOTICE

-

You are accessing a US Government web site which may contain information that must be protected under the US Privacy Act or other sensitive information and is intended for Government authorized use only.

-

Unauthorized attempts to upload information, change information, or use of this web site may result in disciplinary action, civil, and/or criminal penalties. Unauthorized users of this website should have no expectation of privacy regarding any communications or data processed by this website.

-

Anyone accessing this website expressly consents to monitoring of their actions and all communications or data transiting or stored on related to this website and is advised that if such monitoring reveals possible evidence of criminal activity, NIH may provide that evidence to law enforcement officials.

- ''' except Exception as e: st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, @@ -966,106 +938,25 @@ def demo_process_success(auth, user_id, saml_response): logger.error("[ERROR] Exception while finding user email: ") logger.exception(e) warn_message = "" + no_exception = False - if len(authorized_datasets) > 0: - # if user has access to one or more datasets, warn message is different - warn_message += '

You are reminded that when accessing controlled information you are bound by the dbGaP DATA USE CERTIFICATION AGREEMENT (DUCA) for each dataset.

' + if no_exception: + #saml_response = None if 'SAMLResponse' not in req['post_data'] else req['post_data']['SAMLResponse'] + saml_response = saml_response.replace('\r\n', '') + num_auth_datasets = len(authorized_datasets) + # AppEngine Flex appears to return a datetime.datetime.now() of the server's local timezone, and not + # UTC as on AppEngine Standard; use utcnow() to ensure UTC. + NIH_assertion_expiration = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( + seconds=login_expiration_seconds)) + + nih_user, warn_message = handle_user_db_entry(user_id, NIH_username, user_email, saml_response, + num_auth_datasets, NIH_assertion_expiration, st_logger) all_datasets = das.get_all_datasets_and_google_groups() for dataset in all_datasets: - ad = None - try: - ad = AuthorizedDataset.objects.get(whitelist_id=dataset.dataset_id, - acl_google_group=dataset.google_group_name) - except (ObjectDoesNotExist, MultipleObjectsReturned) as e: - logger.error(( - "[ERROR] " + ( - "More than one dataset " if type(e) is MultipleObjectsReturned else "No dataset ") + - "found for this ID and Google Group Name in the database: %s, %s") % ( - dataset.dataset_id, dataset.google_group_name) - ) - continue - - uad = UserAuthorizedDatasets.objects.filter(nih_user=nih_user, authorized_dataset=ad) - dataset_in_auth_set = next((ds for ds in authorized_datasets if ( - ds.dataset_id == dataset.dataset_id and ds.google_group_name == dataset.google_group_name)), None) - - logger.debug("[STATUS] UserAuthorizedDatasets for {}: {}".format(nih_user.NIH_username, str(uad))) - - try: - result = directory_client.members().get(groupKey=dataset.google_group_name, - memberKey=user_email).execute(http=http_auth) - - # If we found them in the ACL but they're not currently authorized for it, remove them from it and the table - if len(result) and not dataset_in_auth_set: - directory_client.members().delete(groupKey=dataset.google_group_name, - memberKey=user_email).execute(http=http_auth) - logger.warn( - "User {} was deleted from group {} because they don't have dbGaP authorization.".format( - user_email, dataset.google_group_name - ) - ) - st_logger.write_text_log_entry( - LOG_NAME_ERA_LOGIN_VIEW, - "[WARN] User {} was deleted from group {} because they don't have dbGaP authorization.".format( - user_email, dataset.google_group_name - ) - ) - - if len(uad) and not dataset_in_auth_set: - uad.delete() - # Sometimes an account is in the Google Group but not the database - add them if they should - # have access - elif not len(uad) and len(result) and dataset_in_auth_set: - logger.info( - "User {} was was found in group {} but not the database--adding them.".format( - user_email, dataset.google_group_name - ) - ) - st_logger.write_text_log_entry( - LOG_NAME_ERA_LOGIN_VIEW, - "[WARN] User {} was was found in group {} but not the database--adding them.".format( - user_email, dataset.google_group_name - ) - ) - uad, created = UserAuthorizedDatasets.objects.update_or_create(nih_user=nih_user, - authorized_dataset=ad) - if not created: - logger.warn("[WARNING] Unable to create entry for user {} and dataset {}.".format(user_email, - ad.whitelist_id)) - else: - logger.info("[STATUS] Added user {} to dataset {}.".format(user_email, ad.whitelist_id)) - - # if the user_email doesn't exist in the google group an HttpError will be thrown... 
- except HttpError: - # Check for their need to be in the ACL, and add them - if dataset_in_auth_set: - body = { - "email": user_email, - "role": "MEMBER" - } - - result = directory_client.members().insert( - groupKey=dataset.google_group_name, - body=body - ).execute(http=http_auth) - - # Then add then to the database as well - if not len(uad): - uad, created = UserAuthorizedDatasets.objects.update_or_create(nih_user=nih_user, - authorized_dataset=ad) - if not created: - logger.warn("[WARNING] Unable to create entry for user {} and dataset {}.".format(user_email, - ad.whitelist_id)) - else: - logger.info("[STATUS] Added user {} to dataset {}.".format(user_email, ad.whitelist_id)) - - logger.info(result) - logger.info("User {} added to {}.".format(user_email, dataset.google_group_name)) - st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, - "[STATUS] User {} added to {}.".format(user_email, - dataset.google_group_name)) + handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, True, + directory_client, http_auth, st_logger) # Add task in queue to deactivate NIH_User entry after NIH_assertion_expiration has passed. try: @@ -1096,9 +987,170 @@ def demo_process_success(auth, user_id, saml_response): st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, "[ERROR] Failed to publish to PubSub topic: {}".format(str(e))) + retval.messages.append(warn_message) return retval +def handle_user_db_entry(user_id, NIH_username, user_email, auth_response, num_auth_datasets, + NIH_assertion_expiration, st_logger): + try: + st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, "[STATUS] Updating Django model") + + updated_values = { + 'NIH_assertion': auth_response, + 'NIH_assertion_expiration': NIH_assertion_expiration, + 'user_id': user_id, + 'active': 1, + 'linked': True + } + + nih_user, created = NIH_User.objects.update_or_create(NIH_username=NIH_username, + user_id=user_id, + defaults=updated_values) + + logger.info("[STATUS] NIH_User.objects.update_or_create() returned nih_user: {} and created: {}".format( + str(nih_user.NIH_username), str(created))) + st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, + "[STATUS] NIH_User.objects.update_or_create() returned nih_user: {} and created: {}".format( + str(nih_user.NIH_username), str(created))) + st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, + "[STATUS] NIH_User {} associated with email {} and logged in with assertion: {}".format( + str(nih_user.NIH_username), str(user_email), str(auth_response))) + + # default warn message is for eRA Commons users who are not dbGaP authorized + warn_message = ''' +

WARNING NOTICE

+

You are accessing a US Government web site which may contain information that must be protected under the US Privacy Act or other sensitive information and is intended for Government authorized use only.

+

Unauthorized attempts to upload information, change information, or use of this web site may result in disciplinary action, civil, and/or criminal penalties. Unauthorized users of this website should have no expectation of privacy regarding any communications or data processed by this website.

+

Anyone accessing this website expressly consents to monitoring of their actions and all communications or data transiting or stored on related to this website and is advised that if such monitoring reveals possible evidence of criminal activity, NIH may provide that evidence to law enforcement officials.

+ ''' + + except Exception as e: + st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, + "[ERROR] Exception while finding user email: {}".format(str(e))) + logger.error("[ERROR] Exception while finding user email: ") + logger.exception(e) + warn_message = "" + + if num_auth_datasets > 0: + # if user has access to one or more datasets, warn message is different + warn_message += '

You are reminded that when accessing controlled information you are bound by the dbGaP DATA USE CERTIFICATION AGREEMENT (DUCA) for each dataset.

' + + return nih_user, warn_message + + +def handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, handle_acls, + directory_client, http_auth, st_logger): + try: + ad = AuthorizedDataset.objects.get(whitelist_id=dataset.dataset_id, + acl_google_group=dataset.google_group_name) + except (ObjectDoesNotExist, MultipleObjectsReturned) as e: + logger.error(( + "[ERROR] " + ( + "More than one dataset " if type(e) is MultipleObjectsReturned else "No dataset ") + + "found for this ID and Google Group Name in the database: %s, %s") % ( + dataset.dataset_id, dataset.google_group_name) + ) + return + + uad = UserAuthorizedDatasets.objects.filter(nih_user=nih_user, authorized_dataset=ad) + dataset_in_auth_set = next((ds for ds in authorized_datasets if + (ds.dataset_id == dataset.dataset_id and + ds.google_group_name == dataset.google_group_name)), None) + + logger.debug("[STATUS] UserAuthorizedDatasets for {}: {}".format(nih_user.NIH_username, str(uad))) + + need_to_add = False + if handle_acls: + try: + result = directory_client.members().get(groupKey=dataset.google_group_name, + memberKey=user_email).execute(http=http_auth) + + # If we found them in the ACL but they're not currently authorized for it, remove them from it and the table + if len(result) and not dataset_in_auth_set: + directory_client.members().delete(groupKey=dataset.google_group_name, + memberKey=user_email).execute(http=http_auth) + logger.warn( + "User {} was deleted from group {} because they don't have dbGaP authorization.".format( + user_email, dataset.google_group_name + ) + ) + st_logger.write_text_log_entry( + LOG_NAME_ERA_LOGIN_VIEW, + "[WARN] User {} was deleted from group {} because they don't have dbGaP authorization.".format( + user_email, dataset.google_group_name + ) + ) + except HttpError: + # if the user_email doesn't exist in the google group an HttpError will be thrown... + need_to_add = True + else: + need_to_add = (len(uad) == 0) and dataset_in_auth_set + + # + # Either remove them from the table, or add them to the table. + # + + if len(uad) and not dataset_in_auth_set: + st_logger.write_text_log_entry( + LOG_NAME_ERA_LOGIN_VIEW, + "[WARN] User {} being deleted from UserAuthorizedDatasets table {} because they don't have dbGaP authorization.".format( + nih_user.NIH_username, dataset.dataset_id + ) + ) + uad.delete() + + # Sometimes an account is in the Google Group but not the database - add them if they should + # have access. 
+ # May 2018: Not handling ACL groups anymore, we skip this step (added handle_acls condition) + elif not len(uad) and handle_acls and len(result) and dataset_in_auth_set: + logger.info( + "User {} was was found in group {} but not the database--adding them.".format( + user_email, dataset.google_group_name + ) + ) + st_logger.write_text_log_entry( + LOG_NAME_ERA_LOGIN_VIEW, + "[WARN] User {} was was found in group {} but not the database--adding them.".format( + user_email, dataset.google_group_name + ) + ) + uad, created = UserAuthorizedDatasets.objects.update_or_create(nih_user=nih_user, + authorized_dataset=ad) + if not created: + logger.warn("[WARNING] Unable to create entry for user {} and dataset {}.".format(user_email, + ad.whitelist_id)) + else: + logger.info("[STATUS] Added user {} to dataset {}.".format(user_email, ad.whitelist_id)) + + if need_to_add: + if handle_acls: + # Check for their need to be in the ACL, and add them + if dataset_in_auth_set: + body = { + "email": user_email, + "role": "MEMBER" + } + + result = directory_client.members().insert( + groupKey=dataset.google_group_name, + body=body + ).execute(http=http_auth) + + logger.info(result) + logger.info("User {} added to {}.".format(user_email, dataset.google_group_name)) + st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, + "[STATUS] User {} added to {}.".format(user_email, + dataset.google_group_name)) + # Add them to the database as well + if not len(uad): + uad, created = UserAuthorizedDatasets.objects.update_or_create(nih_user=nih_user, + authorized_dataset=ad) + if not created: + logger.warn("[WARNING] Unable to create entry for user {} and dataset {}.".format(user_email, + ad.whitelist_id)) + else: + logger.info("[STATUS] Added user {} to dataset {}.".format(user_email, ad.whitelist_id)) def deactivate_nih_add_to_open(user_id, user_email): try: diff --git a/accounts/urls.py b/accounts/urls.py index 3bda2cf6..f122afc6 100755 --- a/accounts/urls.py +++ b/accounts/urls.py @@ -31,6 +31,8 @@ url(r'^dcf/login/callback/$', views.oauth2_callback, name='dcf_callback'), url(r'^dcf_login/$', views.oauth2_login, name='dcf_login'), url(r'^unlink_accounts/', views.unlink_accounts, name='unlink_accounts'), + url(r'^dcf/test', views.test_the_dcf, name='dcf_test'), + # Google Cloud Project related url(r'^users/(?P\d+)/gcp_list/$', views.user_gcp_list, name='user_gcp_list'), diff --git a/accounts/views.py b/accounts/views.py index d716bad3..772eaf8c 100755 --- a/accounts/views.py +++ b/accounts/views.py @@ -33,26 +33,24 @@ from google_helpers.storage_service import get_storage_resource from google_helpers.bigquery.bq_support import BigQuerySupport from googleapiclient.errors import HttpError +from django.contrib.auth.models import User from models import * from projects.models import User_Data_Tables from django.utils.html import escape from sa_utils import verify_service_account, register_service_account, \ unregister_all_gcp_sa, unregister_sa_with_id, service_account_dict, \ - do_nih_unlink, deactivate_nih_add_to_open - - -import requests -import random -import string + do_nih_unlink, deactivate_nih_add_to_open, handle_user_db_entry, \ + found_linking_problems, DemoLoginResults, handle_user_for_dataset from django.http import HttpResponseRedirect -from django.contrib.auth.decorators import login_required -from allauth.socialaccount.models import SocialApp, SocialToken - -from oauthlib.oauth2 import WebApplicationClient from requests_oauthlib.oauth2_session import OAuth2Session import os 
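+# OAUTHLIB_INSECURE_TRANSPORT is now toggled per request -- only when settings.IS_DEV is set and the
+# callback URL is http://localhost -- inside oauth2_login()/oauth2_callback() below, not globally here.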
-os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1' +from base64 import urlsafe_b64decode +import jwt +from jwt.contrib.algorithms.pycrypto import RSAAlgorithm +from json import dumps as json_dumps +from dataset_utils.dataset_access_support_factory import DatasetAccessSupportFactory +from dataset_utils.dataset_config import DatasetGoogleGroupPair @@ -748,84 +746,301 @@ def get_user_datasets(request,user_id): return JsonResponse(result, status=status) -hackie_state = None - @login_required def oauth2_login(request): + """ + First step of OAuth2 login ro DCF. Just build the URL that we send back to the browser in the refresh request + """ full_callback = request.build_absolute_uri(reverse('dcf_callback')) - social_account = SocialApp.objects.get(provider='dcf') + # OAuth2Session ENFORCES https unless this environment variable is set. For local dev, we want that off + # so we can talk to localhost over http. But let's turn it on/off to minimize, and make it only active in + # development: + + if settings.IS_DEV and full_callback.startswith('http://localhost'): + os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1' - rando = ''.join(random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(10)) + dcf_secrets = _read_dict(settings.DCF_CLIENT_SECRETS) - # Should provide a state string that gets stashed in sessions! - oauth = OAuth2Session(social_account.client_id, redirect_uri=full_callback, scope=['openid', 'user']) + # Found that 'user' scope had to be included to be able to do the user query on callback, and the data scope + # to do data queries. Starting to recognize a pattern here... + oauth = OAuth2Session(dcf_secrets['DCF_CLIENT_ID'], redirect_uri=full_callback, scope=['openid', 'user', 'data']) authorization_url, state = oauth.authorization_url(DCF_AUTH_URL) # stash the state string in the session! - hackie_state = state + request.session['dcfOAuth2State'] = state + os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '0' return HttpResponseRedirect(authorization_url) + # For future reference, this also worked, using underlying oauthlib.oauth2 library: + # from oauthlib.oauth2 import WebApplicationClient # wac = WebApplicationClient(social_account.client_id) - # - # ruri = wac.prepare_request_uri(DCF_AUTH_URL, - # redirect_uri=full_callback, - # state=rando, scope=['openid', 'user']) + # rando = ''.join(random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(10)) + # ruri = wac.prepare_request_uri(DCF_AUTH_URL, redirect_uri=full_callback, state=rando, scope=['openid', 'user']) # return HttpResponseRedirect(ruri) @login_required def oauth2_callback(request): + """ + Second step of OAuth2 login to DCF. 
Takes the response redirect URL that DCF returned to the user's browser, + parse out the auth code, use it to get a token, then get user info from DCF using the token + """ full_callback = request.build_absolute_uri(reverse('dcf_callback')) - print 'cb request user: {} : {}'.format(str(request.user.id), full_callback) - data = { - 'redirect_uri': full_callback, - 'grant_type': 'authorization_code', - 'code': request.GET['code']} - social_app = SocialApp.objects.get(provider='dcf') + # For future reference, this also worked, using underlying requests library: + # data = { 'redirect_uri': full_callback, 'grant_type': 'authorization_code', 'code': request.GET['code']} + # auth = requests.auth.HTTPBasicAuth(social_app.client_id, social_app.secret) + # resp = requests.request('POST', DCF_TOKEN_URL, data=data, auth=auth) + # token_data = json.loads(resp.text) + # headers = {'Authorization': 'Bearer {}'.format(token_data['access_token'])} + # resp = requests.get(DCF_USER_URL, headers=headers) + + # OAuth2Session ENFORCES https unless this environment variable is set. FOr local dev, we want that off + # so we can talk to localhost over http. But let's turn it on/off to minimize, and make it only active in + # development: - # You MUST provide the callback here to get it into the fetch request - dcf = OAuth2Session(social_app.client_id, state=hackie_state, redirect_uri=full_callback) + if settings.IS_DEV and full_callback.startswith('http://localhost'): + os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1' - # You MUST provide the client_id here in order to get this to do basic auth! Plus we need to provide - # the authorization_response argument intead of a parsed-out code argument: - token_data = dcf.fetch_token(DCF_TOKEN_URL, client_secret=social_app.secret, client_id=social_app.client_id, + dcf_secrets = _read_dict(settings.DCF_CLIENT_SECRETS) + + if 'dcfOAuth2State' in request.session: + saved_state = request.session['dcfOAuth2State'] + else: + """Do something here to report the error""" + + # You MUST provide the callback *here* to get it into the fetch request + dcf = OAuth2Session(dcf_secrets['DCF_CLIENT_ID'], state=saved_state, redirect_uri=full_callback) + + # You MUST provide the client_id *here* (again!) in order to get this to do basic auth! DCF will not authorize + # unless we use basic auth (i.e. client ID and secret in the header, not the body). Plus we need to provide + # the authorization_response argument intead of a parsed-out code argument since this is a WebApplication flow. + # Note we also get back an "id_token" which is a base64-encoded JWT. + # Note we also get back a "token_type" which had better be "Bearer". + + token_data = dcf.fetch_token(DCF_TOKEN_URL, client_secret=dcf_secrets['DCF_CLIENT_SECRET'], + client_id=dcf_secrets['DCF_CLIENT_ID'], authorization_response=request.get_full_path()) -# auth = requests.auth.HTTPBasicAuth(social_app.client_id, social_app.secret) + if token_data['token_type'] != 'Bearer': + """Do something here to report the error""" + + # + # Although user data can be extracted from the /user endpoint, DCF instructs us to pull the user information + # out of the JWT in the id_token. They also recommend we check that the JWT validates using the public + # key provided by their endpoint using the pyjwt package to do the work. 
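+    # (pyjwt's decode() is expected to verify the RS256 signature against the chosen key and to reject
+    # the token if its 'exp' has passed or its 'aud' claim doesn't satisfy the audience argument below.)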
+ # + + id_token_b64 = token_data['id_token'] - # resp = requests.request('POST', DCF_TOKEN_URL, data=data, auth=auth) - # token_data = json.loads(resp.text) - expiration_time = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta(seconds=token_data["expires_in"])) - # print expiration_time + # + # PyJWT happens to want the cryptography package, but that involves C code, so we use the provided fallback of + # pycrypto, which we do use. The steps below are how they say to use the pycrypto implmentation, but note that + # we appear to need to create a new PyJWT() object so that it does not complain about previously registered + # algorithm, but also doesn't like is we unregister non-registered algorithms, or appear to provide an easy + # way to get at the global list of registered algorithms? + # - # print "AT {} : {} : {} : {}".format(resp.status_code, resp.request, resp.text, str(token_data)); + my_jwt = jwt.PyJWT() + my_jwt.register_algorithm('RS256', RSAAlgorithm(RSAAlgorithm.SHA256)) - # headers = {'Authorization': 'Bearer {}'.format(token_data['access_token'])} - # resp = requests.get(DCF_USER_URL, headers=headers) + # + # DCF's key endpoint provides a list of keys they use. Right now, only one, but to future-proof, we want + # to choose the right one from the list. But that means we need to parse the first element of the JWT tuple + # to know which key to use, even though we need the key to decode the tuple. (There has to be a better way + # that I am missing.) So, we need to break the id_token at the "." delimiting the tuples (base64decode PUKES + # on the "."). Then take the first element of the JWT and decode it: + # - resp = dcf.get(DCF_USER_URL) - user_data = json.loads(resp.text) + id_tokens_b64 = id_token_b64.split('.') + i64 = id_tokens_b64[0] + padded = i64 + '=' * (-len(i64) % 4) # Pad with =; Weird Python % with -length + id_token = urlsafe_b64decode(padded.encode("ascii")) + jwt_header = json.loads(id_token) + kid = jwt_header['kid'] - # Note we also get back an "id_token" which is an encrypted JWT. - # Note we also get back a "token_type" which had better be "Bearer". + # + # Get the key list from the endpoint and choose which one was used in the JWT: + # + + resp = dcf.get('https://qa.dcf.planx-pla.net/user/jwt/keys') + key_data = json.loads(resp.text) + key_list = key_data['keys'] + use_key = None + for key in key_list: + if key[0] == kid: + use_key = key[1] + + if use_key is None: + """Do something here to report the error""" + + # + # Decode the JWT! 
+ # + + try: + alg_list = ['RS256'] + decoded_jwt = my_jwt.decode(id_token_b64, key=use_key, algorithms=alg_list, + audience=['openid', 'user', 'data', dcf_secrets['DCF_CLIENT_ID']]) + except Exception as e: + """Do something here to report the error""" + + # + # For reference, this is what I am seeing in the JWT: + # + # comp = {u'aud': [u'openid', u'user', u'data', u'Client ID'], + # u'iss': u'https://qa.dcf.planx-pla.net/user', + # u'iat': 1525732539, + # u'jti': u'big hex string with dashes', + # u'context': {u'user': {u'phone_number': u'', + # u'display_name': u'', + # u'name': u'email of NIH Username', + # u'is_admin': False, + # u'email': u'email address', + # u'projects': {u'qa': [u'read', u'read-storage'], + # u'test': [u'read', u'read-storage']}}}, + # u'auth_time': 1525732539, + # u'azp': u'Client ID', + # u'exp': 1525733739, + # u'pur': u'id', + # u'sub': u'integer use key'} + + nih_from_dcf = decoded_jwt['context']['user']['name'] + dcf_user_id = decoded_jwt['sub'] + dict_o_projects = decoded_jwt['context']['user']['projects'] + + # + # This also works to get user info from the DCF, though you need to have 'user' in the audience as well: + # + # resp = dcf.get(DCF_USER_URL) + # user_data = json.loads(resp.text) + # nih_from_dcf = user_data['username'] + # + + # + # For development, let's pretend that DCF actually returns an ERACommons ID: + # + + if nih_from_dcf == dcf_secrets['DEV_1_EMAIL']: + nih_from_dcf = dcf_secrets['DEV_1_NIH'] + + # We now have the NIH User ID back from DCF. We check that we don't have linking issues! + results = DemoLoginResults() + st_logger = StackDriverLogger.build_from_django_settings() + user_email = User.objects.get(id=request.user.id).email + if found_linking_problems(nih_from_dcf, request.user.id, user_email, st_logger, results): + """return the linking problem!""" + return redirect('dashboard') - print "AT {} : {} : {}".format(resp.status_code, resp.request, " ".join(resp.text.split())) - - # Note the logic here. If a different User shows up and logs in to get back the same uid from dcf that - # another User had associated, the uid now becomes linked to the new user: - social_account, created = SocialAccount.objects.update_or_create(uid = user_data['user_id'], - provider = 'dcf', - defaults={ - 'user_id' : request.user.id, - 'extra_data': json.loads(" ".join(resp.text.split())) - }) - # Note how update_or_create works. You match on the named parameter values, and update with the values in the defaults. - # If it doesn't exist, the named parameter values are used to create the row. - social_token, created = SocialToken.objects.update_or_create(account_id=social_account.id, - app_id=social_app.id, - defaults= { - 'token' : token_data['access_token'], - 'token_secret' : token_data['refresh_token'], - 'expires_at' : expiration_time - }) - return redirect('dashboard') \ No newline at end of file + ## This is the place to link to Google??? But lotsa stuff needs to go into the session to be stored later? + + # We now will have the NIH User ID back from DCF. 
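+    # Roughly what follows: compute an assertion expiration, write the NIH_User row via handle_user_db_entry(),
+    # stash the DCF tokens with _token_storage(), map the DCF project list onto dataset/Google-group pairs,
+    # and reconcile each pair with handle_user_for_dataset().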
+ + login_expiration_seconds = settings.LOGIN_EXPIRATION_MINUTES * 60 + nih_assertion_expiration = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( + seconds=login_expiration_seconds)) + + nih_user, warnings = handle_user_db_entry(request.user.id, nih_from_dcf, user_email, json_dumps(decoded_jwt), + len(dict_o_projects), nih_assertion_expiration, st_logger) + + _token_storage(token_data, nih_user.id, dcf_user_id) + + authorized_datasets = [] + all_datasets = [] + for project, perm_list in dict_o_projects.iteritems(): + if project == 'qa': + project = 'phs000178' + goog = 'isb-cgc-dev-cntl@isb-cgc.org' + elif project == 'test': + project = 'phs000218' + goog = 'isb-cgc-dev-cntl-target@isb-cgc.org' + ad = AuthorizedDataset.objects.get(whitelist_id=project) + authorized_datasets.append(DatasetGoogleGroupPair(project, goog)) #ad.acl_google_group)) + all_datasets.append(DatasetGoogleGroupPair(project, goog)) + + + # das = DatasetAccessSupportFactory.from_webapp_django_settings() + # all_datasets = das.get_all_datasets_and_google_groups() + + for dataset in all_datasets: + handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, False, None, None, st_logger) + + if warnings: + messages.warning(request, warnings) + os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '0' + return redirect('/users/' + str(request.user.id)) + + +def _token_storage(token_dict, nih_pk, dcf_uid): + + if token_dict.has_key('expires_at'): + expiration_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(token_dict['expires_at'])) + else: + print "Have to build an expiration time" + expiration_time = pytz.utc.localize( + datetime.datetime.utcnow() + datetime.timedelta(seconds=token_dict["expires_in"])) + + DCFToken.objects.update_or_create(nih_user_id=nih_pk, + defaults={ + 'dcf_user': dcf_uid, + 'access_token': token_dict['access_token'], + 'refresh_token': token_dict['refresh_token'], + 'expires_at': expiration_time + }) + +@login_required +def test_the_dcf(request): + """ + Use this to test that we can call the DCF and get back useful info. Also, use as a template for doing all + DCF calls + """ + file_uuid = 'ffcc4f7d-471a-4ad0-b199-53d992217986' + resp = _dcf_call('https://qa.dcf.planx-pla.net/user/data/download/{}'.format(file_uuid), request.user.id) + result = {'uri': resp.text} + return JsonResponse(result, status=resp.status_code) + + +def _dcf_call(full_url, user_id): + """ + All the stuff around a DCF call that handles token management and refreshes + """ + + dcf_secrets = _read_dict(settings.DCF_CLIENT_SECRETS) + + nih_user = NIH_User.objects.get(user_id=user_id, linked=True) + dcf_token = DCFToken.objects.get(nih_user=nih_user.id) + + expires_in = (dcf_token.expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() + print "Expiration : {} seconds".format(expires_in) + + token_dict = { + 'access_token' : dcf_token.access_token, + 'refresh_token' : dcf_token.refresh_token, + 'token_type' : 'Bearer', + 'expires_in' : expires_in + } + extra_dict = { + 'client_id' : dcf_secrets['DCF_CLIENT_ID'], + 'client_secret': dcf_secrets['DCF_CLIENT_SECRET'] + } + + def token_storage_for_user(my_token_dict): + _token_storage(my_token_dict, user_id, dcf_token.dcf_user) + + dcf = OAuth2Session(dcf_secrets['DCF_CLIENT_ID'], token=token_dict, auto_refresh_url=DCF_TOKEN_URL, + auto_refresh_kwargs=extra_dict, token_updater=token_storage_for_user) + + # Hoo boy! 
You *MUST* provide the client_id and client_secret in the call itself to insure an OAuth2Session token + # refresh call uses HTTPBasicAuth! + resp = dcf.get(full_url, client_id=dcf_secrets['DCF_CLIENT_ID'], client_secret=dcf_secrets['DCF_CLIENT_SECRET']) + return resp + + +def _read_dict(my_file_name): + retval = {} + with open(my_file_name, 'r') as f: + for line in f: + if '=' not in line: + continue + split_line = line.split('=') + retval[split_line[0].strip()] = split_line[1].strip() + return retval \ No newline at end of file From 5b2a9954c8087ee9827b921bfb578ad61f4301ba Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Thu, 24 May 2018 17:38:53 -0700 Subject: [PATCH 04/76] Second pass --- accounts/dcf_views.py | 779 ++++++++++++++++++ .../migrations/0016_auto_20180524_1033.py | 39 + accounts/models.py | 11 +- accounts/sa_utils.py | 228 ++++- accounts/urls.py | 13 +- accounts/views.py | 319 +------ 6 files changed, 1044 insertions(+), 345 deletions(-) create mode 100755 accounts/dcf_views.py create mode 100644 accounts/migrations/0016_auto_20180524_1033.py diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py new file mode 100755 index 00000000..b2cde81e --- /dev/null +++ b/accounts/dcf_views.py @@ -0,0 +1,779 @@ +""" +Copyright 2017-2018, Institute for Systems Biology + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import logging +import jwt +import os +import requests +import datetime +import pytz + +from django.conf import settings +from django.contrib import messages +from django.contrib.auth.decorators import login_required +from django.shortcuts import redirect +from django.contrib.auth.models import User +from django.core.urlresolvers import reverse +from django.http import HttpResponseRedirect + +from google_helpers.stackdriver import StackDriverLogger + +from sa_utils import found_linking_problems, DemoLoginResults, handle_user_for_dataset,\ + handle_user_db_update_for_dcf_linking, \ + unlink_account_in_db_for_dcf, get_dcf_auth_key_remaining_seconds + +from models import DCFToken, AuthorizedDataset +from requests_oauthlib.oauth2_session import OAuth2Session +from base64 import urlsafe_b64decode +from jwt.contrib.algorithms.pycrypto import RSAAlgorithm +from json import loads as json_loads, dumps as json_dumps +from dataset_utils.dataset_access_support_factory import DatasetAccessSupportFactory +from dataset_utils.dataset_config import DatasetGoogleGroupPair + +import httplib as http_client + +http_client.HTTPConnection.debuglevel = 1 + +logger = logging.getLogger('main_logger') + +DCF_AUTH_URL = settings.DCF_AUTH_URL +DCF_TOKEN_URL = settings.DCF_TOKEN_URL +DCF_USER_URL = settings.DCF_USER_URL +DCF_REVOKE_URL = settings.DCF_REVOKE_URL +DCF_GOOGLE_URL = settings.DCF_GOOGLE_URL +DCF_TOKEN_REFRESH_WINDOW_SECONDS = settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS + +@login_required +def oauth2_login(request): + """ + First step of OAuth2 login to DCF. 
Just build the URL that we send back to the browser in the refresh request + """ + try: + full_callback = request.build_absolute_uri(reverse('dcf_callback')) + + # OAuth2Session ENFORCES https unless this environment variable is set. For local dev, we want that off + # so we can talk to localhost over http. But let's turn it on/off to minimize, and make it only active in + # development: + + if settings.IS_DEV and full_callback.startswith('http://localhost'): + os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1' + + client_id, _ = _get_secrets() + + # Found that 'user' scope had to be included to be able to do the user query on callback, and the data scope + # to do data queries. Starting to recognize a pattern here... + oauth = OAuth2Session(client_id, redirect_uri=full_callback, scope=['openid', 'user', 'data']) + authorization_url, state = oauth.authorization_url(DCF_AUTH_URL) + # stash the state string in the session! + request.session['dcfOAuth2State'] = state + return HttpResponseRedirect(authorization_url) + + finally: + os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '0' + + # For future reference, this also worked, using underlying oauthlib.oauth2 library: + # from oauthlib.oauth2 import WebApplicationClient + # wac = WebApplicationClient(social_account.client_id) + # rando = ''.join(random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(10)) + # ruri = wac.prepare_request_uri(DCF_AUTH_URL, redirect_uri=full_callback, state=rando, scope=['openid', 'user']) + # return HttpResponseRedirect(ruri) + + +@login_required +def oauth2_callback(request): + """ + Second step of OAuth2 login to DCF. Takes the response redirect URL that DCF returned to the user's browser, + parse out the auth code, use it to get a token, then get user info from DCF using the token + """ + + try: + full_callback = request.build_absolute_uri(reverse('dcf_callback')) + + # For future reference, this also worked, using underlying requests library: + # data = { 'redirect_uri': full_callback, 'grant_type': 'authorization_code', 'code': request.GET['code']} + # auth = requests.auth.HTTPBasicAuth(social_app.client_id, social_app.secret) + # resp = requests.request('POST', DCF_TOKEN_URL, data=data, auth=auth) + # token_data = json.loads(resp.text) + # headers = {'Authorization': 'Bearer {}'.format(token_data['access_token'])} + # resp = requests.get(DCF_USER_URL, headers=headers) + + # OAuth2Session ENFORCES https unless this environment variable is set. FOr local dev, we want that off + # so we can talk to localhost over http. But let's turn it on/off to minimize, and make it only active in + # development: + + if settings.IS_DEV and full_callback.startswith('http://localhost'): + os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1' + + if 'dcfOAuth2State' in request.session: + saved_state = request.session['dcfOAuth2State'] + else: + logger.error("[ERROR] Missing dcfOAuth2State during callback") + messages.error(request, "There was an internal error logging in. Please contact the ISB-CGC administrator.") + return redirect(reverse('user_detail', args=[request.user.id])) + + client_id, client_secret = _get_secrets() + + # You MUST provide the callback *here* to get it into the fetch request + dcf = OAuth2Session(client_id, state=saved_state, redirect_uri=full_callback) + + # You MUST provide the client_id *here* (again!) in order to get this to do basic auth! DCF will not authorize + # unless we use basic auth (i.e. client ID and secret in the header, not the body). 
Plus we need to provide + # the authorization_response argument intead of a parsed-out code argument since this is a WebApplication flow. + # Note we also get back an "id_token" which is a base64-encoded JWT. + # Note we also get back a "token_type" which had better be "Bearer". + + token_data = dcf.fetch_token(DCF_TOKEN_URL, client_secret=client_secret, + client_id=client_id, + authorization_response=request.get_full_path()) + client_secret = None + + if token_data['token_type'] != 'Bearer': + logger.error("[ERROR] Token type returned was not 'Bearer'") + messages.error(request, "There was an internal error logging in. Please contact the ISB-CGC administrator.") + return redirect(reverse('user_detail', args=[request.user.id])) + + # + # Although user data can be extracted from the /user endpoint, DCF instructs us to pull the user information + # out of the JWT in the id_token. They also recommend we check that the JWT validates using the public + # key provided by their endpoint using the pyjwt package to do the work. + # + + id_token_b64 = token_data['id_token'] + + # + # PyJWT happens to want the cryptography package, but that involves C code, so we use the provided fallback of + # pycrypto, which we do use. The steps below are how they say to use the pycrypto implmentation, but note that + # we appear to need to create a new PyJWT() object so that it does not complain about previously registered + # algorithm, but also doesn't like if we unregister non-registered algorithms, or appear to provide an easy + # way to get at the global list of registered algorithms? + # + + my_jwt = jwt.PyJWT() + my_jwt.register_algorithm('RS256', RSAAlgorithm(RSAAlgorithm.SHA256)) + + # + # DCF's key endpoint provides a list of keys they use. Right now, only one, but to future-proof, we want + # to choose the right one from the list. But that means we need to parse the first element of the JWT tuple + # to know which key to use, even though we need the key to decode the tuple. (There has to be a better way + # that I am missing.) So, we need to break the id_token at the "." delimiting the tuples (base64decode PUKES + # on the "."). Then take the first element of the JWT and decode it: + # + + id_tokens_b64 = id_token_b64.split('.') + i64 = id_tokens_b64[0] + padded = i64 + '=' * (-len(i64) % 4) # Pad with =; Weird Python % with -length + id_token = urlsafe_b64decode(padded.encode("ascii")) + jwt_header = json_loads(id_token) + kid = jwt_header['kid'] + + # + # Get the key list from the endpoint and choose which one was used in the JWT: + # + + resp = dcf.get(settings.DCF_KEY_URL) + key_data = json_loads(resp.text) + key_list = key_data['keys'] + use_key = None + for key in key_list: + if key[0] == kid: + use_key = key[1] + + if use_key is None: + logger.error("[ERROR] No key found from DCF to validate JWT") + messages.error(request, "There was an internal error logging in. Please contact the ISB-CGC administrator.") + return redirect(reverse('user_detail', args=[request.user.id])) + + # + # Decode the JWT! + # + + try: + alg_list = ['RS256'] + decoded_jwt = my_jwt.decode(id_token_b64, key=use_key, algorithms=alg_list, + audience=['openid', 'user', 'data', client_id]) + except Exception as e: + logger.error("[ERROR] Decoding JWT failure") + logger.exception(e) + messages.error(request, "There was an internal error logging in. 
Please contact the ISB-CGC administrator.") + return redirect(reverse('user_detail', args=[request.user.id])) + + # + # For reference, this is what I am seeing in the JWT: + # + # comp = {u'aud': [u'openid', u'user', u'data', u'Client ID'], + # u'iss': u'https://The DCF server/user', + # u'iat': 1525732539, + # u'jti': u'big hex string with dashes', + # u'context': {u'user': {u'google': {u'linked_google_account': u'email of linked user'}, + # u'phone_number': u'', + # u'display_name': u'', + # u'name': u'email of NIH Username', + # u'is_admin': False, + # u'email': u'email address', + # u'projects': {u'qa': [u'read', u'read-storage'], + # u'test': [u'read', u'read-storage']}}}, + # u'auth_time': 1525732539, + # u'azp': u'Client ID', + # u'exp': 1525733739, + # u'pur': u'id', (The "purpose" of the token. This is an ID. Refresh tokens say "refresh") + # u'sub': u'integer user key'} + + dcf_user_id = decoded_jwt['sub'] + + # + # User info is available in the JWT, but also from the user endpoint. We are going to use the endpoint + # since the info goes into the database, and we are going to be refreshing it frequently: + # + + user_resp = dcf.get(DCF_USER_URL) + the_user = json_loads(user_resp.text) + the_user = _massage_user_data_for_dev(the_user) + nih_from_dcf = the_user['username'] + + # + # BUT! DCF currently only returns google link data in the JWT. So we need to look there to figure + # out if the user is linked! + # + + the_user_for_google_link = decoded_jwt['context']['user'] + + gotta_google_link = the_user_for_google_link.has_key('google') and \ + the_user_for_google_link['google'].has_key('linked_google_account') + google_link = the_user_for_google_link['google']['linked_google_account'] if gotta_google_link else None + + # We now have the NIH User ID back from DCF; we also might now know the Google ID they have linked to previously + # (it comes back in the user_id). Note that this routine is going to get called every 30 days or so when we + # need to get a new refresh token, so it is possible that e.g. the first time they logged in as their PI and + # now are doing the legit thing of logging in as themselves. If we catch that problem, they need to unlink. Also, + # if DCF's idea of who they have linked to differs from ours (we keep a local copy), we need to handle that now! + + results = DemoLoginResults() + st_logger = StackDriverLogger.build_from_django_settings() + user_email = User.objects.get(id=request.user.id).email + # FIXME This old test is not what we really want to use... + if found_linking_problems(nih_from_dcf, request.user.id, user_email, st_logger, results): + for warn in results.messages: + messages.warning(request, warn) + return redirect(reverse('user_detail', args=[request.user.id])) + + # + # We now have the minimum we need to store the tokens from DCF, so stick that in the database. We DO NOT yet + # make the entry in the NIH_User table, since we need to now either establish or refresh the DCF-Google ID link: + # + + _refresh_token_storage(token_data, decoded_jwt, user_resp.text, nih_from_dcf, dcf_user_id, request.user.id, google_link) + + # + # If user already has a google ID link, we would PATCH the endpoint to update it for 24 more hours. If + # not, we do a GET. (I.e. the first time they show up at DCF is the ONLY time we do a get, except for + # those cases where an unlink has been called.) So here is where the control flow diverges. For the + # GET, we wrap things up in the callback. 
For the PATCH, we wrap things up immediately: + # + + if gotta_google_link: + + # + # It is possible that the first time the user logged in they provided the wrong email address to DCF and + # then ignored us when we asked them to correct the problem. If DCF's provided Google ID does not match + # ours, then they need to still provide us with the correct version before we let them use it! + # + + req_user = User.objects.get(id=request.user.id) + if google_link != req_user.email: + message = "Please unlink ID {} and use your ISB-CGC login email ({}) to link with the DCF".format( + google_link, req_user.email) + messages.warning(request, message) + return redirect(reverse('user_detail', args=[request.user.id])) + + # + # The link matches. So we use PATCH, and if it goes smoothly, we write the new link to the database: + + resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='patch') + if resp.status_code == 404: + messages.warning(request, "No linked Google account found for user") + elif resp.status_code == 200: + pass + else: + messages.warning(request, "Unexpected response ({}) from DCF during linking. " + "Please contact the ISB-CGC administrator.".format(resp.status_code)) + + warning = _finish_the_link(request.user.id, req_user.email, st_logger) + messages.warning(request, warning) + return redirect(reverse('user_detail', args=[request.user.id])) + + # + # User has not yet been linked, so start the redirect flow with the user and DCF that will result + # in us getting the callback below to finish the process: + # + + link_callback = request.build_absolute_uri(reverse('dcf_link_callback')) + + callback = '{}?redirect={}'.format(DCF_GOOGLE_URL, link_callback) + return HttpResponseRedirect(callback) + finally: + os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '0' + + +@login_required +def dcf_link_callback(request): + """ + When the user comes back from Google/DCF after linking, this routine gets called. It provides us with any error + conditions, plus + """ + + # log the reports using Cloud logging API + st_logger = StackDriverLogger.build_from_django_settings() + + # + # If there was an error, return that: + # + error = request.GET.get('error', None) + if error: + error_description = request.GET.get('error_description', None) + if error == 'g_acnt_link_error': + message = 'Issue with the linkage between user and their Google account' + elif error == 'g_acnt_auth_failure': + message = "Issue with Oauth2 flow to AuthN user's Google account" + elif error == 'g_acnt_access_error': + message = "Issue with providing access to Google account by putting in user's proxy group" + else: + message = 'Unrecognized error' + + messages.warning(request, 'Error detected during linking. ' + 'Please report error "{}" with description "{}" and message "{}" ' + 'to the ISB-CGC administrator'.format(error, message, error_description)) + return redirect(reverse('user_detail', args=[request.user.id])) + + # + # At this point, we need to wrestle with the possible problem that the user has linked + # to a DIFFERENT GoogleID while off messing with DCF. If the ID that comes back is not + # identical to what we think it is. They need to go and do it again! 
+ # + + req_user = User.objects.get(id=request.user.id) + resp = _dcf_call(DCF_USER_URL, request.user.id) + user_data = json_loads(resp.text) + if user_data['email'] != req_user.email: + message = "Please unlink ID {} and use your ISB-CGC login email ({}) to link with the DCF".format(user_data['email'], req_user.email) + messages.warning(request, message) + return redirect(reverse('user_detail', args=[request.user.id])) + + # + # If all is well, this is where we add the user to the NIH_User table and link the user to the various data sets. + # + + warning = _finish_the_link(request.user.id, user_data['email'], st_logger) + if warning: + messages.warning(request, warning) + return redirect(reverse('user_detail', args=[request.user.id])) + + +def _finish_the_link(user_id, user_email, st_logger): + """ + Regardless of how they get here, this step handles the linking of the user by adding the required database records. + """ + + # Until we get back user expiration time, we calculate it: + login_expiration_seconds = settings.LOGIN_EXPIRATION_MINUTES * 60 + nih_assertion_expiration = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( + seconds=login_expiration_seconds)) + + # + # Until we get back current projects, refresh it: + # + + the_user = _get_user_data(user_id) + + # + # Save the new info from the DCF: + # + + dcf_token = DCFToken.objects.get(user_id=user_id) + if dcf_token.google_id is not None and dcf_token.google_id != user_email: + # FIXME + print "WE HAVE A PROBLEM" + + dcf_token.google_id = user_email + dcf_token.user_token = json_dumps(the_user) + dcf_token.save() + + nih_user, warning = handle_user_db_update_for_dcf_linking(user_id, the_user, + nih_assertion_expiration, st_logger) + + dict_o_projects = the_user['project_access'] + authorized_datasets = [] + for project, perm_list in dict_o_projects.iteritems(): + ad = AuthorizedDataset.objects.get(whitelist_id=project) + authorized_datasets.append(DatasetGoogleGroupPair(project, ad.acl_google_group)) + + das = DatasetAccessSupportFactory.from_webapp_django_settings() + all_datasets = das.get_all_datasets_and_google_groups() + + for dataset in all_datasets: + handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, False, None, None, st_logger) + + return warning + +def _get_user_data(user_id): + """ + Get up-to-date user data from DCF, massage as needed + """ + resp = _dcf_call(DCF_USER_URL, user_id) + the_user = json_loads(resp.text) + + return _massage_user_data_for_dev(the_user) + + +def _massage_user_data_for_dev(the_user): + """ + Note that when working against their QA server, user names + and projects are junk. So we repair them here for our development needs. + """ + + dcf_secrets = _read_dict(settings.DCF_CLIENT_SECRETS) + nih_from_dcf = the_user['username'] + if nih_from_dcf == dcf_secrets['DEV_1_EMAIL']: + nih_from_dcf = dcf_secrets['DEV_1_NIH'] + the_user['username'] = nih_from_dcf + + dict_o_projects = the_user['project_access'] + new_dict_o_projects = {} + for project, perm_list in dict_o_projects.iteritems(): + # DCF QA returns bogus project info. 
Do this mapping as a workaround: + if project == dcf_secrets['DEV_1_PROJ']: + project = dcf_secrets['DEV_1_MAPPED_PROJ'] + elif project == dcf_secrets['DEV_2_PROJ']: + project = dcf_secrets['DEV_2_MAPPED_PROJ'] + new_dict_o_projects[project] = perm_list + the_user['project_access'] = new_dict_o_projects + + return the_user + + +@login_required +def dcf_link_extend(request): + """ + Put a user's GoogleID in the ACL groups for 24 (more) hours: + """ + + # log the reports using Cloud logging API + st_logger = StackDriverLogger.build_from_django_settings() + + resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='patch') + if resp.status_code == 404: + messages.warning(request, "No linked Google account found for user") + elif resp.status_code == 200: + pass + else: + messages.warning(request, "Unexpected response ({}) from DCF during linking. " + "Please contact the ISB-CGC administrator.".format(resp.status_code)) + + + + # Until we get back user expiration time, we calculate it: + login_expiration_seconds = settings.LOGIN_EXPIRATION_MINUTES * 60 + nih_assertion_expiration = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( + seconds=login_expiration_seconds)) + + # User data set permissions might have changed, so we call and find out what they are: + user_data = _get_user_data(request.user.id) + + _, warning = handle_user_db_update_for_dcf_linking(request.user.id, user_data, nih_assertion_expiration, st_logger) + + if warning: + messages.warning(request, warning) + + return redirect(reverse('user_detail', args=[request.user.id])) + + +@login_required +def dcf_unlink(request): + """ + Unlink a user's GoogleID from their NIH ID. This is NOT the traditional sense of unlink, as the user is + still able to talk to DCF using their NIH ID. For a traditional unlink, we use dcf_disconnect_user: + """ + + # + # First, call DCF to drop the linkage. This is the only way to get the user + # booted out of control groups. + # + resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='delete') + if resp.status_code == 404: + messages.warning(request, "No linked Google account found for user") + elif resp.status_code == 400: + delete_response = json_loads(resp.text) + error = delete_response['error'] + message = delete_response['error_description'] + messages.error(request, "Error in unlinking: {} : {}".format(error, message)) + elif resp.status_code == 200: + pass + else: + messages.warning(request, "Unexpected response from DCF") + + # + # Now drop the link flag and active flag from the DB, plus our db records of what datasets the user is + # good for: + # + + try: + message = unlink_account_in_db_for_dcf(request.user.id) + if message: + messages.error(request, message) + + except Exception as e: + logger.error("[ERROR] While unlinking accounts:") + logger.exception(e) + messages.error(request, 'There was an error when attempting to unlink your NIH user account - please contact the administrator.') + + # redirect to user detail page + return redirect(reverse('user_detail', args=[request.user.id])) + + +def _refresh_token_storage(token_dict, decoded_jwt, user_token, nih_username_from_dcf, dcf_uid, cgc_uid, google_id): + """ + This is called when the user needs to get a new 30-day refresh token from DCF by logging into + NIH (or if they unlink and need to reauthenticate to DCF again). + """ + + # + # We need to extract out the expiration time buried in the refresh token. 
When the refresh token + # expires (30 days) the user has to reauthenticate with DCF: + # + refresh_token = token_dict['refresh_token'] + refresh_tokens_b64 = refresh_token.split('.') + i64 = refresh_tokens_b64[1] + padded = i64 + '=' * (-len(i64) % 4) # Pad with =; Weird Python % with -length + refresh_token_decoded = urlsafe_b64decode(padded.encode("ascii")) + refresh_token_dict = json_loads(refresh_token_decoded) + + # A refresh key: + # { + # "azp": "Client ID", + # "jti": "hex string with dashes", + # "aud": ["openid", "user", "data", "Client ID"], + # "exp": 1529262310, + # "iss": "https://The DCF server/user", + # "iat": 1526670310, + # "pur": "refresh", + # "sub": "The users's DCF ID" + # } + + refresh_expire_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(refresh_token_dict['exp'])) + + # This refers to the *access key* expiration (~20 minutes) + if token_dict.has_key('expires_at'): + expiration_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(token_dict['expires_at'])) + else: + expiration_time = pytz.utc.localize( + datetime.datetime.utcnow() + datetime.timedelta(seconds=token_dict["expires_in"])) + logger.info("[INFO] Have to build an expiration time for token: {}".format(expiration_time)) + + print 'Token storage. New token expires at {}'.format(str(expiration_time)) + + # FIXME! Make sure that the NIH name is going to be unique before we shove it into the table. Don't + # depend on the DB table constraint. + + # Note that (nih_username_lower, user_id) is enforced unique in the table: + DCFToken.objects.update_or_create(user_id=cgc_uid, + defaults={ + 'dcf_user': dcf_uid, + 'nih_username': nih_username_from_dcf, + 'nih_username_lower': nih_username_from_dcf.lower(), + 'access_token': token_dict['access_token'], + 'refresh_token': token_dict['refresh_token'], + 'user_token': user_token, + 'decoded_jwt': json_dumps(decoded_jwt), + 'expires_at': expiration_time, + 'refresh_expires_at': refresh_expire_time, + 'google_id': google_id # May be none on create... + }) + + +def _access_token_storage(token_dict, cgc_uid): + """ + This call just replaces the access key part of the DCF record. Used when we use the + refresh token to get a new access key. + """ + + # This refers to the *access key* expiration (~20 minutes) + if token_dict.has_key('expires_at'): + expiration_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(token_dict['expires_at'])) + else: + expiration_time = pytz.utc.localize( + datetime.datetime.utcnow() + datetime.timedelta(seconds=token_dict["expires_in"])) + logger.info("[INFO] Have to build an expiration time for token: {}".format(expiration_time)) + + print 'Token storage. New token expires at {}'.format(str(expiration_time)) + + dcf_token = DCFToken.objects.get(user_id=cgc_uid) + dcf_token.access_token = token_dict['access_token'] + dcf_token.expires_at = expiration_time + dcf_token.save() + +@login_required +def test_the_dcf(request): + """ + Use this to test that we can call the DCF and get back useful info. 
Also, use as a template for doing all + DCF calls + """ + file_uuid = 'ffcc4f7d-471a-4ad0-b199-53d992217986' + resp = _dcf_call('https://qa.dcf.planx-pla.net/user/data/download/{}'.format(file_uuid), request.user.id) + result = { + 'uri': resp.text, + 'code': resp.status_code + } + messages.warning(request, 'TDCF Responded with {}: {}'.format(resp.status_code, resp.text)) + + # redirect to user detail page + return redirect(reverse('user_detail', args=[request.user.id])) + + +@login_required +def dcf_disconnect_user(request): + """ + In the new DCF world, to 'unlink' means we both need to tell DCF to 'unlink' the user, + PLUS we drop all the access token/refresh token stuff after telling DCF to revoke the + refresh token. + """ + + # First thing ya gotta do is tell DCF to unlink the user, which will get them out of + # access control groups: + + msg_list = [] + resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='delete') + if resp.status_code == 404: + msg_list.append("No linked Google account found for user, code {}".format(resp.status_code)) + elif resp.status_code == 400: + delete_response = json_loads(resp.text) + error = delete_response['error'] + message = delete_response['error_description'] + msg_list.append("Error in unlinking: {} : {} : {}".format(error, message, resp.status_code)) + elif resp.status_code == 200: + pass + else: + msg_list.append(request, "Unexpected response from DCF {}".format(resp.status_code)) + + # + # The revoke call is unlike other DCF endpoints in that it is a special! + # Token revocation is described here: https://tools.ietf.org/html/rfc7009#section-2.1 + # So we do not provide a bearer access token, but the client ID and secret in a Basic Auth + # framework. Not seeing that inside the OAuthSession framework, so we roll our own by hand: + # + + dcf_token = DCFToken.objects.get(user_id=request.user.id) + + client_id, client_secret = _get_secrets() + + data = { + 'token': dcf_token.refresh_token + } + auth = requests.auth.HTTPBasicAuth(client_id, client_secret) + resp = requests.request('POST', DCF_REVOKE_URL, data=data, auth=auth) + client_id = None + client_secret = None + + if resp.status_code != 200 and resp.status_code != 204: + messages.warning(request, 'Revocation problem: {} : {}'.format(resp.status_code, resp.text)) + + for msg in msg_list: + messages.warning(request, msg) + + # + # OK, NOW we detach the user in our NIH tables, and detach the user from data permissions. + # + + unlink_account_in_db_for_dcf(request.user.id) + + # + # Finally, we clear out our tokens for the user (which allows them to appear to DCF as the + # logged-in NIH user; we cannot keep them around: + # + + dcf_token = DCFToken.objects.get(user_id=request.user.id) + dcf_token.delete() + + # redirect to user detail page + return redirect(reverse('user_detail', args=[request.user.id])) + + +@login_required +def dcf_get_user_data(request): + """ + Use for QC and development + """ + resp = _dcf_call(DCF_USER_URL, request.user.id) + user_data = json_loads(resp.text) + + remaining_token_time = get_dcf_auth_key_remaining_seconds(request.user.id) + messages.warning(request, 'TDCF Responded with {}: {}'.format(user_data, remaining_token_time)) + return redirect(reverse('user_detail', args=[request.user.id])) + + +def _dcf_call(full_url, user_id, mode='get', post_body=None): + """ + All the stuff around a DCF call that handles token management and refreshes. 
+ """ + dcf_token = DCFToken.objects.get(user_id=user_id) + + expires_in = (dcf_token.expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() + logger.info("[INFO] Token Expiration : {} seconds".format(expires_in)) + + token_dict = { + 'access_token' : dcf_token.access_token, + 'refresh_token' : dcf_token.refresh_token, + 'token_type' : 'Bearer', + 'expires_in' : expires_in + } + + def token_storage_for_user(my_token_dict): + _access_token_storage(my_token_dict, user_id) + + client_id, client_secret = _get_secrets() + + extra_dict = { + 'client_id' : client_id, + 'client_secret': client_secret + } + + dcf = OAuth2Session(client_id, token=token_dict, auto_refresh_url=DCF_TOKEN_URL, + auto_refresh_kwargs=extra_dict, token_updater=token_storage_for_user) + + # Hoo boy! You *MUST* provide the client_id and client_secret in the call itself to insure an OAuth2Session token + # refresh call uses HTTPBasicAuth! + + # FIXME can get an exception here (BAD REQUEST) if refresh token has e.g. been revoked and not dropped out of DB. + resp = dcf.request(mode, full_url, client_id=client_id, + client_secret=client_secret, data=post_body) + + return resp + + +def _get_secrets(): + dcf_secrets = _read_dict(settings.DCF_CLIENT_SECRETS) + client_id = dcf_secrets['DCF_CLIENT_ID'] + client_secret = dcf_secrets['DCF_CLIENT_SECRET'] + return client_id, client_secret + + +def _read_dict(my_file_name): + retval = {} + with open(my_file_name, 'r') as f: + for line in f: + if '=' not in line: + continue + split_line = line.split('=') + retval[split_line[0].strip()] = split_line[1].strip() + return retval \ No newline at end of file diff --git a/accounts/migrations/0016_auto_20180524_1033.py b/accounts/migrations/0016_auto_20180524_1033.py new file mode 100644 index 00000000..5ec1fb51 --- /dev/null +++ b/accounts/migrations/0016_auto_20180524_1033.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.10 on 2018-05-24 17:33 +from __future__ import unicode_literals + +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ('accounts', '0015_googleproject_active'), + ] + + operations = [ + migrations.CreateModel( + name='DCFToken', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('nih_username', models.TextField()), + ('nih_username_lower', models.CharField(max_length=128)), + ('dcf_user', models.CharField(max_length=128)), + ('access_token', models.TextField()), + ('refresh_token', models.TextField()), + ('user_token', models.TextField()), + ('decoded_jwt', models.TextField()), + ('expires_at', models.DateTimeField()), + ('refresh_expires_at', models.DateTimeField()), + ('google_id', models.TextField(null=True)), + ('user', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ], + ), + migrations.AlterUniqueTogether( + name='dcftoken', + unique_together=set([('user', 'nih_username_lower')]), + ), + ] diff --git a/accounts/models.py b/accounts/models.py index d4484b98..cacb3478 100755 --- a/accounts/models.py +++ b/accounts/models.py @@ -160,8 +160,17 @@ class ServiceAccountAuthorizedDatasets(models.Model): class DCFToken(models.Model): - nih_user = models.OneToOneField(NIH_User, null=False) + user = models.OneToOneField(User, null=False) + nih_username = models.TextField(null=False) + 
nih_username_lower = models.CharField(max_length=128, null=False) # Must be limited to include in constraint dcf_user = models.CharField(max_length=128, null=False) access_token = models.TextField(null=False) refresh_token = models.TextField(null=False) + user_token = models.TextField(null=False) + decoded_jwt = models.TextField(null=False) expires_at = models.DateTimeField(null=False) + refresh_expires_at = models.DateTimeField(null=False) + google_id = models.TextField(null=True) + + class Meta: + unique_together = (("user", "nih_username_lower"),) diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index a017c916..d89055eb 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -16,8 +16,10 @@ import re import base64 -from json import dumps as json_dumps +from json import dumps as json_dumps, loads as json_loads +from base64 import urlsafe_b64decode import traceback +import time import datetime import pytz @@ -687,6 +689,7 @@ def __str__(self): def __repr__(self): return self.__str__() + def do_nih_unlink(user_id): unlink_accounts_result, message = unlink_accounts_and_get_acl_tasks(user_id) if message: @@ -696,6 +699,7 @@ def do_nih_unlink(user_id): return next_message return None + def _process_actions(unlink_accounts_result): directory_service, http_auth = get_directory_resource() for action in unlink_accounts_result.acl_delete_actions: @@ -878,6 +882,7 @@ def found_linking_problems(NIH_username, user_id, user_email, my_st_logger, resu return True return False + def demo_process_success(auth, user_id, saml_response): retval = DemoLoginResults() st_logger = StackDriverLogger.build_from_django_settings() @@ -991,10 +996,115 @@ def demo_process_success(auth, user_id, saml_response): return retval +def get_dcf_auth_key_remaining_seconds(user_id): + """ + We need to know how many seconds are left before the user needs to log back in to NIH to get + a new refresh token, which will expire every 30 days. + """ + + dcf_token = DCFToken.objects.get(user_id=user_id) + + remaining_seconds = (dcf_token.refresh_expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() + logger.info('[INFO] user {} has {} seconds remaining on refresh token'. + format(dcf_token.nih_username, remaining_seconds)) + + return remaining_seconds + + +def handle_user_db_update_for_dcf_linking(user_id, user_data, nih_assertion_expiration, st_logger): + """ + When user logs into DCF using iTrust and links via DCF, we create an NIH record for them and link them to to their data. 
+ """ + nih_user = None + try: + st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, "[STATUS] Updating Django model for DCF") + + updated_values = { + 'NIH_assertion': None, # Unused + 'NIH_assertion_expiration': nih_assertion_expiration, + 'active': 1, + 'linked': True + } + + nih_user, created = NIH_User.objects.update_or_create(NIH_username=user_data['username'], + user_id=user_id, + defaults=updated_values) + + logger.info("[STATUS] NIH_User.objects.update_or_create() returned nih_user: {} and created: {}".format( + str(nih_user.NIH_username), str(created))) + st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, + "[STATUS] NIH_User.objects.update_or_create() returned nih_user: {} and created: {}".format( + str(nih_user.NIH_username), str(created))) + + our_user = User.objects.get(id=user_id) + dict_o_projects = user_data['project_access'] + + logger.info("[STATUS] NIH_User.objects updated nih_user for linking: {}".format( + str(nih_user.NIH_username))) + st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, + "[STATUS] NIH_User.objects updated nih_user for linking: {}".format( + str(nih_user.NIH_username))) + st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, + "[STATUS] NIH_User {} associated with email {}".format( + str(nih_user.NIH_username), our_user.email)) + + # default warn message is for eRA Commons users who are not dbGaP authorized + warn_message = ''' +
WARNING NOTICE
+ You are accessing a US Government web site which may contain information that must be protected under the US Privacy Act or other sensitive information and is intended for Government authorized use only.
+ Unauthorized attempts to upload information, change information, or use of this web site may result in disciplinary action, civil, and/or criminal penalties. Unauthorized users of this website should have no expectation of privacy regarding any communications or data processed by this website.
+ Anyone accessing this website expressly consents to monitoring of their actions and all communications or data transiting or stored on related to this website and is advised that if such monitoring reveals possible evidence of criminal activity, NIH may provide that evidence to law enforcement officials.
+ ''' + + except Exception as e: + st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, + "[ERROR] Exception while installing DCF linking: {}".format(str(e))) + logger.error("[ERROR] Exception while installing DCF linking: ") + logger.exception(e) + warn_message = "" + + if len(dict_o_projects) > 0: + # if user has access to one or more datasets, warn message is different + warn_message += ' You are reminded that when accessing controlled information you are bound by the dbGaP DATA USE CERTIFICATION AGREEMENT (DUCA) for each dataset. 
' + + return nih_user, warn_message + + +def unlink_account_in_db_for_dcf(user_id): + """ + This function modifies the 'NIH_User' objects! + + We find the NIH user(s) linked to the user_id, and set the Linked and Active states to False. We then remove their + authorized dataset records. This should only have to deal with one user, but we are set up to handle multiple users + to be safe. + + """ + + user_email = User.objects.get(id=user_id).email + nih_user_query_set = NIH_User.objects.filter(user_id=user_id, linked=True) + num_linked = len(nih_user_query_set) + + # If nobody is linked, we are actually done. There is nothing to do. + if num_linked == 0: + return None + elif num_linked > 1: + logger.warn("[WARNING] Found multiple linked accounts for user {}! Unlinking all accounts.".format(user_email)) + + for nih_account_to_unlink in nih_user_query_set: + nih_account_to_unlink.linked = False + nih_account_to_unlink.active = False + nih_account_to_unlink.save() + nih_account_to_unlink.delete_all_auth_datasets() + logger.info("[STATUS] Unlinked NIH User {} from user {}.".format(nih_account_to_unlink.NIH_username, user_email)) + + return None + + def handle_user_db_entry(user_id, NIH_username, user_email, auth_response, num_auth_datasets, NIH_assertion_expiration, st_logger): + try: - st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, "[STATUS] Updating Django model") + st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, "[STATUS] Updating Django model for DCF") updated_values = { 'NIH_assertion': auth_response, @@ -1045,8 +1155,7 @@ def handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, ad = AuthorizedDataset.objects.get(whitelist_id=dataset.dataset_id, acl_google_group=dataset.google_group_name) except (ObjectDoesNotExist, MultipleObjectsReturned) as e: - logger.error(( - "[ERROR] " + ( + logger.error(("[ERROR] " + ( "More than one dataset " if type(e) is MultipleObjectsReturned else "No dataset ") + "found for this ID and Google Group Name in the database: %s, %s") % ( dataset.dataset_id, dataset.google_group_name) @@ -1152,23 +1261,26 @@ def handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, else: logger.info("[STATUS] Added user {} to dataset {}.".format(user_email, ad.whitelist_id)) -def deactivate_nih_add_to_open(user_id, user_email): - try: - nih_user = NIH_User.objects.get(user_id=user_id, linked=True) - nih_user.active = False - nih_user.save() - logger.info("[STATUS] NIH user {} has been de-activated.".format(nih_user.NIH_username)) - except (ObjectDoesNotExist, MultipleObjectsReturned) as e: - if type(e) is MultipleObjectsReturned: - logger.error("[ERROR] More than one linked NIH User with user id {} - deactivating all of them!".format (str(e), user_id)) - nih_users = NIH_User.objects.filter(user_id=user_id) - for nih_user in nih_users: - nih_user.active = False - nih_user.save() - nih_user.delete_all_auth_datasets() - else: - logger.info("[STATUS] No linked NIH user was found for user {} - no one set to inactive.".format(user_email)) +def deactivate_nih_add_to_open(user_id, user_email): + # 5/14/18 NO! 
active flag has nothing to do with user logout, but instead is set to zero when user expires off of ACL group + # after 24 hours: + # try: + # nih_user = NIH_User.objects.get(user_id=user_id, linked=True) + # nih_user.active = False + # nih_user.save() + # logger.info("[STATUS] NIH user {} has been de-activated.".format(nih_user.NIH_username)) + # + # except (ObjectDoesNotExist, MultipleObjectsReturned) as e: + # if type(e) is MultipleObjectsReturned: + # logger.error("[ERROR] More than one linked NIH User with user id {} - deactivating all of them!".format (str(e), user_id)) + # nih_users = NIH_User.objects.filter(user_id=user_id) + # for nih_user in nih_users: + # nih_user.active = False + # nih_user.save() + # nih_user.delete_all_auth_datasets() + # else: + # logger.info("[STATUS] No linked NIH user was found for user {} - no one set to inactive.".format(user_email)) directory_service, http_auth = get_directory_resource() @@ -1185,20 +1297,68 @@ def deactivate_nih_add_to_open(user_id, user_email): def get_nih_user_details(user_id): user_details = {} - try: - nih_user = NIH_User.objects.get(user_id=user_id, linked=True) - user_auth_datasets = UserAuthorizedDatasets.objects.filter(nih_user=nih_user) - user_details['NIH_username'] = nih_user.NIH_username - user_details['NIH_assertion_expiration'] = nih_user.NIH_assertion_expiration - user_details['dbGaP_authorized'] = (len(user_auth_datasets) > 0) and nih_user.active - logger.debug("[DEBUG] User {} has access to {} dataset(s) and is {}".format(nih_user.NIH_username, str(len(user_auth_datasets)), ('not active' if not nih_user.active else 'active'))) - user_details['NIH_active'] = nih_user.active - user_details['auth_datasets'] = [] if len(user_auth_datasets) <= 0 else AuthorizedDataset.objects.filter(id__in=user_auth_datasets.values_list('authorized_dataset',flat=True)) - except (MultipleObjectsReturned, ObjectDoesNotExist), e: - if type(e) is MultipleObjectsReturned: - # in this case there is more than one nih_username linked to the same google identity - logger.warn("Error when retrieving nih_user with user_id {}. {}".format(str(user_id), str(e))) - # todo: add code to unlink all accounts? + # + # Now with DCF, we can have a user logged in as an NIH user, but not be linked (which means DCF does not + # have an association between NIH ID and Google ID). So while we previously did a get on a linked user, + # now we need to filter. If one of the users is linked, that is who we use. Otherwise, we can resolve the + # issue by looking at the current DCF token attached to the user to see who they are associated with. + # + + dcf_tokens = DCFToken.objects.filter(user_id=user_id) + if len(dcf_tokens) == 0: + return user_details + elif len(dcf_tokens) > 1: + logger.error("[ERROR] MULTIPLE DCF RECORDS FOR USER {}. 
".format(str(user_id))) + return user_details + + dcf_token = dcf_tokens.first() + nih_users = NIH_User.objects.filter(user_id=user_id, NIH_username=dcf_token.nih_username) + + if len(nih_users) == 0: + return user_details + + elif len(nih_users) == 1: + nih_user = nih_users.first() + + else: + nih_user = None + freshest_linked = None + freshest_linked_stamp = None + freshest_unlinked = None + freshest_unlinked_stamp = None + for user in nih_users: + if user.linked: + if (freshest_linked_stamp is None) or (freshest_linked_stamp < user.NIH_assertion_expiration): + freshest_linked_stamp = user.NIH_assertion_expiration + freshest_linked = user + if nih_user is None: + nih_user = nih_users.first() + else: + logger.error("[ERROR] Multiple linked nih users retrieved nih_user with user_id {}.".format(user_id)) + else: + if (freshest_unlinked_stamp is None) or (freshest_unlinked_stamp < user.NIH_assertion_expiration): + freshest_unlinked_stamp = user.NIH_assertion_expiration + freshest_unlinked = user + + if freshest_linked: + nih_user = freshest_linked + elif freshest_unlinked: + nih_user = freshest_unlinked + else: + logger.error("[ERROR] Unexpected lack of nih_user for {}.".format(user_id)) + return user_details + + user_auth_datasets = UserAuthorizedDatasets.objects.filter(nih_user=nih_user) + user_details['NIH_username'] = nih_user.NIH_username + user_details['NIH_assertion_expiration'] = nih_user.NIH_assertion_expiration + # Add a separate field to break out program count from active: + user_details['dbGaP_has_datasets'] = (len(user_auth_datasets) > 0) + user_details['dbGaP_authorized'] = (len(user_auth_datasets) > 0) and nih_user.active + logger.debug("[DEBUG] User {} has access to {} dataset(s) and is {}".format(nih_user.NIH_username, str(len(user_auth_datasets)), ('not active' if not nih_user.active else 'active'))) + user_details['NIH_active'] = nih_user.active + user_details['NIH_DCF_linked'] = nih_user.linked + user_details['refresh_key_ok'] = get_dcf_auth_key_remaining_seconds(user_id) > settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS + user_details['auth_datasets'] = [] if len(user_auth_datasets) <= 0 else AuthorizedDataset.objects.filter(id__in=user_auth_datasets.values_list('authorized_dataset',flat=True)) return user_details diff --git a/accounts/urls.py b/accounts/urls.py index f122afc6..94c11586 100755 --- a/accounts/urls.py +++ b/accounts/urls.py @@ -19,7 +19,7 @@ from django.conf.urls import url, include from allauth.socialaccount.providers.google import urls as google_urls, views as google_views -from . import views +from . 
import views, dcf_views urlpatterns = [ @@ -28,10 +28,15 @@ url(r'^logout', views.extended_logout_view, name='account_logout'), url(r'^login/$', google_views.oauth2_login, name='account_login'), # url(r'^nih_login/$', views.nih_login, name='nih_login'), - url(r'^dcf/login/callback/$', views.oauth2_callback, name='dcf_callback'), - url(r'^dcf_login/$', views.oauth2_login, name='dcf_login'), + url(r'^dcf/login/callback/$', dcf_views.oauth2_callback, name='dcf_callback'), + url(r'^dcf_link_callback/$', dcf_views.dcf_link_callback, name='dcf_link_callback'), + url(r'^dcf_link_extend/$', dcf_views.dcf_link_extend, name='dcf_link_extend'), + url(r'^dcf_disconnect_user/$', dcf_views.dcf_disconnect_user, name='dcf_disconnect_user'), + url(r'^dcf_user_data/$', dcf_views.dcf_get_user_data, name='dcf_get_user_data'), + url(r'^dcf_unlink/$', dcf_views.dcf_unlink, name='dcf_unlink'), + url(r'^dcf_login/$', dcf_views.oauth2_login, name='dcf_login'), + url(r'^dcf/test', dcf_views.test_the_dcf, name='dcf_test'), url(r'^unlink_accounts/', views.unlink_accounts, name='unlink_accounts'), - url(r'^dcf/test', views.test_the_dcf, name='dcf_test'), # Google Cloud Project related diff --git a/accounts/views.py b/accounts/views.py index 772eaf8c..d6ba752d 100755 --- a/accounts/views.py +++ b/accounts/views.py @@ -1,5 +1,5 @@ """ -Copyright 2017, Institute for Systems Biology +Copyright 2017-2018, Institute for Systems Biology Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -40,7 +40,9 @@ from sa_utils import verify_service_account, register_service_account, \ unregister_all_gcp_sa, unregister_sa_with_id, service_account_dict, \ do_nih_unlink, deactivate_nih_add_to_open, handle_user_db_entry, \ - found_linking_problems, DemoLoginResults, handle_user_for_dataset + found_linking_problems, DemoLoginResults, handle_user_for_dataset,\ + handle_user_db_update_for_dcf_linking, \ + unlink_account_in_db_for_dcf, get_dcf_auth_key_remaining_seconds from django.http import HttpResponseRedirect from requests_oauthlib.oauth2_session import OAuth2Session @@ -48,13 +50,13 @@ from base64 import urlsafe_b64decode import jwt from jwt.contrib.algorithms.pycrypto import RSAAlgorithm -from json import dumps as json_dumps +from json import loads as json_loads, dumps as json_dumps from dataset_utils.dataset_access_support_factory import DatasetAccessSupportFactory from dataset_utils.dataset_config import DatasetGoogleGroupPair +import requests +import httplib as http_client - - -import json +http_client.HTTPConnection.debuglevel = 1 logger = logging.getLogger('main_logger') @@ -67,9 +69,13 @@ DCF_AUTH_URL = settings.DCF_AUTH_URL DCF_TOKEN_URL = settings.DCF_TOKEN_URL DCF_USER_URL = settings.DCF_USER_URL +DCF_REVOKE_URL = settings.DCF_REVOKE_URL +DCF_GOOGLE_URL = settings.DCF_GOOGLE_URL +DCF_TOKEN_REFRESH_WINDOW_SECONDS = settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS @login_required def extended_logout_view(request): + response = None try: # deactivate NIH_username entry if exists @@ -518,7 +524,7 @@ def register_bucket(request, user_id, gcp_id): messages.error(request, 'Access to the bucket {0} in Google Cloud Project {1} was denied.'.format( bucket_name, gcp.project_id)) elif e.resp.get('content-type', '').startswith('application/json'): - err_val = json.loads(e.content).get('error') + err_val = json_loads(e.content).get('error') if err_val: e_message = err_val.get('message') else: @@ -745,302 +751,3 @@ def get_user_datasets(request,user_id): status='500' 
return JsonResponse(result, status=status) - -@login_required -def oauth2_login(request): - """ - First step of OAuth2 login ro DCF. Just build the URL that we send back to the browser in the refresh request - """ - full_callback = request.build_absolute_uri(reverse('dcf_callback')) - - # OAuth2Session ENFORCES https unless this environment variable is set. For local dev, we want that off - # so we can talk to localhost over http. But let's turn it on/off to minimize, and make it only active in - # development: - - if settings.IS_DEV and full_callback.startswith('http://localhost'): - os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1' - - dcf_secrets = _read_dict(settings.DCF_CLIENT_SECRETS) - - # Found that 'user' scope had to be included to be able to do the user query on callback, and the data scope - # to do data queries. Starting to recognize a pattern here... - oauth = OAuth2Session(dcf_secrets['DCF_CLIENT_ID'], redirect_uri=full_callback, scope=['openid', 'user', 'data']) - authorization_url, state = oauth.authorization_url(DCF_AUTH_URL) - # stash the state string in the session! - request.session['dcfOAuth2State'] = state - os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '0' - return HttpResponseRedirect(authorization_url) - - # For future reference, this also worked, using underlying oauthlib.oauth2 library: - # from oauthlib.oauth2 import WebApplicationClient - # wac = WebApplicationClient(social_account.client_id) - # rando = ''.join(random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(10)) - # ruri = wac.prepare_request_uri(DCF_AUTH_URL, redirect_uri=full_callback, state=rando, scope=['openid', 'user']) - # return HttpResponseRedirect(ruri) - -@login_required -def oauth2_callback(request): - """ - Second step of OAuth2 login to DCF. Takes the response redirect URL that DCF returned to the user's browser, - parse out the auth code, use it to get a token, then get user info from DCF using the token - """ - full_callback = request.build_absolute_uri(reverse('dcf_callback')) - - # For future reference, this also worked, using underlying requests library: - # data = { 'redirect_uri': full_callback, 'grant_type': 'authorization_code', 'code': request.GET['code']} - # auth = requests.auth.HTTPBasicAuth(social_app.client_id, social_app.secret) - # resp = requests.request('POST', DCF_TOKEN_URL, data=data, auth=auth) - # token_data = json.loads(resp.text) - # headers = {'Authorization': 'Bearer {}'.format(token_data['access_token'])} - # resp = requests.get(DCF_USER_URL, headers=headers) - - # OAuth2Session ENFORCES https unless this environment variable is set. FOr local dev, we want that off - # so we can talk to localhost over http. But let's turn it on/off to minimize, and make it only active in - # development: - - if settings.IS_DEV and full_callback.startswith('http://localhost'): - os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1' - - dcf_secrets = _read_dict(settings.DCF_CLIENT_SECRETS) - - if 'dcfOAuth2State' in request.session: - saved_state = request.session['dcfOAuth2State'] - else: - """Do something here to report the error""" - - # You MUST provide the callback *here* to get it into the fetch request - dcf = OAuth2Session(dcf_secrets['DCF_CLIENT_ID'], state=saved_state, redirect_uri=full_callback) - - # You MUST provide the client_id *here* (again!) in order to get this to do basic auth! DCF will not authorize - # unless we use basic auth (i.e. client ID and secret in the header, not the body). 
Plus we need to provide - # the authorization_response argument intead of a parsed-out code argument since this is a WebApplication flow. - # Note we also get back an "id_token" which is a base64-encoded JWT. - # Note we also get back a "token_type" which had better be "Bearer". - - token_data = dcf.fetch_token(DCF_TOKEN_URL, client_secret=dcf_secrets['DCF_CLIENT_SECRET'], - client_id=dcf_secrets['DCF_CLIENT_ID'], - authorization_response=request.get_full_path()) - - if token_data['token_type'] != 'Bearer': - """Do something here to report the error""" - - # - # Although user data can be extracted from the /user endpoint, DCF instructs us to pull the user information - # out of the JWT in the id_token. They also recommend we check that the JWT validates using the public - # key provided by their endpoint using the pyjwt package to do the work. - # - - id_token_b64 = token_data['id_token'] - - # - # PyJWT happens to want the cryptography package, but that involves C code, so we use the provided fallback of - # pycrypto, which we do use. The steps below are how they say to use the pycrypto implmentation, but note that - # we appear to need to create a new PyJWT() object so that it does not complain about previously registered - # algorithm, but also doesn't like is we unregister non-registered algorithms, or appear to provide an easy - # way to get at the global list of registered algorithms? - # - - my_jwt = jwt.PyJWT() - my_jwt.register_algorithm('RS256', RSAAlgorithm(RSAAlgorithm.SHA256)) - - # - # DCF's key endpoint provides a list of keys they use. Right now, only one, but to future-proof, we want - # to choose the right one from the list. But that means we need to parse the first element of the JWT tuple - # to know which key to use, even though we need the key to decode the tuple. (There has to be a better way - # that I am missing.) So, we need to break the id_token at the "." delimiting the tuples (base64decode PUKES - # on the "."). Then take the first element of the JWT and decode it: - # - - id_tokens_b64 = id_token_b64.split('.') - i64 = id_tokens_b64[0] - padded = i64 + '=' * (-len(i64) % 4) # Pad with =; Weird Python % with -length - id_token = urlsafe_b64decode(padded.encode("ascii")) - jwt_header = json.loads(id_token) - kid = jwt_header['kid'] - - # - # Get the key list from the endpoint and choose which one was used in the JWT: - # - - resp = dcf.get('https://qa.dcf.planx-pla.net/user/jwt/keys') - key_data = json.loads(resp.text) - key_list = key_data['keys'] - use_key = None - for key in key_list: - if key[0] == kid: - use_key = key[1] - - if use_key is None: - """Do something here to report the error""" - - # - # Decode the JWT! 
- # - - try: - alg_list = ['RS256'] - decoded_jwt = my_jwt.decode(id_token_b64, key=use_key, algorithms=alg_list, - audience=['openid', 'user', 'data', dcf_secrets['DCF_CLIENT_ID']]) - except Exception as e: - """Do something here to report the error""" - - # - # For reference, this is what I am seeing in the JWT: - # - # comp = {u'aud': [u'openid', u'user', u'data', u'Client ID'], - # u'iss': u'https://qa.dcf.planx-pla.net/user', - # u'iat': 1525732539, - # u'jti': u'big hex string with dashes', - # u'context': {u'user': {u'phone_number': u'', - # u'display_name': u'', - # u'name': u'email of NIH Username', - # u'is_admin': False, - # u'email': u'email address', - # u'projects': {u'qa': [u'read', u'read-storage'], - # u'test': [u'read', u'read-storage']}}}, - # u'auth_time': 1525732539, - # u'azp': u'Client ID', - # u'exp': 1525733739, - # u'pur': u'id', - # u'sub': u'integer use key'} - - nih_from_dcf = decoded_jwt['context']['user']['name'] - dcf_user_id = decoded_jwt['sub'] - dict_o_projects = decoded_jwt['context']['user']['projects'] - - # - # This also works to get user info from the DCF, though you need to have 'user' in the audience as well: - # - # resp = dcf.get(DCF_USER_URL) - # user_data = json.loads(resp.text) - # nih_from_dcf = user_data['username'] - # - - # - # For development, let's pretend that DCF actually returns an ERACommons ID: - # - - if nih_from_dcf == dcf_secrets['DEV_1_EMAIL']: - nih_from_dcf = dcf_secrets['DEV_1_NIH'] - - # We now have the NIH User ID back from DCF. We check that we don't have linking issues! - results = DemoLoginResults() - st_logger = StackDriverLogger.build_from_django_settings() - user_email = User.objects.get(id=request.user.id).email - if found_linking_problems(nih_from_dcf, request.user.id, user_email, st_logger, results): - """return the linking problem!""" - return redirect('dashboard') - - ## This is the place to link to Google??? But lotsa stuff needs to go into the session to be stored later? - - # We now will have the NIH User ID back from DCF. 
- - login_expiration_seconds = settings.LOGIN_EXPIRATION_MINUTES * 60 - nih_assertion_expiration = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( - seconds=login_expiration_seconds)) - - nih_user, warnings = handle_user_db_entry(request.user.id, nih_from_dcf, user_email, json_dumps(decoded_jwt), - len(dict_o_projects), nih_assertion_expiration, st_logger) - - _token_storage(token_data, nih_user.id, dcf_user_id) - - authorized_datasets = [] - all_datasets = [] - for project, perm_list in dict_o_projects.iteritems(): - if project == 'qa': - project = 'phs000178' - goog = 'isb-cgc-dev-cntl@isb-cgc.org' - elif project == 'test': - project = 'phs000218' - goog = 'isb-cgc-dev-cntl-target@isb-cgc.org' - ad = AuthorizedDataset.objects.get(whitelist_id=project) - authorized_datasets.append(DatasetGoogleGroupPair(project, goog)) #ad.acl_google_group)) - all_datasets.append(DatasetGoogleGroupPair(project, goog)) - - - # das = DatasetAccessSupportFactory.from_webapp_django_settings() - # all_datasets = das.get_all_datasets_and_google_groups() - - for dataset in all_datasets: - handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, False, None, None, st_logger) - - if warnings: - messages.warning(request, warnings) - os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '0' - return redirect('/users/' + str(request.user.id)) - - -def _token_storage(token_dict, nih_pk, dcf_uid): - - if token_dict.has_key('expires_at'): - expiration_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(token_dict['expires_at'])) - else: - print "Have to build an expiration time" - expiration_time = pytz.utc.localize( - datetime.datetime.utcnow() + datetime.timedelta(seconds=token_dict["expires_in"])) - - DCFToken.objects.update_or_create(nih_user_id=nih_pk, - defaults={ - 'dcf_user': dcf_uid, - 'access_token': token_dict['access_token'], - 'refresh_token': token_dict['refresh_token'], - 'expires_at': expiration_time - }) - -@login_required -def test_the_dcf(request): - """ - Use this to test that we can call the DCF and get back useful info. Also, use as a template for doing all - DCF calls - """ - file_uuid = 'ffcc4f7d-471a-4ad0-b199-53d992217986' - resp = _dcf_call('https://qa.dcf.planx-pla.net/user/data/download/{}'.format(file_uuid), request.user.id) - result = {'uri': resp.text} - return JsonResponse(result, status=resp.status_code) - - -def _dcf_call(full_url, user_id): - """ - All the stuff around a DCF call that handles token management and refreshes - """ - - dcf_secrets = _read_dict(settings.DCF_CLIENT_SECRETS) - - nih_user = NIH_User.objects.get(user_id=user_id, linked=True) - dcf_token = DCFToken.objects.get(nih_user=nih_user.id) - - expires_in = (dcf_token.expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() - print "Expiration : {} seconds".format(expires_in) - - token_dict = { - 'access_token' : dcf_token.access_token, - 'refresh_token' : dcf_token.refresh_token, - 'token_type' : 'Bearer', - 'expires_in' : expires_in - } - extra_dict = { - 'client_id' : dcf_secrets['DCF_CLIENT_ID'], - 'client_secret': dcf_secrets['DCF_CLIENT_SECRET'] - } - - def token_storage_for_user(my_token_dict): - _token_storage(my_token_dict, user_id, dcf_token.dcf_user) - - dcf = OAuth2Session(dcf_secrets['DCF_CLIENT_ID'], token=token_dict, auto_refresh_url=DCF_TOKEN_URL, - auto_refresh_kwargs=extra_dict, token_updater=token_storage_for_user) - - # Hoo boy! 
You *MUST* provide the client_id and client_secret in the call itself to insure an OAuth2Session token - # refresh call uses HTTPBasicAuth! - resp = dcf.get(full_url, client_id=dcf_secrets['DCF_CLIENT_ID'], client_secret=dcf_secrets['DCF_CLIENT_SECRET']) - return resp - - -def _read_dict(my_file_name): - retval = {} - with open(my_file_name, 'r') as f: - for line in f: - if '=' not in line: - continue - split_line = line.split('=') - retval[split_line[0].strip()] = split_line[1].strip() - return retval \ No newline at end of file From df2fb734b34b36de7fdce03f658fb2324c0309de Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Thu, 24 May 2018 17:48:58 -0700 Subject: [PATCH 05/76] ditch the bogus file --- accounts/migrations/0016_dcftoken.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/accounts/migrations/0016_dcftoken.py b/accounts/migrations/0016_dcftoken.py index aeeb2506..e69de29b 100644 --- a/accounts/migrations/0016_dcftoken.py +++ b/accounts/migrations/0016_dcftoken.py @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by Django 1.11.10 on 2018-05-05 01:18 -from __future__ import unicode_literals - -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - dependencies = [ - ('accounts', '0015_googleproject_active'), - ] - - operations = [ - migrations.CreateModel( - name='DCFToken', - fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('dcf_user', models.CharField(max_length=128)), - ('access_token', models.TextField()), - ('refresh_token', models.TextField()), - ('expires_at', models.DateTimeField()), - ('nih_user', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to='accounts.NIH_User')), - ], - ), - ] From 27af43299a05bd6b096baff65df1e0b603ceea92 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Thu, 24 May 2018 17:49:23 -0700 Subject: [PATCH 06/76] ditch the bogus file --- accounts/migrations/0016_dcftoken.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 accounts/migrations/0016_dcftoken.py diff --git a/accounts/migrations/0016_dcftoken.py b/accounts/migrations/0016_dcftoken.py deleted file mode 100644 index e69de29b..00000000 From cbf6e8c2851d7bb4d6f03f6fefe1d1d486b65afb Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Thu, 24 May 2018 18:15:53 -0700 Subject: [PATCH 07/76] Clean up unused stuff, turn off debugging HTTP --- accounts/dcf_views.py | 3 ++- accounts/views.py | 27 ++------------------------- 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index b2cde81e..7b086905 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -45,7 +45,8 @@ import httplib as http_client -http_client.HTTPConnection.debuglevel = 1 +# Shut this up unless we need to do debug of HTTP request contents +# http_client.HTTPConnection.debuglevel = 1 logger = logging.getLogger('main_logger') diff --git a/accounts/views.py b/accounts/views.py index d6ba752d..e2ad421f 100755 --- a/accounts/views.py +++ b/accounts/views.py @@ -39,24 +39,8 @@ from django.utils.html import escape from sa_utils import verify_service_account, register_service_account, \ unregister_all_gcp_sa, unregister_sa_with_id, service_account_dict, \ - do_nih_unlink, deactivate_nih_add_to_open, handle_user_db_entry, \ - found_linking_problems, DemoLoginResults, handle_user_for_dataset,\ - handle_user_db_update_for_dcf_linking, \ - unlink_account_in_db_for_dcf, 
get_dcf_auth_key_remaining_seconds - -from django.http import HttpResponseRedirect -from requests_oauthlib.oauth2_session import OAuth2Session -import os -from base64 import urlsafe_b64decode -import jwt -from jwt.contrib.algorithms.pycrypto import RSAAlgorithm -from json import loads as json_loads, dumps as json_dumps -from dataset_utils.dataset_access_support_factory import DatasetAccessSupportFactory -from dataset_utils.dataset_config import DatasetGoogleGroupPair -import requests -import httplib as http_client - -http_client.HTTPConnection.debuglevel = 1 + do_nih_unlink, deactivate_nih_add_to_open +from json import loads as json_loads logger = logging.getLogger('main_logger') @@ -66,13 +50,6 @@ GOOGLE_ORG_WHITELIST_PATH = settings.GOOGLE_ORG_WHITELIST_PATH MANAGED_SERVICE_ACCOUNTS_PATH = settings.MANAGED_SERVICE_ACCOUNTS_PATH -DCF_AUTH_URL = settings.DCF_AUTH_URL -DCF_TOKEN_URL = settings.DCF_TOKEN_URL -DCF_USER_URL = settings.DCF_USER_URL -DCF_REVOKE_URL = settings.DCF_REVOKE_URL -DCF_GOOGLE_URL = settings.DCF_GOOGLE_URL -DCF_TOKEN_REFRESH_WINDOW_SECONDS = settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS - @login_required def extended_logout_view(request): From 6a883eebe9a5c82e189d461e4632804423d9b19a Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Tue, 29 May 2018 18:06:43 -0700 Subject: [PATCH 08/76] More progress --- accounts/dcf_views.py | 99 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 88 insertions(+), 11 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 7b086905..a7844150 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -46,7 +46,7 @@ import httplib as http_client # Shut this up unless we need to do debug of HTTP request contents -# http_client.HTTPConnection.debuglevel = 1 +http_client.HTTPConnection.debuglevel = 1 logger = logging.getLogger('main_logger') @@ -249,6 +249,8 @@ def oauth2_callback(request): # out if the user is linked! # + print decoded_jwt + the_user_for_google_link = decoded_jwt['context']['user'] gotta_google_link = the_user_for_google_link.has_key('google') and \ @@ -417,8 +419,9 @@ def _finish_the_link(user_id, user_email, st_logger): dict_o_projects = the_user['project_access'] authorized_datasets = [] for project, perm_list in dict_o_projects.iteritems(): - ad = AuthorizedDataset.objects.get(whitelist_id=project) - authorized_datasets.append(DatasetGoogleGroupPair(project, ad.acl_google_group)) + adqs = AuthorizedDataset.objects.filter(whitelist_id=project) + if len(adqs) == 1: + authorized_datasets.append(DatasetGoogleGroupPair(project, adqs.first().acl_google_group)) das = DatasetAccessSupportFactory.from_webapp_django_settings() all_datasets = das.get_all_datasets_and_google_groups() @@ -445,6 +448,9 @@ def _massage_user_data_for_dev(the_user): """ dcf_secrets = _read_dict(settings.DCF_CLIENT_SECRETS) + if not dcf_secrets.has_key('DEV_1_EMAIL'): + return the_user + nih_from_dcf = the_user['username'] if nih_from_dcf == dcf_secrets['DEV_1_EMAIL']: nih_from_dcf = dcf_secrets['DEV_1_NIH'] @@ -482,7 +488,7 @@ def dcf_link_extend(request): messages.warning(request, "Unexpected response ({}) from DCF during linking. 
" "Please contact the ISB-CGC administrator.".format(resp.status_code)) - + print resp.text # Until we get back user expiration time, we calculate it: login_expiration_seconds = settings.LOGIN_EXPIRATION_MINUTES * 60 @@ -603,6 +609,16 @@ def _refresh_token_storage(token_dict, decoded_jwt, user_token, nih_username_fro }) +def print_dict(dictionary, ident = '', braces=1): + """ Recursively prints nested dictionaries.""" + + for key, value in dictionary.iteritems(): + if isinstance(value, dict): + print '%s%s%s%s' %(ident,braces*'[',key,braces*']') + print_dict(value, ident+' ', braces+1) + else: + print ident+'%s = %s' %(key, value) + def _access_token_storage(token_dict, cgc_uid): """ This call just replaces the access key part of the DCF record. Used when we use the @@ -619,6 +635,15 @@ def _access_token_storage(token_dict, cgc_uid): print 'Token storage. New token expires at {}'.format(str(expiration_time)) + print_dict(token_dict) + id_token = token_dict['id_token'] + id_tokens_b64 = id_token.split('.') + i64 = id_tokens_b64[1] + padded = i64 + '=' * (-len(i64) % 4) # Pad with =; Weird Python % with -length + id_token_decoded = urlsafe_b64decode(padded.encode("ascii")) + id_token_dict = json_loads(id_token_decoded) + print_dict(id_token_dict) + dcf_token = DCFToken.objects.get(user_id=cgc_uid) dcf_token.access_token = token_dict['access_token'] dcf_token.expires_at = expiration_time @@ -706,21 +731,73 @@ def dcf_disconnect_user(request): dcf_token = DCFToken.objects.get(user_id=request.user.id) dcf_token.delete() + logout_callback = request.build_absolute_uri(reverse('user_detail', args=[request.user.id])) + + callback = '{}?next_url={}'.format('https://qa.dcf.planx-pla.net/user/logout', logout_callback) + return HttpResponseRedirect(callback) + # redirect to user detail page - return redirect(reverse('user_detail', args=[request.user.id])) + # return redirect(reverse('user_detail', args=[request.user.id])) + + +@login_required +def dcf_user_data_from_token(request): + """ + Seems that we should be able to get full user info from the user endpoint, but it turns out that + the information in the token refresh is more complete. + """ + + + # + # OAuth2Session handles token refreshes under the covers. Here we want to do it explicitly not + # that we care about the refresh (we don't care about it), but we want the id_token contents. 
+ + dcf_token = DCFToken.objects.get(user_id=request.user.id) + + client_id, client_secret = _get_secrets() + + data = { + 'grant_type': 'refresh_token', + 'refresh_token': dcf_token.refresh_token, + 'client_id': client_id + } + auth = requests.auth.HTTPBasicAuth(client_id, client_secret) + resp = requests.request('POST', DCF_TOKEN_URL, data=data, auth=auth) + client_id = None + client_secret = None + + token_dict = json_loads(resp.text) + print_dict(token_dict) + id_token = token_dict['id_token'] + id_tokens_b64 = id_token.split('.') + i64 = id_tokens_b64[1] + padded = i64 + '=' * (-len(i64) % 4) # Pad with =; Weird Python % with -length + id_token_decoded = urlsafe_b64decode(padded.encode("ascii")) + id_token_dict = json_loads(id_token_decoded) + print_dict(id_token_dict) + + if resp.status_code != 200: + messages.warning(request, 'Token acquisition problem: {} : {}'.format(resp.status_code, resp.text)) + + messages.warning(request, 'TDCF Responded with {}'.format(id_token_decoded)) + # redirect to user detail page + return redirect(reverse('user_detail', args=[request.user.id])) @login_required def dcf_get_user_data(request): """ Use for QC and development """ - resp = _dcf_call(DCF_USER_URL, request.user.id) - user_data = json_loads(resp.text) - remaining_token_time = get_dcf_auth_key_remaining_seconds(request.user.id) - messages.warning(request, 'TDCF Responded with {}: {}'.format(user_data, remaining_token_time)) - return redirect(reverse('user_detail', args=[request.user.id])) + return dcf_user_data_from_token(request) + + # resp = _dcf_call(DCF_USER_URL, request.user.id) + # user_data = json_loads(resp.text) + # + # remaining_token_time = get_dcf_auth_key_remaining_seconds(request.user.id) + # messages.warning(request, 'TDCF Responded with {}: {}'.format(user_data, remaining_token_time)) + # return redirect(reverse('user_detail', args=[request.user.id])) def _dcf_call(full_url, user_id, mode='get', post_body=None): @@ -736,7 +813,7 @@ def _dcf_call(full_url, user_id, mode='get', post_body=None): 'access_token' : dcf_token.access_token, 'refresh_token' : dcf_token.refresh_token, 'token_type' : 'Bearer', - 'expires_in' : expires_in + 'expires_in' : -100 } def token_storage_for_user(my_token_dict): From 4edd3e980ca353004c9ab3a28add68cf70cf1793 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Mon, 18 Jun 2018 12:31:24 -0700 Subject: [PATCH 09/76] Work in progress --- accounts/dcf_views.py | 746 +++++++++++++++++++++++++++++++++--------- accounts/sa_utils.py | 44 ++- 2 files changed, 631 insertions(+), 159 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index a7844150..5d5aab97 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -35,7 +35,7 @@ handle_user_db_update_for_dcf_linking, \ unlink_account_in_db_for_dcf, get_dcf_auth_key_remaining_seconds -from models import DCFToken, AuthorizedDataset +from models import DCFToken, AuthorizedDataset, NIH_User from requests_oauthlib.oauth2_session import OAuth2Session from base64 import urlsafe_b64decode from jwt.contrib.algorithms.pycrypto import RSAAlgorithm @@ -55,6 +55,8 @@ DCF_USER_URL = settings.DCF_USER_URL DCF_REVOKE_URL = settings.DCF_REVOKE_URL DCF_GOOGLE_URL = settings.DCF_GOOGLE_URL +DCF_LOGOUT_URL = settings.DCF_LOGOUT_URL +DCF_URL_URL = settings.DCF_URL_URL DCF_TOKEN_REFRESH_WINDOW_SECONDS = settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS @login_required @@ -97,7 +99,7 @@ def oauth2_login(request): def oauth2_callback(request): """ Second step of OAuth2 login to DCF. 
Takes the response redirect URL that DCF returned to the user's browser, - parse out the auth code, use it to get a token, then get user info from DCF using the token + parse out the auth code and use it to get a token """ try: @@ -111,9 +113,25 @@ def oauth2_callback(request): # headers = {'Authorization': 'Bearer {}'.format(token_data['access_token'])} # resp = requests.get(DCF_USER_URL, headers=headers) - # OAuth2Session ENFORCES https unless this environment variable is set. FOr local dev, we want that off + # + # DCF now adding a user confirmation page to their flow. If the user says "no", the call back will report + # an error. We need to tell the user there is a problem + # + + error = request.GET.get('error', None) + if error: + error_description = request.GET.get('error_description', None) + if error_description == 'The resource owner or authorization server denied the request': + logger.error("[INFO] User did not allow ISB access") + messages.error(request, + "Login cannot continue if ISB-CGC is not allowed access to the Data Commons Framework") + return redirect(reverse('user_detail', args=[request.user.id])) + + # + # OAuth2Session ENFORCES https unless this environment variable is set. For local dev, we want that off # so we can talk to localhost over http. But let's turn it on/off to minimize, and make it only active in # development: + # if settings.IS_DEV and full_callback.startswith('http://localhost'): os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1' @@ -139,24 +157,16 @@ def oauth2_callback(request): token_data = dcf.fetch_token(DCF_TOKEN_URL, client_secret=client_secret, client_id=client_id, authorization_response=request.get_full_path()) - client_secret = None + client_secret = None # clear this in case we are in Debug mode to keep this out of the browser if token_data['token_type'] != 'Bearer': logger.error("[ERROR] Token type returned was not 'Bearer'") messages.error(request, "There was an internal error logging in. Please contact the ISB-CGC administrator.") return redirect(reverse('user_detail', args=[request.user.id])) - # - # Although user data can be extracted from the /user endpoint, DCF instructs us to pull the user information - # out of the JWT in the id_token. They also recommend we check that the JWT validates using the public - # key provided by their endpoint using the pyjwt package to do the work. - # - - id_token_b64 = token_data['id_token'] - # # PyJWT happens to want the cryptography package, but that involves C code, so we use the provided fallback of - # pycrypto, which we do use. The steps below are how they say to use the pycrypto implmentation, but note that + # pycrypto, which we do use. The steps below are how they say to use the pycrypto implementation, but note that # we appear to need to create a new PyJWT() object so that it does not complain about previously registered # algorithm, but also doesn't like if we unregister non-registered algorithms, or appear to provide an easy # way to get at the global list of registered algorithms? @@ -172,13 +182,13 @@ def oauth2_callback(request): # that I am missing.) So, we need to break the id_token at the "." delimiting the tuples (base64decode PUKES # on the "."). Then take the first element of the JWT and decode it: # + # Although user data can be extracted from the /user endpoint, DCF instructs us to pull the user information + # out of the JWT in the id_token. 
They also recommend we check that the JWT validates using the public + # key provided by their endpoint using the pyjwt package to do the work. + # - id_tokens_b64 = id_token_b64.split('.') - i64 = id_tokens_b64[0] - padded = i64 + '=' * (-len(i64) % 4) # Pad with =; Weird Python % with -length - id_token = urlsafe_b64decode(padded.encode("ascii")) - jwt_header = json_loads(id_token) - kid = jwt_header['kid'] + jwt_header_json, jwt_header_dict = _decode_token_chunk(token_data['id_token'], 0) + kid = jwt_header_dict['kid'] # # Get the key list from the endpoint and choose which one was used in the JWT: @@ -203,7 +213,7 @@ def oauth2_callback(request): try: alg_list = ['RS256'] - decoded_jwt = my_jwt.decode(id_token_b64, key=use_key, algorithms=alg_list, + decoded_jwt_id = my_jwt.decode(token_data['id_token'], key=use_key, algorithms=alg_list, audience=['openid', 'user', 'data', client_id]) except Exception as e: logger.error("[ERROR] Decoding JWT failure") @@ -212,7 +222,7 @@ def oauth2_callback(request): return redirect(reverse('user_detail', args=[request.user.id])) # - # For reference, this is what I am seeing in the JWT: + # For reference, this is what I am seeing in the JWT (May 2018): # # comp = {u'aud': [u'openid', u'user', u'data', u'Client ID'], # u'iss': u'https://The DCF server/user', @@ -232,36 +242,26 @@ def oauth2_callback(request): # u'pur': u'id', (The "purpose" of the token. This is an ID. Refresh tokens say "refresh") # u'sub': u'integer user key'} - dcf_user_id = decoded_jwt['sub'] + dcf_user_id = decoded_jwt_id['sub'] # - # User info is available in the JWT, but also from the user endpoint. We are going to use the endpoint - # since the info goes into the database, and we are going to be refreshing it frequently: + # Suck the data out of the user token to plunk into the database # - user_resp = dcf.get(DCF_USER_URL) - the_user = json_loads(user_resp.text) - the_user = _massage_user_data_for_dev(the_user) - nih_from_dcf = the_user['username'] + user_data_token_str, user_data_token_dict = _user_data_token_dict_massaged(decoded_jwt_id) - # - # BUT! DCF currently only returns google link data in the JWT. So we need to look there to figure - # out if the user is linked! - # - - print decoded_jwt + user_data_dict = _user_data_token_dict_to_user_dict(user_data_token_dict) - the_user_for_google_link = decoded_jwt['context']['user'] + nih_from_dcf = _get_nih_id_from_user_dict(user_data_dict) - gotta_google_link = the_user_for_google_link.has_key('google') and \ - the_user_for_google_link['google'].has_key('linked_google_account') - google_link = the_user_for_google_link['google']['linked_google_account'] if gotta_google_link else None + google_link = _get_google_link_from_user_dict(user_data_dict) # We now have the NIH User ID back from DCF; we also might now know the Google ID they have linked to previously # (it comes back in the user_id). Note that this routine is going to get called every 30 days or so when we # need to get a new refresh token, so it is possible that e.g. the first time they logged in as their PI and - # now are doing the legit thing of logging in as themselves. If we catch that problem, they need to unlink. Also, - # if DCF's idea of who they have linked to differs from ours (we keep a local copy), we need to handle that now! + # now are doing the legit thing of logging in as themselves. If we catch that problem, they need to + # unlink. Also, if DCF's idea of who they have linked to differs from ours (we keep a local copy), we need + # to handle that now! 
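As an aside, the hand-rolled segment decoding discussed above hinges on restoring base64url padding; a standalone sketch of the arithmetic (the function name is illustrative, not from the patch), where -len(segment) % 4 is the number of '=' characters needed because Python's % is non-negative for a positive modulus (e.g. -43 % 4 == 1, -44 % 4 == 0):

    import json
    from base64 import urlsafe_b64decode

    def decode_jwt_segment(segment):
        # JWT segments are unpadded base64url; pad back out to a multiple of 4.
        padded = segment + '=' * (-len(segment) % 4)
        return json.loads(urlsafe_b64decode(padded.encode('ascii')).decode('utf-8'))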
results = DemoLoginResults() st_logger = StackDriverLogger.build_from_django_settings() @@ -277,7 +277,7 @@ def oauth2_callback(request): # make the entry in the NIH_User table, since we need to now either establish or refresh the DCF-Google ID link: # - _refresh_token_storage(token_data, decoded_jwt, user_resp.text, nih_from_dcf, dcf_user_id, request.user.id, google_link) + _refresh_token_storage(token_data, decoded_jwt_id, user_data_token_str, nih_from_dcf, dcf_user_id, request.user.id, google_link) # # If user already has a google ID link, we would PATCH the endpoint to update it for 24 more hours. If @@ -286,12 +286,14 @@ def oauth2_callback(request): # GET, we wrap things up in the callback. For the PATCH, we wrap things up immediately: # - if gotta_google_link: + if google_link: # # It is possible that the first time the user logged in they provided the wrong email address to DCF and # then ignored us when we asked them to correct the problem. If DCF's provided Google ID does not match # ours, then they need to still provide us with the correct version before we let them use it! + # Also, if a user is trying to reuse the same NIH login, we expect to get back a Google ID from DCF that + # does not match the current user email. # req_user = User.objects.get(id=request.user.id) @@ -310,10 +312,17 @@ def oauth2_callback(request): elif resp.status_code == 200: pass else: - messages.warning(request, "Unexpected response ({}) from DCF during linking. " - "Please contact the ISB-CGC administrator.".format(resp.status_code)) + messages.warning(request, "Unexpected response ({}, {}) from DCF during linking. " + "Please contact the ISB-CGC administrator.".format(resp.status_code, resp.text)) + + print 'response {}'.format(str(resp.text)) + print 'PATCH ONLY RETURNS e.g. {"exp": 1528509163}' + + login_expiration_seconds = settings.LOGIN_EXPIRATION_MINUTES * 60 + calc_expiration_time = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( + seconds=login_expiration_seconds)) - warning = _finish_the_link(request.user.id, req_user.email, st_logger) + warning = _finish_the_link(request.user.id, req_user.email, calc_expiration_time, st_logger) messages.warning(request, warning) return redirect(reverse('user_detail', args=[request.user.id])) @@ -360,17 +369,64 @@ def dcf_link_callback(request): 'to the ISB-CGC administrator'.format(error, message, error_description)) return redirect(reverse('user_detail', args=[request.user.id])) + # + # The callback provides us with both the link expiration and the user ID that was linked. BUT THIS IS + # COMING FROM THE USER, IS NOT SIGNED, AND SO CANNOT BE TRUSTED! Pull them out and verify them. 
If things + # are not too crazy, we accept the value we are sent: + # + + returned_expiration_str = request.GET.get('exp', None) + returned_google_link = request.GET.get('linked_email', None) + + returned_expiration_time = None + if returned_expiration_str: + exp_secs = float(returned_expiration_str) + returned_expiration_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(exp_secs)) + + login_expiration_seconds = settings.LOGIN_EXPIRATION_MINUTES * 60 + calc_expiration_time = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( + seconds=login_expiration_seconds)) + if returned_expiration_time: + diff = returned_expiration_time - calc_expiration_time + secs = abs((diff.days * (3600 * 24)) + diff.seconds) + if secs > 30: + logger.error("WARNING: DCF RETURNED TIME SKEW OF {} SECONDS".format(secs)) + else: + logger.info("DCF expiration skew was {} seconds".format(secs)) + calc_expiration_time = returned_expiration_time + else: + logger.error("No expiration time provided by DCF") + # # At this point, we need to wrestle with the possible problem that the user has linked # to a DIFFERENT GoogleID while off messing with DCF. If the ID that comes back is not # identical to what we think it is. They need to go and do it again! # + the_user_token_string = _get_user_data_token_string(request.user.id) # a string + the_user_token_dict = json_loads(the_user_token_string) + the_user_dict = the_user_token_dict['context']['user'] + + google_link = _get_google_link_from_user_dict(the_user_dict) + + if returned_google_link: + if google_link != returned_google_link: + logger.error("WARNING: DCF RETURNED CONFLICTING GOOGLE LINK {} VERSUS {}".format(returned_google_link, + google_link)) + else: + logger.info("DCF provided google link was consistent") + else: + logger.error("No google link provided by DCF") + + if google_link is None: + messages.warning(request, 'Error detected during linking. ' + 'No Google User ID returned. Please report this ' + 'to the ISB-CGC administrator') + return redirect(reverse('user_detail', args=[request.user.id])) + req_user = User.objects.get(id=request.user.id) - resp = _dcf_call(DCF_USER_URL, request.user.id) - user_data = json_loads(resp.text) - if user_data['email'] != req_user.email: - message = "Please unlink ID {} and use your ISB-CGC login email ({}) to link with the DCF".format(user_data['email'], req_user.email) + if google_link != req_user.email: + message = "Please unlink ID {} and use your ISB-CGC login email ({}) to link with the DCF".format(google_link, req_user.email) messages.warning(request, message) return redirect(reverse('user_detail', args=[request.user.id])) @@ -378,27 +434,24 @@ def dcf_link_callback(request): # If all is well, this is where we add the user to the NIH_User table and link the user to the various data sets. # - warning = _finish_the_link(request.user.id, user_data['email'], st_logger) + warning = _finish_the_link(request.user.id, google_link, calc_expiration_time, st_logger) if warning: messages.warning(request, warning) return redirect(reverse('user_detail', args=[request.user.id])) -def _finish_the_link(user_id, user_email, st_logger): +def _finish_the_link(user_id, user_email, expiration_time, st_logger): """ Regardless of how they get here, this step handles the linking of the user by adding the required database records. 
""" - # Until we get back user expiration time, we calculate it: - login_expiration_seconds = settings.LOGIN_EXPIRATION_MINUTES * 60 - nih_assertion_expiration = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( - seconds=login_expiration_seconds)) + nih_assertion_expiration = expiration_time # # Until we get back current projects, refresh it: # - the_user = _get_user_data(user_id) + the_user_token = _get_user_data_token_string(user_id) # the_user is a string # # Save the new info from the DCF: @@ -406,17 +459,18 @@ def _finish_the_link(user_id, user_email, st_logger): dcf_token = DCFToken.objects.get(user_id=user_id) if dcf_token.google_id is not None and dcf_token.google_id != user_email: - # FIXME - print "WE HAVE A PROBLEM" + return 'Unexpected internal error detected during linking: email/ID mismatch. ' \ + 'Please report this to the ISB-CGC administrator' dcf_token.google_id = user_email - dcf_token.user_token = json_dumps(the_user) + dcf_token.user_token = the_user_token dcf_token.save() - nih_user, warning = handle_user_db_update_for_dcf_linking(user_id, the_user, + the_user_dict = _user_data_token_to_user_dict(the_user_token) + nih_user, warning = handle_user_db_update_for_dcf_linking(user_id, the_user_dict, nih_assertion_expiration, st_logger) - dict_o_projects = the_user['project_access'] + dict_o_projects = _get_projects_from_user_dict(the_user_dict) authorized_datasets = [] for project, perm_list in dict_o_projects.iteritems(): adqs = AuthorizedDataset.objects.filter(whitelist_id=project) @@ -431,14 +485,302 @@ def _finish_the_link(user_id, user_email, st_logger): return warning -def _get_user_data(user_id): + +class GoogleLinkState: + BOTH_NULL = 1 + DCF_NULL_CGC_NON_NULL = 2 + DCF_NON_NULL_CGC_NULL = 3 + MATCHING_BAD = 4 + MATCHING_OK = 5 + NON_MATCHING_DCF_BAD = 6 + NON_MATCHING_CGC_BAD = 7 + NON_MATCHING_ALL_BAD = 8 + +def _compare_google_ids(dcf_version, cgc_version, user_email): + """ + When we get new tokens from DCF, we want to sanity check if the Google IDs are in agreement. + """ + + if dcf_version != cgc_version: + # Most likely possibility is that either DCF or us thinks the google ID is None and the other doesn't. Another + # possibility is that DCF has another Google ID for the user that is not consistent with the + # one we *want* them to be using. That case *should* have been caught when they first tried to link. + # + # If link IDs do not match, we need match ours to DCF, and flag the problem + if dcf_version is None: + google_match_state = GoogleLinkState.DCF_NULL_CGC_NON_NULL + elif cgc_version is None: + google_match_state = GoogleLinkState.DCF_NON_NULL_CGC_NULL + elif dcf_version == user_email: + google_match_state = GoogleLinkState.NON_MATCHING_CGC_BAD + elif cgc_version == user_email: + google_match_state = GoogleLinkState.NON_MATCHING_DCF_BAD + else: + google_match_state = GoogleLinkState.NON_MATCHING_ALL_BAD + # Next three cases handle matching GoogleIDs: + elif dcf_version is None: + google_match_state = GoogleLinkState.BOTH_NULL + elif dcf_version == user_email: + google_match_state = GoogleLinkState.MATCHING_OK + elif dcf_version != user_email: + google_match_state = GoogleLinkState.MATCHING_BAD + + return google_match_state + + +def _refresh_from_dcf(user_id): + """ + We would like to check if our view of the user (linkage, expirations, datasets) is consistent with what the + DCF thinks, and update accordingly! 
+ """ + + user_email = User.objects.get(id=user_id).email + + # + # Haul the user data token string down from DCF: + # + + the_user_token = _get_user_data_token_string(user_id) # the_user_token is a string + + # + # Things that could be different: Google ID linkage, expiration time, approved datasets. + # Right now, we are not provided with expiration time, so we cannot check that. While NIH linkage + # could change in theory, that is fixed via DCF for the life of a refresh token. User could only change + # that by logging out/disconnecting from DCF and going back in again, which would give us a new refresh + # token. + # + + the_user_dict = _user_data_token_to_user_dict(the_user_token) + + dcf_google_link = _get_google_link_from_user_dict(the_user_dict) + nih_id = _get_nih_id_from_user_dict(the_user_dict) + dict_o_projects = _get_projects_from_user_dict(the_user_dict) + + # + # Compare to our versions: + # + + dcf_token = DCFToken.objects.get(user_id=user_id) + + google_match_state = _compare_google_ids(dcf_google_link, dcf_token.google_id, user_email) + google_problem = None + + if google_match_state != GoogleLinkState.MATCHING_OK and google_match_state != GoogleLinkState.BOTH_NULL: + dcf_token.google_id = dcf_google_link + google_problem = google_match_state + + if nih_id.lower() != dcf_token.nih_username_lower: + logger.error("ERROR: UNEXPECTED NIH_USER_ID MISMATCH {} VERSUS {}".format(nih_id.lower(), + dcf_token.nih_username_lower)) + + # + # If everything was consistent, if DCF tells the user is linked to an NIH ID, we would have that ID as one and + # only one linked record in our DB. + # + + # Note the use of __iexact does case insensitive match: + nih_users_for_id = NIH_User.objects.filter(user_id=user_id, NIH_username__iexact=nih_id, linked=True) + if len(nih_users_for_id) == 1: + print "All is good" + else: + nih_users_for_user = NIH_User.objects.filter(user_id=user_id) + nih_users_for_id = NIH_User.objects.filter(NIH_username__iexact=nih_id) + + # If user logged into DCF but did not get the linking done correctly, the token will provide us with the + # NIH ID they are using, but we will NOT have a current linked record in the NIH_User table. + + wafjwophfwfHIGwfpsiFif + + + if dcf_token.google_id is not None and dcf_token.google_id != user_email: + return 'Unexpected internal error detected during linking: email/ID mismatch. 
' \ + 'Please report this to the ISB-CGC administrator' + + dcf_token.google_id = user_email + dcf_token.user_token = the_user_token + dcf_token.save() + + nih_user, warning = handle_user_db_update_for_dcf_linking(user_id, the_user_dict, + nih_assertion_expiration, st_logger) + + + authorized_datasets = [] + for project, perm_list in dict_o_projects.iteritems(): + adqs = AuthorizedDataset.objects.filter(whitelist_id=project) + if len(adqs) == 1: + authorized_datasets.append(DatasetGoogleGroupPair(project, adqs.first().acl_google_group)) + + das = DatasetAccessSupportFactory.from_webapp_django_settings() + all_datasets = das.get_all_datasets_and_google_groups() + + for dataset in all_datasets: + handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, False, None, None, st_logger) + + return warning + + +def _user_data_token_dict_massaged(the_user_token_dict): + """ + Takes the user data token dictionary (as returned by DCF) and returns massaged user-only string AND dict + + """ + the_user_dict = the_user_token_dict['context']['user'] + the_massaged_dict = _massage_user_data_for_dev(the_user_dict) + the_user_token_dict['context']['user'] = the_massaged_dict + return json_dumps(the_user_token_dict), the_user_token_dict + + +def _user_data_token_massaged(user_data_token_string): + """ + Takes the user data token string and returns user-only string AND dict + + """ + the_user_token_dict = json_loads(user_data_token_string) + the_user_dict = the_user_token_dict['context']['user'] + the_massaged_dict = _massage_user_data_for_dev(the_user_dict) + the_user_token_dict['context']['user'] = the_massaged_dict + return json_dumps(the_user_token_dict), the_user_token_dict + + +def _get_projects_from_user_dict(the_user_dict): + """ + The dict schema and keys vary depending on whether is comes from token or user data endpoint. Hide this fact! + + """ + return the_user_dict['projects'] + + +def _set_projects_for_user_dict(the_user_dict, projects): + """ + The dict schema and keys vary depending on whether is comes from token or user data endpoint. Hide this fact! + + """ + the_user_dict['projects'] = projects + return + + +def _get_nih_id_from_user_dict(the_user_dict): + """ + The dict schema and keys vary depending on whether is comes from token or user data endpoint. Hide this fact! + + """ + return the_user_dict['name'] + +def _set_nih_id_for_user_dict(the_user_dict, nih_id): + """ + The dict schema and keys vary depending on whether is comes from token or user data endpoint. Hide this fact! + + """ + the_user_dict['name'] = nih_id + return + + +def _get_google_link_from_user_dict(the_user_dict): + """ + The dict schema and keys vary depending on whether is comes from token or user data endpoint. Hide this fact! 
+ + """ + gotta_google_link = the_user_dict.has_key('google') and \ + the_user_dict['google'].has_key('linked_google_account') + google_link = the_user_dict['google']['linked_google_account'] if gotta_google_link else None + return google_link + + +def _user_data_token_to_user_dict(user_data_token_string): + """ + Takes the user data token string (as returned by DCF and stored in database) and returns user-only dict + + """ + the_user_token_dict = json_loads(user_data_token_string) + print "UDTS", user_data_token_string + the_user_dict = the_user_token_dict['context']['user'] + return the_user_dict + + +def _user_data_token_dict_to_user_dict(the_user_token_dict): + """ + Takes the user data token dict and returns user-only dict + + """ + the_user_dict = the_user_token_dict['context']['user'] + return the_user_dict + + +def _get_user_data_token_string(user_id): """ Get up-to-date user data from DCF, massage as needed """ - resp = _dcf_call(DCF_USER_URL, user_id) - the_user = json_loads(resp.text) + # The user endpoint is spotty at the moment (6/5/18) so we drag it out of the token instead + #resp = _dcf_call(DCF_USER_URL, user_id) + #the_user = json_loads(resp.text) + + the_user_id_token, _ = _user_data_from_token(user_id) + + massaged_string, _ = _user_data_token_massaged(the_user_id_token) + + return massaged_string + + +def _user_data_from_token(user_id): + """ + Seems that we should be able to get full user info from the user endpoint, but it turns out that + the information in the token refresh is more complete. + """ + + # + # OAuth2Session handles token refreshes under the covers. Here we want to do it explicitly. We + # do not care about the refresh, but we want the id_token contents. + # Note THIS WILL NOT WORK IF REFRESH TOKEN HAS EXPIRED! + # + + dcf_token = DCFToken.objects.get(user_id=user_id) + + client_id, client_secret = _get_secrets() + + data = { + 'grant_type': 'refresh_token', + 'refresh_token': dcf_token.refresh_token, + 'client_id': client_id + } + auth = requests.auth.HTTPBasicAuth(client_id, client_secret) + resp = requests.request('POST', DCF_TOKEN_URL, data=data, auth=auth) + client_id = None + client_secret = None + if resp.status_code != 200: + logger.error("[INFO] Token acquisition problem: {} : {}".format(resp.status_code, resp.text)) + return None, None + + token_dict = json_loads(resp.text) + id_token_decoded, id_token_dict = _decode_token(token_dict['id_token']) + + return id_token_decoded, id_token_dict + + +def _refresh_access_token(user_id): + """ + DCF suggests we refresh the access token after e.g. unlinking. OAuth2Session usually handles token refreshes + # under the covers, but here we want to do it explicitly. 
+ """ - return _massage_user_data_for_dev(the_user) + dcf_token = DCFToken.objects.get(user_id=user_id) + + client_id, client_secret = _get_secrets() + + data = { + 'grant_type': 'refresh_token', + 'refresh_token': dcf_token.refresh_token, + 'client_id': client_id + } + auth = requests.auth.HTTPBasicAuth(client_id, client_secret) + resp = requests.request('POST', DCF_TOKEN_URL, data=data, auth=auth) + client_id = None + client_secret = None + if resp.status_code != 200: + logger.error("[INFO] Token acquisition problem: {} : {}".format(resp.status_code, resp.text)) + return None, None + + token_dict = json_loads(resp.text) + _access_token_storage(token_dict, user_id) def _massage_user_data_for_dev(the_user): @@ -451,12 +793,12 @@ def _massage_user_data_for_dev(the_user): if not dcf_secrets.has_key('DEV_1_EMAIL'): return the_user - nih_from_dcf = the_user['username'] + nih_from_dcf = _get_nih_id_from_user_dict(the_user) if nih_from_dcf == dcf_secrets['DEV_1_EMAIL']: nih_from_dcf = dcf_secrets['DEV_1_NIH'] - the_user['username'] = nih_from_dcf + _set_nih_id_for_user_dict(the_user, nih_from_dcf) - dict_o_projects = the_user['project_access'] + dict_o_projects = _get_projects_from_user_dict(the_user) new_dict_o_projects = {} for project, perm_list in dict_o_projects.iteritems(): # DCF QA returns bogus project info. Do this mapping as a workaround: @@ -465,7 +807,7 @@ def _massage_user_data_for_dev(the_user): elif project == dcf_secrets['DEV_2_PROJ']: project = dcf_secrets['DEV_2_MAPPED_PROJ'] new_dict_o_projects[project] = perm_list - the_user['project_access'] = new_dict_o_projects + _set_projects_for_user_dict(the_user, new_dict_o_projects) return the_user @@ -489,6 +831,8 @@ def dcf_link_extend(request): "Please contact the ISB-CGC administrator.".format(resp.status_code)) print resp.text + print 'PATCH ONLY RETURNS e.g. {"exp": 1528509163}' + print "NO! TIME TO USE THE EXPIRATION" # Until we get back user expiration time, we calculate it: login_expiration_seconds = settings.LOGIN_EXPIRATION_MINUTES * 60 @@ -496,9 +840,10 @@ def dcf_link_extend(request): seconds=login_expiration_seconds)) # User data set permissions might have changed, so we call and find out what they are: - user_data = _get_user_data(request.user.id) + user_data_token_string = _get_user_data_token_string(request.user.id) + user_data_dict = _user_data_token_to_user_dict(user_data_token_string) - _, warning = handle_user_db_update_for_dcf_linking(request.user.id, user_data, nih_assertion_expiration, st_logger) + _, warning = handle_user_db_update_for_dcf_linking(request.user.id, user_data_dict, nih_assertion_expiration, st_logger) if warning: messages.warning(request, warning) @@ -513,6 +858,17 @@ def dcf_unlink(request): still able to talk to DCF using their NIH ID. For a traditional unlink, we use dcf_disconnect_user: """ + # DO NOT UNLINK IF NOT CURRENTLY LINKED + + dcf_token = DCFToken.objects.get(user_id=request.user.id) + the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) + + google_link = _get_google_link_from_user_dict(the_user_dict) + + if google_link is None: + messages.warning(request, "User is not linked to Google") # redirect to user detail page + return redirect(reverse('user_detail', args=[request.user.id])) + # # First, call DCF to drop the linkage. This is the only way to get the user # booted out of control groups. 
@@ -530,6 +886,22 @@ def dcf_unlink(request): else: messages.warning(request, "Unexpected response from DCF") + # + # Per discussions with DCF, need to ask for a new token from DCF after doing the unlinking + # since they care about the token info: + # + + _refresh_access_token(request.user.id) + + # + # The Token table records the User's Google ID. This needs to be nulled. The expiration time in the DCFToken + # is for the access token, not the google link (that info is stored in the NIH_user: + # + + dcf_token = DCFToken.objects.get(user_id=request.user.id) + dcf_token.google_id = None + dcf_token.save() + # # Now drop the link flag and active flag from the DB, plus our db records of what datasets the user is # good for: @@ -552,7 +924,7 @@ def dcf_unlink(request): def _refresh_token_storage(token_dict, decoded_jwt, user_token, nih_username_from_dcf, dcf_uid, cgc_uid, google_id): """ This is called when the user needs to get a new 30-day refresh token from DCF by logging into - NIH (or if they unlink and need to reauthenticate to DCF again). + NIH (or if they explicitly disconnect their NIH identity and need to reauthenticate to DCF again). """ # @@ -609,19 +981,9 @@ def _refresh_token_storage(token_dict, decoded_jwt, user_token, nih_username_fro }) -def print_dict(dictionary, ident = '', braces=1): - """ Recursively prints nested dictionaries.""" - - for key, value in dictionary.iteritems(): - if isinstance(value, dict): - print '%s%s%s%s' %(ident,braces*'[',key,braces*']') - print_dict(value, ident+' ', braces+1) - else: - print ident+'%s = %s' %(key, value) - def _access_token_storage(token_dict, cgc_uid): """ - This call just replaces the access key part of the DCF record. Used when we use the + This call just replaces the access key and user token part of the DCF record. Used when we use the refresh token to get a new access key. """ @@ -635,20 +997,40 @@ def _access_token_storage(token_dict, cgc_uid): print 'Token storage. New token expires at {}'.format(str(expiration_time)) - print_dict(token_dict) - id_token = token_dict['id_token'] - id_tokens_b64 = id_token.split('.') - i64 = id_tokens_b64[1] - padded = i64 + '=' * (-len(i64) % 4) # Pad with =; Weird Python % with -length - id_token_decoded = urlsafe_b64decode(padded.encode("ascii")) - id_token_dict = json_loads(id_token_decoded) - print_dict(id_token_dict) + # + # Right now (5/30/18) we only get full user info back during the token refresh call. 
So decode + # it and stash it as well: + # + id_token_decoded, _ = _decode_token(token_dict['id_token']) + print 'id_token', id_token_decoded + print 'access_token', token_dict['access_token'] dcf_token = DCFToken.objects.get(user_id=cgc_uid) dcf_token.access_token = token_dict['access_token'] + dcf_token.user_token = id_token_decoded dcf_token.expires_at = expiration_time dcf_token.save() + +def _decode_token_chunk(token, index): + """ + Decode a given chunk of the token and return it as a JSON string and as a dict + """ + tokens_b64 = token.split('.') + i64 = tokens_b64[index] + padded = i64 + '=' * (-len(i64) % 4) # Pad with =; Weird Python % with -length + token_decoded = urlsafe_b64decode(padded.encode("ascii")) + token_dict = json_loads(token_decoded) + return token_decoded, token_dict + + +def _decode_token(token): + """ + Decode the token and return it as a JSON string and as a dict + """ + return _decode_token_chunk(token, 1) + + @login_required def test_the_dcf(request): """ @@ -656,7 +1038,7 @@ def test_the_dcf(request): DCF calls """ file_uuid = 'ffcc4f7d-471a-4ad0-b199-53d992217986' - resp = _dcf_call('https://qa.dcf.planx-pla.net/user/data/download/{}'.format(file_uuid), request.user.id) + resp = _dcf_call('{}/{}'.format(DCF_URL_URL, file_uuid), request.user.id) result = { 'uri': resp.text, 'code': resp.status_code @@ -676,31 +1058,37 @@ def dcf_disconnect_user(request): """ # First thing ya gotta do is tell DCF to unlink the user, which will get them out of - # access control groups: + # access control groups. BUT ONLY IF THEY ARE ACTUALLY CURRENTLY LINKED! msg_list = [] - resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='delete') - if resp.status_code == 404: - msg_list.append("No linked Google account found for user, code {}".format(resp.status_code)) - elif resp.status_code == 400: - delete_response = json_loads(resp.text) - error = delete_response['error'] - message = delete_response['error_description'] - msg_list.append("Error in unlinking: {} : {} : {}".format(error, message, resp.status_code)) - elif resp.status_code == 200: - pass - else: - msg_list.append(request, "Unexpected response from DCF {}".format(resp.status_code)) + + dcf_token = DCFToken.objects.get(user_id=request.user.id) + the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) + + print the_user_dict, type(the_user_dict) + google_link = _get_google_link_from_user_dict(the_user_dict) + + if google_link: + resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='delete') + if resp.status_code == 404: + msg_list.append("No linked Google account found for user, code {}".format(resp.status_code)) + elif resp.status_code == 400: + delete_response = json_loads(resp.text) + error = delete_response['error'] + message = delete_response['error_description'] + msg_list.append("Error in unlinking: {} : {} : {}".format(error, message, resp.status_code)) + elif resp.status_code == 200: + pass + else: + msg_list.append(request, "Unexpected response from DCF {}".format(resp.status_code)) # - # The revoke call is unlike other DCF endpoints in that it is a special! + # The revoke call is unlike other DCF endpoints in that it is special! # Token revocation is described here: https://tools.ietf.org/html/rfc7009#section-2.1 # So we do not provide a bearer access token, but the client ID and secret in a Basic Auth # framework. 
Not seeing that inside the OAuthSession framework, so we roll our own by hand: # - dcf_token = DCFToken.objects.get(user_id=request.user.id) - client_id, client_secret = _get_secrets() data = { @@ -724,73 +1112,47 @@ def dcf_disconnect_user(request): unlink_account_in_db_for_dcf(request.user.id) # - # Finally, we clear out our tokens for the user (which allows them to appear to DCF as the + # Next, we clear out our tokens for the user (which allows them to appear to DCF as the # logged-in NIH user; we cannot keep them around: # - dcf_token = DCFToken.objects.get(user_id=request.user.id) dcf_token.delete() + # + # Finally, we need to send the user to logout from the DCF, which is needed to clear the + # cookies DCF has dumped into their browser, which will allow them to log in to NIH again. + # + logout_callback = request.build_absolute_uri(reverse('user_detail', args=[request.user.id])) - callback = '{}?next_url={}'.format('https://qa.dcf.planx-pla.net/user/logout', logout_callback) + callback = '{}?next={}'.format(DCF_LOGOUT_URL, logout_callback) return HttpResponseRedirect(callback) - # redirect to user detail page - # return redirect(reverse('user_detail', args=[request.user.id])) - -@login_required -def dcf_user_data_from_token(request): +def _dcf_user_data_from_token(request): """ Seems that we should be able to get full user info from the user endpoint, but it turns out that the information in the token refresh is more complete. """ + id_token_decoded, id_token_dict = _user_data_from_token(request.user.id) - # - # OAuth2Session handles token refreshes under the covers. Here we want to do it explicitly not - # that we care about the refresh (we don't care about it), but we want the id_token contents. - - dcf_token = DCFToken.objects.get(user_id=request.user.id) - - client_id, client_secret = _get_secrets() - - data = { - 'grant_type': 'refresh_token', - 'refresh_token': dcf_token.refresh_token, - 'client_id': client_id - } - auth = requests.auth.HTTPBasicAuth(client_id, client_secret) - resp = requests.request('POST', DCF_TOKEN_URL, data=data, auth=auth) - client_id = None - client_secret = None - - token_dict = json_loads(resp.text) - print_dict(token_dict) - id_token = token_dict['id_token'] - id_tokens_b64 = id_token.split('.') - i64 = id_tokens_b64[1] - padded = i64 + '=' * (-len(i64) % 4) # Pad with =; Weird Python % with -length - id_token_decoded = urlsafe_b64decode(padded.encode("ascii")) - id_token_dict = json_loads(id_token_decoded) - print_dict(id_token_dict) - - if resp.status_code != 200: - messages.warning(request, 'Token acquisition problem: {} : {}'.format(resp.status_code, resp.text)) - - messages.warning(request, 'TDCF Responded with {}'.format(id_token_decoded)) + if id_token_decoded is not None: + messages.warning(request, 'TDCF Responded with {}'.format(id_token_decoded)) + else: + messages.warning(request, 'Token acquisition problem') # redirect to user detail page return redirect(reverse('user_detail', args=[request.user.id])) + @login_required def dcf_get_user_data(request): """ Use for QC and development """ - return dcf_user_data_from_token(request) + return _dcf_user_data_from_token(request) # resp = _dcf_call(DCF_USER_URL, request.user.id) # user_data = json_loads(resp.text) @@ -800,7 +1162,7 @@ def dcf_get_user_data(request): # return redirect(reverse('user_detail', args=[request.user.id])) -def _dcf_call(full_url, user_id, mode='get', post_body=None): +def _dcf_call(full_url, user_id, mode='get', post_body=None, force_token=False): """ All the stuff 
around a DCF call that handles token management and refreshes. """ @@ -813,7 +1175,7 @@ def _dcf_call(full_url, user_id, mode='get', post_body=None): 'access_token' : dcf_token.access_token, 'refresh_token' : dcf_token.refresh_token, 'token_type' : 'Bearer', - 'expires_in' : -100 + 'expires_in' : -100 if force_token else expires_in } def token_storage_for_user(my_token_dict): @@ -854,4 +1216,84 @@ def _read_dict(my_file_name): continue split_line = line.split('=') retval[split_line[0].strip()] = split_line[1].strip() - return retval \ No newline at end of file + return retval + + +def get_nih_user_details_from_token(user_id): + user_details = {} + + + + # + # The information we used to pull out of our database is now obtained from a DCF token + # + + # + # Now with DCF, we can have a user logged in as an NIH user, but not be linked (which means DCF does not + # have an association between NIH ID and Google ID). So while we previously did a get on a linked user, + # now we need to filter. If one of the users is linked, that is who we use. Otherwise, we can resolve the + # issue by looking at the current DCF token attached to the user to see who they are associated with. + # + + dcf_tokens = DCFToken.objects.filter(user_id=user_id) + if len(dcf_tokens) == 0: + return user_details + elif len(dcf_tokens) > 1: + logger.error("[ERROR] MULTIPLE DCF RECORDS FOR USER {}. ".format(str(user_id))) + return user_details + + dcf_token = dcf_tokens.first() + + the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) + + google_link = _get_google_link_from_user_dict(the_user_dict) + + nih_users = NIH_User.objects.filter(user_id=user_id, NIH_username=dcf_token.nih_username) + + if len(nih_users) == 0: + return user_details + + elif len(nih_users) == 1: + nih_user = nih_users.first() + + else: + nih_user = None + freshest_linked = None + freshest_linked_stamp = None + freshest_unlinked = None + freshest_unlinked_stamp = None + for user in nih_users: + if user.linked: + if (freshest_linked_stamp is None) or (freshest_linked_stamp < user.NIH_assertion_expiration): + freshest_linked_stamp = user.NIH_assertion_expiration + freshest_linked = user + if nih_user is None: + nih_user = nih_users.first() + else: + logger.error("[ERROR] Multiple linked nih users retrieved nih_user with user_id {}.".format(user_id)) + else: + if (freshest_unlinked_stamp is None) or (freshest_unlinked_stamp < user.NIH_assertion_expiration): + freshest_unlinked_stamp = user.NIH_assertion_expiration + freshest_unlinked = user + + if freshest_linked: + nih_user = freshest_linked + elif freshest_unlinked: + nih_user = freshest_unlinked + else: + logger.error("[ERROR] Unexpected lack of nih_user for {}.".format(user_id)) + return user_details + + user_auth_datasets = UserAuthorizedDatasets.objects.filter(nih_user=nih_user) + user_details['NIH_username'] = nih_user.NIH_username + user_details['NIH_assertion_expiration'] = nih_user.NIH_assertion_expiration + # Add a separate field to break out program count from active: + user_details['dbGaP_has_datasets'] = (len(user_auth_datasets) > 0) + user_details['dbGaP_authorized'] = (len(user_auth_datasets) > 0) and nih_user.active + logger.debug("[DEBUG] User {} has access to {} dataset(s) and is {}".format(nih_user.NIH_username, str(len(user_auth_datasets)), ('not active' if not nih_user.active else 'active'))) + user_details['NIH_active'] = nih_user.active + user_details['NIH_DCF_linked'] = nih_user.linked + user_details['refresh_key_ok'] = get_dcf_auth_key_remaining_seconds(user_id) > 
settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS + user_details['auth_datasets'] = [] if len(user_auth_datasets) <= 0 else AuthorizedDataset.objects.filter(id__in=user_auth_datasets.values_list('authorized_dataset',flat=True)) + + return user_details diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index d89055eb..a40fc170 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -1011,7 +1011,7 @@ def get_dcf_auth_key_remaining_seconds(user_id): return remaining_seconds -def handle_user_db_update_for_dcf_linking(user_id, user_data, nih_assertion_expiration, st_logger): +def handle_user_db_update_for_dcf_linking(user_id, user_data_dict, nih_assertion_expiration, st_logger): """ When user logs into DCF using iTrust and links via DCF, we create an NIH record for them and link them to to their data. """ @@ -1026,7 +1026,7 @@ def handle_user_db_update_for_dcf_linking(user_id, user_data, nih_assertion_expi 'linked': True } - nih_user, created = NIH_User.objects.update_or_create(NIH_username=user_data['username'], + nih_user, created = NIH_User.objects.update_or_create(NIH_username=user_data_dict['name'], user_id=user_id, defaults=updated_values) @@ -1037,7 +1037,7 @@ def handle_user_db_update_for_dcf_linking(user_id, user_data, nih_assertion_expi str(nih_user.NIH_username), str(created))) our_user = User.objects.get(id=user_id) - dict_o_projects = user_data['project_access'] + dict_o_projects = user_data_dict['projects'] logger.info("[STATUS] NIH_User.objects updated nih_user for linking: {}".format( str(nih_user.NIH_username))) @@ -1297,6 +1297,7 @@ def deactivate_nih_add_to_open(user_id, user_email): def get_nih_user_details(user_id): user_details = {} + # # Now with DCF, we can have a user logged in as an NIH user, but not be linked (which means DCF does not # have an association between NIH ID and Google ID). So while we previously did a get on a linked user, @@ -1306,21 +1307,28 @@ def get_nih_user_details(user_id): dcf_tokens = DCFToken.objects.filter(user_id=user_id) if len(dcf_tokens) == 0: - return user_details + return user_details # i.e. empty dict elif len(dcf_tokens) > 1: logger.error("[ERROR] MULTIPLE DCF RECORDS FOR USER {}. ".format(str(user_id))) - return user_details + return user_details # i.e. empty dict dcf_token = dcf_tokens.first() + # FIXME? IS THERE AN ISSUE HERE WITH NIH USERNAME CASE SENSITIVITY?? nih_users = NIH_User.objects.filter(user_id=user_id, NIH_username=dcf_token.nih_username) if len(nih_users) == 0: - return user_details + return user_details # i.e. empty dict elif len(nih_users) == 1: nih_user = nih_users.first() else: + # + # Multiple NIH user rows for the current user for the same nih_username. We want the one that is linked. + # If more than one (is that possible??) take the one with the most recent usage. If nobody is linked, + # again take the one with the most recent usage. Some of these cases should not be possible (?) but + # trying to be bombproof here: + # nih_user = None freshest_linked = None freshest_linked_stamp = None @@ -1346,7 +1354,29 @@ def get_nih_user_details(user_id): nih_user = freshest_unlinked else: logger.error("[ERROR] Unexpected lack of nih_user for {}.".format(user_id)) - return user_details + return user_details # i.e. empty dict + + # + # With the user_details page, we now need to check with DCF about current status before we display information + # to the user, as our database view could be stale. + # + # Step 1: If the expiration time has passed for the user and they are still tagged as active, we clear that + # flag. 
This is the *minimun* we chould be doing, no matter what. Note that in DCF-based Brave New World, we no + # longer need to have a cron job doing this, as we don't actually need to do anything at 24 hours. We just + # need to give the user an accurate picture of the state when they hit this page. + # + + if nih_user.active: + expired_time = nih_user.NIH_assertion_expiration + # If we need to have the access expire in just a few minutes for testing, this is one way to fake it: + # testing_expire_hack = datetime.timedelta(minutes=-((60 * 23) + 55)) + # expired_time = expired_time + testing_expire_hack + now_time = pytz.utc.localize(datetime.datetime.utcnow()) + print "times", expired_time, now_time + if now_time >= expired_time: + nih_user.active = False + nih_user.NIH_assertion_expiration = now_time + nih_user.save() user_auth_datasets = UserAuthorizedDatasets.objects.filter(nih_user=nih_user) user_details['NIH_username'] = nih_user.NIH_username From 147b557a22247d1920d68b9a801ca761c3a22ad4 Mon Sep 17 00:00:00 2001 From: s-paquette Date: Tue, 19 Jun 2018 13:47:41 -0700 Subject: [PATCH 10/76] - Add ability to insert jobs and check for job done-ness, use this internally in the simple-execute-and-fetch class method --- google_helpers/bigquery/bq_support.py | 62 ++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 10 deletions(-) diff --git a/google_helpers/bigquery/bq_support.py b/google_helpers/bigquery/bq_support.py index 890cf005..1807922b 100644 --- a/google_helpers/bigquery/bq_support.py +++ b/google_helpers/bigquery/bq_support.py @@ -242,13 +242,8 @@ def _confirm_dataset_and_table(self, desc): 'status': 'TABLE_EXISTS' } - # Runs a basic, optionally parameterized query - # If self.project_id, self.dataset_id, and self.table_id are set they - # will be used as the destination table for the query - # WRITE_DISPOSITION is assumed to be for an empty table unless specified - def execute_query(self, query, parameters=None, write_disposition='WRITE_EMPTY', cost_est=False): - - query_results = None + # Build and insert a BQ job + def insert_bq_query_job(self, query,parameters=None, write_disposition='WRITE_EMPTY', cost_est=False): # Make yourself a job ID job_id = str(uuid4()) @@ -282,10 +277,21 @@ def execute_query(self, query, parameters=None, write_disposition='WRITE_EMPTY', if cost_est: job_desc['configuration']['dryRun'] = True - query_job = self.bq_service.jobs().insert( + return self.bq_service.jobs().insert( projectId=self.executing_project, body=job_desc).execute(num_retries=5) + + # Runs a basic, optionally parameterized query + # If self.project_id, self.dataset_id, and self.table_id are set they + # will be used as the destination table for the query + # WRITE_DISPOSITION is assumed to be for an empty table unless specified + def execute_query(self, query, parameters=None, write_disposition='WRITE_EMPTY', cost_est=False): + + query_job = self.insert_bq_query_job(query,parameters,write_disposition,cost_est) + + job_id = query_job['jobReference']['jobId'] + # Cost Estimates don't actually run as fully-fledged jobs, and won't be inserted as such, # so we just get back the estimate immediately if cost_est: @@ -329,6 +335,13 @@ def execute_query(self, query, parameters=None, write_disposition='WRITE_EMPTY', return query_results + # Check to see if query job is done + def job_is_done(self, query_job): + job_is_done = self.bq_service.jobs().get(projectId=self.executing_project, + jobId=query_job['jobReference']['jobId']).execute(num_retries=5) + + return job_is_done and 
job_is_done['status']['state'] == 'DONE' + # Fetch the results of a job based on the reference provided def fetch_job_results(self, job_ref): result = [] @@ -352,11 +365,25 @@ def fetch_job_results(self, job_ref): return result # Execute a query to be saved on a temp table (shorthand to instance method above), optionally parameterized + # and fetch its results @classmethod def execute_query_and_fetch_results(cls, query, parameters=None): bqs = cls(None, None, None) return bqs.execute_query(query, parameters) + # Insert a BQ job for a query to be saved on a temp table (shorthand to instance method above), optionally + # parameterized, and return the job reference + @classmethod + def insert_query_job(cls, query, parameters=None): + bqs = cls(None, None, None) + return bqs.insert_bq_query_job(query, parameters) + + # Check the status of a BQ job + @classmethod + def check_job_is_done(cls, job_ref): + bqs = cls(None, None, None) + return bqs.job_is_done(job_ref) + # Do a 'dry run' query, which estimates the cost @classmethod def estimate_query_cost(cls, query, parameters=None): @@ -379,13 +406,15 @@ def get_job_results(cls, job_reference): # TODO: add support for BETWEEN # TODO: add support for <>= @staticmethod - def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None): - + def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with_count_toggle=False): result = { 'filter_string': '', 'parameters': [] } + if with_count_toggle: + result['count_params'] = {} + filter_set = [] mutation_filters = {} @@ -476,6 +505,19 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None): query_param['parameterType']['arrayType'] = {'type': ('STRING' if re.compile(ur'[^0-9\.,]', re.UNICODE).search(values[0]) else 'INT64')} filter_string += "{} IN UNNEST(@{})".format(attr, param_name) + if with_count_toggle: + filter_string = "({}) OR @{}_filtering = 'not_filtering'".format(filter_string,param_name) + result['count_params'][param_name] = { + 'name': param_name+'_filtering', + 'parameterType': { + 'type': 'STRING' + }, + 'parameterValue': { + 'value': 'filtering' + } + } + result['parameters'].append(result['count_params'][param_name]) + filter_set.append('({})'.format(filter_string)) result['parameters'].append(query_param) From 4cc8ae63a918bb1d45b6012bd2aebb10d0834452 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Wed, 20 Jun 2018 17:42:05 -0700 Subject: [PATCH 11/76] Work in progress --- accounts/dcf_views.py | 158 +++++++++++++++++++++++++++++------------- accounts/sa_utils.py | 8 ++- accounts/urls.py | 1 + 3 files changed, 117 insertions(+), 50 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 5d5aab97..2339fcc1 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -35,8 +35,9 @@ handle_user_db_update_for_dcf_linking, \ unlink_account_in_db_for_dcf, get_dcf_auth_key_remaining_seconds -from models import DCFToken, AuthorizedDataset, NIH_User +from models import DCFToken, AuthorizedDataset, NIH_User, UserAuthorizedDatasets from requests_oauthlib.oauth2_session import OAuth2Session +from oauthlib.oauth2 import MissingTokenError from base64 import urlsafe_b64decode from jwt.contrib.algorithms.pycrypto import RSAAlgorithm from json import loads as json_loads, dumps as json_dumps @@ -296,33 +297,36 @@ def oauth2_callback(request): # does not match the current user email. 
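Both this callback and dcf_link_callback reduce to the same guard; a condensed sketch (helper name illustrative, not part of the patch) of the comparison the code below performs before treating the link as usable:

    def google_link_matches_login(dcf_google_link, logged_in_email):
        # DCF reports the Google account it believes is linked to the NIH ID;
        # linking may only proceed when it matches the ISB-CGC login email.
        return (dcf_google_link is not None) and (dcf_google_link == logged_in_email)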
# + link_mismatch = False req_user = User.objects.get(id=request.user.id) if google_link != req_user.email: message = "Please unlink ID {} and use your ISB-CGC login email ({}) to link with the DCF".format( google_link, req_user.email) messages.warning(request, message) + link_mismatch = True return redirect(reverse('user_detail', args=[request.user.id])) # # The link matches. So we use PATCH, and if it goes smoothly, we write the new link to the database: - resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='patch') - if resp.status_code == 404: - messages.warning(request, "No linked Google account found for user") - elif resp.status_code == 200: - pass - else: - messages.warning(request, "Unexpected response ({}, {}) from DCF during linking. " - "Please contact the ISB-CGC administrator.".format(resp.status_code, resp.text)) + if not link_mismatch: + resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='patch') + if resp.status_code == 404: + messages.warning(request, "No linked Google account found for user") + elif resp.status_code == 200: + pass + else: + messages.warning(request, "Unexpected response ({}, {}) from DCF during linking. " + "Please contact the ISB-CGC administrator.".format(resp.status_code, resp.text)) - print 'response {}'.format(str(resp.text)) - print 'PATCH ONLY RETURNS e.g. {"exp": 1528509163}' + print 'response {}'.format(str(resp.text)) + print 'PATCH ONLY RETURNS e.g. {"exp": 1528509163}' login_expiration_seconds = settings.LOGIN_EXPIRATION_MINUTES * 60 calc_expiration_time = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( seconds=login_expiration_seconds)) - warning = _finish_the_link(request.user.id, req_user.email, calc_expiration_time, st_logger) + warning = _finish_the_link(request.user.id, req_user.email, calc_expiration_time, st_logger, link_mismatch) messages.warning(request, warning) return redirect(reverse('user_detail', args=[request.user.id])) @@ -339,6 +343,17 @@ def oauth2_callback(request): os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '0' +@login_required +def dcf_link_redo(request): + """ + If the user needs to redo their google, link, this is what does it. + """ + + link_callback = request.build_absolute_uri(reverse('dcf_link_callback')) + callback = '{}?redirect={}'.format(DCF_GOOGLE_URL, link_callback) + return HttpResponseRedirect(callback) + + @login_required def dcf_link_callback(request): """ @@ -400,7 +415,8 @@ def dcf_link_callback(request): # # At this point, we need to wrestle with the possible problem that the user has linked # to a DIFFERENT GoogleID while off messing with DCF. If the ID that comes back is not - # identical to what we think it is. They need to go and do it again! + # identical to what we think it is. They need to go and do it again. BUT as far as DCF + # is concerned, they are linked, so we need to finish the job here... 
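# For reference, the expiration handling used around these callbacks converts DCF's 'exp' value (seconds since
# the epoch) into a timezone-aware UTC datetime and sanity-checks it against a locally computed expiration.
# A rough, self-contained sketch of that conversion (illustrative only, not part of the patch; the 86400-second
# default stands in for the settings-driven lifetime, and the 30-second skew threshold mirrors the check used
# elsewhere in this file):

import datetime
import logging
import pytz

def expiration_from_dcf(returned_expiration_str, login_expiration_seconds=86400):
    calc = pytz.utc.localize(
        datetime.datetime.utcnow() + datetime.timedelta(seconds=login_expiration_seconds))
    if not returned_expiration_str:
        return calc  # nothing from DCF, so fall back to the locally computed estimate
    returned = pytz.utc.localize(
        datetime.datetime.utcfromtimestamp(float(returned_expiration_str)))
    skew = abs((returned - calc).total_seconds())
    if skew > 30:
        logging.getLogger('main_logger').warning("DCF returned a time skew of %s seconds", skew)
    return returned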
# the_user_token_string = _get_user_data_token_string(request.user.id) # a string @@ -424,23 +440,24 @@ def dcf_link_callback(request): 'to the ISB-CGC administrator') return redirect(reverse('user_detail', args=[request.user.id])) + link_mismatch = False req_user = User.objects.get(id=request.user.id) if google_link != req_user.email: message = "Please unlink ID {} and use your ISB-CGC login email ({}) to link with the DCF".format(google_link, req_user.email) messages.warning(request, message) - return redirect(reverse('user_detail', args=[request.user.id])) + link_mismatch = True # # If all is well, this is where we add the user to the NIH_User table and link the user to the various data sets. # - warning = _finish_the_link(request.user.id, google_link, calc_expiration_time, st_logger) + warning = _finish_the_link(request.user.id, google_link, calc_expiration_time, st_logger, link_mismatch) if warning: messages.warning(request, warning) return redirect(reverse('user_detail', args=[request.user.id])) -def _finish_the_link(user_id, user_email, expiration_time, st_logger): +def _finish_the_link(user_id, user_email, expiration_time, st_logger, link_mismatch): """ Regardless of how they get here, this step handles the linking of the user by adding the required database records. """ @@ -458,7 +475,7 @@ def _finish_the_link(user_id, user_email, expiration_time, st_logger): # dcf_token = DCFToken.objects.get(user_id=user_id) - if dcf_token.google_id is not None and dcf_token.google_id != user_email: + if dcf_token.google_id is not None and dcf_token.google_id != user_email and not link_mismatch: return 'Unexpected internal error detected during linking: email/ID mismatch. ' \ 'Please report this to the ISB-CGC administrator' @@ -466,6 +483,9 @@ def _finish_the_link(user_id, user_email, expiration_time, st_logger): dcf_token.user_token = the_user_token dcf_token.save() + if link_mismatch: + return + the_user_dict = _user_data_token_to_user_dict(the_user_token) nih_user, warning = handle_user_db_update_for_dcf_linking(user_id, the_user_dict, nih_assertion_expiration, st_logger) @@ -569,6 +589,11 @@ def _refresh_from_dcf(user_id): dcf_token.google_id = dcf_google_link google_problem = google_match_state + # + # This is exercised when the NIH ID of the user, returned in the ID token is different than the one we + # have in our token database. Don't think this is even possible, since user would need to log in as the + # new NIH ID first... + # if nih_id.lower() != dcf_token.nih_username_lower: logger.error("ERROR: UNEXPECTED NIH_USER_ID MISMATCH {} VERSUS {}".format(nih_id.lower(), dcf_token.nih_username_lower)) @@ -578,45 +603,54 @@ def _refresh_from_dcf(user_id): # only one linked record in our DB. # - # Note the use of __iexact does case insensitive match: - nih_users_for_id = NIH_User.objects.filter(user_id=user_id, NIH_username__iexact=nih_id, linked=True) - if len(nih_users_for_id) == 1: + if google_match_state == GoogleLinkState.MATCHING_OK: + # Note the use of __iexact does case insensitive match: + linked_nih_user_for_user_and_id = NIH_User.objects.filter(user_id=user_id, NIH_username__iexact=nih_id, linked=True) + if len(linked_nih_user_for_user_and_id) == 1: print "All is good" else: + # + # Problems! 
If we have nih_users_for_user = NIH_User.objects.filter(user_id=user_id) nih_users_for_id = NIH_User.objects.filter(NIH_username__iexact=nih_id) + if len(nih_users_for_id) == 1: + pass - # If user logged into DCF but did not get the linking done correctly, the token will provide us with the - # NIH ID they are using, but we will NOT have a current linked record in the NIH_User table. - - wafjwophfwfHIGwfpsiFif - - - if dcf_token.google_id is not None and dcf_token.google_id != user_email: - return 'Unexpected internal error detected during linking: email/ID mismatch. ' \ - 'Please report this to the ISB-CGC administrator' - dcf_token.google_id = user_email - dcf_token.user_token = the_user_token - dcf_token.save() - nih_user, warning = handle_user_db_update_for_dcf_linking(user_id, the_user_dict, - nih_assertion_expiration, st_logger) - authorized_datasets = [] - for project, perm_list in dict_o_projects.iteritems(): - adqs = AuthorizedDataset.objects.filter(whitelist_id=project) - if len(adqs) == 1: - authorized_datasets.append(DatasetGoogleGroupPair(project, adqs.first().acl_google_group)) - - das = DatasetAccessSupportFactory.from_webapp_django_settings() - all_datasets = das.get_all_datasets_and_google_groups() + # If user logged into DCF but did not get the linking done correctly, the token will provide us with the + # NIH ID they are using, but we will NOT have a current linked record in the NIH_User table. - for dataset in all_datasets: - handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, False, None, None, st_logger) + # wafjwophfwfHIGwfpsiFif + # + # + # if dcf_token.google_id is not None and dcf_token.google_id != user_email: + # return 'Unexpected internal error detected during linking: email/ID mismatch. ' \ + # 'Please report this to the ISB-CGC administrator' + # + # dcf_token.google_id = user_email + # dcf_token.user_token = the_user_token + # dcf_token.save() + # + # nih_user, warning = handle_user_db_update_for_dcf_linking(user_id, the_user_dict, + # nih_assertion_expiration, st_logger) + # + # + # authorized_datasets = [] + # for project, perm_list in dict_o_projects.iteritems(): + # adqs = AuthorizedDataset.objects.filter(whitelist_id=project) + # if len(adqs) == 1: + # authorized_datasets.append(DatasetGoogleGroupPair(project, adqs.first().acl_google_group)) + # + # das = DatasetAccessSupportFactory.from_webapp_django_settings() + # all_datasets = das.get_all_datasets_and_google_groups() + # + # for dataset in all_datasets: + # handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, False, None, None, st_logger) - return warning + #return warning def _user_data_token_dict_massaged(the_user_token_dict): @@ -858,6 +892,23 @@ def dcf_unlink(request): still able to talk to DCF using their NIH ID. For a traditional unlink, we use dcf_disconnect_user: """ + "If user has linked to incorrect google account, we do not give them the option to first **unlink** from" \ + " the bad account, but only the option to LINK." 
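# The unlink call against DCF's /google endpoint comes back with a small set of status codes that get mapped
# to user-facing messages: 200 is success, 404 means no linked account, and 400 carries a JSON body with
# 'error' and 'error_description'. A compact sketch of that mapping as a pure function (illustrative only;
# the function name is hypothetical, and the response object only needs 'status_code' and 'text' attributes):

import json

def summarize_google_unlink_response(resp):
    """Return (success, message) for a DELETE against the DCF Google-link endpoint."""
    if resp.status_code == 200:
        return True, None
    if resp.status_code == 404:
        return False, "No linked Google account found for user"
    if resp.status_code == 400:
        body = json.loads(resp.text)
        return False, "Error in unlinking: {} : {}".format(body['error'], body['error_description'])
    return False, "Unexpected response ({}) from DCF".format(resp.status_code)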
+ # Please + # unlink + # ID + # wlongabaugh @ gmail.com and use + # your + # ISB - CGC + # login + # email(wlongabaugh @ systemsbiology.org) + # to + # link + # with the DCF + + + + # DO NOT UNLINK IF NOT CURRENTLY LINKED dcf_token = DCFToken.objects.get(user_id=request.user.id) @@ -866,7 +917,7 @@ def dcf_unlink(request): google_link = _get_google_link_from_user_dict(the_user_dict) if google_link is None: - messages.warning(request, "User is not linked to Google") # redirect to user detail page + messages.warning(request, "User is not linked to Google ") # redirect to user detail page return redirect(reverse('user_detail', args=[request.user.id])) # @@ -1195,8 +1246,18 @@ def token_storage_for_user(my_token_dict): # refresh call uses HTTPBasicAuth! # FIXME can get an exception here (BAD REQUEST) if refresh token has e.g. been revoked and not dropped out of DB. - resp = dcf.request(mode, full_url, client_id=client_id, - client_secret=client_secret, data=post_body) + # FIXME: Also have seen this when requesting an unlink + # FIXME: reply: 'HTTP/1.1 401 UNAUTHORIZED\r\n' after staging server is rolled?? + # FIXME: "/home/vagrant/www/lib/oauthlib/oauth2/rfc6749/parameters.py" + # FIXME: MissingTokenError: (missing_token) Missing access token parameter. + try: + resp = dcf.request(mode, full_url, client_id=client_id, + client_secret=client_secret, data=post_body) + except MissingTokenError as e: + print "drop the records from the database {}".format(str(e)) + print "NO! gotta remember they linked as NIH ID before!!" + except Exception as e: + print "drop the records from the database {}".format(str(e)) return resp @@ -1288,6 +1349,7 @@ def get_nih_user_details_from_token(user_id): user_details['NIH_username'] = nih_user.NIH_username user_details['NIH_assertion_expiration'] = nih_user.NIH_assertion_expiration # Add a separate field to break out program count from active: + user_details['dbGaP_has_datasets'] = (len(user_auth_datasets) > 0) user_details['dbGaP_authorized'] = (len(user_auth_datasets) > 0) and nih_user.active logger.debug("[DEBUG] User {} has access to {} dataset(s) and is {}".format(nih_user.NIH_username, str(len(user_auth_datasets)), ('not active' if not nih_user.active else 'active'))) diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index a40fc170..dc221c6e 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -1313,10 +1313,12 @@ def get_nih_user_details(user_id): return user_details # i.e. empty dict dcf_token = dcf_tokens.first() - # FIXME? IS THERE AN ISSUE HERE WITH NIH USERNAME CASE SENSITIVITY?? - nih_users = NIH_User.objects.filter(user_id=user_id, NIH_username=dcf_token.nih_username) + + curr_user = User.objects.get(id=user_id) + nih_users = NIH_User.objects.filter(user_id=user_id, NIH_username__iexact=dcf_token.nih_username) if len(nih_users) == 0: + user_details['link_mismatch'] = (dcf_token.google_id is not None) and (dcf_token.google_id != curr_user.email) return user_details # i.e. empty dict elif len(nih_users) == 1: @@ -1354,6 +1356,7 @@ def get_nih_user_details(user_id): nih_user = freshest_unlinked else: logger.error("[ERROR] Unexpected lack of nih_user for {}.".format(user_id)) + user_details['link_mismatch'] = (dcf_token.google_id is not None) and (dcf_token.google_id != curr_user.email) return user_details # i.e. 
empty dict # @@ -1385,6 +1388,7 @@ def get_nih_user_details(user_id): user_details['dbGaP_has_datasets'] = (len(user_auth_datasets) > 0) user_details['dbGaP_authorized'] = (len(user_auth_datasets) > 0) and nih_user.active logger.debug("[DEBUG] User {} has access to {} dataset(s) and is {}".format(nih_user.NIH_username, str(len(user_auth_datasets)), ('not active' if not nih_user.active else 'active'))) + user_details['link_mismatch'] = (dcf_token.google_id is not None) and (dcf_token.google_id != curr_user.email) user_details['NIH_active'] = nih_user.active user_details['NIH_DCF_linked'] = nih_user.linked user_details['refresh_key_ok'] = get_dcf_auth_key_remaining_seconds(user_id) > settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS diff --git a/accounts/urls.py b/accounts/urls.py index 94c11586..bd46521d 100755 --- a/accounts/urls.py +++ b/accounts/urls.py @@ -31,6 +31,7 @@ url(r'^dcf/login/callback/$', dcf_views.oauth2_callback, name='dcf_callback'), url(r'^dcf_link_callback/$', dcf_views.dcf_link_callback, name='dcf_link_callback'), url(r'^dcf_link_extend/$', dcf_views.dcf_link_extend, name='dcf_link_extend'), + url(r'^dcf_link_redo/$', dcf_views.dcf_link_redo, name='dcf_link_redo'), url(r'^dcf_disconnect_user/$', dcf_views.dcf_disconnect_user, name='dcf_disconnect_user'), url(r'^dcf_user_data/$', dcf_views.dcf_get_user_data, name='dcf_get_user_data'), url(r'^dcf_unlink/$', dcf_views.dcf_unlink, name='dcf_unlink'), From 3d8e12c18b96443d8b0d141afa3399b6789ccc9d Mon Sep 17 00:00:00 2001 From: s-paquette Date: Wed, 20 Jun 2018 18:04:10 -0700 Subject: [PATCH 12/76] -> Quick logger fix --- accounts/sa_utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index 0305a826..e5704ef2 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -1260,10 +1260,9 @@ def handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, ad.whitelist_id)) else: logger.info("[STATUS] Added user {} to dataset {}.".format(user_email, ad.whitelist_id)) - - logger.info(result) - logger.info("User {} added to {}.".format(user_email, dataset.google_group_name)) - st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, + logger.info(result) + logger.info("User {} added to {}.".format(user_email, dataset.google_group_name)) + st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, "[STATUS] User {} added to {}.".format(user_email, dataset.google_group_name)) From 3604f970031a8d1c249f36acb9b80a126e1c596d Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Wed, 20 Jun 2018 18:19:18 -0700 Subject: [PATCH 13/76] Tardy fix --- accounts/sa_utils.py | 70 ++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index 0305a826..f318eebe 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -1261,43 +1261,43 @@ def handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, else: logger.info("[STATUS] Added user {} to dataset {}.".format(user_email, ad.whitelist_id)) - logger.info(result) - logger.info("User {} added to {}.".format(user_email, dataset.google_group_name)) - st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, - "[STATUS] User {} added to {}.".format(user_email, - dataset.google_group_name)) + # logger.info(result) + # logger.info("User {} added to {}.".format(user_email, dataset.google_group_name)) + # st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, + # "[STATUS] User {} added to {}.".format(user_email, + 
# dataset.google_group_name)) # Add task in queue to deactivate NIH_User entry after NIH_assertion_expiration has passed. - try: - full_topic_name = get_full_topic_name(PUBSUB_TOPIC_ERA_LOGIN) - logger.info("Full topic name: {}".format(full_topic_name)) - client = get_pubsub_service() - params = { - 'event_type': 'era_login', - 'user_id': user_id, - 'deployment': CRON_MODULE - } - message = json_dumps(params) - - body = { - 'messages': [ - { - 'data': base64.b64encode(message.encode('utf-8')) - } - ] - } - client.projects().topics().publish(topic=full_topic_name, body=body).execute() - st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, - "[STATUS] Notification sent to PubSub topic: {}".format(full_topic_name)) - - except Exception as e: - logger.error("[ERROR] Failed to publish to PubSub topic") - logger.exception(e) - st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, - "[ERROR] Failed to publish to PubSub topic: {}".format(str(e))) - - retval.messages.append(warn_message) - return retval + # try: + # full_topic_name = get_full_topic_name(PUBSUB_TOPIC_ERA_LOGIN) + # logger.info("Full topic name: {}".format(full_topic_name)) + # client = get_pubsub_service() + # params = { + # 'event_type': 'era_login', + # 'user_id': user_id, + # 'deployment': CRON_MODULE + # } + # message = json_dumps(params) + # + # body = { + # 'messages': [ + # { + # 'data': base64.b64encode(message.encode('utf-8')) + # } + # ] + # } + # client.projects().topics().publish(topic=full_topic_name, body=body).execute() + # st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, + # "[STATUS] Notification sent to PubSub topic: {}".format(full_topic_name)) + # + # except Exception as e: + # logger.error("[ERROR] Failed to publish to PubSub topic") + # logger.exception(e) + # st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, + # "[ERROR] Failed to publish to PubSub topic: {}".format(str(e))) + # + # retval.messages.append(warn_message) + # return retval def deactivate_nih_add_to_open(user_id, user_email): From e1f68c4015799740a429cc7c0829a2cfa2adc9c0 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Thu, 21 Jun 2018 13:45:44 -0700 Subject: [PATCH 14/76] Debug DCF --- accounts/dcf_views.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 2339fcc1..d3aaec03 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -66,6 +66,8 @@ def oauth2_login(request): First step of OAuth2 login to DCF. Just build the URL that we send back to the browser in the refresh request """ try: + logger.info("[INFO] OAuth1 a") + full_callback = request.build_absolute_uri(reverse('dcf_callback')) # OAuth2Session ENFORCES https unless this environment variable is set. For local dev, we want that off @@ -77,12 +79,15 @@ def oauth2_login(request): client_id, _ = _get_secrets() + logger.info("[INFO] OAuth1 b") # Found that 'user' scope had to be included to be able to do the user query on callback, and the data scope # to do data queries. Starting to recognize a pattern here... oauth = OAuth2Session(client_id, redirect_uri=full_callback, scope=['openid', 'user', 'data']) authorization_url, state = oauth.authorization_url(DCF_AUTH_URL) + logger.info("[INFO] OAuth1 c") # stash the state string in the session! 
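# The 'state' value is the standard OAuth2 CSRF guard: it is generated for the authorization redirect, held
# server-side, and must match what comes back on the callback before the code is exchanged. A bare-bones
# sketch of that round trip against a plain dict standing in for the Django session (illustrative only; the
# real views use request.session and let OAuth2Session do the comparison):

def stash_oauth_state(session, state):
    session['dcfOAuth2State'] = state

def state_matches(session, returned_state):
    return session.get('dcfOAuth2State') == returned_state

# In the login view itself, the stash is a single assignment on the Django session, as the next line shows.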
request.session['dcfOAuth2State'] = state + logger.info("[INFO] OAuth1 d") return HttpResponseRedirect(authorization_url) finally: From ec960b19385e8dfe084948a8d353be5fb90c5eef Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Fri, 22 Jun 2018 15:38:43 -0700 Subject: [PATCH 15/76] Debug DCF --- accounts/dcf_views.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index d3aaec03..3b416457 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -109,6 +109,7 @@ def oauth2_callback(request): """ try: + logger.info("[INFO] OAuthCB a") full_callback = request.build_absolute_uri(reverse('dcf_callback')) # For future reference, this also worked, using underlying requests library: @@ -138,7 +139,7 @@ def oauth2_callback(request): # so we can talk to localhost over http. But let's turn it on/off to minimize, and make it only active in # development: # - + logger.info("[INFO] OAuthCB b") if settings.IS_DEV and full_callback.startswith('http://localhost'): os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1' @@ -150,7 +151,7 @@ def oauth2_callback(request): return redirect(reverse('user_detail', args=[request.user.id])) client_id, client_secret = _get_secrets() - + logger.info("[INFO] OAuthCB c") # You MUST provide the callback *here* to get it into the fetch request dcf = OAuth2Session(client_id, state=saved_state, redirect_uri=full_callback) @@ -164,7 +165,7 @@ def oauth2_callback(request): client_id=client_id, authorization_response=request.get_full_path()) client_secret = None # clear this in case we are in Debug mode to keep this out of the browser - + logger.info("[INFO] OAuthCB d") if token_data['token_type'] != 'Bearer': logger.error("[ERROR] Token type returned was not 'Bearer'") messages.error(request, "There was an internal error logging in. Please contact the ISB-CGC administrator.") @@ -180,7 +181,7 @@ def oauth2_callback(request): my_jwt = jwt.PyJWT() my_jwt.register_algorithm('RS256', RSAAlgorithm(RSAAlgorithm.SHA256)) - + logger.info("[INFO] OAuthCB e") # # DCF's key endpoint provides a list of keys they use. Right now, only one, but to future-proof, we want # to choose the right one from the list. But that means we need to parse the first element of the JWT tuple @@ -199,7 +200,7 @@ def oauth2_callback(request): # # Get the key list from the endpoint and choose which one was used in the JWT: # - + logger.info("[INFO] OAuthCB f") resp = dcf.get(settings.DCF_KEY_URL) key_data = json_loads(resp.text) key_list = key_data['keys'] @@ -216,7 +217,7 @@ def oauth2_callback(request): # # Decode the JWT! # - + logger.info("[INFO] OAuthCB g") try: alg_list = ['RS256'] decoded_jwt_id = my_jwt.decode(token_data['id_token'], key=use_key, algorithms=alg_list, @@ -247,7 +248,7 @@ def oauth2_callback(request): # u'exp': 1525733739, # u'pur': u'id', (The "purpose" of the token. This is an ID. Refresh tokens say "refresh") # u'sub': u'integer user key'} - + logger.info("[INFO] OAuthCB h") dcf_user_id = decoded_jwt_id['sub'] # @@ -261,7 +262,7 @@ def oauth2_callback(request): nih_from_dcf = _get_nih_id_from_user_dict(user_data_dict) google_link = _get_google_link_from_user_dict(user_data_dict) - + logger.info("[INFO] OAuthCB i") # We now have the NIH User ID back from DCF; we also might now know the Google ID they have linked to previously # (it comes back in the user_id). 
Note that this routine is going to get called every 30 days or so when we # need to get a new refresh token, so it is possible that e.g. the first time they logged in as their PI and @@ -277,7 +278,7 @@ def oauth2_callback(request): for warn in results.messages: messages.warning(request, warn) return redirect(reverse('user_detail', args=[request.user.id])) - + logger.info("[INFO] OAuthCB j") # # We now have the minimum we need to store the tokens from DCF, so stick that in the database. We DO NOT yet # make the entry in the NIH_User table, since we need to now either establish or refresh the DCF-Google ID link: @@ -291,7 +292,7 @@ def oauth2_callback(request): # those cases where an unlink has been called.) So here is where the control flow diverges. For the # GET, we wrap things up in the callback. For the PATCH, we wrap things up immediately: # - + logger.info("[INFO] OAuthCB k") if google_link: # @@ -304,6 +305,7 @@ def oauth2_callback(request): link_mismatch = False req_user = User.objects.get(id=request.user.id) + logger.info("[INFO] OAuthCB l") if google_link != req_user.email: message = "Please unlink ID {} and use your ISB-CGC login email ({}) to link with the DCF".format( google_link, req_user.email) @@ -313,7 +315,7 @@ def oauth2_callback(request): # # The link matches. So we use PATCH, and if it goes smoothly, we write the new link to the database: - + logger.info("[INFO] OAuthCB m") if not link_mismatch: resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='patch') if resp.status_code == 404: @@ -324,13 +326,14 @@ def oauth2_callback(request): messages.warning(request, "Unexpected response ({}, {}) from DCF during linking. " "Please contact the ISB-CGC administrator.".format(resp.status_code, resp.text)) + logger.info("[INFO] OAuthCB n") print 'response {}'.format(str(resp.text)) print 'PATCH ONLY RETURNS e.g. 
{"exp": 1528509163}' login_expiration_seconds = settings.LOGIN_EXPIRATION_MINUTES * 60 calc_expiration_time = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( seconds=login_expiration_seconds)) - + logger.info("[INFO] OAuthCB o") warning = _finish_the_link(request.user.id, req_user.email, calc_expiration_time, st_logger, link_mismatch) messages.warning(request, warning) return redirect(reverse('user_detail', args=[request.user.id])) @@ -339,7 +342,7 @@ def oauth2_callback(request): # User has not yet been linked, so start the redirect flow with the user and DCF that will result # in us getting the callback below to finish the process: # - + logger.info("[INFO] OAuthCB p") link_callback = request.build_absolute_uri(reverse('dcf_link_callback')) callback = '{}?redirect={}'.format(DCF_GOOGLE_URL, link_callback) From d7fc358ee77ce506d03bca5169db0a8417dc8ac6 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Fri, 22 Jun 2018 17:11:20 -0700 Subject: [PATCH 16/76] Debug DCF --- accounts/dcf_views.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 3b416457..4aae67d2 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -47,7 +47,7 @@ import httplib as http_client # Shut this up unless we need to do debug of HTTP request contents -http_client.HTTPConnection.debuglevel = 1 +#http_client.HTTPConnection.debuglevel = 1 logger = logging.getLogger('main_logger') @@ -154,6 +154,7 @@ def oauth2_callback(request): logger.info("[INFO] OAuthCB c") # You MUST provide the callback *here* to get it into the fetch request dcf = OAuth2Session(client_id, state=saved_state, redirect_uri=full_callback) + logger.info("[INFO] OAuthCB c1") # You MUST provide the client_id *here* (again!) in order to get this to do basic auth! DCF will not authorize # unless we use basic auth (i.e. client ID and secret in the header, not the body). Plus we need to provide @@ -161,9 +162,14 @@ def oauth2_callback(request): # Note we also get back an "id_token" which is a base64-encoded JWT. # Note we also get back a "token_type" which had better be "Bearer". 
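# The id_token that comes back alongside the access token is a standard three-part JWT
# ("header.payload.signature", each part base64url-encoded). Picking the right verification key out of DCF's
# key list means reading the key ID from the header segment before any signature check. A minimal sketch of
# that peek (illustrative only; the function name is hypothetical, and padding is restored because base64url
# encoding strips it):

import json
from base64 import urlsafe_b64decode

def jwt_header_kid(jwt_string):
    header_b64 = jwt_string.split('.')[0]
    header_b64 += '=' * (-len(header_b64) % 4)  # restore the padding that base64url strips
    header = json.loads(urlsafe_b64decode(header_b64.encode('ascii')).decode('utf-8'))
    return header.get('kid'), header.get('alg')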
- token_data = dcf.fetch_token(DCF_TOKEN_URL, client_secret=client_secret, - client_id=client_id, - authorization_response=request.get_full_path()) + try: + token_data = dcf.fetch_token(DCF_TOKEN_URL, client_secret=client_secret, + client_id=client_id, + authorization_response=request.get_full_path()) + except Exception as e: + logger.error("[ERROR] dcf.fetch_token") + logger.exception(e) + client_secret = None # clear this in case we are in Debug mode to keep this out of the browser logger.info("[INFO] OAuthCB d") if token_data['token_type'] != 'Bearer': From 79187701d37847317c41ee794fb756f78e97b5a3 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Mon, 25 Jun 2018 13:32:02 -0700 Subject: [PATCH 17/76] Debug DCF --- accounts/dcf_views.py | 1 + 1 file changed, 1 insertion(+) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 4aae67d2..14d5f74e 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -168,6 +168,7 @@ def oauth2_callback(request): authorization_response=request.get_full_path()) except Exception as e: logger.error("[ERROR] dcf.fetch_token") + logger.error('DCF_TOKEN_URL: {} / authresp: {} / full_callback: {}'.format(DCF_TOKEN_URL, request.get_full_path(), full_callback)) logger.exception(e) client_secret = None # clear this in case we are in Debug mode to keep this out of the browser From ea35d5d6b189e022af934d1d3efe8c945b67fa35 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Mon, 25 Jun 2018 15:16:35 -0700 Subject: [PATCH 18/76] Debug DCF --- accounts/dcf_views.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 14d5f74e..49f37ac7 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -155,6 +155,7 @@ def oauth2_callback(request): # You MUST provide the callback *here* to get it into the fetch request dcf = OAuth2Session(client_id, state=saved_state, redirect_uri=full_callback) logger.info("[INFO] OAuthCB c1") + auth_response = request.build_absolute_uri(request.get_full_path()) # You MUST provide the client_id *here* (again!) in order to get this to do basic auth! DCF will not authorize # unless we use basic auth (i.e. client ID and secret in the header, not the body). 
Plus we need to provide @@ -165,10 +166,10 @@ def oauth2_callback(request): try: token_data = dcf.fetch_token(DCF_TOKEN_URL, client_secret=client_secret, client_id=client_id, - authorization_response=request.get_full_path()) + authorization_response=auth_response) except Exception as e: logger.error("[ERROR] dcf.fetch_token") - logger.error('DCF_TOKEN_URL: {} / authresp: {} / full_callback: {}'.format(DCF_TOKEN_URL, request.get_full_path(), full_callback)) + logger.error('DCF_TOKEN_URL: {} / authresp: {} / full_callback: {}'.format(DCF_TOKEN_URL, auth_response, full_callback)) logger.exception(e) client_secret = None # clear this in case we are in Debug mode to keep this out of the browser From bd88f43c90d171bd4d75ae7072b43b5be3285a43 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Mon, 25 Jun 2018 17:14:54 -0700 Subject: [PATCH 19/76] Debug DCF --- accounts/dcf_views.py | 124 ++++++++++++++++++++++-------------------- accounts/sa_utils.py | 2 +- accounts/urls.py | 1 - 3 files changed, 67 insertions(+), 60 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 49f37ac7..1c797917 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -304,45 +304,45 @@ def oauth2_callback(request): if google_link: # - # It is possible that the first time the user logged in they provided the wrong email address to DCF and + # DCF says the user has linked their Google ID. If it matches our version of the Google ID, great! We are + # done. BUT if the ID has a mismatch, we are going to drop it. It is possible that the first time the user logged in they provided the wrong email address to DCF and # then ignored us when we asked them to correct the problem. If DCF's provided Google ID does not match # ours, then they need to still provide us with the correct version before we let them use it! # Also, if a user is trying to reuse the same NIH login, we expect to get back a Google ID from DCF that # does not match the current user email. # - link_mismatch = False req_user = User.objects.get(id=request.user.id) logger.info("[INFO] OAuthCB l") if google_link != req_user.email: - message = "Please unlink ID {} and use your ISB-CGC login email ({}) to link with the DCF".format( - google_link, req_user.email) + _unlink_internals(request.user.id, False) + message = "Please use your ISB-CGC login email ({}) to link with the DCF instead of ({})".format( + req_user.email, google_link) messages.warning(request, message) - link_mismatch = True return redirect(reverse('user_detail', args=[request.user.id])) # # The link matches. So we use PATCH, and if it goes smoothly, we write the new link to the database: + # logger.info("[INFO] OAuthCB m") - if not link_mismatch: - resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='patch') - if resp.status_code == 404: - messages.warning(request, "No linked Google account found for user") - elif resp.status_code == 200: - pass - else: - messages.warning(request, "Unexpected response ({}, {}) from DCF during linking. " - "Please contact the ISB-CGC administrator.".format(resp.status_code, resp.text)) + resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='patch') + if resp.status_code == 404: + messages.warning(request, "No linked Google account found for user") + elif resp.status_code == 200: + pass + else: + messages.warning(request, "Unexpected response ({}, {}) from DCF during linking. 
" + "Please contact the ISB-CGC administrator.".format(resp.status_code, resp.text)) - logger.info("[INFO] OAuthCB n") - print 'response {}'.format(str(resp.text)) - print 'PATCH ONLY RETURNS e.g. {"exp": 1528509163}' + logger.info("[INFO] OAuthCB n") + print 'response {}'.format(str(resp.text)) + print 'PATCH ONLY RETURNS e.g. {"exp": 1528509163}' login_expiration_seconds = settings.LOGIN_EXPIRATION_MINUTES * 60 calc_expiration_time = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( seconds=login_expiration_seconds)) logger.info("[INFO] OAuthCB o") - warning = _finish_the_link(request.user.id, req_user.email, calc_expiration_time, st_logger, link_mismatch) + warning = _finish_the_link(request.user.id, req_user.email, calc_expiration_time, st_logger) messages.warning(request, warning) return redirect(reverse('user_detail', args=[request.user.id])) @@ -359,17 +359,6 @@ def oauth2_callback(request): os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '0' -@login_required -def dcf_link_redo(request): - """ - If the user needs to redo their google, link, this is what does it. - """ - - link_callback = request.build_absolute_uri(reverse('dcf_link_callback')) - callback = '{}?redirect={}'.format(DCF_GOOGLE_URL, link_callback) - return HttpResponseRedirect(callback) - - @login_required def dcf_link_callback(request): """ @@ -459,7 +448,9 @@ def dcf_link_callback(request): link_mismatch = False req_user = User.objects.get(id=request.user.id) if google_link != req_user.email: - message = "Please unlink ID {} and use your ISB-CGC login email ({}) to link with the DCF".format(google_link, req_user.email) + _unlink_internals(request.user.id, True) + message = "Please use your ISB-CGC login email ({}) to link with the DCF instead of ({})".format( + req_user.email, google_link) messages.warning(request, message) link_mismatch = True @@ -467,13 +458,13 @@ def dcf_link_callback(request): # If all is well, this is where we add the user to the NIH_User table and link the user to the various data sets. # - warning = _finish_the_link(request.user.id, google_link, calc_expiration_time, st_logger, link_mismatch) + warning = _finish_the_link(request.user.id, google_link, calc_expiration_time, st_logger) if warning: messages.warning(request, warning) return redirect(reverse('user_detail', args=[request.user.id])) -def _finish_the_link(user_id, user_email, expiration_time, st_logger, link_mismatch): +def _finish_the_link(user_id, user_email, expiration_time, st_logger): """ Regardless of how they get here, this step handles the linking of the user by adding the required database records. """ @@ -491,7 +482,7 @@ def _finish_the_link(user_id, user_email, expiration_time, st_logger, link_misma # dcf_token = DCFToken.objects.get(user_id=user_id) - if dcf_token.google_id is not None and dcf_token.google_id != user_email and not link_mismatch: + if dcf_token.google_id is not None and dcf_token.google_id != user_email: return 'Unexpected internal error detected during linking: email/ID mismatch. 
' \ 'Please report this to the ISB-CGC administrator' @@ -499,9 +490,6 @@ def _finish_the_link(user_id, user_email, expiration_time, st_logger, link_misma dcf_token.user_token = the_user_token dcf_token.save() - if link_mismatch: - return - the_user_dict = _user_data_token_to_user_dict(the_user_token) nih_user, warning = handle_user_db_update_for_dcf_linking(user_id, the_user_dict, nih_assertion_expiration, st_logger) @@ -922,50 +910,69 @@ def dcf_unlink(request): # link # with the DCF + success, warnings, errors = _unlink_internals(request.user.id, False) + if not success: + for warning in warnings: + messages.warning(request, warning) + for error in errors: + messages.error(request, error) + return redirect(reverse('user_detail', args=[request.user.id])) +def _unlink_internals(user_id, just_with_dcf): + """ + Handles all the internal details of unlinking a user's Google ID. + """ + warnings = [] + errors = [] + success = False - # DO NOT UNLINK IF NOT CURRENTLY LINKED - - dcf_token = DCFToken.objects.get(user_id=request.user.id) - the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) - - google_link = _get_google_link_from_user_dict(the_user_dict) + # + # Get our concept of linking state from the token DB: + # + if not just_with_dcf: + dcf_token = DCFToken.objects.get(user_id=user_id) + the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) + google_link = _get_google_link_from_user_dict(the_user_dict) - if google_link is None: - messages.warning(request, "User is not linked to Google ") # redirect to user detail page - return redirect(reverse('user_detail', args=[request.user.id])) + if google_link is None: + warnings.append("User is not linked to Google") + return success, warnings, errors # # First, call DCF to drop the linkage. This is the only way to get the user # booted out of control groups. # - resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='delete') + + resp = _dcf_call(DCF_GOOGLE_URL, user_id, mode='delete') if resp.status_code == 404: - messages.warning(request, "No linked Google account found for user") + warnings.append("No linked Google account found for user") elif resp.status_code == 400: delete_response = json_loads(resp.text) error = delete_response['error'] message = delete_response['error_description'] - messages.error(request, "Error in unlinking: {} : {}".format(error, message)) + errors.append("Error in unlinking: {} : {}".format(error, message)) elif resp.status_code == 200: - pass + success = True else: - messages.warning(request, "Unexpected response from DCF") + warnings.append("Unexpected response from DCF") + + if just_with_dcf: + return success, warnings, errors # # Per discussions with DCF, need to ask for a new token from DCF after doing the unlinking # since they care about the token info: # - _refresh_access_token(request.user.id) + _refresh_access_token(user_id) # # The Token table records the User's Google ID. This needs to be nulled. 
The expiration time in the DCFToken - # is for the access token, not the google link (that info is stored in the NIH_user: + # is for the access token, not the google link (that info is stored in the NIH_user): # - dcf_token = DCFToken.objects.get(user_id=request.user.id) + dcf_token = DCFToken.objects.get(user_id=user_id) dcf_token.google_id = None dcf_token.save() @@ -975,17 +982,18 @@ def dcf_unlink(request): # try: - message = unlink_account_in_db_for_dcf(request.user.id) + message = unlink_account_in_db_for_dcf(user_id) if message: - messages.error(request, message) + errors.append(message) + success = False except Exception as e: logger.error("[ERROR] While unlinking accounts:") logger.exception(e) - messages.error(request, 'There was an error when attempting to unlink your NIH user account - please contact the administrator.') + errors.append('There was an error when attempting to unlink your Google ID - please contact the administrator.') + success = False - # redirect to user detail page - return redirect(reverse('user_detail', args=[request.user.id])) + return success, warnings, errors def _refresh_token_storage(token_dict, decoded_jwt, user_token, nih_username_from_dcf, dcf_uid, cgc_uid, google_id): diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index f318eebe..7785bcb6 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -1426,7 +1426,7 @@ def get_nih_user_details(user_id): user_details['dbGaP_has_datasets'] = (len(user_auth_datasets) > 0) user_details['dbGaP_authorized'] = (len(user_auth_datasets) > 0) and nih_user.active logger.debug("[DEBUG] User {} has access to {} dataset(s) and is {}".format(nih_user.NIH_username, str(len(user_auth_datasets)), ('not active' if not nih_user.active else 'active'))) - user_details['link_mismatch'] = (dcf_token.google_id is not None) and (dcf_token.google_id != curr_user.email) + user_details['link_mismatch'] = (dcf_token.google_id is None) or (dcf_token.google_id != curr_user.email) user_details['NIH_active'] = nih_user.active user_details['NIH_DCF_linked'] = nih_user.linked user_details['refresh_key_ok'] = get_dcf_auth_key_remaining_seconds(user_id) > settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS diff --git a/accounts/urls.py b/accounts/urls.py index bd46521d..94c11586 100755 --- a/accounts/urls.py +++ b/accounts/urls.py @@ -31,7 +31,6 @@ url(r'^dcf/login/callback/$', dcf_views.oauth2_callback, name='dcf_callback'), url(r'^dcf_link_callback/$', dcf_views.dcf_link_callback, name='dcf_link_callback'), url(r'^dcf_link_extend/$', dcf_views.dcf_link_extend, name='dcf_link_extend'), - url(r'^dcf_link_redo/$', dcf_views.dcf_link_redo, name='dcf_link_redo'), url(r'^dcf_disconnect_user/$', dcf_views.dcf_disconnect_user, name='dcf_disconnect_user'), url(r'^dcf_user_data/$', dcf_views.dcf_get_user_data, name='dcf_get_user_data'), url(r'^dcf_unlink/$', dcf_views.dcf_unlink, name='dcf_unlink'), From 232e56640fe7426b71c600c04911b211a1681628 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Tue, 26 Jun 2018 10:49:55 -0700 Subject: [PATCH 20/76] Debug DCF --- accounts/dcf_views.py | 139 ++++++++++++++++++++++++------------------ accounts/urls.py | 1 + 2 files changed, 80 insertions(+), 60 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 1c797917..bffea619 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -358,6 +358,15 @@ def oauth2_callback(request): finally: os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '0' +@login_required +def dcf_link_redo(request): + """ + If the user needs to redo their 
google, link, this is what does it. + """ + + link_callback = request.build_absolute_uri(reverse('dcf_link_callback')) + callback = '{}?redirect={}'.format(DCF_GOOGLE_URL, link_callback) + return HttpResponseRedirect(callback) @login_required def dcf_link_callback(request): @@ -445,14 +454,13 @@ def dcf_link_callback(request): 'to the ISB-CGC administrator') return redirect(reverse('user_detail', args=[request.user.id])) - link_mismatch = False req_user = User.objects.get(id=request.user.id) if google_link != req_user.email: _unlink_internals(request.user.id, True) message = "Please use your ISB-CGC login email ({}) to link with the DCF instead of ({})".format( req_user.email, google_link) messages.warning(request, message) - link_mismatch = True + return redirect(reverse('user_detail', args=[request.user.id])) # # If all is well, this is where we add the user to the NIH_User table and link the user to the various data sets. @@ -1135,72 +1143,83 @@ def dcf_disconnect_user(request): # First thing ya gotta do is tell DCF to unlink the user, which will get them out of # access control groups. BUT ONLY IF THEY ARE ACTUALLY CURRENTLY LINKED! - msg_list = [] + try: + msg_list = [] - dcf_token = DCFToken.objects.get(user_id=request.user.id) - the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) + dcf_token = DCFToken.objects.get(user_id=request.user.id) + the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) - print the_user_dict, type(the_user_dict) - google_link = _get_google_link_from_user_dict(the_user_dict) + print the_user_dict, type(the_user_dict) + google_link = _get_google_link_from_user_dict(the_user_dict) - if google_link: - resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='delete') - if resp.status_code == 404: - msg_list.append("No linked Google account found for user, code {}".format(resp.status_code)) - elif resp.status_code == 400: - delete_response = json_loads(resp.text) - error = delete_response['error'] - message = delete_response['error_description'] - msg_list.append("Error in unlinking: {} : {} : {}".format(error, message, resp.status_code)) - elif resp.status_code == 200: - pass - else: - msg_list.append(request, "Unexpected response from DCF {}".format(resp.status_code)) + logger.info("[INFO] DDU A") - # - # The revoke call is unlike other DCF endpoints in that it is special! - # Token revocation is described here: https://tools.ietf.org/html/rfc7009#section-2.1 - # So we do not provide a bearer access token, but the client ID and secret in a Basic Auth - # framework. Not seeing that inside the OAuthSession framework, so we roll our own by hand: - # + if google_link: + resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='delete') + if resp.status_code == 404: + msg_list.append("No linked Google account found for user, code {}".format(resp.status_code)) + elif resp.status_code == 400: + delete_response = json_loads(resp.text) + error = delete_response['error'] + message = delete_response['error_description'] + msg_list.append("Error in unlinking: {} : {} : {}".format(error, message, resp.status_code)) + elif resp.status_code == 200: + pass + else: + msg_list.append(request, "Unexpected response from DCF {}".format(resp.status_code)) - client_id, client_secret = _get_secrets() + # + # The revoke call is unlike other DCF endpoints in that it is special! 
+ # Token revocation is described here: https://tools.ietf.org/html/rfc7009#section-2.1 + # So we do not provide a bearer access token, but the client ID and secret in a Basic Auth + # framework. Not seeing that inside the OAuthSession framework, so we roll our own by hand: + # - data = { - 'token': dcf_token.refresh_token - } - auth = requests.auth.HTTPBasicAuth(client_id, client_secret) - resp = requests.request('POST', DCF_REVOKE_URL, data=data, auth=auth) - client_id = None - client_secret = None + client_id, client_secret = _get_secrets() - if resp.status_code != 200 and resp.status_code != 204: - messages.warning(request, 'Revocation problem: {} : {}'.format(resp.status_code, resp.text)) + data = { + 'token': dcf_token.refresh_token + } + logger.info("[INFO] DDU B") - for msg in msg_list: - messages.warning(request, msg) + auth = requests.auth.HTTPBasicAuth(client_id, client_secret) + resp = requests.request('POST', DCF_REVOKE_URL, data=data, auth=auth) + client_id = None + client_secret = None - # - # OK, NOW we detach the user in our NIH tables, and detach the user from data permissions. - # + logger.info("[INFO] DDU C") - unlink_account_in_db_for_dcf(request.user.id) + if resp.status_code != 200 and resp.status_code != 204: + messages.warning(request, 'Revocation problem: {} : {}'.format(resp.status_code, resp.text)) - # - # Next, we clear out our tokens for the user (which allows them to appear to DCF as the - # logged-in NIH user; we cannot keep them around: - # + for msg in msg_list: + messages.warning(request, msg) - dcf_token.delete() + # + # OK, NOW we detach the user in our NIH tables, and detach the user from data permissions. + # - # - # Finally, we need to send the user to logout from the DCF, which is needed to clear the - # cookies DCF has dumped into their browser, which will allow them to log in to NIH again. - # + unlink_account_in_db_for_dcf(request.user.id) - logout_callback = request.build_absolute_uri(reverse('user_detail', args=[request.user.id])) + # + # Next, we clear out our tokens for the user (which allows them to appear to DCF as the + # logged-in NIH user; we cannot keep them around: + # + + dcf_token.delete() + + # + # Finally, we need to send the user to logout from the DCF, which is needed to clear the + # cookies DCF has dumped into their browser, which will allow them to log in to NIH again. 
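# Sending the user to the DCF logout endpoint is just a matter of appending the post-logout destination as a
# 'next' query parameter. A small sketch (illustrative only; the URLs below are placeholders, and the callback
# is URL-encoded on the assumption that it may contain reserved characters such as ':' and '/'):

try:
    from urllib.parse import urlencode  # Python 3
except ImportError:
    from urllib import urlencode  # Python 2, which this codebase targets

def build_dcf_logout_url(dcf_logout_url, post_logout_callback):
    return '{}?{}'.format(dcf_logout_url, urlencode({'next': post_logout_callback}))

# e.g. build_dcf_logout_url('https://dcf.example.org/logout', 'https://localhost:8100/users/5/')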
+ # - callback = '{}?next={}'.format(DCF_LOGOUT_URL, logout_callback) + logout_callback = request.build_absolute_uri(reverse('user_detail', args=[request.user.id])) + logger.info("[INFO] DDU D") + callback = '{}?next={}'.format(DCF_LOGOUT_URL, logout_callback) + except Exception as e: + logger.error("[ERROR] While disconnect:") + logger.exception(e) + raise e return HttpResponseRedirect(callback) @@ -1227,14 +1246,14 @@ def dcf_get_user_data(request): Use for QC and development """ - return _dcf_user_data_from_token(request) + udft = _dcf_user_data_from_token(request) - # resp = _dcf_call(DCF_USER_URL, request.user.id) - # user_data = json_loads(resp.text) - # - # remaining_token_time = get_dcf_auth_key_remaining_seconds(request.user.id) - # messages.warning(request, 'TDCF Responded with {}: {}'.format(user_data, remaining_token_time)) - # return redirect(reverse('user_detail', args=[request.user.id])) + resp = _dcf_call(DCF_USER_URL, request.user.id) + user_data = json_loads(resp.text) + + remaining_token_time = get_dcf_auth_key_remaining_seconds(request.user.id) + messages.warning(request, 'EPDCF Responded with {}: {} plus {}'.format(user_data, remaining_token_time, udft)) + return redirect(reverse('user_detail', args=[request.user.id])) def _dcf_call(full_url, user_id, mode='get', post_body=None, force_token=False): diff --git a/accounts/urls.py b/accounts/urls.py index 94c11586..bd46521d 100755 --- a/accounts/urls.py +++ b/accounts/urls.py @@ -31,6 +31,7 @@ url(r'^dcf/login/callback/$', dcf_views.oauth2_callback, name='dcf_callback'), url(r'^dcf_link_callback/$', dcf_views.dcf_link_callback, name='dcf_link_callback'), url(r'^dcf_link_extend/$', dcf_views.dcf_link_extend, name='dcf_link_extend'), + url(r'^dcf_link_redo/$', dcf_views.dcf_link_redo, name='dcf_link_redo'), url(r'^dcf_disconnect_user/$', dcf_views.dcf_disconnect_user, name='dcf_disconnect_user'), url(r'^dcf_user_data/$', dcf_views.dcf_get_user_data, name='dcf_get_user_data'), url(r'^dcf_unlink/$', dcf_views.dcf_unlink, name='dcf_unlink'), From e4adfc1ec19b26d0080bee707af02c593a416331 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Tue, 26 Jun 2018 16:21:56 -0700 Subject: [PATCH 21/76] Debug DCF --- accounts/dcf_views.py | 96 +++++++++++++++++++------------------------ accounts/urls.py | 1 - 2 files changed, 43 insertions(+), 54 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index bffea619..169ee0b9 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -335,14 +335,11 @@ def oauth2_callback(request): "Please contact the ISB-CGC administrator.".format(resp.status_code, resp.text)) logger.info("[INFO] OAuthCB n") - print 'response {}'.format(str(resp.text)) - print 'PATCH ONLY RETURNS e.g. 
{"exp": 1528509163}' + returned_expiration_str = json_loads(resp.text)['exp'] + use_expiration_time = _calc_expiration_time(returned_expiration_str) - login_expiration_seconds = settings.LOGIN_EXPIRATION_MINUTES * 60 - calc_expiration_time = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( - seconds=login_expiration_seconds)) logger.info("[INFO] OAuthCB o") - warning = _finish_the_link(request.user.id, req_user.email, calc_expiration_time, st_logger) + warning = _finish_the_link(request.user.id, req_user.email, use_expiration_time, st_logger) messages.warning(request, warning) return redirect(reverse('user_detail', args=[request.user.id])) @@ -407,24 +404,7 @@ def dcf_link_callback(request): returned_expiration_str = request.GET.get('exp', None) returned_google_link = request.GET.get('linked_email', None) - returned_expiration_time = None - if returned_expiration_str: - exp_secs = float(returned_expiration_str) - returned_expiration_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(exp_secs)) - - login_expiration_seconds = settings.LOGIN_EXPIRATION_MINUTES * 60 - calc_expiration_time = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( - seconds=login_expiration_seconds)) - if returned_expiration_time: - diff = returned_expiration_time - calc_expiration_time - secs = abs((diff.days * (3600 * 24)) + diff.seconds) - if secs > 30: - logger.error("WARNING: DCF RETURNED TIME SKEW OF {} SECONDS".format(secs)) - else: - logger.info("DCF expiration skew was {} seconds".format(secs)) - calc_expiration_time = returned_expiration_time - else: - logger.error("No expiration time provided by DCF") + use_expiration_time = _calc_expiration_time(returned_expiration_str) # # At this point, we need to wrestle with the possible problem that the user has linked @@ -466,12 +446,36 @@ def dcf_link_callback(request): # If all is well, this is where we add the user to the NIH_User table and link the user to the various data sets. # - warning = _finish_the_link(request.user.id, google_link, calc_expiration_time, st_logger) + warning = _finish_the_link(request.user.id, google_link, use_expiration_time, st_logger) if warning: messages.warning(request, warning) return redirect(reverse('user_detail', args=[request.user.id])) +def _calc_expiration_time(returned_expiration_str): + + returned_expiration_time = None + if returned_expiration_str: + exp_secs = float(returned_expiration_str) + returned_expiration_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(exp_secs)) + + login_expiration_seconds = settings.DCF_LOGIN_EXPIRATION_SECONDS + calc_expiration_time = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( + seconds=login_expiration_seconds)) + if returned_expiration_time: + diff = returned_expiration_time - calc_expiration_time + secs = abs((diff.days * (3600 * 24)) + diff.seconds) + if secs > 30: + logger.error("WARNING: DCF RETURNED TIME SKEW OF {} SECONDS".format(secs)) + else: + logger.info("DCF expiration skew was {} seconds".format(secs)) + calc_expiration_time = returned_expiration_time + else: + logger.error("No expiration time provided by DCF") + + return calc_expiration_time + + def _finish_the_link(user_id, user_email, expiration_time, st_logger): """ Regardless of how they get here, this step handles the linking of the user by adding the required database records. @@ -876,20 +880,14 @@ def dcf_link_extend(request): messages.warning(request, "Unexpected response ({}) from DCF during linking. 
" "Please contact the ISB-CGC administrator.".format(resp.status_code)) - print resp.text - print 'PATCH ONLY RETURNS e.g. {"exp": 1528509163}' - print "NO! TIME TO USE THE EXPIRATION" - - # Until we get back user expiration time, we calculate it: - login_expiration_seconds = settings.LOGIN_EXPIRATION_MINUTES * 60 - nih_assertion_expiration = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( - seconds=login_expiration_seconds)) + returned_expiration_str = json_loads(resp.text)['exp'] + use_expiration_time = _calc_expiration_time(returned_expiration_str) # User data set permissions might have changed, so we call and find out what they are: user_data_token_string = _get_user_data_token_string(request.user.id) user_data_dict = _user_data_token_to_user_dict(user_data_token_string) - _, warning = handle_user_db_update_for_dcf_linking(request.user.id, user_data_dict, nih_assertion_expiration, st_logger) + _, warning = handle_user_db_update_for_dcf_linking(request.user.id, user_data_dict, use_expiration_time, st_logger) if warning: messages.warning(request, warning) @@ -1114,24 +1112,6 @@ def _decode_token(token): return _decode_token_chunk(token, 1) -@login_required -def test_the_dcf(request): - """ - Use this to test that we can call the DCF and get back useful info. Also, use as a template for doing all - DCF calls - """ - file_uuid = 'ffcc4f7d-471a-4ad0-b199-53d992217986' - resp = _dcf_call('{}/{}'.format(DCF_URL_URL, file_uuid), request.user.id) - result = { - 'uri': resp.text, - 'code': resp.status_code - } - messages.warning(request, 'TDCF Responded with {}: {}'.format(resp.status_code, resp.text)) - - # redirect to user detail page - return redirect(reverse('user_detail', args=[request.user.id])) - - @login_required def dcf_disconnect_user(request): """ @@ -1145,8 +1125,18 @@ def dcf_disconnect_user(request): try: msg_list = [] + # + # If user is sitting on this page in one browser, and logs out via another, we would have + # no DCF token anymore. 
Catch that case and silently no-op: + # + dcf_tokens = DCFToken.objects.filter(user_id=request.user.id) + num_tokens = len(dcf_tokens) + if num_tokens != 1: + if num_tokens > 1: + messages.warning(request, 'Unexpected Server Error: Multiple tokens found') + return redirect(reverse('user_detail', args=[request.user.id])) + dcf_token = dcf_tokens.first() - dcf_token = DCFToken.objects.get(user_id=request.user.id) the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) print the_user_dict, type(the_user_dict) diff --git a/accounts/urls.py b/accounts/urls.py index bd46521d..55e6af22 100755 --- a/accounts/urls.py +++ b/accounts/urls.py @@ -36,7 +36,6 @@ url(r'^dcf_user_data/$', dcf_views.dcf_get_user_data, name='dcf_get_user_data'), url(r'^dcf_unlink/$', dcf_views.dcf_unlink, name='dcf_unlink'), url(r'^dcf_login/$', dcf_views.oauth2_login, name='dcf_login'), - url(r'^dcf/test', dcf_views.test_the_dcf, name='dcf_test'), url(r'^unlink_accounts/', views.unlink_accounts, name='unlink_accounts'), From e44ff38bb86cdab02c26da90b8477d13325e3300 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Wed, 27 Jun 2018 15:05:18 -0700 Subject: [PATCH 22/76] Debug DCF --- accounts/dcf_views.py | 23 ++++- accounts/sa_utils.py | 210 ++++++++++++++++++++++++------------------ accounts/urls.py | 1 + 3 files changed, 142 insertions(+), 92 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 169ee0b9..1284c187 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -100,6 +100,18 @@ def oauth2_login(request): # ruri = wac.prepare_request_uri(DCF_AUTH_URL, redirect_uri=full_callback, state=rando, scope=['openid', 'user']) # return HttpResponseRedirect(ruri) +@login_required +def dcf_simple_logout(request): + ''' + If the user is trying to login with an NIH idea already in use by somebody else, or if they are already linked + with a different NIH ID, we immediately reject the response from DCF and tell the user they need to logout to + try again. This involves simply sending them back to DCF; the user's DCF session cookies do the rest to let + DCF know who they are. Note we also clear the session key we are using to record the error. + ''' + request.session.pop('dcfForcedLogout', None) + logout_callback = request.build_absolute_uri(reverse('user_detail', args=[request.user.id])) + callback = '{}?next={}'.format(DCF_LOGOUT_URL, logout_callback) + return HttpResponseRedirect(callback) @login_required def oauth2_callback(request): @@ -271,6 +283,7 @@ def oauth2_callback(request): google_link = _get_google_link_from_user_dict(user_data_dict) logger.info("[INFO] OAuthCB i") + # We now have the NIH User ID back from DCF; we also might now know the Google ID they have linked to previously # (it comes back in the user_id). Note that this routine is going to get called every 30 days or so when we # need to get a new refresh token, so it is possible that e.g. the first time they logged in as their PI and @@ -281,11 +294,19 @@ def oauth2_callback(request): results = DemoLoginResults() st_logger = StackDriverLogger.build_from_django_settings() user_email = User.objects.get(id=request.user.id).email - # FIXME This old test is not what we really want to use... + # + # Looks for cases where we have another user with this NIH ID, or that this user is currently linked + # with another ID. 
If either case is true, we tell the user they will need to logout of DCF and try + # again; note we use a session key to remember this fact and will use it to generate the user data + # that will configure the user_detail page: + # if found_linking_problems(nih_from_dcf, request.user.id, user_email, st_logger, results): for warn in results.messages: messages.warning(request, warn) + # stash the requirement to only show a logout link in the session! + request.session['dcfForcedLogout'] = nih_from_dcf return redirect(reverse('user_detail', args=[request.user.id])) + logger.info("[INFO] OAuthCB j") # # We now have the minimum we need to store the tokens from DCF, so stick that in the database. We DO NOT yet diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index 1fbfbd08..f3d092e2 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -828,31 +828,38 @@ def __repr_(self): def found_linking_problems(NIH_username, user_id, user_email, my_st_logger, results): # 1. check if this google identity is currently linked to other NIH usernames # note: the NIH username exclusion is case-insensitive so this will not return a false positive - # e.g. if this google identity is linked to 'NIHUSERNAME1' but just authenticated with 'nihusername1', + # e.g. if this user identity is linked to 'NIHUSERNAME1' but just authenticated with 'nihusername1', # it will still pass this test - nih_usernames_already_linked_to_this_google_identity = NIH_User.objects.filter( + nih_usernames_already_linked_to_this_user_identity = NIH_User.objects.filter( user_id=user_id, linked=True).exclude(NIH_username__iexact=NIH_username) - for nih_user in nih_usernames_already_linked_to_this_google_identity: + for nih_user in nih_usernames_already_linked_to_this_user_identity: if nih_user.NIH_username.lower() != NIH_username.lower(): + existing_nih_user_name = nih_user.NIH_username logger.warn( "User {} is already linked to the eRA commons identity {} and attempted authentication" " with the eRA commons identity {}." - .format(user_email, nih_user.NIH_username, NIH_username)) + .format(user_email, existing_nih_user_name, NIH_username)) my_st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, "[STATUS] {}".format( "User {} is already linked to the eRA commons identity {} and attempted authentication" " with the eRA commons identity {}." - .format(user_email, nih_user.NIH_username, NIH_username))) + .format(user_email, existing_nih_user_name, NIH_username))) - results.messages.append("User {} is already linked to the eRA commons identity {}. " - "Please unlink these before authenticating with the eRA commons " - "identity {}.".format(user_email, nih_user.NIH_username, - NIH_username)) + if settings.DCF_TEST: + user_message = "User {} is already linked to the eRA commons identity {}. " \ + "Please log out of the Data Commons now using the link below, then " \ + "click the link to disconnect from {} before trying to log in " \ + "using {}".format(user_email, existing_nih_user_name, existing_nih_user_name, NIH_username) + else: + user_message = "User {} is already linked to the eRA commons identity {}. " \ + "Please unlink these before authenticating with the eRA commons " \ + "identity {}.".format(user_email, existing_nih_user_name, NIH_username) + results.messages.append(user_message) return True # 2. check if there are other google identities that are still linked to this NIH_username # note: the NIH username match is case-insensitive so this will not return a false negative. - # e.g. 
if a different google identity is linked to 'NIHUSERNAME1' and this google identity just authenticated with 'nihusername1', - # this will fail the test and return to the /users/ url with a warning message + # e.g. if a different user identity is linked to 'NIHUSERNAME1' and this user identity just authenticated with 'nihusername1', + # this will fail the test preexisting_nih_users = NIH_User.objects.filter( NIH_username__iexact=NIH_username, linked=True).exclude(user_id=user_id) @@ -875,10 +882,12 @@ def found_linking_problems(NIH_username, user_id, user_email, my_st_logger, resu NIH_username, prelinked_user_emails + '.' )) - - results.messages.append( - "You tried to link your email address to NIH account {}, but it is already linked to {}.".format( - NIH_username, prelinked_user_emails)) + if settings.DCF_TEST: + user_message = "You tried to link your email address to NIH account {}, but it is already linked to {}. " \ + "Please log out of the Data Commons now using the link below, then try again." + else: + user_message = "You tried to link your email address to NIH account {}, but it is already linked to {}." + results.messages.append(user_message.format(NIH_username, prelinked_user_emails)) return True return False @@ -1333,91 +1342,107 @@ def deactivate_nih_add_to_open(user_id, user_email): logger.info(e) -def get_nih_user_details(user_id): +def get_nih_user_details(user_id, force_logout): user_details = {} - # - # Now with DCF, we can have a user logged in as an NIH user, but not be linked (which means DCF does not - # have an association between NIH ID and Google ID). So while we previously did a get on a linked user, - # now we need to filter. If one of the users is linked, that is who we use. Otherwise, we can resolve the - # issue by looking at the current DCF token attached to the user to see who they are associated with. - # + if settings.DCF_TEST: + + if force_logout: + user_details['force_DCF_logout'] = True + user_details['NIH_username'] = force_logout + return user_details + + # + # Now with DCF, we can have a user logged in as an NIH user, but not be linked (which means DCF does not + # have an association between NIH ID and Google ID). So while we previously did a get on a linked user, + # now we need to filter. If one of the users is linked, that is who we use. Otherwise, we can resolve the + # issue by looking at the current DCF token attached to the user to see who they are associated with. + # - dcf_tokens = DCFToken.objects.filter(user_id=user_id) - if len(dcf_tokens) == 0: - return user_details # i.e. empty dict - elif len(dcf_tokens) > 1: - logger.error("[ERROR] MULTIPLE DCF RECORDS FOR USER {}. ".format(str(user_id))) - return user_details # i.e. empty dict + dcf_tokens = DCFToken.objects.filter(user_id=user_id) + if len(dcf_tokens) == 0: + return user_details # i.e. empty dict + elif len(dcf_tokens) > 1: + logger.error("[ERROR] MULTIPLE DCF RECORDS FOR USER {}. ".format(str(user_id))) + return user_details # i.e. empty dict - dcf_token = dcf_tokens.first() + dcf_token = dcf_tokens.first() - curr_user = User.objects.get(id=user_id) - nih_users = NIH_User.objects.filter(user_id=user_id, NIH_username__iexact=dcf_token.nih_username) + curr_user = User.objects.get(id=user_id) - if len(nih_users) == 0: - user_details['link_mismatch'] = (dcf_token.google_id is not None) and (dcf_token.google_id != curr_user.email) - return user_details # i.e. 
empty dict - elif len(nih_users) == 1: - nih_user = nih_users.first() + nih_users = NIH_User.objects.filter(user_id=user_id, NIH_username__iexact=dcf_token.nih_username) - else: - # - # Multiple NIH user rows for the current user for the same nih_username. We want the one that is linked. - # If more than one (is that possible??) take the one with the most recent usage. If nobody is linked, - # again take the one with the most recent usage. Some of these cases should not be possible (?) but - # trying to be bombproof here: - # - nih_user = None - freshest_linked = None - freshest_linked_stamp = None - freshest_unlinked = None - freshest_unlinked_stamp = None - for user in nih_users: - if user.linked: - if (freshest_linked_stamp is None) or (freshest_linked_stamp < user.NIH_assertion_expiration): - freshest_linked_stamp = user.NIH_assertion_expiration - freshest_linked = user - if nih_user is None: - nih_user = nih_users.first() - else: - logger.error("[ERROR] Multiple linked nih users retrieved nih_user with user_id {}.".format(user_id)) - else: - if (freshest_unlinked_stamp is None) or (freshest_unlinked_stamp < user.NIH_assertion_expiration): - freshest_unlinked_stamp = user.NIH_assertion_expiration - freshest_unlinked = user - - if freshest_linked: - nih_user = freshest_linked - elif freshest_unlinked: - nih_user = freshest_unlinked - else: - logger.error("[ERROR] Unexpected lack of nih_user for {}.".format(user_id)) + if len(nih_users) == 0: user_details['link_mismatch'] = (dcf_token.google_id is not None) and (dcf_token.google_id != curr_user.email) return user_details # i.e. empty dict - # - # With the user_details page, we now need to check with DCF about current status before we display information - # to the user, as our database view could be stale. - # - # Step 1: If the expiration time has passed for the user and they are still tagged as active, we clear that - # flag. This is the *minimun* we chould be doing, no matter what. Note that in DCF-based Brave New World, we no - # longer need to have a cron job doing this, as we don't actually need to do anything at 24 hours. We just - # need to give the user an accurate picture of the state when they hit this page. - # + elif len(nih_users) == 1: + nih_user = nih_users.first() - if nih_user.active: - expired_time = nih_user.NIH_assertion_expiration - # If we need to have the access expire in just a few minutes for testing, this is one way to fake it: - # testing_expire_hack = datetime.timedelta(minutes=-((60 * 23) + 55)) - # expired_time = expired_time + testing_expire_hack - now_time = pytz.utc.localize(datetime.datetime.utcnow()) - print "times", expired_time, now_time - if now_time >= expired_time: - nih_user.active = False - nih_user.NIH_assertion_expiration = now_time - nih_user.save() + else: + # + # Multiple NIH user rows for the current user for the same nih_username. We want the one that is linked. + # If more than one (is that possible??) take the one with the most recent usage. If nobody is linked, + # again take the one with the most recent usage. Some of these cases should not be possible (?) 
but + # trying to be bombproof here: + # + nih_user = None + freshest_linked = None + freshest_linked_stamp = None + freshest_unlinked = None + freshest_unlinked_stamp = None + for user in nih_users: + if user.linked: + if (freshest_linked_stamp is None) or (freshest_linked_stamp < user.NIH_assertion_expiration): + freshest_linked_stamp = user.NIH_assertion_expiration + freshest_linked = user + if nih_user is None: + nih_user = nih_users.first() + else: + logger.error("[ERROR] Multiple linked nih users retrieved nih_user with user_id {}.".format(user_id)) + else: + if (freshest_unlinked_stamp is None) or (freshest_unlinked_stamp < user.NIH_assertion_expiration): + freshest_unlinked_stamp = user.NIH_assertion_expiration + freshest_unlinked = user + + if freshest_linked: + nih_user = freshest_linked + elif freshest_unlinked: + nih_user = freshest_unlinked + else: + logger.error("[ERROR] Unexpected lack of nih_user for {}.".format(user_id)) + user_details['link_mismatch'] = (dcf_token.google_id is not None) and (dcf_token.google_id != curr_user.email) + return user_details # i.e. empty dict + + # + # With the user_details page, we now need to check with DCF about current status before we display information + # to the user, as our database view could be stale. + # + # Step 1: If the expiration time has passed for the user and they are still tagged as active, we clear that + # flag. This is the *minimun* we chould be doing, no matter what. Note that in DCF-based Brave New World, we no + # longer need to have a cron job doing this, as we don't actually need to do anything at 24 hours. We just + # need to give the user an accurate picture of the state when they hit this page. + # + + if nih_user.active: + expired_time = nih_user.NIH_assertion_expiration + # If we need to have the access expire in just a few minutes for testing, this is one way to fake it: + # testing_expire_hack = datetime.timedelta(minutes=-((60 * 23) + 55)) + # expired_time = expired_time + testing_expire_hack + now_time = pytz.utc.localize(datetime.datetime.utcnow()) + print "times", expired_time, now_time + if now_time >= expired_time: + nih_user.active = False + nih_user.NIH_assertion_expiration = now_time + nih_user.save() + else: + try: + nih_user = NIH_User.objects.get(user_id=user_id, linked=True) + except (MultipleObjectsReturned, ObjectDoesNotExist), e: + if type(e) is MultipleObjectsReturned: + logger.warn("Error when retrieving noh_user with user_id {}. 
{}".format(str(user_id), str(e))) + return user_details user_auth_datasets = UserAuthorizedDatasets.objects.filter(nih_user=nih_user) user_details['NIH_username'] = nih_user.NIH_username @@ -1426,12 +1451,15 @@ def get_nih_user_details(user_id): user_details['dbGaP_has_datasets'] = (len(user_auth_datasets) > 0) user_details['dbGaP_authorized'] = (len(user_auth_datasets) > 0) and nih_user.active logger.debug("[DEBUG] User {} has access to {} dataset(s) and is {}".format(nih_user.NIH_username, str(len(user_auth_datasets)), ('not active' if not nih_user.active else 'active'))) - user_details['link_mismatch'] = (dcf_token.google_id is None) or (dcf_token.google_id != curr_user.email) user_details['NIH_active'] = nih_user.active - user_details['NIH_DCF_linked'] = nih_user.linked - user_details['refresh_key_ok'] = get_dcf_auth_key_remaining_seconds(user_id) > settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS user_details['auth_datasets'] = [] if len(user_auth_datasets) <= 0 else AuthorizedDataset.objects.filter(id__in=user_auth_datasets.values_list('authorized_dataset',flat=True)) + if settings.DCF_TEST: + user_details['link_mismatch'] = (dcf_token.google_id is None) or (dcf_token.google_id != curr_user.email) + user_details['NIH_DCF_linked'] = nih_user.linked + user_details['refresh_key_ok'] = get_dcf_auth_key_remaining_seconds(user_id) > settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS + user_details['force_DCF_logout'] = False + return user_details diff --git a/accounts/urls.py b/accounts/urls.py index 55e6af22..b7ba3fdb 100755 --- a/accounts/urls.py +++ b/accounts/urls.py @@ -36,6 +36,7 @@ url(r'^dcf_user_data/$', dcf_views.dcf_get_user_data, name='dcf_get_user_data'), url(r'^dcf_unlink/$', dcf_views.dcf_unlink, name='dcf_unlink'), url(r'^dcf_login/$', dcf_views.oauth2_login, name='dcf_login'), + url(r'^dcf_simple_logout/$', dcf_views.dcf_simple_logout, name='dcf_simple_logout'), url(r'^unlink_accounts/', views.unlink_accounts, name='unlink_accounts'), From 68dc661d01e7c4d5ed6664d1b469be8cd32bf23a Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Wed, 27 Jun 2018 19:10:11 -0700 Subject: [PATCH 23/76] Debug DCF --- accounts/dcf_views.py | 636 ++++++++++++++++++++++-------------------- accounts/sa_utils.py | 93 +++--- accounts/urls.py | 15 +- 3 files changed, 377 insertions(+), 367 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 1284c187..e06d3aa6 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -33,7 +33,8 @@ from sa_utils import found_linking_problems, DemoLoginResults, handle_user_for_dataset,\ handle_user_db_update_for_dcf_linking, \ - unlink_account_in_db_for_dcf, get_dcf_auth_key_remaining_seconds + unlink_account_in_db_for_dcf, get_dcf_auth_key_remaining_seconds, \ + get_stored_dcf_token from models import DCFToken, AuthorizedDataset, NIH_User, UserAuthorizedDatasets from requests_oauthlib.oauth2_session import OAuth2Session @@ -44,9 +45,8 @@ from dataset_utils.dataset_access_support_factory import DatasetAccessSupportFactory from dataset_utils.dataset_config import DatasetGoogleGroupPair -import httplib as http_client - # Shut this up unless we need to do debug of HTTP request contents +#import httplib as http_client #http_client.HTTPConnection.debuglevel = 1 logger = logging.getLogger('main_logger') @@ -60,6 +60,10 @@ DCF_URL_URL = settings.DCF_URL_URL DCF_TOKEN_REFRESH_WINDOW_SECONDS = settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS + +class TokenFailure(Exception): + """Thrown if we have problems with our access/refresh tokens """ + @login_required def 
oauth2_login(request): """ @@ -100,6 +104,7 @@ def oauth2_login(request): # ruri = wac.prepare_request_uri(DCF_AUTH_URL, redirect_uri=full_callback, state=rando, scope=['openid', 'user']) # return HttpResponseRedirect(ruri) + @login_required def dcf_simple_logout(request): ''' @@ -113,6 +118,7 @@ def dcf_simple_logout(request): callback = '{}?next={}'.format(DCF_LOGOUT_URL, logout_callback) return HttpResponseRedirect(callback) + @login_required def oauth2_callback(request): """ @@ -134,23 +140,30 @@ def oauth2_callback(request): # # DCF now adding a user confirmation page to their flow. If the user says "no", the call back will report - # an error. We need to tell the user there is a problem + # an error. We need to tell the user there is a problem. Also, we now need to equip all callbacks to report + # any random error that is reported back to us. # error = request.GET.get('error', None) if error: error_description = request.GET.get('error_description', None) if error_description == 'The resource owner or authorization server denied the request': - logger.error("[INFO] User did not allow ISB access") + logger.info("[INFO] User did not allow ISB access") + messages.warning(request, + "Login cannot continue if ISB-CGC is not allowed access to the Data Commons Framework.") + else: + logger.error("[ERROR] Unrecognized DCF error: {}".format(error_description)) messages.error(request, - "Login cannot continue if ISB-CGC is not allowed access to the Data Commons Framework") - return redirect(reverse('user_detail', args=[request.user.id])) + 'Data Commons Framework returned an error "{}": {}. ' \ + 'Please contact the ISB-CGC administrator.'.format(error, error_description)) + return redirect(reverse('user_detail', args=[request.user.id])) # # OAuth2Session ENFORCES https unless this environment variable is set. For local dev, we want that off # so we can talk to localhost over http. But let's turn it on/off to minimize, and make it only active in # development: # + logger.info("[INFO] OAuthCB b") if settings.IS_DEV and full_callback.startswith('http://localhost'): os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1' @@ -159,7 +172,7 @@ def oauth2_callback(request): saved_state = request.session['dcfOAuth2State'] else: logger.error("[ERROR] Missing dcfOAuth2State during callback") - messages.error(request, "There was an internal error logging in. Please contact the ISB-CGC administrator.") + messages.error(request, "There was an internal error 001 logging in. Please contact the ISB-CGC administrator.") return redirect(reverse('user_detail', args=[request.user.id])) client_id, client_secret = _get_secrets() @@ -183,12 +196,15 @@ def oauth2_callback(request): logger.error("[ERROR] dcf.fetch_token") logger.error('DCF_TOKEN_URL: {} / authresp: {} / full_callback: {}'.format(DCF_TOKEN_URL, auth_response, full_callback)) logger.exception(e) + messages.error(request, "There was an error contacting the DCF. Please try again, or contact the ISB-CGC administrator.") + return redirect(reverse('user_detail', args=[request.user.id])) + finally: + client_secret = None # clear this in case we are in Debug mode to keep this out of the browser - client_secret = None # clear this in case we are in Debug mode to keep this out of the browser logger.info("[INFO] OAuthCB d") if token_data['token_type'] != 'Bearer': logger.error("[ERROR] Token type returned was not 'Bearer'") - messages.error(request, "There was an internal error logging in. 
Please contact the ISB-CGC administrator.") + messages.error(request, "There was an internal error 002 logging in. Please contact the ISB-CGC administrator.") return redirect(reverse('user_detail', args=[request.user.id])) # @@ -221,7 +237,14 @@ def oauth2_callback(request): # Get the key list from the endpoint and choose which one was used in the JWT: # logger.info("[INFO] OAuthCB f") - resp = dcf.get(settings.DCF_KEY_URL) + try: + resp = dcf.get(settings.DCF_KEY_URL) + except Exception as e: + logger.error("[ERROR] Could not retrieve key from DCF") + logger.exception(e) + messages.error(request, "There was an internal error 003 logging in. Please contact the ISB-CGC administrator.") + return redirect(reverse('user_detail', args=[request.user.id])) + key_data = json_loads(resp.text) key_list = key_data['keys'] use_key = None @@ -231,7 +254,7 @@ def oauth2_callback(request): if use_key is None: logger.error("[ERROR] No key found from DCF to validate JWT") - messages.error(request, "There was an internal error logging in. Please contact the ISB-CGC administrator.") + messages.error(request, "There was an internal error 004 logging in. Please contact the ISB-CGC administrator.") return redirect(reverse('user_detail', args=[request.user.id])) # @@ -241,11 +264,11 @@ def oauth2_callback(request): try: alg_list = ['RS256'] decoded_jwt_id = my_jwt.decode(token_data['id_token'], key=use_key, algorithms=alg_list, - audience=['openid', 'user', 'data', client_id]) + audience=['openid', 'user', 'data', client_id]) except Exception as e: logger.error("[ERROR] Decoding JWT failure") logger.exception(e) - messages.error(request, "There was an internal error logging in. Please contact the ISB-CGC administrator.") + messages.error(request, "There was an internal error 005 logging in. Please contact the ISB-CGC administrator.") return redirect(reverse('user_detail', args=[request.user.id])) # @@ -268,6 +291,7 @@ def oauth2_callback(request): # u'exp': 1525733739, # u'pur': u'id', (The "purpose" of the token. This is an ID. Refresh tokens say "refresh") # u'sub': u'integer user key'} + logger.info("[INFO] OAuthCB h") dcf_user_id = decoded_jwt_id['sub'] @@ -326,18 +350,19 @@ def oauth2_callback(request): # # DCF says the user has linked their Google ID. If it matches our version of the Google ID, great! We are - # done. BUT if the ID has a mismatch, we are going to drop it. It is possible that the first time the user logged in they provided the wrong email address to DCF and + # done. BUT if the ID has a mismatch, we are going to drop it. It is possible that the first time the user + # logged in they provided the wrong email address to DCF and # then ignored us when we asked them to correct the problem. If DCF's provided Google ID does not match # ours, then they need to still provide us with the correct version before we let them use it! # Also, if a user is trying to reuse the same NIH login, we expect to get back a Google ID from DCF that - # does not match the current user email. + # does not match the current user email (but that is caught above, isn't it??) 
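
For reference, the key-selection step above (fetch DCF_KEY_URL, walk key_list, bail out with internal error 004 if nothing matches) comes down to picking the published key whose ID matches the 'kid' named in the id_token header. A minimal illustration, assuming PyJWT with the cryptography backend and assuming the endpoint returns a standard JWKS-style list of JWK dicts; the actual DCF key format and the patch's real loop may differ:

import json
import jwt as my_jwt
from jwt.algorithms import RSAAlgorithm

def _select_signing_key(id_token, key_list):
    # The unverified JWT header names the key DCF used to sign the token:
    kid = my_jwt.get_unverified_header(id_token).get('kid')
    for jwk in key_list:
        if jwk.get('kid') == kid:
            # Convert the matching JWK into an RSA public key usable by my_jwt.decode():
            return RSAAlgorithm.from_jwk(json.dumps(jwk))
    # None tells the caller no usable key was found:
    return None
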
# req_user = User.objects.get(id=request.user.id) logger.info("[INFO] OAuthCB l") if google_link != req_user.email: _unlink_internals(request.user.id, False) - message = "Please use your ISB-CGC login email ({}) to link with the DCF instead of ({})".format( + message = "Please use your ISB-CGC login email ({}) to link with the DCF instead of {}".format( req_user.email, google_link) messages.warning(request, message) return redirect(reverse('user_detail', args=[request.user.id])) @@ -346,7 +371,12 @@ def oauth2_callback(request): # The link matches. So we use PATCH, and if it goes smoothly, we write the new link to the database: # logger.info("[INFO] OAuthCB m") - resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='patch') + try: + resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='patch') + except Exception as e: + logger.error("[ERROR] Link patch call failure") + logger.exception(e) + if resp.status_code == 404: messages.warning(request, "No linked Google account found for user") elif resp.status_code == 200: @@ -376,32 +406,23 @@ def oauth2_callback(request): finally: os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '0' -@login_required -def dcf_link_redo(request): - """ - If the user needs to redo their google, link, this is what does it. - """ - - link_callback = request.build_absolute_uri(reverse('dcf_link_callback')) - callback = '{}?redirect={}'.format(DCF_GOOGLE_URL, link_callback) - return HttpResponseRedirect(callback) - @login_required def dcf_link_callback(request): """ When the user comes back from Google/DCF after linking, this routine gets called. It provides us with any error - conditions, plus + conditions """ # log the reports using Cloud logging API st_logger = StackDriverLogger.build_from_django_settings() # - # If there was an error, return that: + # If there was an error, return that: Also, we now need to equip all callbacks to report + # any random error that is reported back to us. # error = request.GET.get('error', None) if error: - error_description = request.GET.get('error_description', None) + error_description = request.GET.get('error_description', "") if error == 'g_acnt_link_error': message = 'Issue with the linkage between user and their Google account' elif error == 'g_acnt_auth_failure': @@ -411,9 +432,10 @@ def dcf_link_callback(request): else: message = 'Unrecognized error' - messages.warning(request, 'Error detected during linking. ' - 'Please report error "{}" with description "{}" and message "{}" ' - 'to the ISB-CGC administrator'.format(error, message, error_description)) + logger.error("[ERROR]: DCF reports an error ({}, {}, {}) trying to link Google ID".format(error, message, error_description)) + + messages.warning(request, 'Unexpected error detected during linking. 
' \ + 'Please report error "{}": {} to the ISB-CGC administrator'.format(error, error_description)) return redirect(reverse('user_detail', args=[request.user.id])) # @@ -458,7 +480,7 @@ def dcf_link_callback(request): req_user = User.objects.get(id=request.user.id) if google_link != req_user.email: _unlink_internals(request.user.id, True) - message = "Please use your ISB-CGC login email ({}) to link with the DCF instead of ({})".format( + message = "Please use your ISB-CGC login email ({}) to link with the DCF instead of {}".format( req_user.email, google_link) messages.warning(request, message) return redirect(reverse('user_detail', args=[request.user.id])) @@ -473,6 +495,47 @@ def dcf_link_callback(request): return redirect(reverse('user_detail', args=[request.user.id])) +@login_required +def dcf_link_extend(request): + """ + Put a user's GoogleID in the ACL groups for 24 (more) hours: + """ + + # log the reports using Cloud logging API + st_logger = StackDriverLogger.build_from_django_settings() + + try: + resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='patch') + except Exception as e: + logger.error("[ERROR] Link patch call failure") + logger.exception(e) + messages.warning(request, "Error contacting DCF during linking. " + "Please contact the ISB-CGC administrator.") + return redirect(reverse('user_detail', args=[request.user.id])) + + if resp.status_code == 404: + messages.warning(request, "No linked Google account found for user") + elif resp.status_code == 200: + pass + else: + messages.warning(request, "Unexpected response ({}) from DCF during linking. " + "Please contact the ISB-CGC administrator.".format(resp.status_code)) + + returned_expiration_str = json_loads(resp.text)['exp'] + use_expiration_time = _calc_expiration_time(returned_expiration_str) + + # User data set permissions might have changed, so we call and find out what they are: + user_data_token_string = _get_user_data_token_string(request.user.id) + user_data_dict = _user_data_token_to_user_dict(user_data_token_string) + + _, warning = handle_user_db_update_for_dcf_linking(request.user.id, user_data_dict, use_expiration_time, st_logger) + + if warning: + messages.warning(request, warning) + + return redirect(reverse('user_detail', args=[request.user.id])) + + def _calc_expiration_time(returned_expiration_str): returned_expiration_time = None @@ -514,7 +577,11 @@ def _finish_the_link(user_id, user_email, expiration_time, st_logger): # Save the new info from the DCF: # - dcf_token = DCFToken.objects.get(user_id=user_id) + dcf_token = get_stored_dcf_token(user_id) + if not dcf_token: + return 'Unexpected internal error: No stored token ' \ + 'Please report this to the ISB-CGC administrator' + if dcf_token.google_id is not None and dcf_token.google_id != user_email: return 'Unexpected internal error detected during linking: email/ID mismatch. ' \ 'Please report this to the ISB-CGC administrator' @@ -543,153 +610,6 @@ def _finish_the_link(user_id, user_email, expiration_time, st_logger): return warning -class GoogleLinkState: - BOTH_NULL = 1 - DCF_NULL_CGC_NON_NULL = 2 - DCF_NON_NULL_CGC_NULL = 3 - MATCHING_BAD = 4 - MATCHING_OK = 5 - NON_MATCHING_DCF_BAD = 6 - NON_MATCHING_CGC_BAD = 7 - NON_MATCHING_ALL_BAD = 8 - -def _compare_google_ids(dcf_version, cgc_version, user_email): - """ - When we get new tokens from DCF, we want to sanity check if the Google IDs are in agreement. 
- """ - - if dcf_version != cgc_version: - # Most likely possibility is that either DCF or us thinks the google ID is None and the other doesn't. Another - # possibility is that DCF has another Google ID for the user that is not consistent with the - # one we *want* them to be using. That case *should* have been caught when they first tried to link. - # - # If link IDs do not match, we need match ours to DCF, and flag the problem - if dcf_version is None: - google_match_state = GoogleLinkState.DCF_NULL_CGC_NON_NULL - elif cgc_version is None: - google_match_state = GoogleLinkState.DCF_NON_NULL_CGC_NULL - elif dcf_version == user_email: - google_match_state = GoogleLinkState.NON_MATCHING_CGC_BAD - elif cgc_version == user_email: - google_match_state = GoogleLinkState.NON_MATCHING_DCF_BAD - else: - google_match_state = GoogleLinkState.NON_MATCHING_ALL_BAD - # Next three cases handle matching GoogleIDs: - elif dcf_version is None: - google_match_state = GoogleLinkState.BOTH_NULL - elif dcf_version == user_email: - google_match_state = GoogleLinkState.MATCHING_OK - elif dcf_version != user_email: - google_match_state = GoogleLinkState.MATCHING_BAD - - return google_match_state - - -def _refresh_from_dcf(user_id): - """ - We would like to check if our view of the user (linkage, expirations, datasets) is consistent with what the - DCF thinks, and update accordingly! - """ - - user_email = User.objects.get(id=user_id).email - - # - # Haul the user data token string down from DCF: - # - - the_user_token = _get_user_data_token_string(user_id) # the_user_token is a string - - # - # Things that could be different: Google ID linkage, expiration time, approved datasets. - # Right now, we are not provided with expiration time, so we cannot check that. While NIH linkage - # could change in theory, that is fixed via DCF for the life of a refresh token. User could only change - # that by logging out/disconnecting from DCF and going back in again, which would give us a new refresh - # token. - # - - the_user_dict = _user_data_token_to_user_dict(the_user_token) - - dcf_google_link = _get_google_link_from_user_dict(the_user_dict) - nih_id = _get_nih_id_from_user_dict(the_user_dict) - dict_o_projects = _get_projects_from_user_dict(the_user_dict) - - # - # Compare to our versions: - # - - dcf_token = DCFToken.objects.get(user_id=user_id) - - google_match_state = _compare_google_ids(dcf_google_link, dcf_token.google_id, user_email) - google_problem = None - - if google_match_state != GoogleLinkState.MATCHING_OK and google_match_state != GoogleLinkState.BOTH_NULL: - dcf_token.google_id = dcf_google_link - google_problem = google_match_state - - # - # This is exercised when the NIH ID of the user, returned in the ID token is different than the one we - # have in our token database. Don't think this is even possible, since user would need to log in as the - # new NIH ID first... - # - if nih_id.lower() != dcf_token.nih_username_lower: - logger.error("ERROR: UNEXPECTED NIH_USER_ID MISMATCH {} VERSUS {}".format(nih_id.lower(), - dcf_token.nih_username_lower)) - - # - # If everything was consistent, if DCF tells the user is linked to an NIH ID, we would have that ID as one and - # only one linked record in our DB. 
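
The body of _calc_expiration_time() is elided from this hunk. Since the PATCH call only returns something like {"exp": 1528509163}, the helper presumably just turns that epoch-seconds value into a timezone-aware UTC datetime; a minimal sketch under that assumption (the real helper may add sanity checks or fallbacks not shown here):

import datetime
import pytz

def _calc_expiration_time(returned_expiration_str):
    # DCF returns an absolute expiration stamp (seconds since the epoch), not a duration:
    try:
        return pytz.utc.localize(datetime.datetime.utcfromtimestamp(int(returned_expiration_str)))
    except (TypeError, ValueError):
        # Missing or malformed value; let the caller decide how to recover:
        return None
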
- # - - if google_match_state == GoogleLinkState.MATCHING_OK: - # Note the use of __iexact does case insensitive match: - linked_nih_user_for_user_and_id = NIH_User.objects.filter(user_id=user_id, NIH_username__iexact=nih_id, linked=True) - if len(linked_nih_user_for_user_and_id) == 1: - print "All is good" - else: - # - # Problems! If we have - nih_users_for_user = NIH_User.objects.filter(user_id=user_id) - nih_users_for_id = NIH_User.objects.filter(NIH_username__iexact=nih_id) - if len(nih_users_for_id) == 1: - pass - - - - - - # If user logged into DCF but did not get the linking done correctly, the token will provide us with the - # NIH ID they are using, but we will NOT have a current linked record in the NIH_User table. - - # wafjwophfwfHIGwfpsiFif - # - # - # if dcf_token.google_id is not None and dcf_token.google_id != user_email: - # return 'Unexpected internal error detected during linking: email/ID mismatch. ' \ - # 'Please report this to the ISB-CGC administrator' - # - # dcf_token.google_id = user_email - # dcf_token.user_token = the_user_token - # dcf_token.save() - # - # nih_user, warning = handle_user_db_update_for_dcf_linking(user_id, the_user_dict, - # nih_assertion_expiration, st_logger) - # - # - # authorized_datasets = [] - # for project, perm_list in dict_o_projects.iteritems(): - # adqs = AuthorizedDataset.objects.filter(whitelist_id=project) - # if len(adqs) == 1: - # authorized_datasets.append(DatasetGoogleGroupPair(project, adqs.first().acl_google_group)) - # - # das = DatasetAccessSupportFactory.from_webapp_django_settings() - # all_datasets = das.get_all_datasets_and_google_groups() - # - # for dataset in all_datasets: - # handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, False, None, None, st_logger) - - #return warning - - def _user_data_token_dict_massaged(the_user_token_dict): """ Takes the user data token dictionary (as returned by DCF) and returns massaged user-only string AND dict @@ -763,7 +683,6 @@ def _user_data_token_to_user_dict(user_data_token_string): """ the_user_token_dict = json_loads(user_data_token_string) - print "UDTS", user_data_token_string the_user_dict = the_user_token_dict['context']['user'] return the_user_dict @@ -804,7 +723,9 @@ def _user_data_from_token(user_id): # Note THIS WILL NOT WORK IF REFRESH TOKEN HAS EXPIRED! # - dcf_token = DCFToken.objects.get(user_id=user_id) + dcf_token = get_stored_dcf_token(user_id) + if not dcf_token: + raise TokenFailure() client_id, client_secret = _get_secrets() @@ -833,7 +754,9 @@ def _refresh_access_token(user_id): # under the covers, but here we want to do it explicitly. """ - dcf_token = DCFToken.objects.get(user_id=user_id) + dcf_token = get_stored_dcf_token(user_id) + if not dcf_token: + raise TokenFailure() client_id, client_secret = _get_secrets() @@ -883,67 +806,7 @@ def _massage_user_data_for_dev(the_user): return the_user -@login_required -def dcf_link_extend(request): - """ - Put a user's GoogleID in the ACL groups for 24 (more) hours: - """ - - # log the reports using Cloud logging API - st_logger = StackDriverLogger.build_from_django_settings() - resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='patch') - if resp.status_code == 404: - messages.warning(request, "No linked Google account found for user") - elif resp.status_code == 200: - pass - else: - messages.warning(request, "Unexpected response ({}) from DCF during linking. 
" - "Please contact the ISB-CGC administrator.".format(resp.status_code)) - - returned_expiration_str = json_loads(resp.text)['exp'] - use_expiration_time = _calc_expiration_time(returned_expiration_str) - - # User data set permissions might have changed, so we call and find out what they are: - user_data_token_string = _get_user_data_token_string(request.user.id) - user_data_dict = _user_data_token_to_user_dict(user_data_token_string) - - _, warning = handle_user_db_update_for_dcf_linking(request.user.id, user_data_dict, use_expiration_time, st_logger) - - if warning: - messages.warning(request, warning) - - return redirect(reverse('user_detail', args=[request.user.id])) - - -@login_required -def dcf_unlink(request): - """ - Unlink a user's GoogleID from their NIH ID. This is NOT the traditional sense of unlink, as the user is - still able to talk to DCF using their NIH ID. For a traditional unlink, we use dcf_disconnect_user: - """ - - "If user has linked to incorrect google account, we do not give them the option to first **unlink** from" \ - " the bad account, but only the option to LINK." - # Please - # unlink - # ID - # wlongabaugh @ gmail.com and use - # your - # ISB - CGC - # login - # email(wlongabaugh @ systemsbiology.org) - # to - # link - # with the DCF - - success, warnings, errors = _unlink_internals(request.user.id, False) - if not success: - for warning in warnings: - messages.warning(request, warning) - for error in errors: - messages.error(request, error) - return redirect(reverse('user_detail', args=[request.user.id])) def _unlink_internals(user_id, just_with_dcf): @@ -958,7 +821,9 @@ def _unlink_internals(user_id, just_with_dcf): # Get our concept of linking state from the token DB: # if not just_with_dcf: - dcf_token = DCFToken.objects.get(user_id=user_id) + dcf_token = get_stored_dcf_token(user_id) + if not dcf_token: + raise TokenFailure() the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) google_link = _get_google_link_from_user_dict(the_user_dict) @@ -971,7 +836,14 @@ def _unlink_internals(user_id, just_with_dcf): # booted out of control groups. # - resp = _dcf_call(DCF_GOOGLE_URL, user_id, mode='delete') + try: + resp = _dcf_call(DCF_GOOGLE_URL, user_id, mode='delete') + except Exception as e: + logger.error("[ERROR] Attempt to contact DCF for Google ID unlink failed (user {})".format(user_id)) + logger.exception(e) + errors.append("Unexpected error in unlinking") + return success, warnings, errors + if resp.status_code == 404: warnings.append("No linked Google account found for user") elif resp.status_code == 400: @@ -999,7 +871,10 @@ def _unlink_internals(user_id, just_with_dcf): # is for the access token, not the google link (that info is stored in the NIH_user): # - dcf_token = DCFToken.objects.get(user_id=user_id) + dcf_token = get_stored_dcf_token(user_id) + if not dcf_token: + raise TokenFailure() + dcf_token.google_id = None dcf_token.save() @@ -1062,7 +937,7 @@ def _refresh_token_storage(token_dict, decoded_jwt, user_token, nih_username_fro datetime.datetime.utcnow() + datetime.timedelta(seconds=token_dict["expires_in"])) logger.info("[INFO] Have to build an expiration time for token: {}".format(expiration_time)) - print 'Token storage. New token expires at {}'.format(str(expiration_time)) + logger.info('[INFO] Refresh token storage. New token expires at {}'.format(str(expiration_time))) # FIXME! Make sure that the NIH name is going to be unique before we shove it into the table. Don't # depend on the DB table constraint. 
@@ -1097,17 +972,18 @@ def _access_token_storage(token_dict, cgc_uid): datetime.datetime.utcnow() + datetime.timedelta(seconds=token_dict["expires_in"])) logger.info("[INFO] Have to build an expiration time for token: {}".format(expiration_time)) - print 'Token storage. New token expires at {}'.format(str(expiration_time)) + logger.info('[INFO] Access token storage. New token expires at {}'.format(str(expiration_time))) # # Right now (5/30/18) we only get full user info back during the token refresh call. So decode # it and stash it as well: # id_token_decoded, _ = _decode_token(token_dict['id_token']) - print 'id_token', id_token_decoded - print 'access_token', token_dict['access_token'] - dcf_token = DCFToken.objects.get(user_id=cgc_uid) + dcf_token = get_stored_dcf_token(cgc_uid) + if not dcf_token: + raise TokenFailure() + dcf_token.access_token = token_dict['access_token'] dcf_token.user_token = id_token_decoded dcf_token.expires_at = expiration_time @@ -1132,7 +1008,6 @@ def _decode_token(token): """ return _decode_token_chunk(token, 1) - @login_required def dcf_disconnect_user(request): """ @@ -1150,17 +1025,13 @@ def dcf_disconnect_user(request): # If user is sitting on this page in one browser, and logs out via another, we would have # no DCF token anymore. Catch that case and silently no-op: # - dcf_tokens = DCFToken.objects.filter(user_id=request.user.id) - num_tokens = len(dcf_tokens) - if num_tokens != 1: - if num_tokens > 1: - messages.warning(request, 'Unexpected Server Error: Multiple tokens found') + + dcf_token = get_stored_dcf_token(request.user.id) + if not dcf_token: return redirect(reverse('user_detail', args=[request.user.id])) - dcf_token = dcf_tokens.first() the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) - print the_user_dict, type(the_user_dict) google_link = _get_google_link_from_user_dict(the_user_dict) logger.info("[INFO] DDU A") @@ -1177,7 +1048,7 @@ def dcf_disconnect_user(request): elif resp.status_code == 200: pass else: - msg_list.append(request, "Unexpected response from DCF {}".format(resp.status_code)) + msg_list.append("Unexpected response from DCF {}".format(resp.status_code)) # # The revoke call is unlike other DCF endpoints in that it is special! @@ -1234,44 +1105,30 @@ def dcf_disconnect_user(request): return HttpResponseRedirect(callback) -def _dcf_user_data_from_token(request): - """ - Seems that we should be able to get full user info from the user endpoint, but it turns out that - the information in the token refresh is more complete. - """ - - id_token_decoded, id_token_dict = _user_data_from_token(request.user.id) - - if id_token_decoded is not None: - messages.warning(request, 'TDCF Responded with {}'.format(id_token_decoded)) - else: - messages.warning(request, 'Token acquisition problem') - - # redirect to user detail page - return redirect(reverse('user_detail', args=[request.user.id])) - - -@login_required -def dcf_get_user_data(request): - """ - Use for QC and development - """ - - udft = _dcf_user_data_from_token(request) - - resp = _dcf_call(DCF_USER_URL, request.user.id) - user_data = json_loads(resp.text) - - remaining_token_time = get_dcf_auth_key_remaining_seconds(request.user.id) - messages.warning(request, 'EPDCF Responded with {}: {} plus {}'.format(user_data, remaining_token_time, udft)) - return redirect(reverse('user_detail', args=[request.user.id])) +# @login_required +# def dcf_get_user_data(request): +# """ +# Use for QC and development if we need to see token info. 
Not used in production +# """ +# +# id_token_decoded, _ = _user_data_from_token(request.user.id) +# +# resp = _dcf_call(DCF_USER_URL, request.user.id) +# user_data = json_loads(resp.text) +# +# remaining_token_time = get_dcf_auth_key_remaining_seconds(request.user.id) +# messages.warning(request, 'EPDCF Responded with {}: {} plus {}'.format(user_data, remaining_token_time, id_token_decoded)) +# return redirect(reverse('user_detail', args=[request.user.id])) def _dcf_call(full_url, user_id, mode='get', post_body=None, force_token=False): """ All the stuff around a DCF call that handles token management and refreshes. """ - dcf_token = DCFToken.objects.get(user_id=user_id) + + dcf_token = get_stored_dcf_token(user_id) + if not dcf_token: + raise TokenFailure() expires_in = (dcf_token.expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() logger.info("[INFO] Token Expiration : {} seconds".format(expires_in)) @@ -1310,6 +1167,8 @@ def token_storage_for_user(my_token_dict): except MissingTokenError as e: print "drop the records from the database {}".format(str(e)) print "NO! gotta remember they linked as NIH ID before!!" + except TokenFailure as e: + print "token problem" except Exception as e: print "drop the records from the database {}".format(str(e)) @@ -1337,8 +1196,6 @@ def _read_dict(my_file_name): def get_nih_user_details_from_token(user_id): user_details = {} - - # # The information we used to pull out of our database is now obtained from a DCF token # @@ -1350,15 +1207,10 @@ def get_nih_user_details_from_token(user_id): # issue by looking at the current DCF token attached to the user to see who they are associated with. # - dcf_tokens = DCFToken.objects.filter(user_id=user_id) - if len(dcf_tokens) == 0: - return user_details - elif len(dcf_tokens) > 1: - logger.error("[ERROR] MULTIPLE DCF RECORDS FOR USER {}. ".format(str(user_id))) + dcf_token = get_stored_dcf_token(user_id) + if not dcf_token: return user_details - dcf_token = dcf_tokens.first() - the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) google_link = _get_google_link_from_user_dict(the_user_dict) @@ -1413,3 +1265,179 @@ def get_nih_user_details_from_token(user_id): user_details['auth_datasets'] = [] if len(user_auth_datasets) <= 0 else AuthorizedDataset.objects.filter(id__in=user_auth_datasets.values_list('authorized_dataset',flat=True)) return user_details + + +class GoogleLinkState: + BOTH_NULL = 1 + DCF_NULL_CGC_NON_NULL = 2 + DCF_NON_NULL_CGC_NULL = 3 + MATCHING_BAD = 4 + MATCHING_OK = 5 + NON_MATCHING_DCF_BAD = 6 + NON_MATCHING_CGC_BAD = 7 + NON_MATCHING_ALL_BAD = 8 + +def _compare_google_ids(dcf_version, cgc_version, user_email): + """ + When we get new tokens from DCF, we want to sanity check if the Google IDs are in agreement. + """ + + if dcf_version != cgc_version: + # Most likely possibility is that either DCF or us thinks the google ID is None and the other doesn't. Another + # possibility is that DCF has another Google ID for the user that is not consistent with the + # one we *want* them to be using. That case *should* have been caught when they first tried to link. 
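
The explicit refresh that _refresh_access_token() and _dcf_call() rely on, rather than letting requests_oauthlib refresh under the covers, is a standard refresh-token grant against the DCF token endpoint. A sketch of that call, assuming the refresh token is stored on the DCFToken row (the field name here is illustrative) and that the client credentials come from _get_secrets(); DCF may instead require HTTP Basic auth on this endpoint:

from requests_oauthlib import OAuth2Session

def _sketch_refresh_access_token(token_url, dcf_token, client_id, client_secret):
    dcf = OAuth2Session(client_id)
    # Posts a grant_type=refresh_token request and returns the new token dictionary
    # (access_token, id_token, expires_in, refresh_token, ...):
    new_token_dict = dcf.refresh_token(token_url,
                                       refresh_token=dcf_token.refresh_token,
                                       client_id=client_id,
                                       client_secret=client_secret)
    # The caller would then persist it, e.g. via _access_token_storage():
    return new_token_dict
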
+ # + # If link IDs do not match, we need match ours to DCF, and flag the problem + if dcf_version is None: + google_match_state = GoogleLinkState.DCF_NULL_CGC_NON_NULL + elif cgc_version is None: + google_match_state = GoogleLinkState.DCF_NON_NULL_CGC_NULL + elif dcf_version == user_email: + google_match_state = GoogleLinkState.NON_MATCHING_CGC_BAD + elif cgc_version == user_email: + google_match_state = GoogleLinkState.NON_MATCHING_DCF_BAD + else: + google_match_state = GoogleLinkState.NON_MATCHING_ALL_BAD + # Next three cases handle matching GoogleIDs: + elif dcf_version is None: + google_match_state = GoogleLinkState.BOTH_NULL + elif dcf_version == user_email: + google_match_state = GoogleLinkState.MATCHING_OK + elif dcf_version != user_email: + google_match_state = GoogleLinkState.MATCHING_BAD + + return google_match_state + + +def _refresh_from_dcf(user_id): + """ + We would like to check if our view of the user (linkage, expirations, datasets) is consistent with what the + DCF thinks, and update accordingly! + """ + + user_email = User.objects.get(id=user_id).email + + # + # Haul the user data token string down from DCF: + # + + the_user_token = _get_user_data_token_string(user_id) # the_user_token is a string + + # + # Things that could be different: Google ID linkage, expiration time, approved datasets. + # Right now, we are not provided with expiration time, so we cannot check that. While NIH linkage + # could change in theory, that is fixed via DCF for the life of a refresh token. User could only change + # that by logging out/disconnecting from DCF and going back in again, which would give us a new refresh + # token. + # + + the_user_dict = _user_data_token_to_user_dict(the_user_token) + + dcf_google_link = _get_google_link_from_user_dict(the_user_dict) + nih_id = _get_nih_id_from_user_dict(the_user_dict) + dict_o_projects = _get_projects_from_user_dict(the_user_dict) + + # + # Compare to our versions: + # + + dcf_token = get_stored_dcf_token(user_id) + if not dcf_token: + print "we have no token" + return + + google_match_state = _compare_google_ids(dcf_google_link, dcf_token.google_id, user_email) + google_problem = None + + if google_match_state != GoogleLinkState.MATCHING_OK and google_match_state != GoogleLinkState.BOTH_NULL: + dcf_token.google_id = dcf_google_link + google_problem = google_match_state + + # + # This is exercised when the NIH ID of the user, returned in the ID token is different than the one we + # have in our token database. Don't think this is even possible, since user would need to log in as the + # new NIH ID first... + # + if nih_id.lower() != dcf_token.nih_username_lower: + logger.error("ERROR: UNEXPECTED NIH_USER_ID MISMATCH {} VERSUS {}".format(nih_id.lower(), + dcf_token.nih_username_lower)) + + # + # If everything was consistent, if DCF tells the user is linked to an NIH ID, we would have that ID as one and + # only one linked record in our DB. + # + + if google_match_state == GoogleLinkState.MATCHING_OK: + # Note the use of __iexact does case insensitive match: + linked_nih_user_for_user_and_id = NIH_User.objects.filter(user_id=user_id, NIH_username__iexact=nih_id, linked=True) + if len(linked_nih_user_for_user_and_id) == 1: + print "All is good" + else: + # + # Problems! 
If we have + nih_users_for_user = NIH_User.objects.filter(user_id=user_id) + nih_users_for_id = NIH_User.objects.filter(NIH_username__iexact=nih_id) + if len(nih_users_for_id) == 1: + pass + + + + + + # If user logged into DCF but did not get the linking done correctly, the token will provide us with the + # NIH ID they are using, but we will NOT have a current linked record in the NIH_User table. + + # + # + # if dcf_token.google_id is not None and dcf_token.google_id != user_email: + # return 'Unexpected internal error detected during linking: email/ID mismatch. ' \ + # 'Please report this to the ISB-CGC administrator' + # + # dcf_token.google_id = user_email + # dcf_token.user_token = the_user_token + # dcf_token.save() + # + # nih_user, warning = handle_user_db_update_for_dcf_linking(user_id, the_user_dict, + # nih_assertion_expiration, st_logger) + # + # + # authorized_datasets = [] + # for project, perm_list in dict_o_projects.iteritems(): + # adqs = AuthorizedDataset.objects.filter(whitelist_id=project) + # if len(adqs) == 1: + # authorized_datasets.append(DatasetGoogleGroupPair(project, adqs.first().acl_google_group)) + # + # das = DatasetAccessSupportFactory.from_webapp_django_settings() + # all_datasets = das.get_all_datasets_and_google_groups() + # + # for dataset in all_datasets: + # handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, False, None, None, st_logger) + + #return warning + + +# @login_required +# def dcf_link_redo(request): +# """ +# Simple link redo, but requires that user have the necessary unexpired DCF cookies in their browser. Not +# for production use +# """ +# +# link_callback = request.build_absolute_uri(reverse('dcf_link_callback')) +# callback = '{}?redirect={}'.format(DCF_GOOGLE_URL, link_callback) +# return HttpResponseRedirect(callback) + +# @login_required +# def dcf_unlink(request): +# """ +# Just unlink a user's GoogleID from their NIH ID. This is NOT the traditional sense of unlink, as the user is +# still able to talk to DCF using their NIH ID. For a traditional unlink, we use dcf_disconnect_user: +# """ +# +# success, warnings, errors = _unlink_internals(request.user.id, False) +# if not success: +# for warning in warnings: +# messages.warning(request, warning) +# for error in errors: +# messages.error(request, error) +# return redirect(reverse('user_detail', args=[request.user.id])) \ No newline at end of file diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index f3d092e2..eda90741 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -1011,7 +1011,9 @@ def get_dcf_auth_key_remaining_seconds(user_id): a new refresh token, which will expire every 30 days. """ - dcf_token = DCFToken.objects.get(user_id=user_id) + dcf_token = get_stored_dcf_token(user_id) + if not dcf_token: + return -1 # ? No token? They expire immediately! remaining_seconds = (dcf_token.refresh_expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() logger.info('[INFO] user {} has {} seconds remaining on refresh token'. @@ -1276,39 +1278,6 @@ def handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, # "[STATUS] User {} added to {}.".format(user_email, # dataset.google_group_name)) - # Add task in queue to deactivate NIH_User entry after NIH_assertion_expiration has passed. 
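
To make the GoogleLinkState classification implemented by _compare_google_ids() above concrete, here is how a few cases resolve for a user whose ISB-CGC login email is (hypothetically) me@example.com, with other@gmail.com standing in for any unexpected Google ID:

# Both sides hold the expected link; the healthy case:
_compare_google_ids('me@example.com', 'me@example.com', 'me@example.com')    # MATCHING_OK
# Neither side thinks a Google ID is linked yet:
_compare_google_ids(None, None, 'me@example.com')                            # BOTH_NULL
# Both sides agree, but on the wrong Google ID:
_compare_google_ids('other@gmail.com', 'other@gmail.com', 'me@example.com')  # MATCHING_BAD
# DCF has dropped the link while we still hold one:
_compare_google_ids(None, 'me@example.com', 'me@example.com')                # DCF_NULL_CGC_NON_NULL
# The sides disagree and only DCF's version is the expected email:
_compare_google_ids('me@example.com', 'other@gmail.com', 'me@example.com')   # NON_MATCHING_CGC_BAD
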
- # try: - # full_topic_name = get_full_topic_name(PUBSUB_TOPIC_ERA_LOGIN) - # logger.info("Full topic name: {}".format(full_topic_name)) - # client = get_pubsub_service() - # params = { - # 'event_type': 'era_login', - # 'user_id': user_id, - # 'deployment': CRON_MODULE - # } - # message = json_dumps(params) - # - # body = { - # 'messages': [ - # { - # 'data': base64.b64encode(message.encode('utf-8')) - # } - # ] - # } - # client.projects().topics().publish(topic=full_topic_name, body=body).execute() - # st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, - # "[STATUS] Notification sent to PubSub topic: {}".format(full_topic_name)) - # - # except Exception as e: - # logger.error("[ERROR] Failed to publish to PubSub topic") - # logger.exception(e) - # st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, - # "[ERROR] Failed to publish to PubSub topic: {}".format(str(e))) - # - # retval.messages.append(warn_message) - # return retval - - def deactivate_nih_add_to_open(user_id, user_email): # 5/14/18 NO! active flag has nothing to do with user logout, but instead is set to zero when user expires off of ACL group # after 24 hours: @@ -1329,17 +1298,34 @@ def deactivate_nih_add_to_open(user_id, user_email): # else: # logger.info("[STATUS] No linked NIH user was found for user {} - no one set to inactive.".format(user_email)) - directory_service, http_auth = get_directory_resource() + if not settings.DCF_TEST: + directory_service, http_auth = get_directory_resource() + # add user to OPEN_ACL_GOOGLE_GROUP if they are not yet on it + try: + body = {"email": user_email, "role": "MEMBER"} + directory_service.members().insert(groupKey=OPEN_ACL_GOOGLE_GROUP, body=body).execute(http=http_auth) + logger.info("[STATUS] Attempting to insert user {} into group {}. " + .format(str(user_email), OPEN_ACL_GOOGLE_GROUP)) + except HttpError as e: + logger.info(e) - # add user to OPEN_ACL_GOOGLE_GROUP if they are not yet on it - try: - body = {"email": user_email, "role": "MEMBER"} - directory_service.members().insert(groupKey=OPEN_ACL_GOOGLE_GROUP, body=body).execute(http=http_auth) - logger.info("[STATUS] Attempting to insert user {} into group {}. " - .format(str(user_email), OPEN_ACL_GOOGLE_GROUP)) - except HttpError as e: - logger.info(e) +def get_stored_dcf_token(user_id): + """ + When a user breaks their connection with DCF, we flush out the revoked tokens. But if they have a + session running in another browser, they might still be clicking on links that expect a token. So + we need to be bulletproof on maybe not getting back a token. May return None + """ + dcf_tokens = DCFToken.objects.filter(user_id) + num_tokens = len(dcf_tokens) + if num_tokens != 1: + if num_tokens > 1: + logger.error('[ERROR] Unexpected Server Error: Multiple tokens found for user {}'.format(user_id)) + else: + logger.info('[INFO] User {} tried to use a flushed token'.format(user_id)) + return None + dcf_token = dcf_tokens.first() + return dcf_token def get_nih_user_details(user_id, force_logout): @@ -1359,18 +1345,12 @@ def get_nih_user_details(user_id, force_logout): # issue by looking at the current DCF token attached to the user to see who they are associated with. # - dcf_tokens = DCFToken.objects.filter(user_id=user_id) - if len(dcf_tokens) == 0: + dcf_token = get_stored_dcf_token(user_id) + if not dcf_token: return user_details # i.e. empty dict - elif len(dcf_tokens) > 1: - logger.error("[ERROR] MULTIPLE DCF RECORDS FOR USER {}. ".format(str(user_id))) - return user_details # i.e. 
empty dict - - dcf_token = dcf_tokens.first() curr_user = User.objects.get(id=user_id) - nih_users = NIH_User.objects.filter(user_id=user_id, NIH_username__iexact=dcf_token.nih_username) if len(nih_users) == 0: @@ -1431,17 +1411,19 @@ def get_nih_user_details(user_id, force_logout): # testing_expire_hack = datetime.timedelta(minutes=-((60 * 23) + 55)) # expired_time = expired_time + testing_expire_hack now_time = pytz.utc.localize(datetime.datetime.utcnow()) - print "times", expired_time, now_time if now_time >= expired_time: + logger.info("[INFO] Expired user hit user info page and was deactivated {}.".format(expired_time, now_time)) nih_user.active = False nih_user.NIH_assertion_expiration = now_time nih_user.save() else: try: nih_user = NIH_User.objects.get(user_id=user_id, linked=True) - except (MultipleObjectsReturned, ObjectDoesNotExist), e: - if type(e) is MultipleObjectsReturned: - logger.warn("Error when retrieving noh_user with user_id {}. {}".format(str(user_id), str(e))) + except MultipleObjectsReturned as e: + logger.warn("Multiple objects when retrieving nih_user with user_id {}. {}".format(str(user_id), str(e))) + return user_details + except ObjectDoesNotExist as e: + logger.warn("No objects when retrieving nih_user with user_id {}. {}".format(str(user_id), str(e))) return user_details user_auth_datasets = UserAuthorizedDatasets.objects.filter(nih_user=nih_user) @@ -1456,7 +1438,6 @@ def get_nih_user_details(user_id, force_logout): if settings.DCF_TEST: user_details['link_mismatch'] = (dcf_token.google_id is None) or (dcf_token.google_id != curr_user.email) - user_details['NIH_DCF_linked'] = nih_user.linked user_details['refresh_key_ok'] = get_dcf_auth_key_remaining_seconds(user_id) > settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS user_details['force_DCF_logout'] = False diff --git a/accounts/urls.py b/accounts/urls.py index b7ba3fdb..c268e5f0 100755 --- a/accounts/urls.py +++ b/accounts/urls.py @@ -28,17 +28,18 @@ url(r'^logout', views.extended_logout_view, name='account_logout'), url(r'^login/$', google_views.oauth2_login, name='account_login'), # url(r'^nih_login/$', views.nih_login, name='nih_login'), + url(r'^unlink_accounts/', views.unlink_accounts, name='unlink_accounts'), + # Following urls for new DCF flows + url(r'^dcf_login/$', dcf_views.oauth2_login, name='dcf_login'), + url(r'^dcf_simple_logout/$', dcf_views.dcf_simple_logout, name='dcf_simple_logout'), url(r'^dcf/login/callback/$', dcf_views.oauth2_callback, name='dcf_callback'), url(r'^dcf_link_callback/$', dcf_views.dcf_link_callback, name='dcf_link_callback'), url(r'^dcf_link_extend/$', dcf_views.dcf_link_extend, name='dcf_link_extend'), - url(r'^dcf_link_redo/$', dcf_views.dcf_link_redo, name='dcf_link_redo'), url(r'^dcf_disconnect_user/$', dcf_views.dcf_disconnect_user, name='dcf_disconnect_user'), - url(r'^dcf_user_data/$', dcf_views.dcf_get_user_data, name='dcf_get_user_data'), - url(r'^dcf_unlink/$', dcf_views.dcf_unlink, name='dcf_unlink'), - url(r'^dcf_login/$', dcf_views.oauth2_login, name='dcf_login'), - url(r'^dcf_simple_logout/$', dcf_views.dcf_simple_logout, name='dcf_simple_logout'), - url(r'^unlink_accounts/', views.unlink_accounts, name='unlink_accounts'), - + # Following urls for QC and development use. 
Not used in production + # url(r'^dcf_user_data/$', dcf_views.dcf_get_user_data, name='dcf_get_user_data'), + # url(r'^dcf_unlink/$', dcf_views.dcf_unlink, name='dcf_unlink'), + # url(r'^dcf_link_redo/$', dcf_views.dcf_link_redo, name='dcf_link_redo'), # Google Cloud Project related url(r'^users/(?P\d+)/gcp_list/$', views.user_gcp_list, name='user_gcp_list'), From 9a330074d47d4f43a75c3f8d1a8286d521b21580 Mon Sep 17 00:00:00 2001 From: elainelee Date: Thu, 28 Jun 2018 13:00:38 -0700 Subject: [PATCH 24/76] Add ability to filter files by Case Barcode ticket#2383 --- cohorts/views.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/cohorts/views.py b/cohorts/views.py index 26df788a..091af87a 100755 --- a/cohorts/views.py +++ b/cohorts/views.py @@ -2055,7 +2055,8 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co filter_counts = None file_list = [] total_file_count = 0 - + case_barcode = request.GET.get('case_barcode', '') + case_barcode_condition = '' if not case_barcode else "AND cs.case_barcode ='" + case_barcode + "'" try: # Attempt to get the cohort perms - this will cause an excpetion if we don't have them Cohort_Perms.objects.get(cohort_id=cohort_id, user_id=user_id) @@ -2079,12 +2080,12 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co ON cs.case_barcode = ds.PatientID JOIN [{data_project}:{tcga_bioclin_dataset}.{tcga_clin_table}] bc ON bc.case_barcode=cs.case_barcode - WHERE cs.cohort_id = {cohort} + WHERE cs.cohort_id = {cohort} {case_barcode_condition} GROUP BY cs.case_barcode, ds.StudyInstanceUID, ds.StudyDescription, bc.disease_code, bc.project_short_name """.format(cohort_dataset=bq_cohort_dataset, cohort_project=bq_cohort_project_id, cohort_table=bq_cohort_table, data_project=data_project, dcf_data_table="TCGA_radiology_images", tcga_img_dataset="metadata", - tcga_bioclin_dataset="TCGA_bioclin_v0", tcga_clin_table="Clinical", cohort=cohort_id) + tcga_bioclin_dataset="TCGA_bioclin_v0", tcga_clin_table="Clinical", cohort=cohort_id, case_barcode_condition=case_barcode_condition) file_list_query = """ {select_clause} @@ -2165,7 +2166,7 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co WHERE cohort_id = {cohort_id} ) cs ON cs.case_barcode = md.case_barcode - WHERE md.file_uploaded='true' {type_conditions} {filter_conditions} + WHERE md.file_uploaded='true' {type_conditions} {filter_conditions} {case_barcode_condition} """ file_list_query = """ @@ -2197,7 +2198,6 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co cursor = db.cursor(MySQLdb.cursors.DictCursor) cohort_programs = Cohort.objects.get(id=cohort_id).get_programs() - params = () select_clause = '' count_select_clause = '' @@ -2220,17 +2220,20 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co cohort_id=cohort_id, metadata_table=program_data_table, type_conditions=type_conditions, - filter_conditions=filter_conditions) + filter_conditions=filter_conditions, + case_barcode_condition=case_barcode_condition) if do_filter_count: count_select_clause += union_template.format( cohort_id=cohort_id, metadata_table=program_data_table, type_conditions=type_conditions, - filter_conditions='') + filter_conditions='', + case_barcode_condition=case_barcode_condition) first_program = False # if first_program is still true, we found no programs with data tables for this build if not first_program: + if limit > 0: limit_clause = ' LIMIT %s' % 
str(limit) # Offset is only valid when there is a limit From d8add7ba3df2a4892691d9869c29d4cc311268eb Mon Sep 17 00:00:00 2001 From: s-paquette Date: Thu, 28 Jun 2018 18:34:38 -0700 Subject: [PATCH 25/76] -> Add 'get_program_names' to Cohort model, for easy array access to the unique names of the programs in a cohort -> Per #2312 added a list of a cohort's programs to the context of the filelist view --- cohorts/models.py | 5 +++++ cohorts/views.py | 5 ++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cohorts/models.py b/cohorts/models.py index da001330..d0e3d192 100755 --- a/cohorts/models.py +++ b/cohorts/models.py @@ -68,6 +68,11 @@ def get_programs(self): projects = self.samples_set.values_list('project_id', flat=True).distinct() return Program.objects.filter(active=True, id__in=Project.objects.filter(id__in=projects).values_list('program_id', flat=True)).distinct() + def get_program_names(self): + projects = self.samples_set.values_list('project_id', flat=True).distinct() + names = Program.objects.filter(active=True, id__in=Project.objects.filter(id__in=projects).values_list('program_id', flat=True)).distinct().values_list('name',flat=True) + return [str(x) for x in names] + def only_user_data(self): return bool(Program.objects.filter(id__in=self.get_programs(), is_public=True).count() <= 0) diff --git a/cohorts/views.py b/cohorts/views.py index 7708d890..25686dd7 100755 --- a/cohorts/views.py +++ b/cohorts/views.py @@ -1665,7 +1665,8 @@ def cohort_filelist(request, cohort_id=0, panel_type=None): 'sel_file_max': MAX_SEL_FILES, 'img_thumbs_url': settings.IMG_THUMBS_URL, 'has_user_data': bool(cohort_sample_list.count() > 0), - 'build': build}) + 'build': build, + 'programs_this_cohort': cohort.get_program_names()}) logger.debug("[STATUS] Returning response from cohort_filelist, with exception") @@ -1742,8 +1743,6 @@ def cohort_filelist_ajax(request, cohort_id=0, panel_type=None): del result['metadata_data_counts'] result['metadata_data_attr'] = [metadata_data_attr[x] for x in metadata_data_attr] - logger.debug("[STATUS] Returning response from cohort_filelist_ajax") - except Exception as e: logger.error("[ERROR] While retrieving cohort file data for AJAX call:") logger.exception(e) From 2302f1f79c75ec5d1128f09b7493ccbdf601a1d7 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Thu, 28 Jun 2018 19:20:09 -0700 Subject: [PATCH 26/76] Improving error handling --- accounts/dcf_views.py | 624 ++++++++++++++++++++++++++++++------------ accounts/sa_utils.py | 82 ++++-- 2 files changed, 504 insertions(+), 202 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index e06d3aa6..8570aed1 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -33,8 +33,8 @@ from sa_utils import found_linking_problems, DemoLoginResults, handle_user_for_dataset,\ handle_user_db_update_for_dcf_linking, \ - unlink_account_in_db_for_dcf, get_dcf_auth_key_remaining_seconds, \ - get_stored_dcf_token + unlink_account_in_db_for_dcf, get_dcf_refresh_key_remaining_seconds, \ + get_stored_dcf_token, TokenFailure, RefreshTokenExpired from models import DCFToken, AuthorizedDataset, NIH_User, UserAuthorizedDatasets from requests_oauthlib.oauth2_session import OAuth2Session @@ -61,8 +61,8 @@ DCF_TOKEN_REFRESH_WINDOW_SECONDS = settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS -class TokenFailure(Exception): - """Thrown if we have problems with our access/refresh tokens """ +class DCFCommFailure(Exception): + """Thrown if we have problems communicating with DCF """ @login_required def 
oauth2_login(request): @@ -123,9 +123,13 @@ def dcf_simple_logout(request): def oauth2_callback(request): """ Second step of OAuth2 login to DCF. Takes the response redirect URL that DCF returned to the user's browser, - parse out the auth code and use it to get a token + parse out the auth code and use it to get a token. """ + comm_err_msg = "There was a communications problem contacting Data Commons Framework." + internal_err_msg = "There was an internal error {} logging in. Please contact the ISB-CGC administrator." + dcf_err_msg = "DCF reported an error {} logging in. Please contact the ISB-CGC administrator." + try: logger.info("[INFO] OAuthCB a") full_callback = request.build_absolute_uri(reverse('dcf_callback')) @@ -148,14 +152,12 @@ def oauth2_callback(request): if error: error_description = request.GET.get('error_description', None) if error_description == 'The resource owner or authorization server denied the request': - logger.info("[INFO] User did not allow ISB access") + logger.info("[INFO] User {} did not allow ISB access".format(request.user.id)) messages.warning(request, "Login cannot continue if ISB-CGC is not allowed access to the Data Commons Framework.") else: - logger.error("[ERROR] Unrecognized DCF error: {}".format(error_description)) - messages.error(request, - 'Data Commons Framework returned an error "{}": {}. ' \ - 'Please contact the ISB-CGC administrator.'.format(error, error_description)) + logger.error("[ERROR] Unrecognized DCF error: {} : {}".format(error, error_description)) + messages.error(request, dcf_err_msg.format("D001")) return redirect(reverse('user_detail', args=[request.user.id])) # @@ -172,7 +174,7 @@ def oauth2_callback(request): saved_state = request.session['dcfOAuth2State'] else: logger.error("[ERROR] Missing dcfOAuth2State during callback") - messages.error(request, "There was an internal error 001 logging in. Please contact the ISB-CGC administrator.") + messages.error(request, internal_err_msg.format("001")) return redirect(reverse('user_detail', args=[request.user.id])) client_id, client_secret = _get_secrets() @@ -193,10 +195,9 @@ def oauth2_callback(request): client_id=client_id, authorization_response=auth_response) except Exception as e: - logger.error("[ERROR] dcf.fetch_token") - logger.error('DCF_TOKEN_URL: {} / authresp: {} / full_callback: {}'.format(DCF_TOKEN_URL, auth_response, full_callback)) + logger.error('[ERROR] dcf.fetch_token DCF_TOKEN_URL: {} / authresp: {} / full_callback: {}'.format(DCF_TOKEN_URL, auth_response, full_callback)) logger.exception(e) - messages.error(request, "There was an error contacting the DCF. Please try again, or contact the ISB-CGC administrator.") + messages.error(request, comm_err_msg) return redirect(reverse('user_detail', args=[request.user.id])) finally: client_secret = None # clear this in case we are in Debug mode to keep this out of the browser @@ -204,7 +205,7 @@ def oauth2_callback(request): logger.info("[INFO] OAuthCB d") if token_data['token_type'] != 'Bearer': logger.error("[ERROR] Token type returned was not 'Bearer'") - messages.error(request, "There was an internal error 002 logging in. Please contact the ISB-CGC administrator.") + messages.error(request, internal_err_msg.format("002")) return redirect(reverse('user_detail', args=[request.user.id])) # @@ -242,7 +243,7 @@ def oauth2_callback(request): except Exception as e: logger.error("[ERROR] Could not retrieve key from DCF") logger.exception(e) - messages.error(request, "There was an internal error 003 logging in. 
Please contact the ISB-CGC administrator.") + messages.error(request, comm_err_msg) return redirect(reverse('user_detail', args=[request.user.id])) key_data = json_loads(resp.text) @@ -254,7 +255,7 @@ def oauth2_callback(request): if use_key is None: logger.error("[ERROR] No key found from DCF to validate JWT") - messages.error(request, "There was an internal error 004 logging in. Please contact the ISB-CGC administrator.") + messages.error(request, internal_err_msg.format("003")) return redirect(reverse('user_detail', args=[request.user.id])) # @@ -268,7 +269,7 @@ def oauth2_callback(request): except Exception as e: logger.error("[ERROR] Decoding JWT failure") logger.exception(e) - messages.error(request, "There was an internal error 005 logging in. Please contact the ISB-CGC administrator.") + messages.error(request, internal_err_msg.format("004")) return redirect(reverse('user_detail', args=[request.user.id])) # @@ -332,19 +333,35 @@ def oauth2_callback(request): return redirect(reverse('user_detail', args=[request.user.id])) logger.info("[INFO] OAuthCB j") + + # + # We now are almost ready to stash the token. One field in the table is the Google ID. First time + # through, it will be blank. Otherwise, it either matches our login ID, or might be some rando + # email if the user e.g. bailed before fixing it last time. We will not enter a value for that + # field in the DB unless the ID coming back from DCF matches our login ID. + + save_google_link = None + if google_link: + req_user = User.objects.get(id=request.user.id) + if google_link == req_user.email: + save_google_link = google_link + + # + # AFTER THIS CALL WE HAVE A TOKEN WE CAN USE TO COMMUNICATE WITH DCF # # We now have the minimum we need to store the tokens from DCF, so stick that in the database. We DO NOT yet # make the entry in the NIH_User table, since we need to now either establish or refresh the DCF-Google ID link: # - _refresh_token_storage(token_data, decoded_jwt_id, user_data_token_str, nih_from_dcf, dcf_user_id, request.user.id, google_link) + _refresh_token_storage(token_data, decoded_jwt_id, user_data_token_str, nih_from_dcf, dcf_user_id, request.user.id, save_google_link) # # If user already has a google ID link, we would PATCH the endpoint to update it for 24 more hours. If # not, we do a GET. (I.e. the first time they show up at DCF is the ONLY time we do a get, except for - # those cases where an unlink has been called.) So here is where the control flow diverges. For the - # GET, we wrap things up in the callback. For the PATCH, we wrap things up immediately: + # those cases where they have disconnected or provided the wrong ID.) So here is where the control + # flow diverges. For the GET, we wrap things up in the callback. For the PATCH, we wrap things up immediately: # + logger.info("[INFO] OAuthCB k") if google_link: @@ -354,36 +371,58 @@ def oauth2_callback(request): # logged in they provided the wrong email address to DCF and # then ignored us when we asked them to correct the problem. If DCF's provided Google ID does not match # ours, then they need to still provide us with the correct version before we let them use it! - # Also, if a user is trying to reuse the same NIH login, we expect to get back a Google ID from DCF that - # does not match the current user email (but that is caught above, isn't it??) + # Also, if a user is trying to reuse the same NIH login in use by somewhere else, we expect to get back + # a Google ID from DCF that does not match the current user email, but that is caught above. 
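As an aside on the signature check above: the body of _decode_token is not part of this hunk, but the flow is to fetch DCF's public key set, select the matching key ('use_key'), and validate the id_token against it. A minimal sketch of that validation using PyJWT follows; the helper name, the RS256 assumption, and the audience claim are illustrative rather than taken from this changeset.

import json
import jwt  # PyJWT, with the 'cryptography' package available for RSA keys

def decode_id_token_sketch(id_token, jwk_dict, client_id):
    # Build an RSA public key object from the JWK entry DCF served (assumed RS256).
    public_key = jwt.algorithms.RSAAlgorithm.from_jwk(json.dumps(jwk_dict))
    # Verifies the signature and expiry, and that the token was minted for our client id.
    return jwt.decode(id_token, key=public_key, algorithms=['RS256'], audience=client_id)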
# req_user = User.objects.get(id=request.user.id) logger.info("[INFO] OAuthCB l") if google_link != req_user.email: - _unlink_internals(request.user.id, False) - message = "Please use your ISB-CGC login email ({}) to link with the DCF instead of {}".format( - req_user.email, google_link) - messages.warning(request, message) - return redirect(reverse('user_detail', args=[request.user.id])) + try: + _unlink_at_dcf(request.user.id, True) # True = recently saved token is now updated with unlinked state + message = "You must use your ISB-CGC login email ({}) to link with the DCF instead of {}".format( + req_user.email, google_link) + messages.warning(request, message) + return redirect(reverse('user_detail', args=[request.user.id])) + except TokenFailure as e: + messages.error(request, internal_err_msg.format("005")) + return redirect(reverse('user_detail', args=[request.user.id])) + except RefreshTokenExpired: + messages.error(request, internal_err_msg.format("005a")) + return redirect(reverse('user_detail', args=[request.user.id])) + except DCFCommFailure as e: + messages.error(request, comm_err_msg) + return redirect(reverse('user_detail', args=[request.user.id])) # # The link matches. So we use PATCH, and if it goes smoothly, we write the new link to the database: # + logger.info("[INFO] OAuthCB m") try: resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='patch') - except Exception as e: - logger.error("[ERROR] Link patch call failure") - logger.exception(e) + except TokenFailure: + messages.error(request, internal_err_msg.format("006")) + return redirect(reverse('user_detail', args=[request.user.id])) + except RefreshTokenExpired: + messages.error(request, internal_err_msg.format("007")) + return redirect(reverse('user_detail', args=[request.user.id])) + except DCFCommFailure: + messages.error(request, comm_err_msg) + return redirect(reverse('user_detail', args=[request.user.id])) - if resp.status_code == 404: - messages.warning(request, "No linked Google account found for user") - elif resp.status_code == 200: - pass - else: - messages.warning(request, "Unexpected response ({}, {}) from DCF during linking. " - "Please contact the ISB-CGC administrator.".format(resp.status_code, resp.text)) + if resp.status_code == 404: # Now DCF says user is NOT linked... + messages.error(request, internal_err_msg.format("008")) + return redirect(reverse('user_detail', args=[request.user.id])) + elif resp.status_code != 200: + logger.error("[ERROR] Unexpected response ({}, {}) from DCF during linking.".format(resp.status_code, resp.text)) + messages.warning(request, internal_err_msg.format("009")) + return redirect(reverse('user_detail', args=[request.user.id])) + + # + # Now that we have a successful PATCH, take the reported expiration time and do the internal work + # to finish the link + # logger.info("[INFO] OAuthCB n") returned_expiration_str = json_loads(resp.text)['exp'] @@ -394,6 +433,8 @@ def oauth2_callback(request): messages.warning(request, warning) return redirect(reverse('user_detail', args=[request.user.id])) + # Finished handling pre-existing linking. + # # User has not yet been linked, so start the redirect flow with the user and DCF that will result # in us getting the callback below to finish the process: @@ -410,9 +451,11 @@ def oauth2_callback(request): def dcf_link_callback(request): """ When the user comes back from Google/DCF after linking, this routine gets called. It provides us with any error - conditions + conditions. 
""" + dcf_err_msg = "DCF reported an error {} logging in. Please contact the ISB-CGC administrator." + # log the reports using Cloud logging API st_logger = StackDriverLogger.build_from_django_settings() @@ -434,8 +477,7 @@ def dcf_link_callback(request): logger.error("[ERROR]: DCF reports an error ({}, {}, {}) trying to link Google ID".format(error, message, error_description)) - messages.warning(request, 'Unexpected error detected during linking. ' \ - 'Please report error "{}": {} to the ISB-CGC administrator'.format(error, error_description)) + messages.error(request, messages.error(request, dcf_err_msg.format("D002"))) return redirect(reverse('user_detail', args=[request.user.id])) # @@ -450,21 +492,27 @@ def dcf_link_callback(request): use_expiration_time = _calc_expiration_time(returned_expiration_str) # - # At this point, we need to wrestle with the possible problem that the user has linked - # to a DIFFERENT GoogleID while off messing with DCF. If the ID that comes back is not - # identical to what we think it is. They need to go and do it again. BUT as far as DCF - # is concerned, they are linked, so we need to finish the job here... + # We will NEVER accept a Google ID that does not match At this point, we need to wrestle + # with the possible problem that the user has linked to a DIFFERENT GoogleID while off + # messing with DCF. If the ID that comes back is not identical to what we think it is, + # they need to go and do it again. BUT as far as DCF is concerned, they are linked, + # so we need to keep deleting the linkage at DCF! # - the_user_token_string = _get_user_data_token_string(request.user.id) # a string + try: + the_user_token_string = _get_user_data_token_string(request.user.id) # a string. + except (TokenFailure, DCFCommFailure, RefreshTokenExpired): + return redirect(reverse('user_detail', args=[request.user.id])) + the_user_token_dict = json_loads(the_user_token_string) the_user_dict = the_user_token_dict['context']['user'] + # Just parses the google link out of the recently return token. google_link = _get_google_link_from_user_dict(the_user_dict) if returned_google_link: if google_link != returned_google_link: - logger.error("WARNING: DCF RETURNED CONFLICTING GOOGLE LINK {} VERSUS {}".format(returned_google_link, + logger.error("[ERROR]: DCF RETURNED CONFLICTING GOOGLE LINK {} VERSUS {}".format(returned_google_link, google_link)) else: logger.info("DCF provided google link was consistent") @@ -478,8 +526,9 @@ def dcf_link_callback(request): return redirect(reverse('user_detail', args=[request.user.id])) req_user = User.objects.get(id=request.user.id) + # NOPE! Send user back to details page. The empty google ID in our table will mean the page shows an option to try again. if google_link != req_user.email: - _unlink_internals(request.user.id, True) + _unlink_at_dcf(request.user.id, True) # True = recently saved token is now updated with unlinked state message = "Please use your ISB-CGC login email ({}) to link with the DCF instead of {}".format( req_user.email, google_link) messages.warning(request, message) @@ -489,7 +538,12 @@ def dcf_link_callback(request): # If all is well, this is where we add the user to the NIH_User table and link the user to the various data sets. 
# - warning = _finish_the_link(request.user.id, google_link, use_expiration_time, st_logger) + try: + warning = _finish_the_link(request.user.id, google_link, use_expiration_time, st_logger) + except (TokenFailure, RefreshTokenExpired): + messages.warning(request, "say something witty here...") + return redirect(reverse('user_detail', args=[request.user.id])) + if warning: messages.warning(request, warning) return redirect(reverse('user_detail', args=[request.user.id])) @@ -506,7 +560,11 @@ def dcf_link_extend(request): try: resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='patch') - except Exception as e: + except TokenFailure: + pass + except RefreshTokenExpired: + pass + except DCFCommFailure as e: # Any rando exception from the call is turned into this! logger.error("[ERROR] Link patch call failure") logger.exception(e) messages.warning(request, "Error contacting DCF during linking. " @@ -515,17 +573,21 @@ def dcf_link_extend(request): if resp.status_code == 404: messages.warning(request, "No linked Google account found for user") - elif resp.status_code == 200: - pass - else: + return redirect(reverse('user_detail', args=[request.user.id])) + elif resp.status_code != 200: messages.warning(request, "Unexpected response ({}) from DCF during linking. " "Please contact the ISB-CGC administrator.".format(resp.status_code)) + return redirect(reverse('user_detail', args=[request.user.id])) returned_expiration_str = json_loads(resp.text)['exp'] use_expiration_time = _calc_expiration_time(returned_expiration_str) # User data set permissions might have changed, so we call and find out what they are: - user_data_token_string = _get_user_data_token_string(request.user.id) + try: + user_data_token_string = _get_user_data_token_string(request.user.id) # Can raise TokenFailure or DCFCommFailure + except (TokenFailure, DCFCommFailure, RefreshTokenExpired) as e: + redirect(reverse('user_detail', args=[request.user.id])) + user_data_dict = _user_data_token_to_user_dict(user_data_token_string) _, warning = handle_user_db_update_for_dcf_linking(request.user.id, user_data_dict, use_expiration_time, st_logger) @@ -563,6 +625,9 @@ def _calc_expiration_time(returned_expiration_str): def _finish_the_link(user_id, user_email, expiration_time, st_logger): """ Regardless of how they get here, this step handles the linking of the user by adding the required database records. + + :raises TokenFailure: + :raises RefreshTokenExpired: """ nih_assertion_expiration = expiration_time @@ -571,16 +636,18 @@ def _finish_the_link(user_id, user_email, expiration_time, st_logger): # Until we get back current projects, refresh it: # - the_user_token = _get_user_data_token_string(user_id) # the_user is a string + try: + the_user_token = _get_user_data_token_string(user_id) # the_user is a string. + except (TokenFailure, RefreshTokenExpired) as e: + raise e + + # Can raise TokenFailure or DCFCommFailure or RefreshTokenExpired # # Save the new info from the DCF: # - dcf_token = get_stored_dcf_token(user_id) - if not dcf_token: - return 'Unexpected internal error: No stored token ' \ - 'Please report this to the ISB-CGC administrator' + dcf_token = get_stored_dcf_token(user_id) # Can raise a TokenFailure or RefreshTokenExpired if dcf_token.google_id is not None and dcf_token.google_id != user_email: return 'Unexpected internal error detected during linking: email/ID mismatch. 
' \ @@ -698,34 +765,41 @@ def _user_data_token_dict_to_user_dict(the_user_token_dict): def _get_user_data_token_string(user_id): """ - Get up-to-date user data from DCF, massage as needed + Get up-to-date user data from DCF, massage as needed. + + :raises TokenFailure: + :raises DCFCommFailure: + :raises RefreshTokenExpired: """ # The user endpoint is spotty at the moment (6/5/18) so we drag it out of the token instead - #resp = _dcf_call(DCF_USER_URL, user_id) - #the_user = json_loads(resp.text) - the_user_id_token, _ = _user_data_from_token(user_id) + the_user_id_token, _ = _user_data_from_token(user_id, False) massaged_string, _ = _user_data_token_massaged(the_user_id_token) return massaged_string -def _user_data_from_token(user_id): +def _user_data_from_token(user_id, stash_it): """ Seems that we should be able to get full user info from the user endpoint, but it turns out that the information in the token refresh is more complete. + + PLUS, user can set stash_it to True. DCF suggests we refresh the access token after e.g. unlinking. + + :raises TokenFailure: + :raises DCFCommFailure: + :raises RefreshTokenExpired: """ # - # OAuth2Session handles token refreshes under the covers. Here we want to do it explicitly. We - # do not care about the refresh, but we want the id_token contents. - # Note THIS WILL NOT WORK IF REFRESH TOKEN HAS EXPIRED! + # OAuth2Session handles token refreshes under the covers. Here we want to do it explicitly. # - dcf_token = get_stored_dcf_token(user_id) - if not dcf_token: - raise TokenFailure() + try: + dcf_token = get_stored_dcf_token(user_id) + except (TokenFailure, RefreshTokenExpired) as e: + raise e client_id, client_secret = _get_secrets() @@ -734,47 +808,32 @@ def _user_data_from_token(user_id): 'refresh_token': dcf_token.refresh_token, 'client_id': client_id } + auth = requests.auth.HTTPBasicAuth(client_id, client_secret) - resp = requests.request('POST', DCF_TOKEN_URL, data=data, auth=auth) client_id = None client_secret = None + try: + resp = requests.request('POST', DCF_TOKEN_URL, data=data, auth=auth) + except Exception as e: + logger.error("[ERROR] Token acquisition Exception") + logger.exception(e) + raise DCFCommFailure() + if resp.status_code != 200: - logger.error("[INFO] Token acquisition problem: {} : {}".format(resp.status_code, resp.text)) - return None, None + logger.error("[ERROR] Token acquisition problem: {} : {}".format(resp.status_code, resp.text)) + raise DCFCommFailure() token_dict = json_loads(resp.text) id_token_decoded, id_token_dict = _decode_token(token_dict['id_token']) - return id_token_decoded, id_token_dict - - -def _refresh_access_token(user_id): - """ - DCF suggests we refresh the access token after e.g. unlinking. OAuth2Session usually handles token refreshes - # under the covers, but here we want to do it explicitly. 
- """ - - dcf_token = get_stored_dcf_token(user_id) - if not dcf_token: - raise TokenFailure() - - client_id, client_secret = _get_secrets() - - data = { - 'grant_type': 'refresh_token', - 'refresh_token': dcf_token.refresh_token, - 'client_id': client_id - } - auth = requests.auth.HTTPBasicAuth(client_id, client_secret) - resp = requests.request('POST', DCF_TOKEN_URL, data=data, auth=auth) - client_id = None - client_secret = None - if resp.status_code != 200: - logger.error("[INFO] Token acquisition problem: {} : {}".format(resp.status_code, resp.text)) - return None, None + if stash_it: + try: + _access_token_storage(token_dict, user_id) + except (TokenFailure, RefreshTokenExpired) as e: + logger.error("[ERROR] _user_data_from_token aborted: {}".format(str(e))) + raise e - token_dict = json_loads(resp.text) - _access_token_storage(token_dict, user_id) + return id_token_decoded, id_token_dict def _massage_user_data_for_dev(the_user): @@ -806,12 +865,82 @@ def _massage_user_data_for_dev(the_user): return the_user +def _unlink_at_dcf(user_id, do_refresh): + """ + There are only three places where we call DCF to do a Google unlink: 1) If they login via NIH and we get + a token for the user that tells us they already are linked to a Google ID that does not match their ISB-CGC + login ID. 2) We send them back to DCF to do the Google ID linking step and the callback informs us that they + have logged in with the wrong (not ISB-CGC) Google ID, and 3) the user has chosen to fully disconnect, and + dropping the Google ID is one step in the teardown flow. We NEVER enter a Google ID into the DCFToken + table if it does not match their ISB-CCG ID. + + Can raise TokenFailure, DCFCommFailure + + WARNING: DO NOT CALL this routine unless we have positive evidence returned from DCF that the user is + linked. It is an error to tell DCF to unlink if the user is not actually linked. That said, we will + log the discrepancy but not issue any error to the user. + + :raise TokenFailure: + :raise DCFCommFailure: + :raise RefreshTokenExpired: + + """ + + success = False + + # + # Call DCF to drop the linkage. Note that this will immediately remove them from controlled access. + # + + try: + resp = _dcf_call(DCF_GOOGLE_URL, user_id, mode='delete') # can raise TokenFailure, DCFCommFailure + except TokenFailure: + pass + except RefreshTokenExpired: + pass + except DCFCommFailure: + pass + except Exception as e: + logger.error("[ERROR] Attempt to contact DCF for Google ID unlink failed (user {})".format(user_id)) + raise e + + if resp.status_code == 404: + # We are trying to unlink, and DCF thinks there is no link. Silent failure! 
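Several spots above read claims straight out of a JWT (for example _decode_token on the id_token, and later the expiration buried in the refresh token) without another round trip to DCF. A JWT payload is just base64url-encoded JSON, so a minimal sketch of that kind of local decode looks like the following; it assumes signature verification happens elsewhere, and the helper name is illustrative.

import base64
from json import loads as json_loads

def decode_jwt_payload_sketch(token):
    # A JWT is three dot-separated base64url segments: header.payload.signature.
    payload_b64 = token.split('.')[1]
    # JWTs strip base64 padding, so restore it before decoding.
    padded = payload_b64 + '=' * (-len(payload_b64) % 4)
    return json_loads(base64.urlsafe_b64decode(padded))

# e.g. decode_jwt_payload_sketch(refresh_token)['exp'] yields the refresh expiration epoch.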
+ logger.error("[ERROR] No linked Google account found for user {}".format(user_id)) + success = True + elif resp.status_code == 400: + delete_response = json_loads(resp.text) + error = delete_response['error'] + message = delete_response['error_description'] + logger.error("[ERROR] Error returned in unlinking: {} : {}".format(error, message)) + elif resp.status_code == 200: + success = True + else: + logger.error("[ERROR] Unexpected response from DCF: {}".format(resp.status_code)) + + # + # Per discussions with DCF, need to ask for a new token from DCF after doing the unlinking + # since they care about the token info: + # + + if do_refresh: + _user_data_from_token(user_id, True) # Can raise TokenFailure, DCFCommFailure + + if not success: + raise DCFCommFailure() + return -def _unlink_internals(user_id, just_with_dcf): +def _unlink_internally(user_id, just_with_dcf): + # FIXME NEED SOME INTERNAL UNLINK CODE, THIS NEEDS WORK """ - Handles all the internal details of unlinking a user's Google ID. + There are only three places where we call DCF to do a Google unlink: 1) If they login via NIH and we get + a token for the user that tells us they already are linked to a Google ID that does not match their ISB-CGC + login ID. 2) We send them back to DCF to do the Google ID linking step and the callback informs us that they + have logged in with the wrong (not ISB-CGC) Google ID, and 3) the user has chosen to fully disconnect, and + dropping the Google ID is one step in the teardown flow. We NEVER enter a Google ID into the DCFToken + table if it does not match their ISB-CCG id. """ warnings = [] errors = [] @@ -821,9 +950,12 @@ def _unlink_internals(user_id, just_with_dcf): # Get our concept of linking state from the token DB: # if not just_with_dcf: - dcf_token = get_stored_dcf_token(user_id) - if not dcf_token: - raise TokenFailure() + try: + dcf_token = get_stored_dcf_token(user_id) + except TokenFailure: + pass + except RefreshTokenExpired: + pass the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) google_link = _get_google_link_from_user_dict(the_user_dict) @@ -835,45 +967,24 @@ def _unlink_internals(user_id, just_with_dcf): # First, call DCF to drop the linkage. This is the only way to get the user # booted out of control groups. # - - try: - resp = _dcf_call(DCF_GOOGLE_URL, user_id, mode='delete') - except Exception as e: - logger.error("[ERROR] Attempt to contact DCF for Google ID unlink failed (user {})".format(user_id)) - logger.exception(e) - errors.append("Unexpected error in unlinking") - return success, warnings, errors - - if resp.status_code == 404: - warnings.append("No linked Google account found for user") - elif resp.status_code == 400: - delete_response = json_loads(resp.text) - error = delete_response['error'] - message = delete_response['error_description'] - errors.append("Error in unlinking: {} : {}".format(error, message)) - elif resp.status_code == 200: - success = True - else: - warnings.append("Unexpected response from DCF") - - if just_with_dcf: - return success, warnings, errors - # # Per discussions with DCF, need to ask for a new token from DCF after doing the unlinking # since they care about the token info: # - _refresh_access_token(user_id) + _user_data_from_token(user_id, True) # Can raise TokenFailure or DCFCommFailure # # The Token table records the User's Google ID. This needs to be nulled. 
The expiration time in the DCFToken # is for the access token, not the google link (that info is stored in the NIH_user): # - dcf_token = get_stored_dcf_token(user_id) - if not dcf_token: - raise TokenFailure() + try: + dcf_token = get_stored_dcf_token(user_id) + except TokenFailure: + pass + except RefreshTokenExpired: + pass dcf_token.google_id = None dcf_token.save() @@ -898,16 +1009,107 @@ def _unlink_internals(user_id, just_with_dcf): return success, warnings, errors + +# def _unlink_internals(user_id, just_with_dcf): +# """ +# Handles all the internal details of unlinking a user's Google ID. +# """ +# warnings = [] +# errors = [] +# success = False +# +# # +# # Get our concept of linking state from the token DB: +# # +# if not just_with_dcf: +# dcf_token = get_stored_dcf_token(user_id) +# if not dcf_token: +# raise TokenFailure() +# the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) +# google_link = _get_google_link_from_user_dict(the_user_dict) +# +# if google_link is None: +# warnings.append("User is not linked to Google") +# return success, warnings, errors +# +# # +# # First, call DCF to drop the linkage. This is the only way to get the user +# # booted out of control groups. +# # +# +# try: +# resp = _dcf_call(DCF_GOOGLE_URL, user_id, mode='delete') +# except Exception as e: +# logger.error("[ERROR] Attempt to contact DCF for Google ID unlink failed (user {})".format(user_id)) +# logger.exception(e) +# errors.append("Unexpected error in unlinking") +# return success, warnings, errors +# +# if resp.status_code == 404: +# warnings.append("No linked Google account found for user") +# elif resp.status_code == 400: +# delete_response = json_loads(resp.text) +# error = delete_response['error'] +# message = delete_response['error_description'] +# errors.append("Error in unlinking: {} : {}".format(error, message)) +# elif resp.status_code == 200: +# success = True +# else: +# warnings.append("Unexpected response from DCF") +# +# if just_with_dcf: +# return success, warnings, errors +# +# # +# # Per discussions with DCF, need to ask for a new token from DCF after doing the unlinking +# # since they care about the token info: +# # +# +# _user_data_from_token(user_id, True) # Can raise TokenFailure or DCFCommFailure +# +# # +# # The Token table records the User's Google ID. This needs to be nulled. The expiration time in the DCFToken +# # is for the access token, not the google link (that info is stored in the NIH_user): +# # +# +# dcf_token = get_stored_dcf_token(user_id) +# if not dcf_token: +# raise TokenFailure() +# +# dcf_token.google_id = None +# dcf_token.save() +# +# # +# # Now drop the link flag and active flag from the DB, plus our db records of what datasets the user is +# # good for: +# # +# +# try: +# message = unlink_account_in_db_for_dcf(user_id) +# if message: +# errors.append(message) +# success = False +# +# except Exception as e: +# logger.error("[ERROR] While unlinking accounts:") +# logger.exception(e) +# errors.append('There was an error when attempting to unlink your Google ID - please contact the administrator.') +# success = False +# +# return success, warnings, errors + def _refresh_token_storage(token_dict, decoded_jwt, user_token, nih_username_from_dcf, dcf_uid, cgc_uid, google_id): """ - This is called when the user needs to get a new 30-day refresh token from DCF by logging into - NIH (or if they explicitly disconnect their NIH identity and need to reauthenticate to DCF again). 
+ This is called when the user logs into DCF for the first time, whenever they need to get a new 30-day refresh + token from DCF by logging in, or if they explicitly disconnect their NIH identity and need to reauthenticate + to DCF again. It creates or refreshes the token in the database. """ # # We need to extract out the expiration time buried in the refresh token. When the refresh token # expires (30 days) the user has to reauthenticate with DCF: # + refresh_token = token_dict['refresh_token'] refresh_tokens_b64 = refresh_token.split('.') i64 = refresh_tokens_b64[1] @@ -962,6 +1164,9 @@ def _access_token_storage(token_dict, cgc_uid): """ This call just replaces the access key and user token part of the DCF record. Used when we use the refresh token to get a new access key. + + :raises TokenFailure: + :raises RefreshTokenExpired: """ # This refers to the *access key* expiration (~20 minutes) @@ -980,9 +1185,11 @@ def _access_token_storage(token_dict, cgc_uid): # id_token_decoded, _ = _decode_token(token_dict['id_token']) - dcf_token = get_stored_dcf_token(cgc_uid) - if not dcf_token: - raise TokenFailure() + try: + dcf_token = get_stored_dcf_token(cgc_uid) + except (TokenFailure, RefreshTokenExpired) as e: + logger.error("[INFO] _access_token_storage aborted: {}".format(str(e))) + raise e dcf_token.access_token = token_dict['access_token'] dcf_token.user_token = id_token_decoded @@ -1026,7 +1233,13 @@ def dcf_disconnect_user(request): # no DCF token anymore. Catch that case and silently no-op: # - dcf_token = get_stored_dcf_token(request.user.id) + try: + dcf_token = get_stored_dcf_token(request.user.id) + except TokenFailure: + pass + except RefreshTokenExpired: + pass + if not dcf_token: return redirect(reverse('user_detail', args=[request.user.id])) @@ -1037,7 +1250,17 @@ def dcf_disconnect_user(request): logger.info("[INFO] DDU A") if google_link: - resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='delete') + try: + resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='delete') + except TokenFailure: + pass + except RefreshTokenExpired: + pass + except DCFCommFailure: + pass + + + if resp.status_code == 404: msg_list.append("No linked Google account found for user, code {}".format(resp.status_code)) elif resp.status_code == 400: @@ -1098,37 +1321,33 @@ def dcf_disconnect_user(request): logout_callback = request.build_absolute_uri(reverse('user_detail', args=[request.user.id])) logger.info("[INFO] DDU D") callback = '{}?next={}'.format(DCF_LOGOUT_URL, logout_callback) + + except TokenFailure as e: + logger.warning("[INFO] MissingTokenError seen") + logger.exception(e) + return redirect(reverse('user_detail', args=[request.user.id])) + except DCFCommFailure as e: + logger.warning("DCF Exception") + logger.exception(e) + return redirect(reverse('user_detail', args=[request.user.id])) except Exception as e: logger.error("[ERROR] While disconnect:") logger.exception(e) - raise e - return HttpResponseRedirect(callback) - + return redirect(reverse('user_detail', args=[request.user.id])) -# @login_required -# def dcf_get_user_data(request): -# """ -# Use for QC and development if we need to see token info. 
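For reference, the bulk of _refresh_token_storage sits outside these hunks; conceptually it upserts a single DCFToken row per ISB-CGC user, carrying both the short access-token expiration and the 30-day refresh expiration. A rough sketch of that upsert, using only DCFToken fields referenced elsewhere in this file (the exact keyword arguments are an assumption, not the actual body):

def refresh_token_storage_sketch(token_dict, user_token, nih_username_from_dcf,
                                 cgc_uid, google_id, refresh_expire_time, access_expire_time):
    # One row per ISB-CGC user, keyed on the Django user id.
    DCFToken.objects.update_or_create(
        user_id=cgc_uid,
        defaults={
            'nih_username': nih_username_from_dcf,
            'access_token': token_dict['access_token'],
            'refresh_token': token_dict['refresh_token'],
            'user_token': user_token,
            'google_id': google_id,                      # None unless it matches the ISB-CGC login
            'expires_at': access_expire_time,            # ~20 minute access token lifetime
            'refresh_expires_at': refresh_expire_time,   # ~30 day refresh token lifetime
        })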
Not used in production -# """ -# -# id_token_decoded, _ = _user_data_from_token(request.user.id) -# -# resp = _dcf_call(DCF_USER_URL, request.user.id) -# user_data = json_loads(resp.text) -# -# remaining_token_time = get_dcf_auth_key_remaining_seconds(request.user.id) -# messages.warning(request, 'EPDCF Responded with {}: {} plus {}'.format(user_data, remaining_token_time, id_token_decoded)) -# return redirect(reverse('user_detail', args=[request.user.id])) + return HttpResponseRedirect(callback) def _dcf_call(full_url, user_id, mode='get', post_body=None, force_token=False): """ All the stuff around a DCF call that handles token management and refreshes. + + :raises TokenFailure: + :raises DCFCommFailure: + :raises RefreshTokenExpired: """ - dcf_token = get_stored_dcf_token(user_id) - if not dcf_token: - raise TokenFailure() + dcf_token = get_stored_dcf_token(user_id) # Can raise a TokenFailure or RefreshTokenExpired expires_in = (dcf_token.expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() logger.info("[INFO] Token Expiration : {} seconds".format(expires_in)) @@ -1152,30 +1371,40 @@ def token_storage_for_user(my_token_dict): dcf = OAuth2Session(client_id, token=token_dict, auto_refresh_url=DCF_TOKEN_URL, auto_refresh_kwargs=extra_dict, token_updater=token_storage_for_user) + extra_dict = None # Hoo boy! You *MUST* provide the client_id and client_secret in the call itself to insure an OAuth2Session token # refresh call uses HTTPBasicAuth! - # FIXME can get an exception here (BAD REQUEST) if refresh token has e.g. been revoked and not dropped out of DB. - # FIXME: Also have seen this when requesting an unlink - # FIXME: reply: 'HTTP/1.1 401 UNAUTHORIZED\r\n' after staging server is rolled?? - # FIXME: "/home/vagrant/www/lib/oauthlib/oauth2/rfc6749/parameters.py" - # FIXME: MissingTokenError: (missing_token) Missing access token parameter. + # We have seen an exception here (BAD REQUEST) if refresh token has e.g. been revoked and not dropped out of DB. + # Also have seen this when requesting an unlink: + # reply: 'HTTP/1.1 401 UNAUTHORIZED\r\n' after staging server is rolled?? + # "/home/vagrant/www/lib/oauthlib/oauth2/rfc6749/parameters.py" + # MissingTokenError: (missing_token) Missing access token parameter. + try: resp = dcf.request(mode, full_url, client_id=client_id, client_secret=client_secret, data=post_body) + except (TokenFailure, RefreshTokenExpired) as e: + # bubbles up from token_storage_for_user call + logger.error("[ERROR] _dcf_call {} aborted: {}".format(full_url, str(e))) + raise e except MissingTokenError as e: - print "drop the records from the database {}".format(str(e)) - print "NO! gotta remember they linked as NIH ID before!!" 
- except TokenFailure as e: - print "token problem" + logger.warning("[INFO] MissingTokenError seen") + logger.exception(e) + raise TokenFailure() except Exception as e: - print "drop the records from the database {}".format(str(e)) + logger.warning("DCF Exception") + logger.exception(e) + raise DCFCommFailure() return resp def _get_secrets(): + """ + Keep hidden info hidden as much as possible + """ dcf_secrets = _read_dict(settings.DCF_CLIENT_SECRETS) client_id = dcf_secrets['DCF_CLIENT_ID'] client_secret = dcf_secrets['DCF_CLIENT_SECRET'] @@ -1183,6 +1412,9 @@ def _get_secrets(): def _read_dict(my_file_name): + """ + Keep hidden info hidden as much as possible + """ retval = {} with open(my_file_name, 'r') as f: for line in f: @@ -1208,6 +1440,16 @@ def get_nih_user_details_from_token(user_id): # dcf_token = get_stored_dcf_token(user_id) + try: + dcf_token = get_stored_dcf_token(user_id) + except TokenFailure: + pass + except RefreshTokenExpired: + pass + + + + if not dcf_token: return user_details @@ -1261,7 +1503,7 @@ def get_nih_user_details_from_token(user_id): logger.debug("[DEBUG] User {} has access to {} dataset(s) and is {}".format(nih_user.NIH_username, str(len(user_auth_datasets)), ('not active' if not nih_user.active else 'active'))) user_details['NIH_active'] = nih_user.active user_details['NIH_DCF_linked'] = nih_user.linked - user_details['refresh_key_ok'] = get_dcf_auth_key_remaining_seconds(user_id) > settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS + user_details['refresh_key_ok'] = get_dcf_refresh_key_remaining_seconds(user_id) > settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS user_details['auth_datasets'] = [] if len(user_auth_datasets) <= 0 else AuthorizedDataset.objects.filter(id__in=user_auth_datasets.values_list('authorized_dataset',flat=True)) return user_details @@ -1313,6 +1555,9 @@ def _refresh_from_dcf(user_id): """ We would like to check if our view of the user (linkage, expirations, datasets) is consistent with what the DCF thinks, and update accordingly! + + :raises TokenFailure: + :raises RefreshTokenExpired: """ user_email = User.objects.get(id=user_id).email @@ -1321,7 +1566,12 @@ def _refresh_from_dcf(user_id): # Haul the user data token string down from DCF: # - the_user_token = _get_user_data_token_string(user_id) # the_user_token is a string + try: + the_user_token = _get_user_data_token_string(user_id) # the_user_token is a string. + except (TokenFailure, RefreshTokenExpired) as e: + raise e + + # Can raise TokenFailure or DCFCommFailure # # Things that could be different: Google ID linkage, expiration time, approved datasets. @@ -1341,10 +1591,13 @@ def _refresh_from_dcf(user_id): # Compare to our versions: # - dcf_token = get_stored_dcf_token(user_id) - if not dcf_token: - print "we have no token" - return + try: + dcf_token = get_stored_dcf_token(user_id) + except TokenFailure: + pass + except RefreshTokenExpired: + pass + google_match_state = _compare_google_ids(dcf_google_link, dcf_token.google_id, user_email) google_problem = None @@ -1440,4 +1693,19 @@ def _refresh_from_dcf(user_id): # messages.warning(request, warning) # for error in errors: # messages.error(request, error) +# return redirect(reverse('user_detail', args=[request.user.id])) + +# @login_required +# def dcf_get_user_data(request): +# """ +# Use for QC and development if we need to see token info. 
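_refresh_from_dcf above boils down to comparing three things DCF reports (Google link, expiration, approved projects) against our own records. For the project/dataset portion, the comparison is naturally expressed as set differences; a sketch under the assumption that both sides reduce to sets of dataset identifiers (the helper and argument names are illustrative):

def compare_dataset_access_sketch(dcf_project_ids, local_dataset_ids):
    # dcf_project_ids: identifiers parsed from the user token DCF just returned.
    # local_dataset_ids: what UserAuthorizedDatasets currently records for this user.
    dcf_ids = set(dcf_project_ids)
    local_ids = set(local_dataset_ids)
    to_add = dcf_ids - local_ids    # grants DCF reports that we have not recorded
    to_drop = local_ids - dcf_ids   # records DCF no longer vouches for
    return to_add, to_drop, dcf_ids & local_ids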
Not used in production +# """ +# +# id_token_decoded, _ = _user_data_from_token(request.user.id, False) Can raise TokenFailure or DCFCommFailure +# +# resp = _dcf_call(DCF_USER_URL, request.user.id) +# user_data = json_loads(resp.text) +# +# remaining_token_time = get_dcf_auth_key_remaining_seconds(request.user.id) +# messages.warning(request, 'EPDCF Responded with {}: {} plus {}'.format(user_data, remaining_token_time, id_token_decoded)) # return redirect(reverse('user_detail', args=[request.user.id])) \ No newline at end of file diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index eda90741..52093038 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -48,6 +48,17 @@ GOOGLE_ORG_WHITELIST_PATH = settings.GOOGLE_ORG_WHITELIST_PATH MANAGED_SERVICE_ACCOUNTS_PATH = settings.MANAGED_SERVICE_ACCOUNTS_PATH + +class TokenFailure(Exception): + """Thrown if we have problems with our access/refresh tokens """ + + +class RefreshTokenExpired(Exception): + """Thrown if our refresh token is no longer valid and user must log in """ + + def __init__(self, seconds): + self.seconds = seconds + def verify_service_account(gcp_id, service_account, datasets, user_email, is_refresh=False, is_adjust=False, remove_all=False): # Only verify for protected datasets @@ -1005,23 +1016,6 @@ def demo_process_success(auth, user_id, saml_response): return retval -def get_dcf_auth_key_remaining_seconds(user_id): - """ - We need to know how many seconds are left before the user needs to log back in to NIH to get - a new refresh token, which will expire every 30 days. - """ - - dcf_token = get_stored_dcf_token(user_id) - if not dcf_token: - return -1 # ? No token? They expire immediately! - - remaining_seconds = (dcf_token.refresh_expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() - logger.info('[INFO] user {} has {} seconds remaining on refresh token'. - format(dcf_token.nih_username, remaining_seconds)) - - return remaining_seconds - - def handle_user_db_update_for_dcf_linking(user_id, user_data_dict, nih_assertion_expiration, st_logger): """ When user logs into DCF using iTrust and links via DCF, we create an NIH record for them and link them to to their data. @@ -1054,7 +1048,7 @@ def handle_user_db_update_for_dcf_linking(user_id, user_data_dict, nih_assertion str(nih_user.NIH_username))) st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, "[STATUS] NIH_User.objects updated nih_user for linking: {}".format( - str(nih_user.NIH_username))) + str(nih_user.NIH_username))) st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, "[STATUS] NIH_User {} associated with email {}".format( str(nih_user.NIH_username), our_user.email)) @@ -1310,11 +1304,34 @@ def deactivate_nih_add_to_open(user_id, user_email): logger.info(e) +def get_dcf_refresh_key_remaining_seconds(user_id): + """ + We need to know how many seconds are left before the user needs to log back in to NIH to get + a new refresh token, which will expire every 30 days. + """ + + try: + dcf_token = get_stored_dcf_token(user_id) + except TokenFailure: + return -1 # ? No token? They expire immediately! + except RefreshTokenExpired as e: + return e.seconds + + remaining_seconds = (dcf_token.refresh_expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() + logger.info('[INFO] user {} has {} seconds remaining on refresh token'. 
+ format(dcf_token.nih_username, remaining_seconds)) + + return remaining_seconds + + def get_stored_dcf_token(user_id): """ When a user breaks their connection with DCF, we flush out the revoked tokens. But if they have a session running in another browser, they might still be clicking on links that expect a token. So - we need to be bulletproof on maybe not getting back a token. May return None + we need to be bulletproof on maybe not getting back a token. + + :raises TokenFailure: + :raises RefreshTokenExpired: """ dcf_tokens = DCFToken.objects.filter(user_id) num_tokens = len(dcf_tokens) @@ -1323,15 +1340,27 @@ def get_stored_dcf_token(user_id): logger.error('[ERROR] Unexpected Server Error: Multiple tokens found for user {}'.format(user_id)) else: logger.info('[INFO] User {} tried to use a flushed token'.format(user_id)) - return None + raise TokenFailure() + dcf_token = dcf_tokens.first() + remaining_seconds = (dcf_token.refresh_expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() + if remaining_seconds <= 60: + raise RefreshTokenExpired(remaining_seconds) + return dcf_token def get_nih_user_details(user_id, force_logout): + """ + :param user_id: + :param force_logout: + :return: + """ user_details = {} if settings.DCF_TEST: + # FIXME: Check in with DCF for info, throw DCFCommError if we have problems + # FIXME: If refresh token is expired, we cannot show any info until they log back in! if force_logout: user_details['force_DCF_logout'] = True @@ -1345,9 +1374,14 @@ def get_nih_user_details(user_id, force_logout): # issue by looking at the current DCF token attached to the user to see who they are associated with. # - dcf_token = get_stored_dcf_token(user_id) - if not dcf_token: - return user_details # i.e. empty dict + user_details['refresh_required'] = False + try: + dcf_token = get_stored_dcf_token(user_id) + except TokenFailure: + return user_details # i.e. empty dict + except RefreshTokenExpired: + user_details['refresh_required'] = True + return user_details curr_user = User.objects.get(id=user_id) @@ -1438,7 +1472,7 @@ def get_nih_user_details(user_id, force_logout): if settings.DCF_TEST: user_details['link_mismatch'] = (dcf_token.google_id is None) or (dcf_token.google_id != curr_user.email) - user_details['refresh_key_ok'] = get_dcf_auth_key_remaining_seconds(user_id) > settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS + user_details['refresh_key_ok'] = get_dcf_refresh_key_remaining_seconds(user_id) > settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS user_details['force_DCF_logout'] = False return user_details From 9d7302b105446f73745e22da6840e0f73ff74419 Mon Sep 17 00:00:00 2001 From: "S. Paquette" Date: Fri, 29 Jun 2018 14:15:46 -0700 Subject: [PATCH 27/76] *** REQUIRES MIGRATION *** -> #2298: BQ Datasets can't be duplicate registered in the same project, and duplicate buckets can't be registered at all. 
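As illustration of what the constraint below buys us: once (google_project, dataset_name) is unique_together and bucket_name is unique, a duplicate registration is refused by the database itself, not only by the view-level checks. A sketch of how that surfaces, assuming the models from this diff (the helper is illustrative):

from django.db import IntegrityError

def register_dataset_once_sketch(gcp, dataset_name):
    try:
        # A second identical pair now violates the unique_together constraint added below.
        return BqDataset.objects.create(google_project=gcp, dataset_name=dataset_name)
    except IntegrityError:
        # Caller surfaces the same "already registered" message the view builds.
        return None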
--- accounts/models.py | 5 ++++- accounts/views.py | 26 ++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/accounts/models.py b/accounts/models.py index cacb3478..057402eb 100755 --- a/accounts/models.py +++ b/accounts/models.py @@ -84,7 +84,7 @@ def active_service_accounts(self): class Bucket(models.Model): google_project = models.ForeignKey(GoogleProject, null=False) - bucket_name = models.CharField(null=True,max_length=155) + bucket_name = models.CharField(null=True,max_length=155, unique=True) bucket_permissions = models.TextField(null=True) def __str__(self): @@ -94,6 +94,9 @@ class BqDataset(models.Model): google_project = models.ForeignKey(GoogleProject, null=False) dataset_name = models.CharField(null=False, max_length=155) + class Meta: + unique_together = (("google_project", "dataset_name"),) + class AuthorizedDataset(models.Model): name = models.CharField(max_length=256, null=False) diff --git a/accounts/views.py b/accounts/views.py index e2ad421f..0072ae50 100755 --- a/accounts/views.py +++ b/accounts/views.py @@ -476,6 +476,21 @@ def register_bucket(request, user_id, gcp_id): # Check bucketname not null if not bucket_name: messages.error(request, 'There was no bucket name provided.') + return redirect('gcp_detail', user_id=user_id, gcp_id=gcp_id) + else: + try: + bucket = Bucket.objects.get(bucket_name=bucket_name) + if bucket.google_project.project_id != gcp.project_id: + messages.error(request,"A bucket with that name has already been registered under a different project.") + else: + messages.error(request, "A bucket with that name has already been registered under this project. Buckets can only be registered once.") + return redirect('gcp_detail', user_id=user_id, gcp_id=gcp_id) + except MultipleObjectsReturned: + messages.error(request, + "More than one bucket with that name has already been registered. Buckets can only be registered once.") + return redirect('gcp_detail', user_id=user_id, gcp_id=gcp_id) + except ObjectDoesNotExist: + pass # Check that bucket is in project try: @@ -546,8 +561,19 @@ def register_bqdataset(request, user_id, gcp_id): # Check bqdatasetname not null if not bqdataset_name: messages.error(request, 'There was no dataset name provided.') + return redirect('gcp_detail', user_id=user_id, gcp_id=gcp_id) else: bqdataset_name = bqdataset_name.strip() + try: + BqDataset.objects.get(dataset_name=bqdataset_name,google_project=gcp) + messages.error(request,"A dataset with this name has already been registered for project {}.".format(gcp.project_id)) + return redirect('gcp_detail', user_id=user_id, gcp_id=gcp_id) + except MultipleObjectsReturned: + messages.error(request, "Multiple datasets with this name have already been registered for project {}.".format( + gcp.project_id)) + return redirect('gcp_detail', user_id=user_id, gcp_id=gcp_id) + except ObjectDoesNotExist: + pass # Check that bqdataset is in project try: From 47f0ab4d9a9253fce70f0c7cff6b14ea9b407bfe Mon Sep 17 00:00:00 2001 From: "S. 
Paquette" Date: Fri, 29 Jun 2018 14:17:37 -0700 Subject: [PATCH 28/76] -> Migration for #2298 --- .../migrations/0017_auto_20180629_1416.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 accounts/migrations/0017_auto_20180629_1416.py diff --git a/accounts/migrations/0017_auto_20180629_1416.py b/accounts/migrations/0017_auto_20180629_1416.py new file mode 100644 index 00000000..0bcf9dae --- /dev/null +++ b/accounts/migrations/0017_auto_20180629_1416.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.10 on 2018-06-29 21:16 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('accounts', '0016_auto_20180524_1033'), + ] + + operations = [ + migrations.AlterField( + model_name='bucket', + name='bucket_name', + field=models.CharField(max_length=155, null=True, unique=True), + ), + migrations.AlterUniqueTogether( + name='bqdataset', + unique_together=set([('google_project', 'dataset_name')]), + ), + ] From 1192d27c09d91b30944fc7348dcce4bbdedfe291 Mon Sep 17 00:00:00 2001 From: "S. Paquette" Date: Fri, 29 Jun 2018 14:29:25 -0700 Subject: [PATCH 29/76] -> Escape user input for Reasons -> Re-iterate name provided, improve error message --- accounts/views.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/accounts/views.py b/accounts/views.py index 0072ae50..75c56556 100755 --- a/accounts/views.py +++ b/accounts/views.py @@ -481,13 +481,24 @@ def register_bucket(request, user_id, gcp_id): try: bucket = Bucket.objects.get(bucket_name=bucket_name) if bucket.google_project.project_id != gcp.project_id: - messages.error(request,"A bucket with that name has already been registered under a different project.") + messages.error( + request, + "A bucket with the name {} has already been registered under a different project.".format(escape(bucket_name)) + + " If you feel you've received this message in error, please contact the administrator." + ) else: - messages.error(request, "A bucket with that name has already been registered under this project. Buckets can only be registered once.") + messages.error( + request, + "A bucket with the name {} has already been registered under project {}.".format(escape(bucket_name),gcp.project_id) + + " Buckets can only be registered to a project once. If you feel you've received this message in error, please contact the administrator." + ) return redirect('gcp_detail', user_id=user_id, gcp_id=gcp_id) except MultipleObjectsReturned: - messages.error(request, - "More than one bucket with that name has already been registered. Buckets can only be registered once.") + messages.error( + request, + "More than one bucket with the name {} has already been registered.".format(escape(bucket_name)) + + " Buckets can only be registered once." 
+ ) return redirect('gcp_detail', user_id=user_id, gcp_id=gcp_id) except ObjectDoesNotExist: pass @@ -566,11 +577,11 @@ def register_bqdataset(request, user_id, gcp_id): bqdataset_name = bqdataset_name.strip() try: BqDataset.objects.get(dataset_name=bqdataset_name,google_project=gcp) - messages.error(request,"A dataset with this name has already been registered for project {}.".format(gcp.project_id)) + messages.error(request,"A dataset with the name {} has already been registered for project {}.".format(escape(bqdataset_name),gcp.project_id)) return redirect('gcp_detail', user_id=user_id, gcp_id=gcp_id) except MultipleObjectsReturned: - messages.error(request, "Multiple datasets with this name have already been registered for project {}.".format( - gcp.project_id)) + messages.error(request, "Multiple datasets with the name {} have already been registered for project {}.".format( + escape(bqdataset_name),gcp.project_id)) return redirect('gcp_detail', user_id=user_id, gcp_id=gcp_id) except ObjectDoesNotExist: pass From ed20614536c8924c48eed827fcda5d7dd9de4f5f Mon Sep 17 00:00:00 2001 From: elainelee Date: Fri, 29 Jun 2018 14:55:45 -0700 Subject: [PATCH 30/76] File Browser: Filter by Case Barcode - changing sql query a 'like %s%' statement --- cohorts/metadata_counting.py | 6 +++--- cohorts/views.py | 25 +++++++++++++++++-------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/cohorts/metadata_counting.py b/cohorts/metadata_counting.py index 2c140898..894546ee 100644 --- a/cohorts/metadata_counting.py +++ b/cohorts/metadata_counting.py @@ -199,7 +199,7 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f subfilter[filter] = inc_filters[filter] built_clause = build_where_clause(subfilter, for_files=True) - filter_clauses[filter]['where_clause'] = built_clause['query_str'] + filter_clauses[filter]['where_clause'] = built_clause['query_str'].replace("%s", "'{}'") filter_clauses[filter]['parameters'] = built_clause['value_tuple'] for attr in metadata_data_attr: @@ -207,10 +207,10 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f where_clause = "" filter_clause = ') AND ('.join([filter_clauses[x]['where_clause'] for x in filter_clauses if x != attr or (filter_format and attr == 'data_format')]) if len(filter_clause): + filter_clause = filter_clause.format(*[y for x in filter_clauses for y in filter_clauses[x]['parameters'] if x != attr or (filter_format and attr == 'data_format')]) where_clause = "AND ( {} )".format(filter_clause) - paramter_tuple = tuple(y for x in filter_clauses for y in filter_clauses[x]['parameters'] if x != attr or (filter_format and attr == 'data_format')) query = QUERY_BASE.format(data_query_clause=data_query, where_clause=where_clause, attr=attr) - cursor.execute(query, paramter_tuple) + cursor.execute(query) for row in cursor.fetchall(): val = "None" if not row[0] else row[0] counts[attr][val] = row[1] diff --git a/cohorts/views.py b/cohorts/views.py index 1b0ed7ef..9f00d657 100755 --- a/cohorts/views.py +++ b/cohorts/views.py @@ -2068,7 +2068,8 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co file_list = [] total_file_count = 0 case_barcode = request.GET.get('case_barcode', '') - case_barcode_condition = '' if not case_barcode else "AND cs.case_barcode ='" + case_barcode + "'" + #case_barcode_condition = '' if not case_barcode else "AND cs.case_barcode ='" + case_barcode + "'" + case_barcode_condition = '' if not case_barcode else "AND cs.case_barcode like 
'%" + case_barcode + "%'" try: # Attempt to get the cohort perms - this will cause an excpetion if we don't have them Cohort_Perms.objects.get(cohort_id=cohort_id, user_id=user_id) @@ -2126,10 +2127,12 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co } if limit > 0: - limit_clause = ' LIMIT %s' % str(limit) + #limit_clause = ' LIMIT %s' % str(limit) + limit_clause = ' LIMIT {}'.format(str(limit)) # Offset is only valid when there is a limit if offset > 0: - offset_clause = ' OFFSET %s' % str(offset) + #offset_clause = ' OFFSET %s' % str(offset) + offset_clause = ' OFFSET {}'.format(str(offset)) order_clause = "ORDER BY " + col_map[sort_column] + (" DESC" if sort_order == 1 else "") @@ -2210,7 +2213,7 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co cursor = db.cursor(MySQLdb.cursors.DictCursor) cohort_programs = Cohort.objects.get(id=cohort_id).get_programs() - params = () + #params = () select_clause = '' count_select_clause = '' first_program = True @@ -2225,7 +2228,7 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co if len(inc_filters): built_clause = build_where_clause(inc_filters, for_files=True) filter_conditions = 'AND ' + built_clause['query_str'] - params += built_clause['value_tuple'] + filter_conditions = filter_conditions.replace("%s", "'{}'").format(*built_clause['value_tuple']) union_template = (" UNION " if not first_program else "") + "(" + select_clause_base + ")" select_clause += union_template.format( @@ -2247,16 +2250,22 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co if not first_program: if limit > 0: - limit_clause = ' LIMIT %s' % str(limit) + #limit_clause = ' LIMIT %s' % str(limit) + limit_clause = ' LIMIT {}'.format(str(limit)) # Offset is only valid when there is a limit if offset > 0: - offset_clause = ' OFFSET %s' % str(offset) + #offset_clause = ' OFFSET %s' % str(offset) + offset_clause = ' OFFSET {}'.format(str(offset)) order_clause = "ORDER BY "+col_map[sort_column]+(" DESC" if sort_order == 1 else "") start = time.time() query = file_list_query.format(select_clause=select_clause, order_clause=order_clause, limit_clause=limit_clause, offset_clause=offset_clause) - cursor.execute(query, params) + #final_query = query % params + #cursor.execute(query, params) + #print(final_query) + #print(params) + cursor.execute(query) stop = time.time() logger.info("[STATUS] Time to get file-list: {}s".format(str(stop - start))) From 235ae744319c08d43ea952c0241382157f68c821 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Mon, 2 Jul 2018 19:30:12 -0700 Subject: [PATCH 31/76] Making progress... 
--- accounts/dcf_views.py | 810 +++++++++++++++++------------------------- accounts/sa_utils.py | 55 ++- 2 files changed, 380 insertions(+), 485 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 8570aed1..315c3155 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -34,7 +34,8 @@ from sa_utils import found_linking_problems, DemoLoginResults, handle_user_for_dataset,\ handle_user_db_update_for_dcf_linking, \ unlink_account_in_db_for_dcf, get_dcf_refresh_key_remaining_seconds, \ - get_stored_dcf_token, TokenFailure, RefreshTokenExpired + get_stored_dcf_token, TokenFailure, RefreshTokenExpired, InternalTokenError, \ + force_dcf_token_expiration from models import DCFToken, AuthorizedDataset, NIH_User, UserAuthorizedDatasets from requests_oauthlib.oauth2_session import OAuth2Session @@ -339,6 +340,7 @@ def oauth2_callback(request): # through, it will be blank. Otherwise, it either matches our login ID, or might be some rando # email if the user e.g. bailed before fixing it last time. We will not enter a value for that # field in the DB unless the ID coming back from DCF matches our login ID. + # save_google_link = None if google_link: @@ -379,44 +381,43 @@ def oauth2_callback(request): logger.info("[INFO] OAuthCB l") if google_link != req_user.email: try: - _unlink_at_dcf(request.user.id, True) # True = recently saved token is now updated with unlinked state + _unlink_at_dcf(request.user.id, True) # True means after unlinking, we call DCF again to update our link state message = "You must use your ISB-CGC login email ({}) to link with the DCF instead of {}".format( req_user.email, google_link) messages.warning(request, message) return redirect(reverse('user_detail', args=[request.user.id])) - except TokenFailure as e: + except TokenFailure: messages.error(request, internal_err_msg.format("005")) return redirect(reverse('user_detail', args=[request.user.id])) except RefreshTokenExpired: messages.error(request, internal_err_msg.format("005a")) return redirect(reverse('user_detail', args=[request.user.id])) - except DCFCommFailure as e: + except DCFCommFailure: messages.error(request, comm_err_msg) return redirect(reverse('user_detail', args=[request.user.id])) + except InternalTokenError: + messages.error(request, internal_err_msg.format("005b")) + return redirect(reverse('user_detail', args=[request.user.id])) # - # The link matches. So we use PATCH, and if it goes smoothly, we write the new link to the database: + # The link matches. So we use PATCH. Any problems encountered and we return error message to user: # logger.info("[INFO] OAuthCB m") + try: - resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='patch') + err_msg, returned_expiration_str, _ = _refresh_at_dcf(request.user.id) except TokenFailure: - messages.error(request, internal_err_msg.format("006")) - return redirect(reverse('user_detail', args=[request.user.id])) + err_msg = internal_err_msg.format("006") + except InternalTokenError: + err_msg = internal_err_msg.format("006a") except RefreshTokenExpired: - messages.error(request, internal_err_msg.format("007")) - return redirect(reverse('user_detail', args=[request.user.id])) + err_msg = internal_err_msg.format("007") except DCFCommFailure: - messages.error(request, comm_err_msg) - return redirect(reverse('user_detail', args=[request.user.id])) + err_msg = comm_err_msg - if resp.status_code == 404: # Now DCF says user is NOT linked... 
- messages.error(request, internal_err_msg.format("008")) - return redirect(reverse('user_detail', args=[request.user.id])) - elif resp.status_code != 200: - logger.error("[ERROR] Unexpected response ({}, {}) from DCF during linking.".format(resp.status_code, resp.text)) - messages.warning(request, internal_err_msg.format("009")) + if err_msg: + messages.error(request, err_msg) return redirect(reverse('user_detail', args=[request.user.id])) # @@ -425,11 +426,11 @@ def oauth2_callback(request): # logger.info("[INFO] OAuthCB n") - returned_expiration_str = json_loads(resp.text)['exp'] use_expiration_time = _calc_expiration_time(returned_expiration_str) logger.info("[INFO] OAuthCB o") - warning = _finish_the_link(request.user.id, req_user.email, use_expiration_time, st_logger) + # Don't hit DCF again, we just did it (thus False): + warning = _finish_the_link(request.user.id, req_user.email, use_expiration_time, st_logger, False) messages.warning(request, warning) return redirect(reverse('user_detail', args=[request.user.id])) @@ -447,6 +448,7 @@ def oauth2_callback(request): finally: os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '0' + @login_required def dcf_link_callback(request): """ @@ -456,9 +458,6 @@ def dcf_link_callback(request): dcf_err_msg = "DCF reported an error {} logging in. Please contact the ISB-CGC administrator." - # log the reports using Cloud logging API - st_logger = StackDriverLogger.build_from_django_settings() - # # If there was an error, return that: Also, we now need to equip all callbacks to report # any random error that is reported back to us. @@ -477,7 +476,7 @@ def dcf_link_callback(request): logger.error("[ERROR]: DCF reports an error ({}, {}, {}) trying to link Google ID".format(error, message, error_description)) - messages.error(request, messages.error(request, dcf_err_msg.format("D002"))) + messages.error(request, dcf_err_msg.format("D002")) return redirect(reverse('user_detail', args=[request.user.id])) # @@ -492,7 +491,7 @@ def dcf_link_callback(request): use_expiration_time = _calc_expiration_time(returned_expiration_str) # - # We will NEVER accept a Google ID that does not match At this point, we need to wrestle + # We will NEVER accept a Google ID that does not match. At this point, we need to wrestle # with the possible problem that the user has linked to a DIFFERENT GoogleID while off # messing with DCF. If the ID that comes back is not identical to what we think it is, # they need to go and do it again. BUT as far as DCF is concerned, they are linked, @@ -501,7 +500,7 @@ def dcf_link_callback(request): try: the_user_token_string = _get_user_data_token_string(request.user.id) # a string. - except (TokenFailure, DCFCommFailure, RefreshTokenExpired): + except (TokenFailure, InternalTokenError, DCFCommFailure, RefreshTokenExpired): return redirect(reverse('user_detail', args=[request.user.id])) the_user_token_dict = json_loads(the_user_token_string) @@ -520,26 +519,34 @@ def dcf_link_callback(request): logger.error("No google link provided by DCF") if google_link is None: - messages.warning(request, 'Error detected during linking. ' - 'No Google User ID returned. Please report this ' - 'to the ISB-CGC administrator') + messages.error(request, dcf_err_msg.format("D003")) return redirect(reverse('user_detail', args=[request.user.id])) req_user = User.objects.get(id=request.user.id) - # NOPE! Send user back to details page. The empty google ID in our table will mean the page shows an option to try again. + # + # No match? Not acceptable. 
Send user back to details page. The empty google ID in our table will + # mean the page shows an option to try again. We need to + # if google_link != req_user.email: - _unlink_at_dcf(request.user.id, True) # True = recently saved token is now updated with unlinked state - message = "Please use your ISB-CGC login email ({}) to link with the DCF instead of {}".format( + try: + _unlink_at_dcf(request.user.id, True) # True means saved token is now updated with unlinked state + except (TokenFailure, InternalTokenError, DCFCommFailure, RefreshTokenExpired): + return redirect(reverse('user_detail', args=[request.user.id])) + + message = "You must use your ISB-CGC login email ({}) to link with the DCF instead of {}".format( req_user.email, google_link) messages.warning(request, message) return redirect(reverse('user_detail', args=[request.user.id])) # - # If all is well, this is where we add the user to the NIH_User table and link the user to the various data sets. + # If all is well, we add the user to the NIH_User table and link the user to the various data sets. # try: - warning = _finish_the_link(request.user.id, google_link, use_expiration_time, st_logger) + # log the reports using Cloud logging API + st_logger = StackDriverLogger.build_from_django_settings() + # Don't hit DCF again, we just did it (thus False): + warning = _finish_the_link(request.user.id, google_link, use_expiration_time, st_logger, False) except (TokenFailure, RefreshTokenExpired): messages.warning(request, "say something witty here...") return redirect(reverse('user_detail', args=[request.user.id])) @@ -555,41 +562,36 @@ def dcf_link_extend(request): Put a user's GoogleID in the ACL groups for 24 (more) hours: """ - # log the reports using Cloud logging API - st_logger = StackDriverLogger.build_from_django_settings() + comm_err_msg = "There was a communications problem contacting Data Commons Framework." + + # + # If user has disconnected their ID in another window before clicking this link, they would easily get a + # TokenFailure, or an error message that they were no longer linked at DCF. + # + + returned_expiration_str = None + user_data_token_string = None try: - resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='patch') + err_msg, returned_expiration_str, user_data_token_string = _refresh_at_dcf(request.user.id) except TokenFailure: - pass + err_msg = "Your Data Commons Framework identity needs to be reestablished to complete this task." except RefreshTokenExpired: - pass - except DCFCommFailure as e: # Any rando exception from the call is turned into this! - logger.error("[ERROR] Link patch call failure") - logger.exception(e) - messages.warning(request, "Error contacting DCF during linking. " - "Please contact the ISB-CGC administrator.") - return redirect(reverse('user_detail', args=[request.user.id])) + err_msg = "Your login to the Data Commons Framework has expired. You will need to log in again." + except DCFCommFailure: + err_msg = comm_err_msg + except Exception: + err_msg = "Unexpected problem." - if resp.status_code == 404: - messages.warning(request, "No linked Google account found for user") - return redirect(reverse('user_detail', args=[request.user.id])) - elif resp.status_code != 200: - messages.warning(request, "Unexpected response ({}) from DCF during linking. 
" - "Please contact the ISB-CGC administrator.".format(resp.status_code)) + if err_msg: + messages.error(request, err_msg) return redirect(reverse('user_detail', args=[request.user.id])) - returned_expiration_str = json_loads(resp.text)['exp'] use_expiration_time = _calc_expiration_time(returned_expiration_str) - - # User data set permissions might have changed, so we call and find out what they are: - try: - user_data_token_string = _get_user_data_token_string(request.user.id) # Can raise TokenFailure or DCFCommFailure - except (TokenFailure, DCFCommFailure, RefreshTokenExpired) as e: - redirect(reverse('user_detail', args=[request.user.id])) - user_data_dict = _user_data_token_to_user_dict(user_data_token_string) + # log the reports using Cloud logging API + st_logger = StackDriverLogger.build_from_django_settings() _, warning = handle_user_db_update_for_dcf_linking(request.user.id, user_data_dict, use_expiration_time, st_logger) if warning: @@ -622,11 +624,13 @@ def _calc_expiration_time(returned_expiration_str): return calc_expiration_time -def _finish_the_link(user_id, user_email, expiration_time, st_logger): +def _finish_the_link(user_id, user_email, expiration_time, st_logger, refresh_first): """ Regardless of how they get here, this step handles the linking of the user by adding the required database records. :raises TokenFailure: + :raises InternalTokenError: + :raises DCFCommFailure: :raises RefreshTokenExpired: """ @@ -636,25 +640,30 @@ def _finish_the_link(user_id, user_email, expiration_time, st_logger): # Until we get back current projects, refresh it: # - try: - the_user_token = _get_user_data_token_string(user_id) # the_user is a string. - except (TokenFailure, RefreshTokenExpired) as e: - raise e - - # Can raise TokenFailure or DCFCommFailure or RefreshTokenExpired + if refresh_first: + try: + the_user_token = _get_user_data_token_string(user_id) # the_user is a string. + except (TokenFailure, InternalTokenError, DCFCommFailure, RefreshTokenExpired) as e: + raise e # # Save the new info from the DCF: # - dcf_token = get_stored_dcf_token(user_id) # Can raise a TokenFailure or RefreshTokenExpired + try: + dcf_token = get_stored_dcf_token(user_id) + except (TokenFailure, InternalTokenError, RefreshTokenExpired) as e: + raise e if dcf_token.google_id is not None and dcf_token.google_id != user_email: return 'Unexpected internal error detected during linking: email/ID mismatch. ' \ 'Please report this to the ISB-CGC administrator' dcf_token.google_id = user_email - dcf_token.user_token = the_user_token + if refresh_first: + dcf_token.user_token = the_user_token + else: + the_user_token = dcf_token.user_token dcf_token.save() the_user_dict = _user_data_token_to_user_dict(the_user_token) @@ -768,6 +777,7 @@ def _get_user_data_token_string(user_id): Get up-to-date user data from DCF, massage as needed. :raises TokenFailure: + :raises InternalTokenError: :raises DCFCommFailure: :raises RefreshTokenExpired: """ @@ -788,6 +798,7 @@ def _user_data_from_token(user_id, stash_it): PLUS, user can set stash_it to True. DCF suggests we refresh the access token after e.g. unlinking. 
:raises TokenFailure: + :raises InternalTokenError: :raises DCFCommFailure: :raises RefreshTokenExpired: """ @@ -798,7 +809,7 @@ def _user_data_from_token(user_id, stash_it): try: dcf_token = get_stored_dcf_token(user_id) - except (TokenFailure, RefreshTokenExpired) as e: + except (TokenFailure, InternalTokenError, RefreshTokenExpired) as e: raise e client_id, client_secret = _get_secrets() @@ -874,19 +885,21 @@ def _unlink_at_dcf(user_id, do_refresh): dropping the Google ID is one step in the teardown flow. We NEVER enter a Google ID into the DCFToken table if it does not match their ISB-CCG ID. - Can raise TokenFailure, DCFCommFailure + Can raise TokenFailure, DCFCommFailure, RefreshTokenExpired WARNING: DO NOT CALL this routine unless we have positive evidence returned from DCF that the user is linked. It is an error to tell DCF to unlink if the user is not actually linked. That said, we will log the discrepancy but not issue any error to the user. :raise TokenFailure: + :raise InternalTokenError: :raise DCFCommFailure: :raise RefreshTokenExpired: """ success = False + throw_later = None # # Call DCF to drop the linkage. Note that this will immediately remove them from controlled access. @@ -894,85 +907,122 @@ def _unlink_at_dcf(user_id, do_refresh): try: resp = _dcf_call(DCF_GOOGLE_URL, user_id, mode='delete') # can raise TokenFailure, DCFCommFailure - except TokenFailure: - pass - except RefreshTokenExpired: - pass - except DCFCommFailure: - pass + except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure) as e: + throw_later = e # hold off so we can try a refresh first... except Exception as e: logger.error("[ERROR] Attempt to contact DCF for Google ID unlink failed (user {})".format(user_id)) raise e - if resp.status_code == 404: - # We are trying to unlink, and DCF thinks there is no link. Silent failure! - logger.error("[ERROR] No linked Google account found for user {}".format(user_id)) - success = True - elif resp.status_code == 400: - delete_response = json_loads(resp.text) - error = delete_response['error'] - message = delete_response['error_description'] - logger.error("[ERROR] Error returned in unlinking: {} : {}".format(error, message)) - elif resp.status_code == 200: - success = True - else: - logger.error("[ERROR] Unexpected response from DCF: {}".format(resp.status_code)) + if resp: + if resp.status_code == 404: + # We are trying to unlink, and DCF thinks there is no link. *Silent* failure! + logger.error("[ERROR] No linked Google account found for user {}".format(user_id)) + success = True + elif resp.status_code == 400: + delete_response = json_loads(resp.text) + error = delete_response['error'] + message = delete_response['error_description'] + logger.error("[ERROR] Error returned in unlinking: {} : {}".format(error, message)) + elif resp.status_code == 200: + success = True + else: + logger.error("[ERROR] Unexpected response from DCF: {}".format(resp.status_code)) # # Per discussions with DCF, need to ask for a new token from DCF after doing the unlinking - # since they care about the token info: + # since they care about the token info. 
Even if we had a failure, let's try to refresh: # if do_refresh: - _user_data_from_token(user_id, True) # Can raise TokenFailure, DCFCommFailure + try: + _user_data_from_token(user_id, True) + except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure) as e: + throw_later = throw_later if throw_later else e - if not success: + if throw_later: + raise throw_later + elif not success: raise DCFCommFailure() return -def _unlink_internally(user_id, just_with_dcf): - # FIXME NEED SOME INTERNAL UNLINK CODE, THIS NEEDS WORK +def _refresh_at_dcf(user_id): """ - There are only three places where we call DCF to do a Google unlink: 1) If they login via NIH and we get - a token for the user that tells us they already are linked to a Google ID that does not match their ISB-CGC - login ID. 2) We send them back to DCF to do the Google ID linking step and the callback informs us that they - have logged in with the wrong (not ISB-CGC) Google ID, and 3) the user has chosen to fully disconnect, and - dropping the Google ID is one step in the teardown flow. We NEVER enter a Google ID into the DCFToken - table if it does not match their ISB-CCG id. + Handle the PATCH call, to extend a user's presence on controlled access for 24 hours. Note that we might + reasonably raise a TokenFailure if the user disconnects from DCF in one screen before extending in another. + This could also manifest as a 404 response from DCF + + Can raise TokenFailure, DCFCommFailure, RefreshTokenExpired + + WARNING: DO NOT CALL this routine unless we have positive evidence returned from DCF that the user is + linked. It is an error to tell DCF to patch if the user is not actually linked, and this will be an error. + + :raises TokenFailure: + :raises InternalTokenError: + :raises DCFCommFailure: + :raises RefreshTokenExpired: + """ - warnings = [] - errors = [] + success = False + throw_later = None + err_msg = None + returned_expiration_str = None + massaged_string = None # - # Get our concept of linking state from the token DB: + # Call DCF to drop the linkage. Note that this will immediately remove them from controlled access. # - if not just_with_dcf: - try: - dcf_token = get_stored_dcf_token(user_id) - except TokenFailure: - pass - except RefreshTokenExpired: - pass - the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) - google_link = _get_google_link_from_user_dict(the_user_dict) - - if google_link is None: - warnings.append("User is not linked to Google") - return success, warnings, errors + + try: + resp = _dcf_call(DCF_GOOGLE_URL, user_id, mode='patch') + except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure) as e: + throw_later = e + except Exception as e: + logger.error("[ERROR] Attempt to contact DCF for Google ID patch failed (user {})".format(user_id)) + raise e + + if resp: + if resp.status_code == 404: + err_msg = "User's GoogleID was no longer linked at Data Commons" + elif resp.status_code == 200: + success = True + else: + logger.error("[ERROR] Unexpected response from DCF: {}".format(resp.status_code)) + + returned_expiration_str = json_loads(resp.text)['exp'] # - # First, call DCF to drop the linkage. This is the only way to get the user - # booted out of control groups. - # - # - # Per discussions with DCF, need to ask for a new token from DCF after doing the unlinking - # since they care about the token info: + # Per discussions with DCF, need to ask for a new token from DCF after changing google linking + # status. Always try to do this. 
Return the result too, since caller might be interested. # - _user_data_from_token(user_id, True) # Can raise TokenFailure or DCFCommFailure + try: + the_user_id_token, _ = _user_data_from_token(user_id, True) + massaged_string, _ = _user_data_token_massaged(the_user_id_token) + except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure) as e: + throw_later = throw_later if throw_later else e + + if throw_later: + raise throw_later + elif not success: + raise DCFCommFailure() + + return err_msg, returned_expiration_str, massaged_string + + +def _unlink_internally(user_id): + """ + If we need to unlink a user who was previously ACTUALLY linked, there are internal fixes to be made. + + :raises TokenFailure: + :raises InternalTokenError: + :raises Exception: + """ + + still_to_throw = None + dcf_token = None # # The Token table records the User's Google ID. This needs to be nulled. The expiration time in the DCFToken @@ -981,13 +1031,16 @@ def _unlink_internally(user_id, just_with_dcf): try: dcf_token = get_stored_dcf_token(user_id) - except TokenFailure: - pass - except RefreshTokenExpired: - pass + except (TokenFailure, InternalTokenError) as e: + # We either have no token, or it is corrupted. But we still want to get the NIH table cleaned up: + still_to_throw = e + except RefreshTokenExpired as e: + # An expired token still needs to have field cleared: + dcf_token = e.token - dcf_token.google_id = None - dcf_token.save() + if dcf_token: + dcf_token.google_id = None + dcf_token.save() # # Now drop the link flag and active flag from the DB, plus our db records of what datasets the user is @@ -995,108 +1048,17 @@ def _unlink_internally(user_id, just_with_dcf): # try: - message = unlink_account_in_db_for_dcf(user_id) - if message: - errors.append(message) - success = False - + unlink_account_in_db_for_dcf(user_id) except Exception as e: + still_to_throw = still_to_throw if still_to_throw else e logger.error("[ERROR] While unlinking accounts:") logger.exception(e) - errors.append('There was an error when attempting to unlink your Google ID - please contact the administrator.') - success = False - - return success, warnings, errors + if still_to_throw: + raise still_to_throw + return -# def _unlink_internals(user_id, just_with_dcf): -# """ -# Handles all the internal details of unlinking a user's Google ID. -# """ -# warnings = [] -# errors = [] -# success = False -# -# # -# # Get our concept of linking state from the token DB: -# # -# if not just_with_dcf: -# dcf_token = get_stored_dcf_token(user_id) -# if not dcf_token: -# raise TokenFailure() -# the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) -# google_link = _get_google_link_from_user_dict(the_user_dict) -# -# if google_link is None: -# warnings.append("User is not linked to Google") -# return success, warnings, errors -# -# # -# # First, call DCF to drop the linkage. This is the only way to get the user -# # booted out of control groups. 
-# # -# -# try: -# resp = _dcf_call(DCF_GOOGLE_URL, user_id, mode='delete') -# except Exception as e: -# logger.error("[ERROR] Attempt to contact DCF for Google ID unlink failed (user {})".format(user_id)) -# logger.exception(e) -# errors.append("Unexpected error in unlinking") -# return success, warnings, errors -# -# if resp.status_code == 404: -# warnings.append("No linked Google account found for user") -# elif resp.status_code == 400: -# delete_response = json_loads(resp.text) -# error = delete_response['error'] -# message = delete_response['error_description'] -# errors.append("Error in unlinking: {} : {}".format(error, message)) -# elif resp.status_code == 200: -# success = True -# else: -# warnings.append("Unexpected response from DCF") -# -# if just_with_dcf: -# return success, warnings, errors -# -# # -# # Per discussions with DCF, need to ask for a new token from DCF after doing the unlinking -# # since they care about the token info: -# # -# -# _user_data_from_token(user_id, True) # Can raise TokenFailure or DCFCommFailure -# -# # -# # The Token table records the User's Google ID. This needs to be nulled. The expiration time in the DCFToken -# # is for the access token, not the google link (that info is stored in the NIH_user): -# # -# -# dcf_token = get_stored_dcf_token(user_id) -# if not dcf_token: -# raise TokenFailure() -# -# dcf_token.google_id = None -# dcf_token.save() -# -# # -# # Now drop the link flag and active flag from the DB, plus our db records of what datasets the user is -# # good for: -# # -# -# try: -# message = unlink_account_in_db_for_dcf(user_id) -# if message: -# errors.append(message) -# success = False -# -# except Exception as e: -# logger.error("[ERROR] While unlinking accounts:") -# logger.exception(e) -# errors.append('There was an error when attempting to unlink your Google ID - please contact the administrator.') -# success = False -# -# return success, warnings, errors def _refresh_token_storage(token_dict, decoded_jwt, user_token, nih_username_from_dcf, dcf_uid, cgc_uid, google_id): """ @@ -1215,6 +1177,7 @@ def _decode_token(token): """ return _decode_token_chunk(token, 1) + @login_required def dcf_disconnect_user(request): """ @@ -1223,117 +1186,93 @@ def dcf_disconnect_user(request): refresh token. """ - # First thing ya gotta do is tell DCF to unlink the user, which will get them out of - # access control groups. BUT ONLY IF THEY ARE ACTUALLY CURRENTLY LINKED! + # + # First thing ya gotta do is tell DCF to unlink the user. + # + # If user is sitting on this page in one browser, and logs out via another, we would have + # no DCF token anymore. Catch that case and silently no-op. If their refresh token has expired, + # they would have to login in order to disconnect! + # try: - msg_list = [] - # - # If user is sitting on this page in one browser, and logs out via another, we would have - # no DCF token anymore. 
Catch that case and silently no-op: - # - - try: - dcf_token = get_stored_dcf_token(request.user.id) - except TokenFailure: - pass - except RefreshTokenExpired: - pass - - if not dcf_token: - return redirect(reverse('user_detail', args=[request.user.id])) - - the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) - - google_link = _get_google_link_from_user_dict(the_user_dict) - - logger.info("[INFO] DDU A") - - if google_link: - try: - resp = _dcf_call(DCF_GOOGLE_URL, request.user.id, mode='delete') - except TokenFailure: - pass - except RefreshTokenExpired: - pass - except DCFCommFailure: - pass - - - - if resp.status_code == 404: - msg_list.append("No linked Google account found for user, code {}".format(resp.status_code)) - elif resp.status_code == 400: - delete_response = json_loads(resp.text) - error = delete_response['error'] - message = delete_response['error_description'] - msg_list.append("Error in unlinking: {} : {} : {}".format(error, message, resp.status_code)) - elif resp.status_code == 200: - pass - else: - msg_list.append("Unexpected response from DCF {}".format(resp.status_code)) + dcf_token = get_stored_dcf_token(request.user.id) + except (TokenFailure, InternalTokenError): + return redirect(reverse('user_detail', args=[request.user.id])) + except RefreshTokenExpired: + messages.warning(request, "You will need to first login to the Data Commons again to disconnect your Google ID") + return redirect(reverse('user_detail', args=[request.user.id])) - # - # The revoke call is unlike other DCF endpoints in that it is special! - # Token revocation is described here: https://tools.ietf.org/html/rfc7009#section-2.1 - # So we do not provide a bearer access token, but the client ID and secret in a Basic Auth - # framework. Not seeing that inside the OAuthSession framework, so we roll our own by hand: - # + # + # We are going to go ahead and unlink regardless of what we think the state is. If DCF tells us there + # is no link when we try to do it, we ignore that fact: + # - client_id, client_secret = _get_secrets() + try: + _unlink_at_dcf(request.user.id, False) # Don't refresh, we are about to drop the record... + except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure): + messages.warning(request, "Unexpected response from DCF {}".format(resp.status_code)) + return redirect(reverse('user_detail', args=[request.user.id])) - data = { - 'token': dcf_token.refresh_token - } - logger.info("[INFO] DDU B") + # + # Now revoke the token! + # + # The revoke call is unlike other DCF endpoints in that it is special! + # Token revocation is described here: https://tools.ietf.org/html/rfc7009#section-2.1 + # So we do not provide a bearer access token, but the client ID and secret in a Basic Auth + # framework. 
Not seeing that inside the OAuthSession framework, so we roll our own by hand: + # - auth = requests.auth.HTTPBasicAuth(client_id, client_secret) - resp = requests.request('POST', DCF_REVOKE_URL, data=data, auth=auth) - client_id = None - client_secret = None + client_id, client_secret = _get_secrets() + data = { + 'token': dcf_token.refresh_token + } + logger.info("[INFO] DDU B") - logger.info("[INFO] DDU C") + auth = requests.auth.HTTPBasicAuth(client_id, client_secret) + resp = requests.request('POST', DCF_REVOKE_URL, data=data, auth=auth) + client_id = None + client_secret = None - if resp.status_code != 200 and resp.status_code != 204: - messages.warning(request, 'Revocation problem: {} : {}'.format(resp.status_code, resp.text)) + logger.info("[INFO] DDU C") - for msg in msg_list: - messages.warning(request, msg) + if resp.status_code != 200 and resp.status_code != 204: + messages.warning(request, 'Revocation problem: {} : {}'.format(resp.status_code, resp.text)) - # - # OK, NOW we detach the user in our NIH tables, and detach the user from data permissions. - # + # + # Now we do the internal unlinking, which includes detach the user in our NIH tables, and detach the user from data permissions. + # - unlink_account_in_db_for_dcf(request.user.id) + try: + _unlink_internally(request.user.id) + except (TokenFailure, InternalTokenError, Exception): + messages.warning(request, "Internal problems unlinking".format(resp.status_code)) + return redirect(reverse('user_detail', args=[request.user.id])) - # - # Next, we clear out our tokens for the user (which allows them to appear to DCF as the - # logged-in NIH user; we cannot keep them around: - # + # + # Next, we clear out our tokens for the user (which allows them to appear to DCF as the + # logged-in NIH user; we cannot keep them around). Since we just saved the last dcf_token + # after clearing the Google ID, we will get it again (probably unnecessary, but...?) + # - dcf_token.delete() + try: + dcf_token = get_stored_dcf_token(request.user.id) + except TokenFailure: + dcf_token = None + except InternalTokenError: + return redirect(reverse('user_detail', args=[request.user.id])) + except RefreshTokenExpired as e: + dcf_token = e.token - # - # Finally, we need to send the user to logout from the DCF, which is needed to clear the - # cookies DCF has dumped into their browser, which will allow them to log in to NIH again. - # + dcf_token.delete() - logout_callback = request.build_absolute_uri(reverse('user_detail', args=[request.user.id])) - logger.info("[INFO] DDU D") - callback = '{}?next={}'.format(DCF_LOGOUT_URL, logout_callback) + # + # Finally, we need to send the user to logout from the DCF, which is needed to clear the + # cookies DCF has dumped into their browser, which will allow them to log in to NIH again. 
+ # - except TokenFailure as e: - logger.warning("[INFO] MissingTokenError seen") - logger.exception(e) - return redirect(reverse('user_detail', args=[request.user.id])) - except DCFCommFailure as e: - logger.warning("DCF Exception") - logger.exception(e) - return redirect(reverse('user_detail', args=[request.user.id])) - except Exception as e: - logger.error("[ERROR] While disconnect:") - logger.exception(e) - return redirect(reverse('user_detail', args=[request.user.id])) + logout_callback = request.build_absolute_uri(reverse('user_detail', args=[request.user.id])) + logger.info("[INFO] DDU D") + callback = '{}?next={}'.format(DCF_LOGOUT_URL, logout_callback) return HttpResponseRedirect(callback) @@ -1343,6 +1282,7 @@ def _dcf_call(full_url, user_id, mode='get', post_body=None, force_token=False): All the stuff around a DCF call that handles token management and refreshes. :raises TokenFailure: + :raises InternalTokenError: :raises DCFCommFailure: :raises RefreshTokenExpired: """ @@ -1390,10 +1330,16 @@ def token_storage_for_user(my_token_dict): logger.error("[ERROR] _dcf_call {} aborted: {}".format(full_url, str(e))) raise e except MissingTokenError as e: + force_dcf_token_expiration(user_id) logger.warning("[INFO] MissingTokenError seen") logger.exception(e) raise TokenFailure() + except InternalTokenError as e: + logger.warning("Internal Token Exception") + logger.exception(e) + raise e except Exception as e: + force_dcf_token_expiration(user_id) logger.warning("DCF Exception") logger.exception(e) raise DCFCommFailure() @@ -1425,99 +1371,16 @@ def _read_dict(my_file_name): return retval -def get_nih_user_details_from_token(user_id): - user_details = {} - - # - # The information we used to pull out of our database is now obtained from a DCF token - # - - # - # Now with DCF, we can have a user logged in as an NIH user, but not be linked (which means DCF does not - # have an association between NIH ID and Google ID). So while we previously did a get on a linked user, - # now we need to filter. If one of the users is linked, that is who we use. Otherwise, we can resolve the - # issue by looking at the current DCF token attached to the user to see who they are associated with. 
- # - - dcf_token = get_stored_dcf_token(user_id) - try: - dcf_token = get_stored_dcf_token(user_id) - except TokenFailure: - pass - except RefreshTokenExpired: - pass - - - - - if not dcf_token: - return user_details - - the_user_dict = _user_data_token_to_user_dict(dcf_token.user_token) - - google_link = _get_google_link_from_user_dict(the_user_dict) - - nih_users = NIH_User.objects.filter(user_id=user_id, NIH_username=dcf_token.nih_username) - - if len(nih_users) == 0: - return user_details - - elif len(nih_users) == 1: - nih_user = nih_users.first() - - else: - nih_user = None - freshest_linked = None - freshest_linked_stamp = None - freshest_unlinked = None - freshest_unlinked_stamp = None - for user in nih_users: - if user.linked: - if (freshest_linked_stamp is None) or (freshest_linked_stamp < user.NIH_assertion_expiration): - freshest_linked_stamp = user.NIH_assertion_expiration - freshest_linked = user - if nih_user is None: - nih_user = nih_users.first() - else: - logger.error("[ERROR] Multiple linked nih users retrieved nih_user with user_id {}.".format(user_id)) - else: - if (freshest_unlinked_stamp is None) or (freshest_unlinked_stamp < user.NIH_assertion_expiration): - freshest_unlinked_stamp = user.NIH_assertion_expiration - freshest_unlinked = user - - if freshest_linked: - nih_user = freshest_linked - elif freshest_unlinked: - nih_user = freshest_unlinked - else: - logger.error("[ERROR] Unexpected lack of nih_user for {}.".format(user_id)) - return user_details - - user_auth_datasets = UserAuthorizedDatasets.objects.filter(nih_user=nih_user) - user_details['NIH_username'] = nih_user.NIH_username - user_details['NIH_assertion_expiration'] = nih_user.NIH_assertion_expiration - # Add a separate field to break out program count from active: - - user_details['dbGaP_has_datasets'] = (len(user_auth_datasets) > 0) - user_details['dbGaP_authorized'] = (len(user_auth_datasets) > 0) and nih_user.active - logger.debug("[DEBUG] User {} has access to {} dataset(s) and is {}".format(nih_user.NIH_username, str(len(user_auth_datasets)), ('not active' if not nih_user.active else 'active'))) - user_details['NIH_active'] = nih_user.active - user_details['NIH_DCF_linked'] = nih_user.linked - user_details['refresh_key_ok'] = get_dcf_refresh_key_remaining_seconds(user_id) > settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS - user_details['auth_datasets'] = [] if len(user_auth_datasets) <= 0 else AuthorizedDataset.objects.filter(id__in=user_auth_datasets.values_list('authorized_dataset',flat=True)) - - return user_details - - class GoogleLinkState: BOTH_NULL = 1 DCF_NULL_CGC_NON_NULL = 2 - DCF_NON_NULL_CGC_NULL = 3 - MATCHING_BAD = 4 - MATCHING_OK = 5 - NON_MATCHING_DCF_BAD = 6 - NON_MATCHING_CGC_BAD = 7 - NON_MATCHING_ALL_BAD = 8 + DCF_BAD_CGC_NULL = 3 + DCF_GOOD_CGC_NULL = 4 + MATCHING_BAD = 5 + MATCHING_OK = 6 + NON_MATCHING_DCF_BAD = 7 + NON_MATCHING_CGC_BAD = 8 + NON_MATCHING_ALL_BAD = 9 def _compare_google_ids(dcf_version, cgc_version, user_email): """ @@ -1533,30 +1396,35 @@ def _compare_google_ids(dcf_version, cgc_version, user_email): if dcf_version is None: google_match_state = GoogleLinkState.DCF_NULL_CGC_NON_NULL elif cgc_version is None: - google_match_state = GoogleLinkState.DCF_NON_NULL_CGC_NULL + if dcf_version == user_email: + google_match_state = GoogleLinkState.DCF_GOOD_CGC_NULL + else: + google_match_state = GoogleLinkState.DCF_BAD_CGC_NULL elif dcf_version == user_email: - google_match_state = GoogleLinkState.NON_MATCHING_CGC_BAD + google_match_state = 
GoogleLinkState.NON_MATCHING_CGC_BAD # Cannot happen elif cgc_version == user_email: google_match_state = GoogleLinkState.NON_MATCHING_DCF_BAD else: - google_match_state = GoogleLinkState.NON_MATCHING_ALL_BAD + google_match_state = GoogleLinkState.NON_MATCHING_ALL_BAD # Cannot happen # Next three cases handle matching GoogleIDs: elif dcf_version is None: google_match_state = GoogleLinkState.BOTH_NULL elif dcf_version == user_email: google_match_state = GoogleLinkState.MATCHING_OK elif dcf_version != user_email: - google_match_state = GoogleLinkState.MATCHING_BAD + google_match_state = GoogleLinkState.MATCHING_BAD # Cannot happen return google_match_state def _refresh_from_dcf(user_id): """ - We would like to check if our view of the user (linkage, expirations, datasets) is consistent with what the - DCF thinks, and update accordingly! + Whenever the user hits the user details page, we need to check how the DCF views the world (linkage, expirations, + datasets), and update accordingly. :raises TokenFailure: + :raises InternalTokenError: + :raises DCFCommFailure: :raises RefreshTokenExpired: """ @@ -1568,11 +1436,9 @@ def _refresh_from_dcf(user_id): try: the_user_token = _get_user_data_token_string(user_id) # the_user_token is a string. - except (TokenFailure, RefreshTokenExpired) as e: + except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure) as e: raise e - # Can raise TokenFailure or DCFCommFailure - # # Things that could be different: Google ID linkage, expiration time, approved datasets. # Right now, we are not provided with expiration time, so we cannot check that. While NIH linkage @@ -1582,10 +1448,8 @@ def _refresh_from_dcf(user_id): # the_user_dict = _user_data_token_to_user_dict(the_user_token) - dcf_google_link = _get_google_link_from_user_dict(the_user_dict) nih_id = _get_nih_id_from_user_dict(the_user_dict) - dict_o_projects = _get_projects_from_user_dict(the_user_dict) # # Compare to our versions: @@ -1593,80 +1457,72 @@ def _refresh_from_dcf(user_id): try: dcf_token = get_stored_dcf_token(user_id) - except TokenFailure: + except (TokenFailure, InternalTokenError, RefreshTokenExpired) as e: + raise e + + if nih_id.lower() != dcf_token.nih_username_lower: + logger.error("ERROR: UNEXPECTED NIH_USER_ID MISMATCH {} VERSUS {}".format(nih_id.lower(), + dcf_token.nih_username_lower)) + # + # More worrisome is a possible mismatch between + # + + google_match_state = _compare_google_ids(dcf_google_link.lower(), dcf_token.google_id, user_email) + + null_us = False + attach_us = False + detach_dcf = False + all_done = False + if google_match_state == GoogleLinkState.DCF_NULL_CGC_NON_NULL: + null_us = True + elif google_match_state == GoogleLinkState.DCF_GOOD_CGC_NULL: + attach_us = True + elif google_match_state == GoogleLinkState.DCF_BAD_CGC_NULL: + detach_dcf = True + elif google_match_state == GoogleLinkState.NON_MATCHING_CGC_BAD: # Cannot happen + raise Exception() + elif google_match_state == GoogleLinkState.NON_MATCHING_DCF_BAD: + detach_dcf = True + elif google_match_state == GoogleLinkState.NON_MATCHING_ALL_BAD: # Cannot happen + raise Exception() + elif google_match_state == GoogleLinkState.BOTH_NULL: pass - except RefreshTokenExpired: + elif google_match_state == GoogleLinkState.MATCHING_OK: pass + elif google_match_state == GoogleLinkState.MATCHING_BAD: # Cannot happen + raise Exception() + if null_us: + try: + _unlink_internally(user_id) + except (TokenFailure, InternalTokenError, Exception) as e: + raise e + elif attach_us: + try: + warning = 
_finish_the_link(user_id, user_email, use_expiration_time, st_logger, False) + except (TokenFailure, InternalTokenError, Exception) as e: + raise e + all_done = True + elif detach_dcf: + "Tell the user there is a problem" - google_match_state = _compare_google_ids(dcf_google_link, dcf_token.google_id, user_email) - google_problem = None - - if google_match_state != GoogleLinkState.MATCHING_OK and google_match_state != GoogleLinkState.BOTH_NULL: - dcf_token.google_id = dcf_google_link - google_problem = google_match_state # # This is exercised when the NIH ID of the user, returned in the ID token is different than the one we # have in our token database. Don't think this is even possible, since user would need to log in as the # new NIH ID first... # - if nih_id.lower() != dcf_token.nih_username_lower: - logger.error("ERROR: UNEXPECTED NIH_USER_ID MISMATCH {} VERSUS {}".format(nih_id.lower(), - dcf_token.nih_username_lower)) # # If everything was consistent, if DCF tells the user is linked to an NIH ID, we would have that ID as one and # only one linked record in our DB. # - if google_match_state == GoogleLinkState.MATCHING_OK: - # Note the use of __iexact does case insensitive match: - linked_nih_user_for_user_and_id = NIH_User.objects.filter(user_id=user_id, NIH_username__iexact=nih_id, linked=True) - if len(linked_nih_user_for_user_and_id) == 1: - print "All is good" - else: - # - # Problems! If we have - nih_users_for_user = NIH_User.objects.filter(user_id=user_id) - nih_users_for_id = NIH_User.objects.filter(NIH_username__iexact=nih_id) - if len(nih_users_for_id) == 1: - pass - - - - - - # If user logged into DCF but did not get the linking done correctly, the token will provide us with the - # NIH ID they are using, but we will NOT have a current linked record in the NIH_User table. + if not all_done: + st_logger = StackDriverLogger.build_from_django_settings() + _, warning = handle_user_db_update_for_dcf_linking(request.user.id, user_data_dict, use_expiration_time, st_logger) - # - # - # if dcf_token.google_id is not None and dcf_token.google_id != user_email: - # return 'Unexpected internal error detected during linking: email/ID mismatch. 
' \ - # 'Please report this to the ISB-CGC administrator' - # - # dcf_token.google_id = user_email - # dcf_token.user_token = the_user_token - # dcf_token.save() - # - # nih_user, warning = handle_user_db_update_for_dcf_linking(user_id, the_user_dict, - # nih_assertion_expiration, st_logger) - # - # - # authorized_datasets = [] - # for project, perm_list in dict_o_projects.iteritems(): - # adqs = AuthorizedDataset.objects.filter(whitelist_id=project) - # if len(adqs) == 1: - # authorized_datasets.append(DatasetGoogleGroupPair(project, adqs.first().acl_google_group)) - # - # das = DatasetAccessSupportFactory.from_webapp_django_settings() - # all_datasets = das.get_all_datasets_and_google_groups() - # - # for dataset in all_datasets: - # handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, False, None, None, st_logger) - #return warning # @login_required diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index 52093038..98e15625 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -50,14 +50,17 @@ class TokenFailure(Exception): - """Thrown if we have problems with our access/refresh tokens """ + """Thrown if we don't have our access/refresh tokens (user has disconnected from DCF)""" +class InternalTokenError(Exception): + """Thrown if we have internal DB consistency errors """ class RefreshTokenExpired(Exception): """Thrown if our refresh token is no longer valid and user must log in """ - def __init__(self, seconds): + def __init__(self, seconds, token): self.seconds = seconds + self.token = token def verify_service_account(gcp_id, service_account, datasets, user_email, is_refresh=False, is_adjust=False, remove_all=False): @@ -1091,7 +1094,7 @@ def unlink_account_in_db_for_dcf(user_id): # If nobody is linked, we are actually done. There is nothing to do. if num_linked == 0: - return None + return elif num_linked > 1: logger.warn("[WARNING] Found multiple linked accounts for user {}! Unlinking all accounts.".format(user_email)) @@ -1099,10 +1102,10 @@ def unlink_account_in_db_for_dcf(user_id): nih_account_to_unlink.linked = False nih_account_to_unlink.active = False nih_account_to_unlink.save() - nih_account_to_unlink.delete_all_auth_datasets() + nih_account_to_unlink.delete_all_auth_datasets() # Drop the user's approved data sets! logger.info("[STATUS] Unlinked NIH User {} from user {}.".format(nih_account_to_unlink.NIH_username, user_email)) - return None + return def handle_user_db_entry(user_id, NIH_username, user_email, auth_response, num_auth_datasets, @@ -1308,10 +1311,14 @@ def get_dcf_refresh_key_remaining_seconds(user_id): """ We need to know how many seconds are left before the user needs to log back in to NIH to get a new refresh token, which will expire every 30 days. + + :raises InternalTokenError: """ try: dcf_token = get_stored_dcf_token(user_id) + except InternalTokenError as e: + raise e except TokenFailure: return -1 # ? No token? They expire immediately! except RefreshTokenExpired as e: @@ -1331,6 +1338,7 @@ def get_stored_dcf_token(user_id): we need to be bulletproof on maybe not getting back a token. 
:raises TokenFailure: + :raises InternalTokenError: :raises RefreshTokenExpired: """ dcf_tokens = DCFToken.objects.filter(user_id) @@ -1338,18 +1346,42 @@ def get_stored_dcf_token(user_id): if num_tokens != 1: if num_tokens > 1: logger.error('[ERROR] Unexpected Server Error: Multiple tokens found for user {}'.format(user_id)) + raise InternalTokenError() else: logger.info('[INFO] User {} tried to use a flushed token'.format(user_id)) - raise TokenFailure() + raise TokenFailure() dcf_token = dcf_tokens.first() remaining_seconds = (dcf_token.refresh_expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() if remaining_seconds <= 60: - raise RefreshTokenExpired(remaining_seconds) + # Still make the token available to e.g. drop linkages from DB + raise RefreshTokenExpired(remaining_seconds, dcf_token) return dcf_token +def force_dcf_token_expiration(user_id): + """ + We have seen a case where DCF has rejected our valid refresh token when their server gets rolled. This should not + happen anymore. But if it does, we need to be able to force our token expirations ASAP so as to let the user login + again to get a new token. + + :raises InternalTokenError: + """ + try: + dcf_token = get_stored_dcf_token(user_id) + except InternalTokenError as e: + raise e + except (TokenFailure, RefreshTokenExpired): + # a no-op + return + + dcf_token.refresh_expires_at = pytz.utc.localize(datetime.datetime.utcnow()) + dcf_token.save() + + return + + def get_nih_user_details(user_id, force_logout): """ :param user_id: @@ -1379,6 +1411,8 @@ def get_nih_user_details(user_id, force_logout): dcf_token = get_stored_dcf_token(user_id) except TokenFailure: return user_details # i.e. empty dict + except InternalTokenError: + return user_details # i.e. empty dict except RefreshTokenExpired: user_details['refresh_required'] = True return user_details @@ -1426,6 +1460,7 @@ def get_nih_user_details(user_id, force_logout): nih_user = freshest_unlinked else: logger.error("[ERROR] Unexpected lack of nih_user for {}.".format(user_id)) + # FIXME: Second condition can no longer happen: user_details['link_mismatch'] = (dcf_token.google_id is not None) and (dcf_token.google_id != curr_user.email) return user_details # i.e. 
empty dict @@ -1471,8 +1506,12 @@ def get_nih_user_details(user_id, force_logout): user_details['auth_datasets'] = [] if len(user_auth_datasets) <= 0 else AuthorizedDataset.objects.filter(id__in=user_auth_datasets.values_list('authorized_dataset',flat=True)) if settings.DCF_TEST: + # FIXME: Second condition can no longer happen: user_details['link_mismatch'] = (dcf_token.google_id is None) or (dcf_token.google_id != curr_user.email) - user_details['refresh_key_ok'] = get_dcf_refresh_key_remaining_seconds(user_id) > settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS + try: + user_details['refresh_key_ok'] = get_dcf_refresh_key_remaining_seconds(user_id) > settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS + except InternalTokenError: + return {} user_details['force_DCF_logout'] = False return user_details From d1bc4909229ec2c50b1f7cd5b532e52fce84ea68 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Tue, 3 Jul 2018 17:52:09 -0700 Subject: [PATCH 32/76] Major refactor of dcf code --- accounts/dcf_support.py | 827 ++++++++++++++++++++++++++++++++++++++++ accounts/dcf_views.py | 817 +++------------------------------------ accounts/sa_utils.py | 330 ++++++++++------ 3 files changed, 1087 insertions(+), 887 deletions(-) create mode 100755 accounts/dcf_support.py diff --git a/accounts/dcf_support.py b/accounts/dcf_support.py new file mode 100755 index 00000000..b31710ba --- /dev/null +++ b/accounts/dcf_support.py @@ -0,0 +1,827 @@ +""" +Copyright 2017-2018, Institute for Systems Biology + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import logging +import requests +import datetime +import pytz + +from django.conf import settings + +from models import DCFToken, NIH_User +from requests_oauthlib.oauth2_session import OAuth2Session +from oauthlib.oauth2 import MissingTokenError +from base64 import urlsafe_b64decode +from json import loads as json_loads, dumps as json_dumps + +logger = logging.getLogger('main_logger') + +DCF_TOKEN_URL = settings.DCF_TOKEN_URL +DCF_GOOGLE_URL = settings.DCF_GOOGLE_URL + + +class DCFCommFailure(Exception): + """Thrown if we have problems communicating with DCF """ + + +class TokenFailure(Exception): + """Thrown if we don't have our access/refresh tokens (user has disconnected from DCF)""" + + +class InternalTokenError(Exception): + """Thrown if we have internal DB consistency errors """ + + +class RefreshTokenExpired(Exception): + """Thrown if our refresh token is no longer valid and user must log in """ + + def __init__(self, seconds, token): + self.seconds = seconds + self.token = token + + +def get_stored_dcf_token(user_id): + """ + When a user breaks their connection with DCF, we flush out the revoked tokens. But if they have a + session running in another browser, they might still be clicking on links that expect a token. So + we need to be bulletproof on maybe not getting back a token. 
+ + :raises TokenFailure: + :raises InternalTokenError: + :raises RefreshTokenExpired: + """ + dcf_tokens = DCFToken.objects.filter(user=user_id) + num_tokens = len(dcf_tokens) + if num_tokens != 1: + if num_tokens > 1: + logger.error('[ERROR] Unexpected Server Error: Multiple tokens found for user {}'.format(user_id)) + raise InternalTokenError() + else: + logger.info('[INFO] User {} tried to use a flushed token'.format(user_id)) + raise TokenFailure() + + dcf_token = dcf_tokens.first() + remaining_seconds = (dcf_token.refresh_expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() + if remaining_seconds <= 60: + # Still make the token available to e.g. drop linkages from DB + raise RefreshTokenExpired(remaining_seconds, dcf_token) + + return dcf_token + + +def get_access_expiration(user_id): + nih_users = NIH_User.objects.filter(user_id=user_id, linked=True) + num_users = len(nih_users) + if num_users != 1: + if num_users > 1: + logger.warn("Multiple objects when retrieving nih_user with user_id {}.".format(str(user_id))) + else: + logger.warn("No objects when retrieving nih_user with user_id {}.".format(str(user_id))) + return pytz.utc.localize(datetime.datetime.utcnow()) + + nih_user = nih_users.first() + return nih_user.NIH_assertion_expiration + + +def force_dcf_token_expiration(user_id): + """ + We have seen a case where DCF has rejected our valid refresh token when their server gets rolled. This should not + happen anymore. But if it does, we need to be able to force our token expirations ASAP so as to let the user login + again to get a new token. + + :raises InternalTokenError: + """ + try: + dcf_token = get_stored_dcf_token(user_id) + except InternalTokenError as e: + raise e + except (TokenFailure, RefreshTokenExpired): + # a no-op + return + + dcf_token.refresh_expires_at = pytz.utc.localize(datetime.datetime.utcnow()) + dcf_token.save() + + return + + +def user_data_token_dict_massaged(the_user_token_dict): + """ + Takes the user data token dictionary (as returned by DCF) and returns massaged user-only string AND dict + + """ + the_user_dict = the_user_token_dict['context']['user'] + the_massaged_dict = massage_user_data_for_dev(the_user_dict) + the_user_token_dict['context']['user'] = the_massaged_dict + return json_dumps(the_user_token_dict), the_user_token_dict + + +def user_data_token_massaged(user_data_token_string): + """ + Takes the user data token string and returns user-only string AND dict + + """ + the_user_token_dict = json_loads(user_data_token_string) + the_user_dict = the_user_token_dict['context']['user'] + the_massaged_dict = massage_user_data_for_dev(the_user_dict) + the_user_token_dict['context']['user'] = the_massaged_dict + return json_dumps(the_user_token_dict), the_user_token_dict + + +def get_projects_from_user_dict(the_user_dict): + """ + The dict schema and keys vary depending on whether is comes from token or user data endpoint. Hide this fact! + + """ + return the_user_dict['projects'] + + +def _set_projects_for_user_dict(the_user_dict, projects): + """ + The dict schema and keys vary depending on whether is comes from token or user data endpoint. Hide this fact! + + """ + the_user_dict['projects'] = projects + return + + +def get_nih_id_from_user_dict(the_user_dict): + """ + The dict schema and keys vary depending on whether is comes from token or user data endpoint. Hide this fact! 
+ + """ + return the_user_dict['name'] + + +def _set_nih_id_for_user_dict(the_user_dict, nih_id): + """ + The dict schema and keys vary depending on whether is comes from token or user data endpoint. Hide this fact! + + """ + the_user_dict['name'] = nih_id + return + + +def get_google_link_from_user_dict(the_user_dict): + """ + The dict schema and keys vary depending on whether is comes from token or user data endpoint. Hide this fact! + + """ + gotta_google_link = the_user_dict.has_key('google') and \ + the_user_dict['google'].has_key('linked_google_account') + google_link = the_user_dict['google']['linked_google_account'] if gotta_google_link else None + return google_link + + +def user_data_token_to_user_dict(user_data_token_string): + """ + Takes the user data token string (as returned by DCF and stored in database) and returns user-only dict + """ + the_user_token_dict = json_loads(user_data_token_string) + the_user_dict = the_user_token_dict['context']['user'] + return the_user_dict + + +def user_data_token_dict_to_user_dict(the_user_token_dict): + """ + Takes the user data token dict and returns user-only dict + + """ + the_user_dict = the_user_token_dict['context']['user'] + return the_user_dict + + +def get_user_data_token_string(user_id): + """ + Get up-to-date user data from DCF, massage as needed. + + :raises TokenFailure: + :raises InternalTokenError: + :raises DCFCommFailure: + :raises RefreshTokenExpired: + """ + # The user endpoint is spotty at the moment (6/5/18) so we drag it out of the token instead + + the_user_id_token, _ = user_data_from_token(user_id, False) + + massaged_string, _ = user_data_token_massaged(the_user_id_token) + + return massaged_string + + +def user_data_from_token(user_id, stash_it): + """ + Seems that we should be able to get full user info from the user endpoint, but it turns out that + the information in the token refresh is more complete. + + PLUS, user can set stash_it to True. DCF suggests we refresh the access token after e.g. unlinking. + + :raises TokenFailure: + :raises InternalTokenError: + :raises DCFCommFailure: + :raises RefreshTokenExpired: + """ + + # + # OAuth2Session handles token refreshes under the covers. Here we want to do it explicitly. + # + + try: + dcf_token = get_stored_dcf_token(user_id) + except (TokenFailure, InternalTokenError, RefreshTokenExpired) as e: + raise e + + client_id, client_secret = get_secrets() + + data = { + 'grant_type': 'refresh_token', + 'refresh_token': dcf_token.refresh_token, + 'client_id': client_id + } + + auth = requests.auth.HTTPBasicAuth(client_id, client_secret) + client_id = None + client_secret = None + try: + resp = requests.request('POST', DCF_TOKEN_URL, data=data, auth=auth) + except Exception as e: + logger.error("[ERROR] Token acquisition Exception") + logger.exception(e) + raise DCFCommFailure() + + if resp.status_code != 200: + logger.error("[ERROR] Token acquisition problem: {} : {}".format(resp.status_code, resp.text)) + raise DCFCommFailure() + + token_dict = json_loads(resp.text) + id_token_decoded, id_token_dict = _decode_token(token_dict['id_token']) + + if stash_it: + try: + _access_token_storage(token_dict, user_id) + except (TokenFailure, RefreshTokenExpired) as e: + logger.error("[ERROR] user_data_from_token aborted: {}".format(str(e))) + raise e + + return id_token_decoded, id_token_dict + + +def massage_user_data_for_dev(the_user): + """ + Note that when working against their QA server, user names + and projects are junk. So we repair them here for our development needs. 
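(Aside, illustrative values only: assuming a secrets file carrying DEV_1_EMAIL=dev1@example.com, DEV_1_NIH=DEVUSER1, DEV_1_PROJ=QA-junk and DEV_1_MAPPED_PROJ=phs000178, this massaging rewrites a QA-server user dict as sketched here. The dict shape mirrors what the accessor functions above read.)

    the_user = {
        'name': 'dev1@example.com',                               # get_nih_id_from_user_dict()
        'projects': {'QA-junk': ['read', 'read-storage']},        # get_projects_from_user_dict()
        'google': {'linked_google_account': 'dev1@example.com'}   # get_google_link_from_user_dict()
    }
    massage_user_data_for_dev(the_user)
    # the_user now holds:
    #   {'name': 'DEVUSER1',
    #    'projects': {'phs000178': ['read', 'read-storage']},
    #    'google': {'linked_google_account': 'dev1@example.com'}}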
+ """ + + dcf_secrets = _read_dict(settings.DCF_CLIENT_SECRETS) + if not dcf_secrets.has_key('DEV_1_EMAIL'): + return the_user + + nih_from_dcf = get_nih_id_from_user_dict(the_user) + if nih_from_dcf == dcf_secrets['DEV_1_EMAIL']: + nih_from_dcf = dcf_secrets['DEV_1_NIH'] + _set_nih_id_for_user_dict(the_user, nih_from_dcf) + + dict_o_projects = get_projects_from_user_dict(the_user) + new_dict_o_projects = {} + for project in dict_o_projects.keys(): + perm_list = dict_o_projects[project] + # DCF QA returns bogus project info. Do this mapping as a workaround: + if project == dcf_secrets['DEV_1_PROJ']: + project = dcf_secrets['DEV_1_MAPPED_PROJ'] + elif project == dcf_secrets['DEV_2_PROJ']: + project = dcf_secrets['DEV_2_MAPPED_PROJ'] + new_dict_o_projects[project] = perm_list + _set_projects_for_user_dict(the_user, new_dict_o_projects) + + return the_user + + +def calc_expiration_time(returned_expiration_str): + + returned_expiration_time = None + if returned_expiration_str: + exp_secs = float(returned_expiration_str) + returned_expiration_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(exp_secs)) + + login_expiration_seconds = settings.DCF_LOGIN_EXPIRATION_SECONDS + calc_expiration_time = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( + seconds=login_expiration_seconds)) + if returned_expiration_time: + diff = returned_expiration_time - calc_expiration_time + secs = abs((diff.days * (3600 * 24)) + diff.seconds) + if secs > 30: + logger.error("WARNING: DCF RETURNED TIME SKEW OF {} SECONDS".format(secs)) + else: + logger.info("DCF expiration skew was {} seconds".format(secs)) + calc_expiration_time = returned_expiration_time + else: + logger.error("No expiration time provided by DCF") + + return calc_expiration_time + + +def refresh_at_dcf(user_id): + """ + Handle the PATCH call, to extend a user's presence on controlled access for 24 hours. Note that we might + reasonably raise a TokenFailure if the user disconnects from DCF in one screen before extending in another. + This could also manifest as a 404 response from DCF + + Can raise TokenFailure, DCFCommFailure, RefreshTokenExpired + + WARNING: DO NOT CALL this routine unless we have positive evidence returned from DCF that the user is + linked. It is an error to tell DCF to patch if the user is not actually linked, and this will be an error. + + :raises TokenFailure: + :raises InternalTokenError: + :raises DCFCommFailure: + :raises RefreshTokenExpired: + + """ + + success = False + throw_later = None + err_msg = None + returned_expiration_str = None + massaged_string = None + + # + # Call DCF to drop the linkage. Note that this will immediately remove them from controlled access. + # + + try: + resp = dcf_call(DCF_GOOGLE_URL, user_id, mode='patch') + except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure) as e: + throw_later = e + except Exception as e: + logger.error("[ERROR] Attempt to contact DCF for Google ID patch failed (user {})".format(user_id)) + raise e + + if resp: + if resp.status_code == 404: + err_msg = "User's GoogleID was no longer linked at Data Commons" + elif resp.status_code == 200: + success = True + else: + logger.error("[ERROR] Unexpected response from DCF: {}".format(resp.status_code)) + + returned_expiration_str = json_loads(resp.text)['exp'] + + # + # Per discussions with DCF, need to ask for a new token from DCF after changing google linking + # status. Always try to do this. Return the result too, since caller might be interested. 
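(Aside: callers such as the link-extend view later in this patch feed the 'exp' string returned here into calc_expiration_time() above, which prefers the DCF-supplied time but logs any skew over 30 seconds against our own "now + DCF_LOGIN_EXPIRATION_SECONDS" estimate. A rough sketch, with illustrative names:)

    err_msg, returned_expiration_str, user_token_string = refresh_at_dcf(user_id)
    link_expires_at = calc_expiration_time(returned_expiration_str)  # tz-aware UTC datetime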
+ # + + try: + the_user_id_token, _ = user_data_from_token(user_id, True) + massaged_string, _ = user_data_token_massaged(the_user_id_token) + except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure) as e: + throw_later = throw_later if throw_later else e + + if throw_later: + raise throw_later + elif not success: + raise DCFCommFailure() + + return err_msg, returned_expiration_str, massaged_string + + +def _refresh_token_storage(token_dict, decoded_jwt, user_token, nih_username_from_dcf, dcf_uid, cgc_uid, google_id): + """ + This is called when the user logs into DCF for the first time, whenever they need to get a new 30-day refresh + token from DCF by logging in, or if they explicitly disconnect their NIH identity and need to reauthenticate + to DCF again. It creates or refreshes the token in the database. + """ + + # + # We need to extract out the expiration time buried in the refresh token. When the refresh token + # expires (30 days) the user has to reauthenticate with DCF: + # + + refresh_token = token_dict['refresh_token'] + refresh_tokens_b64 = refresh_token.split('.') + i64 = refresh_tokens_b64[1] + padded = i64 + '=' * (-len(i64) % 4) # Pad with =; Weird Python % with -length + refresh_token_decoded = urlsafe_b64decode(padded.encode("ascii")) + refresh_token_dict = json_loads(refresh_token_decoded) + + # A refresh key: + # { + # "azp": "Client ID", + # "jti": "hex string with dashes", + # "aud": ["openid", "user", "data", "Client ID"], + # "exp": 1529262310, + # "iss": "https://The DCF server/user", + # "iat": 1526670310, + # "pur": "refresh", + # "sub": "The users's DCF ID" + # } + + refresh_expire_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(refresh_token_dict['exp'])) + + # This refers to the *access key* expiration (~20 minutes) + if token_dict.has_key('expires_at'): + expiration_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(token_dict['expires_at'])) + else: + expiration_time = pytz.utc.localize( + datetime.datetime.utcnow() + datetime.timedelta(seconds=token_dict["expires_in"])) + logger.info("[INFO] Have to build an expiration time for token: {}".format(expiration_time)) + + logger.info('[INFO] Refresh token storage. New token expires at {}'.format(str(expiration_time))) + + # FIXME! Make sure that the NIH name is going to be unique before we shove it into the table. Don't + # depend on the DB table constraint. + + # Note that (nih_username_lower, user_id) is enforced unique in the table: + DCFToken.objects.update_or_create(user_id=cgc_uid, + defaults={ + 'dcf_user': dcf_uid, + 'nih_username': nih_username_from_dcf, + 'nih_username_lower': nih_username_from_dcf.lower(), + 'access_token': token_dict['access_token'], + 'refresh_token': token_dict['refresh_token'], + 'user_token': user_token, + 'decoded_jwt': json_dumps(decoded_jwt), + 'expires_at': expiration_time, + 'refresh_expires_at': refresh_expire_time, + 'google_id': google_id # May be none on create... + }) + + +def _access_token_storage(token_dict, cgc_uid): + """ + This call just replaces the access key and user token part of the DCF record. Used when we use the + refresh token to get a new access key. 
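(Aside, hypothetical payloads: the token_dict handed to this function arrives in one of two shapes, which is why the code below checks for 'expires_at' before falling back to 'expires_in'.)

    # Absolute expiry, as requests_oauthlib typically supplies after an auto-refresh:
    token_dict = {'access_token': 'abc', 'id_token': '<jwt>', 'expires_at': 1529262310}
    # Relative expiry, as in a raw token-endpoint response; an absolute UTC time is then
    # synthesized from "now" + expires_in:
    token_dict = {'access_token': 'abc', 'id_token': '<jwt>', 'expires_in': 1200}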
+ + :raises TokenFailure: + :raises RefreshTokenExpired: + """ + + # This refers to the *access key* expiration (~20 minutes) + if token_dict.has_key('expires_at'): + expiration_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(token_dict['expires_at'])) + else: + expiration_time = pytz.utc.localize( + datetime.datetime.utcnow() + datetime.timedelta(seconds=token_dict["expires_in"])) + logger.info("[INFO] Have to build an expiration time for token: {}".format(expiration_time)) + + logger.info('[INFO] Access token storage. New token expires at {}'.format(str(expiration_time))) + + # + # Right now (5/30/18) we only get full user info back during the token refresh call. So decode + # it and stash it as well: + # + id_token_decoded, _ = _decode_token(token_dict['id_token']) + + try: + dcf_token = get_stored_dcf_token(cgc_uid) + except (TokenFailure, RefreshTokenExpired) as e: + logger.error("[INFO] _access_token_storage aborted: {}".format(str(e))) + raise e + + dcf_token.access_token = token_dict['access_token'] + dcf_token.user_token = id_token_decoded + dcf_token.expires_at = expiration_time + dcf_token.save() + + +def decode_token_chunk(token, index): + """ + Decode a given chunk of the token and return it as a JSON string and as a dict + """ + tokens_b64 = token.split('.') + i64 = tokens_b64[index] + padded = i64 + '=' * (-len(i64) % 4) # Pad with =; Weird Python % with -length + token_decoded = urlsafe_b64decode(padded.encode("ascii")) + token_dict = json_loads(token_decoded) + return token_decoded, token_dict + + +def _decode_token(token): + """ + Decode the token and return it as a JSON string and as a dict + """ + return decode_token_chunk(token, 1) + + +def dcf_call(full_url, user_id, mode='get', post_body=None, force_token=False): + """ + All the stuff around a DCF call that handles token management and refreshes. + + :raises TokenFailure: + :raises InternalTokenError: + :raises DCFCommFailure: + :raises RefreshTokenExpired: + """ + + dcf_token = get_stored_dcf_token(user_id) # Can raise a TokenFailure or RefreshTokenExpired + + expires_in = (dcf_token.expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() + logger.info("[INFO] Token Expiration : {} seconds".format(expires_in)) + + token_dict = { + 'access_token' : dcf_token.access_token, + 'refresh_token' : dcf_token.refresh_token, + 'token_type' : 'Bearer', + 'expires_in' : -100 if force_token else expires_in + } + + def token_storage_for_user(my_token_dict): + _access_token_storage(my_token_dict, user_id) + + client_id, client_secret = get_secrets() + + extra_dict = { + 'client_id' : client_id, + 'client_secret': client_secret + } + + dcf = OAuth2Session(client_id, token=token_dict, auto_refresh_url=DCF_TOKEN_URL, + auto_refresh_kwargs=extra_dict, token_updater=token_storage_for_user) + extra_dict = None + + # Hoo boy! You *MUST* provide the client_id and client_secret in the call itself to insure an OAuth2Session token + # refresh call uses HTTPBasicAuth! + + # We have seen an exception here (BAD REQUEST) if refresh token has e.g. been revoked and not dropped out of DB. + # Also have seen this when requesting an unlink: + # reply: 'HTTP/1.1 401 UNAUTHORIZED\r\n' after staging server is rolled?? + # "/home/vagrant/www/lib/oauthlib/oauth2/rfc6749/parameters.py" + # MissingTokenError: (missing_token) Missing access token parameter. 
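(Aside on decode_token_chunk()/_decode_token() defined above, using hypothetical values: index 0 is the JWT header and index 1 the claims; the login callback later in this patch reads the header to find which DCF key signed the token.)

    header_json, header = decode_token_chunk(id_token, 0)
    kid = header['kid']                    # id of the DCF public key that signed the token
    claims_json, claims = decode_token_chunk(id_token, 1)
    expires = claims.get('exp')            # POSIX seconds, when present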
+ + try: + resp = dcf.request(mode, full_url, client_id=client_id, + client_secret=client_secret, data=post_body) + except (TokenFailure, RefreshTokenExpired) as e: + # bubbles up from token_storage_for_user call + logger.error("[ERROR] _dcf_call {} aborted: {}".format(full_url, str(e))) + raise e + except MissingTokenError as e: + force_dcf_token_expiration(user_id) + logger.warning("[INFO] MissingTokenError seen") + logger.exception(e) + raise TokenFailure() + except InternalTokenError as e: + logger.warning("Internal Token Exception") + logger.exception(e) + raise e + except Exception as e: + force_dcf_token_expiration(user_id) + logger.warning("DCF Exception") + logger.exception(e) + raise DCFCommFailure() + + return resp + + +def get_secrets(): + """ + Keep hidden info hidden as much as possible + """ + dcf_secrets = _read_dict(settings.DCF_CLIENT_SECRETS) + client_id = dcf_secrets['DCF_CLIENT_ID'] + client_secret = dcf_secrets['DCF_CLIENT_SECRET'] + return client_id, client_secret + + +def _read_dict(my_file_name): + """ + Keep hidden info hidden as much as possible + """ + retval = {} + with open(my_file_name, 'r') as f: + for line in f: + if '=' not in line: + continue + split_line = line.split('=') + retval[split_line[0].strip()] = split_line[1].strip() + return retval + +def _access_token_storage(token_dict, cgc_uid): + """ + This call just replaces the access key and user token part of the DCF record. Used when we use the + refresh token to get a new access key. + + :raises TokenFailure: + :raises RefreshTokenExpired: + """ + + # This refers to the *access key* expiration (~20 minutes) + if token_dict.has_key('expires_at'): + expiration_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(token_dict['expires_at'])) + else: + expiration_time = pytz.utc.localize( + datetime.datetime.utcnow() + datetime.timedelta(seconds=token_dict["expires_in"])) + logger.info("[INFO] Have to build an expiration time for token: {}".format(expiration_time)) + + logger.info('[INFO] Access token storage. New token expires at {}'.format(str(expiration_time))) + + # + # Right now (5/30/18) we only get full user info back during the token refresh call. So decode + # it and stash it as well: + # + id_token_decoded, _ = _decode_token(token_dict['id_token']) + + try: + dcf_token = get_stored_dcf_token(cgc_uid) + except (TokenFailure, RefreshTokenExpired) as e: + logger.error("[INFO] _access_token_storage aborted: {}".format(str(e))) + raise e + + dcf_token.access_token = token_dict['access_token'] + dcf_token.user_token = id_token_decoded + dcf_token.expires_at = expiration_time + dcf_token.save() + + +def refresh_token_storage(token_dict, decoded_jwt, user_token, nih_username_from_dcf, dcf_uid, cgc_uid, google_id): + """ + This is called when the user logs into DCF for the first time, whenever they need to get a new 30-day refresh + token from DCF by logging in, or if they explicitly disconnect their NIH identity and need to reauthenticate + to DCF again. It creates or refreshes the token in the database. + """ + + # + # We need to extract out the expiration time buried in the refresh token. 
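(Aside, placeholder values only: the _read_dict()/get_secrets() helpers defined earlier in this module expect settings.DCF_CLIENT_SECRETS to point at a simple KEY=VALUE file; the DEV_* entries are optional and only used for QA-server development.)

    # Example client-secrets file contents (placeholders, one KEY=VALUE per line):
    #   DCF_CLIENT_ID=my-client-id
    #   DCF_CLIENT_SECRET=my-client-secret
    #   DEV_1_EMAIL=dev1@example.com
    #   DEV_1_NIH=DEVUSER1
    client_id, client_secret = get_secrets()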
When the refresh token + # expires (30 days) the user has to reauthenticate with DCF: + # + + refresh_token = token_dict['refresh_token'] + refresh_tokens_b64 = refresh_token.split('.') + i64 = refresh_tokens_b64[1] + padded = i64 + '=' * (-len(i64) % 4) # Pad with =; Weird Python % with -length + refresh_token_decoded = urlsafe_b64decode(padded.encode("ascii")) + refresh_token_dict = json_loads(refresh_token_decoded) + + # A refresh key: + # { + # "azp": "Client ID", + # "jti": "hex string with dashes", + # "aud": ["openid", "user", "data", "Client ID"], + # "exp": 1529262310, + # "iss": "https://The DCF server/user", + # "iat": 1526670310, + # "pur": "refresh", + # "sub": "The users's DCF ID" + # } + + refresh_expire_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(refresh_token_dict['exp'])) + + # This refers to the *access key* expiration (~20 minutes) + if token_dict.has_key('expires_at'): + expiration_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(token_dict['expires_at'])) + else: + expiration_time = pytz.utc.localize( + datetime.datetime.utcnow() + datetime.timedelta(seconds=token_dict["expires_in"])) + logger.info("[INFO] Have to build an expiration time for token: {}".format(expiration_time)) + + logger.info('[INFO] Refresh token storage. New token expires at {}'.format(str(expiration_time))) + + # FIXME! Make sure that the NIH name is going to be unique before we shove it into the table. Don't + # depend on the DB table constraint. + + # Note that (nih_username_lower, user_id) is enforced unique in the table: + DCFToken.objects.update_or_create(user_id=cgc_uid, + defaults={ + 'dcf_user': dcf_uid, + 'nih_username': nih_username_from_dcf, + 'nih_username_lower': nih_username_from_dcf.lower(), + 'access_token': token_dict['access_token'], + 'refresh_token': token_dict['refresh_token'], + 'user_token': user_token, + 'decoded_jwt': json_dumps(decoded_jwt), + 'expires_at': expiration_time, + 'refresh_expires_at': refresh_expire_time, + 'google_id': google_id # May be none on create... + }) + + +def unlink_at_dcf(user_id, do_refresh): + """ + There are only three places where we call DCF to do a Google unlink: 1) If they login via NIH and we get + a token for the user that tells us they already are linked to a Google ID that does not match their ISB-CGC + login ID. 2) We send them back to DCF to do the Google ID linking step and the callback informs us that they + have logged in with the wrong (not ISB-CGC) Google ID, and 3) the user has chosen to fully disconnect, and + dropping the Google ID is one step in the teardown flow. We NEVER enter a Google ID into the DCFToken + table if it does not match their ISB-CCG ID. + + WARNING: DO NOT CALL this routine unless we have positive evidence returned from DCF that the user is + linked. It is an error to tell DCF to unlink if the user is not actually linked. That said, we will + log the discrepancy but not issue any error to the user. + + :raise TokenFailure: + :raise InternalTokenError: + :raise DCFCommFailure: + :raise RefreshTokenExpired: + + """ + + success = False + throw_later = None + + # + # Call DCF to drop the linkage. Note that this will immediately remove them from controlled access. + # + + try: + resp = dcf_call(DCF_GOOGLE_URL, user_id, mode='delete') # can raise TokenFailure, DCFCommFailure + except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure) as e: + throw_later = e # hold off so we can try a refresh first... 
+ except Exception as e: + logger.error("[ERROR] Attempt to contact DCF for Google ID unlink failed (user {})".format(user_id)) + raise e + + if resp: + if resp.status_code == 404: + # We are trying to unlink, and DCF thinks there is no link. *Silent* failure! + logger.error("[ERROR] No linked Google account found for user {}".format(user_id)) + success = True + elif resp.status_code == 400: + delete_response = json_loads(resp.text) + error = delete_response['error'] + message = delete_response['error_description'] + logger.error("[ERROR] Error returned in unlinking: {} : {}".format(error, message)) + elif resp.status_code == 200: + success = True + else: + logger.error("[ERROR] Unexpected response from DCF: {}".format(resp.status_code)) + + # + # Per discussions with DCF, need to ask for a new token from DCF after doing the unlinking + # since they care about the token info. Even if we had a failure, let's try to refresh: + # + + if do_refresh: + try: + user_data_from_token(user_id, True) + except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure) as e: + throw_later = throw_later if throw_later else e + + if throw_later: + raise throw_later + elif not success: + raise DCFCommFailure() + + return + + +class GoogleLinkState: + BOTH_NULL = 1 + DCF_NULL_CGC_NON_NULL = 2 + DCF_BAD_CGC_NULL = 3 + DCF_GOOD_CGC_NULL = 4 + MATCHING_BAD = 5 + MATCHING_OK = 6 + NON_MATCHING_DCF_BAD = 7 + NON_MATCHING_CGC_BAD = 8 + NON_MATCHING_ALL_BAD = 9 + + +def compare_google_ids(dcf_version, cgc_version, user_email): + """ + When we get new tokens from DCF, we want to sanity check if the Google IDs are in agreement. + """ + print dcf_version + print cgc_version + print user_email + if dcf_version != cgc_version: + # Most likely possibility is that either DCF or us thinks the google ID is None and the other doesn't. Another + # possibility is that DCF has another Google ID for the user that is not consistent with the + # one we *want* them to be using. That case *should* have been caught when they first tried to link. 
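(Aside: a few doctest-style checks, with hypothetical addresses, showing the states this comparison is expected to produce; these would belong in a unit test rather than here.)

    me = 'user@isb-cgc.org'
    assert compare_google_ids(me, me, me) == GoogleLinkState.MATCHING_OK
    assert compare_google_ids(None, None, me) == GoogleLinkState.BOTH_NULL
    assert compare_google_ids(me, None, me) == GoogleLinkState.DCF_GOOD_CGC_NULL
    assert compare_google_ids('other@gmail.com', None, me) == GoogleLinkState.DCF_BAD_CGC_NULL
    assert compare_google_ids(None, me, me) == GoogleLinkState.DCF_NULL_CGC_NON_NULL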
+ # + # If link IDs do not match, we need match ours to DCF, and flag the problem + if dcf_version is None: + google_match_state = GoogleLinkState.DCF_NULL_CGC_NON_NULL + elif cgc_version is None: + if dcf_version == user_email: + google_match_state = GoogleLinkState.DCF_GOOD_CGC_NULL + else: + google_match_state = GoogleLinkState.DCF_BAD_CGC_NULL + elif dcf_version == user_email: + google_match_state = GoogleLinkState.NON_MATCHING_CGC_BAD # Cannot happen + elif cgc_version == user_email: + google_match_state = GoogleLinkState.NON_MATCHING_DCF_BAD + else: + google_match_state = GoogleLinkState.NON_MATCHING_ALL_BAD # Cannot happen + # Next three cases handle matching GoogleIDs: + elif dcf_version is None: + google_match_state = GoogleLinkState.BOTH_NULL + elif dcf_version == user_email: + google_match_state = GoogleLinkState.MATCHING_OK + elif dcf_version != user_email: + google_match_state = GoogleLinkState.MATCHING_BAD # Cannot happen + + return google_match_state + diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 315c3155..90c0e746 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -31,20 +31,21 @@ from google_helpers.stackdriver import StackDriverLogger -from sa_utils import found_linking_problems, DemoLoginResults, handle_user_for_dataset,\ +from sa_utils import found_linking_problems, DemoLoginResults, \ handle_user_db_update_for_dcf_linking, \ - unlink_account_in_db_for_dcf, get_dcf_refresh_key_remaining_seconds, \ - get_stored_dcf_token, TokenFailure, RefreshTokenExpired, InternalTokenError, \ - force_dcf_token_expiration + unlink_account_in_db_for_dcf, refresh_user_projects + +from dcf_support import get_stored_dcf_token, \ + TokenFailure, RefreshTokenExpired, InternalTokenError, DCFCommFailure, \ + get_google_link_from_user_dict, get_projects_from_user_dict, \ + get_nih_id_from_user_dict, user_data_token_to_user_dict, get_user_data_token_string, \ + user_data_token_dict_massaged, \ + user_data_token_dict_to_user_dict, get_secrets, refresh_token_storage, \ + unlink_at_dcf, refresh_at_dcf, decode_token_chunk, calc_expiration_time -from models import DCFToken, AuthorizedDataset, NIH_User, UserAuthorizedDatasets from requests_oauthlib.oauth2_session import OAuth2Session -from oauthlib.oauth2 import MissingTokenError -from base64 import urlsafe_b64decode from jwt.contrib.algorithms.pycrypto import RSAAlgorithm -from json import loads as json_loads, dumps as json_dumps -from dataset_utils.dataset_access_support_factory import DatasetAccessSupportFactory -from dataset_utils.dataset_config import DatasetGoogleGroupPair +from json import loads as json_loads # Shut this up unless we need to do debug of HTTP request contents #import httplib as http_client @@ -54,16 +55,10 @@ DCF_AUTH_URL = settings.DCF_AUTH_URL DCF_TOKEN_URL = settings.DCF_TOKEN_URL -DCF_USER_URL = settings.DCF_USER_URL DCF_REVOKE_URL = settings.DCF_REVOKE_URL DCF_GOOGLE_URL = settings.DCF_GOOGLE_URL DCF_LOGOUT_URL = settings.DCF_LOGOUT_URL -DCF_URL_URL = settings.DCF_URL_URL -DCF_TOKEN_REFRESH_WINDOW_SECONDS = settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS - -class DCFCommFailure(Exception): - """Thrown if we have problems communicating with DCF """ @login_required def oauth2_login(request): @@ -82,7 +77,7 @@ def oauth2_login(request): if settings.IS_DEV and full_callback.startswith('http://localhost'): os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1' - client_id, _ = _get_secrets() + client_id, _ = get_secrets() logger.info("[INFO] OAuth1 b") # Found that 'user' scope had to be included to be able 
to do the user query on callback, and the data scope @@ -178,7 +173,7 @@ def oauth2_callback(request): messages.error(request, internal_err_msg.format("001")) return redirect(reverse('user_detail', args=[request.user.id])) - client_id, client_secret = _get_secrets() + client_id, client_secret = get_secrets() logger.info("[INFO] OAuthCB c") # You MUST provide the callback *here* to get it into the fetch request dcf = OAuth2Session(client_id, state=saved_state, redirect_uri=full_callback) @@ -232,7 +227,7 @@ def oauth2_callback(request): # key provided by their endpoint using the pyjwt package to do the work. # - jwt_header_json, jwt_header_dict = _decode_token_chunk(token_data['id_token'], 0) + jwt_header_json, jwt_header_dict = decode_token_chunk(token_data['id_token'], 0) kid = jwt_header_dict['kid'] # @@ -301,13 +296,13 @@ def oauth2_callback(request): # Suck the data out of the user token to plunk into the database # - user_data_token_str, user_data_token_dict = _user_data_token_dict_massaged(decoded_jwt_id) + user_data_token_str, user_data_token_dict = user_data_token_dict_massaged(decoded_jwt_id) - user_data_dict = _user_data_token_dict_to_user_dict(user_data_token_dict) + user_data_dict = user_data_token_dict_to_user_dict(user_data_token_dict) - nih_from_dcf = _get_nih_id_from_user_dict(user_data_dict) + nih_from_dcf = get_nih_id_from_user_dict(user_data_dict) - google_link = _get_google_link_from_user_dict(user_data_dict) + google_link = get_google_link_from_user_dict(user_data_dict) logger.info("[INFO] OAuthCB i") # We now have the NIH User ID back from DCF; we also might now know the Google ID they have linked to previously @@ -355,7 +350,7 @@ def oauth2_callback(request): # make the entry in the NIH_User table, since we need to now either establish or refresh the DCF-Google ID link: # - _refresh_token_storage(token_data, decoded_jwt_id, user_data_token_str, nih_from_dcf, dcf_user_id, request.user.id, save_google_link) + refresh_token_storage(token_data, decoded_jwt_id, user_data_token_str, nih_from_dcf, dcf_user_id, request.user.id, save_google_link) # # If user already has a google ID link, we would PATCH the endpoint to update it for 24 more hours. 
If @@ -381,7 +376,7 @@ def oauth2_callback(request): logger.info("[INFO] OAuthCB l") if google_link != req_user.email: try: - _unlink_at_dcf(request.user.id, True) # True means after unlinking, we call DCF again to update our link state + unlink_at_dcf(request.user.id, True) # True means after unlinking, we call DCF again to update our link state message = "You must use your ISB-CGC login email ({}) to link with the DCF instead of {}".format( req_user.email, google_link) messages.warning(request, message) @@ -406,7 +401,7 @@ def oauth2_callback(request): logger.info("[INFO] OAuthCB m") try: - err_msg, returned_expiration_str, _ = _refresh_at_dcf(request.user.id) + err_msg, returned_expiration_str, _ = refresh_at_dcf(request.user.id) except TokenFailure: err_msg = internal_err_msg.format("006") except InternalTokenError: @@ -426,7 +421,7 @@ def oauth2_callback(request): # logger.info("[INFO] OAuthCB n") - use_expiration_time = _calc_expiration_time(returned_expiration_str) + use_expiration_time = calc_expiration_time(returned_expiration_str) logger.info("[INFO] OAuthCB o") # Don't hit DCF again, we just did it (thus False): @@ -488,7 +483,7 @@ def dcf_link_callback(request): returned_expiration_str = request.GET.get('exp', None) returned_google_link = request.GET.get('linked_email', None) - use_expiration_time = _calc_expiration_time(returned_expiration_str) + use_expiration_time = calc_expiration_time(returned_expiration_str) # # We will NEVER accept a Google ID that does not match. At this point, we need to wrestle @@ -499,7 +494,7 @@ def dcf_link_callback(request): # try: - the_user_token_string = _get_user_data_token_string(request.user.id) # a string. + the_user_token_string = get_user_data_token_string(request.user.id) # a string. except (TokenFailure, InternalTokenError, DCFCommFailure, RefreshTokenExpired): return redirect(reverse('user_detail', args=[request.user.id])) @@ -507,7 +502,7 @@ def dcf_link_callback(request): the_user_dict = the_user_token_dict['context']['user'] # Just parses the google link out of the recently return token. - google_link = _get_google_link_from_user_dict(the_user_dict) + google_link = get_google_link_from_user_dict(the_user_dict) if returned_google_link: if google_link != returned_google_link: @@ -529,7 +524,7 @@ def dcf_link_callback(request): # if google_link != req_user.email: try: - _unlink_at_dcf(request.user.id, True) # True means saved token is now updated with unlinked state + unlink_at_dcf(request.user.id, True) # True means saved token is now updated with unlinked state except (TokenFailure, InternalTokenError, DCFCommFailure, RefreshTokenExpired): return redirect(reverse('user_detail', args=[request.user.id])) @@ -573,7 +568,7 @@ def dcf_link_extend(request): user_data_token_string = None try: - err_msg, returned_expiration_str, user_data_token_string = _refresh_at_dcf(request.user.id) + err_msg, returned_expiration_str, user_data_token_string = refresh_at_dcf(request.user.id) except TokenFailure: err_msg = "Your Data Commons Framework identity needs to be reestablished to complete this task." 
except RefreshTokenExpired: @@ -587,8 +582,8 @@ def dcf_link_extend(request): messages.error(request, err_msg) return redirect(reverse('user_detail', args=[request.user.id])) - use_expiration_time = _calc_expiration_time(returned_expiration_str) - user_data_dict = _user_data_token_to_user_dict(user_data_token_string) + use_expiration_time = calc_expiration_time(returned_expiration_str) + user_data_dict = user_data_token_to_user_dict(user_data_token_string) # log the reports using Cloud logging API st_logger = StackDriverLogger.build_from_django_settings() @@ -600,30 +595,6 @@ def dcf_link_extend(request): return redirect(reverse('user_detail', args=[request.user.id])) -def _calc_expiration_time(returned_expiration_str): - - returned_expiration_time = None - if returned_expiration_str: - exp_secs = float(returned_expiration_str) - returned_expiration_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(exp_secs)) - - login_expiration_seconds = settings.DCF_LOGIN_EXPIRATION_SECONDS - calc_expiration_time = pytz.utc.localize(datetime.datetime.utcnow() + datetime.timedelta( - seconds=login_expiration_seconds)) - if returned_expiration_time: - diff = returned_expiration_time - calc_expiration_time - secs = abs((diff.days * (3600 * 24)) + diff.seconds) - if secs > 30: - logger.error("WARNING: DCF RETURNED TIME SKEW OF {} SECONDS".format(secs)) - else: - logger.info("DCF expiration skew was {} seconds".format(secs)) - calc_expiration_time = returned_expiration_time - else: - logger.error("No expiration time provided by DCF") - - return calc_expiration_time - - def _finish_the_link(user_id, user_email, expiration_time, st_logger, refresh_first): """ Regardless of how they get here, this step handles the linking of the user by adding the required database records. @@ -642,7 +613,7 @@ def _finish_the_link(user_id, user_email, expiration_time, st_logger, refresh_fi if refresh_first: try: - the_user_token = _get_user_data_token_string(user_id) # the_user is a string. + the_user_token = get_user_data_token_string(user_id) # the_user is a string. 
except (TokenFailure, InternalTokenError, DCFCommFailure, RefreshTokenExpired) as e: raise e @@ -666,352 +637,18 @@ def _finish_the_link(user_id, user_email, expiration_time, st_logger, refresh_fi the_user_token = dcf_token.user_token dcf_token.save() - the_user_dict = _user_data_token_to_user_dict(the_user_token) + the_user_dict = user_data_token_to_user_dict(the_user_token) nih_user, warning = handle_user_db_update_for_dcf_linking(user_id, the_user_dict, nih_assertion_expiration, st_logger) - dict_o_projects = _get_projects_from_user_dict(the_user_dict) - authorized_datasets = [] - for project, perm_list in dict_o_projects.iteritems(): - adqs = AuthorizedDataset.objects.filter(whitelist_id=project) - if len(adqs) == 1: - authorized_datasets.append(DatasetGoogleGroupPair(project, adqs.first().acl_google_group)) - - das = DatasetAccessSupportFactory.from_webapp_django_settings() - all_datasets = das.get_all_datasets_and_google_groups() + dict_o_projects = get_projects_from_user_dict(the_user_dict) + project_keys = set(dict_o_projects.keys()) - for dataset in all_datasets: - handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, False, None, None, st_logger) + refresh_user_projects(nih_user, user_email, project_keys, st_logger) return warning -def _user_data_token_dict_massaged(the_user_token_dict): - """ - Takes the user data token dictionary (as returned by DCF) and returns massaged user-only string AND dict - - """ - the_user_dict = the_user_token_dict['context']['user'] - the_massaged_dict = _massage_user_data_for_dev(the_user_dict) - the_user_token_dict['context']['user'] = the_massaged_dict - return json_dumps(the_user_token_dict), the_user_token_dict - - -def _user_data_token_massaged(user_data_token_string): - """ - Takes the user data token string and returns user-only string AND dict - - """ - the_user_token_dict = json_loads(user_data_token_string) - the_user_dict = the_user_token_dict['context']['user'] - the_massaged_dict = _massage_user_data_for_dev(the_user_dict) - the_user_token_dict['context']['user'] = the_massaged_dict - return json_dumps(the_user_token_dict), the_user_token_dict - - -def _get_projects_from_user_dict(the_user_dict): - """ - The dict schema and keys vary depending on whether is comes from token or user data endpoint. Hide this fact! - - """ - return the_user_dict['projects'] - - -def _set_projects_for_user_dict(the_user_dict, projects): - """ - The dict schema and keys vary depending on whether is comes from token or user data endpoint. Hide this fact! - - """ - the_user_dict['projects'] = projects - return - - -def _get_nih_id_from_user_dict(the_user_dict): - """ - The dict schema and keys vary depending on whether is comes from token or user data endpoint. Hide this fact! - - """ - return the_user_dict['name'] - -def _set_nih_id_for_user_dict(the_user_dict, nih_id): - """ - The dict schema and keys vary depending on whether is comes from token or user data endpoint. Hide this fact! - - """ - the_user_dict['name'] = nih_id - return - - -def _get_google_link_from_user_dict(the_user_dict): - """ - The dict schema and keys vary depending on whether is comes from token or user data endpoint. Hide this fact! 
- - """ - gotta_google_link = the_user_dict.has_key('google') and \ - the_user_dict['google'].has_key('linked_google_account') - google_link = the_user_dict['google']['linked_google_account'] if gotta_google_link else None - return google_link - - -def _user_data_token_to_user_dict(user_data_token_string): - """ - Takes the user data token string (as returned by DCF and stored in database) and returns user-only dict - - """ - the_user_token_dict = json_loads(user_data_token_string) - the_user_dict = the_user_token_dict['context']['user'] - return the_user_dict - - -def _user_data_token_dict_to_user_dict(the_user_token_dict): - """ - Takes the user data token dict and returns user-only dict - - """ - the_user_dict = the_user_token_dict['context']['user'] - return the_user_dict - - -def _get_user_data_token_string(user_id): - """ - Get up-to-date user data from DCF, massage as needed. - - :raises TokenFailure: - :raises InternalTokenError: - :raises DCFCommFailure: - :raises RefreshTokenExpired: - """ - # The user endpoint is spotty at the moment (6/5/18) so we drag it out of the token instead - - the_user_id_token, _ = _user_data_from_token(user_id, False) - - massaged_string, _ = _user_data_token_massaged(the_user_id_token) - - return massaged_string - - -def _user_data_from_token(user_id, stash_it): - """ - Seems that we should be able to get full user info from the user endpoint, but it turns out that - the information in the token refresh is more complete. - - PLUS, user can set stash_it to True. DCF suggests we refresh the access token after e.g. unlinking. - - :raises TokenFailure: - :raises InternalTokenError: - :raises DCFCommFailure: - :raises RefreshTokenExpired: - """ - - # - # OAuth2Session handles token refreshes under the covers. Here we want to do it explicitly. - # - - try: - dcf_token = get_stored_dcf_token(user_id) - except (TokenFailure, InternalTokenError, RefreshTokenExpired) as e: - raise e - - client_id, client_secret = _get_secrets() - - data = { - 'grant_type': 'refresh_token', - 'refresh_token': dcf_token.refresh_token, - 'client_id': client_id - } - - auth = requests.auth.HTTPBasicAuth(client_id, client_secret) - client_id = None - client_secret = None - try: - resp = requests.request('POST', DCF_TOKEN_URL, data=data, auth=auth) - except Exception as e: - logger.error("[ERROR] Token acquisition Exception") - logger.exception(e) - raise DCFCommFailure() - - if resp.status_code != 200: - logger.error("[ERROR] Token acquisition problem: {} : {}".format(resp.status_code, resp.text)) - raise DCFCommFailure() - - token_dict = json_loads(resp.text) - id_token_decoded, id_token_dict = _decode_token(token_dict['id_token']) - - if stash_it: - try: - _access_token_storage(token_dict, user_id) - except (TokenFailure, RefreshTokenExpired) as e: - logger.error("[ERROR] _user_data_from_token aborted: {}".format(str(e))) - raise e - - return id_token_decoded, id_token_dict - - -def _massage_user_data_for_dev(the_user): - """ - Note that when working against their QA server, user names - and projects are junk. So we repair them here for our development needs. 
- """ - - dcf_secrets = _read_dict(settings.DCF_CLIENT_SECRETS) - if not dcf_secrets.has_key('DEV_1_EMAIL'): - return the_user - - nih_from_dcf = _get_nih_id_from_user_dict(the_user) - if nih_from_dcf == dcf_secrets['DEV_1_EMAIL']: - nih_from_dcf = dcf_secrets['DEV_1_NIH'] - _set_nih_id_for_user_dict(the_user, nih_from_dcf) - - dict_o_projects = _get_projects_from_user_dict(the_user) - new_dict_o_projects = {} - for project, perm_list in dict_o_projects.iteritems(): - # DCF QA returns bogus project info. Do this mapping as a workaround: - if project == dcf_secrets['DEV_1_PROJ']: - project = dcf_secrets['DEV_1_MAPPED_PROJ'] - elif project == dcf_secrets['DEV_2_PROJ']: - project = dcf_secrets['DEV_2_MAPPED_PROJ'] - new_dict_o_projects[project] = perm_list - _set_projects_for_user_dict(the_user, new_dict_o_projects) - - return the_user - - -def _unlink_at_dcf(user_id, do_refresh): - """ - There are only three places where we call DCF to do a Google unlink: 1) If they login via NIH and we get - a token for the user that tells us they already are linked to a Google ID that does not match their ISB-CGC - login ID. 2) We send them back to DCF to do the Google ID linking step and the callback informs us that they - have logged in with the wrong (not ISB-CGC) Google ID, and 3) the user has chosen to fully disconnect, and - dropping the Google ID is one step in the teardown flow. We NEVER enter a Google ID into the DCFToken - table if it does not match their ISB-CCG ID. - - Can raise TokenFailure, DCFCommFailure, RefreshTokenExpired - - WARNING: DO NOT CALL this routine unless we have positive evidence returned from DCF that the user is - linked. It is an error to tell DCF to unlink if the user is not actually linked. That said, we will - log the discrepancy but not issue any error to the user. - - :raise TokenFailure: - :raise InternalTokenError: - :raise DCFCommFailure: - :raise RefreshTokenExpired: - - """ - - success = False - throw_later = None - - # - # Call DCF to drop the linkage. Note that this will immediately remove them from controlled access. - # - - try: - resp = _dcf_call(DCF_GOOGLE_URL, user_id, mode='delete') # can raise TokenFailure, DCFCommFailure - except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure) as e: - throw_later = e # hold off so we can try a refresh first... - except Exception as e: - logger.error("[ERROR] Attempt to contact DCF for Google ID unlink failed (user {})".format(user_id)) - raise e - - if resp: - if resp.status_code == 404: - # We are trying to unlink, and DCF thinks there is no link. *Silent* failure! - logger.error("[ERROR] No linked Google account found for user {}".format(user_id)) - success = True - elif resp.status_code == 400: - delete_response = json_loads(resp.text) - error = delete_response['error'] - message = delete_response['error_description'] - logger.error("[ERROR] Error returned in unlinking: {} : {}".format(error, message)) - elif resp.status_code == 200: - success = True - else: - logger.error("[ERROR] Unexpected response from DCF: {}".format(resp.status_code)) - - # - # Per discussions with DCF, need to ask for a new token from DCF after doing the unlinking - # since they care about the token info. 
Even if we had a failure, let's try to refresh: - # - - if do_refresh: - try: - _user_data_from_token(user_id, True) - except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure) as e: - throw_later = throw_later if throw_later else e - - if throw_later: - raise throw_later - elif not success: - raise DCFCommFailure() - - return - - -def _refresh_at_dcf(user_id): - """ - Handle the PATCH call, to extend a user's presence on controlled access for 24 hours. Note that we might - reasonably raise a TokenFailure if the user disconnects from DCF in one screen before extending in another. - This could also manifest as a 404 response from DCF - - Can raise TokenFailure, DCFCommFailure, RefreshTokenExpired - - WARNING: DO NOT CALL this routine unless we have positive evidence returned from DCF that the user is - linked. It is an error to tell DCF to patch if the user is not actually linked, and this will be an error. - - :raises TokenFailure: - :raises InternalTokenError: - :raises DCFCommFailure: - :raises RefreshTokenExpired: - - """ - - success = False - throw_later = None - err_msg = None - returned_expiration_str = None - massaged_string = None - - # - # Call DCF to drop the linkage. Note that this will immediately remove them from controlled access. - # - - try: - resp = _dcf_call(DCF_GOOGLE_URL, user_id, mode='patch') - except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure) as e: - throw_later = e - except Exception as e: - logger.error("[ERROR] Attempt to contact DCF for Google ID patch failed (user {})".format(user_id)) - raise e - - if resp: - if resp.status_code == 404: - err_msg = "User's GoogleID was no longer linked at Data Commons" - elif resp.status_code == 200: - success = True - else: - logger.error("[ERROR] Unexpected response from DCF: {}".format(resp.status_code)) - - returned_expiration_str = json_loads(resp.text)['exp'] - - # - # Per discussions with DCF, need to ask for a new token from DCF after changing google linking - # status. Always try to do this. Return the result too, since caller might be interested. - # - - try: - the_user_id_token, _ = _user_data_from_token(user_id, True) - massaged_string, _ = _user_data_token_massaged(the_user_id_token) - except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure) as e: - throw_later = throw_later if throw_later else e - - if throw_later: - raise throw_later - elif not success: - raise DCFCommFailure() - - return err_msg, returned_expiration_str, massaged_string - - def _unlink_internally(user_id): """ If we need to unlink a user who was previously ACTUALLY linked, there are internal fixes to be made. @@ -1060,124 +697,6 @@ def _unlink_internally(user_id): return -def _refresh_token_storage(token_dict, decoded_jwt, user_token, nih_username_from_dcf, dcf_uid, cgc_uid, google_id): - """ - This is called when the user logs into DCF for the first time, whenever they need to get a new 30-day refresh - token from DCF by logging in, or if they explicitly disconnect their NIH identity and need to reauthenticate - to DCF again. It creates or refreshes the token in the database. - """ - - # - # We need to extract out the expiration time buried in the refresh token. 
When the refresh token - # expires (30 days) the user has to reauthenticate with DCF: - # - - refresh_token = token_dict['refresh_token'] - refresh_tokens_b64 = refresh_token.split('.') - i64 = refresh_tokens_b64[1] - padded = i64 + '=' * (-len(i64) % 4) # Pad with =; Weird Python % with -length - refresh_token_decoded = urlsafe_b64decode(padded.encode("ascii")) - refresh_token_dict = json_loads(refresh_token_decoded) - - # A refresh key: - # { - # "azp": "Client ID", - # "jti": "hex string with dashes", - # "aud": ["openid", "user", "data", "Client ID"], - # "exp": 1529262310, - # "iss": "https://The DCF server/user", - # "iat": 1526670310, - # "pur": "refresh", - # "sub": "The users's DCF ID" - # } - - refresh_expire_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(refresh_token_dict['exp'])) - - # This refers to the *access key* expiration (~20 minutes) - if token_dict.has_key('expires_at'): - expiration_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(token_dict['expires_at'])) - else: - expiration_time = pytz.utc.localize( - datetime.datetime.utcnow() + datetime.timedelta(seconds=token_dict["expires_in"])) - logger.info("[INFO] Have to build an expiration time for token: {}".format(expiration_time)) - - logger.info('[INFO] Refresh token storage. New token expires at {}'.format(str(expiration_time))) - - # FIXME! Make sure that the NIH name is going to be unique before we shove it into the table. Don't - # depend on the DB table constraint. - - # Note that (nih_username_lower, user_id) is enforced unique in the table: - DCFToken.objects.update_or_create(user_id=cgc_uid, - defaults={ - 'dcf_user': dcf_uid, - 'nih_username': nih_username_from_dcf, - 'nih_username_lower': nih_username_from_dcf.lower(), - 'access_token': token_dict['access_token'], - 'refresh_token': token_dict['refresh_token'], - 'user_token': user_token, - 'decoded_jwt': json_dumps(decoded_jwt), - 'expires_at': expiration_time, - 'refresh_expires_at': refresh_expire_time, - 'google_id': google_id # May be none on create... - }) - - -def _access_token_storage(token_dict, cgc_uid): - """ - This call just replaces the access key and user token part of the DCF record. Used when we use the - refresh token to get a new access key. - - :raises TokenFailure: - :raises RefreshTokenExpired: - """ - - # This refers to the *access key* expiration (~20 minutes) - if token_dict.has_key('expires_at'): - expiration_time = pytz.utc.localize(datetime.datetime.utcfromtimestamp(token_dict['expires_at'])) - else: - expiration_time = pytz.utc.localize( - datetime.datetime.utcnow() + datetime.timedelta(seconds=token_dict["expires_in"])) - logger.info("[INFO] Have to build an expiration time for token: {}".format(expiration_time)) - - logger.info('[INFO] Access token storage. New token expires at {}'.format(str(expiration_time))) - - # - # Right now (5/30/18) we only get full user info back during the token refresh call. 
So decode - # it and stash it as well: - # - id_token_decoded, _ = _decode_token(token_dict['id_token']) - - try: - dcf_token = get_stored_dcf_token(cgc_uid) - except (TokenFailure, RefreshTokenExpired) as e: - logger.error("[INFO] _access_token_storage aborted: {}".format(str(e))) - raise e - - dcf_token.access_token = token_dict['access_token'] - dcf_token.user_token = id_token_decoded - dcf_token.expires_at = expiration_time - dcf_token.save() - - -def _decode_token_chunk(token, index): - """ - Decode a given chunk of the token and return it as a JSON string and as a dict - """ - tokens_b64 = token.split('.') - i64 = tokens_b64[index] - padded = i64 + '=' * (-len(i64) % 4) # Pad with =; Weird Python % with -length - token_decoded = urlsafe_b64decode(padded.encode("ascii")) - token_dict = json_loads(token_decoded) - return token_decoded, token_dict - - -def _decode_token(token): - """ - Decode the token and return it as a JSON string and as a dict - """ - return _decode_token_chunk(token, 1) - - @login_required def dcf_disconnect_user(request): """ @@ -1208,9 +727,9 @@ def dcf_disconnect_user(request): # try: - _unlink_at_dcf(request.user.id, False) # Don't refresh, we are about to drop the record... + unlink_at_dcf(request.user.id, False) # Don't refresh, we are about to drop the record... except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure): - messages.warning(request, "Unexpected response from DCF {}".format(resp.status_code)) + messages.warning(request, "Problems encountered unlinking Google ID at Data Commons. Please contact ISB-CGC Administrator") return redirect(reverse('user_detail', args=[request.user.id])) # @@ -1222,7 +741,7 @@ def dcf_disconnect_user(request): # framework. Not seeing that inside the OAuthSession framework, so we roll our own by hand: # - client_id, client_secret = _get_secrets() + client_id, client_secret = get_secrets() data = { 'token': dcf_token.refresh_token } @@ -1236,7 +755,8 @@ def dcf_disconnect_user(request): logger.info("[INFO] DDU C") if resp.status_code != 200 and resp.status_code != 204: - messages.warning(request, 'Revocation problem: {} : {}'.format(resp.status_code, resp.text)) + logger.error(request, '[ERROR] Token revocation problem: {} : {}'.format(resp.status_code, resp.text)) + messages.warning(request, "Problems encountered revoking access token at Data Commons. Please contact ISB-CGC Administrator") # # Now we do the internal unlinking, which includes detach the user in our NIH tables, and detach the user from data permissions. @@ -1244,8 +764,11 @@ def dcf_disconnect_user(request): try: _unlink_internally(request.user.id) - except (TokenFailure, InternalTokenError, Exception): - messages.warning(request, "Internal problems unlinking".format(resp.status_code)) + except TokenFailure: + # Token problem? Don't care; it is about to be blown away + pass + except (InternalTokenError, Exception): + messages.warning(request, "Internal problem encountered disconnecting from Data Commons. Please contact ISB-CGC Administrator") return redirect(reverse('user_detail', args=[request.user.id])) # @@ -1259,11 +782,13 @@ def dcf_disconnect_user(request): except TokenFailure: dcf_token = None except InternalTokenError: + messages.warning(request, "Internal problem encountered disconnecting from Data Commons. 
Please contact ISB-CGC Administrator") return redirect(reverse('user_detail', args=[request.user.id])) except RefreshTokenExpired as e: dcf_token = e.token - dcf_token.delete() + if dcf_token: + dcf_token.delete() # # Finally, we need to send the user to logout from the DCF, which is needed to clear the @@ -1277,254 +802,6 @@ def dcf_disconnect_user(request): return HttpResponseRedirect(callback) -def _dcf_call(full_url, user_id, mode='get', post_body=None, force_token=False): - """ - All the stuff around a DCF call that handles token management and refreshes. - - :raises TokenFailure: - :raises InternalTokenError: - :raises DCFCommFailure: - :raises RefreshTokenExpired: - """ - - dcf_token = get_stored_dcf_token(user_id) # Can raise a TokenFailure or RefreshTokenExpired - - expires_in = (dcf_token.expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() - logger.info("[INFO] Token Expiration : {} seconds".format(expires_in)) - - token_dict = { - 'access_token' : dcf_token.access_token, - 'refresh_token' : dcf_token.refresh_token, - 'token_type' : 'Bearer', - 'expires_in' : -100 if force_token else expires_in - } - - def token_storage_for_user(my_token_dict): - _access_token_storage(my_token_dict, user_id) - - client_id, client_secret = _get_secrets() - - extra_dict = { - 'client_id' : client_id, - 'client_secret': client_secret - } - - dcf = OAuth2Session(client_id, token=token_dict, auto_refresh_url=DCF_TOKEN_URL, - auto_refresh_kwargs=extra_dict, token_updater=token_storage_for_user) - extra_dict = None - - # Hoo boy! You *MUST* provide the client_id and client_secret in the call itself to insure an OAuth2Session token - # refresh call uses HTTPBasicAuth! - - # We have seen an exception here (BAD REQUEST) if refresh token has e.g. been revoked and not dropped out of DB. - # Also have seen this when requesting an unlink: - # reply: 'HTTP/1.1 401 UNAUTHORIZED\r\n' after staging server is rolled?? - # "/home/vagrant/www/lib/oauthlib/oauth2/rfc6749/parameters.py" - # MissingTokenError: (missing_token) Missing access token parameter. 
- - try: - resp = dcf.request(mode, full_url, client_id=client_id, - client_secret=client_secret, data=post_body) - except (TokenFailure, RefreshTokenExpired) as e: - # bubbles up from token_storage_for_user call - logger.error("[ERROR] _dcf_call {} aborted: {}".format(full_url, str(e))) - raise e - except MissingTokenError as e: - force_dcf_token_expiration(user_id) - logger.warning("[INFO] MissingTokenError seen") - logger.exception(e) - raise TokenFailure() - except InternalTokenError as e: - logger.warning("Internal Token Exception") - logger.exception(e) - raise e - except Exception as e: - force_dcf_token_expiration(user_id) - logger.warning("DCF Exception") - logger.exception(e) - raise DCFCommFailure() - - return resp - - -def _get_secrets(): - """ - Keep hidden info hidden as much as possible - """ - dcf_secrets = _read_dict(settings.DCF_CLIENT_SECRETS) - client_id = dcf_secrets['DCF_CLIENT_ID'] - client_secret = dcf_secrets['DCF_CLIENT_SECRET'] - return client_id, client_secret - - -def _read_dict(my_file_name): - """ - Keep hidden info hidden as much as possible - """ - retval = {} - with open(my_file_name, 'r') as f: - for line in f: - if '=' not in line: - continue - split_line = line.split('=') - retval[split_line[0].strip()] = split_line[1].strip() - return retval - - -class GoogleLinkState: - BOTH_NULL = 1 - DCF_NULL_CGC_NON_NULL = 2 - DCF_BAD_CGC_NULL = 3 - DCF_GOOD_CGC_NULL = 4 - MATCHING_BAD = 5 - MATCHING_OK = 6 - NON_MATCHING_DCF_BAD = 7 - NON_MATCHING_CGC_BAD = 8 - NON_MATCHING_ALL_BAD = 9 - -def _compare_google_ids(dcf_version, cgc_version, user_email): - """ - When we get new tokens from DCF, we want to sanity check if the Google IDs are in agreement. - """ - - if dcf_version != cgc_version: - # Most likely possibility is that either DCF or us thinks the google ID is None and the other doesn't. Another - # possibility is that DCF has another Google ID for the user that is not consistent with the - # one we *want* them to be using. That case *should* have been caught when they first tried to link. - # - # If link IDs do not match, we need match ours to DCF, and flag the problem - if dcf_version is None: - google_match_state = GoogleLinkState.DCF_NULL_CGC_NON_NULL - elif cgc_version is None: - if dcf_version == user_email: - google_match_state = GoogleLinkState.DCF_GOOD_CGC_NULL - else: - google_match_state = GoogleLinkState.DCF_BAD_CGC_NULL - elif dcf_version == user_email: - google_match_state = GoogleLinkState.NON_MATCHING_CGC_BAD # Cannot happen - elif cgc_version == user_email: - google_match_state = GoogleLinkState.NON_MATCHING_DCF_BAD - else: - google_match_state = GoogleLinkState.NON_MATCHING_ALL_BAD # Cannot happen - # Next three cases handle matching GoogleIDs: - elif dcf_version is None: - google_match_state = GoogleLinkState.BOTH_NULL - elif dcf_version == user_email: - google_match_state = GoogleLinkState.MATCHING_OK - elif dcf_version != user_email: - google_match_state = GoogleLinkState.MATCHING_BAD # Cannot happen - - return google_match_state - - -def _refresh_from_dcf(user_id): - """ - Whenever the user hits the user details page, we need to check how the DCF views the world (linkage, expirations, - datasets), and update accordingly. 
- - :raises TokenFailure: - :raises InternalTokenError: - :raises DCFCommFailure: - :raises RefreshTokenExpired: - """ - - user_email = User.objects.get(id=user_id).email - - # - # Haul the user data token string down from DCF: - # - - try: - the_user_token = _get_user_data_token_string(user_id) # the_user_token is a string. - except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure) as e: - raise e - - # - # Things that could be different: Google ID linkage, expiration time, approved datasets. - # Right now, we are not provided with expiration time, so we cannot check that. While NIH linkage - # could change in theory, that is fixed via DCF for the life of a refresh token. User could only change - # that by logging out/disconnecting from DCF and going back in again, which would give us a new refresh - # token. - # - - the_user_dict = _user_data_token_to_user_dict(the_user_token) - dcf_google_link = _get_google_link_from_user_dict(the_user_dict) - nih_id = _get_nih_id_from_user_dict(the_user_dict) - - # - # Compare to our versions: - # - - try: - dcf_token = get_stored_dcf_token(user_id) - except (TokenFailure, InternalTokenError, RefreshTokenExpired) as e: - raise e - - if nih_id.lower() != dcf_token.nih_username_lower: - logger.error("ERROR: UNEXPECTED NIH_USER_ID MISMATCH {} VERSUS {}".format(nih_id.lower(), - dcf_token.nih_username_lower)) - # - # More worrisome is a possible mismatch between - # - - google_match_state = _compare_google_ids(dcf_google_link.lower(), dcf_token.google_id, user_email) - - null_us = False - attach_us = False - detach_dcf = False - all_done = False - if google_match_state == GoogleLinkState.DCF_NULL_CGC_NON_NULL: - null_us = True - elif google_match_state == GoogleLinkState.DCF_GOOD_CGC_NULL: - attach_us = True - elif google_match_state == GoogleLinkState.DCF_BAD_CGC_NULL: - detach_dcf = True - elif google_match_state == GoogleLinkState.NON_MATCHING_CGC_BAD: # Cannot happen - raise Exception() - elif google_match_state == GoogleLinkState.NON_MATCHING_DCF_BAD: - detach_dcf = True - elif google_match_state == GoogleLinkState.NON_MATCHING_ALL_BAD: # Cannot happen - raise Exception() - elif google_match_state == GoogleLinkState.BOTH_NULL: - pass - elif google_match_state == GoogleLinkState.MATCHING_OK: - pass - elif google_match_state == GoogleLinkState.MATCHING_BAD: # Cannot happen - raise Exception() - - if null_us: - try: - _unlink_internally(user_id) - except (TokenFailure, InternalTokenError, Exception) as e: - raise e - elif attach_us: - try: - warning = _finish_the_link(user_id, user_email, use_expiration_time, st_logger, False) - except (TokenFailure, InternalTokenError, Exception) as e: - raise e - all_done = True - elif detach_dcf: - "Tell the user there is a problem" - - - # - # This is exercised when the NIH ID of the user, returned in the ID token is different than the one we - # have in our token database. Don't think this is even possible, since user would need to log in as the - # new NIH ID first... - # - - # - # If everything was consistent, if DCF tells the user is linked to an NIH ID, we would have that ID as one and - # only one linked record in our DB. 
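# (Recap of the dispatch on google_match_state above: DCF_NULL_CGC_NON_NULL unlinks our
#  local record; DCF_GOOD_CGC_NULL finishes the link on our side; DCF_BAD_CGC_NULL and
#  NON_MATCHING_DCF_BAD set detach_dcf, which for now only flags a problem to report to
#  the user; BOTH_NULL and MATCHING_OK need no action; the remaining states are treated
#  as "cannot happen" and raise.)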
- # - - if not all_done: - st_logger = StackDriverLogger.build_from_django_settings() - _, warning = handle_user_db_update_for_dcf_linking(request.user.id, user_data_dict, use_expiration_time, st_logger) - - - - # @login_required # def dcf_link_redo(request): # """ diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index 98e15625..9666ae86 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -38,8 +38,16 @@ from google_helpers.resourcemanager_service import get_special_crm_resource from google_helpers.iam_service import get_iam_resource from dataset_utils.dataset_access_support_factory import DatasetAccessSupportFactory +from dataset_utils.dataset_config import DatasetGoogleGroupPair from google_helpers.pubsub_service import get_pubsub_service, get_full_topic_name +from dcf_support import get_stored_dcf_token, \ + TokenFailure, RefreshTokenExpired, InternalTokenError, DCFCommFailure, \ + GoogleLinkState, \ + get_google_link_from_user_dict, get_projects_from_user_dict, \ + get_nih_id_from_user_dict, user_data_token_to_user_dict, get_user_data_token_string, \ + compare_google_ids + logger = logging.getLogger('main_logger') OPEN_ACL_GOOGLE_GROUP = settings.OPEN_ACL_GOOGLE_GROUP @@ -49,19 +57,6 @@ MANAGED_SERVICE_ACCOUNTS_PATH = settings.MANAGED_SERVICE_ACCOUNTS_PATH -class TokenFailure(Exception): - """Thrown if we don't have our access/refresh tokens (user has disconnected from DCF)""" - -class InternalTokenError(Exception): - """Thrown if we have internal DB consistency errors """ - -class RefreshTokenExpired(Exception): - """Thrown if our refresh token is no longer valid and user must log in """ - - def __init__(self, seconds, token): - self.seconds = seconds - self.token = token - def verify_service_account(gcp_id, service_account, datasets, user_email, is_refresh=False, is_adjust=False, remove_all=False): # Only verify for protected datasets @@ -1331,55 +1326,122 @@ def get_dcf_refresh_key_remaining_seconds(user_id): return remaining_seconds -def get_stored_dcf_token(user_id): +def _resolve_multiple_nih_users(nih_users, user_id): """ - When a user breaks their connection with DCF, we flush out the revoked tokens. But if they have a - session running in another browser, they might still be clicking on links that expect a token. So - we need to be bulletproof on maybe not getting back a token. - - :raises TokenFailure: - :raises InternalTokenError: - :raises RefreshTokenExpired: + Multiple NIH user rows for the current user for the same nih_username. We want the one that is linked. + If more than one (is that possible??) take the one with the most recent usage. If nobody is linked, + again take the one with the most recent usage. Some of these cases should not be possible (?) 
but + trying to be bombproof here """ - dcf_tokens = DCFToken.objects.filter(user_id) - num_tokens = len(dcf_tokens) - if num_tokens != 1: - if num_tokens > 1: - logger.error('[ERROR] Unexpected Server Error: Multiple tokens found for user {}'.format(user_id)) - raise InternalTokenError() + nih_user = None + freshest_linked = None + freshest_linked_stamp = None + freshest_unlinked = None + freshest_unlinked_stamp = None + for user in nih_users: + if user.linked: + if (freshest_linked_stamp is None) or (freshest_linked_stamp < user.NIH_assertion_expiration): + freshest_linked_stamp = user.NIH_assertion_expiration + freshest_linked = user + if nih_user is None: + nih_user = nih_users.first() + else: + logger.error("[ERROR] Multiple linked nih users retrieved nih_user with user_id {}.".format(user_id)) else: - logger.info('[INFO] User {} tried to use a flushed token'.format(user_id)) - raise TokenFailure() + if (freshest_unlinked_stamp is None) or (freshest_unlinked_stamp < user.NIH_assertion_expiration): + freshest_unlinked_stamp = user.NIH_assertion_expiration + freshest_unlinked = user + + if freshest_linked: + nih_user = freshest_linked + elif freshest_unlinked: + nih_user = freshest_unlinked + else: + logger.error("[ERROR] Unexpected lack of nih_user for {}.".format(user_id)) + nih_user = None - dcf_token = dcf_tokens.first() - remaining_seconds = (dcf_token.refresh_expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() - if remaining_seconds <= 60: - # Still make the token available to e.g. drop linkages from DB - raise RefreshTokenExpired(remaining_seconds, dcf_token) + return nih_user - return dcf_token +class RefreshCode: + NO_TOKEN = 1 + TOKEN_EXPIRED = 2 + INTERNAL_ERROR = 3 + DCF_COMMUNICATIONS_ERROR = 4 + NIH_ID_MISMATCH = 5 + NO_GOOGLE_LINK = 6 + GOOGLE_LINK_MISMATCH = 7 + UNEXPECTED_UNLINKED_NIH_USER = 8 + PROJECT_SET_UPDATED = 9 + ALL_MATCHES = 10 -def force_dcf_token_expiration(user_id): - """ - We have seen a case where DCF has rejected our valid refresh token when their server gets rolled. This should not - happen anymore. But if it does, we need to be able to force our token expirations ASAP so as to let the user login - again to get a new token. - :raises InternalTokenError: +def _refresh_from_dcf(user_id, nih_user): + """ + Whenever the user hits the user details page, we need to check how the DCF views the world (linkage, expirations, + datasets). If something is weird, we report it. If not, we make sure the allowed datasets are in sync. """ + + # + # First off, do we even have a token for the user? If we do, has it expired? If either case exists, there is + # nothing we can do. If we are good, haul the data down: + # + try: dcf_token = get_stored_dcf_token(user_id) - except InternalTokenError as e: - raise e - except (TokenFailure, RefreshTokenExpired): - # a no-op - return + the_user_token = get_user_data_token_string(user_id) # the_user_token is a string. + except TokenFailure: + return RefreshCode.NO_TOKEN + except RefreshTokenExpired: + return RefreshCode.TOKEN_EXPIRED + except InternalTokenError: + return RefreshCode.INTERNAL_ERROR + except DCFCommFailure: + raise RefreshCode.DCF_COMMUNICATIONS_ERROR + + # + # Things that could be different: Google ID linkage, expiration time, approved datasets. + # Right now, we are not provided with expiration time, so we cannot check that. While NIH linkage + # could change in theory, that is fixed via DCF for the life of a refresh token. 
User could only change + # that by logging out/disconnecting from DCF and going back in again, which would give us a new refresh + # token. + # - dcf_token.refresh_expires_at = pytz.utc.localize(datetime.datetime.utcnow()) - dcf_token.save() + the_user_dict = user_data_token_to_user_dict(the_user_token) + dcf_google_link = get_google_link_from_user_dict(the_user_dict) + dcf_google_link = dcf_google_link.lower() if dcf_google_link else dcf_google_link + nih_id = get_nih_id_from_user_dict(the_user_dict) + dict_o_projects = get_projects_from_user_dict(the_user_dict) + dcf_projects = set(dict_o_projects.keys()) - return + if nih_id.lower() != dcf_token.nih_username_lower: + logger.error("ERROR: UNEXPECTED NIH_USER_ID MISMATCH {} VERSUS {}".format(nih_id.lower(), + dcf_token.nih_username_lower)) + return RefreshCode.NIH_ID_MISMATCH + + # + # Much more possible is a mismatch in Google link state, though this should not be common: + # + + user_email = User.objects.get(id=user_id).email + google_match_state = compare_google_ids(dcf_google_link, dcf_token.google_id, user_email) + + if google_match_state == GoogleLinkState.BOTH_NULL: + return RefreshCode.NO_GOOGLE_LINK + elif google_match_state != GoogleLinkState.MATCHING_OK: + logger.error("ERROR: GOOGLE ID STATE MISMATCH FOR USER {}: {}".format(user_id, google_match_state)) + return RefreshCode.GOOGLE_LINK_MISMATCH + + if not nih_user: + return RefreshCode.UNEXPECTED_UNLINKED_NIH_USER + + our_user_projects = projects_for_user(user_id) + if our_user_projects != dcf_projects: + st_logger = StackDriverLogger.build_from_django_settings() + refresh_user_projects(nih_user, user_email, dcf_projects, st_logger) + return RefreshCode.PROJECT_SET_UPDATED + + return RefreshCode.ALL_MATCHES def get_nih_user_details(user_id, force_logout): @@ -1391,78 +1453,78 @@ def get_nih_user_details(user_id, force_logout): user_details = {} if settings.DCF_TEST: - # FIXME: Check in with DCF for info, throw DCFCommError if we have problems - # FIXME: If refresh token is expired, we cannot show any info until they log back in! + + # + # If we have detected that the user has logged into DCF with a different NIH username than what we think, + # nothing else matters. We tell them to log out. + # if force_logout: + user_details['error_state'] = None + user_details['dcf_comm_error'] = False user_details['force_DCF_logout'] = True user_details['NIH_username'] = force_logout return user_details # - # Now with DCF, we can have a user logged in as an NIH user, but not be linked (which means DCF does not - # have an association between NIH ID and Google ID). So while we previously did a get on a linked user, - # now we need to filter. If one of the users is linked, that is who we use. Otherwise, we can resolve the - # issue by looking at the current DCF token attached to the user to see who they are associated with. - # + # Otherwise, ask the DCF for current user info, + # FIXME: Check in with DCF for info, throw DCFCommError if we have problems + # FIXME: If refresh token is expired, we cannot show any info until they log back in! + user_details['force_DCF_logout'] = False user_details['refresh_required'] = False - try: - dcf_token = get_stored_dcf_token(user_id) - except TokenFailure: - return user_details # i.e. empty dict - except InternalTokenError: - return user_details # i.e. 
empty dict - except RefreshTokenExpired: - user_details['refresh_required'] = True - return user_details + user_details['refresh_key_ok'] = True + user_details['error_state'] = None + user_details['dcf_comm_error'] = False + user_details['link_mismatch'] = False + user_details['data_sets_updated'] = False - curr_user = User.objects.get(id=user_id) + nih_users = NIH_User.objects.filter(user_id=user_id, linked=True) - nih_users = NIH_User.objects.filter(user_id=user_id, NIH_username__iexact=dcf_token.nih_username) + nih_user = nih_users.first() if len(nih_users) == 1 else None - if len(nih_users) == 0: - user_details['link_mismatch'] = (dcf_token.google_id is not None) and (dcf_token.google_id != curr_user.email) - return user_details # i.e. empty dict - - elif len(nih_users) == 1: - nih_user = nih_users.first() + match_state = _refresh_from_dcf(user_id, nih_user) + if match_state == RefreshCode.NO_TOKEN: + user_details['NIH_username'] = None + return user_details + elif match_state == RefreshCode.TOKEN_EXPIRED: + user_details['refresh_required'] = True + return user_details + elif match_state == RefreshCode.INTERNAL_ERROR: + user_details['error_state'] = 'Internal error encountered syncing with Data Commons' + return user_details + elif match_state == RefreshCode.DCF_COMMUNICATIONS_ERROR: + user_details['dcf_comm_error'] = True + return user_details + elif match_state == RefreshCode.NO_GOOGLE_LINK: + user_details['refresh_key_ok'] = False + return user_details + elif match_state == RefreshCode.GOOGLE_LINK_MISMATCH: + # If they have a bad Google ID linked at DCF, we force them to login again, which eventually + # tells them they need to switch it. + user_details['link_mismatch'] = True + elif match_state == RefreshCode.UNEXPECTED_UNLINKED_NIH_USER: + # Should not happen. Force a complete logout + user_details['NIH_username'] = None + return user_details + elif match_state == RefreshCode.PROJECT_SET_UPDATED: + user_details['data_sets_updated'] = True + elif match_state == RefreshCode.ALL_MATCHES: + pass else: - # - # Multiple NIH user rows for the current user for the same nih_username. We want the one that is linked. - # If more than one (is that possible??) take the one with the most recent usage. If nobody is linked, - # again take the one with the most recent usage. Some of these cases should not be possible (?) but - # trying to be bombproof here: - # - nih_user = None - freshest_linked = None - freshest_linked_stamp = None - freshest_unlinked = None - freshest_unlinked_stamp = None - for user in nih_users: - if user.linked: - if (freshest_linked_stamp is None) or (freshest_linked_stamp < user.NIH_assertion_expiration): - freshest_linked_stamp = user.NIH_assertion_expiration - freshest_linked = user - if nih_user is None: - nih_user = nih_users.first() - else: - logger.error("[ERROR] Multiple linked nih users retrieved nih_user with user_id {}.".format(user_id)) - else: - if (freshest_unlinked_stamp is None) or (freshest_unlinked_stamp < user.NIH_assertion_expiration): - freshest_unlinked_stamp = user.NIH_assertion_expiration - freshest_unlinked = user - - if freshest_linked: - nih_user = freshest_linked - elif freshest_unlinked: - nih_user = freshest_unlinked - else: - logger.error("[ERROR] Unexpected lack of nih_user for {}.".format(user_id)) - # FIXME: Second condition can no longer happen: - user_details['link_mismatch'] = (dcf_token.google_id is not None) and (dcf_token.google_id != curr_user.email) - return user_details # i.e. 
empty dict + user_details['error_state'] = 'Internal error encountered syncing with Data Commons' + return user_details + + # + # Now with DCF, we can have a user logged in as an NIH user, but not be linked (which means DCF does not + # have an association between NIH ID and Google ID). But if the user has not made that link at DCF, + # we treat them as unlinked. We are still only interested in fully linked NIH users! + # + + if not nih_user: # Extracted above + user_details['NIH_username'] = None + return user_details # # With the user_details page, we now need to check with DCF about current status before we display information @@ -1485,7 +1547,8 @@ def get_nih_user_details(user_id, force_logout): nih_user.active = False nih_user.NIH_assertion_expiration = now_time nih_user.save() - else: + + else: # Old non-DCF code: try: nih_user = NIH_User.objects.get(user_id=user_id, linked=True) except MultipleObjectsReturned as e: @@ -1505,18 +1568,30 @@ def get_nih_user_details(user_id, force_logout): user_details['NIH_active'] = nih_user.active user_details['auth_datasets'] = [] if len(user_auth_datasets) <= 0 else AuthorizedDataset.objects.filter(id__in=user_auth_datasets.values_list('authorized_dataset',flat=True)) - if settings.DCF_TEST: - # FIXME: Second condition can no longer happen: - user_details['link_mismatch'] = (dcf_token.google_id is None) or (dcf_token.google_id != curr_user.email) - try: - user_details['refresh_key_ok'] = get_dcf_refresh_key_remaining_seconds(user_id) > settings.DCF_TOKEN_REFRESH_WINDOW_SECONDS - except InternalTokenError: - return {} - user_details['force_DCF_logout'] = False - return user_details +def projects_for_user(user_id): + + retval = set() + try: + nih_user = NIH_User.objects.get(user_id=user_id, linked=True) + except MultipleObjectsReturned as e: + logger.warn("Multiple objects when retrieving nih_user with user_id {}. {}".format(str(user_id), str(e))) + return retval + except ObjectDoesNotExist as e: + logger.warn("No objects when retrieving nih_user with user_id {}. {}".format(str(user_id), str(e))) + return retval + + user_auth_datasets = AuthorizedDataset.objects.filter( + id__in=UserAuthorizedDatasets.objects.filter(nih_user=nih_user).values_list('authorized_dataset', flat=True)) + + for dataset in user_auth_datasets: + retval.add(dataset.whitelist_id) + + return retval + + def verify_user_is_in_gcp(user_id, gcp_id): user_in_gcp = False user_email = None @@ -1547,3 +1622,24 @@ def verify_user_is_in_gcp(user_id, gcp_id): user_in_gcp = False return user_in_gcp + + +def refresh_user_projects(nih_user, user_email, project_keys, st_logger): + """ + Bring our database in line with the projects that DCF tells us they are good for. 
+ """ + + authorized_datasets = [] + for project in project_keys: + adqs = AuthorizedDataset.objects.filter(whitelist_id=project) + if len(adqs) == 1: + authorized_datasets.append(DatasetGoogleGroupPair(project, adqs.first().acl_google_group)) + + das = DatasetAccessSupportFactory.from_webapp_django_settings() + all_datasets = das.get_all_datasets_and_google_groups() + + for dataset in all_datasets: + handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, False, None, None, st_logger) + + return + From 9c209d0c5aa05bb969b4ad786e739fd3c1ace69c Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Tue, 3 Jul 2018 18:53:01 -0700 Subject: [PATCH 33/76] Improved error handling --- accounts/dcf_support.py | 8 +++--- accounts/dcf_views.py | 62 +++++++++++++++++++++++++++++++++++------ accounts/sa_utils.py | 61 ---------------------------------------- 3 files changed, 57 insertions(+), 74 deletions(-) diff --git a/accounts/dcf_support.py b/accounts/dcf_support.py index b31710ba..43ef3cc1 100755 --- a/accounts/dcf_support.py +++ b/accounts/dcf_support.py @@ -466,6 +466,7 @@ def _access_token_storage(token_dict, cgc_uid): refresh token to get a new access key. :raises TokenFailure: + :raises InternalTokenError: :raises RefreshTokenExpired: """ @@ -487,7 +488,7 @@ def _access_token_storage(token_dict, cgc_uid): try: dcf_token = get_stored_dcf_token(cgc_uid) - except (TokenFailure, RefreshTokenExpired) as e: + except (TokenFailure, InternalTokenError, RefreshTokenExpired) as e: logger.error("[INFO] _access_token_storage aborted: {}".format(str(e))) raise e @@ -574,6 +575,7 @@ def token_storage_for_user(my_token_dict): logger.exception(e) raise TokenFailure() except InternalTokenError as e: + # bubbles up from token_storage_for_user call logger.warning("Internal Token Exception") logger.exception(e) raise e @@ -793,9 +795,7 @@ def compare_google_ids(dcf_version, cgc_version, user_email): """ When we get new tokens from DCF, we want to sanity check if the Google IDs are in agreement. """ - print dcf_version - print cgc_version - print user_email + if dcf_version != cgc_version: # Most likely possibility is that either DCF or us thinks the google ID is None and the other doesn't. Another # possibility is that DCF has another Google ID for the user that is not consistent with the diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 90c0e746..c9866595 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -452,6 +452,8 @@ def dcf_link_callback(request): """ dcf_err_msg = "DCF reported an error {} logging in. Please contact the ISB-CGC administrator." + internal_err_msg = "There was an internal error {} logging in. Please contact the ISB-CGC administrator." + comm_err_msg = "There was a communications problem contacting Data Commons Framework." # # If there was an error, return that: Also, we now need to equip all callbacks to report @@ -493,9 +495,20 @@ def dcf_link_callback(request): # so we need to keep deleting the linkage at DCF! # + err_msg = None try: the_user_token_string = get_user_data_token_string(request.user.id) # a string. 
- except (TokenFailure, InternalTokenError, DCFCommFailure, RefreshTokenExpired): + except TokenFailure: + err_msg = internal_err_msg.format("0060") + except InternalTokenError: + err_msg = internal_err_msg.format("0061") + except RefreshTokenExpired: + err_msg = internal_err_msg.format("0062") + except DCFCommFailure: + err_msg = comm_err_msg + + if err_msg: + messages.error(request, err_msg) return redirect(reverse('user_detail', args=[request.user.id])) the_user_token_dict = json_loads(the_user_token_string) @@ -522,10 +535,22 @@ def dcf_link_callback(request): # No match? Not acceptable. Send user back to details page. The empty google ID in our table will # mean the page shows an option to try again. We need to # + if google_link != req_user.email: + err_msg = None try: unlink_at_dcf(request.user.id, True) # True means saved token is now updated with unlinked state - except (TokenFailure, InternalTokenError, DCFCommFailure, RefreshTokenExpired): + except TokenFailure: + err_msg = internal_err_msg.format("0064") + except InternalTokenError: + err_msg = internal_err_msg.format("0065") + except RefreshTokenExpired: + err_msg = internal_err_msg.format("0066") + except DCFCommFailure: + err_msg = comm_err_msg + + if err_msg: + messages.error(request, err_msg) return redirect(reverse('user_detail', args=[request.user.id])) message = "You must use your ISB-CGC login email ({}) to link with the DCF instead of {}".format( @@ -542,8 +567,11 @@ def dcf_link_callback(request): st_logger = StackDriverLogger.build_from_django_settings() # Don't hit DCF again, we just did it (thus False): warning = _finish_the_link(request.user.id, google_link, use_expiration_time, st_logger, False) - except (TokenFailure, RefreshTokenExpired): - messages.warning(request, "say something witty here...") + except TokenFailure: + messages.error(request, "There was an internal error {} logging in. Please contact the ISB-CGC administrator.".format("0067")) + return redirect(reverse('user_detail', args=[request.user.id])) + except RefreshTokenExpired: + messages.error(request, "There was an internal error {} logging in. Please contact the ISB-CGC administrator.".format("0068")) return redirect(reverse('user_detail', args=[request.user.id])) if warning: @@ -713,12 +741,18 @@ def dcf_disconnect_user(request): # they would have to login in order to disconnect! # + err_msg = None try: dcf_token = get_stored_dcf_token(request.user.id) - except (TokenFailure, InternalTokenError): - return redirect(reverse('user_detail', args=[request.user.id])) + except TokenFailure: + err_msg = "There was an internal error {} logging in. Please contact the ISB-CGC administrator.".format("0069") + except InternalTokenError: + err_msg = "There was an internal error {} logging in. Please contact the ISB-CGC administrator.".format("0070") except RefreshTokenExpired: - messages.warning(request, "You will need to first login to the Data Commons again to disconnect your Google ID") + err_msg = "You will need to first login to the Data Commons again to disconnect your Google ID" + + if err_msg: + messages.error(request, err_msg) return redirect(reverse('user_detail', args=[request.user.id])) # @@ -726,10 +760,20 @@ def dcf_disconnect_user(request): # is no link when we try to do it, we ignore that fact: # + err_msg = None try: unlink_at_dcf(request.user.id, False) # Don't refresh, we are about to drop the record... 
- except (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure): - messages.warning(request, "Problems encountered unlinking Google ID at Data Commons. Please contact ISB-CGC Administrator") + except TokenFailure: + err_msg = "There was an internal error {} logging in. Please contact the ISB-CGC administrator.".format("0071") + except InternalTokenError: + err_msg = "There was an internal error {} logging in. Please contact the ISB-CGC administrator.".format("0072") + except RefreshTokenExpired: + err_msg = "There was an internal error {} logging in. Please contact the ISB-CGC administrator.".format("0073") + except DCFCommFailure: + err_msg = "There was a communications problem contacting Data Commons Framework." + + if err_msg: + messages.warning(request, err_msg) return redirect(reverse('user_detail', args=[request.user.id])) # diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index 9666ae86..ad63558b 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -1302,67 +1302,6 @@ def deactivate_nih_add_to_open(user_id, user_email): logger.info(e) -def get_dcf_refresh_key_remaining_seconds(user_id): - """ - We need to know how many seconds are left before the user needs to log back in to NIH to get - a new refresh token, which will expire every 30 days. - - :raises InternalTokenError: - """ - - try: - dcf_token = get_stored_dcf_token(user_id) - except InternalTokenError as e: - raise e - except TokenFailure: - return -1 # ? No token? They expire immediately! - except RefreshTokenExpired as e: - return e.seconds - - remaining_seconds = (dcf_token.refresh_expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() - logger.info('[INFO] user {} has {} seconds remaining on refresh token'. - format(dcf_token.nih_username, remaining_seconds)) - - return remaining_seconds - - -def _resolve_multiple_nih_users(nih_users, user_id): - """ - Multiple NIH user rows for the current user for the same nih_username. We want the one that is linked. - If more than one (is that possible??) take the one with the most recent usage. If nobody is linked, - again take the one with the most recent usage. Some of these cases should not be possible (?) 
but - trying to be bombproof here - """ - nih_user = None - freshest_linked = None - freshest_linked_stamp = None - freshest_unlinked = None - freshest_unlinked_stamp = None - for user in nih_users: - if user.linked: - if (freshest_linked_stamp is None) or (freshest_linked_stamp < user.NIH_assertion_expiration): - freshest_linked_stamp = user.NIH_assertion_expiration - freshest_linked = user - if nih_user is None: - nih_user = nih_users.first() - else: - logger.error("[ERROR] Multiple linked nih users retrieved nih_user with user_id {}.".format(user_id)) - else: - if (freshest_unlinked_stamp is None) or (freshest_unlinked_stamp < user.NIH_assertion_expiration): - freshest_unlinked_stamp = user.NIH_assertion_expiration - freshest_unlinked = user - - if freshest_linked: - nih_user = freshest_linked - elif freshest_unlinked: - nih_user = freshest_unlinked - else: - logger.error("[ERROR] Unexpected lack of nih_user for {}.".format(user_id)) - nih_user = None - - return nih_user - - class RefreshCode: NO_TOKEN = 1 TOKEN_EXPIRED = 2 From 62ea20008d1ff47601cab29fb86b9d9ea9f8bf7f Mon Sep 17 00:00:00 2001 From: elainelee Date: Tue, 3 Jul 2018 22:03:17 -0700 Subject: [PATCH 34/76] File Browser: Filter by Case Barcode - adding Filter Panel to Radiology tab --- cohorts/metadata_counting.py | 25 +++++++++++------- cohorts/metadata_helpers.py | 12 +++++---- cohorts/views.py | 49 +++++++++++++++++------------------- 3 files changed, 46 insertions(+), 40 deletions(-) diff --git a/cohorts/metadata_counting.py b/cohorts/metadata_counting.py index 894546ee..dd1070f9 100644 --- a/cohorts/metadata_counting.py +++ b/cohorts/metadata_counting.py @@ -166,7 +166,7 @@ def count_user_metadata(user, inc_filters=None, cohort_id=None): if db and db.open: db.close() -def count_public_data_type(user, data_query, inc_filters, program_list, filter_format=False, build='HG19'): +def count_public_data_type(user, data_query, inc_filters, program_list, filter_format=False, build='HG19', type='None'): db = None cursor = None @@ -175,18 +175,17 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f QUERY_BASE = """ SELECT {attr}, COUNT(*) AS count FROM ({data_query_clause}) AS qc - WHERE 1 {where_clause} + WHERE TRUE {where_clause} GROUP BY {attr}; """ filter_clauses = {} - try: db = get_sql_connection() cursor = db.cursor() - metadata_data_attr = fetch_build_data_attr(build) + metadata_data_attr = fetch_build_data_attr(build, type) # Make our where clauses for filter in inc_filters: @@ -210,11 +209,19 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f filter_clause = filter_clause.format(*[y for x in filter_clauses for y in filter_clauses[x]['parameters'] if x != attr or (filter_format and attr == 'data_format')]) where_clause = "AND ( {} )".format(filter_clause) query = QUERY_BASE.format(data_query_clause=data_query, where_clause=where_clause, attr=attr) - cursor.execute(query) - for row in cursor.fetchall(): - val = "None" if not row[0] else row[0] - counts[attr][val] = row[1] - + if type == 'dicom': + results = BigQuerySupport.execute_query_and_fetch_results(query) + else: + cursor.execute(query) + results = cursor.fetchall() + for row in results: + if type == 'dicom': + val = row['f'][0]['v'] + cnt = int(row['f'][1]['v']) + else: + val = "None" if not row[0] else row[0] + cnt = row[1] + counts[attr][val] = cnt return counts except Exception as e: diff --git a/cohorts/metadata_helpers.py b/cohorts/metadata_helpers.py index 1efcb3e6..f5b2a6f1 100644 --- 
a/cohorts/metadata_helpers.py +++ b/cohorts/metadata_helpers.py @@ -223,18 +223,20 @@ def get_sql_connection(): if db and db.open: db.close() -def fetch_build_data_attr(build): +def fetch_build_data_attr(build, type='None'): db = None cursor = None # Our methods and templates use HG and not hg casing; try to be consistent build = build.upper() - # TODO: make this progrmmatic - metadata_data_attrs = ['data_type', 'data_category','experimental_strategy','data_format','platform', 'disease_code',] - + if type == 'dicom' : + metadata_data_attrs = ['disease_code', ] + else: + metadata_data_attrs = ['data_type', 'data_category','experimental_strategy','data_format','platform', 'disease_code',] try: - + if len(METADATA_DATA_ATTR[build]) != len(metadata_data_attrs): + METADATA_DATA_ATTR[build]={} if not len(METADATA_DATA_ATTR[build]): db = get_sql_connection() cursor = db.cursor() diff --git a/cohorts/views.py b/cohorts/views.py index 9f00d657..979fea63 100755 --- a/cohorts/views.py +++ b/cohorts/views.py @@ -1602,15 +1602,14 @@ def cohort_filelist(request, cohort_id=0, panel_type=None): if debug: logger.debug('Called '+sys._getframe().f_code.co_name) template = 'cohorts/cohort_filelist{}.html'.format("_{}".format(panel_type) if panel_type else "") - if cohort_id == 0: messages.error(request, 'Cohort requested does not exist.') return redirect('/user_landing') try: metadata_data_attr_builds = { - 'HG19': fetch_build_data_attr('HG19'), - 'HG38': fetch_build_data_attr('HG38') + 'HG19': fetch_build_data_attr('HG19', panel_type), + 'HG38': fetch_build_data_attr('HG38', panel_type) } build = request.GET.get('build', 'HG19') @@ -1623,7 +1622,6 @@ def cohort_filelist(request, cohort_id=0, panel_type=None): if panel_type: items = cohort_files(request, cohort_id, build=build, access=has_access, type=panel_type) - for attr in items['metadata_data_counts']: for val in items['metadata_data_counts'][attr]: metadata_data_attr[attr]['values'][val]['count'] = items['metadata_data_counts'][attr][val] @@ -2067,9 +2065,8 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co filter_counts = None file_list = [] total_file_count = 0 - case_barcode = request.GET.get('case_barcode', '') - #case_barcode_condition = '' if not case_barcode else "AND cs.case_barcode ='" + case_barcode + "'" - case_barcode_condition = '' if not case_barcode else "AND cs.case_barcode like '%" + case_barcode + "%'" + case_barcode = request.GET.get('case_barcode', '').lower() + case_barcode_condition = '' if not case_barcode else "AND LOWER(cs.case_barcode) like '%" + case_barcode + "%'" try: # Attempt to get the cohort perms - this will cause an excpetion if we don't have them Cohort_Perms.objects.get(cohort_id=cohort_id, user_id=user_id) @@ -2077,7 +2074,6 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co if type == 'dicom': filter_counts = {} - limit_clause = "" offset_clause = "" @@ -2086,6 +2082,12 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co bq_cohort_project_id = settings.BIGQUERY_PROJECT_NAME data_project = settings.BIGQUERY_DATA_PROJECT_NAME + filter_conditions = '' + if len(inc_filters): + built_clause = build_where_clause(inc_filters, for_files=True) + filter_conditions = 'AND ' + 'bc.'+built_clause['query_str'] + filter_conditions = filter_conditions.replace("%s", "'{}'").format(*built_clause['value_tuple']) + file_list_query_base = """ SELECT cs.case_barcode, ds.StudyInstanceUID, ds.StudyDescription, bc.disease_code, 
bc.project_short_name FROM [{cohort_project}:{cohort_dataset}.{cohort_table}] cs @@ -2093,12 +2095,12 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co ON cs.case_barcode = ds.PatientID JOIN [{data_project}:{tcga_bioclin_dataset}.{tcga_clin_table}] bc ON bc.case_barcode=cs.case_barcode - WHERE cs.cohort_id = {cohort} {case_barcode_condition} + WHERE cs.cohort_id = {cohort} {filter_conditions} {case_barcode_condition} GROUP BY cs.case_barcode, ds.StudyInstanceUID, ds.StudyDescription, bc.disease_code, bc.project_short_name """.format(cohort_dataset=bq_cohort_dataset, cohort_project=bq_cohort_project_id, cohort_table=bq_cohort_table, data_project=data_project, dcf_data_table="TCGA_radiology_images", tcga_img_dataset="metadata", - tcga_bioclin_dataset="TCGA_bioclin_v0", tcga_clin_table="Clinical", cohort=cohort_id, case_barcode_condition=case_barcode_condition) + tcga_bioclin_dataset="TCGA_bioclin_v0", tcga_clin_table="Clinical", cohort=cohort_id, filter_conditions=filter_conditions, case_barcode_condition=case_barcode_condition) file_list_query = """ {select_clause} @@ -2127,26 +2129,31 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co } if limit > 0: - #limit_clause = ' LIMIT %s' % str(limit) limit_clause = ' LIMIT {}'.format(str(limit)) # Offset is only valid when there is a limit if offset > 0: - #offset_clause = ' OFFSET %s' % str(offset) offset_clause = ' OFFSET {}'.format(str(offset)) - order_clause = "ORDER BY " + col_map[sort_column] + (" DESC" if sort_order == 1 else "") + + order_clause = "ORDER BY " + col_map[sort_column] + (" DESC" if sort_order == 1 else "") + counts = {} if do_filter_count: # Query the count start = time.time() results = BigQuerySupport.execute_query_and_fetch_results(file_count_query.format(select_clause=file_list_query_base)) + print("*****query") + print(file_count_query.format(select_clause=file_list_query_base)) stop = time.time() logger.debug('[BENCHMARKING] Time to query BQ for dicom count: ' + (stop - start).__str__()) for entry in results: total_file_count = int(entry['f'][0]['v']) - + cohort_programs = Cohort.objects.get(id=cohort_id).get_programs() + counts = count_public_data_type(request.user, file_list_query_base, + inc_filters, cohort_programs, (type is not None and type != 'all'), + build, type) # Query the file list only if there was anything to find - if (total_file_count and do_filter_count) or not do_filter_count: + if total_file_count >0 and do_filter_count or not do_filter_count: start = time.time() results = BigQuerySupport.execute_query_and_fetch_results( file_list_query.format( @@ -2155,9 +2162,7 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co ) ) stop = time.time() - logger.debug('[BENCHMARKING] Time to query BQ for dicom data: ' + (stop - start).__str__()) - if len(results) > 0: for entry in results: file_list.append({ @@ -2168,7 +2173,7 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co 'project_short_name': entry['f'][4]['v'], 'program': "TCGA" }) - + filter_counts = counts else: select_clause_base = """ SELECT md.sample_barcode, md.case_barcode, md.disease_code, md.file_name, md.file_name_key, @@ -2213,7 +2218,6 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co cursor = db.cursor(MySQLdb.cursors.DictCursor) cohort_programs = Cohort.objects.get(id=cohort_id).get_programs() - #params = () select_clause = '' count_select_clause = '' first_program = True @@ -2229,7 
+2233,6 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co built_clause = build_where_clause(inc_filters, for_files=True) filter_conditions = 'AND ' + built_clause['query_str'] filter_conditions = filter_conditions.replace("%s", "'{}'").format(*built_clause['value_tuple']) - union_template = (" UNION " if not first_program else "") + "(" + select_clause_base + ")" select_clause += union_template.format( cohort_id=cohort_id, @@ -2250,21 +2253,15 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co if not first_program: if limit > 0: - #limit_clause = ' LIMIT %s' % str(limit) limit_clause = ' LIMIT {}'.format(str(limit)) # Offset is only valid when there is a limit if offset > 0: - #offset_clause = ' OFFSET %s' % str(offset) offset_clause = ' OFFSET {}'.format(str(offset)) order_clause = "ORDER BY "+col_map[sort_column]+(" DESC" if sort_order == 1 else "") start = time.time() query = file_list_query.format(select_clause=select_clause, order_clause=order_clause, limit_clause=limit_clause, offset_clause=offset_clause) - #final_query = query % params - #cursor.execute(query, params) - #print(final_query) - #print(params) cursor.execute(query) stop = time.time() logger.info("[STATUS] Time to get file-list: {}s".format(str(stop - start))) From 377999055618b2ea55618409775f55d0839a52b4 Mon Sep 17 00:00:00 2001 From: elainelee Date: Thu, 5 Jul 2018 10:11:28 -0700 Subject: [PATCH 35/76] bug fix --- cohorts/metadata_counting.py | 1 - cohorts/views.py | 30 +++++++++++++++++++++--------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/cohorts/metadata_counting.py b/cohorts/metadata_counting.py index dd1070f9..75abcdcc 100644 --- a/cohorts/metadata_counting.py +++ b/cohorts/metadata_counting.py @@ -178,7 +178,6 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f WHERE TRUE {where_clause} GROUP BY {attr}; """ - filter_clauses = {} try: diff --git a/cohorts/views.py b/cohorts/views.py index 979fea63..8dcf27c6 100755 --- a/cohorts/views.py +++ b/cohorts/views.py @@ -2097,10 +2097,24 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co ON bc.case_barcode=cs.case_barcode WHERE cs.cohort_id = {cohort} {filter_conditions} {case_barcode_condition} GROUP BY cs.case_barcode, ds.StudyInstanceUID, ds.StudyDescription, bc.disease_code, bc.project_short_name - """.format(cohort_dataset=bq_cohort_dataset, - cohort_project=bq_cohort_project_id, cohort_table=bq_cohort_table, - data_project=data_project, dcf_data_table="TCGA_radiology_images", tcga_img_dataset="metadata", - tcga_bioclin_dataset="TCGA_bioclin_v0", tcga_clin_table="Clinical", cohort=cohort_id, filter_conditions=filter_conditions, case_barcode_condition=case_barcode_condition) + """ + file_list_query_formatted = file_list_query_base.format(cohort_dataset=bq_cohort_dataset, + cohort_project=bq_cohort_project_id, cohort_table=bq_cohort_table, + data_project=data_project, dcf_data_table="TCGA_radiology_images", + tcga_img_dataset="metadata", + tcga_bioclin_dataset="TCGA_bioclin_v0", tcga_clin_table="Clinical", cohort=cohort_id, + filter_conditions=filter_conditions, case_barcode_condition=case_barcode_condition) + + file_list_query_filter_count_formatted = file_list_query_base.format(cohort_dataset=bq_cohort_dataset, + cohort_project=bq_cohort_project_id, + cohort_table=bq_cohort_table, + data_project=data_project, + dcf_data_table="TCGA_radiology_images", + tcga_img_dataset="metadata", + 
tcga_bioclin_dataset="TCGA_bioclin_v0", + tcga_clin_table="Clinical", cohort=cohort_id, + filter_conditions="", + case_barcode_condition=case_barcode_condition) file_list_query = """ {select_clause} @@ -2141,15 +2155,13 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co if do_filter_count: # Query the count start = time.time() - results = BigQuerySupport.execute_query_and_fetch_results(file_count_query.format(select_clause=file_list_query_base)) - print("*****query") - print(file_count_query.format(select_clause=file_list_query_base)) + results = BigQuerySupport.execute_query_and_fetch_results(file_count_query.format(select_clause=file_list_query_formatted)) stop = time.time() logger.debug('[BENCHMARKING] Time to query BQ for dicom count: ' + (stop - start).__str__()) for entry in results: total_file_count = int(entry['f'][0]['v']) cohort_programs = Cohort.objects.get(id=cohort_id).get_programs() - counts = count_public_data_type(request.user, file_list_query_base, + counts = count_public_data_type(request.user, file_list_query_filter_count_formatted, inc_filters, cohort_programs, (type is not None and type != 'all'), build, type) # Query the file list only if there was anything to find @@ -2157,7 +2169,7 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co start = time.time() results = BigQuerySupport.execute_query_and_fetch_results( file_list_query.format( - select_clause=file_list_query_base, order_clause=order_clause, limit_clause=limit_clause, + select_clause=file_list_query_formatted, order_clause=order_clause, limit_clause=limit_clause, offset_clause=offset_clause ) ) From 408045fc5bd350c239b102a8c8469dab935dd121 Mon Sep 17 00:00:00 2001 From: "S. Paquette" Date: Thu, 5 Jul 2018 17:06:47 -0700 Subject: [PATCH 36/76] -> Add a CloudSQL service discovery builder --- google_helpers/cloudsql_service.py | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 google_helpers/cloudsql_service.py diff --git a/google_helpers/cloudsql_service.py b/google_helpers/cloudsql_service.py new file mode 100644 index 00000000..35fa6fc1 --- /dev/null +++ b/google_helpers/cloudsql_service.py @@ -0,0 +1,37 @@ +""" + +Copyright 2015-2018, Institute for Systems Biology + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +""" + +from oauth2client.client import GoogleCredentials +from django.conf import settings +import httplib2 +from .utils import build_with_retries + + +def get_sql_resource(): + + CLOUDSQL_SCOPES = [ + 'https://www.googleapis.com/auth/cloud-platform', + 'https://www.googleapis.com/auth/sqlservice.admin' + ] + + credentials = GoogleCredentials.from_stream( + settings.GOOGLE_APPLICATION_CREDENTIALS).create_scoped(CLOUDSQL_SCOPES) + http = httplib2.Http() + http = credentials.authorize(http) + service = build_with_retries('sqladmin', 'v1beta4', None, 2, http=http) + return service From 03c2a5e28134c69be8301442b85799135697cfab Mon Sep 17 00:00:00 2001 From: "S. 
Paquette" Date: Thu, 5 Jul 2018 19:45:20 -0700 Subject: [PATCH 37/76] -> Parameterized queries for cohort file listing --- cohorts/metadata_counting.py | 33 +++++++++++++++++++-------------- cohorts/views.py | 24 ++++++++++++++++++++---- 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/cohorts/metadata_counting.py b/cohorts/metadata_counting.py index 75abcdcc..c1b539be 100644 --- a/cohorts/metadata_counting.py +++ b/cohorts/metadata_counting.py @@ -186,6 +186,15 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f metadata_data_attr = fetch_build_data_attr(build, type) + case_barcode = None + + # Pull out the case barcode filter, if there is one + if 'case_barcode' in inc_filters: + case_barcode = inc_filters['case_barcode'] + del inc_filters['case_barcode'] + + case_barcode_condition = '' if not case_barcode else "AND LOWER(cs.case_barcode) like %s" + # Make our where clauses for filter in inc_filters: for prog in program_list: @@ -197,7 +206,7 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f subfilter[filter] = inc_filters[filter] built_clause = build_where_clause(subfilter, for_files=True) - filter_clauses[filter]['where_clause'] = built_clause['query_str'].replace("%s", "'{}'") + filter_clauses[filter]['where_clause'] = built_clause['query_str'] filter_clauses[filter]['parameters'] = built_clause['value_tuple'] for attr in metadata_data_attr: @@ -205,21 +214,17 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f where_clause = "" filter_clause = ') AND ('.join([filter_clauses[x]['where_clause'] for x in filter_clauses if x != attr or (filter_format and attr == 'data_format')]) if len(filter_clause): - filter_clause = filter_clause.format(*[y for x in filter_clauses for y in filter_clauses[x]['parameters'] if x != attr or (filter_format and attr == 'data_format')]) where_clause = "AND ( {} )".format(filter_clause) - query = QUERY_BASE.format(data_query_clause=data_query, where_clause=where_clause, attr=attr) - if type == 'dicom': - results = BigQuerySupport.execute_query_and_fetch_results(query) - else: - cursor.execute(query) - results = cursor.fetchall() + paramter_tuple = tuple(y for x in filter_clauses for y in filter_clauses[x]['parameters'] if + x != attr or (filter_format and attr == 'data_format')) + if case_barcode: + paramter_tuple += (case_barcode, ) + query = QUERY_BASE.format(data_query_clause=data_query, where_clause=where_clause, attr=attr, case_barcode_condition=case_barcode_condition) + cursor.execute(query, paramter_tuple) + results = cursor.fetchall() for row in results: - if type == 'dicom': - val = row['f'][0]['v'] - cnt = int(row['f'][1]['v']) - else: - val = "None" if not row[0] else row[0] - cnt = row[1] + val = "None" if not row[0] else row[0] + cnt = row[1] counts[attr][val] = cnt return counts diff --git a/cohorts/views.py b/cohorts/views.py index 65a4c78d..7cb23d12 100755 --- a/cohorts/views.py +++ b/cohorts/views.py @@ -2049,6 +2049,8 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co inc_filters = json.loads(request.GET.get('filters', '{}')) if request.GET else json.loads(request.POST.get('filters', '{}')) + logger.debug("inc_filters: {}".format(inc_filters)) + user = request.user user_email = user.email user_id = user.id @@ -2064,8 +2066,11 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co filter_counts = None file_list = [] total_file_count = 0 - case_barcode = 
request.GET.get('case_barcode', '').lower() - case_barcode_condition = '' if not case_barcode else "AND LOWER(cs.case_barcode) like '%" + case_barcode + "%'" + case_barcode = request.GET.get('case_barcode', None) + case_barcode_condition = '' + if case_barcode: + case_barcode_condition = "AND LOWER(cs.case_barcode) like %s" + case_barcode = "%{}%".format(case_barcode) try: # Attempt to get the cohort perms - this will cause an excpetion if we don't have them Cohort_Perms.objects.get(cohort_id=cohort_id, user_id=user_id) @@ -2232,6 +2237,8 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co select_clause = '' count_select_clause = '' first_program = True + count_params = () + filelist_params = () for program in cohort_programs: program_data_tables = Public_Data_Tables.objects.filter(program=program, build=build) if len(program_data_tables) <= 0: @@ -2243,7 +2250,9 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co if len(inc_filters): built_clause = build_where_clause(inc_filters, for_files=True) filter_conditions = 'AND ' + built_clause['query_str'] - filter_conditions = filter_conditions.replace("%s", "'{}'").format(*built_clause['value_tuple']) + filelist_params += built_clause['value_tuple'] + if case_barcode: + filelist_params += (case_barcode, ) union_template = (" UNION " if not first_program else "") + "(" + select_clause_base + ")" select_clause += union_template.format( cohort_id=cohort_id, @@ -2273,13 +2282,20 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co start = time.time() query = file_list_query.format(select_clause=select_clause, order_clause=order_clause, limit_clause=limit_clause, offset_clause=offset_clause) - cursor.execute(query) + if len(filelist_params) > 0: + logger.debug("query for filelist: {}".format(query)) + logger.debug("params: {}".format(str(filelist_params))) + cursor.execute(query, filelist_params) + else: + cursor.execute(query) stop = time.time() logger.info("[STATUS] Time to get file-list: {}s".format(str(stop - start))) counts = {} if do_filter_count: start = time.time() + if case_barcode: + inc_filters['case_barcode'] = [case_barcode] counts = count_public_data_type(request.user, count_select_clause, inc_filters, cohort_programs, (type is not None and type != 'all'), build) stop = time.time() From f0442b150e80cb504a7ceb30e393a4fbfdd3e968 Mon Sep 17 00:00:00 2001 From: "S. 
Paquette" Date: Thu, 5 Jul 2018 22:11:34 -0700 Subject: [PATCH 38/76] -> typo in case barcode parameter, removed extra type clause because I don't know why I had it there to begin with --- cohorts/views.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/cohorts/views.py b/cohorts/views.py index 7cb23d12..cbb3eb09 100755 --- a/cohorts/views.py +++ b/cohorts/views.py @@ -2049,8 +2049,6 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co inc_filters = json.loads(request.GET.get('filters', '{}')) if request.GET else json.loads(request.POST.get('filters', '{}')) - logger.debug("inc_filters: {}".format(inc_filters)) - user = request.user user_email = user.email user_id = user.id @@ -2202,7 +2200,7 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co WHERE cohort_id = {cohort_id} ) cs ON cs.case_barcode = md.case_barcode - WHERE md.file_uploaded='true' {type_conditions} {filter_conditions} {case_barcode_condition} + WHERE md.file_uploaded='true' {filter_conditions} {case_barcode_condition} """ file_list_query = """ @@ -2224,11 +2222,13 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co } if type == 'igv': - type_conditions = "AND md.data_format='BAM'" - inc_filters['data_format'] = ['BAM'] + if 'data_format' not in inc_filters: + inc_filters['data_format'] = [] + inc_filters['data_format'].append('BAM') elif type == 'camic': - type_conditions = "AND md.data_format='SVS'" - inc_filters['data_format'] = ['SVS'] + if 'data_format' not in inc_filters: + inc_filters['data_format'] = [] + inc_filters['data_format'].append('SVS') db = get_sql_connection() cursor = db.cursor(MySQLdb.cursors.DictCursor) @@ -2237,7 +2237,6 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co select_clause = '' count_select_clause = '' first_program = True - count_params = () filelist_params = () for program in cohort_programs: program_data_tables = Public_Data_Tables.objects.filter(program=program, build=build) @@ -2257,14 +2256,12 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co select_clause += union_template.format( cohort_id=cohort_id, metadata_table=program_data_table, - type_conditions=type_conditions, filter_conditions=filter_conditions, case_barcode_condition=case_barcode_condition) if do_filter_count: count_select_clause += union_template.format( cohort_id=cohort_id, metadata_table=program_data_table, - type_conditions=type_conditions, filter_conditions='', case_barcode_condition=case_barcode_condition) first_program = False From 4bd51775cf60868cbf24e001a3e40405bb1d5cf3 Mon Sep 17 00:00:00 2001 From: elainelee Date: Fri, 6 Jul 2018 11:29:27 -0700 Subject: [PATCH 39/76] bug fix --- cohorts/metadata_counting.py | 2 +- cohorts/metadata_helpers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cohorts/metadata_counting.py b/cohorts/metadata_counting.py index 75abcdcc..d7c16273 100644 --- a/cohorts/metadata_counting.py +++ b/cohorts/metadata_counting.py @@ -166,7 +166,7 @@ def count_user_metadata(user, inc_filters=None, cohort_id=None): if db and db.open: db.close() -def count_public_data_type(user, data_query, inc_filters, program_list, filter_format=False, build='HG19', type='None'): +def count_public_data_type(user, data_query, inc_filters, program_list, filter_format=False, build='HG19', type=None): db = None cursor = None diff --git a/cohorts/metadata_helpers.py b/cohorts/metadata_helpers.py index 
f5b2a6f1..97971601 100644 --- a/cohorts/metadata_helpers.py +++ b/cohorts/metadata_helpers.py @@ -223,7 +223,7 @@ def get_sql_connection(): if db and db.open: db.close() -def fetch_build_data_attr(build, type='None'): +def fetch_build_data_attr(build, type=None): db = None cursor = None From 1cf3a3b4ea92310e904bb99ba481eaef1f41f6a7 Mon Sep 17 00:00:00 2001 From: s-paquette Date: Fri, 6 Jul 2018 13:15:17 -0700 Subject: [PATCH 40/76] -> Support LIKE for single values in BQ query builder (kludgey for the moment) --- google_helpers/bigquery/bq_support.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/google_helpers/bigquery/bq_support.py b/google_helpers/bigquery/bq_support.py index 1807922b..d514af0c 100644 --- a/google_helpers/bigquery/bq_support.py +++ b/google_helpers/bigquery/bq_support.py @@ -497,7 +497,10 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with # Scalar param query_param['parameterType']['type'] = ('STRING' if re.compile(ur'[^0-9\.,]', re.UNICODE).search(values[0]) else 'INT64') query_param['parameterValue']['value'] = values[0] - filter_string += "{} = @{}".format(attr, param_name) + if query_param['parameterType']['type'] == 'STRING' and '%' in values[0]: + filter_string += "{} LIKE @{}".format(attr, param_name) + else: + filter_string += "{} = @{}".format(attr, param_name) else: # Array param query_param['parameterType']['type'] = "ARRAY" From 94f88319e1469be8e3061c327fcb25bca9c94b4b Mon Sep 17 00:00:00 2001 From: elainelee Date: Fri, 6 Jul 2018 16:44:40 -0700 Subject: [PATCH 41/76] File Browser Case Barcode search: converting the sql code to pass the 'WHERE' conditions as parameters (incorporating Suzanne's code) --- cohorts/metadata_counting.py | 22 ++++++++++++++++++---- cohorts/views.py | 35 ++++++++++++++++++++++++----------- 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/cohorts/metadata_counting.py b/cohorts/metadata_counting.py index d7c16273..1149fee1 100644 --- a/cohorts/metadata_counting.py +++ b/cohorts/metadata_counting.py @@ -186,6 +186,15 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f metadata_data_attr = fetch_build_data_attr(build, type) + case_barcode = None + + # Pull out the case barcode filter, if there is one + if 'case_barcode' in inc_filters: + case_barcode = inc_filters['case_barcode'] + del inc_filters['case_barcode'] + + case_barcode_condition = '' if not case_barcode else "AND LOWER(cs.case_barcode) like %s" + # Make our where clauses for filter in inc_filters: for prog in program_list: @@ -197,21 +206,26 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f subfilter[filter] = inc_filters[filter] built_clause = build_where_clause(subfilter, for_files=True) - filter_clauses[filter]['where_clause'] = built_clause['query_str'].replace("%s", "'{}'") + filter_clauses[filter]['where_clause'] = built_clause['query_str'] filter_clauses[filter]['parameters'] = built_clause['value_tuple'] for attr in metadata_data_attr: + paramter_tuple = () counts[attr] = {x: 0 for x in metadata_data_attr[attr]['values']} where_clause = "" + if case_barcode: + paramter_tuple += (case_barcode, ) filter_clause = ') AND ('.join([filter_clauses[x]['where_clause'] for x in filter_clauses if x != attr or (filter_format and attr == 'data_format')]) if len(filter_clause): - filter_clause = filter_clause.format(*[y for x in filter_clauses for y in filter_clauses[x]['parameters'] if x != attr or (filter_format and attr == 'data_format')]) 
where_clause = "AND ( {} )".format(filter_clause) - query = QUERY_BASE.format(data_query_clause=data_query, where_clause=where_clause, attr=attr) + paramter_tuple += tuple(y for x in filter_clauses for y in filter_clauses[x]['parameters'] if + x != attr or (filter_format and attr == 'data_format')) + + query = QUERY_BASE.format(data_query_clause=data_query, where_clause=where_clause, attr=attr, case_barcode_condition=case_barcode_condition) if type == 'dicom': results = BigQuerySupport.execute_query_and_fetch_results(query) else: - cursor.execute(query) + cursor.execute(query, paramter_tuple) results = cursor.fetchall() for row in results: if type == 'dicom': diff --git a/cohorts/views.py b/cohorts/views.py index 65a4c78d..cbb3eb09 100755 --- a/cohorts/views.py +++ b/cohorts/views.py @@ -2064,8 +2064,11 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co filter_counts = None file_list = [] total_file_count = 0 - case_barcode = request.GET.get('case_barcode', '').lower() - case_barcode_condition = '' if not case_barcode else "AND LOWER(cs.case_barcode) like '%" + case_barcode + "%'" + case_barcode = request.GET.get('case_barcode', None) + case_barcode_condition = '' + if case_barcode: + case_barcode_condition = "AND LOWER(cs.case_barcode) like %s" + case_barcode = "%{}%".format(case_barcode) try: # Attempt to get the cohort perms - this will cause an excpetion if we don't have them Cohort_Perms.objects.get(cohort_id=cohort_id, user_id=user_id) @@ -2197,7 +2200,7 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co WHERE cohort_id = {cohort_id} ) cs ON cs.case_barcode = md.case_barcode - WHERE md.file_uploaded='true' {type_conditions} {filter_conditions} {case_barcode_condition} + WHERE md.file_uploaded='true' {filter_conditions} {case_barcode_condition} """ file_list_query = """ @@ -2219,11 +2222,13 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co } if type == 'igv': - type_conditions = "AND md.data_format='BAM'" - inc_filters['data_format'] = ['BAM'] + if 'data_format' not in inc_filters: + inc_filters['data_format'] = [] + inc_filters['data_format'].append('BAM') elif type == 'camic': - type_conditions = "AND md.data_format='SVS'" - inc_filters['data_format'] = ['SVS'] + if 'data_format' not in inc_filters: + inc_filters['data_format'] = [] + inc_filters['data_format'].append('SVS') db = get_sql_connection() cursor = db.cursor(MySQLdb.cursors.DictCursor) @@ -2232,6 +2237,7 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co select_clause = '' count_select_clause = '' first_program = True + filelist_params = () for program in cohort_programs: program_data_tables = Public_Data_Tables.objects.filter(program=program, build=build) if len(program_data_tables) <= 0: @@ -2243,19 +2249,19 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co if len(inc_filters): built_clause = build_where_clause(inc_filters, for_files=True) filter_conditions = 'AND ' + built_clause['query_str'] - filter_conditions = filter_conditions.replace("%s", "'{}'").format(*built_clause['value_tuple']) + filelist_params += built_clause['value_tuple'] + if case_barcode: + filelist_params += (case_barcode, ) union_template = (" UNION " if not first_program else "") + "(" + select_clause_base + ")" select_clause += union_template.format( cohort_id=cohort_id, metadata_table=program_data_table, - type_conditions=type_conditions, filter_conditions=filter_conditions, 
case_barcode_condition=case_barcode_condition) if do_filter_count: count_select_clause += union_template.format( cohort_id=cohort_id, metadata_table=program_data_table, - type_conditions=type_conditions, filter_conditions='', case_barcode_condition=case_barcode_condition) first_program = False @@ -2273,13 +2279,20 @@ def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='co start = time.time() query = file_list_query.format(select_clause=select_clause, order_clause=order_clause, limit_clause=limit_clause, offset_clause=offset_clause) - cursor.execute(query) + if len(filelist_params) > 0: + logger.debug("query for filelist: {}".format(query)) + logger.debug("params: {}".format(str(filelist_params))) + cursor.execute(query, filelist_params) + else: + cursor.execute(query) stop = time.time() logger.info("[STATUS] Time to get file-list: {}s".format(str(stop - start))) counts = {} if do_filter_count: start = time.time() + if case_barcode: + inc_filters['case_barcode'] = [case_barcode] counts = count_public_data_type(request.user, count_select_clause, inc_filters, cohort_programs, (type is not None and type != 'all'), build) stop = time.time() From 82bdff347da7eb610d471b4291f2368320292104 Mon Sep 17 00:00:00 2001 From: s-paquette Date: Mon, 9 Jul 2018 14:45:04 -0700 Subject: [PATCH 42/76] -> Convert cohort_files and count_public_data_types to BQ w/params --- cohorts/file_helpers.py | 388 ++++++++++++++++++++++++++ cohorts/metadata_counting.py | 40 ++- cohorts/views.py | 360 ++---------------------- google_helpers/bigquery/bq_support.py | 25 +- 4 files changed, 447 insertions(+), 366 deletions(-) create mode 100644 cohorts/file_helpers.py diff --git a/cohorts/file_helpers.py b/cohorts/file_helpers.py new file mode 100644 index 00000000..fa5eb6c4 --- /dev/null +++ b/cohorts/file_helpers.py @@ -0,0 +1,388 @@ +""" + +Copyright 2018, Institute for Systems Biology + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0how to c + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +""" + +import logging +import time +import MySQLdb + +from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned +from django.conf import settings + +from metadata_counting import count_public_data_type +from metadata_helpers import get_sql_connection, build_where_clause + +from projects.models import Program, Project, User_Data_Tables, Public_Metadata_Tables, Public_Data_Tables +from cohorts.models import Cohort, Cohort_Perms + +from google_helpers.bigquery.cohort_support import BigQuerySupport + +logger = logging.getLogger('main_logger') + + +def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offset=0, sort_column='col-program', sort_order=0, build='HG19', access=None, type=None, do_filter_count=True): + + if not user: + raise Exception("A user must be supplied to view a cohort's files.") + if not cohort_id: + raise Exception("A cohort ID must be supplied to view a its files.") + + if not inc_filters: + inc_filters = {} + + user_email = user.email + user_id = user.id + + resp = None + db = None + cursor = None + query_limit = limit + type_conditions = "" + limit_clause = "" + offset_clause = "" + + filter_counts = None + file_list = [] + total_file_count = 0 + + case_barcode = None + case_barcode_condition = '' + + # DICOM uses BQ, and that WHERE clause builder can handle the LIKE clause, + # but the MySQL WHERE clause builder can't + if not type == 'dicom': + if 'case_barcode' in inc_filters: + case_barcode = inc_filters['case_barcode'] + del inc_filters['case_barcode'] + + if case_barcode: + case_barcode_condition = "AND LOWER(cs.case_barcode) LIKE %s" + case_barcode = "%{}%".format(case_barcode) + + try: + # Attempt to get the cohort perms - this will cause an excpetion if we don't have them + Cohort_Perms.objects.get(cohort_id=cohort_id, user_id=user_id) + + if type == 'dicom': + + filter_counts = {} + limit_clause = "" + offset_clause = "" + + bq_cohort_table = settings.BIGQUERY_COHORT_TABLE_ID + bq_cohort_dataset = settings.COHORT_DATASET_ID + bq_cohort_project_id = settings.BIGQUERY_PROJECT_NAME + data_project = settings.BIGQUERY_DATA_PROJECT_NAME + + built_clause = None + + filter_conditions = '' + if len(inc_filters): + built_clause = BigQuerySupport.build_bq_filter_and_params(inc_filters, field_prefix='bc.') + filter_conditions = 'AND ' + built_clause['filter_string'] + + file_list_query_base = """ + SELECT cs.case_barcode, ds.StudyInstanceUID, ds.StudyDescription, bc.disease_code, bc.project_short_name + FROM `{cohort_project}.{cohort_dataset}.{cohort_table}` cs + JOIN `{data_project}.{tcga_img_dataset}.{dcf_data_table}` ds + ON cs.case_barcode = ds.PatientID + JOIN `{data_project}.{tcga_bioclin_dataset}.{tcga_clin_table}` bc + ON bc.case_barcode=cs.case_barcode + WHERE cs.cohort_id = {cohort} {filter_conditions} + GROUP BY cs.case_barcode, ds.StudyInstanceUID, ds.StudyDescription, bc.disease_code, bc.project_short_name + """ + file_list_query_formatted = file_list_query_base.format(cohort_dataset=bq_cohort_dataset, + cohort_project=bq_cohort_project_id, cohort_table=bq_cohort_table, data_project=data_project, + dcf_data_table="TCGA_radiology_images", tcga_img_dataset="metadata", + tcga_bioclin_dataset="TCGA_bioclin_v0", tcga_clin_table="Clinical", cohort=cohort_id, + filter_conditions=filter_conditions + ) + + file_list_query_filter_count_formatted = file_list_query_base.format( + cohort_dataset=bq_cohort_dataset, cohort_project=bq_cohort_project_id, + cohort_table=bq_cohort_table, data_project=data_project, + 
dcf_data_table="TCGA_radiology_images", tcga_img_dataset="metadata", + tcga_bioclin_dataset="TCGA_bioclin_v0", tcga_clin_table="Clinical", cohort=cohort_id, + filter_conditions="" + ) + + file_list_query = """ + #standardSQL + {select_clause} + {order_clause} + {limit_clause} + {offset_clause} + """ + + file_count_query = """ + #standardSQL + SELECT COUNT(*) + FROM ( + {select_clause} + ) + """ + + # col_map: used in the sql ORDER BY clause + # key: html column attribute 'columnId' + # value: db table column name + col_map = { + 'col-program': 'bc.project_short_name', + 'col-barcode': 'cs.case_barcode', + 'col-diseasecode': 'bc.disease_code', + 'col-projectname': 'bc.project_short_name', + 'col-studydesc': 'ds.StudyDescription', + 'col-studyuid': 'ds.StudyInstanceUID' + } + + if limit > 0: + limit_clause = ' LIMIT {}'.format(str(limit)) + # Offset is only valid when there is a limit + if offset > 0: + offset_clause = ' OFFSET {}'.format(str(offset)) + + order_clause = "ORDER BY " + col_map[sort_column] + (" DESC" if sort_order == 1 else "") + counts = {} + if do_filter_count: + # Query the count + start = time.time() + logger.debug("Query: {}".format(file_count_query.format(select_clause=file_list_query_formatted))) + if built_clause: + logger.debug("Params: {}".format(built_clause['parameters'])) + results = BigQuerySupport.execute_query_and_fetch_results( + file_count_query.format(select_clause=file_list_query_formatted), + built_clause['parameters'] if built_clause else None + ) + stop = time.time() + logger.debug('[BENCHMARKING] Time to query BQ for dicom count: ' + (stop - start).__str__()) + for entry in results: + total_file_count = int(entry['f'][0]['v']) + cohort_programs = Cohort.objects.get(id=cohort_id).get_programs() + counts = count_public_data_type(user, file_list_query_filter_count_formatted, + inc_filters, cohort_programs, (type is not None and type != 'all'), + build, type) + # Query the file list only if there was anything to find + if total_file_count > 0 and do_filter_count or not do_filter_count: + start = time.time() + results = BigQuerySupport.execute_query_and_fetch_results( + file_list_query.format( + select_clause=file_list_query_formatted, order_clause=order_clause, limit_clause=limit_clause, + offset_clause=offset_clause + ) + ) + stop = time.time() + logger.debug('[BENCHMARKING] Time to query BQ for dicom data: ' + (stop - start).__str__()) + if len(results) > 0: + for entry in results: + file_list.append({ + 'case': entry['f'][0]['v'], + 'study_uid': entry['f'][1]['v'], + 'study_desc': entry['f'][2]['v'] or 'N/A', + 'disease_code': entry['f'][3]['v'], + 'project_short_name': entry['f'][4]['v'], + 'program': "TCGA" + }) + filter_counts = counts + else: + select_clause_base = """ + SELECT md.sample_barcode, md.case_barcode, md.disease_code, md.file_name, md.file_name_key, + md.index_file_name, md.access, md.acl, md.platform, md.data_type, md.data_category, + md.experimental_strategy, md.data_format, md.file_gdc_id, md.case_gdc_id, md.project_short_name + FROM {metadata_table} md + JOIN ( + SELECT DISTINCT case_barcode + FROM cohorts_samples + WHERE cohort_id = {cohort_id} + ) cs + ON cs.case_barcode = md.case_barcode + WHERE md.file_uploaded='true' {filter_conditions} {case_barcode_condition} + """ + + file_list_query = """ + {select_clause} + {order_clause} + {limit_clause} + {offset_clause} + """ + col_map = { + 'col-program': 'project_short_name', + 'col-barcode': 'case_barcode', + 'col-filename': 'file_name', + 'col-diseasecode': 'disease_code', + 
'col-exp-strategy': 'experimental_strategy', + 'col-platform': 'platform', + 'col-datacat': 'data_category', + 'col-datatype': 'data_type', + 'col-dataformat': 'data_format' + } + + if type == 'igv': + if 'data_format' not in inc_filters: + inc_filters['data_format'] = [] + inc_filters['data_format'].append('BAM') + elif type == 'camic': + if 'data_format' not in inc_filters: + inc_filters['data_format'] = [] + inc_filters['data_format'].append('SVS') + + db = get_sql_connection() + cursor = db.cursor(MySQLdb.cursors.DictCursor) + + cohort_programs = Cohort.objects.get(id=cohort_id).get_programs() + select_clause = '' + count_select_clause = '' + first_program = True + filelist_params = () + for program in cohort_programs: + program_data_tables = Public_Data_Tables.objects.filter(program=program, build=build) + if len(program_data_tables) <= 0: + logger.debug("[STATUS] No metadata_data table for {}, build {}--skipping.".format(program.name,build)) + # This program has no metadata_data table for this build, or at all--skip + continue + program_data_table = program_data_tables[0].data_table + filter_conditions = '' + if len(inc_filters): + built_clause = build_where_clause(inc_filters, for_files=True) + filter_conditions = 'AND ' + built_clause['query_str'] + filelist_params += built_clause['value_tuple'] + if case_barcode: + filelist_params += (case_barcode, ) + union_template = (" UNION " if not first_program else "") + "(" + select_clause_base + ")" + select_clause += union_template.format( + cohort_id=cohort_id, + metadata_table=program_data_table, + filter_conditions=filter_conditions, + case_barcode_condition=case_barcode_condition) + if do_filter_count: + count_select_clause += union_template.format( + cohort_id=cohort_id, + metadata_table=program_data_table, + filter_conditions='', + case_barcode_condition=case_barcode_condition) + first_program = False + + # if first_program is still true, we found no programs with data tables for this build + if not first_program: + + if limit > 0: + limit_clause = ' LIMIT {}'.format(str(limit)) + # Offset is only valid when there is a limit + if offset > 0: + offset_clause = ' OFFSET {}'.format(str(offset)) + order_clause = "ORDER BY "+col_map[sort_column]+(" DESC" if sort_order == 1 else "") + + start = time.time() + query = file_list_query.format(select_clause=select_clause, order_clause=order_clause, limit_clause=limit_clause, + offset_clause=offset_clause) + if len(filelist_params) > 0: + logger.debug("query for filelist: {}".format(query)) + logger.debug("params: {}".format(str(filelist_params))) + cursor.execute(query, filelist_params) + else: + cursor.execute(query) + stop = time.time() + logger.info("[STATUS] Time to get file-list: {}s".format(str(stop - start))) + + counts = {} + if do_filter_count: + start = time.time() + if case_barcode: + inc_filters['case_barcode'] = [case_barcode] + counts = count_public_data_type(user, count_select_clause, + inc_filters, cohort_programs, (type is not None and type != 'all'), build) + stop = time.time() + logger.info("[STATUS] Time to count public data files: {}s".format(str((stop-start)))) + + if cursor.rowcount > 0: + for item in cursor.fetchall(): + whitelist_found = False + # If this is a controlled-access entry, check for the user's access to it + if item['access'] == 'controlled' and access: + whitelists = item['acl'].split(',') + for whitelist in whitelists: + if whitelist in access: + whitelist_found = True + + file_list.append({ + 'sample': item['sample_barcode'], + 'case': 
item['case_barcode'], + 'disease_code': item['disease_code'], + 'build': build.lower(), + 'cloudstorage_location': item['file_name_key'] or 'N/A', + 'index_name': item['index_file_name'] or 'N/A', + 'access': (item['access'] or 'N/A'), + 'user_access': str(item['access'] != 'controlled' or whitelist_found), + 'filename': item['file_name'] or 'N/A', + 'exp_strat': item['experimental_strategy'] or 'N/A', + 'platform': item['platform'] or 'N/A', + 'datacat': item['data_category'] or 'N/A', + 'datatype': (item['data_type'] or 'N/A'), + 'dataformat': (item['data_format'] or 'N/A'), + 'program': item['project_short_name'].split("-")[0], + 'case_gdc_id': (item['case_gdc_id'] or 'N/A'), + 'file_gdc_id': (item['file_gdc_id'] or 'N/A'), + 'project_short_name': (item['project_short_name'] or 'N/A'), + 'cohort_id': cohort_id + }) + filter_counts = counts + files_counted = False + # Add to the file total + if do_filter_count: + for attr in filter_counts: + if files_counted: + continue + for val in filter_counts[attr]: + if not files_counted and (attr not in inc_filters or val in inc_filters[attr]): + total_file_count += int(filter_counts[attr][val]) + files_counted = True + else: + filter_counts = {} + resp = { + 'total_file_count': total_file_count, + 'page': page, + 'file_list': file_list, + 'build': build, + 'metadata_data_counts': filter_counts + } + + except (IndexError, TypeError) as e: + logger.error("Error obtaining list of samples in cohort file list") + logger.exception(e) + resp = {'error': 'Error obtaining list of samples in cohort file list'} + + except ObjectDoesNotExist as e: + logger.error("[ERROR] Permissions exception when retrieving cohort file list for cohort {}:".format(str(cohort_id))) + logger.exception(e) + resp = {'error': "User {} does not have permission to view cohort {}, and so cannot export it or its file manifest.".format(user_email, str(cohort_id))} + + except MultipleObjectsReturned as e: + logger.error("[ERROR] Permissions exception when retrieving cohort file list for cohort {}:".format(str(cohort_id))) + logger.exception(e) + perms = Cohort_Perms.objects.filter(cohort_id=cohort_id, user_id=user_id).values_list('cohort_id','user_id') + logger.error("[ERROR] Permissions found: {}".format(str(perms))) + resp = {'error': "There was an error while retrieving cohort {}'s permissions--please contact the administrator.".format(str(cohort_id))} + + except Exception as e: + logger.error("[ERROR] Exception obtaining file list and platform counts:") + logger.exception(e) + resp = {'error': 'Error getting counts'} + + finally: + if cursor: cursor.close() + if db and db.open: db.close() + + return resp + diff --git a/cohorts/metadata_counting.py b/cohorts/metadata_counting.py index 1149fee1..58d4b357 100644 --- a/cohorts/metadata_counting.py +++ b/cohorts/metadata_counting.py @@ -1,6 +1,6 @@ """ -Copyright 2017, Institute for Systems Biology +Copyright 2018, Institute for Systems Biology Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -212,21 +212,37 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f for attr in metadata_data_attr: paramter_tuple = () counts[attr] = {x: 0 for x in metadata_data_attr[attr]['values']} - where_clause = "" - if case_barcode: - paramter_tuple += (case_barcode, ) - filter_clause = ') AND ('.join([filter_clauses[x]['where_clause'] for x in filter_clauses if x != attr or (filter_format and attr == 'data_format')]) - if len(filter_clause): - where_clause = "AND ( {} )".format(filter_clause) - paramter_tuple += tuple(y for x in filter_clauses for y in filter_clauses[x]['parameters'] if - x != attr or (filter_format and attr == 'data_format')) - - query = QUERY_BASE.format(data_query_clause=data_query, where_clause=where_clause, attr=attr, case_barcode_condition=case_barcode_condition) if type == 'dicom': - results = BigQuerySupport.execute_query_and_fetch_results(query) + query = """ + #standardSQL + {query} + """.format(query=query) + + where_clause = '' + parameters = None + if len(inc_filters): + built_clause = BigQuerySupport.build_bq_filter_and_params(inc_filters) + where_clause = "AND ( {} )".format(built_clause['filter_string']) + parameters = built_clause['parameters'] + + query = QUERY_BASE.format(data_query_clause=data_query, where_clause=where_clause) + + results = BigQuerySupport.execute_query_and_fetch_results(query, parameters) else: + where_clause = "" + if case_barcode: + paramter_tuple += (case_barcode, ) + filter_clause = ') AND ('.join([filter_clauses[x]['where_clause'] for x in filter_clauses if x != attr or (filter_format and attr == 'data_format')]) + if len(filter_clause): + where_clause = "AND ( {} )".format(filter_clause) + paramter_tuple += tuple(y for x in filter_clauses for y in filter_clauses[x]['parameters'] if + x != attr or (filter_format and attr == 'data_format')) + + query = QUERY_BASE.format(data_query_clause=data_query, where_clause=where_clause, attr=attr, case_barcode_condition=case_barcode_condition) cursor.execute(query, paramter_tuple) results = cursor.fetchall() + + for row in results: if type == 'dicom': val = row['f'][0]['v'] diff --git a/cohorts/views.py b/cohorts/views.py index cbb3eb09..ac5a5a22 100755 --- a/cohorts/views.py +++ b/cohorts/views.py @@ -46,6 +46,7 @@ from accounts.models import GoogleProject from metadata_helpers import * from metadata_counting import * +from file_helpers import * from models import Cohort, Samples, Cohort_Perms, Source, Filters, Cohort_Comments from projects.models import Program, Project, User_Data_Tables, Public_Metadata_Tables, Public_Data_Tables from accounts.sa_utils import auth_dataset_whitelists_for_user @@ -1602,6 +1603,7 @@ def cohort_filelist(request, cohort_id=0, panel_type=None): if debug: logger.debug('Called '+sys._getframe().f_code.co_name) template = 'cohorts/cohort_filelist{}.html'.format("_{}".format(panel_type) if panel_type else "") + if cohort_id == 0: messages.error(request, 'Cohort requested does not exist.') return redirect('/user_landing') @@ -1621,7 +1623,12 @@ def cohort_filelist(request, cohort_id=0, panel_type=None): items = None if panel_type: - items = cohort_files(request, cohort_id, build=build, access=has_access, type=panel_type) + inc_filters = json.loads(request.GET.get('filters', '{}')) if request.GET else json.loads( + request.POST.get('filters', '{}')) + if request.GET.get('case_barcode', None): + inc_filters['case_barcode'] = ["%{}%".format(request.GET.get('case_barcode')),] + items = cohort_files(cohort_id, inc_filters=inc_filters, 
user=request.user, build=build, access=has_access, type=panel_type) + for attr in items['metadata_data_counts']: for val in items['metadata_data_counts'][attr]: metadata_data_attr[attr]['values'][val]['count'] = items['metadata_data_counts'][attr][val] @@ -1715,12 +1722,15 @@ def cohort_filelist_ajax(request, cohort_id=0, panel_type=None): sort_order = int(request.GET.get('sort_order')) params['sort_order'] = sort_order - build = request.GET.get('build','HG19') has_access = auth_dataset_whitelists_for_user(request.user.id) - result = cohort_files(request=request, cohort_id=cohort_id, build=build, access=has_access, type=panel_type, do_filter_count=do_filter_count, **params) + inc_filters = json.loads(request.GET.get('filters', '{}')) if request.GET else json.loads( + request.POST.get('filters', '{}')) + if request.GET.get('case_barcode', None): + inc_filters['case_barcode'] = ["%{}%".format(request.GET.get('case_barcode')), ] + result = cohort_files(cohort_id, user=request.user, inc_filters=inc_filters, build=build, access=has_access, type=panel_type, do_filter_count=do_filter_count, **params) # If nothing was found, our total file count will reflect that if do_filter_count: @@ -1816,7 +1826,12 @@ def streaming_csv_view(request, cohort_id=0): if not re.compile(r'[Hh][Gg](19|38)').search(build): raise Exception("Invalid build supplied") - items = cohort_files(request=request, cohort_id=cohort_id, limit=limit, build=build) + inc_filters = json.loads(request.GET.get('filters', '{}')) if request.GET else json.loads( + request.POST.get('filters', '{}')) + if request.GET.get('case_barcode', None): + inc_filters['case_barcode'] = ["%{}%".format(request.GET.get('case_barcode')), ] + items = cohort_files(cohort_id, user=request.user, inc_filters=inc_filters, limit=limit, build=build) + if 'file_list' in items: file_list = items['file_list'] else: @@ -2044,343 +2059,6 @@ def get_cohort_filter_panel(request, cohort_id=0, program_id=0): return render(request, template, template_values) -@login_required -def cohort_files(request, cohort_id, limit=25, page=1, offset=0, sort_column='col-program', sort_order=0, build='HG38', access=None, type=None, do_filter_count=True): - - inc_filters = json.loads(request.GET.get('filters', '{}')) if request.GET else json.loads(request.POST.get('filters', '{}')) - - user = request.user - user_email = user.email - user_id = user.id - - resp = None - db = None - cursor = None - query_limit = limit - type_conditions = "" - limit_clause = "" - offset_clause = "" - - filter_counts = None - file_list = [] - total_file_count = 0 - case_barcode = request.GET.get('case_barcode', None) - case_barcode_condition = '' - if case_barcode: - case_barcode_condition = "AND LOWER(cs.case_barcode) like %s" - case_barcode = "%{}%".format(case_barcode) - try: - # Attempt to get the cohort perms - this will cause an excpetion if we don't have them - Cohort_Perms.objects.get(cohort_id=cohort_id, user_id=user_id) - - if type == 'dicom': - - filter_counts = {} - limit_clause = "" - offset_clause = "" - - bq_cohort_table = settings.BIGQUERY_COHORT_TABLE_ID - bq_cohort_dataset = settings.COHORT_DATASET_ID - bq_cohort_project_id = settings.BIGQUERY_PROJECT_NAME - data_project = settings.BIGQUERY_DATA_PROJECT_NAME - - filter_conditions = '' - if len(inc_filters): - built_clause = build_where_clause(inc_filters, for_files=True) - filter_conditions = 'AND ' + 'bc.'+built_clause['query_str'] - filter_conditions = filter_conditions.replace("%s", "'{}'").format(*built_clause['value_tuple']) - - 
file_list_query_base = """ - SELECT cs.case_barcode, ds.StudyInstanceUID, ds.StudyDescription, bc.disease_code, bc.project_short_name - FROM [{cohort_project}:{cohort_dataset}.{cohort_table}] cs - JOIN [{data_project}:{tcga_img_dataset}.{dcf_data_table}] ds - ON cs.case_barcode = ds.PatientID - JOIN [{data_project}:{tcga_bioclin_dataset}.{tcga_clin_table}] bc - ON bc.case_barcode=cs.case_barcode - WHERE cs.cohort_id = {cohort} {filter_conditions} {case_barcode_condition} - GROUP BY cs.case_barcode, ds.StudyInstanceUID, ds.StudyDescription, bc.disease_code, bc.project_short_name - """ - file_list_query_formatted = file_list_query_base.format(cohort_dataset=bq_cohort_dataset, - cohort_project=bq_cohort_project_id, cohort_table=bq_cohort_table, - data_project=data_project, dcf_data_table="TCGA_radiology_images", - tcga_img_dataset="metadata", - tcga_bioclin_dataset="TCGA_bioclin_v0", tcga_clin_table="Clinical", cohort=cohort_id, - filter_conditions=filter_conditions, case_barcode_condition=case_barcode_condition) - - file_list_query_filter_count_formatted = file_list_query_base.format(cohort_dataset=bq_cohort_dataset, - cohort_project=bq_cohort_project_id, - cohort_table=bq_cohort_table, - data_project=data_project, - dcf_data_table="TCGA_radiology_images", - tcga_img_dataset="metadata", - tcga_bioclin_dataset="TCGA_bioclin_v0", - tcga_clin_table="Clinical", cohort=cohort_id, - filter_conditions="", - case_barcode_condition=case_barcode_condition) - - file_list_query = """ - {select_clause} - {order_clause} - {limit_clause} - {offset_clause} - """ - - file_count_query = """ - SELECT COUNT(*) - FROM ( - {select_clause} - ) - """ - - # col_map: used in the sql ORDER BY clause - # key: html column attribute 'columnId' - # value: db table column name - col_map = { - 'col-program': 'bc.project_short_name', - 'col-barcode': 'cs.case_barcode', - 'col-diseasecode': 'bc.disease_code', - 'col-projectname': 'bc.project_short_name', - 'col-studydesc': 'ds.StudyDescription', - 'col-studyuid': 'ds.StudyInstanceUID' - } - - if limit > 0: - limit_clause = ' LIMIT {}'.format(str(limit)) - # Offset is only valid when there is a limit - if offset > 0: - offset_clause = ' OFFSET {}'.format(str(offset)) - - - - order_clause = "ORDER BY " + col_map[sort_column] + (" DESC" if sort_order == 1 else "") - counts = {} - if do_filter_count: - # Query the count - start = time.time() - results = BigQuerySupport.execute_query_and_fetch_results(file_count_query.format(select_clause=file_list_query_formatted)) - stop = time.time() - logger.debug('[BENCHMARKING] Time to query BQ for dicom count: ' + (stop - start).__str__()) - for entry in results: - total_file_count = int(entry['f'][0]['v']) - cohort_programs = Cohort.objects.get(id=cohort_id).get_programs() - counts = count_public_data_type(request.user, file_list_query_filter_count_formatted, - inc_filters, cohort_programs, (type is not None and type != 'all'), - build, type) - # Query the file list only if there was anything to find - if total_file_count >0 and do_filter_count or not do_filter_count: - start = time.time() - results = BigQuerySupport.execute_query_and_fetch_results( - file_list_query.format( - select_clause=file_list_query_formatted, order_clause=order_clause, limit_clause=limit_clause, - offset_clause=offset_clause - ) - ) - stop = time.time() - logger.debug('[BENCHMARKING] Time to query BQ for dicom data: ' + (stop - start).__str__()) - if len(results) > 0: - for entry in results: - file_list.append({ - 'case': entry['f'][0]['v'], - 'study_uid': 
entry['f'][1]['v'], - 'study_desc': entry['f'][2]['v'] or 'N/A', - 'disease_code': entry['f'][3]['v'], - 'project_short_name': entry['f'][4]['v'], - 'program': "TCGA" - }) - filter_counts = counts - else: - select_clause_base = """ - SELECT md.sample_barcode, md.case_barcode, md.disease_code, md.file_name, md.file_name_key, - md.index_file_name, md.access, md.acl, md.platform, md.data_type, md.data_category, - md.experimental_strategy, md.data_format, md.file_gdc_id, md.case_gdc_id, md.project_short_name - FROM {metadata_table} md - JOIN ( - SELECT DISTINCT case_barcode - FROM cohorts_samples - WHERE cohort_id = {cohort_id} - ) cs - ON cs.case_barcode = md.case_barcode - WHERE md.file_uploaded='true' {filter_conditions} {case_barcode_condition} - """ - - file_list_query = """ - {select_clause} - {order_clause} - {limit_clause} - {offset_clause} - """ - col_map = { - 'col-program': 'project_short_name', - 'col-barcode': 'case_barcode', - 'col-filename': 'file_name', - 'col-diseasecode': 'disease_code', - 'col-exp-strategy': 'experimental_strategy', - 'col-platform': 'platform', - 'col-datacat': 'data_category', - 'col-datatype': 'data_type', - 'col-dataformat': 'data_format' - } - - if type == 'igv': - if 'data_format' not in inc_filters: - inc_filters['data_format'] = [] - inc_filters['data_format'].append('BAM') - elif type == 'camic': - if 'data_format' not in inc_filters: - inc_filters['data_format'] = [] - inc_filters['data_format'].append('SVS') - - db = get_sql_connection() - cursor = db.cursor(MySQLdb.cursors.DictCursor) - - cohort_programs = Cohort.objects.get(id=cohort_id).get_programs() - select_clause = '' - count_select_clause = '' - first_program = True - filelist_params = () - for program in cohort_programs: - program_data_tables = Public_Data_Tables.objects.filter(program=program, build=build) - if len(program_data_tables) <= 0: - logger.debug("[STATUS] No metadata_data table for {}, build {}--skipping.".format(program.name,build)) - # This program has no metadata_data table for this build, or at all--skip - continue - program_data_table = program_data_tables[0].data_table - filter_conditions = '' - if len(inc_filters): - built_clause = build_where_clause(inc_filters, for_files=True) - filter_conditions = 'AND ' + built_clause['query_str'] - filelist_params += built_clause['value_tuple'] - if case_barcode: - filelist_params += (case_barcode, ) - union_template = (" UNION " if not first_program else "") + "(" + select_clause_base + ")" - select_clause += union_template.format( - cohort_id=cohort_id, - metadata_table=program_data_table, - filter_conditions=filter_conditions, - case_barcode_condition=case_barcode_condition) - if do_filter_count: - count_select_clause += union_template.format( - cohort_id=cohort_id, - metadata_table=program_data_table, - filter_conditions='', - case_barcode_condition=case_barcode_condition) - first_program = False - - # if first_program is still true, we found no programs with data tables for this build - if not first_program: - - if limit > 0: - limit_clause = ' LIMIT {}'.format(str(limit)) - # Offset is only valid when there is a limit - if offset > 0: - offset_clause = ' OFFSET {}'.format(str(offset)) - order_clause = "ORDER BY "+col_map[sort_column]+(" DESC" if sort_order == 1 else "") - - start = time.time() - query = file_list_query.format(select_clause=select_clause, order_clause=order_clause, limit_clause=limit_clause, - offset_clause=offset_clause) - if len(filelist_params) > 0: - logger.debug("query for filelist: {}".format(query)) - 
logger.debug("params: {}".format(str(filelist_params))) - cursor.execute(query, filelist_params) - else: - cursor.execute(query) - stop = time.time() - logger.info("[STATUS] Time to get file-list: {}s".format(str(stop - start))) - - counts = {} - if do_filter_count: - start = time.time() - if case_barcode: - inc_filters['case_barcode'] = [case_barcode] - counts = count_public_data_type(request.user, count_select_clause, - inc_filters, cohort_programs, (type is not None and type != 'all'), build) - stop = time.time() - logger.info("[STATUS] Time to count public data files: {}s".format(str((stop-start)))) - - if cursor.rowcount > 0: - for item in cursor.fetchall(): - whitelist_found = False - # If this is a controlled-access entry, check for the user's access to it - if item['access'] == 'controlled' and access: - whitelists = item['acl'].split(',') - for whitelist in whitelists: - if whitelist in access: - whitelist_found = True - - file_list.append({ - 'sample': item['sample_barcode'], - 'case': item['case_barcode'], - 'disease_code': item['disease_code'], - 'build': build.lower(), - 'cloudstorage_location': item['file_name_key'] or 'N/A', - 'index_name': item['index_file_name'] or 'N/A', - 'access': (item['access'] or 'N/A'), - 'user_access': str(item['access'] != 'controlled' or whitelist_found), - 'filename': item['file_name'] or 'N/A', - 'exp_strat': item['experimental_strategy'] or 'N/A', - 'platform': item['platform'] or 'N/A', - 'datacat': item['data_category'] or 'N/A', - 'datatype': (item['data_type'] or 'N/A'), - 'dataformat': (item['data_format'] or 'N/A'), - 'program': item['project_short_name'].split("-")[0], - 'case_gdc_id': (item['case_gdc_id'] or 'N/A'), - 'file_gdc_id': (item['file_gdc_id'] or 'N/A'), - 'project_short_name': (item['project_short_name'] or 'N/A'), - 'cohort_id': cohort_id - }) - filter_counts = counts - files_counted = False - # Add to the file total - if do_filter_count: - for attr in filter_counts: - if files_counted: - continue - for val in filter_counts[attr]: - if not files_counted and (attr not in inc_filters or val in inc_filters[attr]): - total_file_count += int(filter_counts[attr][val]) - files_counted = True - else: - filter_counts = {} - resp = { - 'total_file_count': total_file_count, - 'page': page, - 'file_list': file_list, - 'build': build, - 'metadata_data_counts': filter_counts - } - - except (IndexError, TypeError) as e: - logger.error("Error obtaining list of samples in cohort file list") - logger.exception(e) - resp = {'error': 'Error obtaining list of samples in cohort file list'} - - except ObjectDoesNotExist as e: - logger.error("[ERROR] Permissions exception when retrieving cohort file list for cohort {}:".format(str(cohort_id))) - logger.exception(e) - resp = {'error': "User {} does not have permission to view cohort {}, and so cannot export it or its file manifest.".format(user_email, str(cohort_id))} - - except MultipleObjectsReturned as e: - logger.error("[ERROR] Permissions exception when retrieving cohort file list for cohort {}:".format(str(cohort_id))) - logger.exception(e) - perms = Cohort_Perms.objects.filter(cohort_id=cohort_id, user_id=user_id).values_list('cohort_id','user_id') - logger.error("[ERROR] Permissions found: {}".format(str(perms))) - resp = {'error': "There was an error while retrieving cohort {}'s permissions--please contact the administrator.".format(str(cohort_id))} - - except Exception as e: - logger.error("[ERROR] Exception obtaining file list and platform counts:") - logger.exception(e) - resp = 
{'error': 'Error getting counts'} - - finally: - if cursor: cursor.close() - if db and db.open: db.close() - - logger.debug("[STATUS] Returning response from cohort_files") - - return resp - - # Master method for exporting data types to BQ, GCS, etc. @login_required @csrf_protect diff --git a/google_helpers/bigquery/bq_support.py b/google_helpers/bigquery/bq_support.py index d514af0c..cc56d733 100644 --- a/google_helpers/bigquery/bq_support.py +++ b/google_helpers/bigquery/bq_support.py @@ -292,6 +292,8 @@ def execute_query(self, query, parameters=None, write_disposition='WRITE_EMPTY', job_id = query_job['jobReference']['jobId'] + query_results = None + # Cost Estimates don't actually run as fully-fledged jobs, and won't be inserted as such, # so we just get back the estimate immediately if cost_est: @@ -324,11 +326,6 @@ def execute_query(self, query, parameters=None, write_disposition='WRITE_EMPTY', logger.error("[ERROR] Query took longer than the allowed time to execute--" + "if you check job ID {} manually you can wait for it to finish.".format(job_id)) - logger.debug("[STATUS] Logging statements for test debug:") - logger.debug("State: {}".format(str(job_is_done['status']['state']) if job_is_done and 'status' in job_is_done and 'state' in job_is_done['status'] else 'N/A')) - logger.debug("Exeucting project: {}".format(self.executing_project)) - logger.debug("jobId: {}".format(job_is_done['jobReference']['jobId'])) - if 'statistics' in job_is_done and 'query' in job_is_done['statistics'] and 'timeline' in \ job_is_done['statistics']['query']: logger.debug("Elapsed: {}".format(str(job_is_done['statistics']['query']['timeline'][-1]['elapsedMs']))) @@ -406,7 +403,7 @@ def get_job_results(cls, job_reference): # TODO: add support for BETWEEN # TODO: add support for <>= @staticmethod - def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with_count_toggle=False): + def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with_count_toggle=False, field_prefix=None): result = { 'filter_string': '', 'parameters': [] @@ -433,7 +430,7 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with type = attr.split(':')[-1] invert = bool(attr.split(':')[3] == 'NOT') param_name = 'gene{}{}'.format(str(mut_filtr_count), '_{}'.format(param_suffix) if param_suffix else '') - filter_string = 'Hugo_Symbol = @{} AND '.format(param_name) + filter_string = '{}Hugo_Symbol = @{} AND '.format('' if not field_prefix else field_prefix, param_name) gene_query_param = { 'name': param_name, @@ -456,13 +453,13 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with } if type == 'category' and values[0] == 'any': - filter_string += 'Variant_Classification IS NOT NULL' + filter_string += '{}Variant_Classification IS NOT NULL'.format('' if not field_prefix else field_prefix,) var_query_param = None else: if type == 'category': values = MOLECULAR_CATEGORIES[values[0]]['attrs'] var_param_name = "var_class{}{}".format(str(mut_filtr_count), '_{}'.format(param_suffix) if param_suffix else '') - filter_string += 'Variant_Classification {}IN UNNEST(@{})'.format('NOT ' if invert else '', var_param_name) + filter_string += '{}Variant_Classification {}IN UNNEST(@{})'.format('' if not field_prefix else field_prefix, 'NOT ' if invert else '', var_param_name) var_query_param['name'] = var_param_name var_query_param['parameterType']['type'] = 'ARRAY' var_query_param['parameterValue'] = {'arrayValues': [{'value': x} for x in values]} @@ -474,6 
+471,8 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with mut_filtr_count += 1 + logger.debug("other filters: {}".format(str(other_filters))) + for attr, values in other_filters.items(): filter_string = '' param_name = attr + '{}'.format('_{}'.format(param_suffix) if param_suffix else '') @@ -488,7 +487,7 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with } if 'None' in values: values.remove('None') - filter_string = "{} IS NULL".format(attr) + filter_string = "{}{} IS NULL".format('' if not field_prefix else field_prefix, attr) if len(values) > 0: if len(filter_string): @@ -498,15 +497,15 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with query_param['parameterType']['type'] = ('STRING' if re.compile(ur'[^0-9\.,]', re.UNICODE).search(values[0]) else 'INT64') query_param['parameterValue']['value'] = values[0] if query_param['parameterType']['type'] == 'STRING' and '%' in values[0]: - filter_string += "{} LIKE @{}".format(attr, param_name) + filter_string += "{}{} LIKE @{}".format('' if not field_prefix else field_prefix, attr, param_name) else: - filter_string += "{} = @{}".format(attr, param_name) + filter_string += "{}{} = @{}".format('' if not field_prefix else field_prefix, attr, param_name) else: # Array param query_param['parameterType']['type'] = "ARRAY" query_param['parameterValue'] = {'arrayValues': [{'value': x} for x in values]} query_param['parameterType']['arrayType'] = {'type': ('STRING' if re.compile(ur'[^0-9\.,]', re.UNICODE).search(values[0]) else 'INT64')} - filter_string += "{} IN UNNEST(@{})".format(attr, param_name) + filter_string += "{}{} IN UNNEST(@{})".format('' if not field_prefix else field_prefix, attr, param_name) if with_count_toggle: filter_string = "({}) OR @{}_filtering = 'not_filtering'".format(filter_string,param_name) From 86336c92a12f2111d27c695407adc66769bed39a Mon Sep 17 00:00:00 2001 From: s-paquette Date: Mon, 9 Jul 2018 15:39:40 -0700 Subject: [PATCH 43/76] -> Integration of filter count toggling into public data type counts for DICOM -> Moved count_files into new file_helpers module --- cohorts/file_helpers.py | 14 +++----- cohorts/metadata_counting.py | 69 ++++++++++++++++++++---------------- 2 files changed, 44 insertions(+), 39 deletions(-) diff --git a/cohorts/file_helpers.py b/cohorts/file_helpers.py index fa5eb6c4..3187bc16 100644 --- a/cohorts/file_helpers.py +++ b/cohorts/file_helpers.py @@ -159,15 +159,12 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse if do_filter_count: # Query the count start = time.time() - logger.debug("Query: {}".format(file_count_query.format(select_clause=file_list_query_formatted))) - if built_clause: - logger.debug("Params: {}".format(built_clause['parameters'])) results = BigQuerySupport.execute_query_and_fetch_results( file_count_query.format(select_clause=file_list_query_formatted), built_clause['parameters'] if built_clause else None ) stop = time.time() - logger.debug('[BENCHMARKING] Time to query BQ for dicom count: ' + (stop - start).__str__()) + logger.debug('[BENCHMARKING] Time to query BQ for dicom count: ' + str(stop - start)) for entry in results: total_file_count = int(entry['f'][0]['v']) cohort_programs = Cohort.objects.get(id=cohort_id).get_programs() @@ -181,10 +178,11 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse file_list_query.format( select_clause=file_list_query_formatted, order_clause=order_clause, 
limit_clause=limit_clause, offset_clause=offset_clause - ) + ), + built_clause['parameters'] if built_clause else None ) stop = time.time() - logger.debug('[BENCHMARKING] Time to query BQ for dicom data: ' + (stop - start).__str__()) + logger.debug('[BENCHMARKING] Time to query BQ for dicom data: ' + str(stop - start)) if len(results) > 0: for entry in results: file_list.append({ @@ -288,13 +286,11 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse query = file_list_query.format(select_clause=select_clause, order_clause=order_clause, limit_clause=limit_clause, offset_clause=offset_clause) if len(filelist_params) > 0: - logger.debug("query for filelist: {}".format(query)) - logger.debug("params: {}".format(str(filelist_params))) cursor.execute(query, filelist_params) else: cursor.execute(query) stop = time.time() - logger.info("[STATUS] Time to get file-list: {}s".format(str(stop - start))) + logger.info("[STATUS] Time to get filelist: {}s".format(str(stop - start))) counts = {} if do_filter_count: diff --git a/cohorts/metadata_counting.py b/cohorts/metadata_counting.py index 58d4b357..4e5d4240 100644 --- a/cohorts/metadata_counting.py +++ b/cohorts/metadata_counting.py @@ -160,7 +160,8 @@ def count_user_metadata(user, inc_filters=None, cohort_id=None): return user_data_counts except (Exception) as e: - logger.error(traceback.format_exc()) + logger.error("[ERROR] While counting user metadata:") + logger.exception(e) finally: if cursor: cursor.close() if db and db.open: db.close() @@ -169,8 +170,9 @@ def count_user_metadata(user, inc_filters=None, cohort_id=None): def count_public_data_type(user, data_query, inc_filters, program_list, filter_format=False, build='HG19', type=None): db = None cursor = None - counts = {} + filter_clauses = {} + built_clause = None QUERY_BASE = """ SELECT {attr}, COUNT(*) AS count @@ -178,7 +180,7 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f WHERE TRUE {where_clause} GROUP BY {attr}; """ - filter_clauses = {} + try: db = get_sql_connection() @@ -196,39 +198,48 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f case_barcode_condition = '' if not case_barcode else "AND LOWER(cs.case_barcode) like %s" # Make our where clauses - for filter in inc_filters: - for prog in program_list: - if not validate_filter_key(filter, prog.id, build): - raise Exception("Filters must be in valid JSON format and conform to metadata_data columns.") - filter_clauses[filter] = {'where_clause': None, 'parameters': None} - - subfilter = {} - subfilter[filter] = inc_filters[filter] - - built_clause = build_where_clause(subfilter, for_files=True) - filter_clauses[filter]['where_clause'] = built_clause['query_str'] - filter_clauses[filter]['parameters'] = built_clause['value_tuple'] + if type != 'dicom': + for filter in inc_filters: + for prog in program_list: + if not validate_filter_key(filter, prog.id, build): + raise Exception("Filters must be in valid JSON format and conform to metadata_data columns.") + filter_clauses[filter] = {'where_clause': None, 'parameters': None} + + subfilter = {} + subfilter[filter] = inc_filters[filter] + + built_clause = None + build_where_clause(subfilter, for_files=True) + filter_clauses[filter]['where_clause'] = built_clause['query_str'] + filter_clauses[filter]['parameters'] = built_clause['value_tuple'] + else: + if len(inc_filters): + built_clause = BigQuerySupport.build_bq_filter_and_params(inc_filters, with_count_toggle=True) for attr in 
metadata_data_attr: - paramter_tuple = () counts[attr] = {x: 0 for x in metadata_data_attr[attr]['values']} - if type == 'dicom': - query = """ - #standardSQL - {query} - """.format(query=query) + if type == 'dicom': where_clause = '' parameters = None - if len(inc_filters): - built_clause = BigQuerySupport.build_bq_filter_and_params(inc_filters) + count_params = None + if built_clause: where_clause = "AND ( {} )".format(built_clause['filter_string']) parameters = built_clause['parameters'] + count_params = built_clause['count_params'] - query = QUERY_BASE.format(data_query_clause=data_query, where_clause=where_clause) + query = """ + #standardSQL + {query} + """.format(query=QUERY_BASE.format(data_query_clause=data_query, where_clause=where_clause, attr=attr)) + if count_params and attr in count_params: + count_params[attr]['parameterValue']['value'] = 'not_filtering' results = BigQuerySupport.execute_query_and_fetch_results(query, parameters) + if count_params and attr in count_params: + count_params[attr]['parameterValue']['value'] = 'filtering' else: + paramter_tuple = () where_clause = "" if case_barcode: paramter_tuple += (case_barcode, ) @@ -242,7 +253,6 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f cursor.execute(query, paramter_tuple) results = cursor.fetchall() - for row in results: if type == 'dicom': val = row['f'][0]['v'] @@ -502,7 +512,7 @@ def count_public_metadata(user, cohort_id=None, inc_filters=None, program_id=Non results = BigQuerySupport.execute_query_and_fetch_results(query, params) stop = time.time() - logger.debug('[BENCHMARKING] Time to query BQ for mutation data: '+(stop - start).__str__()) + logger.debug('[BENCHMARKING] Time to query BQ for mutation data: '+str(stop - start)) if len(results) > 0: for barcode in results: @@ -606,7 +616,7 @@ def count_public_metadata(user, cohort_id=None, inc_filters=None, program_id=Non stop = time.time() - logger.debug('[BENCHMARKING] Time to create temporary filter/cohort tables in count_metadata: '+(stop - start).__str__()) + logger.debug('[BENCHMARKING] Time to create temporary filter/cohort tables in count_metadata: '+str(stop - start)) count_query_set = [] @@ -798,7 +808,7 @@ def count_public_metadata(user, cohort_id=None, inc_filters=None, program_id=Non item[type] = item[type][:-2] stop = time.time() - logger.debug('[BENCHMARKING] Time to query filter count set in metadata_counts:'+(stop - start).__str__()) + logger.debug('[BENCHMARKING] Time to query filter count set in metadata_counts:'+str(stop - start)) # query sample and case counts count_query = 'SELECT COUNT(DISTINCT %s) FROM %s' @@ -922,7 +932,7 @@ def public_metadata_counts(req_filters, cohort_id, user, program_id, limit=None, filters[key]['values'].append(value) except Exception as e: - logger.error(traceback.format_exc()) + logger.exception(e) raise Exception('Filters must be a valid JSON formatted object of filter sets, with value lists keyed on filter names.') start = time.time() @@ -993,7 +1003,6 @@ def user_metadata_counts(user, user_data_filters, cohort_id): except Exception, e: logger.error('[ERROR] Exception when counting user metadata: ') logger.exception(e) - logger.error(traceback.format_exc()) def validate_and_count_barcodes(barcodes, user_id): From c769fbd578506833dc514c55b934c076658be256 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Tue, 10 Jul 2018 12:30:41 -0700 Subject: [PATCH 44/76] Tweaked error handling --- accounts/dcf_support.py | 5 +++-- accounts/dcf_views.py | 6 +++++- 2 files changed, 8 
insertions(+), 3 deletions(-) diff --git a/accounts/dcf_support.py b/accounts/dcf_support.py index 43ef3cc1..597dd873 100755 --- a/accounts/dcf_support.py +++ b/accounts/dcf_support.py @@ -527,7 +527,7 @@ def dcf_call(full_url, user_id, mode='get', post_body=None, force_token=False): :raises RefreshTokenExpired: """ - dcf_token = get_stored_dcf_token(user_id) # Can raise a TokenFailure or RefreshTokenExpired + dcf_token = get_stored_dcf_token(user_id) expires_in = (dcf_token.expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() logger.info("[INFO] Token Expiration : {} seconds".format(expires_in)) @@ -617,6 +617,7 @@ def _access_token_storage(token_dict, cgc_uid): refresh token to get a new access key. :raises TokenFailure: + :raises InternalTokenError: :raises RefreshTokenExpired: """ @@ -638,7 +639,7 @@ def _access_token_storage(token_dict, cgc_uid): try: dcf_token = get_stored_dcf_token(cgc_uid) - except (TokenFailure, RefreshTokenExpired) as e: + except (TokenFailure, InternalTokenError, RefreshTokenExpired) as e: logger.error("[INFO] _access_token_storage aborted: {}".format(str(e))) raise e diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index c9866595..6fa48ca1 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -599,11 +599,15 @@ def dcf_link_extend(request): err_msg, returned_expiration_str, user_data_token_string = refresh_at_dcf(request.user.id) except TokenFailure: err_msg = "Your Data Commons Framework identity needs to be reestablished to complete this task." + except InternalTokenError: + err_msg = "There was an unexpected internal error {}. Please contact feedback@isb-cgc.org.".format("0081") except RefreshTokenExpired: err_msg = "Your login to the Data Commons Framework has expired. You will need to log in again." except DCFCommFailure: err_msg = comm_err_msg - except Exception: + except Exception as e: + logger.error("[ERROR]: Unexpected Exception {}".format(str(e))) + logger.exception(e) err_msg = "Unexpected problem." if err_msg: From abc5ad520be02c91a04f9ef4eecd36b58b484485 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Tue, 10 Jul 2018 13:54:38 -0700 Subject: [PATCH 45/76] Fix unbound local error --- accounts/dcf_support.py | 1 + 1 file changed, 1 insertion(+) diff --git a/accounts/dcf_support.py b/accounts/dcf_support.py index 597dd873..8ffcc598 100755 --- a/accounts/dcf_support.py +++ b/accounts/dcf_support.py @@ -356,6 +356,7 @@ def refresh_at_dcf(user_id): err_msg = None returned_expiration_str = None massaged_string = None + resp = None # # Call DCF to drop the linkage. Note that this will immediately remove them from controlled access. 
From a0543a15e6cd3eaad8b88f962413671131081858 Mon Sep 17 00:00:00 2001 From: elainelee Date: Tue, 10 Jul 2018 15:13:46 -0700 Subject: [PATCH 46/76] bug fix --- cohorts/file_helpers.py | 15 +++++------ cohorts/metadata_counting.py | 39 +++++++++++++++------------ google_helpers/bigquery/bq_support.py | 2 +- 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/cohorts/file_helpers.py b/cohorts/file_helpers.py index 3187bc16..57a2be23 100644 --- a/cohorts/file_helpers.py +++ b/cohorts/file_helpers.py @@ -47,15 +47,14 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse user_email = user.email user_id = user.id - resp = None + #resp = None db = None cursor = None - query_limit = limit - type_conditions = "" + #query_limit = limit limit_clause = "" offset_clause = "" - filter_counts = None + #filter_counts = None file_list = [] total_file_count = 0 @@ -68,10 +67,9 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse if 'case_barcode' in inc_filters: case_barcode = inc_filters['case_barcode'] del inc_filters['case_barcode'] - if case_barcode: - case_barcode_condition = "AND LOWER(cs.case_barcode) LIKE %s" - case_barcode = "%{}%".format(case_barcode) + case_barcode_condition = "AND LOWER(cs.case_barcode) LIKE LOWER(%s)" + case_barcode = ''.join(case_barcode) try: # Attempt to get the cohort perms - this will cause an excpetion if we don't have them @@ -79,7 +77,6 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse if type == 'dicom': - filter_counts = {} limit_clause = "" offset_clause = "" @@ -269,7 +266,7 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse cohort_id=cohort_id, metadata_table=program_data_table, filter_conditions='', - case_barcode_condition=case_barcode_condition) + case_barcode_condition='') first_program = False # if first_program is still true, we found no programs with data tables for this build diff --git a/cohorts/metadata_counting.py b/cohorts/metadata_counting.py index 4e5d4240..924b38a7 100644 --- a/cohorts/metadata_counting.py +++ b/cohorts/metadata_counting.py @@ -189,13 +189,18 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f metadata_data_attr = fetch_build_data_attr(build, type) case_barcode = None - + case_barcode_condition = "" + case_barcode_param = None # Pull out the case barcode filter, if there is one if 'case_barcode' in inc_filters: case_barcode = inc_filters['case_barcode'] del inc_filters['case_barcode'] - - case_barcode_condition = '' if not case_barcode else "AND LOWER(cs.case_barcode) like %s" + if type == 'dicom': + case_barcode_built_clause = BigQuerySupport.build_bq_filter_and_params({'case_barcode': case_barcode}) + case_barcode_param = case_barcode_built_clause['parameters'] + case_barcode_condition = 'AND ' + case_barcode_built_clause['filter_string'] + else: + case_barcode_condition = " AND ( LOWER (case_barcode) LIKE LOWER(%s) )" # Make our where clauses if type != 'dicom': @@ -208,8 +213,7 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f subfilter = {} subfilter[filter] = inc_filters[filter] - built_clause = None - build_where_clause(subfilter, for_files=True) + built_clause = build_where_clause(subfilter, for_files=True) filter_clauses[filter]['where_clause'] = built_clause['query_str'] filter_clauses[filter]['parameters'] = built_clause['value_tuple'] else: @@ -218,16 +222,17 @@ def count_public_data_type(user, data_query, inc_filters, 
program_list, filter_f for attr in metadata_data_attr: counts[attr] = {x: 0 for x in metadata_data_attr[attr]['values']} - if type == 'dicom': where_clause = '' - parameters = None + parameters = [] count_params = None + if case_barcode: + where_clause += case_barcode_condition + parameters.extend(case_barcode_param) if built_clause: - where_clause = "AND ( {} )".format(built_clause['filter_string']) - parameters = built_clause['parameters'] + where_clause += " AND ( {} )".format(built_clause['filter_string']) + parameters.extend(built_clause['parameters']) count_params = built_clause['count_params'] - query = """ #standardSQL {query} @@ -239,18 +244,18 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f if count_params and attr in count_params: count_params[attr]['parameterValue']['value'] = 'filtering' else: - paramter_tuple = () where_clause = "" - if case_barcode: - paramter_tuple += (case_barcode, ) filter_clause = ') AND ('.join([filter_clauses[x]['where_clause'] for x in filter_clauses if x != attr or (filter_format and attr == 'data_format')]) if len(filter_clause): where_clause = "AND ( {} )".format(filter_clause) - paramter_tuple += tuple(y for x in filter_clauses for y in filter_clauses[x]['parameters'] if + parameter_tuple = tuple(y for x in filter_clauses for y in filter_clauses[x]['parameters'] if x != attr or (filter_format and attr == 'data_format')) - - query = QUERY_BASE.format(data_query_clause=data_query, where_clause=where_clause, attr=attr, case_barcode_condition=case_barcode_condition) - cursor.execute(query, paramter_tuple) + if case_barcode: + where_clause += case_barcode_condition + case_barcode = "".join(case_barcode) + parameter_tuple += (case_barcode, ) + query = QUERY_BASE.format(data_query_clause=data_query, where_clause=where_clause, attr=attr) + cursor.execute(query, parameter_tuple) results = cursor.fetchall() for row in results: diff --git a/google_helpers/bigquery/bq_support.py b/google_helpers/bigquery/bq_support.py index cc56d733..1242f416 100644 --- a/google_helpers/bigquery/bq_support.py +++ b/google_helpers/bigquery/bq_support.py @@ -497,7 +497,7 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with query_param['parameterType']['type'] = ('STRING' if re.compile(ur'[^0-9\.,]', re.UNICODE).search(values[0]) else 'INT64') query_param['parameterValue']['value'] = values[0] if query_param['parameterType']['type'] == 'STRING' and '%' in values[0]: - filter_string += "{}{} LIKE @{}".format('' if not field_prefix else field_prefix, attr, param_name) + filter_string += "LOWER({}{}) LIKE LOWER(@{})".format('' if not field_prefix else field_prefix, attr, param_name) else: filter_string += "{}{} = @{}".format('' if not field_prefix else field_prefix, attr, param_name) else: From 907ec3d2b349344b654b9a635dfd13ca8481ef60 Mon Sep 17 00:00:00 2001 From: elainelee Date: Tue, 10 Jul 2018 16:21:25 -0700 Subject: [PATCH 47/76] code clean up --- cohorts/file_helpers.py | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/cohorts/file_helpers.py b/cohorts/file_helpers.py index 57a2be23..f82c599e 100644 --- a/cohorts/file_helpers.py +++ b/cohorts/file_helpers.py @@ -46,37 +46,18 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse user_email = user.email user_id = user.id - - #resp = None db = None cursor = None - #query_limit = limit limit_clause = "" offset_clause = "" - - #filter_counts = None file_list = [] total_file_count = 0 - 
case_barcode = None - case_barcode_condition = '' - - # DICOM uses BQ, and that WHERE clause builder can handle the LIKE clause, - # but the MySQL WHERE clause builder can't - if not type == 'dicom': - if 'case_barcode' in inc_filters: - case_barcode = inc_filters['case_barcode'] - del inc_filters['case_barcode'] - if case_barcode: - case_barcode_condition = "AND LOWER(cs.case_barcode) LIKE LOWER(%s)" - case_barcode = ''.join(case_barcode) - try: # Attempt to get the cohort perms - this will cause an excpetion if we don't have them Cohort_Perms.objects.get(cohort_id=cohort_id, user_id=user_id) if type == 'dicom': - limit_clause = "" offset_clause = "" @@ -192,6 +173,13 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse }) filter_counts = counts else: + case_barcode = None + case_barcode_condition = '' + if 'case_barcode' in inc_filters: + case_barcode = ''.join(inc_filters['case_barcode']) + del inc_filters['case_barcode'] + case_barcode_condition = " AND LOWER(cs.case_barcode) LIKE LOWER(%s)" + select_clause_base = """ SELECT md.sample_barcode, md.case_barcode, md.disease_code, md.file_name, md.file_name_key, md.index_file_name, md.access, md.acl, md.platform, md.data_type, md.data_category, @@ -271,7 +259,6 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse # if first_program is still true, we found no programs with data tables for this build if not first_program: - if limit > 0: limit_clause = ' LIMIT {}'.format(str(limit)) # Offset is only valid when there is a limit From 844d95126fd0a4f1edfdfbdf00b44ec4f1078542 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Tue, 10 Jul 2018 18:36:44 -0700 Subject: [PATCH 48/76] Adding legacy detection --- accounts/sa_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index ad63558b..34e57719 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -1417,6 +1417,7 @@ def get_nih_user_details(user_id, force_logout): user_details['dcf_comm_error'] = False user_details['link_mismatch'] = False user_details['data_sets_updated'] = False + user_details['legacy_linkage'] = False nih_users = NIH_User.objects.filter(user_id=user_id, linked=True) @@ -1425,7 +1426,11 @@ def get_nih_user_details(user_id, force_logout): match_state = _refresh_from_dcf(user_id, nih_user) if match_state == RefreshCode.NO_TOKEN: - user_details['NIH_username'] = None + if nih_user: + user_details['legacy_linkage'] = True + user_details['NIH_username'] = nih_user.NIH_username + else: + user_details['NIH_username'] = None return user_details elif match_state == RefreshCode.TOKEN_EXPIRED: user_details['refresh_required'] = True From e99dc7b9f97fc4a7c9c2ab06265c70462253f0fe Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Wed, 11 Jul 2018 13:22:55 -0700 Subject: [PATCH 49/76] Better ID reuse message --- accounts/sa_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index 34e57719..f8d6d7ed 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -855,9 +855,9 @@ def found_linking_problems(NIH_username, user_id, user_email, my_st_logger, resu if settings.DCF_TEST: user_message = "User {} is already linked to the eRA commons identity {}. 
" \ - "Please log out of the Data Commons now using the link below, then " \ - "click the link to disconnect from {} before trying to log in " \ - "using {}".format(user_email, existing_nih_user_name, existing_nih_user_name, NIH_username) + "You must now use the link below to first log out of the Data Commons. " \ + "Then, please have {} unlink from {} before trying this again." \ + .format(user_email, existing_nih_user_name, user_email, existing_nih_user_name) else: user_message = "User {} is already linked to the eRA commons identity {}. " \ "Please unlink these before authenticating with the eRA commons " \ From 972ec338fcce19febc6ac2615f61b543c636932e Mon Sep 17 00:00:00 2001 From: elainelee Date: Wed, 11 Jul 2018 15:07:45 -0700 Subject: [PATCH 50/76] bugfix: changed paramters=[] to parameters=None --- cohorts/metadata_counting.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cohorts/metadata_counting.py b/cohorts/metadata_counting.py index 924b38a7..53ad8b18 100644 --- a/cohorts/metadata_counting.py +++ b/cohorts/metadata_counting.py @@ -224,14 +224,17 @@ def count_public_data_type(user, data_query, inc_filters, program_list, filter_f counts[attr] = {x: 0 for x in metadata_data_attr[attr]['values']} if type == 'dicom': where_clause = '' - parameters = [] + parameters = None count_params = None if case_barcode: where_clause += case_barcode_condition - parameters.extend(case_barcode_param) + parameters = case_barcode_param if built_clause: where_clause += " AND ( {} )".format(built_clause['filter_string']) - parameters.extend(built_clause['parameters']) + if parameters: + parameters.extend(built_clause['parameters']) + else: + parameters = built_clause['parameters'] count_params = built_clause['count_params'] query = """ #standardSQL From 55b0d9dcbd24f98c1d93fa73664f048f00936862 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Wed, 11 Jul 2018 18:02:47 -0700 Subject: [PATCH 51/76] Dropped hyper-logging, cleaned error messages, handle session-stored data loss. --- accounts/dcf_views.py | 73 +++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 44 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 6fa48ca1..9afacf50 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -66,7 +66,6 @@ def oauth2_login(request): First step of OAuth2 login to DCF. Just build the URL that we send back to the browser in the refresh request """ try: - logger.info("[INFO] OAuth1 a") full_callback = request.build_absolute_uri(reverse('dcf_callback')) @@ -79,15 +78,14 @@ def oauth2_login(request): client_id, _ = get_secrets() - logger.info("[INFO] OAuth1 b") # Found that 'user' scope had to be included to be able to do the user query on callback, and the data scope # to do data queries. Starting to recognize a pattern here... oauth = OAuth2Session(client_id, redirect_uri=full_callback, scope=['openid', 'user', 'data']) authorization_url, state = oauth.authorization_url(DCF_AUTH_URL) - logger.info("[INFO] OAuth1 c") + # stash the state string in the session! request.session['dcfOAuth2State'] = state - logger.info("[INFO] OAuth1 d") + return HttpResponseRedirect(authorization_url) finally: @@ -123,11 +121,10 @@ def oauth2_callback(request): """ comm_err_msg = "There was a communications problem contacting Data Commons Framework." - internal_err_msg = "There was an internal error {} logging in. Please contact the ISB-CGC administrator." - dcf_err_msg = "DCF reported an error {} logging in. 
Please contact the ISB-CGC administrator." + internal_err_msg = "There was an internal error {} logging in. Please report this to feedback@isb-cgc.org." + dcf_err_msg = "DCF reported an error {} logging in. Please report this to feedback@isb-cgc.org." try: - logger.info("[INFO] OAuthCB a") full_callback = request.build_absolute_uri(reverse('dcf_callback')) # For future reference, this also worked, using underlying requests library: @@ -162,7 +159,6 @@ def oauth2_callback(request): # development: # - logger.info("[INFO] OAuthCB b") if settings.IS_DEV and full_callback.startswith('http://localhost'): os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1' @@ -170,14 +166,18 @@ def oauth2_callback(request): saved_state = request.session['dcfOAuth2State'] else: logger.error("[ERROR] Missing dcfOAuth2State during callback") - messages.error(request, internal_err_msg.format("001")) + # + # If the user hung out on the DCF login site for a long time, and finally got back to us, they would + # hit the login screen and then end up back here. The session would have expired, and so there would be + # no saved state available. So we should not send back a scary and cryptic "Internal Error", but a + # message that login could not be completed and they should try again. + # + messages.error(request, "Login could not be completed, possibly due to session expiration. Please try again.") return redirect(reverse('user_detail', args=[request.user.id])) client_id, client_secret = get_secrets() - logger.info("[INFO] OAuthCB c") # You MUST provide the callback *here* to get it into the fetch request dcf = OAuth2Session(client_id, state=saved_state, redirect_uri=full_callback) - logger.info("[INFO] OAuthCB c1") auth_response = request.build_absolute_uri(request.get_full_path()) # You MUST provide the client_id *here* (again!) in order to get this to do basic auth! DCF will not authorize @@ -198,7 +198,6 @@ def oauth2_callback(request): finally: client_secret = None # clear this in case we are in Debug mode to keep this out of the browser - logger.info("[INFO] OAuthCB d") if token_data['token_type'] != 'Bearer': logger.error("[ERROR] Token type returned was not 'Bearer'") messages.error(request, internal_err_msg.format("002")) @@ -214,7 +213,7 @@ def oauth2_callback(request): my_jwt = jwt.PyJWT() my_jwt.register_algorithm('RS256', RSAAlgorithm(RSAAlgorithm.SHA256)) - logger.info("[INFO] OAuthCB e") + # # DCF's key endpoint provides a list of keys they use. Right now, only one, but to future-proof, we want # to choose the right one from the list. But that means we need to parse the first element of the JWT tuple @@ -233,7 +232,7 @@ def oauth2_callback(request): # # Get the key list from the endpoint and choose which one was used in the JWT: # - logger.info("[INFO] OAuthCB f") + try: resp = dcf.get(settings.DCF_KEY_URL) except Exception as e: @@ -257,7 +256,7 @@ def oauth2_callback(request): # # Decode the JWT! # - logger.info("[INFO] OAuthCB g") + try: alg_list = ['RS256'] decoded_jwt_id = my_jwt.decode(token_data['id_token'], key=use_key, algorithms=alg_list, @@ -289,7 +288,6 @@ def oauth2_callback(request): # u'pur': u'id', (The "purpose" of the token. This is an ID. 
Refresh tokens say "refresh") # u'sub': u'integer user key'} - logger.info("[INFO] OAuthCB h") dcf_user_id = decoded_jwt_id['sub'] # @@ -303,7 +301,6 @@ def oauth2_callback(request): nih_from_dcf = get_nih_id_from_user_dict(user_data_dict) google_link = get_google_link_from_user_dict(user_data_dict) - logger.info("[INFO] OAuthCB i") # We now have the NIH User ID back from DCF; we also might now know the Google ID they have linked to previously # (it comes back in the user_id). Note that this routine is going to get called every 30 days or so when we @@ -328,8 +325,6 @@ def oauth2_callback(request): request.session['dcfForcedLogout'] = nih_from_dcf return redirect(reverse('user_detail', args=[request.user.id])) - logger.info("[INFO] OAuthCB j") - # # We now are almost ready to stash the token. One field in the table is the Google ID. First time # through, it will be blank. Otherwise, it either matches our login ID, or might be some rando @@ -359,7 +354,6 @@ def oauth2_callback(request): # flow diverges. For the GET, we wrap things up in the callback. For the PATCH, we wrap things up immediately: # - logger.info("[INFO] OAuthCB k") if google_link: # @@ -373,7 +367,6 @@ def oauth2_callback(request): # req_user = User.objects.get(id=request.user.id) - logger.info("[INFO] OAuthCB l") if google_link != req_user.email: try: unlink_at_dcf(request.user.id, True) # True means after unlinking, we call DCF again to update our link state @@ -398,8 +391,6 @@ def oauth2_callback(request): # The link matches. So we use PATCH. Any problems encountered and we return error message to user: # - logger.info("[INFO] OAuthCB m") - try: err_msg, returned_expiration_str, _ = refresh_at_dcf(request.user.id) except TokenFailure: @@ -420,10 +411,8 @@ def oauth2_callback(request): # to finish the link # - logger.info("[INFO] OAuthCB n") use_expiration_time = calc_expiration_time(returned_expiration_str) - logger.info("[INFO] OAuthCB o") # Don't hit DCF again, we just did it (thus False): warning = _finish_the_link(request.user.id, req_user.email, use_expiration_time, st_logger, False) messages.warning(request, warning) @@ -435,7 +424,7 @@ def oauth2_callback(request): # User has not yet been linked, so start the redirect flow with the user and DCF that will result # in us getting the callback below to finish the process: # - logger.info("[INFO] OAuthCB p") + link_callback = request.build_absolute_uri(reverse('dcf_link_callback')) callback = '{}?redirect={}'.format(DCF_GOOGLE_URL, link_callback) @@ -451,8 +440,8 @@ def dcf_link_callback(request): conditions. """ - dcf_err_msg = "DCF reported an error {} logging in. Please contact the ISB-CGC administrator." - internal_err_msg = "There was an internal error {} logging in. Please contact the ISB-CGC administrator." + dcf_err_msg = "DCF reported an error {} logging in. Please report this to feedback@isb-cgc.org." + internal_err_msg = "There was an internal error {} logging in. Please report this to feedback@isb-cgc.org." comm_err_msg = "There was a communications problem contacting Data Commons Framework." # @@ -568,10 +557,10 @@ def dcf_link_callback(request): # Don't hit DCF again, we just did it (thus False): warning = _finish_the_link(request.user.id, google_link, use_expiration_time, st_logger, False) except TokenFailure: - messages.error(request, "There was an internal error {} logging in. Please contact the ISB-CGC administrator.".format("0067")) + messages.error(request, "There was an internal error {} logging in. 
Please report this to feedback@isb-cgc.org.".format("0067")) return redirect(reverse('user_detail', args=[request.user.id])) except RefreshTokenExpired: - messages.error(request, "There was an internal error {} logging in. Please contact the ISB-CGC administrator.".format("0068")) + messages.error(request, "There was an internal error {} logging in. Please report this to feedback@isb-cgc.org.".format("0068")) return redirect(reverse('user_detail', args=[request.user.id])) if warning: @@ -585,7 +574,7 @@ def dcf_link_extend(request): Put a user's GoogleID in the ACL groups for 24 (more) hours: """ - comm_err_msg = "There was a communications problem contacting Data Commons Framework." + comm_err_msg = "There was a communications problem contacting the Data Commons Framework." # # If user has disconnected their ID in another window before clicking this link, they would easily get a @@ -660,7 +649,7 @@ def _finish_the_link(user_id, user_email, expiration_time, st_logger, refresh_fi if dcf_token.google_id is not None and dcf_token.google_id != user_email: return 'Unexpected internal error detected during linking: email/ID mismatch. ' \ - 'Please report this to the ISB-CGC administrator' + 'Please report this to feedback@isb-cgc.org' dcf_token.google_id = user_email if refresh_first: @@ -749,9 +738,9 @@ def dcf_disconnect_user(request): try: dcf_token = get_stored_dcf_token(request.user.id) except TokenFailure: - err_msg = "There was an internal error {} logging in. Please contact the ISB-CGC administrator.".format("0069") + err_msg = "There was an internal error {} logging in. Please report this to feedback@isb-cgc.org.".format("0069") except InternalTokenError: - err_msg = "There was an internal error {} logging in. Please contact the ISB-CGC administrator.".format("0070") + err_msg = "There was an internal error {} logging in. Please report this to feedback@isb-cgc.org.".format("0070") except RefreshTokenExpired: err_msg = "You will need to first login to the Data Commons again to disconnect your Google ID" @@ -768,11 +757,11 @@ def dcf_disconnect_user(request): try: unlink_at_dcf(request.user.id, False) # Don't refresh, we are about to drop the record... except TokenFailure: - err_msg = "There was an internal error {} logging in. Please contact the ISB-CGC administrator.".format("0071") + err_msg = "There was an internal error {} logging in. Please report this to feedback@isb-cgc.org.".format("0071") except InternalTokenError: - err_msg = "There was an internal error {} logging in. Please contact the ISB-CGC administrator.".format("0072") + err_msg = "There was an internal error {} logging in. Please report this to feedback@isb-cgc.org.".format("0072") except RefreshTokenExpired: - err_msg = "There was an internal error {} logging in. Please contact the ISB-CGC administrator.".format("0073") + err_msg = "There was an internal error {} logging in. Please report this to feedback@isb-cgc.org.".format("0073") except DCFCommFailure: err_msg = "There was a communications problem contacting Data Commons Framework." 
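The handlers above consistently translate the dcf_support exceptions (TokenFailure, InternalTokenError, RefreshTokenExpired, DCFCommFailure) into user-facing messages instead of letting them escape the view. A minimal, self-contained sketch of that dispatch pattern follows; the stub exception classes stand in for the real ones defined in accounts/dcf_support.py, and the helper name and exact message text are illustrative, drawn from the wording used in this patch.

# Stubs so the sketch runs on its own; the real classes live in accounts/dcf_support.py.
class TokenFailure(Exception): pass
class InternalTokenError(Exception): pass
class RefreshTokenExpired(Exception): pass
class DCFCommFailure(Exception): pass

def unlink_error_message(unlink_call):
    # Run an unlink callable and translate DCF exceptions into user-facing text.
    try:
        unlink_call()
        return None  # No error; the disconnect flow continues.
    except TokenFailure:
        return "There was an internal error (0071) unlinking. Please report this to feedback@isb-cgc.org."
    except InternalTokenError:
        return "There was an internal error (0072) unlinking. Please report this to feedback@isb-cgc.org."
    except RefreshTokenExpired:
        return "You will need to first login to the Data Commons again to disconnect your Google ID."
    except DCFCommFailure:
        return "There was a communications problem contacting Data Commons Framework."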
@@ -793,18 +782,15 @@ def dcf_disconnect_user(request): data = { 'token': dcf_token.refresh_token } - logger.info("[INFO] DDU B") auth = requests.auth.HTTPBasicAuth(client_id, client_secret) resp = requests.request('POST', DCF_REVOKE_URL, data=data, auth=auth) client_id = None client_secret = None - logger.info("[INFO] DDU C") - if resp.status_code != 200 and resp.status_code != 204: logger.error(request, '[ERROR] Token revocation problem: {} : {}'.format(resp.status_code, resp.text)) - messages.warning(request, "Problems encountered revoking access token at Data Commons. Please contact ISB-CGC Administrator") + messages.warning(request, "Problems encountered revoking access token at Data Commons. Please report this to feedback@isb-cgc.org") # # Now we do the internal unlinking, which includes detach the user in our NIH tables, and detach the user from data permissions. @@ -816,7 +802,7 @@ def dcf_disconnect_user(request): # Token problem? Don't care; it is about to be blown away pass except (InternalTokenError, Exception): - messages.warning(request, "Internal problem encountered disconnecting from Data Commons. Please contact ISB-CGC Administrator") + messages.warning(request, "Internal problem encountered disconnecting from Data Commons. Please report this to feedback@isb-cgc.org") return redirect(reverse('user_detail', args=[request.user.id])) # @@ -830,7 +816,7 @@ def dcf_disconnect_user(request): except TokenFailure: dcf_token = None except InternalTokenError: - messages.warning(request, "Internal problem encountered disconnecting from Data Commons. Please contact ISB-CGC Administrator") + messages.warning(request, "Internal problem encountered disconnecting from Data Commons. Please report this to feedback@isb-cgc.org") return redirect(reverse('user_detail', args=[request.user.id])) except RefreshTokenExpired as e: dcf_token = e.token @@ -844,7 +830,6 @@ def dcf_disconnect_user(request): # logout_callback = request.build_absolute_uri(reverse('user_detail', args=[request.user.id])) - logger.info("[INFO] DDU D") callback = '{}?next={}'.format(DCF_LOGOUT_URL, logout_callback) return HttpResponseRedirect(callback) From a0c4195766020c4ad6a21fbad3d6c0245f67049e Mon Sep 17 00:00:00 2001 From: elainelee Date: Fri, 13 Jul 2018 14:15:19 -0700 Subject: [PATCH 52/76] bugfix: BQ & GCS export to pass case barcode filter --- cohorts/views.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cohorts/views.py b/cohorts/views.py index ac5a5a22..8054905b 100755 --- a/cohorts/views.py +++ b/cohorts/views.py @@ -2171,9 +2171,13 @@ def export_data(request, cohort_id=0, export_type=None, export_sub_type=None): cohort_programs = Cohort.objects.get(id=cohort_id).get_programs() union_queries = [] inc_filters = json.loads(request.POST.get('filters', '{}')) + if inc_filters.get('case_barcode'): + case_barcode = inc_filters.get('case_barcode') + inc_filters['case_barcode'] = ["%{}%".format(case_barcode),] + filter_params = None if len(inc_filters): - filter_and_params = BigQuerySupport.build_bq_filter_and_params(inc_filters) + filter_and_params = BigQuerySupport.build_bq_filter_and_params(inc_filters, field_prefix='md.' 
if export_type == 'file_manifest' else None) filter_params = filter_and_params['parameters'] filter_conditions = "AND {}".format(filter_and_params['filter_string']) @@ -2233,9 +2237,6 @@ def export_data(request, cohort_id=0, export_type=None, export_sub_type=None): tz=settings.TIME_ZONE ) ) - - query_string = "" - if len(union_queries) > 1: query_string = ") UNION ALL (".join(union_queries) query_string = '(' + query_string + ')' @@ -2298,8 +2299,6 @@ def export_data(request, cohort_id=0, export_type=None, export_sub_type=None): ) ) - query_string = "" - if len(union_queries) > 1: query_string = ") UNION ALL (".join(union_queries) query_string = '(' + query_string + ')' From 12e328553601d2c8dbc8aefedd1309dcd63336fd Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Fri, 13 Jul 2018 17:11:19 -0700 Subject: [PATCH 53/76] Some message clean up, plus better handling of DCF linking refusal case. --- accounts/dcf_views.py | 90 +++++++++++++++++++++++++++++++++---------- accounts/sa_utils.py | 12 ++++-- 2 files changed, 79 insertions(+), 23 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 9afacf50..d57507a8 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -33,7 +33,7 @@ from sa_utils import found_linking_problems, DemoLoginResults, \ handle_user_db_update_for_dcf_linking, \ - unlink_account_in_db_for_dcf, refresh_user_projects + unlink_account_in_db_for_dcf, refresh_user_projects, have_linked_user from dcf_support import get_stored_dcf_token, \ TokenFailure, RefreshTokenExpired, InternalTokenError, DCFCommFailure, \ @@ -449,10 +449,19 @@ def dcf_link_callback(request): # any random error that is reported back to us. # error = request.GET.get('error', None) + defer_error_return = False; if error: error_description = request.GET.get('error_description', "") if error == 'g_acnt_link_error': + # OK, it turns out that it is not so hard to get the error from DCF that "User already has a + # linked Google account". If a user has gotten themselves to DCF's sign-in via Google page in *two + # separate browsers*, then logged in on one, and then the other, the second browser will trigger that + # message. + # If we get the message, we should tell the user what email they are registered with! For that, we will + # need to get the token, which we do below in the regular flow. So defer the return in this case... message = 'Issue with the linkage between user and their Google account' + if error_description == "User already has a linked Google account": + defer_error_return = True elif error == 'g_acnt_auth_failure': message = "Issue with Oauth2 flow to AuthN user's Google account" elif error == 'g_acnt_access_error': @@ -462,19 +471,9 @@ def dcf_link_callback(request): logger.error("[ERROR]: DCF reports an error ({}, {}, {}) trying to link Google ID".format(error, message, error_description)) - messages.error(request, dcf_err_msg.format("D002")) - return redirect(reverse('user_detail', args=[request.user.id])) - - # - # The callback provides us with both the link expiration and the user ID that was linked. BUT THIS IS - # COMING FROM THE USER, IS NOT SIGNED, AND SO CANNOT BE TRUSTED! Pull them out and verify them. 
If things - # are not too crazy, we accept the value we are sent: - # - - returned_expiration_str = request.GET.get('exp', None) - returned_google_link = request.GET.get('linked_email', None) - - use_expiration_time = calc_expiration_time(returned_expiration_str) + if not defer_error_return: + messages.error(request, dcf_err_msg.format("D002")) + return redirect(reverse('user_detail', args=[request.user.id])) # # We will NEVER accept a Google ID that does not match. At this point, we need to wrestle @@ -503,9 +502,54 @@ def dcf_link_callback(request): the_user_token_dict = json_loads(the_user_token_string) the_user_dict = the_user_token_dict['context']['user'] - # Just parses the google link out of the recently return token. + # Just parses the google link out of the recently return token: + google_link = get_google_link_from_user_dict(the_user_dict) + # need this in a couple of places, so do it now: + + req_user = User.objects.get(id=request.user.id) + + # + # OK, we just got back what the DCF thinks the user's google linking state is. If they reported above that + # the user is already linked, use that info to fully inform the user about what is going on. If they report + # that there is *no link*, but we got back an error, then we need to let the user know this. This case arose in + # testing, based on DCF making a decision based on stale cookie data in a second browser. + # + + if defer_error_return: + if google_link is None: + # DCF is confused. We have seen this case. + err_msg = "Data Commons experienced an internal error. Please use another browser to Associate with eRA Commons Account." + messages.error(request, err_msg) + else: + # User had two browsers open and tried to login on both. If the user ID in the token matches + # what we think it should be, just post this fact for user to see. + if have_linked_user(request.user.id) and google_link == req_user.email: + warn_msg = "Data Commons reported that you were already linked with Google ID {}." + messages.warning(request, warn_msg) + + else: + # DCF says we are linked already, but we do not have a linked user, or the email we have is + # not matching what DCF thinks it is. This is so messed up that we should tell the user there + # was a problem, and let the user_details page try to figure out what to tell the user about + # the inconsistent state. + err_msg = "Data Commons did not accept linking request." + messages.error(request, err_msg) + + return redirect(reverse('user_detail', args=[request.user.id])) + + # + # The callback provides us with both the link expiration and the user ID that was linked. BUT THIS IS + # COMING FROM THE USER, IS NOT SIGNED, AND SO CANNOT BE TRUSTED! Pull them out and verify them. If things + # are not too crazy, we accept the value we are sent: + # + + returned_expiration_str = request.GET.get('exp', None) + returned_google_link = request.GET.get('linked_email', None) + + use_expiration_time = calc_expiration_time(returned_expiration_str) + if returned_google_link: if google_link != returned_google_link: logger.error("[ERROR]: DCF RETURNED CONFLICTING GOOGLE LINK {} VERSUS {}".format(returned_google_link, @@ -519,7 +563,7 @@ def dcf_link_callback(request): messages.error(request, dcf_err_msg.format("D003")) return redirect(reverse('user_detail', args=[request.user.id])) - req_user = User.objects.get(id=request.user.id) + # # No match? Not acceptable. Send user back to details page. The empty google ID in our table will # mean the page shows an option to try again. 
We need to @@ -583,15 +627,17 @@ def dcf_link_extend(request): returned_expiration_str = None user_data_token_string = None + err_msg = None + warn_msg = None try: err_msg, returned_expiration_str, user_data_token_string = refresh_at_dcf(request.user.id) except TokenFailure: - err_msg = "Your Data Commons Framework identity needs to be reestablished to complete this task." + warn_msg = "Your Data Commons Framework identity needs to be reestablished to complete this task." except InternalTokenError: err_msg = "There was an unexpected internal error {}. Please contact feedback@isb-cgc.org.".format("0081") except RefreshTokenExpired: - err_msg = "Your login to the Data Commons Framework has expired. You will need to log in again." + warn_msg = "Your login to the Data Commons Framework has expired. You will need to log in again." except DCFCommFailure: err_msg = comm_err_msg except Exception as e: @@ -602,6 +648,9 @@ def dcf_link_extend(request): if err_msg: messages.error(request, err_msg) return redirect(reverse('user_detail', args=[request.user.id])) + elif warn_msg: + messages.warning(request, warn_msg) + return redirect(reverse('user_detail', args=[request.user.id])) use_expiration_time = calc_expiration_time(returned_expiration_str) user_data_dict = user_data_token_to_user_dict(user_data_token_string) @@ -738,9 +787,10 @@ def dcf_disconnect_user(request): try: dcf_token = get_stored_dcf_token(request.user.id) except TokenFailure: - err_msg = "There was an internal error {} logging in. Please report this to feedback@isb-cgc.org.".format("0069") + # No token? We are done! + return redirect(reverse('user_detail', args=[request.user.id])) except InternalTokenError: - err_msg = "There was an internal error {} logging in. Please report this to feedback@isb-cgc.org.".format("0070") + err_msg = "There was an internal error {} unlinking. Please report this to feedback@isb-cgc.org.".format("0070") except RefreshTokenExpired: err_msg = "You will need to first login to the Data Commons again to disconnect your Google ID" diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index f8d6d7ed..46aaf3d5 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -1383,11 +1383,17 @@ def _refresh_from_dcf(user_id, nih_user): return RefreshCode.ALL_MATCHES +def have_linked_user(user_id): + """ + Answers if the user is linked + """ + nih_users = NIH_User.objects.filter(user_id=user_id, linked=True) + return len(nih_users) == 1 + + def get_nih_user_details(user_id, force_logout): """ - :param user_id: - :param force_logout: - :return: + When used with DCF, this compares DCF state with our state and acts accordingly. """ user_details = {} From 3e0dfafef335d61e0218b5be2f608176eb26aced Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Mon, 16 Jul 2018 15:49:56 -0700 Subject: [PATCH 54/76] Handling of GoogleID mismatch and None values on link callback. --- accounts/dcf_views.py | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index d57507a8..da9b0f79 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -551,22 +551,58 @@ def dcf_link_callback(request): use_expiration_time = calc_expiration_time(returned_expiration_str) if returned_google_link: + # + # OK, two (realistic) possible cases here if (google_link != returned_google_link). Note that having + # returned_google_link be None is not realistic unless there is a fundamental DCF bug. 
+ # Generally, we do not care what the returned_google_link is, since we basically use this as an event + # to go refresh our token and get the latest info. ALTHOUGH at the moment (7/16/18), we have no other way + # to get the link expiration time. + # + # Case 1 is where google_link and returned_google_link are both not None, and are different. THIS SHOULD NOT + # (IN GENERAL) BE VERY LIKELY. Because DCF forbids overwriting an existing link with a new link value. BUT we have + # dueling browser login test that show it happening, possibly because DCF is only rejecting second link attempts + # early in the flow, but is not checking for/requiring an existing NULL value while writing to their DB. So + # if that was caught, we would not expect not-None-but-unequal. (Note that it *would be* possible if there was + # a significant delay receiving/processing this linking callback, and another actor had successfully + # unlinked/relinked during that delay). + # + # Case 2 is where the returned link has a value, but when we check, the freshest token from DCF says they + # are unlinked. This could happen if there was a race and an unlinking request to DCF got processed before + # this link callback got processed. + # + # Regardless, we need to use the user info just obtained from get_user_data_token_string() as definitive + # in deciding what to do here. + # + if google_link != returned_google_link: logger.error("[ERROR]: DCF RETURNED CONFLICTING GOOGLE LINK {} VERSUS {}".format(returned_google_link, google_link)) + if google_link is not None: + # + # Report the difference, but do not do anything. User details page should process any discrepancies: + # + messages.error(request, "Data Commons reports that you have already linked with " \ + "Google ID {}. ".format(google_link)) + return redirect(reverse('user_detail', args=[request.user.id])) + else: logger.info("DCF provided google link was consistent") else: + # + # If the DCF callback does not provide a Google ID, we will log it, but not bug the user. We will just drag + # the data out of the token: + # logger.error("No google link provided by DCF") if google_link is None: - messages.error(request, dcf_err_msg.format("D003")) + # + # If we are now seeing that we are NOT linked anymore, we tell the user, and bag it. + messages.error(request, "Data Commons reports that you have just unlinked your Google ID.") return redirect(reverse('user_detail', args=[request.user.id])) - # # No match? Not acceptable. Send user back to details page. The empty google ID in our table will - # mean the page shows an option to try again. We need to + # mean the page shows an option to try again. # if google_link != req_user.email: From 9d44cd0bbbf5862733e85b067cb808a113e9d292 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Tue, 17 Jul 2018 11:16:27 -0700 Subject: [PATCH 55/76] Dropped data scope, fixed error message check. --- accounts/dcf_views.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index da9b0f79..b8e8a719 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -80,7 +80,7 @@ def oauth2_login(request): # Found that 'user' scope had to be included to be able to do the user query on callback, and the data scope # to do data queries. Starting to recognize a pattern here... 
- oauth = OAuth2Session(client_id, redirect_uri=full_callback, scope=['openid', 'user', 'data']) + oauth = OAuth2Session(client_id, redirect_uri=full_callback, scope=['openid', 'user']) authorization_url, state = oauth.authorization_url(DCF_AUTH_URL) # stash the state string in the session! @@ -260,7 +260,7 @@ def oauth2_callback(request): try: alg_list = ['RS256'] decoded_jwt_id = my_jwt.decode(token_data['id_token'], key=use_key, algorithms=alg_list, - audience=['openid', 'user', 'data', client_id]) + audience=['openid', 'user', client_id]) except Exception as e: logger.error("[ERROR] Decoding JWT failure") logger.exception(e) @@ -449,7 +449,7 @@ def dcf_link_callback(request): # any random error that is reported back to us. # error = request.GET.get('error', None) - defer_error_return = False; + defer_error_return = False if error: error_description = request.GET.get('error_description', "") if error == 'g_acnt_link_error': @@ -460,7 +460,7 @@ def dcf_link_callback(request): # If we get the message, we should tell the user what email they are registered with! For that, we will # need to get the token, which we do below in the regular flow. So defer the return in this case... message = 'Issue with the linkage between user and their Google account' - if error_description == "User already has a linked Google account": + if error_description == "User already has a linked Google account.": defer_error_return = True elif error == 'g_acnt_auth_failure': message = "Issue with Oauth2 flow to AuthN user's Google account" From 53e0b71529fb31514cc86f46e6a57ab9b5772916 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Tue, 17 Jul 2018 11:56:46 -0700 Subject: [PATCH 56/76] Logging for disconnect race condition --- accounts/dcf_views.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index b8e8a719..c883f5f4 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -606,6 +606,7 @@ def dcf_link_callback(request): # if google_link != req_user.email: + logger.info("Now calling DCF to disconnect {} Google ID; we needed {} ".format(google_link, req_user.email)) err_msg = None try: unlink_at_dcf(request.user.id, True) # True means saved token is now updated with unlinked state @@ -622,6 +623,7 @@ def dcf_link_callback(request): messages.error(request, err_msg) return redirect(reverse('user_detail', args=[request.user.id])) + logger.info("DCF has returned following disconnect request: {} should be dropped for {} ".format(google_link, req_user.email)) message = "You must use your ISB-CGC login email ({}) to link with the DCF instead of {}".format( req_user.email, google_link) messages.warning(request, message) From f1b6426bc591e524904be41d9f58ed74511ce882 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Tue, 17 Jul 2018 18:22:22 -0700 Subject: [PATCH 57/76] Another pass for error messages for race conditions. --- accounts/dcf_views.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index c883f5f4..10779828 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -519,8 +519,9 @@ def dcf_link_callback(request): if defer_error_return: if google_link is None: - # DCF is confused. We have seen this case. - err_msg = "Data Commons experienced an internal error. Please use another browser to Associate with eRA Commons Account." 
+ # DCF is confused (stale cookie) OR user tried to connect with a non-CGC Google ID in a login race condition + # with this login, and we unlinked them before this got processed. The user is currently unlinked. + err_msg = "Data Commons did not accept linking request. Please use a single browser for linking/unlinking requests." messages.error(request, err_msg) else: # User had two browsers open and tried to login on both. If the user ID in the token matches @@ -597,7 +598,9 @@ def dcf_link_callback(request): if google_link is None: # # If we are now seeing that we are NOT linked anymore, we tell the user, and bag it. - messages.error(request, "Data Commons reports that you have just unlinked your Google ID.") + # + messages.error(request, "Data Commons reports that you do not yet have a valid linked Google ID. " + "Please use a single brower for linking/unlinking requests.") return redirect(reverse('user_detail', args=[request.user.id])) # From 384e2b1c2e7bf947a1266599244b9e9fb9351094 Mon Sep 17 00:00:00 2001 From: s-paquette Date: Wed, 18 Jul 2018 11:18:57 -0700 Subject: [PATCH 58/76] -> BETWEEN support --- google_helpers/bigquery/bq_support.py | 68 +++++++++++++++++++++------ 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/google_helpers/bigquery/bq_support.py b/google_helpers/bigquery/bq_support.py index 1242f416..c94b4ec7 100644 --- a/google_helpers/bigquery/bq_support.py +++ b/google_helpers/bigquery/bq_support.py @@ -20,6 +20,7 @@ import re from time import sleep from uuid import uuid4 +import copy from django.conf import settings from google_helpers.bigquery.service import get_bigquery_service from abstract import BigQueryABC @@ -281,7 +282,6 @@ def insert_bq_query_job(self, query,parameters=None, write_disposition='WRITE_EM projectId=self.executing_project, body=job_desc).execute(num_retries=5) - # Runs a basic, optionally parameterized query # If self.project_id, self.dataset_id, and self.table_id are set they # will be used as the destination table for the query @@ -400,8 +400,13 @@ def get_job_results(cls, job_reference): # Breaks out ' IS NULL' # 2+ values are converted to IN (,...) # Filters must already be pre-bucketed or formatted - # TODO: add support for BETWEEN - # TODO: add support for <>= + # Use of LIKE is detected based on single-length value array and use of % in the value string + # Support special 'mutation' filter category + # Support for Greater/Less than (or equal to) via [gl]t[e]{0,1} in attr name, + # eg. {"age_at_diagnosis_gte": [50,]} + # Support for BETWEEN via _btw in attr name, eg. 
("wbc_at_diagnosis_btw": [800,1200]} + # + # TODO: add support for DATES @staticmethod def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with_count_toggle=False, field_prefix=None): result = { @@ -425,6 +430,8 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with other_filters[attr] = filters[attr] mut_filtr_count = 1 + + # 'Mutation' filters, special category for MUT: type filters for attr, values in mutation_filters.items(): gene = attr.split(':')[2] type = attr.split(':')[-1] @@ -473,17 +480,14 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with logger.debug("other filters: {}".format(str(other_filters))) + # Standard query filters for attr, values in other_filters.items(): filter_string = '' param_name = attr + '{}'.format('_{}'.format(param_suffix) if param_suffix else '') query_param = { 'name': param_name, - 'parameterType': { - - }, - 'parameterValue': { - - } + 'parameterType': {}, + 'parameterValue': {} } if 'None' in values: values.remove('None') @@ -496,10 +500,42 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with # Scalar param query_param['parameterType']['type'] = ('STRING' if re.compile(ur'[^0-9\.,]', re.UNICODE).search(values[0]) else 'INT64') query_param['parameterValue']['value'] = values[0] - if query_param['parameterType']['type'] == 'STRING' and '%' in values[0]: - filter_string += "LOWER({}{}) LIKE LOWER(@{})".format('' if not field_prefix else field_prefix, attr, param_name) - else: - filter_string += "{}{} = @{}".format('' if not field_prefix else field_prefix, attr, param_name) + if query_param['parameterType']['type'] == 'STRING': + if '%' in values[0]: + filter_string += "LOWER({}{}) LIKE LOWER(@{})".format('' if not field_prefix else field_prefix, attr, param_name) + else: + filter_string += "{}{} = @{}".format('' if not field_prefix else field_prefix, attr, + param_name) + elif query_param['parameterType']['type'] == 'INT64': + if attr.endsWith('_gt') or attr.endsWith('_gte'): + filter_string += "{}{} >{} @{}".format( + '' if not field_prefix else field_prefix, attr[:attr.rfind('_')], + '=' if attr.endsWith('_gte') else '', + param_name + ) + elif attr.endsWith('_lt') or attr.endsWith('_lte'): + filter_string += "{}{} <{} @{}".format( + '' if not field_prefix else field_prefix, attr[:attr.rfind('_')], + '=' if attr.endsWith('_lte') else '', + param_name + ) + elif len(values) == 2 and attr.endsWith('_btw'): + query_param['parameterType']['type'] = ('STRING' if re.compile(ur'[^0-9\.,]', re.UNICODE).search(values[0]) else 'INT64') + param_name_1 = param_name + '_btw_1' + param_name_2 = param_name + '_btw_2' + filter_string += "{}{} BETWEEN @{} AND @{}".format( + '' if not field_prefix else field_prefix, attr[:attr.rfind('_')], + param_name_1, + param_name_2 + ) + query_param_1 = query_param + query_param_2 = copy.deepcopy(query_param) + query_param = [query_param_1, query_param_2, ] + query_param_1['name'] = query_param_1 + query_param_1['parameterValue']['value'] = values[0] + query_param_2['name'] = query_param_2 + query_param_2['parameterValue']['value'] = values[1] + else: # Array param query_param['parameterType']['type'] = "ARRAY" @@ -521,7 +557,11 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with result['parameters'].append(result['count_params'][param_name]) filter_set.append('({})'.format(filter_string)) - result['parameters'].append(query_param) + + if type(query_param) is list: + 
result['parameters'].extend(query_param) + else: + result['parameters'].append(query_param) result['filter_string'] = " {} ".format(comb_with).join(filter_set) From 3a5ccee8656e5b51762354cad415cddf99923c80 Mon Sep 17 00:00:00 2001 From: elainelee Date: Wed, 18 Jul 2018 14:03:01 -0700 Subject: [PATCH 59/76] bugfix --- google_helpers/bigquery/bq_support.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/google_helpers/bigquery/bq_support.py b/google_helpers/bigquery/bq_support.py index c94b4ec7..93e908bb 100644 --- a/google_helpers/bigquery/bq_support.py +++ b/google_helpers/bigquery/bq_support.py @@ -430,7 +430,7 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with other_filters[attr] = filters[attr] mut_filtr_count = 1 - + type = None # 'Mutation' filters, special category for MUT: type filters for attr, values in mutation_filters.items(): gene = attr.split(':')[2] @@ -558,7 +558,7 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with filter_set.append('({})'.format(filter_string)) - if type(query_param) is list: + if type != None and type(query_param) is list: result['parameters'].extend(query_param) else: result['parameters'].append(query_param) From 403cf80a5ea4cd9a88f00b70328469f239d0ea62 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Wed, 18 Jul 2018 15:11:37 -0700 Subject: [PATCH 60/76] Bug arising from reenabling ERA SAML A and A --- accounts/sa_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index 46aaf3d5..14439481 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -1173,13 +1173,15 @@ def handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, logger.debug("[STATUS] UserAuthorizedDatasets for {}: {}".format(nih_user.NIH_username, str(uad))) need_to_add = False + user_on_acl = False if handle_acls: try: result = directory_client.members().get(groupKey=dataset.google_group_name, memberKey=user_email).execute(http=http_auth) + user_on_acl = len(result) > 0 # If we found them in the ACL but they're not currently authorized for it, remove them from it and the table - if len(result) and not dataset_in_auth_set: + if user_on_acl and not dataset_in_auth_set: directory_client.members().delete(groupKey=dataset.google_group_name, memberKey=user_email).execute(http=http_auth) logger.warn( @@ -1215,7 +1217,7 @@ def handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, # Sometimes an account is in the Google Group but not the database - add them if they should # have access. 
# May 2018: Not handling ACL groups anymore, we skip this step (added handle_acls condition) - elif not len(uad) and handle_acls and len(result) and dataset_in_auth_set: + elif not len(uad) and handle_acls and user_on_acl and dataset_in_auth_set: logger.info( "User {} was was found in group {} but not the database--adding them.".format( user_email, dataset.google_group_name From 979fc7e9a4e8f7484fd11025e6d922e22595143c Mon Sep 17 00:00:00 2001 From: s-paquette Date: Wed, 18 Jul 2018 15:19:53 -0700 Subject: [PATCH 61/76] -> Fix for type being overshadowed --- google_helpers/bigquery/bq_support.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/google_helpers/bigquery/bq_support.py b/google_helpers/bigquery/bq_support.py index 93e908bb..5a2db4d9 100644 --- a/google_helpers/bigquery/bq_support.py +++ b/google_helpers/bigquery/bq_support.py @@ -430,11 +430,10 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with other_filters[attr] = filters[attr] mut_filtr_count = 1 - type = None # 'Mutation' filters, special category for MUT: type filters for attr, values in mutation_filters.items(): gene = attr.split(':')[2] - type = attr.split(':')[-1] + filter_type = attr.split(':')[-1] invert = bool(attr.split(':')[3] == 'NOT') param_name = 'gene{}{}'.format(str(mut_filtr_count), '_{}'.format(param_suffix) if param_suffix else '') filter_string = '{}Hugo_Symbol = @{} AND '.format('' if not field_prefix else field_prefix, param_name) @@ -459,11 +458,11 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with } } - if type == 'category' and values[0] == 'any': + if filter_type == 'category' and values[0] == 'any': filter_string += '{}Variant_Classification IS NOT NULL'.format('' if not field_prefix else field_prefix,) var_query_param = None else: - if type == 'category': + if filter_type == 'category': values = MOLECULAR_CATEGORIES[values[0]]['attrs'] var_param_name = "var_class{}{}".format(str(mut_filtr_count), '_{}'.format(param_suffix) if param_suffix else '') filter_string += '{}Variant_Classification {}IN UNNEST(@{})'.format('' if not field_prefix else field_prefix, 'NOT ' if invert else '', var_param_name) @@ -478,8 +477,6 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with mut_filtr_count += 1 - logger.debug("other filters: {}".format(str(other_filters))) - # Standard query filters for attr, values in other_filters.items(): filter_string = '' @@ -558,7 +555,7 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with filter_set.append('({})'.format(filter_string)) - if type != None and type(query_param) is list: + if type(query_param) is list: result['parameters'].extend(query_param) else: result['parameters'].append(query_param) From b551365ded7f308671b85278dde77f2544244b25 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Wed, 18 Jul 2018 19:35:12 -0700 Subject: [PATCH 62/76] Rework of unlinked google ID problem --- accounts/dcf_views.py | 131 +++++++++++++++++++++++------------------- accounts/sa_utils.py | 10 ++-- 2 files changed, 75 insertions(+), 66 deletions(-) diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 10779828..4f2e62fa 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -105,7 +105,8 @@ def dcf_simple_logout(request): If the user is trying to login with an NIH idea already in use by somebody else, or if they are already linked with a different NIH ID, we immediately reject the response from DCF and tell the user 
they need to logout to try again. This involves simply sending them back to DCF; the user's DCF session cookies do the rest to let - DCF know who they are. Note we also clear the session key we are using to record the error. + DCF know who they are. Note we also clear the session key we are using to record the error. This is now also used + if we have Google Link ID inconsistencies, since DCF session cookies currently need to be cleared. ''' request.session.pop('dcfForcedLogout', None) logout_callback = request.build_absolute_uri(reverse('user_detail', args=[request.user.id])) @@ -322,7 +323,7 @@ def oauth2_callback(request): for warn in results.messages: messages.warning(request, warn) # stash the requirement to only show a logout link in the session! - request.session['dcfForcedLogout'] = nih_from_dcf + request.session['dcfForcedLogout'] = True return redirect(reverse('user_detail', args=[request.user.id])) # @@ -517,28 +518,30 @@ def dcf_link_callback(request): # testing, based on DCF making a decision based on stale cookie data in a second browser. # + good_to_go = True if defer_error_return: if google_link is None: # DCF is confused (stale cookie) OR user tried to connect with a non-CGC Google ID in a login race condition # with this login, and we unlinked them before this got processed. The user is currently unlinked. err_msg = "Data Commons did not accept linking request. Please use a single browser for linking/unlinking requests." messages.error(request, err_msg) + request.session['dcfForcedLogout'] = True # See comment below about stale DCF session cookies + return redirect(reverse('user_detail', args=[request.user.id])) else: # User had two browsers open and tried to login on both. If the user ID in the token matches # what we think it should be, just post this fact for user to see. if have_linked_user(request.user.id) and google_link == req_user.email: warn_msg = "Data Commons reported that you were already linked with Google ID {}." messages.warning(request, warn_msg) - + return redirect(reverse('user_detail', args=[request.user.id])) else: # DCF says we are linked already, but we do not have a linked user, or the email we have is - # not matching what DCF thinks it is. This is so messed up that we should tell the user there - # was a problem, and let the user_details page try to figure out what to tell the user about - # the inconsistent state. + # not matching what DCF thinks it is. This is so messed up! We tell the user there + # was a problem, force a logout, and proceed to unlink them below: err_msg = "Data Commons did not accept linking request." messages.error(request, err_msg) - - return redirect(reverse('user_detail', args=[request.user.id])) + request.session['dcfForcedLogout'] = True # See comment below about stale DCF session cookies + good_to_go = False # # The callback provides us with both the link expiration and the user ID that was linked. 
BUT THIS IS @@ -546,69 +549,71 @@ def dcf_link_callback(request): # are not too crazy, we accept the value we are sent: # - returned_expiration_str = request.GET.get('exp', None) - returned_google_link = request.GET.get('linked_email', None) + if good_to_go: # skip this stuff if we just want to use the disconnect step below: + returned_expiration_str = request.GET.get('exp', None) + returned_google_link = request.GET.get('linked_email', None) - use_expiration_time = calc_expiration_time(returned_expiration_str) + use_expiration_time = calc_expiration_time(returned_expiration_str) - if returned_google_link: - # - # OK, two (realistic) possible cases here if (google_link != returned_google_link). Note that having - # returned_google_link be None is not realistic unless there is a fundamental DCF bug. - # Generally, we do not care what the returned_google_link is, since we basically use this as an event - # to go refresh our token and get the latest info. ALTHOUGH at the moment (7/16/18), we have no other way - # to get the link expiration time. - # - # Case 1 is where google_link and returned_google_link are both not None, and are different. THIS SHOULD NOT - # (IN GENERAL) BE VERY LIKELY. Because DCF forbids overwriting an existing link with a new link value. BUT we have - # dueling browser login test that show it happening, possibly because DCF is only rejecting second link attempts - # early in the flow, but is not checking for/requiring an existing NULL value while writing to their DB. So - # if that was caught, we would not expect not-None-but-unequal. (Note that it *would be* possible if there was - # a significant delay receiving/processing this linking callback, and another actor had successfully - # unlinked/relinked during that delay). - # - # Case 2 is where the returned link has a value, but when we check, the freshest token from DCF says they - # are unlinked. This could happen if there was a race and an unlinking request to DCF got processed before - # this link callback got processed. - # - # Regardless, we need to use the user info just obtained from get_user_data_token_string() as definitive - # in deciding what to do here. - # + if returned_google_link: + # + # OK, two (realistic) possible cases here if (google_link != returned_google_link). Note that having + # returned_google_link be None is not realistic unless there is a fundamental DCF bug. + # Generally, we do not care what the returned_google_link is, since we basically use this as an event + # to go refresh our token and get the latest info. ALTHOUGH at the moment (7/16/18), we have no other way + # to get the link expiration time. + # + # Case 1 is where google_link and returned_google_link are both not None, and are different. THIS SHOULD NOT + # (IN GENERAL) BE VERY LIKELY. Because DCF forbids overwriting an existing link with a new link value. BUT we have + # dueling browser login test that show it happening, possibly because DCF is only rejecting second link attempts + # early in the flow, but is not checking for/requiring an existing NULL value while writing to their DB. So + # if that was caught, we would not expect not-None-but-unequal. (Note that it *would be* possible if there was + # a significant delay receiving/processing this linking callback, and another actor had successfully + # unlinked/relinked during that delay). + # + # Case 2 is where the returned link has a value, but when we check, the freshest token from DCF says they + # are unlinked. 
This could happen if there was a race and an unlinking request to DCF got processed before + # this link callback got processed. + # + # Regardless, we need to use the user info just obtained from get_user_data_token_string() as definitive + # in deciding what to do here. + # - if google_link != returned_google_link: - logger.error("[ERROR]: DCF RETURNED CONFLICTING GOOGLE LINK {} VERSUS {}".format(returned_google_link, - google_link)) - if google_link is not None: - # - # Report the difference, but do not do anything. User details page should process any discrepancies: - # - messages.error(request, "Data Commons reports that you have already linked with " \ - "Google ID {}. ".format(google_link)) - return redirect(reverse('user_detail', args=[request.user.id])) + if google_link != returned_google_link: + logger.error("[ERROR]: DCF RETURNED CONFLICTING GOOGLE LINK {} VERSUS {}".format(returned_google_link, + google_link)) + if google_link is not None: + # + # Report the difference, but keep on going. We will use the google_link coming out of the token + # to continue the process and either null it or accept it: + # + messages.warning(request, "Data Commons reports that you have already linked with " \ + "Google ID {}. ".format(google_link)) + else: + logger.info("DCF provided google link was consistent") else: - logger.info("DCF provided google link was consistent") - else: - # - # If the DCF callback does not provide a Google ID, we will log it, but not bug the user. We will just drag - # the data out of the token: - # - logger.error("No google link provided by DCF") + # + # If the DCF callback does not provide a Google ID, we will log it, but not bug the user. We will just drag + # the data out of the token. This would be out of spec behavior: + # + logger.error("No google link provided by DCF") - if google_link is None: - # - # If we are now seeing that we are NOT linked anymore, we tell the user, and bag it. - # - messages.error(request, "Data Commons reports that you do not yet have a valid linked Google ID. " - "Please use a single brower for linking/unlinking requests.") - return redirect(reverse('user_detail', args=[request.user.id])) + if google_link is None: + # + # If we are now seeing that we are NOT linked anymore, we tell the user, and bag it. + # + messages.error(request, "Data Commons reports that you do not yet have a valid linked Google ID. " + "Please use a single browser for linking/unlinking requests.") + request.session['dcfForcedLogout'] = True # See comment below about stale DCF session cookies + return redirect(reverse('user_detail', args=[request.user.id])) # # No match? Not acceptable. Send user back to details page. The empty google ID in our table will # mean the page shows an option to try again. 
# - if google_link != req_user.email: + if (google_link != req_user.email) or not good_to_go: logger.info("Now calling DCF to disconnect {} Google ID; we needed {} ".format(google_link, req_user.email)) err_msg = None try: @@ -627,9 +632,15 @@ def dcf_link_callback(request): return redirect(reverse('user_detail', args=[request.user.id])) logger.info("DCF has returned following disconnect request: {} should be dropped for {} ".format(google_link, req_user.email)) + message = "You must use your ISB-CGC login email ({}) to link with the DCF instead of {}".format( req_user.email, google_link) - messages.warning(request, message) + messages.error(request, message) + + # As of now (7/18/18), despite the fact that we have disconnected the bogus link at DCF, if we send the user + # back to do the linking, a stale browser cookie will tell DCF that they are linked, and reject our request. So + # we need to force a logout to kill the cookie. + request.session['dcfForcedLogout'] = True return redirect(reverse('user_detail', args=[request.user.id])) # diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index 14439481..e181e324 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -1338,7 +1338,7 @@ def _refresh_from_dcf(user_id, nih_user): except InternalTokenError: return RefreshCode.INTERNAL_ERROR except DCFCommFailure: - raise RefreshCode.DCF_COMMUNICATIONS_ERROR + return RefreshCode.DCF_COMMUNICATIONS_ERROR # # Things that could be different: Google ID linkage, expiration time, approved datasets. @@ -1410,17 +1410,15 @@ def get_nih_user_details(user_id, force_logout): user_details['error_state'] = None user_details['dcf_comm_error'] = False user_details['force_DCF_logout'] = True - user_details['NIH_username'] = force_logout return user_details # # Otherwise, ask the DCF for current user info, - # FIXME: Check in with DCF for info, throw DCFCommError if we have problems - # FIXME: If refresh token is expired, we cannot show any info until they log back in! + # user_details['force_DCF_logout'] = False user_details['refresh_required'] = False - user_details['refresh_key_ok'] = True + user_details['no_google_link'] = False user_details['error_state'] = None user_details['dcf_comm_error'] = False user_details['link_mismatch'] = False @@ -1450,7 +1448,7 @@ def get_nih_user_details(user_id, force_logout): user_details['dcf_comm_error'] = True return user_details elif match_state == RefreshCode.NO_GOOGLE_LINK: - user_details['refresh_key_ok'] = False + user_details['no_google_link'] = True return user_details elif match_state == RefreshCode.GOOGLE_LINK_MISMATCH: # If they have a bad Google ID linked at DCF, we force them to login again, which eventually From 03bd992e6df8ed81dd2d6b518879b4a282721428 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Wed, 18 Jul 2018 19:38:08 -0700 Subject: [PATCH 63/76] Fixing incorrect add bug --- accounts/sa_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index 14439481..ae1770bf 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -1196,8 +1196,9 @@ def handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, ) ) except HttpError: - # if the user_email doesn't exist in the google group an HttpError will be thrown... - need_to_add = True + # if the user_email doesn't exist in the google group an HttpError will be thrown... It means nothing + # should happen... 
+ pass else: need_to_add = (len(uad) == 0) and dataset_in_auth_set From 4758bb07fa2c6b6c983b32faf546d87e94d6d52c Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Thu, 19 Jul 2018 11:51:03 -0700 Subject: [PATCH 64/76] Another pass at handling user datasets --- accounts/sa_utils.py | 103 +++++++++++++++++++++---------------------- 1 file changed, 51 insertions(+), 52 deletions(-) diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index ae1770bf..bbe800de 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -1172,7 +1172,6 @@ def handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, logger.debug("[STATUS] UserAuthorizedDatasets for {}: {}".format(nih_user.NIH_username, str(uad))) - need_to_add = False user_on_acl = False if handle_acls: try: @@ -1180,7 +1179,9 @@ def handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, memberKey=user_email).execute(http=http_auth) user_on_acl = len(result) > 0 - # If we found them in the ACL but they're not currently authorized for it, remove them from it and the table + # If we found them in the ACL but they're not currently authorized for it, remove them from it. Note + # race condition: If they were just dropped by somebody else, the following call raises an error, which + # also gets caught below. if user_on_acl and not dataset_in_auth_set: directory_client.members().delete(groupKey=dataset.google_group_name, memberKey=user_email).execute(http=http_auth) @@ -1199,8 +1200,6 @@ def handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, # if the user_email doesn't exist in the google group an HttpError will be thrown... It means nothing # should happen... pass - else: - need_to_add = (len(uad) == 0) and dataset_in_auth_set # # Either remove them from the table, or add them to the table. @@ -1216,20 +1215,49 @@ def handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, uad.delete() # Sometimes an account is in the Google Group but not the database - add them if they should - # have access. - # May 2018: Not handling ACL groups anymore, we skip this step (added handle_acls condition) - elif not len(uad) and handle_acls and user_on_acl and dataset_in_auth_set: - logger.info( - "User {} was was found in group {} but not the database--adding them.".format( - user_email, dataset.google_group_name - ) - ) - st_logger.write_text_log_entry( - LOG_NAME_ERA_LOGIN_VIEW, - "[WARN] User {} was was found in group {} but not the database--adding them.".format( - user_email, dataset.google_group_name - ) - ) + # have access. July 2018: No, that is a bad idea. Privilege must only flow in one direction. + # Database must be considered definitive. 
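# A minimal sketch (illustrative only, not code from this patch) of the one-way
# reconciliation policy the comment above describes: the authorized-dataset set, and the
# database rows built from it, are definitive; the Google Group ACL is adjusted to match,
# and ACL membership alone never creates a database record. The flag names echo the patch
# (dataset_in_auth_set, user_on_acl), while the callables are hypothetical placeholders.
def reconcile_dataset_access(dataset_in_auth_set, user_on_acl, has_db_record,
                             add_to_acl, remove_from_acl, add_db_record):
    if dataset_in_auth_set and not user_on_acl:
        add_to_acl()           # privilege flows outward, from the database to the ACL...
    if user_on_acl and not dataset_in_auth_set:
        remove_from_acl()      # ...and stale ACL entries are dropped
    if dataset_in_auth_set and not has_db_record:
        add_db_record()        # never the reverse: being on the ACL grants nothing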
+ # May 2018: If not handling ACL groups anymore, we skip this step (added handle_acls condition) + # elif not len(uad) and handle_acls and user_on_acl and dataset_in_auth_set: + # logger.info( + # "User {} was was found in group {} but not the database--adding them.".format( + # user_email, dataset.google_group_name + # ) + # ) + # st_logger.write_text_log_entry( + # LOG_NAME_ERA_LOGIN_VIEW, + # "[WARN] User {} was was found in group {} but not the database--adding them.".format( + # user_email, dataset.google_group_name + # ) + # ) + # uad, created = UserAuthorizedDatasets.objects.update_or_create(nih_user=nih_user, + # authorized_dataset=ad) + # if not created: + # logger.warn("[WARNING] Unable to create entry for user {} and dataset {}.".format(user_email, + # ad.whitelist_id)) + # else: + # logger.info("[STATUS] Added user {} to dataset {}.".format(user_email, ad.whitelist_id)) + + if handle_acls: + # Check for their need to be in the ACL, and add them + if dataset_in_auth_set and not user_on_acl: + body = { + "email": user_email, + "role": "MEMBER" + } + + result = directory_client.members().insert( + groupKey=dataset.google_group_name, + body=body + ).execute(http=http_auth) + + logger.info(result) + logger.info("User {} added to {}.".format(user_email, dataset.google_group_name)) + st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, + "[STATUS] User {} added to {}.".format(user_email, + dataset.google_group_name)) + # Add them to the database as well + if (len(uad) == 0) and dataset_in_auth_set: uad, created = UserAuthorizedDatasets.objects.update_or_create(nih_user=nih_user, authorized_dataset=ad) if not created: @@ -1238,40 +1266,11 @@ def handle_user_for_dataset(dataset, nih_user, user_email, authorized_datasets, else: logger.info("[STATUS] Added user {} to dataset {}.".format(user_email, ad.whitelist_id)) - if need_to_add: - if handle_acls: - # Check for their need to be in the ACL, and add them - if dataset_in_auth_set: - body = { - "email": user_email, - "role": "MEMBER" - } - - result = directory_client.members().insert( - groupKey=dataset.google_group_name, - body=body - ).execute(http=http_auth) - - logger.info(result) - logger.info("User {} added to {}.".format(user_email, dataset.google_group_name)) - st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, - "[STATUS] User {} added to {}.".format(user_email, - dataset.google_group_name)) - # Add them to the database as well - if not len(uad): - uad, created = UserAuthorizedDatasets.objects.update_or_create(nih_user=nih_user, - authorized_dataset=ad) - if not created: - logger.warn("[WARNING] Unable to create entry for user {} and dataset {}.".format(user_email, - ad.whitelist_id)) - else: - logger.info("[STATUS] Added user {} to dataset {}.".format(user_email, ad.whitelist_id)) - - # logger.info(result) - # logger.info("User {} added to {}.".format(user_email, dataset.google_group_name)) - # st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, - # "[STATUS] User {} added to {}.".format(user_email, - # dataset.google_group_name)) + # logger.info(result) + # logger.info("User {} added to {}.".format(user_email, dataset.google_group_name)) + # st_logger.write_text_log_entry(LOG_NAME_ERA_LOGIN_VIEW, + # "[STATUS] User {} added to {}.".format(user_email, + # dataset.google_group_name)) def deactivate_nih_add_to_open(user_id, user_email): # 5/14/18 NO! 
active flag has nothing to do with user logout, but instead is set to zero when user expires off of ACL group From e802255430a1036664bf1426b714583dd83df585 Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Thu, 19 Jul 2018 18:28:53 -0700 Subject: [PATCH 65/76] Another pass at handling DCF login/relink flow --- accounts/dcf_support.py | 28 ++++++++++++++++++++++++++-- accounts/sa_utils.py | 35 ++++++++++++++++++++++++++++------- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/accounts/dcf_support.py b/accounts/dcf_support.py index 8ffcc598..270bac94 100755 --- a/accounts/dcf_support.py +++ b/accounts/dcf_support.py @@ -81,6 +81,30 @@ def get_stored_dcf_token(user_id): return dcf_token +def get_auth_elapsed_time(user_id): + """ + There is benefit in knowing when the user did their NIH login at DCF, allowing us to e.g. estimate + if they have recently tried to do the linking step. This is pretty hackish, but should work. + + :raises InternalTokenError: + """ + remaining_seconds = None + dcf_token = None + try: + dcf_token = get_stored_dcf_token(user_id) + except TokenFailure: # No token, user has logged out. + return 2592000 # sorta like infinity + except RefreshTokenExpired as e: + remaining_seconds = e.seconds + except InternalTokenError as e: + raise e + + if not remaining_seconds: + remaining_seconds = (dcf_token.refresh_expires_at - pytz.utc.localize(datetime.datetime.utcnow())).total_seconds() + # DCF tokens last 30 days = 2592000 seconds. Use this to calculate when we first got it: + elapsed_seconds = 2592000 - remaining_seconds + return elapsed_seconds + def get_access_expiration(user_id): nih_users = NIH_User.objects.filter(user_id=user_id, linked=True) @@ -693,8 +717,8 @@ def refresh_token_storage(token_dict, decoded_jwt, user_token, nih_username_from logger.info('[INFO] Refresh token storage. New token expires at {}'.format(str(expiration_time))) - # FIXME! Make sure that the NIH name is going to be unique before we shove it into the table. Don't - # depend on the DB table constraint. + # We previously made sure that the NIH name is going to be unique before we shove it into the table, calling + # found_linking_problems(). Don't depend on the DB table constraint. # Note that (nih_username_lower, user_id) is enforced unique in the table: DCFToken.objects.update_or_create(user_id=cgc_uid, diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index e181e324..4cd9e91e 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -43,7 +43,7 @@ from dcf_support import get_stored_dcf_token, \ TokenFailure, RefreshTokenExpired, InternalTokenError, DCFCommFailure, \ - GoogleLinkState, \ + GoogleLinkState, get_auth_elapsed_time, \ get_google_link_from_user_dict, get_projects_from_user_dict, \ get_nih_id_from_user_dict, user_data_token_to_user_dict, get_user_data_token_string, \ compare_google_ids @@ -1403,7 +1403,7 @@ def get_nih_user_details(user_id, force_logout): # # If we have detected that the user has logged into DCF with a different NIH username than what we think, - # nothing else matters. We tell them to log out. + # nothing else matters. We tell them to log out. Same if they have a bad Google ID. # if force_logout: @@ -1431,6 +1431,18 @@ def get_nih_user_details(user_id, force_logout): match_state = _refresh_from_dcf(user_id, nih_user) + # It is not essential, but helps the user if we can suggest they log out + # before trying to fix problems (we provide them with a logout link no + # matter what). 
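# A minimal sketch (illustrative only, not code from this patch) of the "recent login"
# heuristic applied in the block below: get_auth_elapsed_time() infers how long ago the
# NIH login happened from how much of the thirty-day DCF refresh-token lifetime remains,
# and a login within the last ten minutes is taken to mean a live DCF session cookie is
# probably still present. The constant and function names here are assumptions, not
# identifiers from this codebase.
DCF_REFRESH_TOKEN_LIFETIME_SECONDS = 30 * 24 * 60 * 60   # 2592000, per the comment in get_auth_elapsed_time()
LIVE_COOKIE_WINDOW_SECONDS = 60 * 10                      # the ten-minute window used below

def probably_has_live_dcf_cookie(seconds_remaining_on_refresh_token):
    elapsed = DCF_REFRESH_TOKEN_LIFETIME_SECONDS - seconds_remaining_on_refresh_token
    return elapsed < LIVE_COOKIE_WINDOW_SECONDS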
+ + try: + since_login_est = get_auth_elapsed_time(user_id) + except InternalTokenError: + user_details['error_state'] = 'Internal error encountered syncing with Data Commons' + return user_details + + live_cookie_probable = since_login_est < (60 * 10) + if match_state == RefreshCode.NO_TOKEN: if nih_user: user_details['legacy_linkage'] = True @@ -1448,15 +1460,24 @@ def get_nih_user_details(user_id, force_logout): user_details['dcf_comm_error'] = True return user_details elif match_state == RefreshCode.NO_GOOGLE_LINK: - user_details['no_google_link'] = True + # If they have no Google link, and they have recently tried to link, just get them + # to log out. Otherwise, get them to log in again to fix it: + if live_cookie_probable: + user_details['force_DCF_logout'] = True + else: + user_details['no_google_link'] = True return user_details elif match_state == RefreshCode.GOOGLE_LINK_MISMATCH: - # If they have a bad Google ID linked at DCF, we force them to login again, which eventually - # tells them they need to switch it. - user_details['link_mismatch'] = True + # If they have a mismatched Google link, and they have recently tried to link, just get them + # to log out. Otherwise, get them to log in again to fix it: + if live_cookie_probable: + user_details['force_DCF_logout'] = True + else: + user_details['link_mismatch'] = True + return user_details elif match_state == RefreshCode.UNEXPECTED_UNLINKED_NIH_USER: # Should not happen. Force a complete logout - user_details['NIH_username'] = None + user_details['force_DCF_logout'] = True return user_details elif match_state == RefreshCode.PROJECT_SET_UPDATED: user_details['data_sets_updated'] = True From 055d1ad22c904ab5b6b0364fe960cf6cecc1495e Mon Sep 17 00:00:00 2001 From: wlongabaugh Date: Mon, 23 Jul 2018 12:47:13 -0700 Subject: [PATCH 66/76] Drop DCF token on forced logout --- accounts/dcf_support.py | 23 +++++++++++++++++++++++ accounts/dcf_views.py | 24 ++++++++++++------------ 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/accounts/dcf_support.py b/accounts/dcf_support.py index 270bac94..bab72b0e 100755 --- a/accounts/dcf_support.py +++ b/accounts/dcf_support.py @@ -81,6 +81,29 @@ def get_stored_dcf_token(user_id): return dcf_token + +def drop_dcf_token(user_id): + """ + If we are forcing a logout from DCF, it is because we need to get the user to start with a clean slate. Dropping + their DCF token is part of that. + + :raises InternalTokenError: + """ + try: + dcf_token = get_stored_dcf_token(user_id) + except TokenFailure: + dcf_token = None + except InternalTokenError as e: + raise e + except RefreshTokenExpired as e: + dcf_token = e.token + + if dcf_token: + dcf_token.delete() + + return None + + def get_auth_elapsed_time(user_id): """ There is benefit in knowing when the user did their NIH login at DCF, allowing us to e.g. 
estimate diff --git a/accounts/dcf_views.py b/accounts/dcf_views.py index 4f2e62fa..79b368f6 100755 --- a/accounts/dcf_views.py +++ b/accounts/dcf_views.py @@ -39,7 +39,7 @@ TokenFailure, RefreshTokenExpired, InternalTokenError, DCFCommFailure, \ get_google_link_from_user_dict, get_projects_from_user_dict, \ get_nih_id_from_user_dict, user_data_token_to_user_dict, get_user_data_token_string, \ - user_data_token_dict_massaged, \ + user_data_token_dict_massaged, drop_dcf_token, \ user_data_token_dict_to_user_dict, get_secrets, refresh_token_storage, \ unlink_at_dcf, refresh_at_dcf, decode_token_chunk, calc_expiration_time @@ -101,14 +101,21 @@ def oauth2_login(request): @login_required def dcf_simple_logout(request): - ''' - If the user is trying to login with an NIH idea already in use by somebody else, or if they are already linked + """ + If the user is trying to login with an NIH ID already in use by somebody else, or if they are already linked with a different NIH ID, we immediately reject the response from DCF and tell the user they need to logout to try again. This involves simply sending them back to DCF; the user's DCF session cookies do the rest to let DCF know who they are. Note we also clear the session key we are using to record the error. This is now also used if we have Google Link ID inconsistencies, since DCF session cookies currently need to be cleared. - ''' + """ + request.session.pop('dcfForcedLogout', None) + try: + drop_dcf_token(request.user.id) + except InternalTokenError: + messages.warning(request, "Internal problem encountered disconnecting from Data Commons. Please report this to feedback@isb-cgc.org") + return redirect(reverse('user_detail', args=[request.user.id])) + logout_callback = request.build_absolute_uri(reverse('user_detail', args=[request.user.id])) callback = '{}?next={}'.format(DCF_LOGOUT_URL, logout_callback) return HttpResponseRedirect(callback) @@ -914,17 +921,10 @@ def dcf_disconnect_user(request): # try: - dcf_token = get_stored_dcf_token(request.user.id) - except TokenFailure: - dcf_token = None + drop_dcf_token(request.user.id) except InternalTokenError: messages.warning(request, "Internal problem encountered disconnecting from Data Commons. 
Please report this to feedback@isb-cgc.org") return redirect(reverse('user_detail', args=[request.user.id])) - except RefreshTokenExpired as e: - dcf_token = e.token - - if dcf_token: - dcf_token.delete() # # Finally, we need to send the user to logout from the DCF, which is needed to clear the From 9d3280e10b892acee19ba7ed74f447844696c95a Mon Sep 17 00:00:00 2001 From: s-paquette Date: Wed, 25 Jul 2018 18:38:23 -0700 Subject: [PATCH 67/76] -> Debug lines for new managed SA --- accounts/sa_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index 14439481..e07158b7 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -231,6 +231,7 @@ def verify_service_account(gcp_id, service_account, datasets, user_email, is_ref if not member_sa.startswith(projectNumber+'-') and not project_id_re.search(member_sa) and \ not (msa.is_managed_this_project(member_sa, projectNumber, gcp_id)) and \ not sab.is_blacklisted(member_sa): + logger.debug("[STATUS] {} is managed this project: {}".format(member_sa,str(msa.is_managed_this_project(member_sa, projectNumber, gcp_id)))) invalid_members['external_sa'].append(member_sa) # If we haven't already invalidated this member SA for being from outside the project, check to see if anyone From 38d8e942d1bc9cc80cbcf7002937ead62ab9124d Mon Sep 17 00:00:00 2001 From: s-paquette Date: Wed, 25 Jul 2018 19:42:09 -0700 Subject: [PATCH 68/76] -> Handle new metadata_data table format --- cohorts/file_helpers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cohorts/file_helpers.py b/cohorts/file_helpers.py index f82c599e..6337dba1 100644 --- a/cohorts/file_helpers.py +++ b/cohorts/file_helpers.py @@ -181,8 +181,8 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse case_barcode_condition = " AND LOWER(cs.case_barcode) LIKE LOWER(%s)" select_clause_base = """ - SELECT md.sample_barcode, md.case_barcode, md.disease_code, md.file_name, md.file_name_key, - md.index_file_name, md.access, md.acl, md.platform, md.data_type, md.data_category, + SELECT md.sample_barcode, md.case_barcode, md.disease_code, md.file_name_key, + md.index_file_name_key, md.access, md.acl, md.platform, md.data_type, md.data_category, md.experimental_strategy, md.data_format, md.file_gdc_id, md.case_gdc_id, md.project_short_name FROM {metadata_table} md JOIN ( @@ -302,10 +302,10 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse 'disease_code': item['disease_code'], 'build': build.lower(), 'cloudstorage_location': item['file_name_key'] or 'N/A', - 'index_name': item['index_file_name'] or 'N/A', + 'index_name': item['index_file_name_key'] or 'N/A', 'access': (item['access'] or 'N/A'), 'user_access': str(item['access'] != 'controlled' or whitelist_found), - 'filename': item['file_name'] or 'N/A', + 'filename': item['file_name_key'].split('/')[-1] or 'N/A', 'exp_strat': item['experimental_strategy'] or 'N/A', 'platform': item['platform'] or 'N/A', 'datacat': item['data_category'] or 'N/A', From 11f71e6234c56ecc9e988e8a680e7978d153fc0d Mon Sep 17 00:00:00 2001 From: elainelee Date: Thu, 26 Jul 2018 13:46:51 -0700 Subject: [PATCH 69/76] bugfix --- cohorts/file_helpers.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cohorts/file_helpers.py b/cohorts/file_helpers.py index f82c599e..43020867 100644 --- a/cohorts/file_helpers.py +++ b/cohorts/file_helpers.py @@ -181,8 +181,8 @@ def cohort_files(cohort_id, inc_filters=None, user=None, 
limit=25, page=1, offse case_barcode_condition = " AND LOWER(cs.case_barcode) LIKE LOWER(%s)" select_clause_base = """ - SELECT md.sample_barcode, md.case_barcode, md.disease_code, md.file_name, md.file_name_key, - md.index_file_name, md.access, md.acl, md.platform, md.data_type, md.data_category, + SELECT md.sample_barcode, md.case_barcode, md.disease_code, substring_index(md.file_name_key, '/', -1) as file_name, md.file_name_key, + md.index_file_name_key, md.access, md.acl, md.platform, md.data_type, md.data_category, md.experimental_strategy, md.data_format, md.file_gdc_id, md.case_gdc_id, md.project_short_name FROM {metadata_table} md JOIN ( @@ -191,7 +191,7 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse WHERE cohort_id = {cohort_id} ) cs ON cs.case_barcode = md.case_barcode - WHERE md.file_uploaded='true' {filter_conditions} {case_barcode_condition} + WHERE TRUE {filter_conditions} {case_barcode_condition} """ file_list_query = """ @@ -272,6 +272,8 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse if len(filelist_params) > 0: cursor.execute(query, filelist_params) else: + print("###query") + print(query) cursor.execute(query) stop = time.time() logger.info("[STATUS] Time to get filelist: {}s".format(str(stop - start))) @@ -302,7 +304,7 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse 'disease_code': item['disease_code'], 'build': build.lower(), 'cloudstorage_location': item['file_name_key'] or 'N/A', - 'index_name': item['index_file_name'] or 'N/A', + 'index_name': item['index_file_name_key'] or 'N/A', 'access': (item['access'] or 'N/A'), 'user_access': str(item['access'] != 'controlled' or whitelist_found), 'filename': item['file_name'] or 'N/A', From 44806a3f3f5f5460a7f3a3d47a906f2e3c84bb30 Mon Sep 17 00:00:00 2001 From: elainelee Date: Thu, 26 Jul 2018 15:16:37 -0700 Subject: [PATCH 70/76] code changes with the new metadata table --- cohorts/file_helpers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cohorts/file_helpers.py b/cohorts/file_helpers.py index 43020867..e632c48a 100644 --- a/cohorts/file_helpers.py +++ b/cohorts/file_helpers.py @@ -272,8 +272,6 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse if len(filelist_params) > 0: cursor.execute(query, filelist_params) else: - print("###query") - print(query) cursor.execute(query) stop = time.time() logger.info("[STATUS] Time to get filelist: {}s".format(str(stop - start))) From 6ca052f16503358cbc165d44fa39b6d29672fc9e Mon Sep 17 00:00:00 2001 From: "S. Paquette" Date: Thu, 26 Jul 2018 17:27:09 -0700 Subject: [PATCH 71/76] -> More managed SA debugging --- accounts/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/accounts/utils.py b/accounts/utils.py index f0f3c9b0..000c60c4 100644 --- a/accounts/utils.py +++ b/accounts/utils.py @@ -137,6 +137,7 @@ class ManagedServiceAccounts(ServiceObjectBase): def __init__(self, managed_service_accounts): self.managed_service_accounts = set(managed_service_accounts) + logger.debug("[STATUS] Managed service accounts: {}".format(str(managed_service_accounts))) def is_managed(self, service_account): return '@{}'.format(service_account.split('@')[-1]) in self.managed_service_accounts From 6fd447bfd86500e82ec7a477df3dd98e4e3a8d2f Mon Sep 17 00:00:00 2001 From: "S. 
Paquette" Date: Thu, 26 Jul 2018 19:59:00 -0700 Subject: [PATCH 72/76] -> Remove debug lines --- accounts/sa_utils.py | 1 - accounts/utils.py | 1 - 2 files changed, 2 deletions(-) diff --git a/accounts/sa_utils.py b/accounts/sa_utils.py index b0d40888..42286918 100644 --- a/accounts/sa_utils.py +++ b/accounts/sa_utils.py @@ -231,7 +231,6 @@ def verify_service_account(gcp_id, service_account, datasets, user_email, is_ref if not member_sa.startswith(projectNumber+'-') and not project_id_re.search(member_sa) and \ not (msa.is_managed_this_project(member_sa, projectNumber, gcp_id)) and \ not sab.is_blacklisted(member_sa): - logger.debug("[STATUS] {} is managed this project: {}".format(member_sa,str(msa.is_managed_this_project(member_sa, projectNumber, gcp_id)))) invalid_members['external_sa'].append(member_sa) # If we haven't already invalidated this member SA for being from outside the project, check to see if anyone diff --git a/accounts/utils.py b/accounts/utils.py index 000c60c4..f0f3c9b0 100644 --- a/accounts/utils.py +++ b/accounts/utils.py @@ -137,7 +137,6 @@ class ManagedServiceAccounts(ServiceObjectBase): def __init__(self, managed_service_accounts): self.managed_service_accounts = set(managed_service_accounts) - logger.debug("[STATUS] Managed service accounts: {}".format(str(managed_service_accounts))) def is_managed(self, service_account): return '@{}'.format(service_account.split('@')[-1]) in self.managed_service_accounts From 2ff03436dcdf26910f7eeee35c9e7335d64103a6 Mon Sep 17 00:00:00 2001 From: elainelee Date: Fri, 27 Jul 2018 13:50:59 -0700 Subject: [PATCH 73/76] File Browser filters: setting Pathology metadata data attributes to 1. data type 2. data format 3. disease code (removing Platform, Data Category, and Experimental Strategy) --- cohorts/file_helpers.py | 2 +- cohorts/metadata_helpers.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cohorts/file_helpers.py b/cohorts/file_helpers.py index e632c48a..b4f9c2bb 100644 --- a/cohorts/file_helpers.py +++ b/cohorts/file_helpers.py @@ -282,7 +282,7 @@ def cohort_files(cohort_id, inc_filters=None, user=None, limit=25, page=1, offse if case_barcode: inc_filters['case_barcode'] = [case_barcode] counts = count_public_data_type(user, count_select_clause, - inc_filters, cohort_programs, (type is not None and type != 'all'), build) + inc_filters, cohort_programs, (type is not None and type != 'all'), build, type) stop = time.time() logger.info("[STATUS] Time to count public data files: {}s".format(str((stop-start)))) diff --git a/cohorts/metadata_helpers.py b/cohorts/metadata_helpers.py index 97971601..812b2575 100644 --- a/cohorts/metadata_helpers.py +++ b/cohorts/metadata_helpers.py @@ -232,6 +232,8 @@ def fetch_build_data_attr(build, type=None): if type == 'dicom' : metadata_data_attrs = ['disease_code', ] + elif type == 'camic': + metadata_data_attrs = ['data_type', 'data_format', 'disease_code', ] else: metadata_data_attrs = ['data_type', 'data_category','experimental_strategy','data_format','platform', 'disease_code',] try: From 02e9d749da71fdc0e38d8dd439a829b563dff0ca Mon Sep 17 00:00:00 2001 From: "S. 
Paquette" Date: Fri, 27 Jul 2018 15:30:53 -0700 Subject: [PATCH 74/76] -> Fix for new tables --- cohorts/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cohorts/views.py b/cohorts/views.py index 8054905b..736f8111 100755 --- a/cohorts/views.py +++ b/cohorts/views.py @@ -2201,7 +2201,7 @@ def export_data(request, cohort_id=0, export_type=None, export_sub_type=None): GROUP BY case_barcode, sample_barcode ) cs ON ((NOT cs.sample_barcode ='' AND cs.sample_barcode=md.sample_barcode) OR (cs.case_barcode=md.case_barcode)) - WHERE md.file_uploaded {filter_conditions} + WHERE TRUE {filter_conditions} GROUP BY md.sample_barcode, md.case_barcode, cloud_storage_location, md.platform, md.data_type, md.data_category, exp_strategy, md.data_format, gdc_file_uuid, gdc_case_uuid, md.project_short_name, cohort_id, build, date_added From 0ad119a94ecdf8d1c132d34ea8362c779c74a371 Mon Sep 17 00:00:00 2001 From: s-paquette Date: Mon, 30 Jul 2018 17:32:44 -0700 Subject: [PATCH 75/76] -> Bugfix: endswith, not endsWith --- google_helpers/bigquery/bq_support.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/google_helpers/bigquery/bq_support.py b/google_helpers/bigquery/bq_support.py index 5a2db4d9..7588e210 100644 --- a/google_helpers/bigquery/bq_support.py +++ b/google_helpers/bigquery/bq_support.py @@ -504,19 +504,19 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with filter_string += "{}{} = @{}".format('' if not field_prefix else field_prefix, attr, param_name) elif query_param['parameterType']['type'] == 'INT64': - if attr.endsWith('_gt') or attr.endsWith('_gte'): + if attr.endswith('_gt') or attr.endswith('_gte'): filter_string += "{}{} >{} @{}".format( '' if not field_prefix else field_prefix, attr[:attr.rfind('_')], - '=' if attr.endsWith('_gte') else '', + '=' if attr.endswith('_gte') else '', param_name ) - elif attr.endsWith('_lt') or attr.endsWith('_lte'): + elif attr.endswith('_lt') or attr.endswith('_lte'): filter_string += "{}{} <{} @{}".format( '' if not field_prefix else field_prefix, attr[:attr.rfind('_')], - '=' if attr.endsWith('_lte') else '', + '=' if attr.endswith('_lte') else '', param_name ) - elif len(values) == 2 and attr.endsWith('_btw'): + elif len(values) == 2 and attr.endswith('_btw'): query_param['parameterType']['type'] = ('STRING' if re.compile(ur'[^0-9\.,]', re.UNICODE).search(values[0]) else 'INT64') param_name_1 = param_name + '_btw_1' param_name_2 = param_name + '_btw_2' From c192a574854497f3182a02a16f22cb3965e24780 Mon Sep 17 00:00:00 2001 From: s-paquette Date: Mon, 30 Jul 2018 17:37:42 -0700 Subject: [PATCH 76/76] -> Bugfix for BETWEEN support --- google_helpers/bigquery/bq_support.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/google_helpers/bigquery/bq_support.py b/google_helpers/bigquery/bq_support.py index 7588e210..b7d18a41 100644 --- a/google_helpers/bigquery/bq_support.py +++ b/google_helpers/bigquery/bq_support.py @@ -528,9 +528,9 @@ def build_bq_filter_and_params(filters, comb_with='AND', param_suffix=None, with query_param_1 = query_param query_param_2 = copy.deepcopy(query_param) query_param = [query_param_1, query_param_2, ] - query_param_1['name'] = query_param_1 + query_param_1['name'] = param_name_1 query_param_1['parameterValue']['value'] = values[0] - query_param_2['name'] = query_param_2 + query_param_2['name'] = param_name_2 query_param_2['parameterValue']['value'] = values[1] else: