diff --git a/Dockerfile b/Dockerfile
index 3cf8b9ffc..73f910384 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -37,10 +37,10 @@ RUN mkdir -p /etc/service/uwsgi && \
     cp "${KOBOCAT_SRC_DIR}/docker/init.bash" /etc/my_init.d/10_init_kobocat.bash && \
     cp "${KOBOCAT_SRC_DIR}/docker/sync_static.sh" /etc/my_init.d/11_sync_static.bash && \
     mkdir -p "${KOBOCAT_SRC_DIR}/emails/" && \
-    chown -R wsgi "${KOBOCAT_SRC_DIR}/emails/" && \
+    chown -R "${UWSGI_USER}" "${KOBOCAT_SRC_DIR}/emails/" && \
     mkdir -p "${BACKUPS_DIR}" && \
     mkdir -p "${KOBOCAT_LOGS_DIR}" && \
-    chown -R wsgi "${KOBOCAT_LOGS_DIR}"
+    chown -R "${UWSGI_USER}" "${KOBOCAT_LOGS_DIR}"
 
 RUN echo "db:*:*:kobo:kobo" > /root/.pgpass && \
     chmod 600 /root/.pgpass
diff --git a/Dockerfile.kobocat_base b/Dockerfile.kobocat_base
index 20aab02c4..0335f00bd 100644
--- a/Dockerfile.kobocat_base
+++ b/Dockerfile.kobocat_base
@@ -5,7 +5,9 @@ FROM kobotoolbox/base-kobos:latest
 
 ENV KOBOCAT_TMP_DIR=/srv/kobocat_tmp \
     # Store editable packages (pulled from VCS repos) in their own directory.
-    PIP_EDITABLE_PACKAGES_DIR=/srv/pip_editable_packages
+    PIP_EDITABLE_PACKAGES_DIR=/srv/pip_editable_packages \
+    UWSGI_USER=wsgi \
+    UWSGI_GROUP=wsgi
 
 
 ###########################
@@ -37,4 +39,4 @@ RUN mkdir -p ${PIP_EDITABLE_PACKAGES_DIR} && \
 RUN mkdir -p /srv/src && \
     cd /srv/src && \
     git clone --depth 1 https://github.com/kobotoolbox/kobocat-template.git -b master && \
-    chown -R wsgi /srv/src/kobocat-template
+    chown -R ${UWSGI_USER} /srv/src/kobocat-template
diff --git a/docker/kobocat.ini b/docker/kobocat.ini
index 691d20c9d..551bd152d 100644
--- a/docker/kobocat.ini
+++ b/docker/kobocat.ini
@@ -13,6 +13,6 @@
 socket = 0.0.0.0:8000
 buffer-size = 32768
 harakiri = 120
-uid = wsgi
-gid = wsgi
+uid = $(UWSGI_USER)
+gid = $(UWSGI_GROUP)
 die-on-term = true
diff --git a/docker/run_celery.bash b/docker/run_celery.bash
index 831892418..c46c32f6d 100755
--- a/docker/run_celery.bash
+++ b/docker/run_celery.bash
@@ -7,4 +7,4 @@ CELERYD_OPTIONS="-Ofair --beat --loglevel=DEBUG"
 
 cd "${KOBOCAT_SRC_DIR}"
 
-exec /sbin/setuser wsgi python manage.py celeryd $CELERYD_OPTIONS
+exec /sbin/setuser "${UWSGI_USER}" python manage.py celeryd $CELERYD_OPTIONS
diff --git a/docker/run_uwsgi.bash b/docker/run_uwsgi.bash
index 3d2c5ee2d..bf366eda7 100755
--- a/docker/run_uwsgi.bash
+++ b/docker/run_uwsgi.bash
@@ -3,4 +3,10 @@ set -e
 
 source /etc/profile
 
-exec /sbin/setuser wsgi /usr/local/bin/uwsgi --ini "${KOBOCAT_SRC_DIR}/docker/kobocat.ini"
+
+if [[ "$(stat -c '%U' ${KOBOCAT_LOGS_DIR})" != "${UWSGI_USER}" ]]; then
+    echo 'Restoring ownership of Logs directory.'
+    chown -R "${UWSGI_USER}" "${KOBOCAT_LOGS_DIR}"
+fi
+
+exec /sbin/setuser "${UWSGI_USER}" /usr/local/bin/uwsgi --ini "${KOBOCAT_SRC_DIR}/docker/kobocat.ini"
diff --git a/docker/sync_static.sh b/docker/sync_static.sh
index deeda8360..9bc19c861 100755
--- a/docker/sync_static.sh
+++ b/docker/sync_static.sh
@@ -12,7 +12,7 @@ echo "Collecting static files..."
 python manage.py collectstatic -v 0 --noinput
 echo "Done"
 echo "Fixing permissions..."
-chown -R wsgi "${KOBOCAT_SRC_DIR}"
+chown -R "${UWSGI_USER}" "${KOBOCAT_SRC_DIR}"
 echo "Done."
 echo "Syncing to nginx folder..."
 rsync -aq ${KOBOCAT_SRC_DIR}/onadata/static/* /srv/static/
diff --git a/fabfile/legacy.py b/fabfile/legacy.py
index 220992ebb..f24f0d1fe 100644
--- a/fabfile/legacy.py
+++ b/fabfile/legacy.py
@@ -85,7 +85,7 @@ def reload(deployment_name, branch='master'):
 def deploy_ref(deployment_name, ref):
     setup_env(deployment_name)
     with cd(env.kc_path):
-        run("git fetch origin")
+        run("git fetch --all --tags")
         # Make sure we're not moving to an older codebase
         git_output = run_no_pty(
             'git rev-list {}..HEAD --count 2>&1'.format(ref))
@@ -136,5 +136,5 @@ def deploy_ref(deployment_name, ref):
     run("sudo restart uwsgi")
 
 
-def deploy(deployment_name, branch='master'):
-    deploy_ref(deployment_name, 'origin/{}'.format(branch))
+def deploy(deployment_name, branch='origin/master'):
+    deploy_ref(deployment_name, branch)
diff --git a/onadata/apps/logger/management/commands/fix_root_node_names.py b/onadata/apps/logger/management/commands/fix_root_node_names.py
index 156beacca..3f321565f 100644
--- a/onadata/apps/logger/management/commands/fix_root_node_names.py
+++ b/onadata/apps/logger/management/commands/fix_root_node_names.py
@@ -1,156 +1,204 @@
+import math
 import time
-from xml.dom import minidom
 
 from django.core.management.base import BaseCommand, CommandError
+from django.db import connections
+from django.db.models import Func, F, Value
 
 from onadata.apps.logger.models import XForm, Instance
 
-AUTO_NAME_INSTANCE_XML_SEARCH_STRING = 'uploaded_form_'
-
 
 def replace_first_and_last(s, old, new):
     s = s.replace(old, new, 1)
     # credit: http://stackoverflow.com/a/2556252
     return new.join(s.rsplit(old, 1))
 
 
-def get_xform_root_node_name(xform):
-    parsed = minidom.parseString(xform.xml.encode('utf-8'))
-    instance_xml = parsed.getElementsByTagName('instance')[0]
-    root_node_name = None
-    for child in instance_xml.childNodes:
-        if child.nodeType == child.ELEMENT_NODE:
-            root_node_name = child.nodeName
-            break
-    return root_node_name
-
-def write_same_line(stdout, output, last_len=[0]):
-    stdout.write(u'\r{}'.format(output), ending='')
+def write_same_line(dest_file, output, last_len=[0]):
+    dest_file.write(u'\r{}'.format(output), ending='')
     this_len = len(output)
     too_short = last_len[0] - this_len
     last_len[0] = this_len
     if too_short > 0:
-        stdout.write(' ' * too_short, ending='')
-    stdout.flush()
+        dest_file.write(' ' * too_short, ending='')
+    dest_file.flush()
+
+XFORM_ROOT_NODE_NAME_PATTERN = '[^<]*< *([^ ]+)'
+INSTANCE_ROOT_NODE_NAME_PATTERN = '\/ *([^\/> ]+) *>[^<>]*$'
 
 
 class Command(BaseCommand):
     '''
     This command cleans up inconsistences between the root instance node name
     specified in the form XML and the actual instance XML. Where a discrepancy
     exists, the instance will be changed to match the form. The cause of these
     mismatches is documented at
-    https://github.com/kobotoolbox/kobocat/issues/222. See also
+    https://github.com/kobotoolbox/kobocat/issues/222 and
+    https://github.com/kobotoolbox/kobocat/issues/358. See also
     https://github.com/kobotoolbox/kobocat/issues/242.
     '''
 
     help = 'fixes instances whose root node names do not match their forms'
 
     def add_arguments(self, parser):
         parser.add_argument(
-            '--minimum-instance-id',
+            '--minimum-instance-pk',
             type=int,
+            dest='pk__gte',
             help='consider only instances whose ID is greater than or equal '\
                 'to this number'
         )
         parser.add_argument(
-            '--minimum-form-id',
-            type=int,
-            help='consider only forms whose ID is greater than or equal '\
-                'to this number'
+            '--form-id-string',
+            dest='xform__id_string',
+            help='consider only forms matching this `id_string`. combine '\
+                'with --username to ensure an exact match'
         )
         parser.add_argument(
             '--username',
+            dest='xform__user__username',
            help='consider only forms belonging to a particular user'
         )
+        parser.add_argument(
+            '--instance-xml-search-string',
+            dest='xml__contains',
+            help='consider only instances whose XML contains this string'\
+        )
 
     def handle(self, *args, **options):
+        verbosity = options['verbosity']
+        if len(list(connections)) > 1:
+            raise NotImplementedError(
+                "This management command does not support multiple-database "
+                "configurations"
+            )
+        connection = connections['default']
+        if connection.Database.__name__ != 'psycopg2':
+            raise NotImplementedError(
+                "Only the `psycopg2` database backend is supported")
+
+        instances = Instance.objects.all().order_by('pk')
+        xforms = XForm.objects.all()
+        for option in (
+                'pk__gte',
+                'xform__id_string',
+                'xform__user__username',
+                'xml__contains'
+        ):
+            if options[option]:
+                instances = instances.filter(**{option: options[option]})
+                if option.startswith('xform__'):
+                    xforms = xforms.filter(
+                        **{option[len('xform__'):]: options[option]}
+                    )
+
+        instances = instances.annotate(
+            root_node_name=Func(
+                F('xml'),
+                Value(INSTANCE_ROOT_NODE_NAME_PATTERN),
+                function='regexp_matches'
+            )
+        ).values_list('pk', 'xform_id', 'root_node_name')
+        if not instances.exists():
+            self.stderr.write('No Instances found.')
+            return
         t0 = time.time()
-        forms_complete = 0
-        mismatch_count = 0
-        failed_forms = 0
+        self.stderr.write(
+            'Fetching Instances; please allow several minutes...', ending='')
+        instances = list(instances)
+        self.stderr.write(
+            'got {} in {} seconds.'.format(
+                len(instances),
+                int(time.time() - t0)
+            )
+        )
+
+        # Getting the XForm root node names separately is far more efficient
+        # than calling `regexp_matches` on `xform__xml` in the `Instance` query
+        xforms = xforms.annotate(
+            root_node_name=Func(
+                F('xml'),
+                Value(XFORM_ROOT_NODE_NAME_PATTERN),
+                function='regexp_matches'
+            )
+        ).values_list('pk', 'root_node_name')
+        self.stderr.write('Fetching XForm root node names...', ending='')
+        t0 = time.time()
+        xform_root_node_names = dict(xforms)
+        self.stderr.write(
+            'got {} in {} seconds.'.format(
+                len(xform_root_node_names),
+                int(time.time() - t0)
+            )
+        )
+
+        completed_instances = 0
+        changed_instances = 0
         failed_instances = 0
-        criteria = {'xml__contains': AUTO_NAME_INSTANCE_XML_SEARCH_STRING}
-        if options['minimum_instance_id']:
-            criteria['id__gte'] = options['minimum_instance_id']
-        if options['username']:
-            criteria['xform__user__username'] = options['username']
-        last_instance = Instance.objects.last()
-        kpi_auto_named_instances = Instance.objects.filter(**criteria)
-        affected_xforms = XForm.objects.filter(
-            id__in=kpi_auto_named_instances.values_list(
-                'xform_id', flat=True).distinct()
-        ).order_by('id')
-        if options['minimum_form_id']:
-            affected_xforms = affected_xforms.filter(
-                id__gte=options['minimum_form_id'])
-        if options['verbosity']:
-            self.stdout.write('Running slow query... ', ending='')
-            self.stdout.flush()
-        total_forms = affected_xforms.count()
-        for xform in affected_xforms.iterator():
-            try:
-                xform_root_node_name = get_xform_root_node_name(xform)
-            except Exception as e:
-                if options['verbosity']:
-                    self.stderr.write(
-                        '!!! Failed to get root node name for form {}: ' \
-                        '{}'.format(xform.id, e.message)
+        progress_interval = 1 # second
+        t0 = time.time()
+        t_last = t0
+
+        self.stdout.write(
+            'Instance\tXForm\tOld Root Node Name\tNew Root Node Name')
+        for instance in instances:
+            t_now = time.time()
+            if (verbosity > 1 and t_now - t_last >= progress_interval
+                    and completed_instances):
+                t_last = t_now
+                t_elapsed = t_now - t0
+                write_same_line(
+                    self.stderr,
+                    'Completed {} Instances: {} changed, {} failed; '
+                    '{}s elapsed, {} Instance/sec.'.format(
+                        completed_instances,
+                        changed_instances,
+                        failed_instances,
+                        int(t_elapsed),
+                        int(completed_instances / t_elapsed)
                     )
-                failed_forms += 1
+                )
+
+            instance_id = instance[0]
+            xform_id = instance[1]
+            # `regexp_matches` results come back as `list`s from the ORM
+            instance_root_node_name = instance[2]
+            xform_root_node_name = xform_root_node_names[xform_id]
+            if not len(instance_root_node_name) == 1:
+                self.stderr.write(
+                    '!!! Failed to get root node name for Instance {}'.format(
+                        instance_id)
+                )
+                failed_instances += 1
+                completed_instances += 1
                 continue
-            affected_instances = xform.instances.exclude(
-                xml__endswith='</{}>'.format(xform_root_node_name)
-            ).order_by('id')
-            for instance in affected_instances:
-                try:
-                    root_node_name = instance.get_root_node_name()
-                except Exception as e:
-                    if options['verbosity']:
-                        self.stderr.write(
-                            '!!! Failed to get root node name for instance ' \
-                            '{}: {}'.format(instance.id, e.message)
-                        )
-                    failed_instances += 1
-                    continue
-                # Our crude `affected_instances` filter saves us a lot of work,
-                # but it doesn't account for things like trailing
-                # whitespace--so there might not really be a discrepancy
-                if xform_root_node_name != root_node_name:
-                    instance.xml = replace_first_and_last(
-                        instance.xml, root_node_name, xform_root_node_name)
-                    instance.save(force=True)
-                    mismatch_count += 1
-                if options['verbosity'] > 1:
-                    write_same_line(
-                        self.stdout,
-                        u'Form {} ({}): corrected instance {}. ' \
-                        'Completed {} of {} forms.'.format(
-                            xform_root_node_name,
-                            xform.id,
-                            instance.id,
-                            forms_complete,
-                            total_forms
-                        ))
-            forms_complete += 1
-            if options['verbosity'] > 1:
-                write_same_line(
-                    self.stdout,
-                    u'Form {} ({}). Completed {} of {} forms.'.format(
-                        xform_root_node_name,
-                        xform.id,
-                        forms_complete,
-                        total_forms
-                    ))
-        t1 = time.time()
-        if options['verbosity']:
-            self.stdout.write('')
-        self.stdout.write(
-            'Corrected {} instances in {} minutes and {} seconds.'.format(
-                mismatch_count,
-                int((t1 - t0) / 60),
-                (t1 - t0) % 60
-            ))
-        if failed_forms or failed_instances:
+            if not len(xform_root_node_name) == 1:
                 self.stderr.write(
-                'Failed to process {} forms and {} instances'.format(
-                    failed_forms, failed_instances
-                )
+                    '!!! Failed to get root node name for XForm {}'.format(
+                        xform_id)
                 )
-        self.stdout.write(
-            'At the start of processing, the last instance ID ' \
-            'was {}.'.format(last_instance.id)
+                failed_instances += 1
+                completed_instances += 1
+                continue
+
+            instance_root_node_name = instance_root_node_name[0]
+            xform_root_node_name = xform_root_node_name[0]
+            if instance_root_node_name == xform_root_node_name:
+                completed_instances += 1
+                continue
+
+            queryset = Instance.objects.filter(pk=instance_id).only('xml')
+            fixed_xml = replace_first_and_last(
+                queryset[0].xml, instance_root_node_name, xform_root_node_name)
+            queryset.update(xml=fixed_xml)
+            self.stdout.write('{}\t{}\t{}\t{}'.format(
+                instance_id, xform_id,
+                instance_root_node_name, xform_root_node_name
+            ))
+            changed_instances += 1
+            completed_instances += 1
+
+        self.stderr.write(
+            '\nFinished {} Instances: {} changed, {} failed.'.format(
+                completed_instances,
+                changed_instances,
+                failed_instances
             )
+        )
+        self.stdout.write(
+            'At the start of processing, the last instance PK '
+            'was {}.'.format(instance_id)
+        )
diff --git a/onadata/apps/logger/tasks.py b/onadata/apps/logger/tasks.py
index 87eb5f2e8..2d4fb029b 100644
--- a/onadata/apps/logger/tasks.py
+++ b/onadata/apps/logger/tasks.py
@@ -5,10 +5,10 @@ from django.core.management import call_command
 
 
 @shared_task(soft_time_limit=600, time_limit=900)
-def fix_root_node_names(minimum_instance_id):
+def fix_root_node_names(**kwargs):
     call_command(
         'fix_root_node_names',
-        minimum_instance_id=minimum_instance_id,
+        **kwargs
     )
 
 ###### END ISSUE 242 FIX ######
diff --git a/onadata/apps/viewer/models/data_dictionary.py b/onadata/apps/viewer/models/data_dictionary.py
index 815b0f264..9daca43cb 100644
--- a/onadata/apps/viewer/models/data_dictionary.py
+++ b/onadata/apps/viewer/models/data_dictionary.py
@@ -148,6 +148,9 @@ def add_instances(self):
     def save(self, *args, **kwargs):
         if self.xls:
             survey = create_survey_from_xls(self.xls)
+            survey.update({
+                'name': survey.id_string,
+            })
             self.json = survey.to_json()
             self.xml = survey.to_xml()
             self._mark_start_time_boolean()
diff --git a/onadata/apps/viewer/tasks.py b/onadata/apps/viewer/tasks.py
index 0b1493680..f569e7c1a 100644
--- a/onadata/apps/viewer/tasks.py
+++ b/onadata/apps/viewer/tasks.py
@@ -1,6 +1,9 @@
+import logging
+import pytz
 import re
 import sys
-from celery import task
+from celery import task, shared_task
+from datetime import datetime, timedelta
 from django.conf import settings
 from django.core.mail import mail_admins
 from requests import ConnectionError
@@ -392,3 +395,32 @@ def email_mongo_sync_status():
             '\n\n'.join([before_report, after_report,
                          SYNC_MONGO_MANUAL_INSTRUCTIONS]))
+
+
+@shared_task(soft_time_limit=60, time_limit=90)
+def log_stuck_exports_and_mark_failed():
+    # How long can an export possibly run, not including time spent waiting in
+    # the Celery queue?
+    max_export_run_time = getattr(settings, 'CELERYD_TASK_TIME_LIMIT', 2100)
+    # Allow a generous grace period
+    max_allowed_export_age = timedelta(seconds=max_export_run_time * 4)
+    this_moment = datetime.now(tz=pytz.UTC)
+    oldest_allowed_timestamp = this_moment - max_allowed_export_age
+    stuck_exports = Export.objects.filter(
+        internal_status=Export.PENDING,
+        created_on__lt=oldest_allowed_timestamp
+    )
+    for stuck_export in stuck_exports:
+        logging.warning(
+            'Stuck export: pk {}, type {}, username {}, id_string {}, '
+            'age {}'.format(
+                stuck_export.pk,
+                stuck_export.export_type,
+                stuck_export.xform.user.username,
+                stuck_export.xform.id_string,
+                this_moment - stuck_export.created_on
+            )
+        )
+        # Export.save() is a busybody; bypass it with update()
+        stuck_exports.filter(pk=stuck_export.pk).update(
+            internal_status=Export.FAILED)
diff --git a/onadata/settings/common.py b/onadata/settings/common.py
index 02eb851d6..75b4f6a29 100644
--- a/onadata/settings/common.py
+++ b/onadata/settings/common.py
@@ -469,6 +469,7 @@ def configure_logging(logger, **kwargs):
 SUPPORTED_MEDIA_UPLOAD_TYPES = [
     'image/jpeg',
     'image/png',
+    'image/svg+xml',
     'audio/mpeg',
     'video/3gpp',
     'audio/wav',
diff --git a/onadata/settings/kc_environ.py b/onadata/settings/kc_environ.py
index a8a33adf5..92d660f7a 100644
--- a/onadata/settings/kc_environ.py
+++ b/onadata/settings/kc_environ.py
@@ -1,4 +1,5 @@
 import os
+from datetime import timedelta
 
 from onadata.settings.common import *
 
@@ -173,19 +174,77 @@ except raven.exceptions.InvalidGitRepository:
         pass
 
+    # The below is NOT required for Sentry to log unhandled exceptions, but it
+    # is necessary for capturing messages sent via the `logging` module.
+    # https://docs.getsentry.com/hosted/clients/python/integrations/django/#integration-with-logging
+    LOGGING = {
+        'version': 1,
+        'disable_existing_loggers': False, # Was `True` in Sentry documentation
+        'root': {
+            'level': 'WARNING',
+            'handlers': ['sentry'],
+        },
+        'formatters': {
+            'verbose': {
+                'format': '%(levelname)s %(asctime)s %(module)s '
+                          '%(process)d %(thread)d %(message)s'
+            },
+        },
+        'handlers': {
+            'sentry': {
+                'level': 'WARNING',
+                'class': 'raven.contrib.django.raven_compat.handlers.SentryHandler',
+            },
+            'console': {
+                'level': 'DEBUG',
+                'class': 'logging.StreamHandler',
+                'formatter': 'verbose'
+            }
+        },
+        'loggers': {
+            'django.db.backends': {
+                'level': 'ERROR',
+                'handlers': ['console'],
+                'propagate': False,
+            },
+            'raven': {
+                'level': 'DEBUG',
+                'handlers': ['console'],
+                'propagate': False,
+            },
+            'sentry.errors': {
+                'level': 'DEBUG',
+                'handlers': ['console'],
+                'propagate': False,
+            },
+        },
+    }
+    CELERYD_HIJACK_ROOT_LOGGER = False
+
 POSTGIS_VERSION = (2, 1, 2)
 
+CELERYBEAT_SCHEDULE = {
+    # Periodically mark exports stuck in the "pending" state as "failed"
+    # See https://github.com/kobotoolbox/kobocat/issues/315
+    'log-stuck-exports-and-mark-failed': {
+        'task': 'onadata.apps.viewer.tasks.log_stuck_exports_and_mark_failed',
+        'schedule': timedelta(hours=6),
+    },
+}
+
 ### ISSUE 242 TEMPORARY FIX ###
 # See https://github.com/kobotoolbox/kobocat/issues/242
 ISSUE_242_MINIMUM_INSTANCE_ID = os.environ.get(
     'ISSUE_242_MINIMUM_INSTANCE_ID', None)
+ISSUE_242_INSTANCE_XML_SEARCH_STRING = os.environ.get(
+    'ISSUE_242_INSTANCE_XML_SEARCH_STRING', 'uploaded_form_')
 if ISSUE_242_MINIMUM_INSTANCE_ID is not None:
-    from datetime import timedelta
-    CELERYBEAT_SCHEDULE = {
-        'fix-root-node-names': {
-            'task': 'onadata.apps.logger.tasks.fix_root_node_names',
-            'schedule': timedelta(hours=1),
-            'args': (int(ISSUE_242_MINIMUM_INSTANCE_ID),)
-        },
+    CELERYBEAT_SCHEDULE['fix-root-node-names'] = {
+        'task': 'onadata.apps.logger.tasks.fix_root_node_names',
+        'schedule': timedelta(hours=1),
+        'kwargs': {
+            'pk__gte': int(ISSUE_242_MINIMUM_INSTANCE_ID),
+            'xml__contains': ISSUE_242_INSTANCE_XML_SEARCH_STRING
+        }
     }
 
 ###### END ISSUE 242 FIX ######
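
Note on the regexp_matches annotation introduced in fix_root_node_names.py above: the command now extracts each root node name in SQL instead of parsing every instance's XML with minidom in Python. A minimal sketch of that technique follows, assuming the project's PostgreSQL backend and Django ORM; the helper function and its name are illustrative only and are not part of the patch.

    from django.db.models import F, Func, Value

    from onadata.apps.logger.models import Instance

    INSTANCE_ROOT_NODE_NAME_PATTERN = '\/ *([^\/> ]+) *>[^<>]*$'

    def instance_root_node_names(queryset=None):
        # Annotate each Instance with the root node name captured from its XML.
        # PostgreSQL's regexp_matches() returns an array, so the ORM hands back
        # a Python list per row: one element on a match, an empty list when
        # nothing matched, which is why the command checks len(...) == 1.
        queryset = Instance.objects.all() if queryset is None else queryset
        return queryset.annotate(
            root_node_name=Func(
                F('xml'),
                Value(INSTANCE_ROOT_NODE_NAME_PATTERN),
                function='regexp_matches',
            )
        ).values_list('pk', 'xform_id', 'root_node_name')

The same annotation is applied to XForm with XFORM_ROOT_NODE_NAME_PATTERN, and the two result sets are joined in Python by xform_id, which avoids running the regular expression against xform__xml once per instance.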