AfricasVoices · DanielMwendwa · Jul 18, 2024 · Mar 24, 2023 · Mar 24, 2023 · Mar 24, 2023
diff --git a/configurations/test_pipeline_configuration.py b/configurations/test_pipeline_configuration.py
@@ -173,6 +173,32 @@
             )
         )
     ],
+    kobotoolbox_sources=[
+        KoboToolBoxSource(
+            token_file_url="gs://avf-credentials/dev-kobotoolbox-credentials.json",
+            sync_config=KoboToolBoxToEngagementDBConfiguration(
+                asset_uid="aGHhW23K5kyM6xwh3uEeaY",
+                participant_id_configuration=KoboToolBoxParticipantIdConfiguration(
+                    data_column_name="phone_number",
+                    id_type=KoboToolBoxParticipantIdTypes.KENYA_MOBILE_NUMBER
+                ),
+                ignore_invalid_mobile_numbers=True,
+                question_configurations=[
+                    # Long answer
+                    KoboToolBoxQuestionConfiguration(data_column_name="leap_s05e01?", engagement_db_dataset="s01e01"),
+
+                    # Multiple choice question
+                    KoboToolBoxQuestionConfiguration(data_column_name="gender", engagement_db_dataset="gender"),
+
+                    # Numeric answer
+                    KoboToolBoxQuestionConfiguration(data_column_name="age", engagement_db_dataset="age"),
+
+                    # Multiple choice question
+                    KoboToolBoxQuestionConfiguration(data_column_name="disability", engagement_db_dataset="disability"),
+                ]
+            )
+        )
+    ],
     coda_sync=CodaConfiguration(
         coda=CodaClientConfiguration(credentials_file_url="gs://avf-credentials/coda-staging.json"),
         sync_config=CodaSyncConfiguration(

diff --git a/docker-run-kobotoolbox-to-engagement-db.sh b/docker-run-kobotoolbox-to-engagement-db.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# Builds the docker image for this stage and runs sync_kobotoolbox_to_engagement_db.py in a container created from it.
+set -e
+
+PROJECT_NAME="$(<configurations/docker_image_project_name.txt)"
+IMAGE_NAME="$PROJECT_NAME-kobotoolbox-to-engagement-db"
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --incremental-cache-volume)
+            INCREMENTAL_ARG="--incremental-cache-path /cache"
+            INCREMENTAL_CACHE_VOLUME_NAME="$2"
+            shift 2;;
+        --)
+            shift
+            break;;
+        *)
+            break;;
+    esac
+done
+
+# Check that the correct number of arguments were provided; exit non-zero so `set -e` callers stop on misuse.
+if [[ $# -ne 5 ]]; then
+    echo "Usage: $0
+    [--incremental-cache-volume <incremental-cache-volume>]
+    <user> <google-cloud-credentials-file-path> <configuration-file> <code-schemes-dir> <data-dir>"
+    exit 1
+fi
+
+# Assign the program arguments to bash variables.
+USER=$1
+GOOGLE_CLOUD_CREDENTIALS_PATH=$2
+CONFIGURATION_FILE=$3
+CODE_SCHEMES_DIR=$4
+DATA_DIR=$5
+
+# Build an image for this pipeline stage.
+docker build -t "$IMAGE_NAME" .
+
+# Create a container from the image that was just built.
+CMD="pipenv run python -u sync_kobotoolbox_to_engagement_db.py ${INCREMENTAL_ARG} ${USER} \
+    /credentials/google-cloud-credentials.json configuration"
+
+if [[ "$INCREMENTAL_ARG" ]]; then
+    container="$(docker container create -t -w /app --mount source="$INCREMENTAL_CACHE_VOLUME_NAME",target=/cache "$IMAGE_NAME" /bin/bash -c "$CMD")"
+else
+    container="$(docker container create -t -w /app "$IMAGE_NAME" /bin/bash -c "$CMD")"
+fi
+
+echo "Created container $container"
+container_short_id=${container:0:7}
+
+# Copy input data into the container
+echo "Copying $GOOGLE_CLOUD_CREDENTIALS_PATH -> $container_short_id:/credentials/google-cloud-credentials.json"
+docker cp "$GOOGLE_CLOUD_CREDENTIALS_PATH" "$container:/credentials/google-cloud-credentials.json"
+
+echo "Copying $CODE_SCHEMES_DIR -> $container_short_id:/app/code_schemes"
+docker cp "$CODE_SCHEMES_DIR" "$container:/app/code_schemes"
+
+echo "Copying $CONFIGURATION_FILE -> $container_short_id:/app/configuration.py"
+docker cp "$CONFIGURATION_FILE" "$container:/app/configuration.py"
+
+# Run the container
+echo "Starting container $container_short_id"
+docker start -a "$container"
+
+# Copy cache data out of the container for backup
+if [[ "$INCREMENTAL_ARG" ]]; then
+    echo "Copying $container_short_id:/cache/. -> $DATA_DIR/Cache"
+    mkdir -p "$DATA_DIR/Cache"
+    docker cp "$container:/cache/." "$DATA_DIR/Cache"
+fi
+
+# Tear down the container when it has run successfully
+docker container rm "$container" >/dev/null
diff --git a/run_pipeline.sh b/run_pipeline.sh
@@ -3,9 +3,9 @@
 set -e
 
 
-if [[ $# -ne 6 ]]; then
+if [[ $# -ne 7 ]]; then
     echo "Usage: ./run_pipeline.sh"
-    echo "  <user> <pipeline-name> <google-cloud-credentials-file-path> <configuration-module> <data-dir>"
+    echo "<user> <pipeline-name> <google-cloud-credentials-file-path> <configuration-file> <code-schemes-dir> <data-dir> <archive-dir>"
     echo "Runs the pipeline end-to-end (sync-rapid-pro-to-engagement-db, sync-engagement-db-to-coda, sync-coda-to-engagement-db,\
           sync-engagement-db-to-rapid-pro, run-engagement-db-to-analysis, ARCHIVE)"
     exit 1
@@ -14,31 +14,44 @@ fi
 USER=$1
 PIPELINE_NAME=$2
 GOOGLE_CLOUD_CREDENTIALS_PATH=$3
-CONFIGURATION_MODULE=$4
-DATA_DIR=$5
-ARCHIVE_LOCATION=$6
+CONFIGURATION_FILE=$4
+CODE_SCHEMES_DIR=$5
+DATA_DIR=$6
+ARCHIVE_DIR=$7
 
 DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
 HASH=$(git rev-parse HEAD)
 RUN_ID="$DATE-$HASH"
-ARCHIVE_FILE="$ARCHIVE_LOCATION/data-$RUN_ID.tar.gzip"
+ARCHIVE_FILE="$ARCHIVE_DIR/data-$RUN_ID.tar.gzip"
 
-./docker-run-log-pipeline-event.sh  "$CONFIGURATION_MODULE" "$GOOGLE_CLOUD_CREDENTIALS_PATH" "$RUN_ID" "PipelineRunStart"
+echo "Starting a new pipeline run with id ${RUN_ID}"
 
-./docker-sync-rapid-pro-to-engagement-db.sh --incremental-cache-volume "$PIPELINE_NAME-rapid-pro-to-engagement-db-cache"  \
-                         "$USER" "$GOOGLE_CLOUD_CREDENTIALS_PATH" "$CONFIGURATION_MODULE" "$DATA_DIR"
+./docker-run-log-pipeline-event.sh  "$CONFIGURATION_FILE" "$CODE_SCHEMES_DIR" "$GOOGLE_CLOUD_CREDENTIALS_PATH" "$RUN_ID" "PipelineRunStart"
 
-./docker-sync-engagement-db-to-coda.sh --incremental-cache-volume "$PIPELINE_NAME-engagement-db-to-coda-cache" "$USER" \
-                        "$GOOGLE_CLOUD_CREDENTIALS_PATH" "$CONFIGURATION_MODULE" "$DATA_DIR"
+./docker-sync-rapid-pro-to-engagement-db.sh \
+    --incremental-cache-volume "$PIPELINE_NAME-rapid-pro-to-engagement-db-cache"  \
+    "$USER" "$GOOGLE_CLOUD_CREDENTIALS_PATH" "$CONFIGURATION_FILE" "$CODE_SCHEMES_DIR" "$DATA_DIR"
 
-./docker-sync-coda-to-engagement-db.sh --incremental-cache-volume --skip-updating-coda-users-and-code-schemes "$PIPELINE_NAME-coda-to-engagement-db-cache" "$USER" \
-                        "$GOOGLE_CLOUD_CREDENTIALS_PATH" "$CONFIGURATION_MODULE" "$DATA_DIR"
+./docker-run-kobotoolbox-to-engagement-db.sh \
+    --incremental-cache-volume "$PIPELINE_NAME-kobotoolbox-to-engagement-db-cache-prod"  \
+    "$USER" "$GOOGLE_CLOUD_CREDENTIALS_PATH" "$CONFIGURATION_FILE" "$CODE_SCHEMES_DIR" "$DATA_DIR"
 
-./docker-run-engagement-db-to-analysis.sh --incremental-cache-volume "$PIPELINE_NAME-engagement-db-to-analysis-cache" \
-                        "$USER" "$GOOGLE_CLOUD_CREDENTIALS_PATH" "$CONFIGURATION_MODULE" "$DATA_DIR"
+./docker-sync-engagement-db-to-coda.sh \
+    --incremental-cache-volume "$PIPELINE_NAME-engagement-db-to-coda-cache" \
+    "$USER" "$GOOGLE_CLOUD_CREDENTIALS_PATH" "$CONFIGURATION_FILE" "$CODE_SCHEMES_DIR" "$DATA_DIR"
+
+./docker-sync-coda-to-engagement-db.sh \
+    --incremental-cache-volume "$PIPELINE_NAME-coda-to-engagement-db-cache" \
+    "$USER" "$GOOGLE_CLOUD_CREDENTIALS_PATH" "$CONFIGURATION_FILE" "$CODE_SCHEMES_DIR" "$DATA_DIR"
+
+./docker-run-engagement-db-to-analysis.sh \
+    --incremental-cache-volume "$PIPELINE_NAME-engagement-db-to-analysis-cache" \
+    "$USER" "$GOOGLE_CLOUD_CREDENTIALS_PATH" "$CONFIGURATION_FILE" "$CODE_SCHEMES_DIR" "$DATA_DIR"
 
 ./archive_data_dir.sh "$DATA_DIR" "$ARCHIVE_FILE"
 
-./docker-run-upload-archive-files.sh "$USER" "$GOOGLE_CLOUD_CREDENTIALS_PATH" "$CONFIGURATION_MODULE" "$ARCHIVE_LOCATION"
+./docker-run-upload-archive-files.sh \
+    "$USER" "$GOOGLE_CLOUD_CREDENTIALS_PATH" "$CONFIGURATION_FILE" "$CODE_SCHEMES_DIR" "$ARCHIVE_DIR"
 
-./docker-run-log-pipeline-event.sh  "$CONFIGURATION_MODULE" "$GOOGLE_CLOUD_CREDENTIALS_PATH" "$RUN_ID" "PipelineRunEnd"
+./docker-run-log-pipeline-event.sh \
+    "$CONFIGURATION_FILE" "$CODE_SCHEMES_DIR" "$GOOGLE_CLOUD_CREDENTIALS_PATH" "$RUN_ID" "PipelineRunEnd"
diff --git a/src/kobotoolbox_to_engagement_db/__init__.py b/src/kobotoolbox_to_engagement_db/__init__.py
diff --git a/src/kobotoolbox_to_engagement_db/configuration.py b/src/kobotoolbox_to_engagement_db/configuration.py
@@ -0,0 +1,88 @@
+import json
+
+from core_data_modules.logging import Logger
+from storage.google_cloud import google_cloud_utils
+
+
+log = Logger(__name__)
+
+
+class KoboToolBoxParticipantIdTypes:
+    # Valid `id_type` values for `KoboToolBoxParticipantIdConfiguration`. TODO: Consider moving this to core
+    KENYA_MOBILE_NUMBER = "kenya_mobile_number"
+
+
+class KoboToolBoxParticipantIdConfiguration:
+    def __init__(self, data_column_name, id_type):
+        """
+        Initializes a configuration object for the question whose answer identifies the participant.
+
+        :param data_column_name: The KoboToolBox variable name that stores the response(s) to the id question,
+                                 e.g. the variable holding the answers to "What is your phone number?".
+        :type data_column_name: str
+        :param id_type: The type of id the question collects. See `KoboToolBoxParticipantIdTypes` for valid values.
+        :type id_type: str
+        """
+        self.data_column_name = data_column_name
+        self.id_type = id_type
+
+
+class KoboToolBoxQuestionConfiguration:
+    def __init__(self, data_column_name, engagement_db_dataset):
+        """
+        Initializes a configuration object pairing the KoboToolBox variable name to sync from with the engagement database dataset to sync to.
+
+        :param data_column_name: The KoboToolBox variable name that stores the responses to a question.
+        :type data_column_name: str
+        :param engagement_db_dataset: Name of the dataset to use in the engagement database.
+        :type engagement_db_dataset: str
+        """
+        self.data_column_name = data_column_name
+        self.engagement_db_dataset = engagement_db_dataset
+
+#TODO: Extract common config and move to common/src
+class KoboToolBoxToEngagementDBConfiguration:
+    def __init__(self, asset_uid, question_configurations, participant_id_configuration=None, ignore_invalid_mobile_numbers=False):
+        """
+        Initializes a configuration for syncing a KoboToolBox form with the engagement database.
+
+        :param asset_uid: The unique identifier of the KoboToolBox form to sync with the engagement database.
+        :type asset_uid: str
+        :param question_configurations: The list of `KoboToolBoxQuestionConfiguration` objects, one for each question to
+                                        sync. Each maps a KoboToolBox variable name to an engagement database dataset.
+        :type question_configurations: list of KoboToolBoxQuestionConfiguration
+        :param participant_id_configuration: Optional configuration for the participant uuid.
+                                             If set, the participant uuid will be derived from the answer to an
+                                             id question, otherwise it will be set to the form response id.
+        :type participant_id_configuration: KoboToolBoxParticipantIdConfiguration | None
+        :param ignore_invalid_mobile_numbers: Whether to ignore invalid mobile numbers during validation.
+                                              If a participant provides an invalid mobile number, instead of the pipeline
+                                              terminating with a ValueError the participant uuid will be derived from the form response id.
+        :type ignore_invalid_mobile_numbers: bool
+        :raises AssertionError: If `ignore_invalid_mobile_numbers` is set to True but `participant_id_configuration` has an
+                                id_type that is not `KoboToolBoxParticipantIdTypes.KENYA_MOBILE_NUMBER`.
+        """
+        self.asset_uid = asset_uid
+        self.question_configurations = question_configurations
+        self.participant_id_configuration = participant_id_configuration
+        self.ignore_invalid_mobile_numbers = ignore_invalid_mobile_numbers
+
+        # Raised explicitly instead of via `assert` so the check is not stripped when python runs with -O.
+        if ignore_invalid_mobile_numbers and participant_id_configuration is not None and \
+                participant_id_configuration.id_type != KoboToolBoxParticipantIdTypes.KENYA_MOBILE_NUMBER:
+            raise AssertionError(f"`ignore_invalid_mobile_numbers` cannot be set to True "
+                                 f"if participant id type is {participant_id_configuration.id_type}. See `KoboToolBoxToEngagementDBConfiguration`")
+
+
+class KoboToolBoxSource:
+    def __init__(self, token_file_url, sync_config):
+        """
+        Initializes a KoboToolBoxSource instance for syncing KoboToolBox form data to an engagement database.
+
+        :param token_file_url: The GS url of the KoboToolBox api token file.
+        :type token_file_url: str
+        :param sync_config: The sync configuration for the KoboToolBox form data.
+        :type sync_config: KoboToolBoxToEngagementDBConfiguration
+        """
+        self.token_file_url = token_file_url
+        self.sync_config = sync_config
diff --git a/src/kobotoolbox_to_engagement_db/kobotoolbox_client.py b/src/kobotoolbox_to_engagement_db/kobotoolbox_client.py
@@ -0,0 +1,87 @@
+import requests
+import json
+from dateutil.parser import isoparse
+
+from storage.google_cloud import google_cloud_utils
+from core_data_modules.logging import Logger
+
+log = Logger(__name__)
+
+BASE_URL = "https://kobo.humanitarianresponse.info/api/v2/assets"
+
+
+class KoboToolBoxClient:
+    """Static helpers for downloading form responses from the KoboToolBox v2 assets API."""
+    @staticmethod
+    def get_authorization_headers(google_cloud_credentials_file_path, token_file_url):
+        """
+        Retrieves a KoboToolBox API token and returns it as a dictionary of authorization headers.
+
+        :param google_cloud_credentials_file_path: Path to the Google Cloud service account credentials file to use when
+                                                downloading api token.
+        :type google_cloud_credentials_file_path: str
+        :param token_file_url: GS url of the json file that holds the KoboToolBox account api token under `api_token`.
+        :type token_file_url: str
+        :return: A dictionary of authorization headers containing the KoboToolBox API token.
+        :rtype: dict
+        """
+        log.info('Downloading KoboToolBox access token...')
+        credentials = json.loads(google_cloud_utils.download_blob_to_string(
+            google_cloud_credentials_file_path, token_file_url).strip())
+
+        return {"Authorization": f'Token {credentials["api_token"]}'}
+
+    @staticmethod
+    def get_form_responses(authorization_headers, asset_uid, submitted_after_exclusive=None):
+        """
+        Retrieves the responses for a specified kobotoolbox form.
+
+        :param authorization_headers: A dictionary of authorization headers for the API call.
+        :type authorization_headers: dict
+        :param asset_uid: The UID of the form for which responses are to be retrieved.
+        :type asset_uid: str
+        :param submitted_after_exclusive: A datetime object specifying the earliest submission time. If set, only downloads responses last
+                                        submitted after this datetime. If None, downloads responses from all of time.
+        :type submitted_after_exclusive: datetime.datetime | None
+        :raises: requests.exceptions.RequestException: If an error occurs while making the API call.
+        :return: A list of dictionaries, each representing a response to the specified form.
+        :rtype: list of dict
+
+        Examples:
+            To retrieve all responses for a kobotoolbox form:
+
+            >>> authorization_headers = {'Authorization': 'Token your_token'}
+            >>> asset_uid = 'your_form_uid'
+            >>> form_responses = KoboToolBoxClient.get_form_responses(authorization_headers, asset_uid)
+            >>> print(len(form_responses))
+            100
+
+            To retrieve responses submitted after a specific time:
+
+            >>> submitted_after_exclusive = datetime.datetime(2022, 1, 1)
+            >>> form_responses = KoboToolBoxClient.get_form_responses(authorization_headers, asset_uid, submitted_after_exclusive)
+            >>> print(len(form_responses))
+            50
+        """
+        url = f'{BASE_URL}/{asset_uid}/data/'
+        params = {"format": "json"}
+        timestamp_log = ""
+        if submitted_after_exclusive is not None:
+            submitted_after_exclusive = submitted_after_exclusive.isoformat()
+            timestamp_log = f", last submitted after {submitted_after_exclusive}"
+            # Sent via `params` so requests url-encodes it; a raw "+" from a utc offset would otherwise decode as a space.
+            params["query"] = f'{{"_submission_time":{{"$gt":"{submitted_after_exclusive}"}}}}'
+            log.info(f"Downloading responses for Asset '{asset_uid}'{timestamp_log}")
+        else:
+            log.info(f"Downloading all responses for Asset '{asset_uid}'")
+
+        # Certificate verification stays at the requests default (enabled) because this request carries the api token.
+        response = requests.get(url, params=params, headers=authorization_headers)
+        if response.content:
+            form_responses = json.loads(response.content)['results']
+            log.info(f"Downloaded {len(form_responses)} total responses")
+        else:
+            log.info(f"No responses downloaded for Asset '{asset_uid}'{timestamp_log}. Status code: {response.status_code}")
+            form_responses = []
+
+        return form_responses