From 0281b84acc9f63f5a5783390d18c7416afc54f12 Mon Sep 17 00:00:00 2001 From: Gord Nuttall Date: Mon, 25 Jan 2021 13:06:35 -0700 Subject: [PATCH] Cleaned up authentication steps and readme after install test with business analyst. --- README.md | 143 ++++++++++++++++++-------------- ga_flattener.yaml | 4 +- ga_flattener_colon.yaml | 4 +- tools/pubsub_message_publish.py | 36 ++++---- 4 files changed, 105 insertions(+), 82 deletions(-) diff --git a/README.md b/README.md index 259f887..05332b8 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,89 @@ # README # -Google Analytics 360 Flattener. A Google Cloud Platform (GCP) solution that unnests (flattens) Google Analytics Data stored in Bigquery. The GCP resources for the solutions are installed via Deployment Manager. +Google Analytics 360 Flattener. A Google Cloud Platform (GCP) solution that unnests (flattens) Google Analytics Data stored in Bigquery. +The GCP resources for the solutions are installed via Deployment Manager. ## Local dependencies ## -* Python 3.7 or higher as base interpreter -* Create a virtual environment -* Install python packages using cf/requirements.txt +* Google Cloud Platform SDK. Download and install from these instructions: https://cloud.google.com/sdk/docs/install +* Python >= 3.7. Download and install from https://python.org. +* Web browser +* git (**optional** for cloning this code repository) -## Directories ## +## Prerequisites ## +1. Browse to https://console.cloud.google.com to create a Google GCP project or use + an existing project that has Google Analytics data flowing to it. + Referred to as **[PROJECT_ID]**. +2. Grant the installing user (you most likely) the pre-defined IAM role of "Owner". +3. As the installing user for **[PROJECT_ID]**, enable the following APIs + * Cloud Build API + * Cloud Functions API + * Identity and Access Management (IAM) API +4. 
As the installing user for **[PROJECT_ID]**, grant the following pre-defined IAM roles to + **[PROJECT_NUMBER]**@cloudservices.gserviceaccount.com (built in service + account) otherwise deployment will fail with permission errors. See + for + detailed explanation. + * Logs Configuration Writer + * Cloud Functions Developer + * pub/sub Admin +5. As the installing user for **[PROJECT_ID]**, create a bucket for staging code during deployment, for example: + **[PROJECT_NUMBER]**-function-code-staging. Referred to as **[BUCKET_NAME]**. +6. Clone this github repo or download the source code from the releases section to your local machine or + cloud shell. +7. Edit the _ga_flattener.yaml_ and _ga_flattener_colon.yaml_ files, specifically all occurrences of the _properties-->codeBucket_ value. Set the value to **[BUCKET_NAME]** (see step above) + +**_The following steps are only required if you plan to backfill historical tables._** +8. Install python 3.7 or higher +9. From a command prompt, upgrade pip (Command: py -m pip install --upgrade pip) +10. Navigate to the root directory of the source code that was downloaded or cloned in step 6 above. +11. From a command prompt, install python virtual environments (Command: py -m pip install --user virtualenv) +12. Create a virtual environment for the source code in step 6 (Command: py -m venv venv) +13. Activate the virtual environment in the step above. +14. Install the python dependent packages into the virtual environment. (Command: pip install -r cf\requirements.txt) + +## Installation steps ## +1. Execute command: gcloud config set project **[PROJECT_ID]** +2. Execute command: gcloud config set account. **Note** - This must be the installing user from above prerequisites. +3. Navigate (locally) to root directory of this repository +4. 
If **[PROJECT_ID]** does **NOT** contain a colon (:) execute command: + * gcloud deployment-manager deployments create **[Deployment Name]** --config ga_flattener.yaml + otherwise follow these steps: + 1. execute command: + * gcloud deployment-manager deployments create **[Deployment Name]** --config ga_flattener_colon.yaml + 2. Trigger function (with a blank message) named **[Deployment Name]**-cfconfigbuilderps. It will create the necessary configuration file in the application's Google Cloud Storage bucket. An easy method to do this is to browse to https://console.cloud.google.com/functions and click the cloud function named **[Deployment Name]**-cfconfigbuilderps and go to the testing section and click "TEST THIS FUNCTION". + +## Verification steps ## +1. After installation, a configuration file named config_datasets.json exists in **gs://[Deployment Name]-[PROJECT_NUMBER]-adswerve-ga-flat-config/** (Cloud Storage Bucket within **[PROJECT_ID]**). This file contains all the datasets that have "ga_sessions_yyyymmdd" tables and which tables to unnest. This configuration is required for this GA flattener solution to run daily or to backfill historical data. Edit this file accordingly to include or exclude certain datasets or tables to unnest. For example: + * { "123456789": ["sessions","hits","products"] } will only flatten those 3 nested tables for GA view 123456789 + * { "123456789": ["sessions","hits","products", "promotions", "experiments"], "987654321": ["sessions","hits"] } will flatten all possible nested tables for GA view 123456789 but only _sessions_ and _hits_ for GA View 987654321. + +**_The following steps are only required if you plan to backfill historical tables._** +2. Modify values in the configuration section of tools/pubsub_message_publish.py accordingly. **Suggestion:** Use a small date range to start, like yesterday only. +3. 
From a gcloud command prompt, authenticate the installing user using command: + _gcloud auth application-default login_ +4. Run tools/pubsub_message_publish.py locally, which will publish a + simulated logging event of GA data being ingested into BigQuery. Check dataset(s) that are configured for new date sharded tables such as (depending on what is configured): + * ga_flat_experiments_(x) + * ga_flat_hits_(x) + * ga_flat_products_(x) + * ga_flat_promotions_(x) + * ga_flat_sessions_(x) + +## Un-install steps ## +1. Optional command to remove solution: + * gcloud deployment-manager deployments delete **[Deployment Name]** -q + +## Common errors ## +### Install ### +* **Message:** AccessDeniedException: 403 **[PROJECT_NUMBER]**@cloudbuild.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. + * **Resolution:** Ensure the value (Cloud Storage bucket name) configured in "codeBucket" setting of ga_flattener*.yaml is correct. **[PROJECT_NUMBER]**@cloudbuild.gserviceaccount.com only requires GCP predefined role of _Cloud Build Service Account_ +### Verification ### +* **Message:** google.auth.exceptions.DefaultCredentialsError: Could not automatically determine credentials. Please set GOOGLE_APPLICATION_CREDENTIALS or explicitly create credentials and re-run the application. For more information, please see https://cloud.google.com/docs/authentication/getting-started + * **Resolution:** Ensure you run the gcloud command _gcloud auth application-default login_ as this sets up the required authentication. 
+ +## Repository directories ## * cf : pub/sub triggered cloud function that executes a destination query to unnest(flatten) the .ga_sessions_yyyymmdd table - immediately upon arrival in BigQuery into 5 tables: + immediately upon arrival in BigQuery into these tables, depending on the configuration: * ga_flat_sessions_yyyymmdd * ga_flat_hits_yyyymmdd * ga_flat_products_yyyymmdd @@ -23,7 +97,7 @@ Google Analytics 360 Flattener. A Google Cloud Platform (GCP) solution that unn location: [DEPLOYMENT NAME]-[PROJECT_NUMBER]-adswerve-ga-flat-config\config_datasets.json -## Files ## +## Repository files ## * dm_helper.py: provides consistent names for GCP resources accross solution. Configuration and constants also found in the class in this file @@ -34,57 +108,4 @@ Google Analytics 360 Flattener. A Google Cloud Platform (GCP) solution that unn * tools/pubsub_message_publish.py : python based utility to publish a message to simulate an event that's being monitored in GCP logging. Useful for smoke testing and back-filling data historically. -* LICENSE: BSD 3-Clause open source license - -## Prerequisites ## -1. Create Google GCP project or use an existing project that has Google - Analytics data flowing to it. Referred to as [PROJECT_ID] -2. Enable the Cloud Build API -3. Enable the Cloud Functions API -4. Enable the Identity and Access Management (IAM) API -5. Add "Logs Configuration Writer", "Cloud Functions Developer", "pub/sub Admin" pre - defined IAM roles to - [PROJECT_NUMBER]@cloudservices.gserviceaccount.com (built in service - account) otherwise deployment will fail with permission errors. See - for - detailed explanation. -6. Install gCloud locally or use cloud shell. -7. Clone this github repo -8. Create bucket for staging code during deployment, for example: - [PROJECT_NUMBER]-function-code-staging. Referred to as [BUCKET_NAME]. -9. Edit the ga_flattener.yaml file, specifically the - _properties-->codeBucket_ value of the function and httpfunction - resources. 
Set the value for both to [BUCKET_NAME] (see previous step) - -## Installation steps ## -1. Execute command: gcloud config set project [PROJECT_ID] -2. Execute command: gcloud config set account -3. Navigate (locally) to root directory of this repository -4. If [PROJECT_ID] does **NOT** contain a colon (:) execute command: - * gcloud deployment-manager deployments create [Deployment Name] --config ga_flattener.yaml - otherwise follow these steps: - 1. execute command: - * gcloud deployment-manager deployments create [Deployment Name] --config ga_flattener_colon.yaml - 2. Trigger function (with a blank message) named [Deployment Name]-cfconfigbuilderps. It will create the necessary configuration file in the applications Google Coud Storage bucket. - -## Verification steps ## -1. After installation, a configuration file named config_datasets.json will exists in gs://[Deployment Name]-[PROJECT_NUMBER]-adswerve-ga-flat-config/ (Cloud Storage Bucket within [PROJECT_ID]). This file will contains all the datasets that have "ga_sessions_yyyymmdd" tables and which tables to unnest. This configuration is required for the cloud function to execute. - -## Testing / Simulating Event ## -1. Modify values in lines 7-17 of - tools/pubsub_message_publish.py accordingly. -2. Run tools/pubsub_message_publish.py locally, which will publish a - simulated logging event of GA data being ingested into BigQuery. Check dataset for date sharded tables named: - * ga_flat_experiments_(x) - * ga_flat_hits_(x) - * ga_flat_products_(x) - * ga_flat_promotions_(x) - * ga_flat_sessions_(x) - -## Un-install steps ## -1. Optional command to remove solution: - * gcloud deployment-manager deployments delete [Deployment Name] -q - -## Common install errors ## -1. * **Message:** Step #2: AccessDeniedException: 403 [PROJECT_NUMBER]@cloudbuild.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. 
- * **Resolution:** Ensure the value (Cloud Storage bucket name) configured in "codeBucket" setting of ga_flattener*.yaml is correct. [PROJECT_NUMBER]@cloudbuild.gserviceaccount.com only requires GCP predefined role of _Cloud Build Service Account_ \ No newline at end of file +* LICENSE: BSD 3-Clause open source license \ No newline at end of file diff --git a/ga_flattener.yaml b/ga_flattener.yaml index 137759e..30fe2d8 100644 --- a/ga_flattener.yaml +++ b/ga_flattener.yaml @@ -39,7 +39,7 @@ resources: - name: function type: dmt_cloud_function.py properties: - # All the files that start with this prefix will be packed in the Cloud Function (PUBSUB) + # All the files that start with this prefix will be packed in the Cloud Function that flattens the data codeLocation: cf/ codeBucket: as-dev-gord-staging triggerType: pubsub @@ -51,7 +51,7 @@ resources: - name: httpfunction type: dmt_cloud_function.py properties: - # All the files that start with this prefix will be packed in the Cloud Function (HTTP) + # All the files that start with this prefix will be packed in the Cloud Function that sets up the configuration codeLocation: cfconfigbuilder/ codeBucket: as-dev-gord-staging triggerType: http diff --git a/ga_flattener_colon.yaml b/ga_flattener_colon.yaml index cdf94c4..b7d39e9 100644 --- a/ga_flattener_colon.yaml +++ b/ga_flattener_colon.yaml @@ -39,7 +39,7 @@ resources: - name: function type: dmt_cloud_function.py properties: - # All the files that start with this prefix will be packed in the Cloud Function (PUBSUB) + # All the files that start with this prefix will be packed in the Cloud Function that flattens the data codeLocation: cf/ codeBucket: 843904533364-function-code-staging triggerType: pubsub @@ -51,7 +51,7 @@ resources: - name: httpfunction type: dmt_cloud_function.py properties: - # All the files that start with this prefix will be packed in the Cloud Function (HTTP) + # All the files that start with this prefix will be packed in the Cloud Function that 
sets up the configuration codeLocation: cfconfigbuilderps/ codeBucket: 843904533364-function-code-staging triggerType: pubsub diff --git a/tools/pubsub_message_publish.py b/tools/pubsub_message_publish.py index c34e124..0492290 100644 --- a/tools/pubsub_message_publish.py +++ b/tools/pubsub_message_publish.py @@ -1,28 +1,30 @@ from google.cloud import pubsub_v1 -import os import json import datetime, time - from tests.test_base import BaseUnitTest -try: - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] -except: - print("setting GOOGLE_APPLICATION_CREDENTIALS env var") - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./credentials/as-dev-ian-0ef537352615.json" # mac - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "c:\\auth_keys\\AP Bootcamp 2019 - Gord-gordsserviceaccount-9eb6a157db12.json" # windows -topic_name = "gafltnr-topic" # pubsub topic your CF is subscribed to -project_id = "analyticspros.com:spotted-cinnamon-834" -IS_TEST = False # set to False to backfill, True for unit testing -dry_run = False # set to False to Backfill -datasets_to_backfill = ["102887025"] #GA Views to backfill, "24973611" +# To authenticate, run the following command. The account you choose will execute this python script +# gcloud auth application-default login + +'''*****************************''' +''' Configuration Section Start ''' +'''*****************************''' +topic_name = "gafltnr-topic" # pubsub topic your cloud function is subscribed to Example: [Deployment Name]-topic +project_id = "12345-project-gcp" # GCP project ID, example: [PROJECT_ID] +dry_run = True # set to False to Backfill. Setting to True will not publish any messages to pubsub, but simply show what would have been published. 
+# Desired dates to backfill, both start and end are inclusive +backfill_range_start = datetime.datetime(2021, 1, 10) +backfill_range_end = datetime.datetime(2021, 1, 10) # datetime.datetime.today() +datasets_to_backfill = ["123456789"] #GA Views to backfill, "24973611" +'''*****************************''' +''' Configuration Section End ''' +'''*****************************''' #Seconds to sleep between each property date shard SLEEP_TIME = 5 # throttling -# ga_sessions_YYYYMMDD tables of desired dates must exist in order to backfill. -# both start and end are inclusive -backfill_range_start = datetime.datetime(2015, 5, 26) -backfill_range_end = datetime.datetime(2016, 12, 5) # datetime.datetime.today() +#Unit Testing flag +IS_TEST = False # set to False to backfill, True for unit testing + if IS_TEST: datasets_to_backfill = [BaseUnitTest.DATASET]