diff --git a/datasets/america_health_rankings/pipelines/_images/run_csv_transform_kub/Dockerfile b/datasets/america_health_rankings/pipelines/_images/run_csv_transform_kub/Dockerfile
index 62b210f95..02ecb9775 100644
--- a/datasets/america_health_rankings/pipelines/_images/run_csv_transform_kub/Dockerfile
+++ b/datasets/america_health_rankings/pipelines/_images/run_csv_transform_kub/Dockerfile
@@ -10,6 +10,7 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
+# limitations under the License.
 FROM python:3.8

 # Allow statements and log messages to appear in Cloud logs
@@ -21,6 +22,15 @@
 COPY requirements.txt ./
 # Install the packages specified in the requirements file
 RUN python3 -m pip install --no-cache-dir -r requirements.txt
+# Install google storage gcloud to communicate with GCS buckets
+RUN python3 -m pip install fsspec
+RUN python3 -m pip install gcsfs
+RUN curl -sSL https://sdk.cloud.google.com | bash
+ENV PATH $PATH:/root/google-cloud-sdk/bin
+
+# Install openpyxl
+RUN python3 -m pip install openpyxl
+
 # The WORKDIR instruction sets the working directory for any RUN, CMD,
 # ENTRYPOINT, COPY and ADD instructions that follow it in the Dockerfile.
 # If the WORKDIR doesn’t exist, it will be created even if it’s not used in
diff --git a/datasets/america_health_rankings/pipelines/ahr/ahr_dag.py b/datasets/america_health_rankings/pipelines/ahr/ahr_dag.py
index 52a099556..a854cd145 100644
--- a/datasets/america_health_rankings/pipelines/ahr/ahr_dag.py
+++ b/datasets/america_health_rankings/pipelines/ahr/ahr_dag.py
@@ -1,4 +1,4 @@
-# Copyright 2021 Google LLC
+# Copyright 2022 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@
     dag_id="america_health_rankings.ahr",
     default_args=default_args,
     max_active_runs=1,
-    schedule_interval="@daily",
+    schedule_interval="3 10 * * *",
     catchup=False,
     default_view="graph",
 ) as dag:
@@ -36,10 +36,10 @@
     # Run CSV transform within kubernetes pod
     ahr_transform_csv = kubernetes_pod.KubernetesPodOperator(
         task_id="ahr_transform_csv",
-        startup_timeout_seconds=600,
         name="america_health_rankings_ahr",
-        namespace="composer",
-        service_account_name="datasets",
+        namespace="composer-user-workloads",
+        service_account_name="default",
+        config_file="/home/airflow/composer_kube_config",
         image_pull_policy="Always",
         image="{{ var.json.america_health_rankings.container_registry.run_csv_transform_kub }}",
         env_vars={
@@ -51,10 +51,10 @@
             "CSV_HEADERS": '["edition","report_type","measure_name","state_name","subpopulation","value","lower_ci","upper_ci","source","source_date"]',
             "RENAME_MAPPINGS": '{"Edition": "edition","Report Type": "report_type","Measure Name": "measure_name","State Name": "state_name","Subpopulation": "subpopulation","Value": "value","Lower CI": "lower_ci","Upper CI": "upper_ci","Source": "source","Source Date": "source_date"}',
         },
-        resources={
-            "request_memory": "2G",
-            "request_cpu": "1",
-            "request_ephemeral_storage": "10G",
+        container_resources={
+            "memory": {"request": "16Gi"},
+            "cpu": {"request": "1"},
+            "ephemeral-storage": {"request": "10Gi"},
         },
     )

diff --git a/datasets/america_health_rankings/pipelines/ahr/pipeline.yaml b/datasets/america_health_rankings/pipelines/ahr/pipeline.yaml
index 16d2c6695..11e1542cc 100644
--- a/datasets/america_health_rankings/pipelines/ahr/pipeline.yaml
+++ b/datasets/america_health_rankings/pipelines/ahr/pipeline.yaml
@@ -29,7 +29,7 @@ dag:
       depends_on_past: False
       start_date: '2021-03-01'
     max_active_runs: 1
-    schedule_interval: "@daily"
+    schedule_interval: "3 10 * * *"
     catchup: False
     default_view: graph

@@ -40,10 +40,10 @@ dag:
       description: "Run CSV transform within kubernetes pod"
       args:
         task_id: "ahr_transform_csv"
-        startup_timeout_seconds: 600
         name: "america_health_rankings_ahr"
-        namespace: "composer"
-        service_account_name: "datasets"
+        namespace: "composer-user-workloads"
+        service_account_name: "default"
+        config_file: "/home/airflow/composer_kube_config"
         image_pull_policy: "Always"
         image: "{{ var.json.america_health_rankings.container_registry.run_csv_transform_kub }}"
         env_vars:
@@ -56,10 +56,13 @@ dag:
             ["edition","report_type","measure_name","state_name","subpopulation","value","lower_ci","upper_ci","source","source_date"]
           RENAME_MAPPINGS: >-
             {"Edition": "edition","Report Type": "report_type","Measure Name": "measure_name","State Name": "state_name","Subpopulation": "subpopulation","Value": "value","Lower CI": "lower_ci","Upper CI": "upper_ci","Source": "source","Source Date": "source_date"}
-        resources:
-          request_memory: "2G"
-          request_cpu: "1"
-          request_ephemeral_storage: "10G"
+        container_resources:
+          memory:
+            request: "16Gi"
+          cpu:
+            request: "1"
+          ephemeral-storage:
+            request: "10Gi"

     - operator: "GoogleCloudStorageToBigQueryOperator"
       description: "Task to load CSV data to a BigQuery table"