diff --git a/README.md b/README.md index 6adca22..629163b 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ We needed a way to: 1) Reliably download files from a Globus collection over HTTPS -2) Decrypt them on the fly ([crypt4gh](https://github.com/EGA-archive/crypt4gh)) +2) Optionally decrypt them on the fly ([crypt4gh](https://github.com/EGA-archive/crypt4gh)) 3) Store the plaintext files in an object store (bucket), ready for cloud based data science workflows The [file handler CLI](https://github.com/ebi-gdp/globus-file-handler-cli) takes care of 1) and 2). @@ -12,18 +12,35 @@ The [file handler CLI](https://github.com/ebi-gdp/globus-file-handler-cli) takes Downloaded files can also be saved to a local filesystem. +> [!NOTE] +> This workflow grabs crypt4gh secret keys from the INTERVENE key handler service, but could be adapted to work with local crypt4gh key pairs + +### Table of Contents + +- [Parameters](#parameters) + * [File input](#file-input) + * [Secret key](#secret-key) + * [Application properties](#application-properties) + * [crypt4gh application properties](#crypt4gh-application-properties) +- [Example use cases](#example-use-cases) + * [Download files from a Globus collection over HTTPS](#download-files-from-a-globus-collection-over-https) + * [Downloading files with crypt4gh decryption on the fly](#downloading-files-with-crypt4gh-decryption-on-the-fly) + * [Downloading files to an object store (bucket)](#downloading-files-to-an-object-store) +- [Helm support](#helm-support) + ## Parameters +### File input + +> [!IMPORTANT] +> This parameter is mandatory + `--input` must be a JSON array with the following structure: ``` { "dir_path_on_guest_collection": "bwingfield@ebi.ac.uk/test_hapnest/", "files": [ - { - "filename": "hapnest.pvar", - "size": 278705850 - }, { "filename": "hapnest.pgen.crypt4gh", "size": 278825058 @@ -32,17 +49,39 @@ Downloaded files can also be saved to a local filesystem.
} ``` -`--config_secrets` must be a path to a spring boot application properties file with the following structure: +### Secret key + +> [!IMPORTANT] +> This parameter is optional (required only when `--decrypt` is set) + +`--secret_key` must be a JSON file with the following structure: + +``` +{"secretId": "77451C57-0FCC-460F-91A3-E0DED05B440F", "secretIdVersion": "1"} +``` + +The secret key is used to contact the platform key handler service and grab the correct crypt4gh secret key. + +### Application properties + +> [!IMPORTANT] +> This parameter is mandatory + +> [!TIP] +> Be careful of trailing whitespace in properties files + +`--config_application` must be a path to a spring boot application properties file with the following structure: ``` ##################################################################################### # Application config ##################################################################################### +spring.main.web-application-type=none data.copy.buffer-size=8192 ##################################################################################### # Apache HttpClient connection config ##################################################################################### -webclient.connection.pipe-size=4096 +webclient.connection.pipe-size=${data.copy.buffer-size} webclient.connection.connection-timeout=5 webclient.connection.socket-timeout=0 webclient.connection.read-write-timeout=30000 @@ -61,17 +100,12 @@ file.download.retry.attempts.back-off-period=2000 ##################################################################################### # Globus config ##################################################################################### -globus.guest-collection.domain= +globus.guest-collection.domain=@globus.guest-collection.url@ #Oauth globus.aai.access-token.uri=https://auth.globus.org/v2/oauth2/token -globus.aai.client-id= -globus.aai.client-secret= -globus.aai.scopes= -##################################################################################### -#
Crypt4gh config -##################################################################################### -crypt4gh.binary-path=/opt/bin/crypt4gh -crypt4gh.shell-path=/bin/bash -c +globus.aai.client-id=@globus.aai.client-id@ +globus.aai.client-secret=@globus.aai.client-secret@ +globus.aai.scopes=https://auth.globus.org/scopes/c1e6310c-11d5-4e8a-9443-211884f04c6f/https ##################################################################################### # Logging config ##################################################################################### @@ -79,13 +113,30 @@ logging.level.uk.ac.ebi.intervene=INFO logging.level.org.springframework=WARN logging.level.org.apache.http=WARN logging.level.org.apache.http.wire=WARN +``` + +See the [file handler CLI](https://github.com/ebi-gdp/globus-file-handler-cli) README for a description of the configuration. + +### crypt4gh application properties + +> [!IMPORTANT] +> This parameter is optional (required only when `--decrypt` is set) + +`--config_crypt4gh` must be a path to a spring boot application properties file with the following structure: + +``` ##################################################################################### -# key handler service config +# Crypt4gh config ##################################################################################### -intervene.key-handler.basic-auth=Basic -intervene.key-handler.secret-key.password= -intervene.key-handler.base-url=https:///key-handler +crypt4gh.binary-path=/opt/bin/crypt4gh +crypt4gh.shell-path=/bin/bash -c +##################################################################################### +# Intervene service config +##################################################################################### +intervene.key-handler.base-url=http://localhost:8040/bff/key-handler intervene.key-handler.keys.uri=/key/{secretId}/version/{secretIdVersion} +intervene.key-handler.basic-auth=${KEY_HANDLER_BASIC_AUTH:basic-auth} +intervene.key-handler.secret-key.password=${SEC_KEY_PASSWD:test-password}
``` See the [file handler CLI](https://github.com/ebi-gdp/globus-file-handler-cli) README for a description of the configuration. @@ -103,29 +154,47 @@ which integrates with the key handler service. ## Example use cases +> [!TIP] +> `--debug` can be helpful to keep files containing sensitive data if you're having problems with a transfer (disabled by default) + +### Download files from a Globus collection over HTTPS + +``` +$ nextflow run main.nf -profile docker \ + --input input.json \ + --config_application application.properties \ + --outdir downloads +``` + ### Downloading files with crypt4gh decryption on the fly It makes sense to submit these jobs to [a grid executor](https://www.nextflow.io/docs/latest/executor.html), like SLURM or cloud batch, because decryption on the fly will use ~1 CPU for each file: ``` -$ nextflow run main.nf -profile \ +$ nextflow run main.nf -profile docker \ + --input input.json \ + --secret_key key.json \ + --config_application application.properties \ + --config_crypt4gh application-crypt4gh-secret-manager.properties \ --config_secrets assets/secret.properties \ - --input assets/example_input.json \ --outdir downloads \ - --secret_key key + --decrypt ``` -### Downloading files to an object store (bucket) +### Downloading files to an object store It's possible to use nextflow's support for object storage to transfer files from Globus directly to a bucket: ``` -$ nextflow run main.nf -profile \ +$ nextflow run main.nf -profile docker \ + -c cloud.config \ + --input input.json \ + --secret_key key.json \ + --config_application application.properties \ + --config_crypt4gh application-crypt4gh-secret-manager.properties \ --config_secrets assets/secret.properties \ - --input assets/example_input.json \ - --secret_key key \ - --outdir gs://test-bucket/downloads \ - -w gs://test-bucket/work + --outdir gs://pathtobucket/downloads \ + -w gs://pathworkbucket/work ``` For best performance use a cloud executor and enable fusion in the nextflow 
configuration: @@ -145,6 +214,7 @@ fusion { tower { accessToken = 'token' + workspaceId = 'work' enabled = true } @@ -156,3 +226,9 @@ google { } } ``` + +## Helm support + +`helm/` contains a [helm chart](https://helm.sh/docs/topics/charts/) which can install a [Job](https://kubernetes.io/docs/concepts/workloads/controllers/job/) to a Kubernetes cluster. + +In the helm chart worker processes run in Cloud Batch by default with crypt4gh decryption on the fly enabled. diff --git a/assets/example_input.json b/assets/example_input.json index 7098171..9868226 100644 --- a/assets/example_input.json +++ b/assets/example_input.json @@ -1,10 +1,6 @@ { "dir_path_on_guest_collection": "bwingfield@ebi.ac.uk/test_hapnest/", "files": [ - { - "filename": "hapnest.pvar", - "size": 278705850 - }, { "filename": "hapnest.pgen.crypt4gh", "size": 278825058 diff --git a/assets/key.json b/assets/key.json new file mode 100644 index 0000000..3f32ed3 --- /dev/null +++ b/assets/key.json @@ -0,0 +1,4 @@ +{ + "secretId": "8D705854-9EEA-44C5-9937-E4E5228B8457", + "secretIdVersion": "1" +} diff --git a/helm/.gitignore b/helm/.gitignore new file mode 100644 index 0000000..7f47975 --- /dev/null +++ b/helm/.gitignore @@ -0,0 +1 @@ +values.yaml diff --git a/helm/.helmignore b/helm/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/helm/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm/Chart.yaml b/helm/Chart.yaml new file mode 100644 index 0000000..a90d7d1 --- /dev/null +++ b/helm/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: globflow +description: A Helm chart for a globflow file transfer with crypt4gh decryption on the fly + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "2.0.0" diff --git a/helm/templates/config.yaml b/helm/templates/config.yaml new file mode 100644 index 0000000..b4f216a --- /dev/null +++ b/helm/templates/config.yaml @@ -0,0 +1,47 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Release.Name }}-transfer-config +data: + input.json: {{ toJson .Values.globflowInput | quote }} + key.json: {{ toJson .Values.keyHandlerSecret | quote }} + params.yml: | +{{- range $key, $value := .Values.globflowParams }} + {{ $key }}: {{ $value }} +{{- end }} + nxf.config: | + workDir = {{ .Values.nxfParams.workBucketPath | quote }} + + process { + executor = 'google-batch' + maxRetries = 1 + } + + google { + project = {{ .Values.nxfParams.gcpProject | quote }} + location = {{ .Values.nxfParams.location | quote }} + batch { + spot = {{ .Values.nxfParams.spot }} + } + } + + wave { + enabled = {{ .Values.nxfParams.wave }} + } + + fusion { + enabled = {{ .Values.nxfParams.fusion }} + } + + tower { + accessToken = {{ .Values.secrets.towerToken | quote }} + workspaceId = {{ .Values.secrets.towerId | quote }} + enabled = true + } + scm: | + providers { + ebi { + server = 'https://gitlab.ebi.ac.uk' + platform = 'gitlab' + } + } \ No newline at end of file diff --git a/helm/templates/job.yaml b/helm/templates/job.yaml new file mode 100644 index 0000000..6183194 --- /dev/null +++ b/helm/templates/job.yaml @@ -0,0 +1,51 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ .Release.Name }} +spec: + ttlSecondsAfterFinished: 3600 + backoffLimit: 0 + template: + metadata: + annotations: + cluster-autoscaler.kubernetes.io/safe-to-evict: "false" + spec: + serviceAccountName: nextflow + containers: + - name: globflow + image: {{ .Values.baseImage }}:{{ .Values.dockerTag }} + imagePullPolicy: {{ .Values.pullPolicy }} + command: ['sh', '-c', "nextflow run https://gitlab.ebi.ac.uk/gdp-public/globflow.git -params-file /opt/nxf/params.yml -c /opt/nxf/nxf.config --decrypt"] + env: + - name: NXF_SCM_FILE + value: /opt/nxf/scm + 
resources: + requests: + cpu: "1" + memory: 2G + ephemeral-storage: 10G + volumeMounts: + - name: transfer-config + mountPath: /opt/nxf + - name: globflow-secrets + mountPath: /opt/globflow/ + readOnly: true + volumes: + - name: transfer-config + configMap: + name: {{ .Release.Name }}-transfer-config + items: + - key: nxf.config + path: nxf.config + - key: scm + path: scm + - key: params.yml + path: params.yml + - key: input.json + path: input.json + - key: key.json + path: key.json + - name: globflow-secrets + secret: + secretName: {{ .Release.Name }}-transfer-secrets + restartPolicy: Never \ No newline at end of file diff --git a/helm/templates/secrets.yaml b/helm/templates/secrets.yaml new file mode 100644 index 0000000..46c735a --- /dev/null +++ b/helm/templates/secrets.yaml @@ -0,0 +1,59 @@ +apiVersion: v1 +kind: Secret +metadata: + name: {{ .Release.Name }}-transfer-secrets +stringData: + application.properties: | + ##################################################################################### + # Application config + ##################################################################################### + spring.main.web-application-type=none + data.copy.buffer-size=8192 + ##################################################################################### + # Apache HttpClient connection config + ##################################################################################### + webclient.connection.pipe-size=${data.copy.buffer-size} + webclient.connection.connection-timeout=5 + webclient.connection.socket-timeout=0 + webclient.connection.read-write-timeout=30000 + ##################################################################################### + # File download retry config + ##################################################################################### + # EXPONENTIAL/FIXED + file.download.retry.strategy=FIXED + file.download.retry.attempts.max=3 + # Exponential + file.download.retry.attempts.delay=1000 + 
file.download.retry.attempts.maxDelay=30000 + file.download.retry.attempts.multiplier=2 + # Fixed + file.download.retry.attempts.back-off-period=2000 + ##################################################################################### + # Globus config + ##################################################################################### + globus.guest-collection.domain={{ .Values.secrets.globusDomain }} + #Oauth + globus.aai.access-token.uri=https://auth.globus.org/v2/oauth2/token + globus.aai.client-id={{ .Values.secrets.globusClientId }} + globus.aai.client-secret={{ .Values.secrets.globusClientSecret }} + globus.aai.scopes=https://auth.globus.org/scopes/c1e6310c-11d5-4e8a-9443-211884f04c6f/https + ##################################################################################### + # Logging config + ##################################################################################### + logging.level.uk.ac.ebi.intervene=INFO + logging.level.org.springframework=WARN + logging.level.org.apache.http=WARN + logging.level.org.apache.http.wire=WARN + application-crypt4gh-secret-manager.properties: | + ##################################################################################### + # Crypt4gh config + ##################################################################################### + crypt4gh.binary-path=/opt/bin/crypt4gh + crypt4gh.shell-path=/bin/bash -c + ##################################################################################### + # Intervene service config + ##################################################################################### + intervene.key-handler.basic-auth=Basic {{ .Values.secrets.keyHandlerToken }} + intervene.key-handler.secret-key.password={{ .Values.secrets.keyHandlerPassword }} + intervene.key-handler.base-url={{ .Values.secrets.keyHandlerURL }} + intervene.key-handler.keys.uri=/key/{secretId}/version/{secretIdVersion} diff --git a/helm/templates/serviceaccount.yaml b/helm/templates/serviceaccount.yaml new file 
mode 100644 index 0000000..b8a5f9d --- /dev/null +++ b/helm/templates/serviceaccount.yaml @@ -0,0 +1,8 @@ +{{- $serviceAccountExists := lookup "v1" "ServiceAccount" .Release.Namespace .Values.serviceAccount.name -}} +{{- if not $serviceAccountExists -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ .Values.serviceAccount.name }} + namespace: {{ .Release.Namespace }} +{{- end }} \ No newline at end of file diff --git a/helm/values-example.yaml b/helm/values-example.yaml new file mode 100644 index 0000000..d92e215 --- /dev/null +++ b/helm/values-example.yaml @@ -0,0 +1,53 @@ + + +baseImage: "docker.io/nextflow/nextflow" +dockerTag: "24.10.0" +pullPolicy: "IfNotPresent" + +serviceAccount: + create: true + name: nextflow + +# change for each run +keyHandlerSecret: + secretId: "77451C57-0FCC-460F-91A3-E0DED05B440F" + secretIdVersion: "1" + +# change for each run +globflowInput: + dir_path_on_guest_collection: test@example.com/test-collection + files: + - filename: hapnest.psam.c4gh + size: 8669 + - filename: hapnest.pgen.c4gh + size: 278825058 + - filename: hapnest.pvar.c4gh + size: 215004174 + +globflowParams: + outdir: "gs://testbucket/data" # update this to point to a new bucket + # mostly static values below + input: /opt/nxf/input.json + secret_key: /opt/nxf/key.json + config_application: /opt/globflow/application.properties + config_crypt4gh: /opt/globflow/application-crypt4gh-secret-manager.properties + +# change for each deployment +secrets: + globusDomain: globus-domain + globusClientId: secret-client-id + globusClientSecret: client-secret + towerToken: tower-token + towerId: "tower-id" + keyHandlerToken: key-handler-token + keyHandlerPassword: key-handler-password + keyHandlerURL: key-handler-url + +# change for each run +nxfParams: + workBucketPath: "gs://testbucket/work" + gcpProject: "gcp-project" + location: "gcp-location" + spot: true + wave: true + fusion: true diff --git a/main.nf b/main.nf index 0036355..9839a3e 100644 --- a/main.nf +++ 
b/main.nf @@ -2,32 +2,78 @@ import groovy.json.JsonSlurper -if (!params.config_secrets) { - error "Error: missing mandatory parameter --config_secrets" + +if (!params.config_application) { + error "Error: missing mandatory parameter --config_application" } if (!params.input) { error "Error: missing mandatory parameter --input" } -if (!params.secret_key) { - error "Error: missing --secret_key" +if (params.decrypt) { + if (!params.config_crypt4gh) { + error "Error: missing mandatory parameter --config_crypt4gh" + } + if (!params.secret_key) { + error "Error: missing --secret_key" + } +} else { + if (params.config_crypt4gh || params.secret_key) { + log.info "INFO: Ignoring --config_crypt4gh or --secret_key when --decrypt is not set" + } +} + +if (params.debug) { + log.info "INFO: Debug mode enabled (not cleaning up sensitive intermediate files)" +} else { + log.info "INFO: Debug mode disabled (being careful to clean up sensitive intermediate files)" } -process download_decrypt { + +process download { + stageInMode "${ params.debug ? 
'copy' : 'symlink' }" + tag "${in_map.filename}" + // drops the "output" directory from path when publishing + publishDir "$params.outdir", mode: "move", saveAs: { "${file(it).getName()}" } errorStrategy { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' } - maxRetries 3 + beforeScript 'mkdir output' + + input: + val in_map + path application_properties, stageAs: "application.properties" + + output: + path "output/*" + + script: + """ + java -jar /opt/globus-file-handler-cli-1.0.5.jar \ + --globus_file_transfer_source_path "globus:///${in_map.dir_path_on_guest_collection}/${in_map.filename}" \ + --globus_file_transfer_destination_path "file:///\$PWD/output/${file(in_map.filename).baseName}" \ + --file_size ${in_map.size} + + if [ "$params.debug" = false ] ; then + # this will keep nextflow log files but delete sensitive inputs + echo "Cleaning all files except output directory" + rm *.properties + fi + """ +} + +process download_decrypt_key_handler { + stageInMode "${ params.debug ? 'copy' : 'symlink' }" tag "${in_map.filename}" // drops the "output" directory from path when publishing publishDir "$params.outdir", mode: "move", saveAs: { "${file(it).getName()}" } - container "${ workflow.containerEngine == 'singularity' ? 
- "oras://ghcr.io/ebi-gdp/globus-file-handler-cli:1.0.4-singularity" : - "ghcr.io/ebi-gdp/globus-file-handler-cli:1.0.4" }" + errorStrategy { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' } + beforeScript 'mkdir output' input: val in_map - path secret_config, stageAs: "secret.properties" + path application_properties, stageAs: "application.properties" + path crypt4gh_properties, stageAs: "application-crypt4gh-secret-manager.properties" path secret_key, stageAs: "secret-config.json" output: @@ -35,29 +81,36 @@ process download_decrypt { script: """ - mkdir output - - java -jar /opt/globus-file-handler-cli-1.0.4.jar \ - --spring.config.location=./secret.properties \ + java -jar /opt/globus-file-handler-cli-1.0.5.jar \ + --spring.profiles.active=crypt4gh-secret-manager \ --globus_file_transfer_source_path "globus:///${in_map.dir_path_on_guest_collection}/${in_map.filename}" \ --globus_file_transfer_destination_path "file:///\$PWD/output/${file(in_map.filename).baseName}" \ --file_size ${in_map.size} \ --crypt4gh \ --sk "file:///\$PWD/secret-config.json" - rm -f ./* 2>/dev/null || true # delete everything except output directory + if [ "$params.debug" = false ] ; then + # this will keep nextflow log files but delete sensitive inputs + echo "Cleaning all files except output directory" + rm *.sec *.properties *.json + fi """ } workflow { // using first() to create reusable value channels - Channel.fromPath(params.secret_key, checkIfExists: true).first().set { secret_key } - Channel.fromPath(params.config_secrets, checkIfExists: true).first().set { secrets_config_path } + Channel.fromPath(params.config_application, checkIfExists: true).first().set { application_properties } // this channel is a list of hashmaps, one for each file to be downloaded Channel.fromPath(params.input, checkIfExists: true).map { parseInput(it) }.flatten().set { ch_input } - download_decrypt(ch_input, secrets_config_path, secret_key) + if (params.decrypt) { + 
Channel.fromPath(params.secret_key, checkIfExists: true).first().set { secret_key } + Channel.fromPath(params.config_crypt4gh, checkIfExists: true).first().set { crypt4gh_properties } + download_decrypt_key_handler(ch_input, application_properties, crypt4gh_properties, secret_key) + } else { + download(ch_input, application_properties) + } } diff --git a/nextflow.config b/nextflow.config index 2080433..b0a8239 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,37 +1,31 @@ +nextflow.enable.strict = true + params { - // mandatory params - config_secrets = null input = null secret_key = null - - // optional params + config_application = null + config_crypt4gh = null outdir = "results" + debug = false // keep files around for debugging + decrypt = false // try to decrypt on the fly? } profiles { docker { docker.enabled = true - singularity.enabled = false } arm { docker.runOptions = '--platform=linux/arm64' } - singularity { - singularity.enabled = true - docker.enabled = false - } } process { - maxRetries = 3 - maxErrors = '-1' cpus = 1 memory = 2.GB time = 6.h + container = "ghcr.io/ebi-gdp/globus-file-handler-cli:1.0.5" } -nextflow.enable.strict = true - manifest { name = 'ebi-gdp/globflow' author = 'Benjamin Wingfield' @@ -40,5 +34,5 @@ manifest { description = 'Download files from Globus over HTTPS, with decryption on the fly' mainScript = 'main.nf' nextflowVersion = '>=23.10.1' - version = '1.0.0' + version = '2.0.0' }