diff --git a/.github/workflows/build-test.yaml b/.github/workflows/e2e.yaml similarity index 76% rename from .github/workflows/build-test.yaml rename to .github/workflows/e2e.yaml index 7207ca14b..85c45f8e4 100644 --- a/.github/workflows/build-test.yaml +++ b/.github/workflows/e2e.yaml @@ -54,11 +54,18 @@ jobs: registry1Password: ${{secrets.IRON_BANK_ROBOT_PASSWORD}} ghToken: ${{ secrets.GITHUB_TOKEN }} - - name: Create /opt/uds LVM + - name: Test Create Default LVM run: | - sudo mkdir -p /opt/uds - sudo chown -Rv 65534:65534 /opt/uds + sudo uds run create:logical-volume - - name: Test the UDS RKE2 + Custom Zarf Init Bootstrap (`local-path`) + - name: Test Create Default TLS Cert run: | - sudo uds run uds-rke2-local-path-test --no-progress --log-level warn -a amd64 + sudo uds run create-tls-local-path-dev + + - name: Test Deploy UDS RKE2 + run: | + sudo uds run test:uds-rke2 --set VERSION=dev --log-level warn + + - name: Test Deploy `local-path` Flavor Custom Zarf Init + run: | + sudo uds run test:local-path-minio-init --set VERSION=dev --log-level warn diff --git a/.github/workflows/tag-and-release.yaml b/.github/workflows/tag-and-release.yaml index 1973009fc..1ed7ba244 100644 --- a/.github/workflows/tag-and-release.yaml +++ b/.github/workflows/tag-and-release.yaml @@ -69,8 +69,8 @@ jobs: echo "Publishing for tag: ${{ github.ref }}" if [[ "${{ github.ref }}" == "refs/tags/dev" ]]; then - sudo uds run release-dev --set VERSION=dev --no-progress --no-log-file --log-level debug + sudo uds run release else - sudo uds run release --no-progress --no-log-file --log-level debug + sudo uds run release-dev fi shell: bash diff --git a/.github/workflows/zarf-lint.yaml b/.github/workflows/zarf-lint.yaml index caeb8838c..506c18464 100644 --- a/.github/workflows/zarf-lint.yaml +++ b/.github/workflows/zarf-lint.yaml @@ -51,3 +51,5 @@ jobs: check-jsonschema packages/minio/zarf.yaml --schemafile zarf.schema.json check-jsonschema packages/local-path/zarf.yaml --schemafile zarf.schema.json check-jsonschema packages/rook-ceph/zarf.yaml --schemafile zarf.schema.json + check-jsonschema packages/leapfrogai/zarf.yaml --schemafile zarf.schema.json + check-jsonschema packages/nvidia-gpu-operator/zarf.yaml --schemafile zarf.schema.json diff --git a/.gitignore b/.gitignore index 82e7fd90f..f94a3b678 100644 --- a/.gitignore +++ b/.gitignore @@ -11,8 +11,10 @@ tmp/ uds-bundle-*.tar.zst # Secrets -**/*.cert -**/*.key +tls/ +cert*/ +**/*.cert* +**/*.key* # Builds build/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 13193211c..fe6923d39 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -127,6 +127,16 @@ repos: files: "deploy.yaml" types: [yaml] args: ["--schemafile", "tasks.schema.json"] + - id: check-jsonschema + name: "Validate Setup Tasks Against Schema" + files: "setup.yaml" + types: [yaml] + args: ["--schemafile", "tasks.schema.json"] + - id: check-jsonschema + name: "Validate Test Tasks Against Schema" + files: "test.yaml" + types: [yaml] + args: ["--schemafile", "tasks.schema.json"] - repo: local hooks: - id: delete-schema diff --git a/README.md b/README.md index 74f1a43da..710c18412 100644 --- a/README.md +++ b/README.md @@ -9,56 +9,43 @@ This Zarf package serves as an air-gapped production environment for deploying [ See the [UDS RKE2 Mermaid diagram](docs/DIAGRAM.md) for visual representations of the tech stack's components and order of operations. +## Table of Contents + +1. [Pre-Requisites](#pre-requisites) +2. 
[Usage](#usage) + - [Virtual Machines](#virtual-machines) + - [Bundles](#bundles) + - [Quick Start](#quick-start) + - [Latest](#latest) + - [Development](#development) +3. [Additional Info](#additional-info) + ## Pre-Requisites -### Deployment Target +The following are requirements for an environment where a user is deploying UDS RKE2 and its custom components and applications. -- A base installation of [Ubuntu Server 20.04+](https://ubuntu.com/download/server) on the node's host system +- A base installation of [Ubuntu 20.04 or 22.04](https://ubuntu.com/download/server) on the node's host system (server or desktop) - [UDS CLI](https://github.com/defenseunicorns/uds-cli/blob/main/README.md#install) using the versions specified in the [UDS Common repository](https://github.com/defenseunicorns/uds-common/blob/main/README.md#supported-tool-versions) - See the RKE2 documentation for host system [pre-requisites](https://docs.rke2.io/install/requirements) -- See the Rook-Ceph documentation for the host system [pre-requisites](https://rook.io/docs/rook/latest-release/Getting-Started/Prerequisites/prerequisites/) based on the node's role and the cluster's configurations - -### UDS CLI Aliasing - -Below are instructions for adding UDS CLI aliases that are useful for deployments that occur in an air-gap with only the UDS CLI binary available to the delivery engineer. +- See the [Application-Specific](#application-specific) and [Flavor-Specific Infrastructure](#flavor-specific-infrastructure) configuration sections for instruction on setup based on what is deployed atop UDS RKE2 -For general CLI UX, put the following in your shell configuration (e.g., `/root/.bashrc`): - -```bash -alias k="uds zarf tools kubectl" -alias kubectl="uds zarf tools kubectl" -alias zarf='uds zarf' -alias k9s='uds zarf tools monitor' -alias udsclean="uds zarf tools clear-cache && rm -rf ~/.uds-cache && rm -rf ~/.zarf-cache && rm -rf /tmp/uds* && rm -rf /tmp/zarf-*" -``` - -For fulfilling `xargs` and `kubectl` binary requirements necessary for running some of the _optional_ deployment helper scripts: - -```bash -touch /usr/local/bin/kubectl -echo '#!/bin/bash\nuds zarf tools kubectl "$@"' > /usr/local/bin/kubectl -chmod +x /usr/local/bin/kubectl -``` - -### Local Development +## Usage -- All pre-requisites listed in [Deployment Target](#deployment-target) -- [Docker](https://docs.docker.com/get-docker/) or [Podman](https://podman.io/getting-started/installation) for running, building, and pulling images +> [!IMPORTANT] +> This entire repository assumes that you have root access, and all scripts and actions are run as root. Use `sudo su` to activate a root shell. -## Usage +This section provides minimal context and instructions for quickly deploying the base UDS RKE2 capability. See the [DEVELOPMENT.md](docs/DEVELOPMENT.md) for instructions on how to further develop UDS RKE2. ### Virtual Machines > [!CAUTION] -> Due to the the disk formatting operations, networking and STIG configurations that are applied to a node's host, it is highly recommended that the contents of this repository are not directly installed on a personal machine. +> Due to the the disk formatting and mount operations, networking and STIG configurations that are applied to a node's host, it is highly recommended that the contents of this repository are not directly installed on a personal machine. The best way to test UDS RKE2 is to spin-up one or more nodes using a containerized method, such as virtual machines or networks. 
[LeapfrogAI](https://github.com/defenseunicorns/leapfrogai), the main support target of this bundle, requires GPU passthrough to all worker nodes that will have a taint for attracting pods with GPU resource and workload requirements. -Please see the [VM setup documentation](./docs/VM.md) and VM setup scripts to learn more about manually creating development VM. - -VM setup may not be necessary if using Longhorn or Local Path Provisioner, but it is highly recommended when using Rook-Ceph. +Please see the [VM setup documentation](./docs/VM.md) and VM setup scripts to learn more about manually creating development VM.. ### Bundles @@ -68,125 +55,88 @@ There are 3 main "flavors" of the UDS RKE2 Core bundle, with 4 distinct flavors 2. (WIP) [Longhorn](./docs/LONGHORN.md) + [MinIO](./docs/MINIO.md) 3. (WIP) [Rook-Ceph](./docs/ROOK-CEPH.md) -Each bundle can also be experimented with using the Zarf package creation and deployment commands via the UDS tasks outlined in the sections below. - -### Packages +### Quick Start -See the [Configuration section](#configuration) for more details on each specific package in each of the bundle flavors. +The following are quick starts for the `local-path` flavored UDS RKE2 bundle. This does not include the optional NVIDIA GPU operator and LeapfrogAI workarounds Zarf packages. -### UDS Tasks +#### Latest -This repository uses [UDS CLI](https://github.com/defenseunicorns/uds-cli)'s built-in [task runner](https://github.com/defenseunicorns/maru-runner) to perform all actions required to run, develop, and publish the UDS RKE2 tech stack. - -Run the following to see all the tasks in the main [`tasks.yaml`](./tasks.yaml), and their descriptions: +1. Change directory to the bundle and deploy the bundle: ```bash -uds run --list-all -``` - -#### Create - -See the UDS [`create` tasks](./tasks/create.yaml) file for more details. - -To create all packages and bundles, do the following: - -```bash -# Login to Registry1 -set +o history -export REGISTRY1_USERNAME="YOUR-USERNAME-HERE" -export REGISTRY1_PASSWORD="YOUR-PASSWORD-HERE" -echo $REGISTRY1_PASSWORD | zarf tools registry login registry1.dso.mil --username $REGISTRY1_USERNAME --password-stdin -set -o history - -# Login to ghcr -set +o history -export GHCR_USERNAME="YOUR-USERNAME-HERE" -export GHCR_PASSWORD="YOUR-PASSWORD-HERE" -echo $GHCR_PASSWORD | zarf tools registry login ghcr.io --username $GHCR_USERNAME --password-stdin -set -o history - -uds run create:all +# use `ifconfig` to identify the NETWORK_INTERFACE for L2 advertisement +uds run uds-rke2-local-path-core --set NETWORK_INTERFACE=eth0 ``` -#### Deploy - -> [!NOTE] -> The pre-deployment setup of the host machine is storage solution-dependent, so be sure to check the documentation for the package flavor you are deploying: [`local-path`](./docs/LOCAL-PATH.md), [`longhorn`](./docs/LONGHORN.md), or [`rook-ceph`](./docs/ROOK-CEPH.md). - -See the UDS [`deploy` tasks](./tasks/deploy.yaml) file for more details. - -To deploy a bundle (e.g., UDS RKE2 bootstrap with `local-path` flavor), do the following: +2. Modify your `/etc/hosts` according to your base IP on the Istio Tenant gateway ```bash -# LATEST -uds run uds-rke2-local-path-core +# /etc/hosts -# DEV -uds run uds-rke2-local-path-core-dev +192.168.0.200 keycloak.admin.uds.dev grafana.admin.uds.dev neuvector.admin.uds.dev +192.168.0.201 sso.uds.dev ``` -#### Publish - -See the UDS [`publish` tasks](./tasks/publish.yaml) file for more details. Also see the `release` task in the main [`tasks.yaml`](./tasks.yaml). 
+#### Development -To publish all packages and bundles, do the following: +1. Login to GitHub Container Registry (GHCR) and [DoD's Registry1](https://registry1.dso.mil/): ```bash # Login to GHCR set +o history export GHCR_USERNAME="YOUR-USERNAME-HERE" export GHCR_PASSWORD="YOUR-PASSWORD-HERE" -echo $GHCR_PASSWORD | zarf tools registry login ghcr.io --username $GHCR_USERNAME --password-stdin +echo $GHCR_PASSWORD | uds zarf tools registry login ghcr.io --username $GHCR_USERNAME --password-stdin set -o history -# if create:all was already run -uds run publish:all - -# if create:all was not already run -uds run release +# Login to Registry1 +set +o history +export REGISTRY1_USERNAME="YOUR-USERNAME-HERE" +export REGISTRY1_PASSWORD="YOUR-PASSWORD-HERE" +echo $REGISTRY1_PASSWORD | uds zarf tools registry login registry1.dso.mil --username $REGISTRY1_USERNAME --password-stdin +set -o history ``` -#### Remove - -Run the following to remove all Docker, Zarf and UDS artifacts from the host: +2. Build all necessary packages and then create and deploy the bundle ```bash -uds run setup:clean +# use `ifconfig` to identify the NETWORK_INTERFACE for L2 advertisement +uds run uds-rke2-local-path-core-dev --set NETWORK_INTERFACE=eth0 ``` -Run the following to completely destroy the UDS RKE2 node and all of UDS RKE2's artifacts from the node's host: +3. Modify your `/etc/hosts` according to your base IP on the Istio Tenant gateway ```bash -uds run setup:uds-rke2-destroy -``` - -#### Test - -Run the following to run the E2E CI test(s): +# /etc/hosts -```bash -uds run uds-rke2-local-path-test +192.168.0.200 keycloak.admin.uds.local grafana.admin.uds.local neuvector.admin.uds.local +192.168.0.201 sso.uds.local ``` ## Additional Info -Below are resources to explain some of the rationale and inner workings of the RKE2 cluster's infrastructure. +The following sub-sections outlines all of the configuration documentation, which includes additional information, optional Zarf packages, and customization options for each component of UDS RKE2. 
-### Configuration +### Base Infrastructure -- [Operating System Configuration](docs/OS.md) -- [RKE2-Specific Configuration](docs/RKE2.md) +- [Operating System](docs/OS.md) +- [RKE2-Specific](docs/RKE2.md) - [UDS-RKE2 Infrastructure and Exemptions](docs/UDS-RKE2.md) -- [MinIO Configuration](docs/MINIO.md) -- [Rook-Ceph Configuration](docs/ROOK-CEPH.md) -- [Longhorn Configuration](docs/LONGHORN.md) +- [Hosts, DNS and TLS Configuration](docs/DNS-TLS.md) + +### Flavor-Specific Infrastructure + +- [Rook-Ceph](docs/ROOK-CEPH.md) +- [Longhorn](docs/LONGHORN.md) - [Local Path Provisioner](docs/LOCAL-PATH.md) - [Custom Zarf Init](docs/INIT.md) +- [MinIO](docs/MINIO.md) ### Application-Specific - [UDS Core](UDS-CORE.md) -- [LeapfrogAI](docs/LEAPFROGAI.md) +- [LeapfrogAI Workarounds](docs/LEAPFROGAI.md) +- [NVIDIA GPU Operator](docs/NVIDIA-GPU-OPERATOR.md) ### Virtual Machine Setup and Testing @@ -204,3 +154,4 @@ Below are resources to explain some of the rationale and inner workings of the R - [RKE2 Zarf Init](https://github.com/defenseunicorns/zarf-package-rke2-init) - [Zarf Longhorn Init](https://github.com/defenseunicorns/zarf-init-longhorn) - [UDS Rook-Ceph Capability](https://github.com/defenseunicorns/uds-capability-rook-ceph) +- [UDS Nutanix SWF Bundle](https://github.com/defenseunicorns/uds-bundle-software-factory-nutanix/tree/main) diff --git a/bundles/dev/ca.conf b/bundles/dev/ca.conf new file mode 100644 index 000000000..e1d79c063 --- /dev/null +++ b/bundles/dev/ca.conf @@ -0,0 +1,17 @@ +[req] +distinguished_name = req_distinguished_name +x509_extensions = v3_ca +prompt = no +default_bits = 4096 +default_md = sha256 + +[req_distinguished_name] +CN = UDS RKE2 Root CA +O = Defense Unicorns +OU = UDS RKE2 Product Team + +[v3_ca] +subjectKeyIdentifier = hash +authorityKeyIdentifier = keyid:always,issuer +basicConstraints = critical, CA:true, pathlen:0 +keyUsage = critical, digitalSignature, cRLSign, keyCertSign diff --git a/bundles/dev/local-path-core/uds-bundle.yaml b/bundles/dev/local-path-core/uds-bundle.yaml index d91d33e36..edba0dcca 100644 --- a/bundles/dev/local-path-core/uds-bundle.yaml +++ b/bundles/dev/local-path-core/uds-bundle.yaml @@ -5,83 +5,90 @@ kind: UDSBundle metadata: name: uds-rke2-local-path-core description: "A UDS bundle for bootstrapping an RKE2 cluster and installing UDS Core services" - # x-release-please-start-version version: "dev" - # x-release-please-end architecture: amd64 packages: - name: uds-rke2 description: "Deploys the UDS RKE2 Zarf package" - repository: ghcr.io/justinthelaw/packages/uds/uds-rke2 - # x-release-please-start-version + path: ../../../build/packages/ ref: "dev" - # x-release-please-end - - name: local-path-init + - name: init description: "Deploys the custom Zarf Init package" - repository: ghcr.io/justinthelaw/packages/uds/uds-rke2/init + path: ../../../build/packages/local-path/ # TODO: renovate setup - ref: "v0.33.0-local-path" + ref: "v0.33.0" + overrides: + minio: + minio: + variables: + - name: BUCKET_PERSISTENCE_SIZE + description: "Size of the default PVC for MinIO (e.g., 30Gi)" + path: persistence.size - - name: uds-rke2-infrastructure + - name: infrastructure description: "Deploys the UDS RKE2 base infrastructure Zarf package" - repository: ghcr.io/justinthelaw/packages/uds/uds-rke2/infrastructure - # x-release-please-start-version + path: ../../../build/packages/ ref: "dev" - # x-release-please-end + overrides: + infrastructure: + uds-rke2-infrastructure: + variables: + - name: INTERFACE + path: interface - - name: uds-core + - 
name: core description: "Deploys UDS Core services" - repository: ghcr.io/defenseunicorns/packages/uds/core + path: ../../../build/packages/ # TODO: renovate setup - ref: "0.22.0-registry1" + ref: "0.23.0" overrides: loki: loki: values: - # Override default DNS service name for Loki Gateway - - path: "global.dnsService" - value: "rke2-coredns-rke2-coredns" - # If S3 API is external to the cluster - - path: loki.storage.s3.endpoint - value: "" - - path: loki.storage.s3.secretAccessKey - value: "" - - path: loki.storage.s3.accessKeyId - value: "" + # Set DNS service name for Loki Gateway + - path: global.dnsService + value: rke2-coredns-rke2-coredns + # Ensure we don't hit query limits + - path: loki.limits_config.split_queries_by_interval + value: "30m" + - path: loki.query_scheduler.max_outstanding_requests_per_tenant + value: 32000 # This is the new default in Loki 3.0 variables: + - name: LOKI_S3_ENDPOINT + description: "The S3 endpoint" + path: loki.storage.s3.endpoint - name: LOKI_CHUNKS_BUCKET description: "The object storage bucket for Loki chunks" path: loki.storage.bucketNames.chunks - default: "uds" - name: LOKI_RULER_BUCKET description: "The object storage bucket for Loki ruler" path: loki.storage.bucketNames.ruler - default: "uds" - name: LOKI_ADMIN_BUCKET description: "The object storage bucket for Loki admin" path: loki.storage.bucketNames.admin - default: "uds" - name: LOKI_S3_REGION description: "The S3 region" path: loki.storage.s3.region - default: "us-east-1" + - name: LOKI_S3_ACCESS_KEY_ID + description: "The S3 Access Key ID" + path: loki.storage.s3.accessKeyId + - name: LOKI_S3_SECRET_ACCESS_KEY + path: loki.storage.s3.secretAccessKey + description: "The S3 Secret Access Key" - name: LOKI_IRSA_ROLE_ARN description: "The irsa role annotation" path: serviceAccount.annotations.irsa/role-arn - name: LOKI_WRITE_REPLICAS path: write.replicas description: "Loki write replicas" - default: "1" - name: LOKI_READ_REPLICAS path: read.replicas description: "Loki read replicas" - default: "1" - name: LOKI_BACKEND_REPLICAS path: backend.replicas description: "Loki backend replicas" - default: "1" velero: velero: @@ -89,35 +96,61 @@ packages: - name: VELERO_USE_SECRET description: "Toggle use secret off to use IRSA." 
path: credentials.useSecret - default: "false" - name: VELERO_IRSA_ROLE_ARN description: "IRSA role arn for annotation" path: serviceAccount.server.annotations.irsa/role-arn - default: "" + keycloak: + keycloak: + values: + # Turn off verifications for dev or prototype deployments + - path: realmInitEnv + value: + EMAIL_VERIFICATION_ENABLED: false + OTP_ENABLED: false + TERMS_AND_CONDITIONS_ENABLED: false + X509_OCSP_FAIL_OPEN: false + variables: + # TODO: this can only be `false` if a Postgres is integrated and accessible in the cluster + - name: KEYCLOAK_DEV_MODE + path: devMode + default: true + - name: KEYCLOAK_FIPS_MODE + path: fips + default: true + # Enable headless admin account generation, SHOULD BE CHANGED IMMEDIATELY AFTER + # Can be found in `keycloak-admin-password` secret in-cluster + - name: KEYCLOAK_ENABLE_INSECURE_ADMIN_PASSWORD + path: insecureAdminPasswordGeneration.enabled + default: true + - name: KEYCLOAK_ADMIN_USERNAME + path: insecureAdminPasswordGeneration.username + default: admin + + # Access to UDS cluster administration services (e.g., *admin.uds.dev, grafana.admin.uds.dev) + # Access to UDS application services (e.g., *.uds.dev, ai.uds.dev) istio-admin-gateway: uds-istio-config: - # Access to UDS cluster administration services (e.g., *admin.uds.dev, grafana.admin.uds.dev) variables: - name: ADMIN_TLS_CERT description: "The TLS cert for the admin gateway (must be base64 encoded)" - path: ../tls.cert + path: tls.cert - name: ADMIN_TLS_KEY description: "The TLS key for the admin gateway (must be base64 encoded)" - path: ../tls.key - + path: tls.key istio-tenant-gateway: - # Access to UDS application services (e.g., *.uds.dev, ai.uds.dev) uds-istio-config: variables: - name: TENANT_TLS_CERT description: "The TLS cert for the tenant gateway (must be base64 encoded)" - path: ../tls.cert + path: tls.cert - name: TENANT_TLS_KEY description: "The TLS key for the tenant gateway (must be base64 encoded)" - path: ../tls.key + path: tls.key - - name: uds-rke2-exemptions + - name: exemptions description: "Deploys the UDS Core Pepr policy exemptions" - repository: ghcr.io/justinthelaw/packages/uds/uds-rke2/exemptions - ref: "dev-local-path" + path: ../../../build/packages/local-path/ + ref: "dev" + optionalComponents: + - nvidia-gpu-operator-exemptions diff --git a/bundles/dev/local-path-core/uds-config.yaml b/bundles/dev/local-path-core/uds-config.yaml index ce0c0b40e..6213e9370 100644 --- a/bundles/dev/local-path-core/uds-config.yaml +++ b/bundles/dev/local-path-core/uds-config.yaml @@ -1,26 +1,55 @@ options: - log_level: warn architecture: amd64 shared: - domain: uds.dev + domain: uds.local variables: uds-rke2: join_token: $JOIN_TOKEN + + infrastructure: + network_interface: $NETWORK_INTERFACE + local-path-init: # Workaround for SELinux EBS issue - https://github.com/bottlerocket-os/bottlerocket/issues/2417 registry_hpa_enable: false + registry_pvc_enabled: true registry_pvc_size: 100Gi - registry_pvc_access_mode: ReadWriteMany - node_path_map_values_file: values/node-path-map-values.yaml - uds-core: - loki_chunks_bucket: $LOKI_S3_BUCKET - loki_ruler_bucket: $LOKI_S3_BUCKET - loki_admin_bucket: $LOKI_S3_BUCKET - loki_s3_region: $LOKI_S3_AWS_REGION - loki_irsa_role_arn: $LOKI_S3_ROLE_ARN - velero_use_secret: $VELERO_USE_SECRET - velero_irsa_role_arn: $VELERO_S3_ROLE_ARN - velero_bucket: $VELERO_S3_BUCKET - velero_bucket_region: $VELERO_S3_AWS_REGION + # Set this to ReadWriteMany for multi-node, HA situations + # See documentation on the StorageClass Zarf Init flavors for 
more details + registry_pvc_access_mode: ReadWriteOnce + storage_configuration_values_file: values/storage-configuration-values.yaml + + # MinIO bucket persistence volume size + bucket_persistence_size: $BUCKET_PERSISTENCE_SIZE + + core: + loki_s3_endpoint: http://minio.minio.svc.cluster.local:9000 + loki_chunks_bucket: uds + loki_ruler_bucket: uds + loki_admin_bucket: uds + loki_s3_region: us-east-1 + loki_s3_access_key_id: uds + loki_s3_secret_access_key: uds-secret + loki_irsa_role_arn: "" + loki_write_replicas: 1 + loki_read_replicas: 1 + loki_backend_replicas: 1 + + velero_use_secret: false + velero_irsa_role_arn: "" + + # Zarf variables not available via UDS bundle values path overrides + velero_bucket: uds + velero_bucket_region: us-east-1 + velero_bucket_provider_url: http://minio.minio.svc.cluster.local:9000 + velero_bucket_credential_name: uds + velero_bucket_credential_key: uds-secret + + # TLS CERT and KEY values must be base64 encoded + # Paste the entire TLS CERT or KEY value into the fields below + admin_tls_cert: BASE_64_ENCODED_TLS_CERT + admin_tls_key: BASE_64_ENCODED_TLS_KEY + tenant_tls_cert: BASE_64_ENCODED_TLS_CERT + tenant_tls_key: BASE_64_ENCODED_TLS_KEY diff --git a/bundles/dev/tls.conf b/bundles/dev/tls.conf new file mode 100644 index 000000000..196cbcd53 --- /dev/null +++ b/bundles/dev/tls.conf @@ -0,0 +1,21 @@ +[req] +distinguished_name = req_distinguished_name +x509_extensions = v3_req +prompt = no +default_bits = 4096 +default_md = sha256 + +[req_distinguished_name] +CN = *.uds.local +O = Defense Unicorns +OU = UDS RKE2 Product Team + +[v3_req] +keyUsage = critical, digitalSignature, keyEncipherment +extendedKeyUsage = serverAuth, clientAuth +subjectAltName = @alt_names +basicConstraints = CA:FALSE + +[alt_names] +DNS.1 = *.uds.local +DNS.2 = *.admin.uds.local diff --git a/bundles/latest/local-path-core/uds-bundle.yaml b/bundles/latest/local-path-core/uds-bundle.yaml index fae3e4484..09a8ee403 100644 --- a/bundles/latest/local-path-core/uds-bundle.yaml +++ b/bundles/latest/local-path-core/uds-bundle.yaml @@ -23,6 +23,14 @@ packages: repository: ghcr.io/justinthelaw/packages/uds/uds-rke2/init # TODO: renovate setup ref: "v0.33.0-local-path" + overrides: + minio: + minio: + variables: + - name: BUCKET_PERSISTENCE_SIZE + description: "Size of the default PVC for MinIO (e.g., 30Gi)" + path: persistence.size + default: "30Gi" - name: uds-rke2-infrastructure description: "Deploys the UDS RKE2 base infrastructure Zarf package" @@ -30,58 +38,64 @@ packages: # x-release-please-start-version ref: "0.4.2" # x-release-please-end + overrides: + infrastructure: + uds-rke2-infrastructure: + variables: + - name: INTERFACE + path: interface - name: uds-core description: "Deploys UDS Core services" repository: ghcr.io/defenseunicorns/packages/uds/core # TODO: renovate setup - ref: "0.22.0-registry1" + ref: "0.23.0-registry1" overrides: loki: loki: values: - # Override default DNS service name for Loki Gateway + # Set DNS service name for Loki Gateway - path: "global.dnsService" value: "rke2-coredns-rke2-coredns" - # If S3 API is external to the cluster - - path: loki.storage.s3.endpoint - value: "" - - path: loki.storage.s3.secretAccessKey - value: "" - - path: loki.storage.s3.accessKeyId - value: "" + # Ensure we don't hit query limits + - path: loki.limits_config.split_queries_by_interval + value: "30m" + - path: loki.query_scheduler.max_outstanding_requests_per_tenant + value: 32000 # This is the new default in Loki 3.0 variables: + - name: LOKI_S3_ENDPOINT + description: 
"The S3 endpoint" + path: loki.storage.s3.endpoint - name: LOKI_CHUNKS_BUCKET description: "The object storage bucket for Loki chunks" path: loki.storage.bucketNames.chunks - default: "uds" - name: LOKI_RULER_BUCKET description: "The object storage bucket for Loki ruler" path: loki.storage.bucketNames.ruler - default: "uds" - name: LOKI_ADMIN_BUCKET description: "The object storage bucket for Loki admin" path: loki.storage.bucketNames.admin - default: "uds" - name: LOKI_S3_REGION description: "The S3 region" path: loki.storage.s3.region - default: "us-east-1" + - name: LOKI_S3_ACCESS_KEY_ID + description: "The S3 Access Key ID" + path: loki.storage.s3.accessKeyId + - name: LOKI_S3_SECRET_ACCESS_KEY + path: loki.storage.s3.secretAccessKey + description: "The S3 Secret Access Key" - name: LOKI_IRSA_ROLE_ARN description: "The irsa role annotation" path: serviceAccount.annotations.irsa/role-arn - name: LOKI_WRITE_REPLICAS path: write.replicas description: "Loki write replicas" - default: "1" - name: LOKI_READ_REPLICAS path: read.replicas description: "Loki read replicas" - default: "1" - name: LOKI_BACKEND_REPLICAS path: backend.replicas description: "Loki backend replicas" - default: "1" velero: velero: @@ -89,35 +103,61 @@ packages: - name: VELERO_USE_SECRET description: "Toggle use secret off to use IRSA." path: credentials.useSecret - default: "false" - name: VELERO_IRSA_ROLE_ARN description: "IRSA role arn for annotation" path: serviceAccount.server.annotations.irsa/role-arn - default: "" + keycloak: + keycloak: + values: + # Turn off verifications for dev or prototype deployments + - path: realmInitEnv + value: + EMAIL_VERIFICATION_ENABLED: true + OTP_ENABLED: true + TERMS_AND_CONDITIONS_ENABLED: true + X509_OCSP_FAIL_OPEN: true + variables: + # TODO: this can only be `false` if a Postgres is integrated and accessible in the cluster + - name: KEYCLOAK_DEV_MODE + path: devMode + default: true + - name: KEYCLOAK_FIPS_MODE + path: fips + default: true + # Enable headless admin account generation, SHOULD BE CHANGED IMMEDIATELY AFTER + # Can be found in `keycloak-admin-password` secret in-cluster + - name: KEYCLOAK_ENABLE_INSECURE_ADMIN_PASSWORD + path: insecureAdminPasswordGeneration.enabled + default: true + - name: KEYCLOAK_ADMIN_USERNAME + path: insecureAdminPasswordGeneration.username + default: admin + + # Access to UDS cluster administration services (e.g., *admin.uds.dev, grafana.admin.uds.dev) + # Access to UDS application services (e.g., *.uds.dev, ai.uds.dev) istio-admin-gateway: uds-istio-config: - # Access to UDS cluster administration services (e.g., *admin.uds.dev, grafana.admin.uds.dev) variables: - name: ADMIN_TLS_CERT description: "The TLS cert for the admin gateway (must be base64 encoded)" - path: ../tls.cert + path: tls.cert - name: ADMIN_TLS_KEY description: "The TLS key for the admin gateway (must be base64 encoded)" - path: ../tls.key - + path: tls.key istio-tenant-gateway: - # Access to UDS application services (e.g., *.uds.dev, ai.uds.dev) uds-istio-config: variables: - name: TENANT_TLS_CERT description: "The TLS cert for the tenant gateway (must be base64 encoded)" - path: ../tls.cert + path: tls.cert - name: TENANT_TLS_KEY description: "The TLS key for the tenant gateway (must be base64 encoded)" - path: ../tls.key + path: tls.key - name: uds-rke2-exemptions description: "Deploys the UDS Core Pepr policy exemptions" repository: ghcr.io/justinthelaw/packages/uds/uds-rke2/exemptions - ref: "0.4.2-local-path" + ref: "0.5.0-local-path" + optionalComponents: + - 
nvidia-gpu-operator-exemptions diff --git a/bundles/latest/local-path-core/uds-config.yaml b/bundles/latest/local-path-core/uds-config.yaml index ce0c0b40e..54f1eb431 100644 --- a/bundles/latest/local-path-core/uds-config.yaml +++ b/bundles/latest/local-path-core/uds-config.yaml @@ -1,5 +1,4 @@ options: - log_level: warn architecture: amd64 shared: @@ -8,19 +7,41 @@ shared: variables: uds-rke2: join_token: $JOIN_TOKEN + + infrastructure: + network_interface: $NETWORK_INTERFACE + local-path-init: # Workaround for SELinux EBS issue - https://github.com/bottlerocket-os/bottlerocket/issues/2417 registry_hpa_enable: false registry_pvc_size: 100Gi - registry_pvc_access_mode: ReadWriteMany - node_path_map_values_file: values/node-path-map-values.yaml + # Set this to ReadWriteMany for multi-node, HA situations + # See documentation on the StorageClass Zarf Init flavors for more details + registry_pvc_access_mode: ReadWriteOnce + storage_configuration_values_file: values/storage-configuration-values.yaml + + # MinIO bucket persistence volume size + bucket_persistence_size: $BUCKET_PERSISTENCE_SIZE + uds-core: - loki_chunks_bucket: $LOKI_S3_BUCKET - loki_ruler_bucket: $LOKI_S3_BUCKET - loki_admin_bucket: $LOKI_S3_BUCKET - loki_s3_region: $LOKI_S3_AWS_REGION - loki_irsa_role_arn: $LOKI_S3_ROLE_ARN - velero_use_secret: $VELERO_USE_SECRET - velero_irsa_role_arn: $VELERO_S3_ROLE_ARN - velero_bucket: $VELERO_S3_BUCKET - velero_bucket_region: $VELERO_S3_AWS_REGION + loki_s3_endpoint: http://minio.minio.svc.cluster.local:9000 + loki_chunks_bucket: uds + loki_ruler_bucket: uds + loki_admin_bucket: uds + loki_s3_region: us-east-1 + loki_s3_access_key_id: uds + loki_s3_secret_access_key: uds-secret + loki_irsa_role_arn: "" + loki_write_replicas: 1 + loki_read_replicas: 1 + loki_backend_replicas: 1 + + velero_use_secret: false + velero_irsa_role_arn: "" + + # Zarf variables not available via UDS bundle values path overrides + velero_bucket: uds + velero_bucket_region: us-east-1 + velero_bucket_provider_url: http://minio.minio.svc.cluster.local:9000 + velero_bucket_credential_name: uds + velero_bucket_credential_key: uds-secret diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md new file mode 100644 index 000000000..4cb0a0a14 --- /dev/null +++ b/docs/DEVELOPMENT.md @@ -0,0 +1,236 @@ +# Development + +> [!IMPORTANT] +> This entire repository assumes that you have root access, and all scripts and actions are run as root. Use `sudo su` to activate a root shell. + +The purpose of this document is to describe how to run a development loop on the tech stack, using the `local-path` flavored bundle. + +## Contributing + +The [CONTRIBUTING.md](../.github/CONTRIBUTING.md) is the source of truth for actions to be performed prior to committing to a branch in the repository. Read that first before following the rest of this guide. + +### Local Development + +The following are requirements for building images locally for development and testing. + +- All pre-requisites listed in the `Deployment` section of the [README.md](../README.md) +- [Docker](https://docs.docker.com/get-docker/) or [Podman](https://podman.io/getting-started/installation) for running, building, and pulling images + +## UDS CLI Aliasing + +Below are instructions for adding UDS CLI aliases that are useful for deployments that occur in an air-gap with only the UDS CLI binary available to the delivery engineer. 
+ +For general CLI UX, put the following in your shell configuration (e.g., `/root/.bashrc`): + +```bash +alias k="uds zarf tools kubectl" +alias kubectl="uds zarf tools kubectl" +alias zarf='uds zarf' +alias k9s='uds zarf tools monitor' +alias udsclean="uds zarf tools clear-cache && rm -rf ~/.uds-cache && rm -rf /tmp/zarf-*" +``` + +For fulfilling `xargs` and `kubectl` binary requirements necessary for running some of the _optional_ deployment helper scripts and for full functionality within `uds zarf tools monitor`: + +```bash +touch /usr/local/bin/kubectl +echo -e '#!/bin/bash\nuds zarf tools kubectl "$@"' > /usr/local/bin/kubectl +chmod +x /usr/local/bin/kubectl +``` + +## Tasks + +Task files contain `variables` that are passed throughout the files, and affect deploy-time variables that configure the values (and therefore, Helm charts) of the services or applications being deployed. + +Individual may also contain `inputs`, which means they are supposed to be a re-useable sub-task meant to be consumed by another task. These tasks with inputs cannot be used unless they are consumed by a a parent task that provides the required inputs. + +Run the following to see all the tasks in the "root" [`tasks.yaml`](./tasks.yaml), and their descriptions: + +```bash +uds run --list-all +``` + +In the following sub-sections, we dive into each of the sub-task files and types, and provide examples for each. In the next sections, [Bundle Development](#bundle-development) and [Package Development](#package-development), the instructions for how to use UDS tasks to perform full dev-loops on UDS RKE2 bundles and packages are detailed. + +### Deploy + +> [!NOTE] +> The pre-deployment setup of the host machine is storage solution-dependent, so be sure to check the documentation for the package flavor you are deploying: [`local-path`](./docs/LOCAL-PATH.md), [`longhorn`](./docs/LONGHORN.md), or [`rook-ceph`](./docs/ROOK-CEPH.md). + +See the UDS [`deploy` tasks](./tasks/deploy.yaml) file for more details. + +For example, to deploy the UDS RKE2 bootstrap bundle with `local-path` flavor, do the following: + +```bash +# create and deploy the local dev version, with /opt/uds as the PV mount, and +# the network interface for L2 advertisement on eth0 +uds run uds-rke2-local-path-core-dev --set NETWORK_INTERFACE=eth0 + +# below are examples of dev version deployments of optional packages +uds run deploy:leapfrogai-workarounds --set VERSION=dev +uds run deploy:nvidia-gpu-operator --set VERSION=dev +``` + +### Create + +See the UDS [`create` tasks](./tasks/create.yaml) file for more details. + +To create individual packages and bundles, reference the following example for NVIDIA GPU Operator: + +```bash +# create the local dev version of the Zarf package +uds run create:nvidia-gpu-operator --set VERSION=dev +``` + +### Publish + +See the UDS [`publish` tasks](./tasks/publish.yaml) file for more details. Also see the `release` task in the main [`tasks.yaml`](./tasks.yaml). 
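+
+Publishing pushes packages and bundles to GHCR, so an authenticated registry session is assumed. A minimal sketch of that login step (the username and token values are placeholders):
+
+```bash
+set +o history
+export GHCR_USERNAME="YOUR-USERNAME-HERE"
+export GHCR_PASSWORD="YOUR-PASSWORD-HERE"
+echo $GHCR_PASSWORD | uds zarf tools registry login ghcr.io --username $GHCR_USERNAME --password-stdin
+set -o history
+```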
+ +To publish all packages and bundles, do the following: + +```bash +# release all packages with a `dev` version +uds run release-dev +``` + +### Remove + +Run the following to remove all Docker, Zarf and UDS artifacts from the host: + +```bash +uds run setup:clean +``` + +Run the following to completely destroy the UDS RKE2 node and all of UDS RKE2's artifacts from the node's host: + +```bash +uds run setup:uds-rke2-destroy +``` + +## Bundle Development + +To build and deploy an ENTIRE bundle, use the tasks located in the `CREATE AND DEPLOY BUNDLES` section of the [tasks.yaml](../tasks.yaml). Be careful to note the difference between pulling the LATEST remote packages and bundle, and creating + deploying the local DEV versions of the packages and bundle. + +If you have modified the deploy-time variables in a [uds-config.yaml](bundles/dev/local-path-core/uds-config.yaml), but none of the bundle components, and want to complete a re-deployment, you will need to run the TLS creation and injection step again: + +```bash +# recreate the dev TLS certs and inject into the modified uds-config.yaml +uds run create-tls-local-path-dev + +# deploy the pre-created UDS bundle with the modified uds-config.yaml +uds run deploy:local-path-core-bundle-dev +``` + +If you modified an individual package within the bundle, and want to do an integrated install again, you can just create the modified package again, and re-create the bundle: + +```bash +# recreate the local-path-init package +uds run create:local-path-init --set VERSION=dev + +# recreate the bundle and deploy +uds run create:local-path-core-bundle-dev +uds run deploy:local-path-core-bundle-dev +``` + +## Package Development + +If you don't want to build an entire bundle, or you want to dev-loop on a single package in an existing, Zarf-init'd cluster, you can do so by performing a `uds zarf package remove [PACKAGE_NAME]` and re-deploying the package into the cluster. + +To build a single package, use the tasks located in the `STANDARD PACKAGES`, `INIT PACKAGES`, or `APP-SPECIFIC PACKAGES` sections of the [create.yaml](../create.yaml). Be careful to note the difference between building the LATEST packages and creating + deploying the local DEV versions of the packages. + +For example, this is how you build and deploy a local DEV version of a package: + +```bash +# if package is already in the cluster, and you are deploying a new one +uds zarf package remove nvidia-gpu-operator --confirm + +# create and deploy the new package +uds run create:nvidia-gpu-operator --set VERSION=dev +uds run deploy:nvidia-gpu-operator --set VERSION=dev +``` + +For example, this is how you pull and deploy a LATEST version of a package: + +```bash +# pull and deploy latest versions +uds zarf package pull oci://ghcr.io/justinthelaw/packages/uds/uds-rke2/nvidia-gpu-operator:latest -a amd64 +uds run deploy:nvidia-gpu-operator +``` + +## Airgap Testing + +### Pre-Cluster/Node Bootstrapping + +This sub-section is mainly for the pre-cluster or node bootstrapping steps, and targets the testing of the air-gapped bootstrapping of UDS RKE2 infrastructure. + +You can use the [air-gapping script](./vm/scripts/airgap.sh) in the VM documentation directory to perform an IP tables manipulation to emulate an airgap. 
Modify the following lines, which allow local area network traffic, in the script based on your LAN configuration:
+
+```bash
+# Allow local network traffic - adjust to match your local network
+iptables -A OUTPUT -d 192.168.1.0/24 -j ACCEPT
+iptables -A OUTPUT -d 10.42.0.0/24 -j ACCEPT
+```
+
+To reverse this effect, just execute the [airgap reversion script](./vm/scripts/reverse-airgap.sh).
+
+> [!CAUTION]
+> Please note that the airgap reversion script flushes ALL existing rules, so modify the script or manually reset your IP table rules if the script does not work for your configuration.
+
+### Post-Cluster/Node Bootstrapping
+
+
+
+## Troubleshooting
+
+If your RKE2 cluster is failing to spin up in the first place, you can use `journalctl` to monitor the progress. Please note that it may take up to 10 minutes for the cluster to spin up and move on to the next step of the UDS RKE2 bundle deployment.
+
+```bash
+journalctl -xef -u rke2-server
+```
+
+Occasionally, a package you are trying to re-deploy, or a namespace you are trying to delete, may hang. To work around this, be sure to check the events and logs of all resources, including pods, deployments, daemonsets, clusterpolicies, etc. There may be finalizers, Pepr hooks, etc. causing the re-deployment or deletion to fail. Use the `k9s` and `kubectl` tools that are vendored with UDS CLI, like in the examples below:
+
+```bash
+# k9s CLI for debugging
+uds zarf tools monitor
+
+# kubectl command for logs
+uds zarf tools kubectl logs DaemonSet/metallb-speaker -n uds-rke2-infrastructure --follow
+```
+
+To describe node-level data, like resource usage, non-terminated pods, taints, etc., run the following command:
+
+```bash
+uds zarf tools kubectl describe node
+```
+
+To check which pods are consuming GPUs, you can run the following `yq` command:
+
+```bash
+uds zarf tools kubectl get pods \
+--all-namespaces \
+--output=yaml \
+| uds zarf tools yq eval -o=json '
+  ["Pod", "Namespace", "Container", "GPU"] as $header |
+  [$header] + [
+    .items[] |
+    .metadata as $metadata |
+    .spec.containers[] |
+    select(.resources.requests["nvidia.com/gpu"]) |
+    [
+      $metadata.name,
+      $metadata.namespace,
+      .name,
+      .resources.requests["nvidia.com/gpu"]
+    ]
+  ]' - \
+| jq -r '(.[0] | @tsv), (.[1:][] | @tsv)' \
+| column -t -s $'\t'
+```
+
+Another debugging step is to run the CUDA sample tests, [CUDA Vector Add](../tests/cuda-vector-add.yaml) or [Device Query](../tests/device-query.yaml). These can be deployed by executing the following on an existing cluster with the NVIDIA GPU Operator installed:
+
+```bash
+uds zarf tools kubectl apply -f tests/cuda-vector-add.yaml
+uds zarf tools kubectl apply -f tests/device-query.yaml
+```
diff --git a/docs/DIAGRAM.md b/docs/DIAGRAM.md
index 9229d8d59..917ff0d7e 100644
--- a/docs/DIAGRAM.md
+++ b/docs/DIAGRAM.md
@@ -1,6 +1,6 @@
 # UDS RKE2 Diagram
 
-Below is an diagram showing an example deployment of UDS RKE2 with the `local-path` flavor custom Zarf Init and LeapfrogAI deployed on top. The dependency chain and installation order are from bottom to top.
+Below is a diagram showing an example deployment of UDS RKE2 with the `local-path` flavor custom Zarf Init, NVIDIA GPU Operator and LeapfrogAI deployed on top. The dependency chain and installation order are from bottom to top.
```mermaid flowchart @@ -14,12 +14,22 @@ flowchart lfai_workarounds --> lfai_package end + init["Zarf Package: nvidia-gpu-operator"] + subgraph "NVIDIA GPU Operator" + nfd["Zarf Component: node-feature-discovery"] + nvidia["Zarf Component: nvidia-gpu-operator"] + + direction BT + nfd --> nvidia + end + subgraph "UDS RKE2 Exemptions" exemptions[" Zarf Package: uds-rke2-exemptions-local-path - uds-rke2-infrastructure-exemptions - local-path-exemptions + - nvidia-gpu-operator-exemptions "] end @@ -42,6 +52,7 @@ flowchart - Velero "] end + subgraph "UDS RKE2 Infrastructure" infrastructure[" Zarf Package: infrastructure diff --git a/docs/DNS-TLS.md b/docs/DNS-TLS.md new file mode 100644 index 000000000..0e851f619 --- /dev/null +++ b/docs/DNS-TLS.md @@ -0,0 +1,72 @@ +# Domain Name Service (DNS) and Transport Layer Security (TLS) Certificates + +## Initial Domain Context + +One of the core assumptions of the original [`uds-k3d`](https://github.com/defenseunicorns/uds-k3d) package is the use of `uds.dev` as the base domain for your production environment. This assumption is integral to the DNS and network configuration provided by the package. It is based on an existing public DNS entry for `*.uds.dev` that resolves to `127.0.0.1` (localhost). + +In this repository's `uds-rke2` packages and bundles, this public DNS resolution will not work. UDS RKE2's services are exposed via the host machine's IP, and not via localhost. The following section notes the `/etc/hosts/` modifications required to access virtual services being served by the Istio gateways. + +## Modifying Domain Name + +> [!NOTE] +> Modifying the domain requires the associated TLS certificate and key creation configuration to also be modified. Please see the [`create:tls` task](../tasks/create.yaml) for more details. + +In the UDS create and deploy actions, there is a `DOMAIN` variable that can be set to affect how the underlying packages are built and deployed. The `DOMAIN` is required for both stages as each package requires the setting of the domain at different steps (create or deploy-time). + +An example of the shared `DOMAIN` variable in a UDS configuration file (DEV): + +```yaml +shared: + domain: uds.local +``` + +## Certificate Authority (CA) and TLS Certificate Management + +The CA and TLS certs are all created and injected by the aforementioned `create:tls` UDS task. To modify this behavior to use your own CA and TLS certificates, you will need to copy and paste your TLS key and cert, base64 encoded, into the `uds-config-dev.yaml` or `uds-config-[LATEST_VERSION].yaml` PRIOR to running the UDS task to deploy the bundle. + +The CA certs that result from this process, or the CA certs you used to sign the original TLS certs, must be available to the host machine(s) and cluster so that the HTTPS errors do not show up to the end-users of the web applications and API, and so that services within the cluster (e.g., [Supabase and KeyCloak in LeapfrogAI](./LEAPFROGAI.md) case) that reach out to each other via HTTPS do not error out due to CA trust issues. + +Once the CA cert has been created as part of the overall `uds-rke2-local-path-core` or `uds-rke2-local-path-core-dev` tasks, you copy the CA certs into your host machine's trust store. 
For example, in Ubuntu the following can be executed (as root): + +```bash +cp build/packages/local-path/tls/ca.pem /usr/local/share/ca-certificates/ca.crt +update-ca-certificates +``` + +If you are using a browser that does not use the host machine's trust store location, then you will need to upload the CA certificate into the browser's settings related to Trust, Privacy and/or Security. Please refer to your browser's documentation for more details. + +### CA Trust Bundles + +UDS Core, which UDS RKE2 is reliant on, has an [outstanding issue for centralized management of CA trust bundles](https://github.com/defenseunicorns/uds-core/issues/464) within the cluster. This issue is outside the scope of UDS RKE2's base infrastructure and any applications that have CA trust issues due to service mesh incompatibilities or communication must follow the pattern seen in the [`leapfrogAI-workarounds` package](../packages/leapfrogai/zarf.yaml). + +## Host File Modifications + +The default Istio Ingress gateways deployed with the UDS RKE2 bundle are assigned the following MetalLB allocated IPs, where `BASE_IP` is the IP of the host machine as identified within the MetalLB component of UDS RKE2 INfrastructure Zarf package: + +- `admin`: `.200` +- `tenant`: `.201` +- `passthrough`: `.202` + +If an `/etc/hosts` file needs to be modified for access via a host's browser, then modify the `/etc/hosts/` accordingly. Below is an example entry: + +```toml +127.0.0.1 localhost +127.0.1.1 device-name + +# UDS and LeapfrogAI subdomains +192.168.0.200 keycloak.admin.uds.dev grafana.admin.uds.dev neuvector.admin.uds.dev +192.168.0.201 leapfrogai-api.uds.dev sso.uds.dev leapfrogai.uds.dev leapfrogai-rag.uds.dev ai.uds.dev supabase-kong.uds.dev +``` + +## CoreDNS Override + +If any internal services require an `https://` "reach-around" in order to interact with another service's API, then the Corefile of the RKE2 CoreDNS service can be modified by following the [RKE CoreDNS Helm Chart configuration instructions](https://www.suse.com/support/kb/doc/?id=000021179). + +An example CoreDNS override is seen in [`leapfrogAI-workarounds` package](../packages/leapfrogai/zarf.yaml). + +This is not a recommended approach as most, if not all, services should be capable of communicating via the secured internal Istio service mesh. + +## Additional Info + +- [CoreDNS K8s Documentation](https://kubernetes.io/docs/tasks/administer-cluster/coredns/) +- [RKE2 CoreDNS Customization](https://www.suse.com/support/kb/doc/?id=000021179) diff --git a/docs/DNS.md b/docs/DNS.md deleted file mode 100644 index eff80a48b..000000000 --- a/docs/DNS.md +++ /dev/null @@ -1,31 +0,0 @@ -# Domain Name Service (DNS) - -## Domain Assumptions - -One of the core assumptions of the original [`uds-k3d`](https://github.com/defenseunicorns/uds-k3d) package is the use of `uds.dev` as the base domain for your production environment. This assumption is integral to the DNS and network configuration provided by the package. It is based on an existing public DNS entry for `*.uds.dev` that resolves to `127.0.0.1` (localhost). - -In this repository's `uds-rke2` packages and bundles, this public DNS resolution will not work. UDS RKE2's services are exposed via the host machine's IP, and not via localhost. The following section notes the `/etc/hosts/` modifications required to access virtual services being served by the Istio gateways. 
- -## Host File Modifications - -The default Istio Ingress gateways deployed with the UDS RKE2 bundle are assigned the following MetalLB allocated IPs, where `BASE_IP` is the IP of the host machine as identified within the MetalLB component of UDS RKE2 INfrastructure Zarf package: - -- `admin`: `.200` -- `tenant`: `.201` -- `passthrough`: `.202` - -If an `/etc/hosts` file needs to be modified for access via a host's browser, then modify the `/etc/hosts/` accordingly. Below is an example entry: - -```text -127.0.0.1 localhost -184.223.9.200 grafana.admin.uds.dev neuvector.admin.uds.dev -184.223.9.201 sso.admin.uds.dev -``` - -## CoreDNS Override - -If any internal services require an `https://` "reach-around" in order to interact with another service's API, then the Corefile of the RKE2 CoreDNS service can be modified by following the [RKE CoreDNS Helm Chart configuration instructions](https://www.suse.com/support/kb/doc/?id=000021179). - -This is not a recommended approach, as all services should be capable of communicating via the secured internal Kubernetes network. - -Additionally, an Nginx service and configuration must be installed into the cluster. An example Nginx configuration for K3d can be found in the [uds-k3d repository](https://github.com/defenseunicorns/uds-k3d/blob/main/chart/templates/nginx.yaml). The Nginx configuration assumes the use of `uds.dev` as the base domain. This configuration is tailored to support the production environment setup, ensuring that Nginx correctly handles requests and routes them within the cluster, based on the `uds.dev` domain. diff --git a/docs/INIT.md b/docs/INIT.md index f4e274dab..4a9d1fe7c 100644 --- a/docs/INIT.md +++ b/docs/INIT.md @@ -21,3 +21,9 @@ The below is used to manually create the custom Zarf init package, which is usua # Create the zarf init package uds zarf package create --architecture amd64 --confirm --set AGENT_IMAGE_TAG=$(zarf version) ``` + +## Additional Info + +- [Zarf Repository](https://github.com/defenseunicorns/zarf) +- [Zarf Documentation Website](https://docs.zarf.dev/getting-started/) +- [Zarf Init Package](https://docs.zarf.dev/ref/init-package/) diff --git a/docs/LEAPFROGAI.md b/docs/LEAPFROGAI.md index 439154033..fc3ccd253 100644 --- a/docs/LEAPFROGAI.md +++ b/docs/LEAPFROGAI.md @@ -1,29 +1,33 @@ # LeapfrogAI -**Supported Version**: 0.7.2 +**Supported Version**: 0.9.1 ## Supporting Packages Supporting packages are specific to the LeapfrogAI version outlined within this document, as well as the pre-requisites and caveats surrounding the deployment of UDS RKE2 on a host environment. -### NVIDIA GPU Operator - - - ### LeapfrogAI Workarounds The following are workarounds for LeapfrogAI that must be implemented within the cluster after LeapfrogAI has been deployed. #### RKE2 CoreDNS -Patch the RKE2 CoreDNS Corefile with the tenant and admin gateway rewrites for Supabase and KeyCloak hand-offs +Patch the RKE2 CoreDNS Corefile with the tenant and admin gateway rewrites for Supabase and KeyCloak hand-offs. The RKE2 CoreDNS service needs to proxy requests to the external Supabase's HTTPS endpoint to the internal cluster service instead, and also for the KeyCloak admin service as well. This is because the Supabase authentication handoff requires interaction with a third-party, SSO service that is served from an HTTPS endpoint. This CoreDNS workaround allows us to properly resolve the Supabase and KeyCloak HTTPS endpoints internally without leaving the cluster. 
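+
+A quick way to sanity-check the rewrites once the workarounds package is deployed is an in-cluster DNS lookup. The example below is only illustrative: the hostname assumes the default `uds.dev` domain, and a `busybox` (or similar) image must be reachable from the cluster:
+
+```bash
+# The public Supabase hostname should resolve to an in-cluster address
+uds zarf tools kubectl run dns-test --rm -it --restart=Never --image=busybox -- nslookup supabase-kong.uds.dev
+```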
-#### Supabase URL +In the "LATEST" bundles and package published to GHCR domain used for the CoreDNS reroute is, by default, `uds.dev`; whereas the "DEV" bundles are `uds.local` by default. Please see the UDS [create](../tasks/create.yaml) and [deploy](../tasks/deploy.yaml) tasks for details on how to change this to a domain of your choice. + +See the [DNS and TLS docs](./DNS-TLS.md) for some more detail on rationale, and the [CA Certificates for Supabase section](#ca-certificates-for-supabase) for some workarounds required when the Domain, CA cert, and/or the TLS cert/key are changed for a particular deployment environment. + +#### CA Certificates for Supabase + +As mentioned in the previous section, the CA certificate used to sign the TLS certificates in the Istio Gateways (tenant and admin), must be provided to services that interact with Supabase via HTTPS protocol. + +The workarounds package contains a method for supplying these CA certificates to the containers that communicate over HTTPS to/from Supabase containers (i.e., `keycloak` -> `supabase-auth` -> `leapfrogai-ui`). -Patch the Supabase URL in the Supabase Studio ConfigMap +## Additional Info -The Supabase public URL within the Supabase Studio ConfigMap is missing due to a failed helm replacement of the Zarf variable. This patch goes in and ensures the global Domain variable is inserted into the Supabase public URL. Without this variable, Supabase Studio -dashboards fail to function properly. +- [LeapfrogAI Repository](https://github.com/defenseunicorns/leapfrogai) +- [LeapfrogAI Documentation Website](https://docs.leapfrog.ai/docs/) diff --git a/docs/LOCAL-PATH.md b/docs/LOCAL-PATH.md index 74b709586..874ba2c3d 100644 --- a/docs/LOCAL-PATH.md +++ b/docs/LOCAL-PATH.md @@ -10,7 +10,29 @@ Local Path Provisioner can still be useful if paired with an operator with built #### Node Configuration -Node-level storage configurations are set within the [Node Path Map values file](../packages/local-path/values/node-path-map-values.yaml). The instructions for filling out the values file are within the values file. The default mount for all nodes that do not have a specific configuration is `/opt/uds/`. +Node-level storage configurations are set within the [storage configuration values file](../packages/local-path/values/storage-configuration-values.yaml). The instructions for filling out the values file are within the values file. The default Zarf package expects a mounted location of `/opt/uds/` on a single node, which **DOES NOT** allow for `ReadWriteMany` and `ReadOnlyMany` access modes for PVCs. + +When modifying or supplying a storage-configuration-values.yaml, please note that `nodePathMap` and `sharedFileSystemPath` are mutually exclusive. If `sharedFileSystemPath` is used, then `nodePathMap` must be set to `[]`. + +The following are the general node-level configuration rules and information for each type of storage configuration: + +1. **`nodePathMap`**: the place user can customize where to store the data on each node + - If one node is not listed on the nodePathMap, and Kubernetes wants to create volume on it, the paths specified in + DEFAULT_PATH_FOR_NON_LISTED_NODES will be used for provisioning. + - If one node is listed on the nodePathMap, the specified paths will be used for provisioning. + 1. If one node is listed but with paths set to [], the provisioner will refuse to provision on this node. + 2. If more than one path was specified, the path would be chosen randomly when provisioning. 
+ - The configuration must obey following rules: + 1. A path must start with /, a.k.a an absolute path. + 2. Root directory (/) is prohibited. + 3. No duplicate paths allowed for one node. + 4. No duplicate node allowed. + 5. The path must not already be owned by a different system user + +2. **`sharedFileSystemPath`**: allows the provisioner to use a filesystem that is mounted on all + - nodes at the same time. In this case all access modes are supported: `ReadWriteOnce`, + - `ReadOnlyMany` and `ReadWriteMany` for storage claims. In addition + - `volumeBindingMode: Immediate` can be used in StorageClass definition. #### Storage Configuration @@ -20,7 +42,12 @@ Ensure that the local volume mount points are accessible to the cluster. For exa # mount the device to an existing filepath sudo mount ubuntu/vg/extra /opt/uds -# change permissions to the nonroot or nobody user for local storage volume creation +## OR ## + +# create a directory at an existing root filesystem +sudo mkdir -p /opt/uds + +# change permissions to the nonroot user for local storage volume access (RWX) sudo chown -Rv 65534:65534 /opt/uds ``` diff --git a/docs/MINIO.md b/docs/MINIO.md index 4582e8332..2de46f42c 100644 --- a/docs/MINIO.md +++ b/docs/MINIO.md @@ -172,3 +172,8 @@ bundle: - "s3:ListBucket" - "s3:ListBucketMultipartUploads" ``` + +## Additional Info + +- [MinIO Repository](https://github.com/minio/minio) +- [MinIO Documentation Website](https://min.io/docs/minio/kubernetes/upstream/) diff --git a/docs/NVIDIA-GPU-OPERATOR.md b/docs/NVIDIA-GPU-OPERATOR.md new file mode 100644 index 000000000..b3fb040b4 --- /dev/null +++ b/docs/NVIDIA-GPU-OPERATOR.md @@ -0,0 +1,83 @@ +# NVIDIA GPU Operator + +The NVIDIA GPU Operator provides a single-pane management resources that follows the Kubernetes operator pattern. When configured properly, capabilities like time-slicing and multi-instance GPUs can be provisioned on existing GPU resources on nodes within the cluster. Additionally, some optional components within the NVIDIA GPU Operator allow engineers to maintain NVIDIA dependencies in a Kubernetes-native way. + +## Node Feature Discovery + +The Kubernetes Node Feature Discovery component allows other Kubernetes resources define and consume hardware and software resources available on a node. The NVIDIA GPU Operator requires this to be installed on the cluster beforehand so that NVIDIA GPUs can be characterized properly + +## Optional Components + +> [!IMPORTANT] +> Many of the default-disabled optional components of the operator contain images/containers that are not available within IronBank, and must be pulled from NVCR. + +### NVIDIA Container Toolkit + +The NVIDIA Container Toolkit allows containerized applications and services to consume NVIDIA GPUs as resources. This is usually pre-installed on the host node prior to air-gapping or via an internally mirrored package repository or by bringing in the dependencies into the air-gap. The NVIDIA GPU Operator includes a DaemonSet that can be enabled to install the NVIDIA Container Toolkit on the host as a Kubernetes resource, allowing engineers the flexibility of deploying and updating the toolkit in a Kubernetes-native way. + +If your NVIDIA Container Toolkit is pre-installed, please ensure that the `containerd` runtime was correctly configured post-toolkit installation. 
+The `/etc/containerd/config.toml` should look something like this:
+
+```toml
+version = 2
+
+[plugins]
+
+  [plugins."io.containerd.grpc.v1.cri"]
+    enable_cdi = true
+    cdi_spec_dirs = ["/etc/cdi", "/var/run/cdi"]
+
+    [plugins."io.containerd.grpc.v1.cri".containerd]
+      default_runtime_name = "nvidia"
+
+      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
+
+        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
+          privileged_without_host_devices = false
+          runtime_engine = ""
+          runtime_root = ""
+          runtime_type = "io.containerd.runc.v2"
+
+          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
+            BinaryName = "/usr/local/nvidia/toolkit/nvidia-container-runtime"
+
+        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-experimental]
+          privileged_without_host_devices = false
+          runtime_engine = ""
+          runtime_root = ""
+          runtime_type = "io.containerd.runc.v2"
+
+          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia-experimental.options]
+            BinaryName = "/usr/local/nvidia/toolkit/nvidia-container-runtime-experimental"
+```
+
+### NVIDIA GPU Drivers
+
+NVIDIA's GPU drivers are usually pre-installed on the host node, similar to the NVIDIA Container Toolkit. Upgrading GPU drivers and ensuring they remain up-to-date can also be done using the NVIDIA GPU Operator. By providing the correct pre-compiled drivers within the [`nvidia-gpu-operator-values.yaml`](../packages/nvidia-gpu-operator/values/nvidia-gpu-operator-values.yaml), and ensuring the host meets the minimum requirements for installing these drivers via a Kubernetes pod, engineers can maintain and deploy drivers to host nodes in a Kubernetes-native way.
+
+### Multi-Instance GPUs
+
+Multi-Instance GPU (MIG) relies on extra configuration and an understanding of the deployment environment's GPUs. Please see the [MIG configuration](#configuration) guide for more details.
+
+## Usage
+
+### Configuration
+
+Create-time and deploy-time configuration of the Zarf package is done mainly through the following components, and is dependent on the engineer's final desired configuration and the cluster's available node resources:
+
+1. (Create-time) [NVIDIA GPU Drivers](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/precompiled-drivers.html)
+2. (Deploy-time) [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
+3. (Deploy-time) [Time-Slicing](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-sharing.html)
+4. (Deploy-time) [Multi-Instance GPUs](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html)
+
+### Cleanup
+
+In order to perform a fresh install after a previous deployment of the NVIDIA GPU Operator, the engineer must remove the following directory from the host:
+
+```bash
+sudo rm -rf /run/nvidia
+```
+
+## Additional Info
+
+- [NVIDIA GPU Operator Repository](https://github.com/NVIDIA/gpu-operator)
+- [NVIDIA GPU Operator Documentation Website](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/overview.html)
diff --git a/docs/RKE2.md b/docs/RKE2.md
index 7d0f3ef7f..57ee2c1e4 100644
--- a/docs/RKE2.md
+++ b/docs/RKE2.md
@@ -49,10 +49,42 @@ An example setup is provided below:
 - Node3: `/root/rke2-startup.sh -t -s -T `
 - NodeN (agent nodes): `/root/rke2-startup.sh -t -s -a`

+### Script `containerd` Configuration
+
+The `containerd` configuration template is within the startup script.
This configuration template adds options to the host's `containerd` configuration, as seen in the example below: + +```toml +# /var/lib/rancher/rke2/agent/etc/containerd/config.toml.tmpl +version = 2 + +[plugins."io.containerd.internal.v1.opt"] + path = "/var/lib/rancher/rke2/agent/containerd" + +[plugins."io.containerd.grpc.v1.cri"] + stream_server_address = "127.0.0.1" + stream_server_port = "10010" + enable_selinux = false + enable_unprivileged_ports = true + enable_unprivileged_icmp = true + sandbox_image = "registry1.dso.mil/ironbank/opensource/pause/pause:3.9" + +[plugins."io.containerd.grpc.v1.cri".containerd] + snapshotter = "overlayfs" + disable_snapshot_annotations = true + +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] + runtime_type = "io.containerd.runc.v2" + +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] + SystemdCgroup = true +``` + ## Additional Info - [RKE2 Releases](https://github.com/rancher/rke2/releases) -- [Air-Gap Install](https://docs.rke2.io/install/airgap#tarball-method) -- [RKE2 Installation options](https://docs.rke2.io/install/methods) -- [RKE2 Configuration file](https://docs.rke2.io/install/configuration) -- [RKE2 High-availability](https://ranchermanager.docs.rancher.com/how-to-guides/new-user-guides/kubernetes-cluster-setup/rke2-for-rancher) +- [RKE2 Air-Gap Install](https://docs.rke2.io/install/airgap#tarball-method) +- [RKE2 Installation Options](https://docs.rke2.io/install/methods) +- [RKE2 Configuration File](https://docs.rke2.io/install/configuration) +- [RKE2 High-Availability](https://ranchermanager.docs.rancher.com/how-to-guides/new-user-guides/kubernetes-cluster-setup/rke2-for-rancher) +- [RKE2 Repository](https://github.com/rancher/rke2) +- [RKE2 Documentation Website](https://docs.rke2.io/install/quickstart) diff --git a/docs/ROOK-CEPH.md b/docs/ROOK-CEPH.md index e875c4af9..4c6615f62 100644 --- a/docs/ROOK-CEPH.md +++ b/docs/ROOK-CEPH.md @@ -97,3 +97,10 @@ sgdisk --zap-all $DISK dd if=/dev/zero of="$DISK" bs=1M count=100 oflag=direct,dsync partprobe $DISK ``` + +## Additional Info + +- [Rook-Ceph Repository](https://github.com/rook/rook) +- [Rook-Ceph Documentation Website](https://rook.io/docs/rook/latest-release/Getting-Started/intro/) +- [Ceph Repository](https://github.com/ceph/ceph) +- [Ceph Documentation Website](https://docs.ceph.com/en/latest/dev/developer_guide/intro/) diff --git a/docs/UDS-CORE.md b/docs/UDS-CORE.md index 6c0ad3d95..0761de01b 100644 --- a/docs/UDS-CORE.md +++ b/docs/UDS-CORE.md @@ -1,7 +1,7 @@ # UDS Core -**Supported Version**: 0.22.0 +**Supported Version**: 0.23.0 > [!CAUTION] > UDS Core will not deploy or function properly unless deployed as part of a UDS bundle with the proper overrides applied. @@ -10,7 +10,7 @@ ### S3 Bucket -The S3 bucket being used by the UDS Core services, Loki and Velero, are pointed to the MinIO or ROok-Ceph bucket generated by default using the custom Zarf Init. +The S3 bucket being used by the UDS Core services, Loki and Velero, are pointed to the MinIO or Rook-Ceph bucket generated by default using the custom Zarf Init. ### CoreDNS @@ -19,3 +19,7 @@ The CoreDNS service identified by the Loki Gateway must be pointed to the pre-pa ### Custom TLS Certificates Each Istio Ingress/Egress gateway must have a valid TLS certificate. Options to point to a pre-populated certificate are provided. 
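+
+One way to confirm which certificate an Istio gateway is actually serving after deployment is sketched below. This is a minimal check, assuming the default tenant gateway service name and namespace used elsewhere in this repository (`tenant-ingressgateway` in `istio-tenant-gateway`) and a MetalLB-assigned LoadBalancer IP; the hostname is only an illustration for a `uds.local` deployment:
+
+```bash
+# Grab the tenant gateway's external (MetalLB) IP
+GATEWAY_IP=$(uds zarf tools kubectl get svc tenant-ingressgateway \
+  -n istio-tenant-gateway -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
+
+# Print the subject, issuer, and validity dates of the served TLS certificate
+openssl s_client -connect "${GATEWAY_IP}:443" -servername "supabase-kong.uds.local" </dev/null 2>/dev/null \
+  | openssl x509 -noout -subject -issuer -dates
+```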
+ +## Additional Info + +- [UDS Core Repository](https://github.com/defenseunicorns/uds-core) diff --git a/docs/UDS-RKE2.md b/docs/UDS-RKE2.md index fee8241d9..c5647c4f1 100644 --- a/docs/UDS-RKE2.md +++ b/docs/UDS-RKE2.md @@ -4,7 +4,11 @@ This is an extension of the [RKE2 configuration documentation](./RKE2.md), and p ## Infrastructure -This package deploys MetalLB and MachineID + Pause integration for L2 advertisement and pod/namespace integrity. +This package deploys MetalLB and MachineID + Pause for L2 advertisement and pod/namespace integrity, respectively. + +The L2 advertisement requires the network interface and IP address pool. These are supplied via variables seen in the [Zarf package deployment](../packages/uds-rke2/infrastructure/zarf.yaml) or UDS bundle deployment ([`local-path-core` bundle configuration example](../bundles/dev/local-path-core/uds-config.yaml)) manifests. + +To find the interface that you would like to advertise on, use `ifconfig` and identify the local network-facing interface. An example network interface is `eth0`, when advertising to the local network via `192.168.x.x`. ## Exemptions diff --git a/docs/vm/scripts/airgap.sh b/docs/vm/scripts/airgap.sh new file mode 100755 index 000000000..d0a6eb640 --- /dev/null +++ b/docs/vm/scripts/airgap.sh @@ -0,0 +1,52 @@ +#!/bin/bash +set -e + +# Check if NETWORK_INTERFACE is set +if [ -z "$NETWORK_INTERFACE" ]; then + echo "Error: NETWORK_INTERFACE environment variable is not set." + exit 1 +fi + +# Get IP address and subnet for the specified interface +LOCAL_IP=$(ip -4 addr show $NETWORK_INTERFACE | grep -oP '(?<=inet\s)\d+(\.\d+){3}/\d+') +if [ -z "$LOCAL_IP" ]; then + echo "Error: Could not determine IP address for $NETWORK_INTERFACE" + exit 1 +fi + +# Get IP address and subnet for the flannel interface +FLANNEL_IP=$(ip -4 addr show flannel.1 2>/dev/null | grep -oP '(?<=inet\s)\d+(\.\d+){3}/\d+') +if [ -z "$FLANNEL_IP" ]; then + echo "Warning: Could not determine IP address for flannel interface. Using default 10.42.0.0/24" + FLANNEL_IP="10.42.0.0/24" +fi + +# Set default policies +iptables -P INPUT ACCEPT +iptables -P FORWARD ACCEPT +iptables -P OUTPUT DROP + +# Allow loopback +iptables -A OUTPUT -o lo -j ACCEPT + +# Allow established connections +iptables -A OUTPUT -m state --state ESTABLISHED,RELATED -j ACCEPT + +# Allow SSH connections on port 22 from any IP address +iptables -A INPUT -p tcp --dport 22 -j ACCEPT +iptables -A OUTPUT -p tcp --sport 22 -j ACCEPT + +# Allow local network traffic +iptables -A OUTPUT -d $LOCAL_IP -j ACCEPT +iptables -A OUTPUT -d $FLANNEL_IP -j ACCEPT + +# Repeat for IPv6 +ip6tables -P INPUT ACCEPT +ip6tables -P FORWARD ACCEPT +ip6tables -P OUTPUT DROP +ip6tables -A OUTPUT -o lo -j ACCEPT +ip6tables -A OUTPUT -m state --state ESTABLISHED,RELATED -j ACCEPT + +# Allow IPv6 SSH connections on port 22 from any IP address +ip6tables -A INPUT -p tcp --dport 22 -j ACCEPT +ip6tables -A OUTPUT -p tcp --sport 22 -j ACCEPT diff --git a/docs/vm/scripts/reverse-airgap.sh b/docs/vm/scripts/reverse-airgap.sh new file mode 100755 index 000000000..09c301c2b --- /dev/null +++ b/docs/vm/scripts/reverse-airgap.sh @@ -0,0 +1,46 @@ +#!/bin/bash +set -e + +# Function to log messages +log_message() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" +} + +log_message "Starting reverse airgap process..." + +# Flush all IP table rules +log_message "Flushing all iptables rules..." +iptables -F +iptables -t nat -F +iptables -t mangle -F +iptables -X + +log_message "Flushing all ip6tables rules..." 
+ip6tables -F +ip6tables -t nat -F +ip6tables -t mangle -F +ip6tables -X + +# Reset to the default IP table policies +log_message "Resetting iptables policies to ACCEPT..." +iptables -P INPUT ACCEPT +iptables -P FORWARD ACCEPT +iptables -P OUTPUT ACCEPT + +log_message "Resetting ip6tables policies to ACCEPT..." +ip6tables -P INPUT ACCEPT +ip6tables -P FORWARD ACCEPT +ip6tables -P OUTPUT ACCEPT + +# Remove any persistent iptables rules if they exist +if [ -f /etc/iptables/rules.v4 ]; then + log_message "Removing persistent IPv4 rules..." + rm -f /etc/iptables/rules.v4 +fi + +if [ -f /etc/iptables/rules.v6 ]; then + log_message "Removing persistent IPv6 rules..." + rm -f /etc/iptables/rules.v6 +fi + +log_message "Reverse airgap process completed." \ No newline at end of file diff --git a/packages/init/zarf-config.yaml b/packages/init/zarf-config.yaml index fc2112e2c..096bf23ad 100644 --- a/packages/init/zarf-config.yaml +++ b/packages/init/zarf-config.yaml @@ -24,7 +24,7 @@ package: # LOCAL PATH PROVISIONER ######################## - node_path_map_values_file: values/node-path-map-values.yaml + storage_configuration_values_file: values/storage-configuration-values.yaml ########### # ROOK-CEPH diff --git a/packages/leapfrogai/zarf-config.yaml b/packages/leapfrogai/zarf-config.yaml index dda954dc0..16e5364e5 100644 --- a/packages/leapfrogai/zarf-config.yaml +++ b/packages/leapfrogai/zarf-config.yaml @@ -2,7 +2,7 @@ package: create: set: # TODO: renovate setup - version: "0.7.2" + version: "0.9.1" deploy: set: domain: "uds.dev" diff --git a/packages/leapfrogai/zarf.yaml b/packages/leapfrogai/zarf.yaml index 49e737eb8..4c43e5f43 100644 --- a/packages/leapfrogai/zarf.yaml +++ b/packages/leapfrogai/zarf.yaml @@ -14,6 +14,14 @@ variables: description: "The base domain used for all UDS core services and UDS deployed applications" default: "uds.dev" prompt: false + - name: CERTIFICATE_DIRECTORY + description: "The directory where the CA and TLS configurations and resulting certificates are stored" + default: "build/packages/local-path/certs" + prompt: true + - name: CA_TRUST_BUNDLE_NAME + description: "The CA trust bundle name (e.g., ca.pem)" + default: "ca.pem" + prompt: true components: - name: coredns-corefile-rewrites @@ -21,6 +29,12 @@ components: description: "Setup rewrites for Supabase's authentication callback to the KeycLoak SSO service" actions: onDeploy: + before: + - cmd: | + echo ${ZARF_VAR_DOMAIN} | sed 's/[.\/]/\\&/g' + description: "Add special escape characters to given domain" + setVariables: + - name: MODIFIED_DOMAIN after: - cmd: | sudo uds zarf tools kubectl patch configmap rke2-coredns-rke2-coredns -n kube-system --patch " @@ -44,24 +58,99 @@ components: reload loadbalance - rewrite name supabase-kong.${ZARF_VAR_DOMAIN} tenant-ingressgateway.istio-tenant-gateway.svc.cluster.local - rewrite name keycloak.admin.${ZARF_VAR_DOMAIN} admin-ingressgateway.istio-admin-gateway.svc.cluster.local + rewrite { + name regex (.*${ZARF_VAR_MODIFIED_DOMAIN}) tenant-ingressgateway.istio-tenant-gateway.svc.cluster.local answer auto + } + rewrite { + name regex (.*admin\.${ZARF_VAR_MODIFIED_DOMAIN}) admin-ingressgateway.istio-admin-gateway.svc.cluster.local answer auto + } } " description: "Patch the RKE2 CoreDNS Corefile with the tenant and admin gateway rewrites" + - cmd: | + sudo uds zarf tools kubectl rollout restart -n kube-system deployment/rke2-coredns-rke2-coredns + description: "Restart CoreDNS deployment to pick-up new Corefile" + - description: "Validate CoreDNS has restarted" + wait: + 
cluster: + kind: Pod + name: app.kubernetes.io/name=rke2-coredns + namespace: kube-system + condition: "'{.status.conditions[2].status}'=True" + maxTotalSeconds: 300 - - name: supabase-studio-config + - name: inject-ca-trust-bundles required: true - description: "Patch the missing Supabase domain" + description: "Create and mount the CA trust bundle into the Supabase and LeapfrogAI UI pods" actions: onDeploy: - after: + before: - cmd: | - sudo uds zarf tools kubectl patch configmap supabase-studio-default -n leapfrogai --patch " - data: - PORT: \"3000\" - STUDIO_PG_META_URL: http://supabase-kong.leapfrogai.svc.cluster.local:80/pg - SUPABASE_PUBLIC_URL: https://supabase-kong.${ZARF_VAR_DOMAIN} - SUPABASE_URL: http://supabase-kong.leapfrogai.svc.cluster.local:80 - " - description: "Patch the missing Supabase domain" + rm -f ${ZARF_VAR_CERTIFICATE_DIRECTORY}/uds-rke2-custom-ca-cert.ca-bundle + cp ${ZARF_VAR_CERTIFICATE_DIRECTORY}/${ZARF_VAR_CA_TRUST_BUNDLE_NAME} \ + ${ZARF_VAR_CERTIFICATE_DIRECTORY}/uds-rke2-custom-ca-cert.ca-bundle + + uds zarf tools kubectl create configmap uds-rke2-custom-ca-cert \ + --from-file=cacert=${ZARF_VAR_CERTIFICATE_DIRECTORY}/uds-rke2-custom-ca-cert.ca-bundle \ + -n leapfrogai + description: "Create the CA trust bundle ConfigMap" + - cmd: | + uds zarf tools kubectl patch deployment supabase-auth -n leapfrogai --type=json -p='[ + { + "op": "add", + "path": "/spec/template/spec/volumes/-", + "value": { + "name": "uds-rke2-custom-ca-cert", + "configMap": { + "name": "uds-rke2-custom-ca-cert", + "defaultMode": 511 + } + } + }, + { + "op": "add", + "path": "/spec/template/spec/containers/0/volumeMounts/-", + "value": { + "name": "uds-rke2-custom-ca-cert", + "subPath": "cacert", + "mountPath": "/etc/ssl/certs/uds-rke2-custom-ca-cert.ca-bundle" + } + } + ]' + description: "Mount the CA trust bundle ConfigMap to the Supabase Auth container" + - cmd: | + uds zarf tools kubectl patch deployment leapfrogai-ui -n leapfrogai --type=json -p='[ + { + "op": "add", + "path": "/spec/template/spec/volumes", + "value": [ + { + "name": "uds-rke2-custom-ca-cert", + "configMap": { + "name": "uds-rke2-custom-ca-cert", + "defaultMode": 511 + } + } + ] + }, + { + "op": "add", + "path": "/spec/template/spec/containers/0/volumeMounts", + "value": [ + { + "name": "uds-rke2-custom-ca-cert", + "subPath": "cacert", + "mountPath": "/etc/ssl/certs/uds-rke2-custom-ca-cert.ca-bundle" + } + ] + }, + { + "op": "add", + "path": "/spec/template/spec/containers/0/env/-", + "value": { + "name": "NODE_EXTRA_CA_CERTS", + "value": "/etc/ssl/certs/uds-rke2-custom-ca-cert.ca-bundle" + } + } + ]' + description: "Mount the CA trust bundle ConfigMap to the LeapfrogAI UI container" diff --git a/packages/local-path/charts/templates/deployment.yaml b/packages/local-path/charts/templates/deployment.yaml index 9278cb53b..3cdaeb10e 100644 --- a/packages/local-path/charts/templates/deployment.yaml +++ b/packages/local-path/charts/templates/deployment.yaml @@ -34,7 +34,6 @@ spec: imagePullPolicy: {{ .Values.imagePullPolicy }} command: - local-path-provisioner - - --debug - start - --config - /etc/config/config.json diff --git a/packages/local-path/values/local-path-provisioner-values.yaml b/packages/local-path/values/local-path-provisioner-values.yaml index 5c93e60e4..a1fbac4fc 100644 --- a/packages/local-path/values/local-path-provisioner-values.yaml +++ b/packages/local-path/values/local-path-provisioner-values.yaml @@ -1,3 +1,6 @@ +# See the UDS RKE2 repository's documentation, `docs/LOCAL-PATH.md` for more details 
on values overrides + +# Default for single-node, non-HA deployments replicaCount: 1 commonLabels: {} @@ -14,25 +17,25 @@ helperPod: imagePullSecrets: - name: private-registry -## For creating the StorageClass automatically: +# For creating the StorageClass automatically: storageClass: create: true - ## Set StorageClass as the default StorageClass - ## Ignored if storageClass.create is false + # Set StorageClass as the default StorageClass + # Ignored if storageClass.create is false defaultClass: ###ZARF_VAR_IS_DEFAULT_STORAGECLASS### - ## The default volume type this storage class creates, can be "local" or "hostPath" + # The default volume type this storage class creates, can be "local" or "hostPath" defaultVolumeType: ###ZARF_VAR_VOLUME_TYPE### - ## Set a StorageClass name - ## Ignored if storageClass.create is false + # Set a StorageClass name + # Ignored if storageClass.create is false name: local-path - ## ReclaimPolicy field of the class, which can be either Delete or Retain + # ReclaimPolicy field of the class, which can be either Delete or Retain reclaimPolicy: Retain - ## volumeBindingMode field controls when volume binding and dynamic provisioning should occur, can be "Immediate" or "WaitForFirstConsumer" + # volumeBindingMode field controls when volume binding and dynamic provisioning should occur, can be "Immediate" or "WaitForFirstConsumer" volumeBindingMode: WaitForFirstConsumer podAnnotations: {} @@ -97,11 +100,11 @@ configmap: sudo rm -rf "$VOL_DIR" -# Number of provisioner worker threads to call provision/delete simultaneously. +# # Number of provisioner worker threads to call provision/delete simultaneously. # workerThreads: 4 -# Number of retries of failed volume provisioning. 0 means retry indefinitely. +# # Number of retries of failed volume provisioning. 0 means retry indefinitely. # provisioningRetryCount: 15 -# Number of retries of failed volume deletion. 0 means retry indefinitely. +# # Number of retries of failed volume deletion. 0 means retry indefinitely. # deletionRetryCount: 15 diff --git a/packages/local-path/values/node-path-map-values.yaml b/packages/local-path/values/node-path-map-values.yaml deleted file mode 100644 index ee831613c..000000000 --- a/packages/local-path/values/node-path-map-values.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# nodePathMap is the place user can customize where to store the data on each node. -# 1. If one node is not listed on the nodePathMap, and Kubernetes wants to create volume on it, the paths specified in -# DEFAULT_PATH_FOR_NON_LISTED_NODES will be used for provisioning. -# 2. If one node is listed on the nodePathMap, the specified paths will be used for provisioning. -# 1. If one node is listed but with paths set to [], the provisioner will refuse to provision on this node. -# 2. If more than one path was specified, the path would be chosen randomly when provisioning. -# 3. The configuration must obey following rules: -# 1. A path must start with /, a.k.a an absolute path. -# 2. Root directory (/) is prohibited. -# 3. No duplicate paths allowed for one node. -# 4. No duplicate node allowed. -# 5. 
The path must not already be owned by a different system user -nodePathMapJson: - nodePathMap: - - node: DEFAULT_PATH_FOR_NON_LISTED_NODES - paths: - - "/opt/uds" \ No newline at end of file diff --git a/packages/local-path/values/storage-configuration-values.yaml b/packages/local-path/values/storage-configuration-values.yaml new file mode 100644 index 000000000..1e8508741 --- /dev/null +++ b/packages/local-path/values/storage-configuration-values.yaml @@ -0,0 +1,13 @@ +# See the UDS RKE2 repository's documentation, `docs/LOCAL-PATH.md` for more details on values overrides + +nodePathMapJson: + + # # Multi-node configuration, for multi-node filesystem definition + # # Adheres to nodeAffinity and accessMode provided by a deployment's PVC + # sharedFileSystemPath: "/opt/uds" + +# Node-level configuration, for per-node filesystem control + nodePathMap: + - node: DEFAULT_PATH_FOR_NON_LISTED_NODES + paths: + - "/opt/uds" diff --git a/packages/local-path/zarf-config.yaml b/packages/local-path/zarf-config.yaml index d04ae0b25..ae0568b03 100644 --- a/packages/local-path/zarf-config.yaml +++ b/packages/local-path/zarf-config.yaml @@ -5,4 +5,4 @@ package: version: "0.4.2" # x-release-please-end - node_path_map_values_file: values/node-path-map-values.yaml + storage_configuration_values_file: values/storage-configuration-values.yaml diff --git a/packages/local-path/zarf.yaml b/packages/local-path/zarf.yaml index 788f2cb80..51b9f6586 100644 --- a/packages/local-path/zarf.yaml +++ b/packages/local-path/zarf.yaml @@ -9,9 +9,9 @@ metadata: architecture: amd64 constants: - - name: NODE_PATH_MAP_VALUES_FILE + - name: STORAGE_CONFIGURATION_VALUES_FILE description: "The Node Path Map values to be used for defining volume locations per node" - value: "###ZARF_PKG_TMPL_NODE_PATH_MAP_VALUES_FILE###" + value: "###ZARF_PKG_TMPL_STORAGE_CONFIGURATION_VALUES_FILE###" variables: - name: IS_DEFAULT_STORAGECLASS @@ -48,7 +48,7 @@ components: localPath: charts valuesFiles: - values/local-path-provisioner-values.yaml - - "###ZARF_PKG_TMPL_NODE_PATH_MAP_VALUES_FILE###" + - "###ZARF_PKG_TMPL_STORAGE_CONFIGURATION_VALUES_FILE###" actions: onDeploy: after: diff --git a/packages/longhorn/values/default-values.yaml b/packages/longhorn/values/longhorn-values.yaml similarity index 100% rename from packages/longhorn/values/default-values.yaml rename to packages/longhorn/values/longhorn-values.yaml diff --git a/packages/minio/values/minio-values.yaml b/packages/minio/values/minio-values.yaml index dcd6ae75b..654b13ffa 100644 --- a/packages/minio/values/minio-values.yaml +++ b/packages/minio/values/minio-values.yaml @@ -1,6 +1,6 @@ # Mirrors MinIO helm release v5.0.14 image: - repository: registry1.dso.mil/ironbank/opensource/minio/minio + repository: ###ZARF_REGISTRY###/ironbank/opensource/minio/minio # TODO: renovate setup tag: RELEASE.2023-09-30T07-02-29Z pullPolicy: IfNotPresent @@ -10,7 +10,7 @@ imagePullSecrets: # Mirrors MinIO helm release v5.0.14 mcImage: - repository: registry1.dso.mil/ironbank/opensource/minio/mc + repository: ###ZARF_REGISTRY###/ironbank/opensource/minio/mc # TODO: renovate setup tag: RELEASE.2023-09-29T16-41-22Z pullPolicy: IfNotPresent @@ -26,10 +26,18 @@ resources: persistence: size: ###ZARF_VAR_BUCKET_PERSISTENCE_SIZE### +## Node labels for pod assignment +## Ref: https://kubernetes.io/docs/user-guide/node-selection/ +nodeSelector: {} +tolerations: [] +affinity: {} + +topologySpreadConstraints: [] + buckets: - name: uds -# uds test user +# UDS default test user users: - accessKey: uds secretKey: 
uds-secret diff --git a/packages/nvidia-gpu-operator/values/node-feature-discovery-values.yaml b/packages/nvidia-gpu-operator/values/node-feature-discovery-values.yaml new file mode 100644 index 000000000..c7c70f7d9 --- /dev/null +++ b/packages/nvidia-gpu-operator/values/node-feature-discovery-values.yaml @@ -0,0 +1,14 @@ +image: + repository: registry1.dso.mil/ironbank/opensource/nfd/node-feature-discovery + # This should be set to 'IfNotPresent' for released version + pullPolicy: IfNotPresent + +# RKE2-specific configurations +worker: + securityContext: + privileged: true + allowPrivilegeEscalation: true +master: + securityContext: + privileged: true + allowPrivilegeEscalation: true diff --git a/packages/nvidia-gpu-operator/values/nvidia-gpu-operator-values.yaml b/packages/nvidia-gpu-operator/values/nvidia-gpu-operator-values.yaml new file mode 100644 index 000000000..a4ab291d8 --- /dev/null +++ b/packages/nvidia-gpu-operator/values/nvidia-gpu-operator-values.yaml @@ -0,0 +1,582 @@ +# Default values for gpu-operator. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +platform: + openshift: false + +nfd: + # usually enabled by default, but choose to use external NFD from IronBank + enabled: false + nodefeaturerules: false + +psa: + enabled: false + +cdi: + enabled: false + default: false + +sandboxWorkloads: + enabled: false + defaultWorkload: "container" + +hostPaths: + # rootFS represents the path to the root filesystem of the host. + # This is used by components that need to interact with the host filesystem + # and as such this must be a chroot-able filesystem. + # Examples include the MIG Manager and Toolkit Container which may need to + # stop, start, or restart systemd services + rootFS: "/" + + # driverInstallDir represents the root at which driver files including libraries, + # config files, and executables can be found. + driverInstallDir: "/run/nvidia/driver" + +daemonsets: + labels: {} + annotations: {} + priorityClassName: system-node-critical + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + # configuration for controlling update strategy("OnDelete" or "RollingUpdate") of GPU Operands + # note that driver Daemonset is always set with OnDelete to avoid unintended disruptions + updateStrategy: "RollingUpdate" + # configuration for controlling rolling update of GPU Operands + rollingUpdate: + # maximum number of nodes to simultaneously apply pod updates on. + # can be specified either as number or percentage of nodes. Default 1. 
+ maxUnavailable: "1" + +validator: + repository: registry1.dso.mil/ironbank/opensource/nvidia + image: gpu-operator-validator + version: v24.3.0 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + args: [] + resources: {} + plugin: + env: + - name: WITH_WORKLOAD + value: "false" + driver: + # RKE2-specific configurations + env: + - name: DISABLE_DEV_CHAR_SYMLINK_CREATION + value: "true" + - name: NVIDIA_VISIBLE_DEVICES + value: all + # Default value of "all" causes the "display" capability to also be considered; + # however, not all hosts have or allow that capability, causing the daemonset to fail + - name: NVIDIA_DRIVER_CAPABILITIES + value: compute,utility + +operator: + repository: registry1.dso.mil/ironbank/opensource/nvidia + image: gpu-operator + version: v24.3.0 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + priorityClassName: system-node-critical + # Explicitly set runtime to containerd, not the default of `docker` + defaultRuntime: containerd + runtimeClass: nvidia + use_ocp_driver_toolkit: false + # cleanup CRD on chart un-install + cleanupCRD: false + # upgrade CRD on chart upgrade, requires --disable-openapi-validation flag + # to be passed during helm upgrade. + upgradeCRD: false + initContainer: + image: cuda + repository: registry1.dso.mil/ironbank/opensource/nvidia + version: 12.4 + imagePullPolicy: IfNotPresent + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + annotations: + openshift.io/scc: restricted-readonly + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/master" + operator: In + values: [""] + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/control-plane" + operator: In + values: [""] + logging: + # Zap time encoding (one of 'epoch', 'millis', 'nano', 'iso8601', 'rfc3339' or 'rfc3339nano') + timeEncoding: epoch + # Zap Level to configure the verbosity of logging. Can be one of 'debug', 'info', 'error', or any integer value > 0 which corresponds to custom debug levels of increasing verbosity + level: info + # Development Mode defaults(encoder=consoleEncoder,logLevel=Debug,stackTraceLevel=Warn) + # Production Mode defaults(encoder=jsonEncoder,logLevel=Info,stackTraceLevel=Error) + develMode: true + resources: + limits: + cpu: 500m + memory: 350Mi + requests: + cpu: 200m + memory: 100Mi + +mig: + strategy: single + +driver: + # usually enabled by default, depends on deployment environment + enabled: false + nvidiaDriverCRD: + enabled: false + deployDefaultCR: true + driverType: gpu + nodeSelector: {} + useOpenKernelModules: false + # use pre-compiled packages for NVIDIA driver installation. + # only supported for as a tech-preview feature on ubuntu22.04 kernels. 
+ # there is no IronBank flavor for these containers + usePrecompiled: false + repository: nvcr.io/nvidia + image: driver + version: "550.90.07" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + startupProbe: + initialDelaySeconds: 60 + periodSeconds: 10 + # nvidia-smi can take longer than 30s in some cases + # ensure enough timeout is set + timeoutSeconds: 60 + failureThreshold: 120 + rdma: + enabled: false + useHostMofed: false + upgradePolicy: + # global switch for automatic upgrade feature + # if set to false all other options are ignored + autoUpgrade: true + # how many nodes can be upgraded in parallel + # 0 means no limit, all nodes will be upgraded in parallel + maxParallelUpgrades: 1 + # maximum number of nodes with the driver installed, that can be unavailable during + # the upgrade. Value can be an absolute number (ex: 5) or + # a percentage of total nodes at the start of upgrade (ex: + # 10%). Absolute number is calculated from percentage by rounding + # up. By default, a fixed value of 25% is used.' + maxUnavailable: 25% + # options for waiting on pod(job) completions + waitForCompletion: + timeoutSeconds: 0 + podSelector: "" + # options for gpu pod deletion + gpuPodDeletion: + force: false + timeoutSeconds: 300 + deleteEmptyDir: false + # options for node drain (`kubectl drain`) before the driver reload + # this is required only if default GPU pod deletions done by the operator + # are not sufficient to re-install the driver + drain: + enable: false + force: false + podSelector: "" + # It's recommended to set a timeout to avoid infinite drain in case non-fatal error keeps happening on retries + timeoutSeconds: 300 + deleteEmptyDir: false + manager: + image: k8s-driver-manager + repository: nvcr.io/nvidia/cloud-native + # When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4 + # to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0 + version: v0.6.9 + imagePullPolicy: IfNotPresent + env: + - name: ENABLE_GPU_POD_EVICTION + value: "true" + - name: ENABLE_AUTO_DRAIN + value: "false" + - name: DRAIN_USE_FORCE + value: "false" + - name: DRAIN_POD_SELECTOR_LABEL + value: "" + - name: DRAIN_TIMEOUT_SECONDS + value: "0s" + - name: DRAIN_DELETE_EMPTYDIR_DATA + value: "false" + env: [] + resources: {} + # Private mirror repository configuration + repoConfig: + configMapName: "" + # custom ssl key/certificate configuration + certConfig: + name: "" + # vGPU licensing configuration + licensingConfig: + configMapName: "" + nlsEnabled: true + # vGPU topology daemon configuration + virtualTopology: + config: "" + # kernel module configuration for NVIDIA driver + kernelModuleConfig: + name: "" + +toolkit: + # usually enabled by default, depends on deployment environment + enabled: false + # there is no IronBank flavor for these containers + repository: nvcr.io/nvidia/k8s + image: container-toolkit + version: v1.16.0-rc.1-ubuntu20.04 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: + # RKE2-specific configurations + - name: CONTAINERD_CONFIG + value: /var/lib/rancher/rke2/agent/etc/containerd/config.toml.tmpl + - name: CONTAINERD_SOCKET + value: /run/k3s/containerd/containerd.sock + - name: CONTAINERD_RUNTIME_CLASS + value: nvidia + - name: CONTAINERD_SET_AS_DEFAULT + value: "true" + resources: {} + installDir: "/usr/local/nvidia" + +devicePlugin: + enabled: true + repository: registry1.dso.mil/ironbank/opensource/nvidia + image: k8s-device-plugin + version: v0.15.1-ubi8 + imagePullPolicy: 
IfNotPresent + imagePullSecrets: [] + args: [] + env: + - name: PASS_DEVICE_SPECS + value: "true" + - name: FAIL_ON_INIT_ERROR + value: "true" + - name: DEVICE_LIST_STRATEGY + value: envvar + - name: DEVICE_ID_STRATEGY + value: uuid + - name: NVIDIA_VISIBLE_DEVICES + value: all + # Default value of "all" causes the "display" capability to also be considered; + # however, not all hosts have or allow that capability, causing the daemonset to fail + - name: NVIDIA_DRIVER_CAPABILITIES + value: compute,utility + resources: {} + # Plugin configuration + # Use "name" to either point to an existing ConfigMap or to create a new one with a list of configurations(i.e with create=true). + # Use "data" to build an integrated ConfigMap from a set of configurations as + # part of this helm chart. An example of setting "data" might be: + # config: + # name: device-plugin-config + # create: true + # data: + # default: |- + # version: v1 + # flags: + # migStrategy: none + # mig-single: |- + # version: v1 + # flags: + # migStrategy: single + # mig-mixed: |- + # version: v1 + # flags: + # migStrategy: mixed + config: + # Create a ConfigMap (default: false) + create: false + # ConfigMap name (either exiting or to create a new one with create=true above) + name: "" + # Default config name within the ConfigMap + default: "" + # Data section for the ConfigMap to create (i.e only applies when create=true) + data: {} + # MPS related configuration for the plugin + mps: + # MPS root path on the host + root: "/run/nvidia/mps" + +# standalone dcgm host engine +dcgm: + # disabled by default to use embedded nv-host engine by exporter + enabled: false + repository: nvcr.io/nvidia/cloud-native + image: dcgm + version: 3.3.6-1-ubuntu22.04 + imagePullPolicy: IfNotPresent + args: [] + env: [] + resources: {} + +dcgmExporter: + # TODO: re-enable and integrate with Prometheus + # disabled due to Registry1 image issues + enabled: false + repository: registry1.dso.mil/ironbank/opensource/nvidia + image: dcgm-exporter + version: 3.3.6-3.4.2 + imagePullPolicy: IfNotPresent + env: + - name: DCGM_EXPORTER_LISTEN + value: ":9400" + - name: DCGM_EXPORTER_KUBERNETES + value: "true" + - name: DCGM_EXPORTER_COLLECTORS + value: "/etc/dcgm-exporter/dcp-metrics-included.csv" + resources: {} + serviceMonitor: + enabled: false + interval: 15s + honorLabels: false + additionalLabels: {} + relabelings: [] + # - source_labels: + # - __meta_kubernetes_pod_node_name + # regex: (.*) + # target_label: instance + # replacement: $1 + # action: replace + +gfd: + enabled: true + repository: registry1.dso.mil/ironbank/opensource/nvidia + image: k8s-device-plugin + version: v0.15.1-ubi8 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: + - name: GFD_SLEEP_INTERVAL + value: 60s + - name: GFD_FAIL_ON_INIT_ERROR + value: "true" + resources: {} + +migManager: + # usually enabled by default, depends on deployment environment + enabled: false + # there is no IronBank flavor for these containers + repository: nvcr.io/nvidia/cloud-native + image: k8s-mig-manager + version: v0.8.0-rc.1-ubuntu20.04 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: + - name: WITH_REBOOT + value: "false" + resources: {} + config: + name: "default-mig-parted-config" + default: "all-disabled" + gpuClientsConfig: + name: "" + +nodeStatusExporter: + enabled: false + repository: nvcr.io/nvidia/cloud-native + image: gpu-operator-validator + # If version is not specified, then default is to use chart.AppVersion + #version: "" + imagePullPolicy: IfNotPresent + 
imagePullSecrets: [] + resources: {} + +gds: + enabled: false + repository: nvcr.io/nvidia/cloud-native + image: nvidia-fs + version: "2.17.5" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + args: [] + +gdrcopy: + enabled: false + repository: nvcr.io/nvidia/cloud-native + image: gdrdrv + version: "v2.4.1" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + args: [] + +vgpuManager: + enabled: false + repository: "" + image: vgpu-manager + version: "" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + resources: {} + driverManager: + image: k8s-driver-manager + repository: nvcr.io/nvidia/cloud-native + # When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4 + # to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0 + version: v0.6.9 + imagePullPolicy: IfNotPresent + env: + - name: ENABLE_GPU_POD_EVICTION + value: "false" + - name: ENABLE_AUTO_DRAIN + value: "false" + +vgpuDeviceManager: + # usually enabled by default, depends on deployment environment + enabled: false + # there is no IronBank flavor for these containers + repository: nvcr.io/nvidia/cloud-native + image: vgpu-device-manager + version: "v0.2.6" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + config: + name: "" + default: "default" + +vfioManager: + # usually enabled by default, depends on deployment environment + enabled: false + repository: nvcr.io/nvidia + image: cuda + version: 12.5.0-base-ubi8 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + resources: {} + driverManager: + # there is no IronBank flavor for these containers + image: k8s-driver-manager + repository: nvcr.io/nvidia/cloud-native + # When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4 + # to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0 + version: v0.6.9 + imagePullPolicy: IfNotPresent + env: + - name: ENABLE_GPU_POD_EVICTION + value: "false" + - name: ENABLE_AUTO_DRAIN + value: "false" + +kataManager: + enabled: false + config: + artifactsDir: "/opt/nvidia-gpu-operator/artifacts/runtimeclasses" + runtimeClasses: + - name: kata-nvidia-gpu + nodeSelector: {} + artifacts: + url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.54.03 + pullSecret: "" + - name: kata-nvidia-gpu-snp + nodeSelector: + "nvidia.com/cc.capable": "true" + artifacts: + url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.86.10-snp + pullSecret: "" + repository: nvcr.io/nvidia/cloud-native + image: k8s-kata-manager + version: v0.2.0 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + resources: {} + +sandboxDevicePlugin: + # usually enabled by default, depends on deployment environment + enabled: false + # there is no IronBank flavor for these containers + repository: nvcr.io/nvidia + image: kubevirt-gpu-device-plugin + version: v1.2.8 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + args: [] + env: [] + resources: {} + +ccManager: + enabled: false + defaultMode: "off" + repository: nvcr.io/nvidia/cloud-native + image: k8s-cc-manager + version: v0.1.1 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: + - name: CC_CAPABLE_DEVICE_IDS + value: "0x2339,0x2331,0x2330,0x2324,0x2322,0x233d" + resources: {} + +node-feature-discovery: + enableNodeFeatureApi: true + gc: + enable: true + replicaCount: 1 + serviceAccount: + name: node-feature-discovery + create: false + worker: + 
serviceAccount: + name: node-feature-discovery + # disable creation to avoid duplicate serviceaccount creation by master spec below + create: false + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + config: + sources: + pci: + deviceClassWhitelist: + - "02" + - "0200" + - "0207" + - "0300" + - "0302" + deviceLabelFields: + - vendor + master: + serviceAccount: + name: node-feature-discovery + create: true + config: + extraLabelNs: ["nvidia.com"] + # noPublish: false + # resourceLabels: ["nvidia.com/feature-1","nvidia.com/feature-2"] + # enableTaints: false + # labelWhiteList: "nvidia.com/gpu" diff --git a/packages/nvidia-gpu-operator/zarf-config.yaml b/packages/nvidia-gpu-operator/zarf-config.yaml new file mode 100644 index 000000000..aa29ab5f4 --- /dev/null +++ b/packages/nvidia-gpu-operator/zarf-config.yaml @@ -0,0 +1,6 @@ +package: + create: + set: + # x-release-please-start-version + version: "0.4.2" + # x-release-please-end diff --git a/packages/nvidia-gpu-operator/zarf.yaml b/packages/nvidia-gpu-operator/zarf.yaml new file mode 100644 index 000000000..dc29b4c78 --- /dev/null +++ b/packages/nvidia-gpu-operator/zarf.yaml @@ -0,0 +1,108 @@ +# TODO: renovate setup +# yaml-language-server: $schema=https://raw.githubusercontent.com/defenseunicorns/uds-cli/v0.10.4/zarf.schema.json + +kind: ZarfPackageConfig +metadata: + name: nvidia-gpu-operator + description: "Zarf package of NVIDIA's GPU Operator" + version: "###ZARF_PKG_TMPL_VERSION###" + architecture: amd64 + +components: + - name: node-feature-discovery + required: true + images: + # gpu-operator pre-requisite via https://github.com/NVIDIA/gpu-operator/blob/main/deployments/gpu-operator/Chart.yaml + - registry1.dso.mil/ironbank/opensource/nfd/node-feature-discovery:v0.15.4 + charts: + - name: node-feature-discovery + namespace: nvidia-gpu-operator + url: https://kubernetes-sigs.github.io/node-feature-discovery/charts + version: v0.15.4 + valuesFiles: + - values/node-feature-discovery-values.yaml + + - name: nvidia-gpu-operator + required: true + # TODO: renovate setup + images: + - registry1.dso.mil/ironbank/opensource/nvidia/gpu-operator:v24.3.0 + - registry1.dso.mil/ironbank/opensource/nvidia/gpu-operator-validator:v24.3.0 + - registry1.dso.mil/ironbank/opensource/nvidia/k8s-device-plugin:v0.15.1-ubi8 + - registry1.dso.mil/ironbank/opensource/nvidia/cuda:12.4 + charts: + - name: gpu-operator + url: https://helm.ngc.nvidia.com/nvidia + # TODO: renovate setup + version: v24.3.0 + namespace: nvidia-gpu-operator + valuesFiles: + - "values/nvidia-gpu-operator-values.yaml" + actions: + onDeploy: + after: + # The following onDeploy actions are due to an upstream Registry1 issue: https://repo1.dso.mil/dsop/opensource/nvidia/gpu-operator/-/issues/11 + - cmd: | + kubectl patch daemonset nvidia-operator-validator -n nvidia-gpu-operator --type='json' -p='[ + {"op": "add", "path": "/spec/template/spec/containers/0/securityContext/runAsUser", "value": 0} + ]' + kubectl patch daemonset nvidia-operator-validator -n nvidia-gpu-operator --type='json' -p='[ + {"op": "add", "path": "/spec/template/spec/initContainers/0/securityContext/runAsUser", "value": 0} + ]' + kubectl patch daemonset nvidia-operator-validator -n nvidia-gpu-operator --type='json' -p='[ + {"op": "add", "path": 
"/spec/template/spec/initContainers/1/securityContext/runAsUser", "value": 0} + ]' + kubectl patch daemonset nvidia-operator-validator -n nvidia-gpu-operator --type='json' -p='[ + {"op": "add", "path": "/spec/template/spec/initContainers/2/securityContext/runAsUser", "value": 0} + ]' + kubectl patch daemonset nvidia-operator-validator -n nvidia-gpu-operator --type='json' -p='[ + {"op": "add", "path": "/spec/template/spec/initContainers/3/securityContext/runAsUser", "value": 0} + ]' + description: "Patch securityContext in the nvidia-operator-validator" + maxTotalSeconds: 60 + - cmd: | + kubectl patch daemonset gpu-feature-discovery -n nvidia-gpu-operator --type='json' -p='[ + {"op": "add", "path": "/spec/template/spec/containers/0/securityContext/runAsUser", "value": 0} + ]' + kubectl patch daemonset gpu-feature-discovery -n nvidia-gpu-operator --type='json' -p='[ + {"op": "add", "path": "/spec/template/spec/initContainers/0/securityContext/runAsUser", "value": 0} + ]' + description: "Patch securityContext in the gpu-feature-discovery" + maxTotalSeconds: 60 + - cmd: | + kubectl patch daemonset nvidia-device-plugin-daemonset -n nvidia-gpu-operator --type='json' -p='[ + {"op": "add", "path": "/spec/template/spec/containers/0/securityContext/runAsUser", "value": 0} + ]' + kubectl patch daemonset nvidia-device-plugin-daemonset -n nvidia-gpu-operator --type='json' -p='[ + {"op": "add", "path": "/spec/template/spec/initContainers/0/securityContext/runAsUser", "value": 0} + ]' + description: "Patch securityContext in the nvidia-device-plugin-daemonset" + maxTotalSeconds: 60 + # Validate that all components are back up and running after the patches + - description: "Validate nvidia-operator-validator is up" + wait: + cluster: + kind: Pod + name: app=nvidia-operator-validator + namespace: nvidia-gpu-operator + condition: "'{.status.conditions[2].status}'=True" + maxTotalSeconds: 300 + - description: "Validate gpu-feature-discovery is up" + wait: + cluster: + kind: Pod + name: app=gpu-feature-discovery + namespace: nvidia-gpu-operator + condition: "'{.status.conditions[2].status}'=True" + maxTotalSeconds: 300 + - description: "Validate nvidia-device-plugin-daemonset is up" + wait: + cluster: + kind: Pod + name: app=nvidia-device-plugin-daemonset + namespace: nvidia-gpu-operator + condition: "'{.status.conditions[2].status}'=True" + maxTotalSeconds: 300 + onFailure: + - cmd: uds zarf tools kubectl describe nodes + description: "Attempt to provide node data for debugging after a failed deployment" diff --git a/packages/rook-ceph/values/multi-node-cluster-values.yaml b/packages/rook-ceph/values/multi-node-cluster-values.yaml index 2e7a4c35f..2dd8639d3 100644 --- a/packages/rook-ceph/values/multi-node-cluster-values.yaml +++ b/packages/rook-ceph/values/multi-node-cluster-values.yaml @@ -1,7 +1,5 @@ # See the UDS RKE2 repository's documentation, `docs/ROOK-CEPH.md` for more details on values overrides -# Please use UDS overrides to accomplish the definition of environment or node-specific configurations, like deviceFilters per node - imagePullSecrets: - name: private-registry diff --git a/packages/uds-rke2/exemptions/charts/nvidia-gpu-operator-exemptions.yaml b/packages/uds-rke2/exemptions/charts/nvidia-gpu-operator-exemptions.yaml new file mode 100644 index 000000000..23b0b42ff --- /dev/null +++ b/packages/uds-rke2/exemptions/charts/nvidia-gpu-operator-exemptions.yaml @@ -0,0 +1,21 @@ +apiVersion: uds.dev/v1alpha1 +kind: Exemption +metadata: + name: nvidia-gpu-operator + namespace: uds-policy-exemptions 
+spec: + exemptions: + - policies: + - DisallowHostNamespaces + - DisallowPrivileged + - RequireNonRootUser + - DropAllCapabilities + - RestrictHostPathWrite + - RestrictVolumeTypes + - RestrictCapabilities + - RestrictHostPorts + matcher: + namespace: "nvidia-gpu-operator" + name: ".*" + title: "all pods" + description: "Exempt NVIDIA GPU Operator from policy enforcement" diff --git a/packages/uds-rke2/exemptions/zarf.yaml b/packages/uds-rke2/exemptions/zarf.yaml index e0053c9a0..4db2fbdd6 100644 --- a/packages/uds-rke2/exemptions/zarf.yaml +++ b/packages/uds-rke2/exemptions/zarf.yaml @@ -17,12 +17,15 @@ components: namespace: uds-policy-exemptions files: - charts/uds-rke2-infrastructure-exemptions.yaml - actions: - onDeploy: - after: - - cmd: sleep 10 - # See https://github.com/defenseunicorns/uds-core/issues/409 - description: Workaround for upstream UDS Core Pepr exemption race condition + + - name: nvidia-gpu-operator-exemptions + description: "NVIDIA UDS Core Pepr policy exemptions" + required: false + manifests: + - name: nvidia-gpu-operator-exemptions + namespace: uds-policy-exemptions + files: + - charts/nvidia-gpu-operator-exemptions.yaml - name: local-path-exemptions description: "MetalLB UDS Core Pepr policy exemptions" @@ -34,12 +37,6 @@ components: namespace: uds-policy-exemptions files: - charts/local-path-exemptions.yaml - actions: - onDeploy: - after: - - cmd: sleep 10 - # See https://github.com/defenseunicorns/uds-core/issues/409 - description: Workaround for upstream UDS Core Pepr exemption race condition - name: longhorn-exemptions description: "MetalLB UDS Core Pepr policy exemptions" @@ -51,12 +48,6 @@ components: namespace: uds-policy-exemptions files: - charts/longhorn-exemptions.yaml - actions: - onDeploy: - after: - - cmd: sleep 10 - # See https://github.com/defenseunicorns/uds-core/issues/409 - description: Workaround for upstream UDS Core Pepr exemption race condition - name: rook-ceph-exemptions description: "MetalLB UDS Core Pepr policy exemptions" @@ -68,9 +59,3 @@ components: namespace: uds-policy-exemptions files: - charts/rook-ceph-exemptions.yaml - actions: - onDeploy: - after: - - cmd: sleep 10 - # See https://github.com/defenseunicorns/uds-core/issues/409 - description: Workaround for upstream UDS Core Pepr exemption race condition diff --git a/packages/uds-rke2/infrastructure/values/metallb-l2-values.yaml b/packages/uds-rke2/infrastructure/values/metallb-l2-values.yaml index 8d8f8926b..1525e7cd7 100644 --- a/packages/uds-rke2/infrastructure/values/metallb-l2-values.yaml +++ b/packages/uds-rke2/infrastructure/values/metallb-l2-values.yaml @@ -1,5 +1,5 @@ -ipAddressPool: +ipAddressPool: - ###ZARF_VAR_BASE_IP###.###ZARF_VAR_ADDRESS_POOL_LOWER_BOUND###-###ZARF_VAR_BASE_IP###.###ZARF_VAR_ADDRESS_POOL_UPPER_BOUND### -interface: - - ###ZARF_VAR_INTERFACE### +interface: + - ###ZARF_VAR_NETWORK_INTERFACE### diff --git a/packages/uds-rke2/infrastructure/zarf.yaml b/packages/uds-rke2/infrastructure/zarf.yaml index 1fa068a65..550b5bdb1 100644 --- a/packages/uds-rke2/infrastructure/zarf.yaml +++ b/packages/uds-rke2/infrastructure/zarf.yaml @@ -9,9 +9,9 @@ metadata: version: "###ZARF_PKG_TMPL_VERSION###" variables: - - name: INTERFACE + - name: NETWORK_INTERFACE description: "The network interface name on which to perform MetalLB L2 advertisement" - default: "wlp0s20f3" + default: "" prompt: true - name: ADDRESS_POOL_LOWER_BOUND description: "Lower bound of the IP Address Pool range for L2 advertisement" @@ -19,7 +19,7 @@ variables: prompt: false - name: 
ADDRESS_POOL_UPPER_BOUND description: "Upper bound of the IP Address Pool range for L2 advertisement" - default: "215" + default: "209" prompt: false - name: BASE_IP description: "The host node's base IP" diff --git a/packages/uds-rke2/scripts/rke2-install.sh b/packages/uds-rke2/scripts/rke2-install.sh index 06b9847e7..e9302d99a 100644 --- a/packages/uds-rke2/scripts/rke2-install.sh +++ b/packages/uds-rke2/scripts/rke2-install.sh @@ -4,4 +4,4 @@ set -e # Run RKE2 install script - https://docs.rke2.io/install/airgap#rke2-installsh-script-install cd /root/uds-rke2-artifacts/install/ && chmod +x install.sh -INSTALL_RKE2_ARTIFACT_PATH=/root/uds-rke2-artifacts/install ./install.sh \ No newline at end of file +INSTALL_RKE2_ARTIFACT_PATH=/root/uds-rke2-artifacts/install INSTALL_RKE2_METHOD="tar" sh ./install.sh diff --git a/packages/uds-rke2/scripts/rke2-startup.sh b/packages/uds-rke2/scripts/rke2-startup.sh index 4e5b1aa70..8077c6403 100755 --- a/packages/uds-rke2/scripts/rke2-startup.sh +++ b/packages/uds-rke2/scripts/rke2-startup.sh @@ -97,6 +97,16 @@ version = 2 [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] SystemdCgroup = true + +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] + privileged_without_host_devices = false + runtime_engine = "" + runtime_root = "" + runtime_type = "io.containerd.runc.v2" + +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] + BinaryName = "/usr/bin/nvidia-container-runtime" + Runtime = "/usr/bin/nvidia-container-runtime" EOF # Start RKE2 @@ -139,7 +149,7 @@ chmod 0700 $DIR/pod-manifests chmod 0700 $DIR/etc find /var/lib/rancher/rke2 -maxdepth 1 -type f -name "*.kubeconfig" -exec chmod 0640 {} \; -find /var/lib/rancher/rke2 -maxdepth 1 -type f -name "*.crt" -exec chmod 0600 {} \; +find /var/lib/rancher/rke2 -maxdepth 1 -type f -name "*.cert" -exec chmod 0600 {} \; find /var/lib/rancher/rke2 -maxdepth 1 -type f -name "*.key" -exec chmod 0600 {} \; DIR=/var/lib/rancher/rke2/bin diff --git a/packages/uds-rke2/zarf.yaml b/packages/uds-rke2/zarf.yaml index 4b35aa6a7..39a6f7ba2 100644 --- a/packages/uds-rke2/zarf.yaml +++ b/packages/uds-rke2/zarf.yaml @@ -5,7 +5,7 @@ kind: ZarfPackageConfig metadata: name: uds-rke2 description: "UDS RKE2 cluster setup for servers and agents. WARNING: This capability requires root access for deployment." 
- url: https://github.com/justinthelaw/uds-rke2-sandbox + url: https://github.com/justinthelaw/uds-rke2 architecture: amd64 version: "###ZARF_PKG_TMPL_VERSION###" @@ -15,10 +15,6 @@ constants: value: "###ZARF_PKG_TMPL_RKE2_VERSION###" variables: - - name: CLUSTER_NAME - description: "Name of the cluster" - default: "uds-prod" - prompt: true - name: JOIN_TOKEN description: "Cluster joining token" prompt: true diff --git a/tasks.yaml b/tasks.yaml index 6656fbf68..b762615bf 100644 --- a/tasks.yaml +++ b/tasks.yaml @@ -15,45 +15,70 @@ variables: default: "0.4.2" # x-release-please-end + # Explicitly set these environment variables to change the defaults + - name: JOIN_TOKEN + description: "Cluster join token for nodes, set by the master" + default: "my-test-token" + - name: NETWORK_INTERFACE + description: "Interface on which to perform L2 advertisement" + default: "wlp0s20f3" + - name: DOMAIN + description: "The default domain to use for gateway and CoreDNS" + default: "uds.local" + - name: BUCKET_PERSISTENCE_SIZE + description: "The default MinIO bucket size" + default: "50Gi" + tasks: - ################ - # E2E CI TESTING - ################ + ######### + # UTILITY + ######### - - name: uds-rke2-local-path-test - description: "Test the RKE2 cluster, with the Local Path Provisioner + MinIO Zarf Init" + - name: create-tls-local-path-dev + description: "Utility to re-create TLS cert and key, and inject them into the uds-config.yaml" actions: - - task: test:uds-rke2 - - task: test:local-path-minio-init + - task: create:tls + with: + flavor: local-path + bundleVersion: dev + certConfDir: bundles/dev - #################### - # BUNDLE DEPLOYMENTS - #################### + ########################### + # CREATE AND DEPLOY BUNDLES + ########################### - name: uds-rke2-local-path-core description: "Bootstrap a new RKE2 cluster, with the Local Path Provisioner + MinIO Zarf Init and UDS Core packages (LATEST)" actions: + - task: create:logical-volume + - task: create:local-path-core-bundle + + # TLS certificates are pre-packaged with UDS Core for *.uds.dev, which is the default domain for LATEST + # Please see `docs/DNS-TLS.md` if the deployment environment requires different CA, TLS, etc. + - cmd: | + cp bundles/latest/local-path-core/uds-config.yaml build/packages/local-path/uds-config-${VERSION}.yaml + - task: deploy:local-path-core-bundle - name: uds-rke2-local-path-core-dev description: "Bootstrap a new RKE2 cluster, with the Local Path Provisioner + MinIO Zarf Init and UDS Core packages (DEV)" actions: - - task: release-packages-dev - - task: create:local-path-core-bundle-dev + - description: "Use the `dev` version to create all bundle packages" + cmd: sudo uds run create:local-path-core-bundle-packages --set VERSION=dev --log-level warn --no-progress -a amd64 - - description: "Use the `dev` version to run a deployment" - cmd: sudo uds run deploy:local-path-core-bundle --set VERSION=dev --no-progress --no-log-file --log-level debug + - task: create:logical-volume - ########################## - # ALL PACKAGES AND BUNDLES - ########################## + # Automatically generated and signed CA and TLS certificates for *.uds.local + # Please see `docs/DNS-TLS.md` if the deployment environment requires different CA, TLS, etc. 
+ - task: create-tls-local-path-dev - - name: release - description: "Builds and publishes all Zarf packages and Bundles in the repository (LATEST)" - actions: - - task: release-packages - - task: release-bundles + - task: create:local-path-core-bundle-dev + - task: deploy:local-path-core-bundle-dev + + ############################## + # RELEASE PACKAGES AND BUNDLES + ############################## - name: release-dev description: "Builds and publishes all Zarf packages and Bundles in the repository (DEV)" @@ -61,56 +86,85 @@ tasks: - task: release-packages-dev - task: release-bundles-dev - ############## - # ALL PACKAGES - ############## + - name: release-packages-dev + description: "Builds and publishes all Zarf packages in the repository (DEV)" + actions: + - description: "Use the `dev` version to run a package release" + cmd: sudo uds run release-packages --set VERSION=dev --log-level warn --no-progress -a amd64 + + - name: release-bundles-dev + description: "Builds and publishes all UDS bundles in the repository (DEV)" + actions: + - task: create:local-path-core-bundle-dev + - description: "Use the `dev` version to run a bundle release" + cmd: sudo uds run publish:local-path-core-bundle --set VERSION=dev --log-level warn --no-progress -a amd64 + + ###################################### + # NOT FOR LOCAL USE, FOR PIPELINE ONLY + ###################################### + + - name: release + description: "Builds and publishes all Zarf packages and Bundles in the repository (LATEST)" + actions: + - task: release-packages + - task: release-bundles - # TODO: re-enable rook-ceph and longhorn when built - name: release-packages - description: "Builds and publishes all Zarf packages in the repository (LATEST)" + description: "Builds and publishes all Zarf packages in the repository WITHOUT cleaning artifacts (LATEST)" actions: + # Standard uds-rke2-local-path-core Zarf packages - task: create:uds-rke2 - task: publish:uds-rke2 - - task: setup:clean - task: create:uds-rke2-infrastructure - task: publish:uds-rke2-infrastructure - - task: setup:clean - task: create:uds-rke2-exemptions-local-path - task: publish:uds-rke2-exemptions-local-path - - task: setup:clean - task: create:local-path - task: publish:local-path - - task: setup:clean - task: create:local-path-init - task: publish:local-path-init - - task: setup:clean - task: create:minio - task: publish:minio - - task: setup:clean + + # Extra, optional Zarf packages - task: create:leapfrogai-workarounds - task: publish:leapfrogai-workarounds - - task: setup:clean + - task: create:nvidia-gpu-operator + - task: publish:nvidia-gpu-operator - - name: release-packages-dev - description: "Builds and publishes all Zarf packages in the repository (DEV)" + - name: release-packages-with-clean + description: "Builds and publishes all Zarf packages in the repository (LATEST)" actions: - - description: "Use the `dev` version to run a package release" - cmd: sudo uds run release-packages --set VERSION=dev --no-progress --no-log-file --log-level debug + # Standard uds-rke2-local-path-core Zarf packages + - task: create:uds-rke2 + - task: publish:uds-rke2 + - task: setup:clean-build-artifacts + - task: create:uds-rke2-infrastructure + - task: publish:uds-rke2-infrastructure + - task: setup:clean-build-artifacts + - task: create:uds-rke2-exemptions-local-path + - task: publish:uds-rke2-exemptions-local-path + - task: setup:clean-build-artifacts + - task: create:local-path + - task: publish:local-path + - task: setup:clean-build-artifacts + - task: 
create:local-path-init + - task: publish:local-path-init + - task: setup:clean-build-artifacts + - task: create:minio + - task: publish:minio + - task: setup:clean-build-artifacts - ############# - # ALL BUNDLES - ############# + # Extra, optional Zarf packages + - task: create:leapfrogai-workarounds + - task: publish:leapfrogai-workarounds + - task: setup:clean-build-artifacts + - task: create:nvidia-gpu-operator + - task: publish:nvidia-gpu-operator + - task: setup:clean-build-artifacts - name: release-bundles description: "Builds and publishes all UDS bundles in the repository (LATEST)" actions: - task: create:local-path-core-bundle - task: publish:local-path-core-bundle - - - name: release-bundles-dev - description: "Builds and publishes all UDS bundles in the repository (DEV)" - actions: - - task: create:local-path-core-bundle-dev - - - description: "Use the `dev` version to run a bundle release" - cmd: sudo uds run publish:local-path-core-bundle --set VERSION=dev --no-progress --no-log-file --log-level debug diff --git a/tasks/create.yaml b/tasks/create.yaml index a72442afa..e32bddbf0 100644 --- a/tasks/create.yaml +++ b/tasks/create.yaml @@ -10,43 +10,23 @@ variables: - name: CREATE_OPTIONS description: "Extra Zarf package creation options" - default: "--no-progress --no-log-file --log-level debug -a amd64 --confirm" + default: "--log-level warn --no-progress -a amd64 --confirm" - name: PULL_OPTIONS description: "Extra Zarf package pull options" - default: "--no-progress --no-log-file --log-level debug -a amd64" + default: "--log-level warn --no-progress -a amd64" - name: CREATE_BUNDLE_OPTIONS description: "Extra UDS bundle creation options" - default: "--no-progress --no-tea --no-log-file --log-level debug -a amd64 --confirm" + default: "--log-level warn --no-tea --no-progress --oci-concurrency 8 -a amd64 --confirm" - name: CORE_PKG_VERSION description: "The version of UDS Core package to deploy" # TODO: renovate setup - default: "0.22.0" + default: "0.23.0" - name: CORE_PKG_FLAVOR description: "Flavor of the package to use (`registry1` or `upstream`)." 
default: "registry1" tasks: - ########################## - # ALL PACKAGES AND BUNDLES - ########################## - - # TODO: re-enable rook-ceph and longhorn when built - - name: all - description: "Builds all Zarf packages in the repository" - actions: - # Zarf packages - - task: uds-rke2 - - task: uds-rke2-infrastructure - - task: uds-rke2-exemptions-local-path - - task: local-path - - task: local-path-init - - task: minio - - task: leapfrogai-workarounds - - # UDS bundles - - task: local-path-core-bundle - ########### # REUSEABLE ########### @@ -103,10 +83,13 @@ tasks: outputPath: description: "Output path to the UDS bundle being created" required: true + udsConfig: + description: "UDS configuration manifest for deployment" + required: true actions: - description: "Create the UDS bundle for the amd64 architectures" cmd: | - sudo uds create "${{ .inputs.path }}" \ + sudo UDS_CONFIG=${{ .inputs.udsConfig }} uds create "${{ .inputs.path }}" \ -o "${{ .inputs.outputPath }}" \ ${CREATE_BUNDLE_OPTIONS} @@ -114,10 +97,66 @@ tasks: # UTILITY ######### - - name: tls-cert - description: "Generate self-signed TLS certs for testing" + - name: logical-volume + description: "Creates the default logical volume location on a node for Local Path Provisioner PVs" actions: - - cmd: openssl req -x509 -newkey rsa:4096 -keyout tls.key -out tls.cert -days 365 -nodes -config tls-cert.conf + - description: "Create directory with parents, if it does not already exist" + cmd: | + sudo mkdir -p /opt/uds + sudo chown -Rv 65534:65534 /opt/uds + + - name: tls + description: "Generate and inject CA and self-sign TLS certs for build and testing purposes" + inputs: + flavor: + description: "Flavor of the custom Zarf Init package (local-path, rook-ceph, or longhorn)" + required: true + bundleVersion: + description: "Version of the bundle to create (dev or latest)" + required: true + certConfDir: + description: "TLS and CA configuration file directory to be used for cert and key creation" + required: true + actions: + - cmd: | + mkdir -p build/packages/${{ .inputs.flavor }}/certs/ + touch build/packages/${{ .inputs.flavor }}/uds-config-${{ .inputs.bundleVersion }}.yaml + + # Generate CA key and certificate + openssl genrsa -out build/packages/${{ .inputs.flavor }}/certs/ca.key 4096 + openssl req -x509 -new -nodes -key build/packages/${{ .inputs.flavor }}/certs/ca.key \ + -sha256 -days 1825 -out build/packages/${{ .inputs.flavor }}/certs/ca.pem \ + -config ${{ .inputs.certConfDir }}/ca.conf -extensions v3_ca + + # Create CA trust bundle for consumption in leapfrogai-workaround + cp build/packages/${{ .inputs.flavor }}/certs/ca.pem build/packages/${{ .inputs.flavor }}/certs/uds-rke2-custom-ca-cert.ca-bundle + + # Generate TLS key and CSR + openssl genrsa -out build/packages/${{ .inputs.flavor }}/certs/tls.key 4096 + openssl req -new -key build/packages/${{ .inputs.flavor }}/certs/tls.key \ + -out build/packages/${{ .inputs.flavor }}/certs/tls.csr \ + -config ${{ .inputs.certConfDir }}/tls.conf -extensions v3_req + + # Sign the CSR with the CA + openssl x509 -req -in build/packages/${{ .inputs.flavor }}/certs/tls.csr \ + -CA build/packages/${{ .inputs.flavor }}/certs/ca.pem \ + -CAkey build/packages/${{ .inputs.flavor }}/certs/ca.key \ + -CAcreateserial -out build/packages/${{ .inputs.flavor }}/certs/tls.cert \ + -days 365 -sha256 -extfile ${{ .inputs.certConfDir }}/tls.conf -extensions v3_req \ + -copy_extensions copy + + # Base64 encode the TLS certificate and key + base64 -w 0 build/packages/${{ .inputs.flavor 
}}/certs/tls.cert > build/packages/${{ .inputs.flavor }}/certs/tls.cert.base64 + base64 -w 0 build/packages/${{ .inputs.flavor }}/certs/tls.key > build/packages/${{ .inputs.flavor }}/certs/tls.key.base64 + + # Update the uds-config file + uds zarf tools yq eval-all ' + .variables.core.admin_tls_cert = "'"$(cat build/packages/${{ .inputs.flavor }}/certs/tls.cert.base64)"'" | + .variables.core.admin_tls_key = "'"$(cat build/packages/${{ .inputs.flavor }}/certs/tls.key.base64)"'" | + .variables.core.tenant_tls_cert = "'"$(cat build/packages/${{ .inputs.flavor }}/certs/tls.cert.base64)"'" | + .variables.core.tenant_tls_key = "'"$(cat build/packages/${{ .inputs.flavor }}/certs/tls.key.base64)"'" + ' bundles/${{ .inputs.bundleVersion }}/${{ .inputs.flavor }}-core/uds-config.yaml \ + > build/packages/${{ .inputs.flavor }}/uds-config-${{ .inputs.bundleVersion }}.yaml ######### # BUNDLES @@ -130,6 +169,7 @@ tasks: with: path: "bundles/latest/local-path-core" outputPath: "build/bundles" + udsConfig: "build/packages/local-path/uds-config-${VERSION}.yaml" - name: local-path-core-bundle-dev description: "Build the Local Path Provisioner UDS RKE2 bootstrapping bundle (DEV)" @@ -138,6 +178,7 @@ tasks: with: path: "bundles/dev/local-path-core" outputPath: "build/bundles" + udsConfig: "build/packages/local-path/uds-config-dev.yaml" ################### # STANDARD PACKAGES @@ -222,3 +263,31 @@ tasks: path: packages/leapfrogai outputPath: build/packages zarfConfig: packages/leapfrogai/zarf-config.yaml + + - name: nvidia-gpu-operator + description: "Build the NVIDIA GPU Operator package" + actions: + - description: "Create the Zarf package for the amd64 architectures" + cmd: | + sudo ZARF_CONFIG="packages/nvidia-gpu-operator/zarf-config.yaml" uds zarf package create "packages/nvidia-gpu-operator" \ + -o "build/packages" \ + --set VERSION=${VERSION} \ + ${CREATE_OPTIONS} + + ############## + # ALL PACKAGES + ############## + + - name: local-path-core-bundle-packages + description: "Builds all Zarf packages in the repository for local bundle development (DEV)" + actions: + # Standard uds-rke2-local-path-core Zarf packages + - task: uds-rke2 + - task: uds-rke2-infrastructure + - task: uds-rke2-exemptions-local-path + - task: local-path-init + - task: uds-core + + # Extra, optional Zarf packages + - task: leapfrogai-workarounds + - task: nvidia-gpu-operator diff --git a/tasks/deploy.yaml b/tasks/deploy.yaml index e09c232a0..e347e512b 100644 --- a/tasks/deploy.yaml +++ b/tasks/deploy.yaml @@ -8,12 +8,25 @@ variables: default: "0.4.2" # x-release-please-end + - name: JOIN_TOKEN + description: "Cluster join token for nodes, set by the master" + default: "my-test-token" + - name: NETWORK_INTERFACE + description: "Interface on which to perform L2 advertisement" + default: "wlp0s20f3" + - name: DOMAIN + description: "The default domain to use for gateway and CoreDNS" + default: "uds.local" + - name: BUCKET_PERSISTENCE_SIZE + description: "The default MinIO bucket size" + default: "50Gi" + - name: DEPLOY_OPTIONS description: "Extra Zarf package deployment options" - default: "--no-progress --no-log-file --log-level debug --confirm" + default: "--log-level warn --no-progress -a amd64 --confirm" - name: DEPLOY_BUNDLE_OPTIONS description: "Extra UDS bundle deployment options" - default: "--no-progress --no-tea --no-log-file --log-level debug -a amd64 --confirm" + default: "--log-level warn --no-tea --no-progress --oci-concurrency 8 -a amd64 --confirm" tasks: ########### @@ -29,10 +42,19 @@ tasks: name: description: "Name of 
the Zarf package being deployed" required: true + optionalComponents: + description: "Optional components to deploy" + required: false + default: "" + extraOptions: + description: "Extra deployment options specific to the Zarf Init package's configuration" + required: false actions: - description: "Deploy the Zarf package for amd64 architectures" cmd: | sudo uds zarf package deploy "${{ .inputs.path }}/zarf-package-${{ .inputs.name }}-amd64-${VERSION}.tar.zst" \ + ${{ .inputs.optionalComponents }} \ + ${{ .inputs.extraOptions }} \ ${DEPLOY_OPTIONS} - name: bundle @@ -44,14 +66,28 @@ tasks: name: description: "Name of the UDS bundle being deployed" required: true - extraOptions: - description: "Extra deployment options specific to the UDS bundle's configuration" - required: false + udsConfig: + description: "UDS configuration manifest for deployment" + required: true + version: + description: "UDS bundle version to be deployed" + required: true + joinToken: + description: "Cluster join token for nodes, set by the master" + required: true + networkInterface: + description: "Interface on which to perform L2 advertisement" + required: true + bucketPersistenceSize: + description: "The default MinIO bucket size" + required: true actions: - description: "Deploy the UDS bundle for the amd64 architectures" cmd: | - sudo uds deploy "${{ .inputs.path }}/uds-bundle-${{ .inputs.name }}-amd64-${VERSION}.tar.zst" \ - ${{ .inputs.extraOptions }} \ + sudo UDS_CONFIG=${{ .inputs.udsConfig }} uds deploy "${{ .inputs.path }}/uds-bundle-${{ .inputs.name }}-amd64-${{ .inputs.version }}.tar.zst" \ + --set JOIN_TOKEN=${{ .inputs.joinToken }} \ + --set NETWORK_INTERFACE=${{ .inputs.networkInterface }} \ + --set BUCKET_PERSISTENCE_SIZE=${{ .inputs.bucketPersistenceSize }} \ ${DEPLOY_BUNDLE_OPTIONS} ######### @@ -65,7 +101,24 @@ tasks: with: name: uds-rke2-local-path-core path: build/bundles - extraOptions: "--set JOIN_TOKEN=my-test-token" + udsConfig: build/packages/local-path/uds-config-${VERSION}.yaml + version: ${VERSION} + joinToken: ${JOIN_TOKEN} + networkInterface: ${NETWORK_INTERFACE} + bucketPersistenceSize: ${BUCKET_PERSISTENCE_SIZE} + + - name: local-path-core-bundle-dev + description: "Deploy the Local Path Provisioner UDS RKE2 bootstrapping bundle" + actions: + - task: bundle + with: + name: uds-rke2-local-path-core + path: build/bundles + udsConfig: build/packages/local-path/uds-config-dev.yaml + version: dev + joinToken: ${JOIN_TOKEN} + networkInterface: ${NETWORK_INTERFACE} + bucketPersistenceSize: ${BUCKET_PERSISTENCE_SIZE} ################### # STANDARD PACKAGES @@ -75,7 +128,7 @@ tasks: description: "Deploy the UDS RKE2 Zarf package, as a new cluster bootstrapping node" inputs: joinToken: - description: "Cluster joining token" + description: "Cluster join token for nodes, set by the master" required: true optionalComponents: description: "Optional components to deploy" @@ -93,7 +146,7 @@ tasks: description: "Deploy the UDS RKE2 Zarf package, as a cluster joining node" inputs: joinToken: - description: "Cluster joining token" + description: "Cluster join token for nodes, set by the master" required: true joinAddress: description: "IP address at which to join the node when an existing cluster exists at a different IP address" @@ -124,6 +177,8 @@ tasks: with: path: build/packages name: infrastructure + # Interface on which to perform L2 advertisement + extraOptions: "--set NETWORK_INTERFACE=${NETWORK_INTERFACE}" - name: uds-rke2-exemptions-local-path description: "Deploy the UDS RKE2 service Pepr 
policy exemptions package" @@ -133,6 +188,15 @@ tasks: path: build/packages/local-path name: exemptions + - name: uds-rke2-exemptions-local-path-nvidia + description: "Deploy the UDS RKE2 service Pepr policy exemptions package" + actions: + - task: deploy + with: + path: build/packages/local-path + name: exemptions + optionalComponents: "--components nvidia-gpu-operator-exemptions" + - name: minio description: "Deploy the MinIO package" actions: @@ -178,3 +242,12 @@ tasks: with: path: build/packages name: leapfrogai-workarounds + extraOptions: "--set DOMAIN=${DOMAIN}" + + - name: nvidia-gpu-operator + description: "Deploy the NVIDIA GPU Operator package" + actions: + - task: deploy + with: + path: build/packages + name: nvidia-gpu-operator diff --git a/tasks/publish.yaml b/tasks/publish.yaml index 49db33f7b..fbdc5865f 100644 --- a/tasks/publish.yaml +++ b/tasks/publish.yaml @@ -10,35 +10,15 @@ variables: - name: PUBLISH_OPTIONS description: "Extra Zarf package publishing options" - default: "--no-progress --no-log-file --log-level debug -a amd64" + default: "--no-progress --log-level warn -a amd64" - name: PUBLISH_BUNDLE_OPTIONS description: "Extra UDS bundle publishing options" - default: "--no-progress --no-tea --no-log-file --log-level debug -a amd64 --confirm" + default: "--no-progress --log-level warn --no-tea --oci-concurrency 8 -a amd64" - name: TARGET_REGISTRY description: "Target registry to publish to" default: "ghcr.io" tasks: - ########################## - # ALL PACKAGES AND BUNDLES - ########################## - - # TODO: re-enable rook-ceph and longhorn when built - - name: all - description: "Publishes all Zarf packages in the repository" - actions: - # Zarf packages - - task: uds-rke2 - - task: uds-rke2-infrastructure - - task: uds-rke2-exemptions-local-path - - task: local-path - - task: local-path-init - - task: minio - - task: leapfrogai-workarounds - - # UDS bundles - - task: local-path-core-bundle - ########### # REUSEABLE ########### @@ -98,7 +78,7 @@ tasks: sudo uds publish \ "${{ .inputs.path }}/uds-bundle-${{ .inputs.name }}-amd64-${VERSION}.tar.zst" \ "oci://${TARGET_REGISTRY}/${{ .inputs.targetRepo }}" \ - ${PUBLISH_OPTIONS} + ${PUBLISH_BUNDLE_OPTIONS} ######### # UTILITY @@ -200,3 +180,12 @@ tasks: path: build/packages name: leapfrogai-workarounds targetRepo: justinthelaw/packages/uds/uds-rke2/ + + - name: nvidia-gpu-operator + description: "Publish the NVIDIA GPU Operator package" + actions: + - task: publish + with: + path: build/packages + name: nvidia-gpu-operator + targetRepo: justinthelaw/packages/uds/uds-rke2/ diff --git a/tasks/setup.yaml b/tasks/setup.yaml index bb9e65ee0..d600fb751 100644 --- a/tasks/setup.yaml +++ b/tasks/setup.yaml @@ -18,24 +18,22 @@ tasks: cmd: | sudo uds zarf tools clear-cache && sudo rm -rf ~/.uds-cache && sudo rm -rf ~/.zarf-cache sudo rm -rf build/ zarf-sbom/ /tmp/zarf-* + sudo rm -rf bundles/dev/tls/ + sudo rm -rf bundles/latest/tls/ + + - name: clean-docker + description: "Cleans hanging Docker artifacts" + actions: + - description: "Remove the all Docker artifacts" + cmd: | + sudo docker system prune -a -f + sudo docker volume prune -f - name: clean description: "Cleans the host system of extraneous build artifacts, as well as hanging Docker artifacts" actions: - task: clean-build-artifacts - - description: "Remove the build folder, Docker artifacts, and clear UDS and Zarf caches" - cmd: sudo docker system prune -a -f && sudo docker volume prune -f - - - name: local-registry - description: "Create a local image registry for 
development" - actions: - - description: "Create a local Docker registry" - cmd: | - if [ -z "$(docker ps --format '{{.Names}}:{{.Ports}}' | grep '^registry:.*:5001->')" ]; then - docker run -d -p 5001:5000 --restart=always --name registry registry:2 - else - echo "Local registry already exists at port 5001, using the existing registry." - fi + - task: clean-docker - name: rook-ceph-destroy description: "Destroys an existing Rook-Ceph cluster installation and all associated data on the host node" diff --git a/tasks/test.yaml b/tasks/test.yaml index 2da163ca8..badc86c27 100644 --- a/tasks/test.yaml +++ b/tasks/test.yaml @@ -9,25 +9,24 @@ includes: variables: - name: VERSION description: "Explicitly set the version, overriding the official release tag" - # x-release-please-start-version - default: "0.4.2" - # x-release-please-end + default: "dev" tasks: - name: uds-rke2 - description: "Complete a stand-up and stand-down of ONLY the RKE2 cluster" + description: "Deploy a new RKE2 cluster" actions: + - task: setup:clean + - task: create:uds-rke2 - task: deploy:uds-rke2-bootstrap with: joinToken: "my-test-token" - - task: setup:clean - - name: local-path-minio-init - description: "Bootstrap a new RKE2 cluster, with the Local Path Provisioner + MinIO Zarf Init" + description: "Deploy the Local Path Provisioner + MinIO Zarf Init" actions: - # Zarf packages + - task: setup:clean + - task: create:local-path-init - task: deploy:local-path-init with: diff --git a/tests/cuda-vector-add.yaml b/tests/cuda-vector-add.yaml new file mode 100644 index 000000000..053e3dde0 --- /dev/null +++ b/tests/cuda-vector-add.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + runtimeClassName: nvidia + restartPolicy: Never + containers: + - name: cuda-container + image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1 + resources: + limits: + nvidia.com/gpu: "1" + cpu: "1" + memory: 0.5Gi + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/tests/device-query.yaml b/tests/device-query.yaml new file mode 100644 index 000000000..421ca95ad --- /dev/null +++ b/tests/device-query.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + runtimeClassName: nvidia + restartPolicy: Never + containers: + - name: cuda-container + image: nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda11.7.1-ubuntu20.04 + resources: + limits: + nvidia.com/gpu: "1" + cpu: "1" + memory: 0.5Gi + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/tls-cert.conf b/tls-cert.conf deleted file mode 100644 index 14b7ba7f6..000000000 --- a/tls-cert.conf +++ /dev/null @@ -1,16 +0,0 @@ -[req] -distinguished_name = req_distinguished_name -req_extensions = v3_req -prompt = no - -[req_distinguished_name] -CN = uds.dev - -[v3_req] -keyUsage = digitalSignature -extendedKeyUsage = serverAuth -subjectAltName = @alt_names - -[alt_names] -DNS = uds.dev -# Add more DNS entries or IP addresses as needed