diff --git a/examples/ml-gke-ray.yaml b/examples/ml-gke-ray.yaml
new file mode 100644
index 0000000000..c040ec8192
--- /dev/null
+++ b/examples/ml-gke-ray.yaml
@@ -0,0 +1,74 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+blueprint_name: ml-gke-ray
+
+vars:
+  project_id:  ## Set GCP Project ID Here ##
+  deployment_name: ml-gke-ray-01
+  region: asia-southeast1
+  zones:
+  - asia-southeast1-b  # g2 machine has better availability in this zone
+  # CIDR block containing the IP of the machine calling terraform.
+  # Replace YOUR_IP_ADDRESS below; the example does not work until you do.
+  authorized_cidr: YOUR_IP_ADDRESS/32  # e.g. 203.0.113.7/32
+  gcp_public_cidrs_access_enabled: false
+
+deployment_groups:
+- group: primary
+  modules:
+  - id: network1
+    source: modules/network/vpc
+    settings:
+      subnetwork_name: $(vars.deployment_name)-subnet
+      secondary_ranges_list:
+      - subnetwork_name: $(vars.deployment_name)-subnet
+        ranges:
+        - range_name: pods
+          ip_cidr_range: 10.4.0.0/14
+        - range_name: services
+          ip_cidr_range: 10.0.32.0/20
+
+  - id: gke_service_account
+    source: community/modules/project/service-account
+    settings:
+      name: gke-sa
+      project_roles:
+      - logging.logWriter
+      - monitoring.metricWriter
+      - monitoring.viewer
+      - stackdriver.resourceMetadata.writer
+      - storage.objectViewer
+      - artifactregistry.reader
+
+  - id: gke_cluster
+    source: modules/scheduler/gke-cluster
+    use: [network1, gke_service_account]
+    settings:
+      enable_ray_operator: true  # Turns on the cluster's Ray operator addon
+      enable_private_endpoint: false  # Allows for access from authorized public IPs
+      gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled)
+      master_authorized_networks:
+      - display_name: deployment-machine
+        cidr_block: $(vars.authorized_cidr)
+      configure_workload_identity_sa: true
+    outputs: [instructions]
+
+  - id: g2_pool
+    source: modules/compute/gke-node-pool
+    use: [gke_cluster, gke_service_account]
+    settings:
+      disk_type: pd-balanced
+      machine_type: g2-standard-4
diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md
index fe668c3df7..8a2b7ec85e 100644
--- a/modules/scheduler/gke-cluster/README.md
+++ b/modules/scheduler/gke-cluster/README.md
@@ -151,6 +151,7 @@ limitations under the License.
 | [enable\_dataplane\_v2](#input\_enable\_dataplane\_v2) | Enables [Dataplane v2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2). This setting is immutable on clusters.
If null, will default to false unless using multi-networking, in which case it will default to true | `bool` | `null` | no |
 | [enable\_dcgm\_monitoring](#input\_enable\_dcgm\_monitoring) | Enable GKE to collect DCGM metrics | `bool` | `false` | no |
 | [enable\_filestore\_csi](#input\_enable\_filestore\_csi) | The status of the Filestore Container Storage Interface (CSI) driver addon, which allows the usage of filestore instance as volumes. | `bool` | `false` | no |
+| [enable\_ray\_operator](#input\_enable\_ray\_operator) | The status of the Ray operator addon. This feature enables Kubernetes APIs for managing and scaling Ray clusters and jobs. You control and are responsible for managing ray.io custom resources in your cluster. This feature is not compatible with GKE clusters that already have another Ray operator installed. Supports clusters on Kubernetes version 1.29.8-gke.1054000 or later. | `bool` | `false` | no |
 | [enable\_gcsfuse\_csi](#input\_enable\_gcsfuse\_csi) | The status of the GCSFuse Filestore Container Storage Interface (CSI) driver addon, which allows the usage of a gcs bucket as volumes. | `bool` | `false` | no |
 | [enable\_master\_global\_access](#input\_enable\_master\_global\_access) | Whether the cluster master is accessible globally (from any region) or only within the same region as the private endpoint. | `bool` | `false` | no |
 | [enable\_multi\_networking](#input\_enable\_multi\_networking) | Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) (Requires GKE Enterprise). This setting is immutable on clusters and enables [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en). If null, will determine state based on if additional\_networks are passed in. | `bool` | `null` | no |
diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf
index bdc16b1cbd..6954330e40 100644
--- a/modules/scheduler/gke-cluster/main.tf
+++ b/modules/scheduler/gke-cluster/main.tf
@@ -205,6 +205,9 @@ resource "google_container_cluster" "gke_cluster" {
     parallelstore_csi_driver_config {
       enabled = var.enable_parallelstore_csi
     }
+    ray_operator_config {
+      enabled = var.enable_ray_operator
+    }
   }
 
   timeouts {
diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf
index a181c58239..f5741a3cc6 100644
--- a/modules/scheduler/gke-cluster/variables.tf
+++ b/modules/scheduler/gke-cluster/variables.tf
@@ -145,6 +145,12 @@ variable "enable_parallelstore_csi" {
   default     = false
 }
 
+variable "enable_ray_operator" {
+  description = "The status of the Ray operator addon. This feature enables Kubernetes APIs for managing and scaling Ray clusters and jobs. You control and are responsible for managing ray.io custom resources in your cluster. This feature is not compatible with GKE clusters that already have another Ray operator installed. Supports clusters on Kubernetes version 1.29.8-gke.1054000 or later."
+  type        = bool
+  default     = false
+}
+
 variable "enable_dcgm_monitoring" {
   description = "Enable GKE to collect DCGM metrics"
   type        = bool