Skip to content

Commit

Permalink
Merge pull request #3573 from ankitkinra/hotfix-fix-container-gcluste…
Browse files Browse the repository at this point in the history
…r-kubectl

Revert PR 3406
  • Loading branch information
annuay-google authored Jan 22, 2025
2 parents 79299a1 + fcc2d8d commit a9dd634
Show file tree
Hide file tree
Showing 26 changed files with 120 additions and 170 deletions.
3 changes: 2 additions & 1 deletion community/modules/compute/gke-topology-scheduler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ No resources.
| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_gke_cluster_exists"></a> [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | `false` | no |
| <a name="input_cluster_id"></a> [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes |
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes |

## Outputs

Expand Down
3 changes: 2 additions & 1 deletion community/modules/compute/gke-topology-scheduler/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
module "kubectl_apply" {
source = "../../../../modules/management/kubectl-apply"

gke_cluster_exists = var.gke_cluster_exists
cluster_id = var.cluster_id
project_id = var.project_id

apply_manifests = [
{ source = "${path.module}/manifests/topology-scheduler-scripts.yaml" },
Expand Down
12 changes: 8 additions & 4 deletions community/modules/compute/gke-topology-scheduler/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.

variable "gke_cluster_exists" {
description = "A static flag that signals to modules that a cluster has been created."
type = bool
default = false
# Project that hosts the GKE cluster. This module does not use it directly;
# main.tf forwards it unchanged to the kubectl-apply module.
variable "project_id" {
description = "The project ID to host the cluster in."
type = string
}

# Full resource path of the target GKE cluster, in the format given by the
# description below. main.tf forwards it unchanged to the kubectl-apply module.
variable "cluster_id" {
description = "projects/{{project}}/locations/{{location}}/clusters/{{cluster}}"
type = string
}
1 change: 0 additions & 1 deletion modules/compute/gke-node-pool/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,6 @@ limitations under the License.
| <a name="input_disk_type"></a> [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no |
| <a name="input_enable_gcfs"></a> [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no |
| <a name="input_enable_secure_boot"></a> [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no |
| <a name="input_gke_cluster_exists"></a> [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | `false` | no |
| <a name="input_gke_version"></a> [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes |
| <a name="input_guest_accelerator"></a> [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. | <pre>list(object({<br/> type = optional(string)<br/> count = optional(number, 0)<br/> gpu_driver_installation_config = optional(object({<br/> gpu_driver_version = string<br/> }), { gpu_driver_version = "DEFAULT" })<br/> gpu_partition_size = optional(string)<br/> gpu_sharing_config = optional(object({<br/> gpu_sharing_strategy = string<br/> max_shared_clients_per_gpu = number<br/> }))<br/> }))</pre> | `[]` | no |
| <a name="input_host_maintenance_interval"></a> [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no |
Expand Down
3 changes: 2 additions & 1 deletion modules/compute/gke-node-pool/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,8 @@ resource "null_resource" "enable_tcpxo_in_workload" {
module "kubectl_apply" {
source = "../../management/kubectl-apply"

gke_cluster_exists = var.gke_cluster_exists
cluster_id = var.cluster_id
project_id = var.project_id

apply_manifests = flatten([
for manifest in local.gpu_direct_setting.gpu_direct_manifests : [
Expand Down
6 changes: 0 additions & 6 deletions modules/compute/gke-node-pool/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,6 @@ variable "cluster_id" {
type = string
}

variable "gke_cluster_exists" {
description = "A static flag that signals to modules that a cluster has been created."
type = bool
default = false
}

variable "zones" {
description = "A list of zones to be used. Zones must be in region of cluster. If null, cluster zones will be inherited. Note `zones` not `zone`; does not work with `zone` deployment variable."
type = list(string)
Expand Down
6 changes: 5 additions & 1 deletion modules/file-system/gke-persistent-volume/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,15 @@ limitations under the License.
| Name | Version |
|------|---------|
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.0 |
| <a name="requirement_google"></a> [google](#requirement\_google) | >= 4.42 |
| <a name="requirement_kubectl"></a> [kubectl](#requirement\_kubectl) | >= 1.7.0 |
| <a name="requirement_local"></a> [local](#requirement\_local) | >= 2.0.0 |

## Providers

| Name | Version |
|------|---------|
| <a name="provider_google"></a> [google](#provider\_google) | >= 4.42 |
| <a name="provider_kubectl"></a> [kubectl](#provider\_kubectl) | >= 1.7.0 |
| <a name="provider_local"></a> [local](#provider\_local) | >= 2.0.0 |

Expand All @@ -142,15 +144,17 @@ No modules.
| [kubectl_manifest.pv](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource |
| [kubectl_manifest.pvc](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource |
| [local_file.debug_file](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource |
| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source |
| [google_container_cluster.gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source |

## Inputs

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_capacity_gb"></a> [capacity\_gb](#input\_capacity\_gb) | The storage capacity with which to create the persistent volume. | `number` | n/a | yes |
| <a name="input_cluster_id"></a> [cluster\_id](#input\_cluster\_id) | An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}` | `string` | n/a | yes |
| <a name="input_filestore_id"></a> [filestore\_id](#input\_filestore\_id) | An identifier for a filestore with the format `projects/{{project}}/locations/{{location}}/instances/{{name}}`. | `string` | `null` | no |
| <a name="input_gcs_bucket_name"></a> [gcs\_bucket\_name](#input\_gcs\_bucket\_name) | The gcs bucket to be used with the persistent volume. | `string` | `null` | no |
| <a name="input_gke_cluster_exists"></a> [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | `false` | no |
| <a name="input_labels"></a> [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes |
| <a name="input_network_storage"></a> [network\_storage](#input\_network\_storage) | Network attached storage mount to be configured. | <pre>object({<br/> server_ip = string,<br/> remote_mount = string,<br/> local_mount = string,<br/> fs_type = string,<br/> mount_options = string,<br/> client_install_runner = map(string)<br/> mount_runner = map(string)<br/> })</pre> | n/a | yes |

Expand Down
18 changes: 17 additions & 1 deletion modules/file-system/gke-persistent-volume/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ locals {
capacity = "${var.capacity_gb}Gi"
}
)

cluster_name = split("/", var.cluster_id)[5]
cluster_location = split("/", var.cluster_id)[3]
}

resource "local_file" "debug_file" {
Expand All @@ -87,8 +90,21 @@ resource "local_file" "debug_file" {
filename = "${path.root}/pv-pvc-debug-file-${local.filestore_name}.yaml"
}

# Looks up the existing GKE cluster named by var.cluster_id so that its
# endpoint and CA certificate can be used to configure the kubectl provider.
data "google_container_cluster" "gke_cluster" {
  # var.cluster_id has the form
  # projects/{{project}}/locations/{{location}}/clusters/{{cluster}};
  # element 1 of the "/"-split is the project. Setting it explicitly keeps the
  # lookup from silently falling back to the provider's default project when
  # the cluster lives in a different one.
  project  = split("/", var.cluster_id)[1]
  name     = local.cluster_name
  location = local.cluster_location
}

# Supplies the access token that the kubectl provider below authenticates with.
data "google_client_config" "default" {}

# Configures the kubectl provider from the looked-up GKE cluster (endpoint and
# CA certificate) and the current Google access token.
# NOTE(review): a provider block declared inside a reusable module is the
# legacy pattern; per Terraform's module docs it prevents callers from using
# count / for_each / depends_on on this module - confirm this is acceptable.
provider "kubectl" {
host = "https://${data.google_container_cluster.gke_cluster.endpoint}"
cluster_ca_certificate = base64decode(data.google_container_cluster.gke_cluster.master_auth[0].cluster_ca_certificate)
token = data.google_client_config.default.access_token
# Use only the credentials above rather than a local kubeconfig file.
load_config_file = false
}

resource "kubectl_manifest" "pv" {
count = var.gke_cluster_exists ? 1 : 0
yaml_body = local.is_gcs ? local.gcs_pv_contents : local.filestore_pv_contents

lifecycle {
Expand Down
7 changes: 3 additions & 4 deletions modules/file-system/gke-persistent-volume/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,9 @@
* limitations under the License.
*/

variable "gke_cluster_exists" {
description = "A static flag that signals to modules that a cluster has been created."
type = bool
default = false
# Full resource path of the target GKE cluster. main.tf splits it on "/" and
# reads element 3 as the location and element 5 as the cluster name, so the
# value must follow the format in the description exactly.
variable "cluster_id" {
description = "An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}`"
type = string
}

variable "network_storage" {
Expand Down
4 changes: 4 additions & 0 deletions modules/file-system/gke-persistent-volume/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
terraform {
required_version = ">= 1.0"
required_providers {
google = {
source = "hashicorp/google"
version = ">= 4.42"
}
kubectl = {
source = "gavinbunney/kubectl"
version = ">= 1.7.0"
Expand Down
3 changes: 2 additions & 1 deletion modules/file-system/gke-storage/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,10 +109,11 @@ No resources.
|------|-------------|------|---------|:--------:|
| <a name="input_access_mode"></a> [access\_mode](#input\_access\_mode) | The access mode that the volume can be mounted to the host/pod. More details in [Access Modes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes)<br/>Valid access modes:<br/>- ReadWriteOnce<br/>- ReadOnlyMany<br/>- ReadWriteMany<br/>- ReadWriteOncePod | `string` | n/a | yes |
| <a name="input_capacity_gb"></a> [capacity\_gb](#input\_capacity\_gb) | The storage capacity with which to create the persistent volume. | `number` | n/a | yes |
| <a name="input_gke_cluster_exists"></a> [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | `false` | no |
| <a name="input_cluster_id"></a> [cluster\_id](#input\_cluster\_id) | An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}` | `string` | n/a | yes |
| <a name="input_labels"></a> [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes |
| <a name="input_mount_options"></a> [mount\_options](#input\_mount\_options) | Controls the mountOptions for dynamically provisioned PersistentVolumes of this storage class. | `string` | `null` | no |
| <a name="input_private_vpc_connection_peering"></a> [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection.<br/>If using new VPC, please use community/modules/network/private-service-access to create private-service-access and<br/>If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). | `string` | `null` | no |
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes |
| <a name="input_pv_mount_path"></a> [pv\_mount\_path](#input\_pv\_mount\_path) | Path within the container at which the volume should be mounted. Must not contain ':'. | `string` | `"/data"` | no |
| <a name="input_pvc_count"></a> [pvc\_count](#input\_pvc\_count) | How many PersistentVolumeClaims that will be created | `number` | `1` | no |
| <a name="input_sc_reclaim_policy"></a> [sc\_reclaim\_policy](#input\_sc\_reclaim\_policy) | Indicate whether to keep the dynamically provisioned PersistentVolumes of this storage class after the bound PersistentVolumeClaim is deleted.<br/>[More details about reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming)<br/>Supported value:<br/>- Retain<br/>- Delete | `string` | n/a | yes |
Expand Down
3 changes: 2 additions & 1 deletion modules/file-system/gke-storage/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ check "private_vpc_connection_peering" {
module "kubectl_apply" {
source = "../../management/kubectl-apply"

gke_cluster_exists = var.gke_cluster_exists
cluster_id = var.cluster_id
project_id = var.project_id

# count = var.pvc_count
apply_manifests = flatten(
Expand Down
12 changes: 8 additions & 4 deletions modules/file-system/gke-storage/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,14 @@
* limitations under the License.
*/

variable "gke_cluster_exists" {
description = "A static flag that signals to modules that a cluster has been created."
type = bool
default = false
# Project that hosts the GKE cluster; main.tf forwards it unchanged to the
# kubectl-apply module.
variable "project_id" {
description = "The project ID to host the cluster in."
type = string
}

# Full resource path of the target GKE cluster; main.tf forwards it unchanged
# to the kubectl-apply module.
variable "cluster_id" {
description = "An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}`"
type = string
}

variable "labels" {
Expand Down
8 changes: 7 additions & 1 deletion modules/management/kubectl-apply/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,15 @@ limitations under the License.
| Name | Version |
|------|---------|
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.3 |
| <a name="requirement_google"></a> [google](#requirement\_google) | > 5.0 |
| <a name="requirement_http"></a> [http](#requirement\_http) | ~> 3.0 |
| <a name="requirement_kubectl"></a> [kubectl](#requirement\_kubectl) | >= 1.7.0 |

## Providers

| Name | Version |
|------|---------|
| <a name="provider_google"></a> [google](#provider\_google) | > 5.0 |
| <a name="provider_terraform"></a> [terraform](#provider\_terraform) | n/a |

## Modules
Expand All @@ -124,15 +127,18 @@ limitations under the License.
|------|------|
| [terraform_data.jobset_validations](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource |
| [terraform_data.kueue_validations](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource |
| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source |
| [google_container_cluster.gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source |

## Inputs

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_apply_manifests"></a> [apply\_manifests](#input\_apply\_manifests) | A list of manifests to apply to GKE cluster using kubectl. For more details see [kubectl module's inputs](kubectl/README.md). | <pre>list(object({<br/> content = optional(string, null)<br/> source = optional(string, null)<br/> template_vars = optional(map(any), null)<br/> server_side_apply = optional(bool, false)<br/> wait_for_rollout = optional(bool, true)<br/> }))</pre> | `[]` | no |
| <a name="input_gke_cluster_exists"></a> [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | `false` | no |
| <a name="input_cluster_id"></a> [cluster\_id](#input\_cluster\_id) | An identifier for the gke cluster resource with format projects/<project\_id>/locations/<region>/clusters/<name>. | `string` | n/a | yes |
| <a name="input_jobset"></a> [jobset](#input\_jobset) | Install [Jobset](https://github.com/kubernetes-sigs/jobset) which manages a group of K8s [jobs](https://kubernetes.io/docs/concepts/workloads/controllers/job/) as a unit. | <pre>object({<br/> install = optional(bool, false)<br/> version = optional(string, "v0.5.2")<br/> })</pre> | `{}` | no |
| <a name="input_kueue"></a> [kueue](#input\_kueue) | Install and configure [Kueue](https://kueue.sigs.k8s.io/docs/overview/) workload scheduler. A configuration yaml/template file can be provided with config\_path to be applied right after kueue installation. If a template file provided, its variables can be set to config\_template\_vars. | <pre>object({<br/> install = optional(bool, false)<br/> version = optional(string, "v0.8.1")<br/> config_path = optional(string, null)<br/> config_template_vars = optional(map(any), null)<br/> })</pre> | `{}` | no |
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | The project ID that hosts the gke cluster. | `string` | n/a | yes |

## Outputs

Expand Down
30 changes: 22 additions & 8 deletions modules/management/kubectl-apply/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
*/

locals {
cluster_id_parts = split("/", var.cluster_id)
cluster_name = local.cluster_id_parts[5]
cluster_location = local.cluster_id_parts[3]
project_id = var.project_id != null ? var.project_id : local.cluster_id_parts[1]

apply_manifests_map = tomap({
for index, manifest in var.apply_manifests : index => manifest
})
Expand All @@ -25,8 +30,16 @@ locals {
jobset_install_source = format("${path.module}/manifests/jobset-%s.yaml", try(var.jobset.version, ""))
}

# Reads the existing GKE cluster identified by the project, name and location
# that locals derive from var.project_id / var.cluster_id. The kubectl provider
# configuration in providers.tf takes its endpoint and CA certificate from here.
data "google_container_cluster" "gke_cluster" {
project = local.project_id
name = local.cluster_name
location = local.cluster_location
}

# Supplies the access token that the kubectl provider authenticates with.
data "google_client_config" "default" {}

module "kubectl_apply_manifests" {
for_each = var.gke_cluster_exists ? local.apply_manifests_map : {}
for_each = local.apply_manifests_map
source = "./kubectl"

content = each.value.content
Expand All @@ -36,34 +49,34 @@ module "kubectl_apply_manifests" {
wait_for_rollout = each.value.wait_for_rollout

providers = {
http = http.h
kubectl = kubectl
http = http.h
}
}

module "install_kueue" {
count = var.gke_cluster_exists ? 1 : 0
source = "./kubectl"
source_path = local.install_kueue ? local.kueue_install_source : null
server_side_apply = true

providers = {
http = http.h
kubectl = kubectl
http = http.h
}
}

module "install_jobset" {
count = var.gke_cluster_exists ? 1 : 0
source = "./kubectl"
source_path = local.install_jobset ? local.jobset_install_source : null
server_side_apply = true

providers = {
http = http.h
kubectl = kubectl
http = http.h
}
}

module "configure_kueue" {
count = var.gke_cluster_exists ? 1 : 0
source = "./kubectl"
source_path = local.install_kueue ? try(var.kueue.config_path, "") : null
template_vars = local.install_kueue ? try(var.kueue.config_template_vars, null) : null
Expand All @@ -73,6 +86,7 @@ module "configure_kueue" {
wait_for_rollout = true

providers = {
http = http.h
kubectl = kubectl
http = http.h
}
}
8 changes: 8 additions & 0 deletions modules/management/kubectl-apply/providers.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@
* limitations under the License.
*/

# Configures the kubectl provider from the GKE cluster data source (endpoint
# and CA certificate) and the current Google access token.
# NOTE(review): a provider block declared inside a reusable module is the
# legacy pattern; per Terraform's module docs it prevents callers from using
# count / for_each / depends_on on this module - confirm this is acceptable.
provider "kubectl" {
host = "https://${data.google_container_cluster.gke_cluster.endpoint}"
token = data.google_client_config.default.access_token
cluster_ca_certificate = base64decode(data.google_container_cluster.gke_cluster.master_auth[0].cluster_ca_certificate)
# Use only the credentials above rather than a local kubeconfig file.
load_config_file = false
# Terraform may apply resources in parallel, leading to potential dependency
# issues. This retry mechanism ensures that if a resource's dependencies
# aren't ready, Terraform will attempt to apply it again.
apply_retry_count = 15
}

# Aliased http provider configuration. main.tf hands it to each ./kubectl
# submodule call through `providers = { http = http.h }`.
provider "http" {
alias = "h"
}
13 changes: 9 additions & 4 deletions modules/management/kubectl-apply/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,15 @@ resource "terraform_data" "jobset_validations" {
}
}

variable "gke_cluster_exists" {
description = "A static flag that signals to modules that a cluster has been created."
type = bool
default = false
# Project used when looking up the GKE cluster.
# NOTE(review): locals in main.tf fall back to the project segment of
# var.cluster_id when this value is null, but no default is declared here, so
# callers must still pass the argument (possibly as null) - confirm whether a
# `default = null` was intended.
variable "project_id" {
description = "The project ID that hosts the gke cluster."
type = string
}

# Full resource path of the target GKE cluster. locals in main.tf split it on
# "/" and read element 1 (project, fallback only), element 3 (location) and
# element 5 (cluster name), so the value must follow the documented format.
variable "cluster_id" {
description = "An identifier for the gke cluster resource with format projects/<project_id>/locations/<region>/clusters/<name>."
type = string
# A null value is rejected here because it could not be split into path segments.
nullable = false
}

variable "apply_manifests" {
Expand Down
8 changes: 8 additions & 0 deletions modules/management/kubectl-apply/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@

terraform {
required_providers {
google = {
source = "hashicorp/google"
version = "> 5.0"
}
kubectl = {
source = "gavinbunney/kubectl"
version = ">= 1.7.0"
}
http = {
source = "hashicorp/http"
version = "~> 3.0"
Expand Down
Loading

0 comments on commit a9dd634

Please sign in to comment.