Merge pull request #3362 from mr0re1/res_pp
Untangle `exclusive` and `enable_placement`, update examples.
mr0re1 authored Jan 19, 2025
2 parents 81a8dc6 + ed17cc1 commit 8b711f6
Showing 29 changed files with 15 additions and 57 deletions.
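The practical effect of the untangling: example blueprints no longer set `enable_placement` by hand (the nodeset default of `true` is used), and a partition's `exclusive` setting is chosen independently of placement instead of being forced to match it. A minimal sketch of the resulting pattern, using module paths that already appear in the examples below; machine types and counts are illustrative:

  - id: compute_nodeset
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network]
    settings:
      node_count_dynamic_max: 20
      machine_type: c2-standard-60
      # enable_placement defaults to true; only disable it for spread VMs,
      # or to skip placement-policy cost with a non-dense reservation.

  - id: compute_partition
    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
    use: [compute_nodeset]
    settings:
      partition_name: compute
      # exclusive defaults to true and no longer has to track enable_placement.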
2 changes: 0 additions & 2 deletions community/examples/AMD/hpc-amd-slurm.yaml
@@ -178,7 +178,6 @@ deployment_groups:
machine_type: c2d-standard-4
node_count_dynamic_max: 10
bandwidth_tier: gvnic_enabled
- enable_placement: false
allow_automatic_updates: false

- id: low_cost_partition
@@ -194,7 +193,6 @@
machine_type: c2d-standard-112
node_count_dynamic_max: 50
bandwidth_tier: gvnic_enabled
- enable_placement: true
allow_automatic_updates: false

# Because is_default is set to true, jobs will run on this partition unless an
1 change: 0 additions & 1 deletion community/examples/hpc-slurm-sharedvpc.yaml
@@ -61,7 +61,6 @@
settings:
node_count_dynamic_max: 4
machine_type: n2-standard-2
- enable_placement: false # the default is: true
allow_automatic_updates: false

- id: debug_partition
1 change: 0 additions & 1 deletion community/examples/hpc-slurm-ubuntu2004.yaml
@@ -57,7 +57,6 @@
use: [network]
settings:
instance_image: $(vars.slurm_image)
- enable_placement: false # the default is: true
node_count_dynamic_max: 4
machine_type: n2-standard-2

4 changes: 0 additions & 4 deletions community/examples/htc-slurm.yaml
@@ -88,7 +88,6 @@
name: c2s60
node_count_dynamic_max: 200
bandwidth_tier: gvnic_enabled
- enable_placement: false
allow_automatic_updates: false

- id: compute_nodeset_c2s30
@@ -98,7 +97,6 @@
node_count_dynamic_max: 200
machine_type: c2-standard-30
bandwidth_tier: gvnic_enabled
- enable_placement: false
allow_automatic_updates: false

- id: compute_partition
@@ -122,7 +120,6 @@
machine_type: n2-standard-2
node_count_dynamic_max: 10
bandwidth_tier: gvnic_enabled
- enable_placement: false
allow_automatic_updates: false

- id: low_cost_nodeset_n2s4
@@ -133,7 +130,6 @@
machine_type: n2-standard-4
node_count_dynamic_max: 10
bandwidth_tier: gvnic_enabled
- enable_placement: false
allow_automatic_updates: false

- id: low_cost_partition
@@ -177,7 +177,7 @@ modules. For support with the underlying modules, see the instructions in the
| <a name="input_enable_maintenance_reservation"></a> [enable\_maintenance\_reservation](#input\_enable\_maintenance\_reservation) | Enables slurm reservation for scheduled maintenance. | `bool` | `false` | no |
| <a name="input_enable_opportunistic_maintenance"></a> [enable\_opportunistic\_maintenance](#input\_enable\_opportunistic\_maintenance) | On receiving maintenance notification, maintenance will be performed as soon as nodes becomes idle. | `bool` | `false` | no |
| <a name="input_enable_oslogin"></a> [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.<br/>See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no |
- | <a name="input_enable_placement"></a> [enable\_placement](#input\_enable\_placement) | Enable placement groups. | `bool` | `true` | no |
+ | <a name="input_enable_placement"></a> [enable\_placement](#input\_enable\_placement) | Use placement policy for VMs in this nodeset.<br/>See: https://cloud.google.com/compute/docs/instances/placement-policies-overview<br/>To set max\_distance of used policy, use `placement_max_distance` variable.<br/><br/>Enabled by default, reasons for users to disable it:<br/>- If non-dense reservation is used, user can avoid extra-cost of creating placement policies;<br/>- If user wants to avoid "all or nothing" VM provisioning behaviour;<br/>- If user wants to intentionally have "spread" VMs (e.g. for reliability reasons) | `bool` | `true` | no |
| <a name="input_enable_public_ips"></a> [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no |
| <a name="input_enable_shielded_vm"></a> [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no |
| <a name="input_enable_smt"></a> [enable\_smt](#input\_enable\_smt) | DEPRECATED: Use `advanced_machine_features.threads_per_core` instead. | `bool` | `null` | no |
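The expanded description above lists the cases where turning placement off still makes sense. As a rough blueprint-level illustration (nodeset id, machine type, and counts are placeholders), a nodeset meant to intentionally spread VMs could set:

  - id: spread_nodeset
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network]
    settings:
      machine_type: n2-standard-2
      node_count_dynamic_max: 10
      enable_placement: false  # spread VMs for reliability; no placement policy is created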
@@ -25,14 +25,6 @@ output "nodeset" {
error_message = "A disk_type=${var.disk_type} cannot be used with machine_type=${var.machine_type}."
}

-   precondition {
-     condition     = var.reservation_name == "" || !var.enable_placement
-     error_message = <<-EOD
-       If a reservation is specified, `var.enable_placement` must be `false`.
-       If the specified reservation has a placement policy then it will be used automatically.
-     EOD
-   }

precondition {
condition = var.reservation_name == "" || length(var.zones) == 0
error_message = <<-EOD
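Dropping this precondition means `reservation_name` and `enable_placement` are no longer mutually exclusive at plan time. A sketch of a reservation-backed nodeset that previously had to set `enable_placement: false` (reservation name and sizes are placeholders):

  - id: reserved_nodeset
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network]
    settings:
      reservation_name: my-reservation  # placeholder
      machine_type: c2-standard-60
      node_count_static: 4
      node_count_dynamic_max: 0
      # enable_placement may stay at its default; with a non-dense reservation it
      # can still be set to false to avoid creating a placement policy.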
@@ -375,7 +375,16 @@ variable "disable_public_ips" { # tflint-ignore: terraform_unused_declarations


variable "enable_placement" {
-   description = "Enable placement groups."
+   description = <<-EOD
+     Use placement policy for VMs in this nodeset.
+     See: https://cloud.google.com/compute/docs/instances/placement-policies-overview
+     To set max_distance of used policy, use `placement_max_distance` variable.
+
+     Enabled by default, reasons for users to disable it:
+     - If non-dense reservation is used, user can avoid extra-cost of creating placement policies;
+     - If user wants to avoid "all or nothing" VM provisioning behaviour;
+     - If user wants to intentionally have "spread" VMs (e.g. for reliability reasons)
+   EOD
type = bool
default = true
}
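Since the new description points to `placement_max_distance` for tuning the policy, a hedged sketch of that combination in a nodeset's settings (the value 2 is purely illustrative; supported values depend on machine type and topology):

  settings:
    enable_placement: true       # default; a compact placement policy is created
    placement_max_distance: 2    # illustrative; see the placement-policies documentation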
@@ -13,9 +13,8 @@
# limitations under the License.

locals {
- non_static_ns_with_placement = [for ns in var.nodeset : ns.nodeset_name if ns.enable_placement && ns.node_count_static == 0]
- use_static = [for ns in concat(var.nodeset, var.nodeset_tpu) : ns.nodeset_name if ns.node_count_static > 0]
- uses_job_duration = length([for ns in var.nodeset : ns.dws_flex.use_job_duration if ns.dws_flex.use_job_duration]) > 0 ? true : false
+ use_static = [for ns in concat(var.nodeset, var.nodeset_tpu) : ns.nodeset_name if ns.node_count_static > 0]
+ uses_job_duration = length([for ns in var.nodeset : ns.dws_flex.use_job_duration if ns.dws_flex.use_job_duration]) > 0 ? true : false

has_node = length(var.nodeset) > 0
has_dyn = length(var.nodeset_dyn) > 0
@@ -17,11 +17,6 @@ output "partitions" {

value = [local.partition]

-   precondition {
-     condition     = (length(local.non_static_ns_with_placement) == 0) || var.exclusive
-     error_message = "If any non-static nodesets has `enable_placement`, `var.exclusive` must be set true"
-   }

precondition {
condition = (length(local.use_static) == 0) || !var.exclusive
error_message = <<-EOD
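With this check removed, a partition can consume a placement-enabled, fully dynamic nodeset without being forced into `exclusive: true`. Illustrative only (ids and names are placeholders):

  - id: flex_partition
    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
    use: [compute_nodeset]  # a nodeset with enable_placement left at its default (true)
    settings:
      partition_name: flex
      exclusive: false      # previously rejected by the removed precondition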
@@ -178,7 +178,6 @@ def create_instances_request(nodes: List[str], placement_group: Optional[str], e
)

if placement_group:
-       assert len(nodes) <= PLACEMENT_MAX_CNT
pass # do not set minCount to force "all or nothing" behavior
else:
body["minCount"] = 1
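For context on the comment above: in a bulk-insert request, leaving `minCount` unset makes it default to the requested `count`, so the whole placement group is created or the request fails, while `minCount: 1` permits partial provisioning. A rough sketch of the relevant fields only (counts are illustrative):

  # nodes going into a placement group: minCount omitted -> all-or-nothing
  count: 8

  # nodes without a placement group: partial success allowed
  count: 8
  minCount: 1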
@@ -75,7 +75,6 @@
settings:
partition_name: debug
exclusive: false
- enable_placement: false
is_default: false

- id: compute_node_group
1 change: 0 additions & 1 deletion docs/tutorials/hpc-slurm-qwiklabs.yaml
@@ -39,7 +39,6 @@
settings:
node_count_dynamic_max: 4
machine_type: n2d-standard-2
- enable_placement: false # the default is: true

- id: debug_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
1 change: 0 additions & 1 deletion examples/hcls-blueprint.yaml
@@ -333,7 +333,6 @@
threads_per_core: null # Use platform default value
node_count_dynamic_max: 20
machine_type: g2-standard-4
- enable_placement: False
allow_automatic_updates: false

- id: gpu_partition
4 changes: 1 addition & 3 deletions examples/hpc-enterprise-slurm.yaml
@@ -118,7 +118,6 @@
node_count_dynamic_max: 4
machine_type: n2-standard-2
instance_image: $(vars.slurm_image)
- enable_placement: false # the default is: true
allow_automatic_updates: false

- id: n2_partition
@@ -138,7 +137,6 @@
node_count_dynamic_max: 20
machine_type: c2-standard-60 # this is the default
instance_image: $(vars.slurm_image)
- enable_placement: true
bandwidth_tier: tier_1_enabled
disk_type: pd-ssd
disk_size_gb: 100
@@ -152,7 +150,7 @@
settings:
partition_name: c2
# the following two are true by default
- exclusive: true # this must be true if nodeset.enable_placement is true
+ exclusive: true

- id: c2d_nodeset
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
1 change: 0 additions & 1 deletion examples/hpc-slurm-static.yaml
@@ -47,7 +47,6 @@
settings:
node_count_static: $(vars.static_node_count)
node_count_dynamic_max: 0
- enable_placement: false # placement is done on reservation
reservation_name: $(vars.static_reservation_name)
machine_type: $(vars.static_reservation_machine_type)
instance_image: $(vars.slurm_instance_image)
1 change: 0 additions & 1 deletion examples/hpc-slurm.yaml
@@ -55,7 +55,6 @@
settings:
node_count_dynamic_max: 4
machine_type: n2-standard-2
- enable_placement: false # the default is: true
allow_automatic_updates: false

- id: debug_partition
@@ -96,7 +96,6 @@
node_count_static: 0
node_count_dynamic_max: 4
machine_type: n2-standard-2
- enable_placement: false

- id: debug_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
@@ -182,7 +181,6 @@
settings:
reservation_name: $(vars.a3_reservation_name)
maintenance_interval: $(vars.a3_maintenance_interval)
- enable_placement: false
node_count_static: $(vars.a3_static_cluster_size)
node_count_dynamic_max: 0
disk_type: pd-ssd
@@ -21,7 +21,6 @@
deployment_name: a3mega-cluster
a3mega_partition_name: a3mega
a3mega_maintenance_interval: ""
- enable_placement: false
remote_mount_homefs: /nfsshare
local_mount_homefs: /home
instance_image_custom: true
1 change: 0 additions & 1 deletion examples/ml-slurm.yaml
@@ -208,7 +208,6 @@
use: [network]
settings:
node_count_dynamic_max: 20
- enable_placement: false
bandwidth_tier: gvnic_enabled
machine_type: g2-standard-4
instance_image: $(vars.new_image)
1 change: 0 additions & 1 deletion examples/ps-slurm.yaml
@@ -59,7 +59,6 @@
settings:
node_count_dynamic_max: 4
machine_type: $(vars.compute_node_machine_type)
- enable_placement: false # the default is: true
allow_automatic_updates: false

- id: debug_partition
2 changes: 0 additions & 2 deletions tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml
@@ -23,8 +23,6 @@
zone: us-central1-a
machine_type: n2-standard-2
disk_type: pd-ssd
- # enable_placement: false
- # on_host_maintenance: MIGRATE
num_nodes: 1
rocky_image:
family: slurm-gcp-6-8-hpc-rocky-linux-8
@@ -36,7 +36,6 @@
node_count_dynamic_max: 0
node_count_static: 5
allow_automatic_updates: false
- enable_placement: true

- id: partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
@@ -108,7 +108,6 @@
node_count_dynamic_max: 4
machine_type: n2-standard-2
instance_image: $(vars.slurm_image)
- enable_placement: false # the default is: true
allow_automatic_updates: false

- id: n2_partition
@@ -128,7 +127,6 @@
node_count_dynamic_max: 20
machine_type: c2-standard-60 # this is the default
instance_image: $(vars.slurm_image)
- enable_placement: true
bandwidth_tier: tier_1_enabled
disk_type: pd-ssd
disk_size_gb: 100
@@ -142,7 +140,7 @@
settings:
partition_name: c2
# the following two are true by default
- exclusive: true # this must be true if nodeset.enable_placement is true
+ exclusive: true

- id: c2d_nodeset
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
@@ -150,7 +150,6 @@
- compute_sa
settings:
allow_automatic_updates: false
- enable_placement: false
instance_image: ((var.slurm_image))
instance_image_custom: ((var.instance_image_custom))
labels: ((var.labels))
@@ -185,7 +184,6 @@
bandwidth_tier: tier_1_enabled
disk_size_gb: 100
disk_type: pd-ssd
- enable_placement: true
instance_image: ((var.slurm_image))
instance_image_custom: ((var.instance_image_custom))
labels: ((var.labels))
@@ -81,7 +81,6 @@ module "scratchfs" {
module "n2_nodeset" {
source = "github.com/GoogleCloudPlatform/cluster-toolkit//community/modules/compute/schedmd-slurm-gcp-v6-nodeset?ref=v1.38.0&depth=1"
allow_automatic_updates = false
- enable_placement = false
instance_image = var.slurm_image
instance_image_custom = var.instance_image_custom
labels = var.labels
@@ -112,7 +111,6 @@ module "c2_nodeset" {
bandwidth_tier = "tier_1_enabled"
disk_size_gb = 100
disk_type = "pd-ssd"
- enable_placement = true
instance_image = var.slurm_image
instance_image_custom = var.instance_image_custom
labels = var.labels
1 change: 0 additions & 1 deletion tools/validate_configs/test_configs/config-ssh.yaml
@@ -48,7 +48,6 @@
settings:
node_count_dynamic_max: 4
machine_type: n2-standard-2
- enable_placement: false # the default is: true

- id: debug_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
3 changes: 0 additions & 3 deletions tools/validate_configs/test_configs/gpu.yaml
@@ -139,7 +139,6 @@
use: [network_slurm]
settings:
name: nogpu
- enable_placement: false
node_count_dynamic_max: 4
machine_type: n2-standard-2

@@ -148,7 +147,6 @@
use: [network_slurm]
settings:
name: man
- enable_placement: false
node_count_dynamic_max: 4
machine_type: a2-ultragpu-2g
guest_accelerator:
@@ -160,7 +158,6 @@
use: [network_slurm]
settings:
name: auto
- enable_placement: false
node_count_dynamic_max: 4
machine_type: a2-ultragpu-2g

@@ -61,7 +61,6 @@
settings:
node_count_dynamic_max: 4
machine_type: n2-standard-2
- enable_placement: false

- id: debug_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
2 changes: 0 additions & 2 deletions tools/validate_configs/test_configs/zone-policies-slurm.yaml
@@ -48,7 +48,6 @@
node_count_dynamic_max: 4
machine_type: n2-standard-2
enable_public_ips: true
- enable_placement: false

- id: zonal_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
@@ -63,7 +62,6 @@
settings:
node_count_dynamic_max: 4
machine_type: n2-standard-2
- enable_placement: false
zones: $(vars.additional_zones)

- id: multizonal_partition