From b18d453c50be4ef5b7a163e9c3d0ba3241813bf7 Mon Sep 17 00:00:00 2001 From: abbas1902 Date: Thu, 31 Oct 2024 01:05:42 +0000 Subject: [PATCH] Add future reservation support --- .../schedmd-slurm-gcp-v6-nodeset/README.md | 1 + .../schedmd-slurm-gcp-v6-nodeset/main.tf | 12 +++ .../schedmd-slurm-gcp-v6-nodeset/outputs.tf | 24 ++++++ .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 15 ++++ .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/slurm_files/scripts/resume.py | 49 +++++++----- .../modules/slurm_files/scripts/slurmsync.py | 23 +++++- .../modules/slurm_files/scripts/util.py | 76 ++++++++++++++++--- .../partition.tf | 1 + .../variables.tf | 1 + 10 files changed, 171 insertions(+), 33 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 98cb44be72..297c40bb7a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -179,6 +179,7 @@ No modules. | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | +| [future\_reservation](#input\_future\_reservation) | If set, will make use of the future reservation for the nodeset. Input can be either the future reservation name or its selfLink in the format 'projects/PROJECT\_ID/zones/ZONE/futureReservations/FUTURE\_RESERVATION\_NAME'.
See https://cloud.google.com/compute/docs/instances/future-reservations-overview | `string` | `""` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-8-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 3f283ffade..84cb60457a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -95,6 +95,7 @@ locals { spot = var.enable_spot_vm termination_action = try(var.spot_instance_config.termination_action, null) reservation_name = local.reservation_name + future_reservation = local.future_reservation maintenance_interval = var.maintenance_interval instance_properties_json = jsonencode(var.instance_properties) @@ -141,6 +142,17 @@ locals { reservation_name = local.res_match.whole == null ? "" : "${local.res_prefix}${local.res_short_name}${local.res_suffix}" } +locals { + fr_match = regex("^(?P<whole>projects/(?P<project>[a-z0-9-]+)/zones/(?P<zone>[a-z0-9-]+)/futureReservations/)?(?P<name>[a-z0-9-]+)?$", var.future_reservation) + + fr_name = local.fr_match.name + fr_project = coalesce(local.fr_match.project, var.project_id) + fr_zone = coalesce(local.fr_match.zone, var.zone) + + future_reservation = var.future_reservation == "" ? "" : "projects/${local.fr_project}/zones/${local.fr_zone}/futureReservations/${local.fr_name}" +} + + # tflint-ignore: terraform_unused_declarations data "google_compute_reservation" "reservation" { count = length(local.reservation_name) > 0 ? 1 : 0 diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf index b957db13c1..ad78840a38 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf @@ -54,4 +54,28 @@ output "nodeset" { condition = !var.enable_placement || !var.dws_flex.enabled error_message = "Cannot use DWS Flex with `enable_placement`." } + + precondition { + condition = var.reservation_name == "" || var.future_reservation == "" + error_message = "Cannot use reservations and future reservations in the same nodeset." + } + + precondition { + condition = !var.enable_placement || var.future_reservation == "" + error_message = "Cannot use `enable_placement` with future reservations." + } + + precondition { + condition = var.future_reservation == "" || length(var.zones) == 0 + error_message = <<-EOD + If a future reservation is specified, `var.zones` should be empty. + EOD + } + + precondition { + condition = var.future_reservation == "" || local.fr_zone == var.zone + error_message = <<-EOD + The zone of the deployment must match that of the future reservation. + EOD + } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index a2f8140d2b..3b7e342c32 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -463,6 +463,21 @@ variable "reservation_name" { } } +variable "future_reservation" { + description = <<-EOD + If set, will make use of the future reservation for the nodeset. Input can be either the future reservation name or its selfLink in the format 'projects/PROJECT_ID/zones/ZONE/futureReservations/FUTURE_RESERVATION_NAME'.
+ See https://cloud.google.com/compute/docs/instances/future-reservations-overview + EOD + type = string + default = "" + nullable = false + + validation { + condition = length(regexall("^(projects/([a-z0-9-]+)/zones/([a-z0-9-]+)/futureReservations/([a-z0-9-]+))?$", var.future_reservation)) > 0 || length(regexall("^([a-z0-9-]+)$", var.future_reservation)) > 0 + error_message = "Future reservation must be either the future reservation name or its selfLink in the format 'projects/PROJECT_ID/zones/ZONE/futureReservations/FUTURE_RESERVATION_NAME'." + } +} + variable "maintenance_interval" { description = <<-EOD Sets the maintenance interval for instances in this nodeset. diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 9b024e2117..f2dd5607f1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -336,7 +336,7 @@ limitations under the License. | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, false)
enable_opportunistic_maintenance = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
dws_flex = object({
enabled = bool
max_run_duration = number
use_job_duration = bool
})
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, false)
enable_opportunistic_maintenance = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
dws_flex = object({
enabled = bool
max_run_duration = number
use_job_duration = bool
})
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
future_reservation = string
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 9f84a6d186..9d607fb6af 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -102,24 +102,10 @@ def instance_properties(nodeset:object, model:str, placement_group:Optional[str] props.resourcePolicies = [placement_group] if reservation := lookup().nodeset_reservation(nodeset): - props.reservationAffinity = { - "consumeReservationType": "SPECIFIC_RESERVATION", - "key": f"compute.{util.universe_domain()}/reservation-name", - "values": [reservation.bulk_insert_name], - } - - if reservation.dense: - props.scheduling.provisioning_model = "RESERVATION_BOUND" + update_reservation_props(reservation, props, placement_group, False) - # Figure out `resourcePolicies` - if reservation.policies: # use ones already attached to reservations - props.resourcePolicies = reservation.policies - elif reservation.dense: # use once created by Slurm - props.resourcePolicies = [placement_group] - else: # vanilla reservations don't support external policies - props.resourcePolicies = [] - log.info( - f"reservation {reservation.bulk_insert_name} is being used with resourcePolicies: {props.resourcePolicies}") + if (fr := lookup().future_reservation(nodeset)) and fr.specific: + update_reservation_props(fr.active_reservation, props, placement_group, True) if props.resourcePolicies: props.scheduling.onHostMaintenance = "TERMINATE" @@ -132,9 +118,28 @@ def instance_properties(nodeset:object, model:str, placement_group:Optional[str] # Override with properties explicit specified in the nodeset props.update(nodeset.get("instance_properties") or {}) - return props +def update_reservation_props(reservation:object, props:object, placement_group:Optional[str], reservation_from_fr:bool) -> None: + props.reservationAffinity = { + "consumeReservationType": "SPECIFIC_RESERVATION", + "key": f"compute.{util.universe_domain()}/reservation-name", + "values": [reservation.bulk_insert_name], + } + + if reservation.dense or reservation_from_fr: + props.scheduling.provisioning_model = "RESERVATION_BOUND" + + # Figure out `resourcePolicies` + if reservation.policies: # use ones already attached to reservations + props.resourcePolicies = reservation.policies + elif reservation.dense and placement_group: # use once created by Slurm + props.resourcePolicies = [placement_group] + else: # vanilla reservations don't support external policies + props.resourcePolicies = [] + log.info( + f"reservation {reservation.bulk_insert_name} is being used with resourcePolicies: {props.resourcePolicies}") + def update_props_dws(props:object, dws_flex:object, job_id: Optional[int]) -> None: props.scheduling.onHostMaintenance = "TERMINATE" props.scheduling.instanceTerminationAction = "DELETE" @@ -150,7 +155,6 @@ def dws_flex_duration(dws_flex:object, job_id: Optional[int]) -> int: log.info("Job TimeLimit cannot be less than 30 seconds or exceed 2 weeks") return max_duration - def per_instance_properties(node): props = NSDict() # No properties beyond name are supported yet. 
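For orientation, the sketch below shows the scheduling shape that `update_reservation_props()` produces for a node consuming a specific reservation, including the auto-created reservation behind a future reservation. It is a minimal, self-contained illustration only: the `ReservationInfo` dataclass, the `build_scheduling_props` helper, and the example names (`my-proj`, `fr-auto-res`, `slurm-pg-0`) are hypothetical, and the reservation-affinity key is shown with the default `googleapis.com` universe domain, whereas the module derives it from `util.universe_domain()`.

```python
# Illustrative sketch (not part of the patch): how scheduling-related
# instance properties end up for a node backed by a specific (future) reservation.
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class ReservationInfo:
    bulk_insert_name: str            # e.g. "projects/my-proj/reservations/fr-auto-res"
    policies: List[str] = field(default_factory=list)
    dense: bool = False              # True for reservations with deploymentType == "DENSE"


def build_scheduling_props(res: ReservationInfo,
                           placement_group: Optional[str],
                           from_future_reservation: bool) -> dict:
    props: dict = {
        "reservationAffinity": {
            "consumeReservationType": "SPECIFIC_RESERVATION",
            "key": "compute.googleapis.com/reservation-name",
            "values": [res.bulk_insert_name],
        },
        "scheduling": {},
    }
    # Nodes consuming dense reservations or auto-created future-reservation
    # reservations are bound to them for their lifetime.
    if res.dense or from_future_reservation:
        props["scheduling"]["provisioningModel"] = "RESERVATION_BOUND"
    # Resource policies: prefer the ones already attached to the reservation,
    # fall back to a Slurm-created placement group for dense reservations,
    # and use none for vanilla reservations.
    if res.policies:
        props["resourcePolicies"] = res.policies
    elif res.dense and placement_group:
        props["resourcePolicies"] = [placement_group]
    else:
        props["resourcePolicies"] = []
    # Any attached resource policy implies TERMINATE on host maintenance.
    if props["resourcePolicies"]:
        props["scheduling"]["onHostMaintenance"] = "TERMINATE"
    return props


if __name__ == "__main__":
    res = ReservationInfo(bulk_insert_name="projects/my-proj/reservations/fr-auto-res", dense=True)
    print(build_scheduling_props(res, placement_group="slurm-pg-0", from_future_reservation=True))
```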
@@ -320,6 +324,13 @@ def start_tpu(data): def resume_nodes(nodes: List[str], resume_data: Optional[ResumeData]): """resume nodes in nodelist""" + # Prevent dormant nodes associated with a future reservation from being resumed + nodes, dormant_fr_nodes = util.separate(lookup().is_dormant_fr_node, nodes) + + if dormant_fr_nodes: + log.warning(f"Resume was unable to resume future reservation nodes={dormant_fr_nodes}") + down_nodes_notify_jobs(dormant_fr_nodes, "Reservation is not active, nodes cannot be resumed", resume_data) + if not nodes: log.info("No nodes to resume") return diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index e04f5bd1a2..d21211e8e7 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -15,18 +15,16 @@ # limitations under the License. import argparse -import datetime import fcntl import json import logging import re import sys import shlex +from datetime import datetime, timedelta from enum import Enum from itertools import chain from pathlib import Path -import datetime as dt -from datetime import datetime from dataclasses import dataclass from typing import Dict, Tuple, List, Optional, Protocol from functools import lru_cache @@ -36,6 +34,7 @@ batch_execute, ensure_execute, execute_with_futures, + FutureReservation, install_custom_scripts, run, separate, @@ -152,6 +151,17 @@ def _find_dynamic_node_status() -> NodeAction: # * delete orhpaned instances return NodeActionUnchanged() # don't touch dynamic nodes +def get_fr_action(fr: FutureReservation, nodename:str, state:NodeState) -> Optional[NodeAction]: + now = datetime.utcnow() + if fr.start_time < now < fr.end_time: + return None # handle like any other node + if state.base == "DOWN": + return NodeActionUnchanged() + if fr.start_time >= now: + msg = f"Waiting for reservation:{fr.name} to start at {fr.start_time}" + else: + msg = f"Reservation:{fr.name} is after its end-time" + return NodeActionDown(reason=msg) def _find_tpu_node_action(nodename, state) -> NodeAction: ns = lookup().node_nodeset(nodename) @@ -217,6 +227,11 @@ def get_node_action(nodename: str) -> NodeAction: """Determine node/instance status that requires action""" state = lookup().slurm_node(nodename) + if lookup().node_is_fr(nodename): + fr = lookup().future_reservation(lookup().node_nodeset(nodename)) + if action := get_fr_action(fr, nodename, state): + return action + if lookup().node_is_dyn(nodename): return _find_dynamic_node_status() @@ -502,7 +517,7 @@ def sync_maintenance_reservation(lkp: util.Lookup) -> None: if res_name in curr_reservation_map: diff = curr_reservation_map[res_name] - start_time - if abs(diff) <= dt.timedelta(seconds=1): + if abs(diff) <= timedelta(seconds=1): continue else: del_reservation.add(res_name) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 2b12defa9b..39d35482d5 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -19,7 +19,7 @@ import base64 import collections 
from dataclasses import dataclass -from datetime import timedelta +from datetime import timedelta, datetime import hashlib import inspect import json @@ -1462,6 +1462,17 @@ class ReservationDetails: def dense(self) -> bool: return self.deployment_type == "DENSE" +@dataclass(frozen=True) +class FutureReservation: + project: str + zone: str + name: str + specific: bool + start_time: datetime + end_time: datetime + active_reservation: Optional[ReservationDetails] + + @dataclass class Job: id: int @@ -1586,6 +1597,15 @@ def partition_is_tpu(self, part: str) -> bool: def node_is_tpu(self, node_name=None): nodeset_name = self.node_nodeset_name(node_name) return self.cfg.nodeset_tpu.get(nodeset_name) is not None + + def node_is_fr(self, node_name:str) -> bool: + return self.node_nodeset(node_name).future_reservation is not None + + def is_dormant_fr_node(self, node_name:str) -> bool: + fr = self.future_reservation(self.node_nodeset(node_name)) + if not fr: + return False + return fr.active_reservation is None def node_is_dyn(self, node_name=None) -> bool: nodeset = self.node_nodeset_name(node_name) @@ -1768,7 +1788,27 @@ def _get_reservation(self, project: str, zone: str, name: str) -> object: """See https://cloud.google.com/compute/docs/reference/rest/v1/reservations""" return self.compute.reservations().get( project=project, zone=zone, reservation=name).execute() + + @lru_cache() + def _get_future_reservation(self, project:str, zone:str, name: str) -> object: + """See https://cloud.google.com/compute/docs/reference/rest/v1/futureReservations""" + return self.compute.futureReservations().get(project=project, zone=zone, futureReservation=name).execute() + + def get_reservation_details(self, project:str, zone:str, name:str, bulk_insert_name:str) -> ReservationDetails: + reservation = self._get_reservation(project, zone, name) + + # Converts policy URLs to names, e.g.: + # projects/111111/regions/us-central1/resourcePolicies/zebra -> zebra + policies = [u.split("/")[-1] for u in reservation.get("resourcePolicies", {}).values()] + return ReservationDetails( + project=project, + zone=zone, + name=name, + policies=policies, + deployment_type=reservation.get("deploymentType"), + bulk_insert_name=bulk_insert_name) + def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]: if not nodeset.reservation_name: return None @@ -1784,19 +1824,37 @@ def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]: ) project, name = match.group("project", "reservation") - reservation = self._get_reservation(project, zone, name) + return self.get_reservation_details(project, zone, name, nodeset.reservation_name) + + def future_reservation(self, nodeset:object) -> Optional[FutureReservation]: + if not nodeset.future_reservation: + return None - # Converts policy URLs to names, e.g.: - # projects/111111/regions/us-central1/resourcePolicies/zebra -> zebra - policies = [u.split("/")[-1] for u in reservation.get("resourcePolicies", {}).values()] + active_reservation = None + match = re.search(r'^projects/(?P<project>[^/]+)/zones/(?P<zone>[^/]+)/futureReservations/(?P<name>[^/]+)(/.*)?$', nodeset.future_reservation) + project, zone, name = match.group("project","zone","name") + fr = self._get_future_reservation(project,zone,name) - return ReservationDetails( + # TODO: Remove this "hack" of trimming the Z from timestamps once we move to Python 3.11 (context: https://discuss.python.org/t/parse-z-timezone-suffix-in-datetime/2220/30) + start_time = datetime.fromisoformat(fr["timeWindow"]["startTime"][:-1]) + 
end_time = datetime.fromisoformat(fr["timeWindow"]["endTime"][:-1]) + + if "autoCreatedReservations" in fr["status"] and (fr_res:=fr["status"]["autoCreatedReservations"][0]): + if (start_time<=datetime.utcnow()<=end_time): + match = re.search(r'projects/(?P<project>[^/]+)/zones/(?P<zone>[^/]+)/reservations/(?P<name>[^/]+)(/.*)?$',fr_res) + res_name = match.group("name") + bulk_insert_name = f"projects/{project}/reservations/{res_name}" + active_reservation = self.get_reservation_details(project, zone, res_name, bulk_insert_name) + + return FutureReservation( project=project, zone=zone, name=name, - policies=policies, - deployment_type=reservation.get("deploymentType"), - bulk_insert_name=nodeset.reservation_name) + specific=fr["specificReservationRequired"], + start_time=start_time, + end_time=end_time, + active_reservation=active_reservation + ) @lru_cache(maxsize=1) def machine_types(self): diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 2a76ed7dca..3862e46e43 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -89,6 +89,7 @@ locals { node_count_static = ns.node_count_static subnetwork = ns.subnetwork_self_link reservation_name = ns.reservation_name + future_reservation = ns.future_reservation maintenance_interval = ns.maintenance_interval instance_properties_json = ns.instance_properties_json enable_placement = ns.enable_placement diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index b06d62b39f..6264576b2c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -278,6 +278,7 @@ variable "nodeset" { tags = optional(list(string), []) termination_action = optional(string) reservation_name = optional(string) + future_reservation = string startup_script = optional(list(object({ filename = string content = string })), [])
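For orientation, here is a minimal, self-contained sketch of how the future reservation's time window and auto-created reservation drive node handling, mirroring `slurmsync.get_fr_action()` and `util.Lookup.is_dormant_fr_node()` above. The `FutureReservationWindow` dataclass and the helper names are illustrative simplifications, not part of the patch.

```python
# Illustrative sketch (not part of the patch): nodes tied to a future
# reservation are kept DOWN outside its time window and are not resumed
# until the auto-created reservation actually exists.
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Optional


@dataclass(frozen=True)
class FutureReservationWindow:
    name: str
    start_time: datetime
    end_time: datetime
    has_active_reservation: bool  # an auto-created reservation currently exists


def node_down_reason(fr: FutureReservationWindow, now: Optional[datetime] = None) -> Optional[str]:
    """Return a reason to keep the node DOWN, or None to treat it like any other node."""
    now = now or datetime.utcnow()
    if fr.start_time < now < fr.end_time:
        return None  # inside the window: handle normally
    if fr.start_time >= now:
        return f"Waiting for reservation:{fr.name} to start at {fr.start_time}"
    return f"Reservation:{fr.name} is after its end-time"


def is_dormant(fr: FutureReservationWindow) -> bool:
    """Dormant nodes are excluded from resume until the reservation exists."""
    return not fr.has_active_reservation


if __name__ == "__main__":
    fr = FutureReservationWindow(
        name="fr-example",
        start_time=datetime.utcnow() + timedelta(days=1),
        end_time=datetime.utcnow() + timedelta(days=8),
        has_active_reservation=False,
    )
    print(node_down_reason(fr))  # -> "Waiting for reservation:fr-example to start at ..."
    print(is_dormant(fr))        # -> True
```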