From c88c6862c4ddd5d446e998141b2e42d263310905 Mon Sep 17 00:00:00 2001
From: abbas1902
Date: Thu, 31 Oct 2024 01:05:42 +0000
Subject: [PATCH] add future reservation support

---
 .../schedmd-slurm-gcp-v6-nodeset/README.md    |  1 +
 .../schedmd-slurm-gcp-v6-nodeset/main.tf      |  1 +
 .../schedmd-slurm-gcp-v6-nodeset/outputs.tf   | 15 ++++
 .../schedmd-slurm-gcp-v6-nodeset/variables.tf |  9 +++
 .../schedmd-slurm-gcp-v6-controller/README.md |  2 +-
 .../modules/slurm_files/scripts/resume.py     | 50 +++++++-----
 .../modules/slurm_files/scripts/slurmsync.py  | 31 ++++---
 .../slurm_files/scripts/tests/test_util.py    |  6 +-
 .../modules/slurm_files/scripts/util.py       | 80 ++++++++++++++++---
 .../partition.tf                              |  1 +
 .../variables.tf                              |  1 +
 11 files changed, 155 insertions(+), 42 deletions(-)

diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md
index bd339c262e..f727dc8fa2 100644
--- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md
@@ -178,6 +178,7 @@ No modules.
 | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support this option. | `bool` | `false` | no |
 | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no |
 | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no |
+| [future\_reservation](#input\_future\_reservation) | Allows for the use of future reservations. Input can be either the future reservation name or the full selfLink.
See https://cloud.google.com/compute/docs/instances/future-reservations-overview | `string` | `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-8-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no |

diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf
index ffaa9d4302..72b88b4c26 100644
--- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf
@@ -95,6 +95,7 @@ locals {
     spot               = var.enable_spot_vm
     termination_action = try(var.spot_instance_config.termination_action, null)
     reservation_name   = local.reservation_name
+    future_reservation = var.future_reservation
     maintenance_interval = var.maintenance_interval
 
     instance_properties_json = jsonencode(var.instance_properties)

diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf
index b957db13c1..b512b7aba2 100644
--- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf
@@ -54,4 +54,19 @@ output "nodeset" {
     condition     = !var.enable_placement || !var.dws_flex.enabled
     error_message = "Cannot use DWS Flex with `enable_placement`."
   }
+
+  precondition {
+    condition     = var.reservation_name == "" || var.future_reservation == null
+    error_message = "Cannot use reservations and future reservations in the same nodeset."
+  }
+
+  precondition {
+    condition     = var.node_count_dynamic_max == 0 || var.future_reservation == null
+    error_message = "Only static nodes can be used with future reservations."
+  }
+
+  precondition {
+    condition     = !var.enable_placement || var.future_reservation == null
+    error_message = "Cannot use `enable_placement` with future reservations."
+  }
 }

diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf
index 3bd8fc74fb..14198ef3ae 100644
--- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf
@@ -463,6 +463,15 @@ variable "reservation_name" {
   }
 }
 
+variable "future_reservation" {
+  description = <<-EOD
+    Allows for the use of future reservations. Input can be either the future reservation name or the full selfLink.
+    See https://cloud.google.com/compute/docs/instances/future-reservations-overview
+  EOD
+  type        = string
+  default     = null
+}
+
 variable "maintenance_interval" {
   description = <<-EOD
     Sets the maintenance interval for instances in this nodeset.

diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md
index eb2e5b2f86..b31094e39c 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md
@@ -313,7 +313,7 @@ limitations under the License.
 | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no |
 | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, true)
gpu = optional(object({
count = number
type = string
}))
dws_flex = object({
enabled = bool
max_run_duration = number
use_job_duration = bool
})
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, true)
gpu = optional(object({
count = number
type = string
}))
dws_flex = object({
enabled = bool
max_run_duration = number
use_job_duration = bool
})
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
future_reservation = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no |
 | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no |

diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py
index 31658cd96a..a3d9f8b628 100755
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py
@@ -77,24 +77,11 @@ def instance_properties(nodeset:object, model:str, placement_group:Optional[str]
         props.scheduling.onHostMaintenance = "TERMINATE"
         props.resourcePolicies = [placement_group]
 
-    if reservation := lookup().nodeset_reservation(nodeset):
-        props.reservationAffinity = {
-            "consumeReservationType": "SPECIFIC_RESERVATION",
-            "key": f"compute.{util.universe_domain()}/reservation-name",
-            "values": [reservation.bulk_insert_name],
-        }
+    if reservation := lookup().nodeset_reservation(nodeset.reservation_name, nodeset.zone_policy_allow):
+        update_reservation_props(reservation, props)
 
-        if reservation.policies:
-            props.scheduling.onHostMaintenance = "TERMINATE"
-            props.resourcePolicies = reservation.policies
-            log.info(
-                f"reservation {reservation.bulk_insert_name} is being used with policies {props.resourcePolicies}"
-            )
-        else:
-            props.resourcePolicies = []
-            log.info(
-                f"reservation {reservation.bulk_insert_name} is being used without any policies"
-            )
+    if nodeset.future_reservation:
+        use_future_reservation(props, nodeset)
 
     if nodeset.maintenance_interval:
         props.scheduling.maintenanceInterval = nodeset.maintenance_interval
@@ -104,9 +91,35 @@ def instance_properties(nodeset:object, model:str, placement_group:Optional[str]
 
     # Override with properties explicitly specified in the nodeset
     props.update(nodeset.get("instance_properties") or {})
-
+
     return props
 
+def update_reservation_props(reservation:Optional[object], props:object) -> None:
+    if not reservation:
+        return
+
+    props.reservationAffinity = {
+        "consumeReservationType": "SPECIFIC_RESERVATION",
+        "key": f"compute.{util.universe_domain()}/reservation-name",
+        "values": [reservation.bulk_insert_name],
+    }
+
+    if reservation.policies:
+        props.scheduling.onHostMaintenance = "TERMINATE"
+        props.resourcePolicies = reservation.policies
+        log.info(
+            f"reservation {reservation.bulk_insert_name} is being used with policies {props.resourcePolicies}"
+        )
+    else:
+        props.resourcePolicies = []
+        log.info(
+            f"reservation {reservation.bulk_insert_name} is being used without any policies"
+        )
+
+def use_future_reservation(props:object, nodeset:object) -> None:
+    # Only pin instances to the auto-created reservation once the future
+    # reservation window is open and specific targeting is required.
+    future_reservation = lookup().future_reservation(nodeset.future_reservation, nodeset.zone_policy_allow)
+    if future_reservation and future_reservation.nodesAreActive and future_reservation.specificReservationRequired:
+        update_reservation_props(future_reservation.activeReservation, props)
+
 def update_props_dws(props:object, dws_flex:object, job_id: Optional[int]) -> None:
     props.scheduling.onHostMaintenance = "TERMINATE"
     props.scheduling.instanceTerminationAction = "DELETE"
@@ -122,7 +135,6 @@ def dws_flex_duration(dws_flex:object, job_id: Optional[int]) -> int:
         log.info("Job TimeLimit cannot be less than 30 seconds or exceed 2 weeks")
     return max_duration
 
-
 def per_instance_properties(node):
     props = NSDict()
     # No properties beyond name are supported yet.
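For review context, the following is a minimal, runnable sketch (not part of the patch) of the decision that `use_future_reservation` and `update_reservation_props` implement together: instances are pinned to the reservation auto-created by a future reservation, but only once its window is open and specific targeting is required. The `Fake*` dataclasses and the reservation name are hypothetical stand-ins, and the `compute.googleapis.com` key assumes the default universe domain.

```python
# Sketch only -- mirrors the intent of use_future_reservation() above.
# FakeReservation/FakeFutureReservation are hypothetical stand-ins for
# util.ReservationDetails/util.FutureReservation; the resource name is
# a placeholder, not a real reservation.
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class FakeReservation:
    bulk_insert_name: str
    policies: List[str] = field(default_factory=list)

@dataclass
class FakeFutureReservation:
    nodesAreActive: bool
    specificReservationRequired: bool
    activeReservation: Optional[FakeReservation]

def reservation_affinity(fr: FakeFutureReservation) -> Optional[dict]:
    # No affinity unless the window is open, specific targeting is
    # required, and an auto-created reservation actually exists.
    if not (fr.nodesAreActive and fr.specificReservationRequired and fr.activeReservation):
        return None
    return {
        "consumeReservationType": "SPECIFIC_RESERVATION",
        "key": "compute.googleapis.com/reservation-name",  # default universe domain assumed
        "values": [fr.activeReservation.bulk_insert_name],
    }

if __name__ == "__main__":
    fr = FakeFutureReservation(
        nodesAreActive=True,
        specificReservationRequired=True,
        activeReservation=FakeReservation("projects/my-project/reservations/my-fr-auto"),
    )
    print(reservation_affinity(fr))
```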
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py
index 112e2d5748..c6e66688ae 100755
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py
@@ -15,18 +15,16 @@
 # limitations under the License.
 import argparse
-import datetime
 import fcntl
 import json
 import logging
 import re
 import sys
+from datetime import datetime, timedelta
 from enum import Enum
 from itertools import chain
 from pathlib import Path
 
 import yaml
-import datetime as dt
-from datetime import datetime
 from typing import Dict, Tuple
 
 import util
@@ -99,9 +97,8 @@ def _find_dynamic_node_status() -> NodeStatus:
     return NodeStatus.unchanged  # don't touch dynamic nodes
 
 
-def _find_tpu_node_status(nodename, state):
-    ns = lookup().node_nodeset(nodename)
-    tpuobj = TPU(ns)
+def _find_tpu_node_status(nodeset:object, nodename:str, state:object) -> NodeStatus:
+    tpuobj = TPU(nodeset)
     inst = tpuobj.get_node(nodename)
     # If we do not find the node but it is from a Tpu that has multiple vms look for the master node
     if inst is None and tpuobj.vmcount > 1:
@@ -159,21 +156,26 @@ def _find_tpu_node_status(nodename, state):
     return NodeStatus.unchanged
 
 
-def find_node_status(nodename):
+def find_node_status(nodename) -> NodeStatus:
     """Determine node/instance status that requires action"""
     state = lookup().slurm_node(nodename)
+    ns = lookup().node_nodeset(nodename)
 
     if lookup().node_is_dyn(nodename):
         return _find_dynamic_node_status()
 
     if lookup().node_is_tpu(nodename):
-        return _find_tpu_node_status(nodename, state)
+        return _find_tpu_node_status(ns, nodename, state)
 
     # split below is workaround for VMs whose hostname is FQDN
     inst = lookup().instance(nodename.split(".")[0])
     power_flags = frozenset(
         ("POWER_DOWN", "POWERING_UP", "POWERING_DOWN", "POWERED_DOWN")
     ) & (state.flags if state is not None else set())
+
+    if ns.future_reservation:
+        future_reservation_sync(ns, nodename, state)
+        return NodeStatus.unchanged
 
     if inst is None:
         if "POWERING_UP" in state.flags:
@@ -403,6 +405,17 @@ def sync_slurm():
     for status, nodes in node_statuses.items():
         do_node_update(status, nodes)
 
+
+def future_reservation_sync(nodeset:object, nodename:str, state:object) -> None:
+    if future_reservation := lookup().future_reservation(nodeset.future_reservation, nodeset.zone_policy_allow):
+        currentTime = datetime.utcnow()
+        if future_reservation.nodesAreActive is False and future_reservation.startTime > currentTime and state.base != "DOWN":
+            run(f"{lookup().scontrol} update nodename={nodename} state=down reason='Waiting for reservation to start at {future_reservation.startTime}'")
+
+        if future_reservation.nodesAreActive is False and future_reservation.endTime < currentTime and state.base != "DOWN":
+            run(f"{lookup().scontrol} update nodename={nodename} state=down reason='Reservation ended at {future_reservation.endTime}'")
+
+
@@ ... @@ def sync_maintenance_reservation(lkp: util.Lookup) -> None:
         if res_name in curr_reservation_map:
             diff = curr_reservation_map[res_name] - start_time
-            if abs(diff) <= dt.timedelta(seconds=1):
+            if abs(diff) <= timedelta(seconds=1):
                 continue
             else:
                 del_reservation.add(res_name)

diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py
index 14b7a7bf62..ba58c1fde3 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py
@@ -156,7 +156,7 @@ def test_nodeset_reservation_err(nodeset, err):
     lkp = util.Lookup(TstCfg())
     lkp._get_reservation = Mock()
     with pytest.raises(err):
-        lkp.nodeset_reservation(nodeset)
+        lkp.nodeset_reservation(nodeset.reservation_name, nodeset.zone_policy_allow)
     lkp._get_reservation.assert_not_called()
 
 @pytest.mark.parametrize(
@@ -201,14 +201,14 @@ def test_nodeset_reservation_ok(nodeset, policies, expected):
     lkp._get_reservation = Mock()
 
     if not expected:
-        assert lkp.nodeset_reservation(nodeset) is None
+        assert lkp.nodeset_reservation(nodeset.reservation_name, nodeset.zone_policy_allow) is None
         lkp._get_reservation.assert_not_called()
         return
 
     lkp._get_reservation.return_value = {
         "resourcePolicies": {i: p for i, p in enumerate(policies)},
     }
-    assert lkp.nodeset_reservation(nodeset) == expected
+    assert lkp.nodeset_reservation(nodeset.reservation_name, nodeset.zone_policy_allow) == expected
     lkp._get_reservation.assert_called_once_with(expected.project, expected.zone, expected.name)

diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py
index d046c27357..44c3e4f86c 100755
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py
@@ -19,7 +19,7 @@
 import base64
 import collections
 from dataclasses import dataclass
-from datetime import timedelta
+from datetime import timedelta, datetime
 import hashlib
 import inspect
 import json
@@ -1457,6 +1457,18 @@ class ReservationDetails:
     policies: List[str]  # names (not URLs) of resource policies
     bulk_insert_name: str  # name in format suitable for bulk insert (currently identical to user supplied name in long format)
 
+@dataclass
+class FutureReservation:
+    project: str
+    zone: str
+    name: str
+    nodesAreActive: bool
+    specificReservationRequired: Optional[bool]
+    startTime: Optional[datetime]
+    endTime: Optional[datetime]
+    activeReservation: Optional[ReservationDetails]
+
+
 @dataclass
 class Job:
     id: int
@@ -1758,20 +1770,26 @@
     def _get_reservation(self, project: str, zone: str, name: str) -> object:
         return self.compute.reservations().get(
             project=project, zone=zone, reservation=name).execute()
 
-    def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]:
-        if not nodeset.reservation_name:
-            return None
-
-        zones = list(nodeset.zone_policy_allow or [])
+    @lru_cache()
+    def _get_future_reservation(self, project:str, zone:str, name: str) -> object:
+        """See https://cloud.google.com/compute/docs/reference/rest/v1/futureReservations"""
+        return self.compute.futureReservations().get(project=project, zone=zone, futureReservation=name).execute()
+
+
+    def nodeset_reservation(self, reservation_name:str, zone_policy_allow: Optional[list]) -> Optional[ReservationDetails]:
+        if not reservation_name:
+            return None
+
+        zones = list(zone_policy_allow or [])
         assert len(zones) == 1, "Only single zone is supported if using a reservation"
         zone = zones[0]
 
         regex = re.compile(r'^projects/(?P<project>[^/]+)/reservations/(?P<reservation>[^/]+)(/.*)?$')
-        if not (match := regex.match(nodeset.reservation_name)):
+        if not (match := regex.match(reservation_name)):
             raise ValueError(
-                f"Invalid reservation name: '{nodeset.reservation_name}', expected format is 'projects/PROJECT/reservations/NAME'"
+                f"Invalid reservation name: '{reservation_name}', expected format is 'projects/PROJECT/reservations/NAME'"
             )
-
+
         project, name = match.group("project", "reservation")
         reservation = self._get_reservation(project, zone, name)
@@ -1784,7 +1802,49 @@
             zone=zone,
             name=name,
             policies=policies,
-            bulk_insert_name=nodeset.reservation_name)
+            bulk_insert_name=reservation_name)
+
+    def future_reservation(self, futureReservation:str, zone_policy_allow: Optional[list]) -> Optional[FutureReservation]:
+        zones = list(zone_policy_allow or [])
+        assert len(zones) == 1, "Only single zone is supported if using a reservation"
+        zone = zones[0]
+
+        project = self.project
+        name = futureReservation
+        nodesAreActive = False
+        specificReservationRequired = None
+        activeReservation = None
+        endTime = None
+        startTime = None
+
+        regex = re.compile(r'/futureReservations/(?P<name>[^/]+)(/.*)?$')
+        if match := regex.search(futureReservation):
+            name = match.group("name")
+
+        if reservation := self._get_future_reservation(project, zone, name):
+            # Timestamps arrive as RFC 3339 with a trailing "Z"; strip it since
+            # datetime.fromisoformat() rejects it before Python 3.11.
+            startString = reservation.get("timeWindow").get("startTime")
+            endString = reservation.get("timeWindow").get("endTime")
+            startTime = datetime.fromisoformat(startString[:len(startString)-1])
+            endTime = datetime.fromisoformat(endString[:len(endString)-1])
+
+            specificReservationRequired = reservation.get("specificReservationRequired")
+
+            if autoCreatedReservations := reservation.get("status").get("autoCreatedReservations"):
+                autoCreatedReservation = autoCreatedReservations[0]
+                reservationRegex = re.compile(r'/reservations/(?P<name>[^/]+)(/.*)?$')
+                if match := reservationRegex.search(autoCreatedReservation):
+                    activeReservation = self.nodeset_reservation(f"projects/{project}/reservations/{match.group('name')}", [zone])
+                    if activeReservation and startTime <= datetime.utcnow() <= endTime:
+                        nodesAreActive = True
+
+        return FutureReservation(
+            project=project,
+            zone=zone,
+            name=name,
+            nodesAreActive=nodesAreActive,
+            specificReservationRequired=specificReservationRequired,
+            startTime=startTime,
+            endTime=endTime,
+            activeReservation=activeReservation
+        )
 
     @lru_cache(maxsize=1)
     def machine_types(self):

diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf
index 6287399993..3240f94db6 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf
@@ -89,6 +89,7 @@ locals {
       node_count_static        = ns.node_count_static
       subnetwork               = ns.subnetwork_self_link
       reservation_name         = ns.reservation_name
+      future_reservation       = ns.future_reservation
       maintenance_interval     = ns.maintenance_interval
       instance_properties_json = ns.instance_properties_json
       enable_placement         = ns.enable_placement

diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf
index 1fc4fb1e0f..e9d77962a4 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf
@@ -277,6 +277,7 @@ variable "nodeset" {
     tags                     = optional(list(string), [])
     termination_action       = optional(string)
     reservation_name         = optional(string)
+    future_reservation       = optional(string)
     startup_script = optional(list(object({
       filename = string
       content = string })), [])
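As a closing usage note: below is a small, self-contained sketch of how the new `util.future_reservation()` interprets a `futureReservations.get` response. The response dict is hand-written to match only the fields the patch reads (`timeWindow`, `specificReservationRequired`, `status.autoCreatedReservations`); the project, zone, and reservation names are placeholders. The trailing-`Z` strip mirrors the patch, which targets Python versions where `datetime.fromisoformat()` rejects a `Z` suffix (before 3.11).

```python
# Sketch only -- hand-written response shaped like the futureReservations
# resource; all names are placeholders, not real resources.
from datetime import datetime

resp = {
    "specificReservationRequired": True,
    "timeWindow": {
        "startTime": "2024-11-01T00:00:00Z",
        "endTime": "2024-12-01T00:00:00Z",
    },
    "status": {
        "autoCreatedReservations": [
            "https://www.googleapis.com/compute/v1/projects/my-project/zones/us-central1-a/reservations/my-fr-auto"
        ]
    },
}

def parse_gcp_time(ts: str) -> datetime:
    # The patch strips the trailing "Z" because datetime.fromisoformat()
    # cannot parse it before Python 3.11.
    return datetime.fromisoformat(ts[:-1] if ts.endswith("Z") else ts)

start = parse_gcp_time(resp["timeWindow"]["startTime"])
end = parse_gcp_time(resp["timeWindow"]["endTime"])
has_auto = bool(resp["status"].get("autoCreatedReservations"))
# nodesAreActive: an auto-created reservation exists and the window is open now.
nodes_are_active = has_auto and start <= datetime.utcnow() <= end
print(f"window {start} .. {end}; nodesAreActive={nodes_are_active}")
```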