From 834d101215cf4a2a839a4e08d9a358bbd8797019 Mon Sep 17 00:00:00 2001 From: Jvst Me Date: Thu, 24 Oct 2024 12:41:35 +0200 Subject: [PATCH] Fix instance price discrepancies in RunPod - Deploy to Secure Cloud only, otherwise RunPod may choose any cloud type and prices won't match. This is a temporary measure until `dstack` can differentiate Secure Cloud and Community Cloud offers. - Set `bid_per_gpu` as the price of a single GPU, not the entire instance. --- .../_internal/core/backends/runpod/api_client.py | 4 ++-- src/dstack/_internal/core/backends/runpod/compute.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/dstack/_internal/core/backends/runpod/api_client.py b/src/dstack/_internal/core/backends/runpod/api_client.py index f538713c0..59d8e2004 100644 --- a/src/dstack/_internal/core/backends/runpod/api_client.py +++ b/src/dstack/_internal/core/backends/runpod/api_client.py @@ -31,7 +31,7 @@ def create_pod( name: str, image_name: str, gpu_type_id: str, - cloud_type: str = "ALL", + cloud_type: str, support_public_ip: bool = True, start_ssh: bool = True, data_center_id: Optional[str] = None, @@ -323,7 +323,7 @@ def generate_pod_deployment_mutation( name: str, image_name: str, gpu_type_id: str, - cloud_type: str = "ALL", + cloud_type: str, support_public_ip: bool = True, start_ssh: bool = True, data_center_id=None, diff --git a/src/dstack/_internal/core/backends/runpod/compute.py b/src/dstack/_internal/core/backends/runpod/compute.py index 7835fd946..9ab3df0ca 100644 --- a/src/dstack/_internal/core/backends/runpod/compute.py +++ b/src/dstack/_internal/core/backends/runpod/compute.py @@ -91,20 +91,25 @@ def run_job( container_registry_auth_id = self._generate_container_registry_auth_id( job.job_spec.registry_auth ) + gpu_count = len(instance_offer.instance.resources.gpus) + bid_per_gpu = None + if instance_offer.instance.resources.spot and gpu_count: + bid_per_gpu = instance_offer.price / gpu_count + resp = 
self.api_client.create_pod( name=instance_config.instance_name, image_name=job.job_spec.image_name, gpu_type_id=instance_offer.instance.name, - cloud_type="ALL", # ["ALL", "COMMUNITY", "SECURE"]: + cloud_type="SECURE", # ["ALL", "COMMUNITY", "SECURE"]: data_center_id=instance_offer.region, - gpu_count=len(instance_offer.instance.resources.gpus), + gpu_count=gpu_count, container_disk_in_gb=disk_size, min_vcpu_count=instance_offer.instance.resources.cpus, min_memory_in_gb=memory_size, support_public_ip=True, docker_args=_get_docker_args(authorized_keys), ports="10022/tcp", - bid_per_gpu=instance_offer.price if instance_offer.instance.resources.spot else None, + bid_per_gpu=bid_per_gpu, network_volume_id=network_volume_id, volume_mount_path=volume_mount_path, )