From 596925a90a57aaa10e5e959c46a5ed07d5d758cd Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 16 Jan 2025 19:29:42 -0800 Subject: [PATCH] test: Update L0_io to work with only 2 GPUs instead of requiring 4 GPUs (#7947) --- qa/L0_io/gen_libtorch_model.py | 90 ++++++++++++++++++++++++++++++++++ qa/L0_io/test.sh | 6 +-- 2 files changed, 93 insertions(+), 3 deletions(-) create mode 100755 qa/L0_io/gen_libtorch_model.py diff --git a/qa/L0_io/gen_libtorch_model.py b/qa/L0_io/gen_libtorch_model.py new file mode 100755 index 0000000000..2e9b5abe87 --- /dev/null +++ b/qa/L0_io/gen_libtorch_model.py @@ -0,0 +1,90 @@ +#!/usr/bin/python +# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import torch +import torch.nn as nn + + +class SumModule(nn.Module): + def __init__(self, device): + super(SumModule, self).__init__() + self.device = device + + def forward(self, INPUT0, INPUT1): + INPUT0 = INPUT0.to(self.device) + INPUT1 = INPUT1.to(self.device) + print( + "SumModule - INPUT0 device: {}, INPUT1 device: {}\n".format( + INPUT0.device, INPUT1.device + ) + ) + return INPUT0 + INPUT1 + + +class DiffModule(nn.Module): + def __init__(self, device): + super(DiffModule, self).__init__() + self.device = device + + def forward(self, INPUT0, INPUT1): + INPUT0 = INPUT0.to(self.device) + INPUT1 = INPUT1.to(self.device) + print( + "DiffModule - INPUT0 device: {}, INPUT1 device: {}\n".format( + INPUT0.device, INPUT1.device + ) + ) + return INPUT0 - INPUT1 + + +class TestModel(nn.Module): + def __init__(self, device0, device1): + super(TestModel, self).__init__() + self.device0 = device0 + self.device1 = device1 + + self.layer1 = SumModule(self.device0) + self.layer2 = DiffModule(self.device1) + + def forward(self, INPUT0, INPUT1): + op0 = self.layer1(INPUT0, INPUT1) + op1 = self.layer2(INPUT0, INPUT1) + return op0, op1 + + +if torch.cuda.device_count() < 2: + print("Need at least 2 GPUs to run this test") + exit(1) + +devices = [("cuda:1", "cuda:0"), ("cpu", "cuda:1")] +model_names = ["libtorch_multi_gpu", "libtorch_multi_device"] + +for device_pair, model_name in zip(devices, model_names): + model = TestModel(device_pair[0], device_pair[1]) + model_path = "models/" + model_name + "/1/model.pt" + scripted_model = torch.jit.script(model) + scripted_model.save(model_path) diff --git a/qa/L0_io/test.sh b/qa/L0_io/test.sh index 85d45ad7d7..0207ce1696 100755 --- a/qa/L0_io/test.sh +++ b/qa/L0_io/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -38,7 +38,8 @@ if [ ! -z "$TEST_REPO_ARCH" ]; then REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} fi -export CUDA_VISIBLE_DEVICES=0,1,2,3 +# This test requires at least 2 GPUs to test h2d and d2d transfer combinations +export CUDA_VISIBLE_DEVICES=0,1 IO_TEST_UTIL=./memory_alloc CLIENT_LOG="./client.log" @@ -147,7 +148,6 @@ cp -r $ENSEMBLEDIR/nop_TYPE_FP32_-1 $MODELSDIR/. && \ # prepare libtorch multi-device and multi-gpu models cp -r ../L0_libtorch_instance_group_kind_model/models/libtorch_multi_device $MODELSDIR/. -cp ../L0_libtorch_instance_group_kind_model/gen_models.py ./gen_libtorch_model.py mkdir -p $MODELSDIR/libtorch_multi_device/1 mkdir -p $MODELSDIR/libtorch_multi_gpu/1 cp $MODELSDIR/libtorch_multi_device/config.pbtxt $MODELSDIR/libtorch_multi_gpu/.