From 55663e60174dff70ef59f066bcd3cd5b4da79700 Mon Sep 17 00:00:00 2001
From: marcojob <44396071+marcojob@users.noreply.github.com>
Date: Wed, 13 Nov 2024 16:14:51 +0100
Subject: [PATCH] ci: Fix GPU usage

---
 .github/workflows/ci.yml |   4 ++
 src/interface.cpp        | 110 ++++++++++++++++++++++++++++++++++++---
 2 files changed, 107 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e33f4433..f98d5604 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -6,6 +6,8 @@ jobs:
     runs-on: self-hosted
     container:
       image: omavteam/v4l2_camera:latest
+      options: |
+        --gpus all
 
     strategy:
       matrix:
@@ -31,4 +33,6 @@ jobs:
 
       - name: Run ${{ matrix.ci_script }}
         run: |
+          export ONNX_VERBOSE=1
+          export TRT_LOGGER=VERBOSE
          bash -x ./ci/${{ matrix.ci_script }}.sh
diff --git a/src/interface.cpp b/src/interface.cpp
index cc58681d..81305387 100644
--- a/src/interface.cpp
+++ b/src/interface.cpp
@@ -35,8 +35,10 @@ void LearningInterface::_load_model() {
     if (_model_path.find(".onnx") != std::string::npos) {
         // Check if the engine file already exists
         std::ifstream engine_check(engine_path, std::ios::binary);
-
+
+        std::cout << "FOUND ONNX" << std::endl;
         if (engine_check.good()) {
+            std::cout << "GOT ENGINE" << std::endl;
             engine_check.seekg(0, std::ios::end);
             const size_t model_size = engine_check.tellg();
             engine_check.seekg(0, std::ios::beg);
@@ -51,6 +53,7 @@ void LearningInterface::_load_model() {
             _context = _engine->createExecutionContext();
 
         } else {
+            std::cout << "NO ENGINE" << std::endl;
             // Build an engine from the .onnx model and save it as .engine
             _build(_model_path);
             _save_engine(engine_path);
@@ -90,25 +93,118 @@ void LearningInterface::_load_model() {
 }
 
 void LearningInterface::_build(std::string onnx_path) {
+    std::cout << "BUILDING ENGINE" << std::endl;
+
+    // Create the builder
     auto builder = createInferBuilder(_logger);
+    if (!builder) {
+        throw std::runtime_error("Failed to create TensorRT builder.");
+    }
+
+    // Set up network with explicit batch flag
     const auto explicit_batch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
     INetworkDefinition* network = builder->createNetworkV2(explicit_batch);
+    if (!network) {
+        builder->destroy();
+        throw std::runtime_error("Failed to create TensorRT network definition.");
+    }
+
+    // Create builder configuration
     IBuilderConfig* config = builder->createBuilderConfig();
+    if (!config) {
+        network->destroy();
+        builder->destroy();
+        throw std::runtime_error("Failed to create TensorRT builder configuration.");
+    }
 
-    // TODO: What about different hardware?
+    // Set configuration memory pool limit
+    std::cout << "SETTING CONFIG MEMORY LIMIT" << std::endl;
     config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, JETSON_MEM_LIMIT_B);
+
+    // Create parser
     nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, _logger);
+    if (!parser) {
+        config->destroy();
+        network->destroy();
+        builder->destroy();
+        throw std::runtime_error("Failed to create TensorRT ONNX parser.");
+    }
+
+    // Parse the ONNX model
+    std::cout << "PARSING ONNX MODEL" << std::endl;
     bool parsed = parser->parseFromFile(onnx_path.c_str(), static_cast<int32_t>(nvinfer1::ILogger::Severity::kINFO));
-    IHostMemory* plan{ builder->buildSerializedNetwork(*network, *config) };
+    if (!parsed) {
+        std::cerr << "Failed to parse ONNX model from file: " << onnx_path << std::endl;
+        parser->destroy();
+        config->destroy();
+        network->destroy();
+        builder->destroy();
+        throw std::runtime_error("ONNX model parsing failed.");
+    }
+
+    // Build the serialized network (engine plan)
+    std::cout << "BUILDING SERIALIZED NETWORK" << std::endl;
+    IHostMemory* plan = builder->buildSerializedNetwork(*network, *config);
+    if (!plan) {
+        std::cerr << "Failed to build serialized TensorRT engine plan." << std::endl;
+        parser->destroy();
+        config->destroy();
+        network->destroy();
+        builder->destroy();
+        throw std::runtime_error("Serialized network creation failed.");
+    }
 
+    // Create runtime
+    std::cout << "CREATING RUNTIME" << std::endl;
     _runtime = createInferRuntime(_logger);
+    if (!_runtime) {
+        std::cerr << "Failed to create TensorRT runtime." << std::endl;
+        plan->destroy();
+        parser->destroy();
+        config->destroy();
+        network->destroy();
+        builder->destroy();
+        throw std::runtime_error("Runtime creation failed.");
+    }
+
+    // Deserialize the engine from the plan
+    std::cout << "DESERIALIZING ENGINE" << std::endl;
     _engine = _runtime->deserializeCudaEngine(plan->data(), plan->size());
+    if (!_engine) {
+        std::cerr << "Failed to deserialize CUDA engine from serialized plan." << std::endl;
+        _runtime->destroy();
+        plan->destroy();
+        parser->destroy();
+        config->destroy();
+        network->destroy();
+        builder->destroy();
+        throw std::runtime_error("CUDA engine deserialization failed.");
+    }
+
+    // Create execution context
+    std::cout << "CREATING EXECUTION CONTEXT" << std::endl;
     _context = _engine->createExecutionContext();
+    if (!_context) {
+        std::cerr << "Failed to create execution context from CUDA engine." << std::endl;
+        _engine->destroy();
+        _runtime->destroy();
+        plan->destroy();
+        parser->destroy();
+        config->destroy();
+        network->destroy();
+        builder->destroy();
+        throw std::runtime_error("Execution context creation failed.");
+    }
+
+    // Clean up resources
+    std::cout << "CLEANING UP RESOURCES" << std::endl;
+    plan->destroy();
+    parser->destroy();
+    config->destroy();
+    network->destroy();
+    builder->destroy();
 
-    delete network;
-    delete config;
-    delete parser;
-    delete plan;
+    std::cout << "ENGINE BUILD SUCCESSFUL" << std::endl;
 }
 
 bool LearningInterface::_save_engine(const std::string& engine_path) {
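
Note (reviewer sketch, not part of the patch): every error path above repeats the same destroy() ladder before its throw. A minimal RAII alternative, assuming TensorRT 8+ where the interface destructors are public (which the pre-patch `delete network;` lines imply), lets std::unique_ptr with its default deleter handle cleanup on every exit path. The free function `build_engine_raii` and the 1 GiB workspace constant are hypothetical stand-ins for `_build` and `JETSON_MEM_LIMIT_B`:

// Reviewer sketch only -- not part of the patch. Assumes TensorRT 8+,
// where interface destructors are public, so std::unique_ptr's default
// deleter can replace the repeated destroy() ladders on each error path.
#include <cstdint>
#include <memory>
#include <stdexcept>
#include <string>

#include <NvInfer.h>
#include <NvOnnxParser.h>

using namespace nvinfer1;

// Stand-in for LearningInterface::_build(); the name and the 1 GiB
// workspace limit (in place of JETSON_MEM_LIMIT_B) are illustrative.
void build_engine_raii(const std::string& onnx_path, ILogger& logger) {
    auto builder = std::unique_ptr<IBuilder>(createInferBuilder(logger));
    if (!builder) throw std::runtime_error("Failed to create TensorRT builder.");

    const auto explicit_batch =
            1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto network = std::unique_ptr<INetworkDefinition>(builder->createNetworkV2(explicit_batch));
    if (!network) throw std::runtime_error("Failed to create network definition.");

    auto config = std::unique_ptr<IBuilderConfig>(builder->createBuilderConfig());
    if (!config) throw std::runtime_error("Failed to create builder config.");
    config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, 1ULL << 30);

    auto parser = std::unique_ptr<nvonnxparser::IParser>(
            nvonnxparser::createParser(*network, logger));
    if (!parser || !parser->parseFromFile(onnx_path.c_str(),
                static_cast<int32_t>(ILogger::Severity::kINFO))) {
        // All unique_ptrs above are released automatically on throw
        throw std::runtime_error("ONNX model parsing failed.");
    }

    auto plan = std::unique_ptr<IHostMemory>(
            builder->buildSerializedNetwork(*network, *config));
    if (!plan) throw std::runtime_error("Serialized network creation failed.");

    auto runtime = std::unique_ptr<IRuntime>(createInferRuntime(logger));
    if (!runtime) throw std::runtime_error("Runtime creation failed.");

    auto engine = std::unique_ptr<ICudaEngine>(
            runtime->deserializeCudaEngine(plan->data(), plan->size()));
    if (!engine) throw std::runtime_error("CUDA engine deserialization failed.");

    auto context = std::unique_ptr<IExecutionContext>(engine->createExecutionContext());
    if (!context) throw std::runtime_error("Execution context creation failed.");

    // In the real class, runtime/engine/context would be moved into members
    // to outlive this scope; here they are destroyed (in reverse declaration
    // order, so context before engine before runtime) at scope exit.
}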