diff --git a/build_depends.repos b/build_depends.repos index 4db947b7c26a8..2c19de267f95e 100644 --- a/build_depends.repos +++ b/build_depends.repos @@ -58,3 +58,7 @@ repositories: type: git url: https://github.com/autowarefoundation/ament_cmake.git version: feat/faster_ament_libraries_deduplicate + universe/external/trt_batched_nms: + type: git + url: https://github.com/autowarefoundation/trt_batched_nms.git + version: main diff --git a/perception/autoware_tensorrt_rtmdet/CMakeLists.txt b/perception/autoware_tensorrt_rtmdet/CMakeLists.txt new file mode 100644 index 0000000000000..300b620158d74 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/CMakeLists.txt @@ -0,0 +1,102 @@ +cmake_minimum_required(VERSION 3.8) +project(autoware_tensorrt_rtmdet) + +find_package(autoware_tensorrt_common) +if(NOT ${autoware_tensorrt_common_FOUND}) + message(WARNING "The autoware_tensorrt_common package is not found. Please check its dependencies.") + return() +endif() + +find_package(autoware_cmake REQUIRED) +autoware_package() + +find_package(OpenCV REQUIRED) + +include(CheckLanguage) +check_language(CUDA) +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) +else() + message(WARNING "CUDA is not found. preprocess acceleration using CUDA will not be available.") +endif() + +find_package(OpenMP) +if(OpenMP_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +endif() + +# tensorrt_rtmdet +ament_auto_add_library(${PROJECT_NAME} SHARED + src/tensorrt_rtmdet.cpp +) + +ament_target_dependencies(${PROJECT_NAME} + OpenCV +) + +if(CMAKE_CUDA_COMPILER) + # preprocess + cuda_add_library(${PROJECT_NAME}_gpu_preprocess + SHARED + src/preprocess.cu + ) + + target_include_directories(${PROJECT_NAME}_gpu_preprocess PUBLIC + $ + $ + ) + + target_link_libraries(${PROJECT_NAME} + ${autoware_tensorrt_common_LIBRARIES} + ${PROJECT_NAME}_gpu_preprocess + ) + +else() + target_link_libraries(${PROJECT_NAME} + ${autoware_tensorrt_common_LIBRARIES} + ) +endif() + +target_compile_definitions(${PROJECT_NAME} PRIVATE + TENSORRT_VERSION_MAJOR=${TENSORRT_VERSION_MAJOR} +) + +# tensorrt_rtmdet_node +ament_auto_add_library(${PROJECT_NAME}_node SHARED + src/tensorrt_rtmdet_node.cpp +) + +ament_target_dependencies(${PROJECT_NAME}_node + OpenCV +) + +target_link_libraries(${PROJECT_NAME}_node + ${PROJECT_NAME} +) + +target_compile_definitions(${PROJECT_NAME}_node PRIVATE + TENSORRT_VERSION_MAJOR=${TENSORRT_VERSION_MAJOR} +) + +rclcpp_components_register_node(${PROJECT_NAME}_node + PLUGIN "autoware::tensorrt_rtmdet::TrtRTMDetNode" + EXECUTABLE ${PROJECT_NAME}_node_exe +) + +if(BUILD_TESTING) + find_package(ament_cmake_ros REQUIRED) + find_package(ament_lint_auto REQUIRED) + ament_lint_auto_find_test_dependencies() + + ament_add_ros_isolated_gtest(test_rtmdet test/test_rtmdet.cpp) + set_tests_properties(test_rtmdet PROPERTIES TIMEOUT 60) # It could take a long time on the first run to create the engine + target_link_libraries(test_rtmdet ${PROJECT_NAME} ${PROJECT_NAME}_node) + target_include_directories(test_rtmdet PRIVATE include) +endif() + +ament_auto_package(INSTALL_TO_SHARE + launch + config + test/data +) diff --git a/perception/autoware_tensorrt_rtmdet/README.md b/perception/autoware_tensorrt_rtmdet/README.md new file mode 100644 index 0000000000000..0912fd72e3466 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/README.md @@ -0,0 +1,70 @@ +# autoware_tensorrt_rtmdet + +## Purpose + +RTMDet is a real-time instance segmentation model which can be used 
for detecting objects like cars, pedestrians,
+bicycles, etc. in a scene. This package provides a ROS 2 interface for RTMDet using TensorRT.
+
+## Inner-workings / Algorithms
+
+## Inputs / Outputs
+
+### Input
+
+| Name       | Type                | Description     |
+| ---------- | ------------------- | --------------- |
+| `in/image` | `sensor_msgs/Image` | The input image |
+
+### Output
+
+| Name              | Type                                                 | Description                                                              |
+| ----------------- | ---------------------------------------------------- | ------------------------------------------------------------------------ |
+| `out/objects`     | `tier4_perception_msgs/DetectedObjectsWithFeature`   | The detected objects with 2D bounding boxes and scores                   |
+| `out/mask`        | `autoware_internal_perception_msgs/SegmentationMask` | The instance segmentation mask                                           |
+| `out/color_mask`  | `sensor_msgs/Image`                                  | The colorized image of the instance segmentation mask, for visualization |
+| `out/debug_image` | `sensor_msgs/Image`                                  | The image with 2D bounding boxes, for visualization                      |
+
+## Parameters
+
+{{ json_to_markdown("perception/autoware_tensorrt_rtmdet/schema/rtmdet.schema.json") }}
+
+## Assumptions / Known limits
+
+## Onnx model
+
+A sample model is placed in the `autoware_data` folder by the Ansible script during the environment preparation stage.
+If you cannot find the model, follow the instructions at the
+[link](https://github.com/autowarefoundation/autoware/tree/main/ansible/roles/artifacts) to download it.
+
+The shared model was trained by open-mmlab using the COCO dataset. For more details,
+see the [link](https://github.com/open-mmlab/mmdetection/tree/3.x/configs/rtmdet#instance-segmentation).
+
+### Package acceptable model generation
+
+Users can generate their own model using the official RTMDet repository. Please refer to the
+[official repository](https://github.com/open-mmlab/mmdetection/tree/3.x/configs/rtmdet#rtmdet-an-empirical-study-of-designing-real-time-object-detectors).
+
+## Label file
+
+A sample label file (named `label.txt`) and an instance segmentation color map file (named `color_map.csv`) are also
+downloaded automatically during the environment preparation process.
+
+These files are used to map the class index to the class name and color, respectively.
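+
+As an illustration, the sketch below shows one way a color map file in this format could be parsed. It is not the
+package's actual implementation (that lives in `TrtRTMDetNode::read_color_map_file`); the `LabelColor` struct below
+only mirrors the shape of the types declared in `tensorrt_rtmdet.hpp`, and the sketch assumes the first CSV row is
+the `unique_id, class_name, r, g, b, label_id` header shown above.
+
+```cpp
+// Minimal, illustrative CSV color-map parser (assumptions noted above).
+#include <cstdint>
+#include <fstream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+struct LabelColor
+{
+  std::string class_name;
+  std::uint8_t r, g, b;  // the package itself stores the color as a cv::Vec3b
+  std::uint8_t label_id;
+};
+
+using ColorMap = std::map<int, LabelColor>;
+
+inline ColorMap parse_color_map_csv(const std::string & path)
+{
+  ColorMap color_map;
+  std::ifstream file(path);
+  std::string line;
+  std::getline(file, line);  // assumption: skip the header row
+  while (std::getline(file, line)) {
+    std::stringstream ss(line);
+    std::string field;
+    std::vector<std::string> fields;
+    while (std::getline(ss, field, ',')) {
+      fields.push_back(field);  // note: surrounding whitespace is not trimmed here
+    }
+    if (fields.size() != 6) {
+      continue;  // skip malformed rows
+    }
+    LabelColor entry;
+    entry.class_name = fields[1];
+    entry.r = static_cast<std::uint8_t>(std::stoi(fields[2]));
+    entry.g = static_cast<std::uint8_t>(std::stoi(fields[3]));
+    entry.b = static_cast<std::uint8_t>(std::stoi(fields[4]));
+    entry.label_id = static_cast<std::uint8_t>(std::stoi(fields[5]));
+    color_map[std::stoi(fields[0])] = entry;
+  }
+  return color_map;
+}
+```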
+ +## Reference repositories + +- + +## Citation + +```bibtex +@misc{lyu2022rtmdetempiricalstudydesigning, +title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, +author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, +year={2022}, +eprint={2212.07784}, +archivePrefix={arXiv}, +primaryClass={cs.CV}, +url={https://arxiv.org/abs/2212.07784}, +} +``` diff --git a/perception/autoware_tensorrt_rtmdet/config/rtmdet.param.yaml b/perception/autoware_tensorrt_rtmdet/config/rtmdet.param.yaml new file mode 100644 index 0000000000000..d766e53dab427 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/config/rtmdet.param.yaml @@ -0,0 +1,23 @@ +/**: + ros__parameters: + is_publish_debug_image: false + + is_publish_color_mask: false + + # cspell: ignore libtrt + model_path: "$(var data_path)/tensorrt_rtmdet/$(var model_name).onnx" + color_map_path: "$(var data_path)/tensorrt_rtmdet/color_map.csv" + plugin_paths: [ "$(find-pkg-prefix trt_batched_nms)/lib/libtrt_batched_nms_plugin.so" ] + input_image_mean: [ 103.53, 116.28, 123.675 ] + input_image_std_dev: [ 57.375, 57.12, 58.395 ] + score_threshold: 0.3 + nms_threshold: 0.3 + mask_threshold: 0.7 + precision: "fp16" + dla_core_id: -1 + quantize_first_layer: false + quantize_last_layer: false + profile_per_layer: false + clip_value: 6.0 + apply_erosion: false + calibration_image_list_path: "" diff --git a/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/calibrator.hpp b/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/calibrator.hpp new file mode 100644 index 0000000000000..1481b8099ccd4 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/calibrator.hpp @@ -0,0 +1,257 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// cspell: ignore RTMDET +#ifndef AUTOWARE__TENSORRT_RTMDET__CALIBRATOR_HPP_ +#define AUTOWARE__TENSORRT_RTMDET__CALIBRATOR_HPP_ +#include "autoware/cuda_utils/cuda_check_error.hpp" + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace autoware::tensorrt_rtmdet +{ +class ImageStream +{ +public: + ImageStream( + uint32_t batch_size, nvinfer1::Dims input_dims, + const std::vector & calibration_images) + : batch_size_(batch_size), + calibration_images_(calibration_images), + max_batches_(calibration_images.size() / batch_size_), + input_dims_(input_dims) + { + batch_.resize(batch_size_ * input_dims_.d[1] * input_dims_.d[2] * input_dims_.d[3]); + } + + [[nodiscard]] uint32_t get_batch_size() const { return batch_size_; } + + [[nodiscard]] uint32_t get_max_batches() const { return max_batches_; } + + float * get_batch() { return &batch_.at(0); } + + nvinfer1::Dims get_input_dims() { return input_dims_; } + + /** + * @brief Preprocess in calibration + * @param[in] images input images + * @param[in] input_dims input dimensions + * @param[in] mean mean value for each channel + * @param[in] std std value for each channel + * @return vector preprocessed data + */ + static std::vector preprocess( + const std::vector & images, nvinfer1::Dims input_dims, const std::vector & mean, + const std::vector & std) + { + std::vector input_h; + auto batch_size = static_cast(images.size()); + input_dims.d[0] = static_cast(batch_size); + const auto input_chan = static_cast(input_dims.d[1]); + const auto input_height = static_cast(input_dims.d[2]); + const auto input_width = static_cast(input_dims.d[3]); + std::vector dst_images; + uint32_t volume = batch_size * input_chan * input_height * input_width; + input_h.resize(volume); + for (const auto & image : images) { + cv::Mat dst_image; + cv::resize(image, dst_image, cv::Size(640, 640)); + dst_image.convertTo(dst_image, CV_32F); + dst_image -= cv::Scalar(mean.at(0), mean.at(1), mean.at(2)); + dst_image /= cv::Scalar(std.at(0), std.at(1), std.at(2)); + dst_images.emplace_back(dst_image); + } + + const auto chw_images = + cv::dnn::blobFromImages(dst_images, 1.0, cv::Size(), cv::Scalar(), false, false, CV_32F); + const auto data_length = chw_images.total(); + input_h.reserve(data_length); + const auto flat = chw_images.reshape(1, static_cast(data_length)); + input_h = chw_images.isContinuous() ? 
flat : flat.clone();
+    return input_h;
+  }
+
+  /**
+   * @brief Load and preprocess the next batch of calibration images
+   * @param[in] mean mean value for each channel
+   * @param[in] std std value for each channel
+   * @return true while calibration batches remain, false once all batches are consumed
+   */
+  bool next(const std::vector<float> & mean, const std::vector<float> & std)
+  {
+    if (current_batch_ == max_batches_) {
+      return false;
+    }
+    for (uint32_t i = 0; i < batch_size_; ++i) {
+      auto image =
+        cv::imread(calibration_images_.at(batch_size_ * current_batch_ + i), cv::IMREAD_COLOR);
+      RCLCPP_INFO(
+        rclcpp::get_logger("autoware_tensorrt_rtmdet_calibrator"), "Preprocess %s",
+        calibration_images_.at(batch_size_ * current_batch_ + i).c_str());
+      auto input = preprocess({image}, input_dims_, mean, std);
+      // Overwrite this image's slot in the preallocated batch buffer in place;
+      // std::vector::insert would grow the buffer on every call instead of reusing it.
+      std::copy(
+        input.begin(), input.end(),
+        batch_.begin() + i * input_dims_.d[1] * input_dims_.d[2] * input_dims_.d[3]);
+    }
+
+    ++current_batch_;
+    return true;
+  }
+
+  /**
+   * @brief Reset calibration
+   */
+  void reset() { current_batch_ = 0; }
+
+private:
+  uint32_t batch_size_;
+  std::vector<std::string> calibration_images_;
+  uint32_t current_batch_{0};
+  uint32_t max_batches_;
+  nvinfer1::Dims input_dims_;
+  std::vector<float> batch_;
+};
+
+/**
+ * @class Int8EntropyCalibrator
+ * @brief Calibrator based on TensorRT's entropy calibration (IInt8EntropyCalibrator2)
+ *
+ */
+class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator2
+{
+public:
+  Int8EntropyCalibrator(
+    ImageStream & stream, std::string calibration_cache_file,
+    const std::vector<float> & mean = {103.53, 116.28, 123.675},
+    const std::vector<float> & std = {57.375, 57.12, 58.395}, bool read_cache = true)
+  : stream_(stream),
+    calibration_cache_file_(std::move(calibration_cache_file)),
+    read_cache_(read_cache)
+  {
+    auto d = stream_.get_input_dims();
+    input_count_ = stream_.get_batch_size() * d.d[1] * d.d[2] * d.d[3];
+    m_std_ = std;
+    m_mean_ = mean;
+
+    CHECK_CUDA_ERROR(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
+    auto alg_type = getAlgorithm();
+    switch (alg_type) {
+      case (nvinfer1::CalibrationAlgoType::kLEGACY_CALIBRATION):
+        RCLCPP_INFO(
+          rclcpp::get_logger("autoware_tensorrt_rtmdet_calibrator"),
+          "CalibrationAlgoType : kLEGACY_CALIBRATION");
+        break;
+      case (nvinfer1::CalibrationAlgoType::kENTROPY_CALIBRATION):
+        RCLCPP_INFO(
+          rclcpp::get_logger("autoware_tensorrt_rtmdet_calibrator"),
+          "CalibrationAlgoType : kENTROPY_CALIBRATION");
+        break;
+      case (nvinfer1::CalibrationAlgoType::kENTROPY_CALIBRATION_2):
+        RCLCPP_INFO(
+          rclcpp::get_logger("autoware_tensorrt_rtmdet_calibrator"),
+          "CalibrationAlgoType : kENTROPY_CALIBRATION_2");
+        break;
+      case (nvinfer1::CalibrationAlgoType::kMINMAX_CALIBRATION):
+        RCLCPP_INFO(
+          rclcpp::get_logger("autoware_tensorrt_rtmdet_calibrator"),
+          "CalibrationAlgoType : kMINMAX_CALIBRATION");
+        break;
+      default:
+        RCLCPP_INFO(
+          rclcpp::get_logger("autoware_tensorrt_rtmdet_calibrator"), "No CalibrationAlgoType");
+        break;
+    }
+  }
+
+  [[nodiscard]] int getBatchSize() const noexcept override
+  {
+    return static_cast<int>(stream_.get_batch_size());
+  }
+
+  ~Int8EntropyCalibrator() noexcept override { CHECK_CUDA_ERROR(cudaFree(device_input_)); }
+
+  bool getBatch(void * bindings[], const char * names[], int nb_bindings) noexcept override
+  {
+    (void)names;
+    (void)nb_bindings;
+
+    if (!stream_.next(m_mean_, m_std_)) {
+      return false;
+    }
+    try {
+      CHECK_CUDA_ERROR(cudaMemcpy(
+        device_input_, stream_.get_batch(), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
+    } catch (const std::exception & e) {
+      // Do nothing
+    }
+    bindings[0] = device_input_;
+    return true;
+  }
+
+  const void * readCalibrationCache(size_t & length) noexcept override
+  {
+    calib_cache_.clear();
+
std::ifstream input(calibration_cache_file_, std::ios::binary); + input >> std::noskipws; + if (read_cache_ && input.good()) { + std::copy( + std::istream_iterator(input), std::istream_iterator(), + std::back_inserter(calib_cache_)); + } + + length = calib_cache_.size(); + if (length) { + RCLCPP_INFO( + rclcpp::get_logger("autoware_tensorrt_rtmdet_calibrator"), + "Using cached calibration table to build the engine"); + } else { + RCLCPP_INFO( + rclcpp::get_logger("autoware_tensorrt_rtmdet_calibrator"), + "New calibration table will be created to build the engine"); + } + return length ? &calib_cache_.at(0) : nullptr; + } + + void writeCalibrationCache(const void * cache, size_t length) noexcept override + { + std::ofstream output(calibration_cache_file_, std::ios::binary); + output.write(static_cast(cache), length); + } + +private: + ImageStream stream_; + const std::string calibration_cache_file_; + bool read_cache_{true}; + size_t input_count_; + void * device_input_{nullptr}; + std::vector calib_cache_; + std::vector hist_cache_; + // mean for preprocessing + std::vector m_mean_; + // std for preprocessing + std::vector m_std_; +}; +} // namespace autoware::tensorrt_rtmdet + +#endif // AUTOWARE__TENSORRT_RTMDET__CALIBRATOR_HPP_ diff --git a/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/preprocess.hpp b/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/preprocess.hpp new file mode 100644 index 0000000000000..bf633e07c5833 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/preprocess.hpp @@ -0,0 +1,152 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+// cspell: ignore RTMDET
+#ifndef AUTOWARE__TENSORRT_RTMDET__PREPROCESS_HPP_
+#define AUTOWARE__TENSORRT_RTMDET__PREPROCESS_HPP_
+
+#include
+#include
+#include
+#include
+#include
+
+namespace autoware::tensorrt_rtmdet
+{
+struct Roi
+{
+  int x;
+  int y;
+  int w;
+  int h;
+};
+
+/**
+ * @brief Resize an image using bilinear interpolation on the GPU
+ * @param[out] dst resized image
+ * @param[in] src source image
+ * @param[in] d_w width of the resized image
+ * @param[in] d_h height of the resized image
+ * @param[in] d_c channel count of the resized image
+ * @param[in] s_w width of the input image
+ * @param[in] s_h height of the input image
+ * @param[in] stream cuda stream
+ */
+extern void resize_bilinear_gpu(
+  unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h,
+  cudaStream_t stream);
+
+/**
+ * @brief Letterbox an image on the GPU
+ * @param[out] dst letterboxed image
+ * @param[in] src source image
+ * @param[in] d_w width of the letterboxed image
+ * @param[in] d_h height of the letterboxed image
+ * @param[in] s_w width of the input image
+ * @param[in] s_h height of the input image
+ * @param[in] stream cuda stream
+ */
+extern void letterbox_gpu(
+  unsigned char * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h,
+  cudaStream_t stream);
+
+/**
+ * @brief NCHW to NHWC conversion
+ * @param[out] dst converted image
+ * @param[in] src source image
+ * @param[in] d_w width of the image
+ * @param[in] d_h height of the image
+ * @param[in] stream cuda stream
+ */
+extern void nchw_to_nhwc_gpu(
+  unsigned char * dst, unsigned char * src, int d_w, int d_h, cudaStream_t stream);
+
+/**
+ * @brief Unsigned char to float32 conversion for inference
+ * @param[out] dst32 converted image
+ * @param[in] src source image
+ * @param[in] d_w width of the image
+ * @param[in] d_h height of the image
+ * @param[in] d_c channel count of the image
+ * @param[in] stream cuda stream
+ */
+extern void to_float_gpu(
+  float * dst32, unsigned char * src, int d_w, int d_h, int d_c, cudaStream_t stream);
+
+/**
+ * @brief Resize and letterbox an image using bilinear interpolation on the GPU
+ * @param[out] dst processed image
+ * @param[in] src source image
+ * @param[in] d_w width of the output
+ * @param[in] d_h height of the output
+ * @param[in] s_w width of the input
+ * @param[in] s_h height of the input
+ * @param[in] stream cuda stream
+ */
+extern void resize_bilinear_letterbox_gpu(
+  unsigned char * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h,
+  cudaStream_t stream);
+
+/**
+ * @brief Optimized preprocessing including resize, letterbox, NHWC-to-NCHW conversion, toFloat
+ * and normalization for RTMDet on the GPU
+ * @param[out] dst processed image
+ * @param[in] src source image
+ * @param[in] d_w width of the output
+ * @param[in] d_h height of the output
+ * @param[in] s_w width of the input
+ * @param[in] s_h height of the input
+ * @param[in] norm normalization
+ * @param[in] stream cuda stream
+ */
+extern void resize_bilinear_letterbox_nhwc_to_nchw32_gpu(
+  float * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h, float norm,
+  cudaStream_t stream);
+
+/**
+ * @brief Optimized preprocessing including resize, letterbox, NHWC-to-NCHW conversion, toFloat
+ * and normalization with batching for RTMDet on the GPU
+ * @param[out] dst processed image
+ * @param[in] src source image
+ * @param[in] d_w width of the output
+ * @param[in] d_h height of the output
+ * @param[in] s_w width of the input
+ * @param[in] s_h height of the input
+ * @param[in] batch batch size
+ * @param[in] mean mean value for normalization
+ * @param[in] std standard deviation for normalization
+ * @param[in] stream cuda stream
+ 
*/ +extern void resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( + float * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h, int batch, + const float * mean, const float * std, cudaStream_t stream); + +/** + * @brief Argmax on GPU + * @param[out] dst processed image + * @param[in] src probability map + * @param[in] d_w width for output + * @param[in] d_h height for output + * @param[in] s_w width for input + * @param[in] s_h height for input + * @param[in] s_c channel for input + * @param[in] batch batch size + * @param[in] stream cuda stream + */ +extern void argmax_gpu( + unsigned char * dst, float * src, int d_w, int d_h, int s_w, int s_h, int s_c, int batch, + cudaStream_t stream); +} // namespace autoware::tensorrt_rtmdet + +#endif // AUTOWARE__TENSORRT_RTMDET__PREPROCESS_HPP_ diff --git a/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/tensorrt_rtmdet.hpp b/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/tensorrt_rtmdet.hpp new file mode 100644 index 0000000000000..9bfb7ca77510b --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/tensorrt_rtmdet.hpp @@ -0,0 +1,223 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// cspell: ignore RTMDET +#ifndef AUTOWARE__TENSORRT_RTMDET__TENSORRT_RTMDET_HPP_ +#define AUTOWARE__TENSORRT_RTMDET__TENSORRT_RTMDET_HPP_ + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +class TrtRTMDetTest; + +namespace autoware::tensorrt_rtmdet +{ +using autoware::cuda_utils::CudaUniquePtr; +using autoware::cuda_utils::CudaUniquePtrHost; +using autoware::cuda_utils::makeCudaStream; +using autoware::cuda_utils::StreamUniquePtr; + +/** + * @brief Represents a detected object. + * + * This struct stores bounding box coordinates, class id, mask index, score of detected object. + */ +struct Object +{ + uint32_t x1; + uint32_t y1; + uint32_t x2; + uint32_t y2; + uint32_t class_id; + uint32_t mask_index; + float score; +}; + +using ObjectArray = std::vector; +using ObjectArrays = std::vector; + +/** + * @brief Represents a color for a class. + * + * This struct stores class name, color and label id for a label. + * Read from a color map csv file. 
+ */
+struct LabelColor
+{
+  std::string class_name;
+  cv::Vec3b color;
+  uint8_t label_id;
+};
+
+// key: unique_id from the color map csv
+using ColorMap = std::map<int, LabelColor>;
+
+class TrtRTMDet
+{
+public:
+  TrtRTMDet(
+    const std::string & model_path, const std::string & precision, const ColorMap & color_map,
+    const float score_threshold = 0.3, const float nms_threshold = 0.7,
+    const float mask_threshold = 200.0,
+    const autoware::tensorrt_common::BuildConfig & build_config =
+      autoware::tensorrt_common::BuildConfig(),
+    const std::string & calibration_image_list_file_path = std::string(),
+    const double norm_factor = 1.0, const std::vector<float> & mean = {103.53, 116.28, 123.675},
+    const std::vector<float> & std = {57.375, 57.12, 58.395},
+    [[maybe_unused]] const std::string & cache_dir = "",
+    const autoware::tensorrt_common::BatchConfig & batch_config = {1, 1, 1},
+    const size_t max_workspace_size = (1u << 30u),
+    const std::vector<std::string> & plugin_paths = {});
+
+  ~TrtRTMDet() noexcept;
+
+  /**
+   * @brief Perform inference.
+   *
+   * This function calls the preprocess and feedforward functions to perform inference.
+   *
+   * @param[in] images a vector of input images.
+   * @param[out] objects a vector of detected objects.
+   * @param[out] mask a segmentation mask.
+   * @param[out] class_ids a vector of class ids.
+   * @return true if inference is successful.
+   */
+  bool do_inference(
+    const std::vector<cv::Mat> & images, ObjectArrays & objects, cv::Mat & mask,
+    std::vector<uint8_t> & class_ids);
+
+  /**
+   * @brief Calculate intersection over union.
+   *
+   * This function calculates the intersection over union (IoU) between two bounding boxes.
+   * Each input object contains a bounding box and a class id.
+   *
+   * @param[in] a First object.
+   * @param[in] b Second object.
+   * @return IoU value.
+   */
+  [[nodiscard]] static float intersection_over_union(const Object & a, const Object & b);
+
+  /**
+   * @brief Perform non-maximum suppression.
+   *
+   * This function performs non-maximum suppression (NMS) on the detected objects.
+   * The detected objects are sorted by score and NMS is applied to remove overlapping boxes.
+   *
+   * Since the model architecture has its own NMS layer, this function was added to handle
+   * overlapping boxes that belong to different classes.
+   *
+   * @param[in] input_objects a vector of detected objects.
+   * @param[out] output_objects a vector of detected objects after NMS.
+   * @param[in] nms_threshold IoU threshold used to suppress overlapping boxes.
+   */
+  static void nms_sorted_bboxes(
+    const ObjectArray & input_objects, ObjectArray & output_objects, const float & nms_threshold);
+
+  void print_profiling();
+
+private:
+  /**
+   * @brief Preprocess input images on GPU.
+   *
+   * This function takes a vector of input images and preprocesses them on the GPU.
+   * The images are resized to the input size of the model, normalized and stored in a buffer.
+   *
+   * @param[in] images a vector of input images.
+   */
+  void preprocess_gpu(const std::vector<cv::Mat> & images);
+
+  /**
+   * @brief Inference with TensorRT.
+   *
+   * This function performs inference with TensorRT.
+   * The input images are fed to the model and the output is stored in the output buffers.
+   * The output buffers are then post-processed to get the detected objects and segmentation mask.
+   *
+   * @param[in] images a vector of input images.
+   * @param[out] objects a vector of detected objects.
+   * @param[out] mask a segmentation mask.
+   * @param[out] class_ids a vector of class ids.
+   * @return true if inference is successful.
+ 
+ */ + bool feedforward( + const std::vector & images, ObjectArrays & objects, cv::Mat & mask, + std::vector & class_ids); + + std::unique_ptr trt_common_; + + // Host and device pointers for inputs + std::vector input_h_; + CudaUniquePtr input_d_; + + // Device pointer for outputs + CudaUniquePtr out_detections_d_; + CudaUniquePtr out_labels_d_; + CudaUniquePtr out_masks_d_; + + // Host pointer for outputs + std::unique_ptr out_detections_h_; + std::unique_ptr out_labels_h_; + std::unique_ptr out_masks_h_; + + StreamUniquePtr stream_{makeCudaStream()}; + + // scale factor for input image + float scale_width_; + float scale_height_; + + // size of input image for model + uint32_t model_input_width_; + uint32_t model_input_height_; + + const float score_threshold_; + const float nms_threshold_; + const float mask_threshold_; + const int batch_size_; + + // maximum number of detections, read from model + uint32_t max_detections_; + + // host buffer for preprocessing on GPU + CudaUniquePtrHost image_buf_h_; + // device buffer for preprocessing on GPU + CudaUniquePtr image_buf_d_; + // normalization factor used for preprocessing + const double norm_factor_; + + // size of input image for preprocessing, check if input image size is changed + int32_t src_width_; + int32_t src_height_; + + // mean and std for preprocessing + const std::vector mean_; + const std::vector std_; + + // Segmentation map, stores information for each class + const ColorMap color_map_; + +public: + friend TrtRTMDetTest; +}; +} // namespace autoware::tensorrt_rtmdet + +#endif // AUTOWARE__TENSORRT_RTMDET__TENSORRT_RTMDET_HPP_ diff --git a/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/tensorrt_rtmdet_node.hpp b/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/tensorrt_rtmdet_node.hpp new file mode 100644 index 0000000000000..01ed65091ddd7 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/tensorrt_rtmdet_node.hpp @@ -0,0 +1,134 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// cspell: ignore RTMDET +#ifndef AUTOWARE__TENSORRT_RTMDET__TENSORRT_RTMDET_NODE_HPP_ +#define AUTOWARE__TENSORRT_RTMDET__TENSORRT_RTMDET_NODE_HPP_ + +#include "autoware/tensorrt_rtmdet/tensorrt_rtmdet.hpp" + +#include +#include +#include +#include + +#include "autoware_internal_perception_msgs/msg/segmentation_mask.hpp" +#include "tier4_perception_msgs/msg/detected_objects_with_feature.hpp" + +#include + +#include +#include +#include + +namespace autoware::tensorrt_rtmdet +{ +class TrtRTMDetNode : public rclcpp::Node +{ +public: + explicit TrtRTMDetNode(const rclcpp::NodeOptions & node_options); + + /** + * @brief Read a color map file and return the color map. + * + * The color map file should be a csv file with the following format: + * ``` + * unique_id, class_name, r, g, b, label_id + * ``` + * where the label_id represents the class ID of the object, used to process the outputs in + * Autoware. 
+ * + * @param[in] color_map_path The path to the color map file. + * @return A color map structure with the color information. + */ + static ColorMap read_color_map_file(const std::string & color_map_path); + + /** + * @brief Colorize the output mask. + * + * Take the output mask which includes the class ID of the objects and convert it to a colorized + * mask. + * + * @param[in] color_map The color map structure. + * @param[in] mask The output mask. + * @param[out] color_mask The colorized mask. + */ + static void get_colorized_mask( + const ColorMap & color_map, const cv::Mat & mask, cv::Mat & color_mask); + + /** + * @brief Draw the detected objects on the input image. + * + * @param[out] image + * @param[in] mask + * @param[in] objects + * @param[in] color_map + */ + static void draw_debug_image( + cv::Mat & image, const cv::Mat & mask, const ObjectArrays & objects, + const ColorMap & color_map); + + /** + * @brief Apply erosion to the mask. + * + * With using this function, you can apply erosion to the mask and It overflowing the border. + * + * @param[in,out] mask + * @param[in] erosion_size + */ + static void apply_erosion(cv::Mat & mask, const int erosion_size); + +private: + /** + * @brief Callback function to check are there any subscribers. + */ + void on_connect(); + + /** + * @brief Callback function to run RTMDet model. + * + * @param[in] msg The input image message. + */ + void on_image(const sensor_msgs::msg::Image::ConstSharedPtr msg); + + std::unique_ptr trt_rtmdet_; + + image_transport::Subscriber image_sub_; + rclcpp::TimerBase::SharedPtr timer_; + + rclcpp::Publisher::SharedPtr objects_pub_; + rclcpp::Publisher::SharedPtr mask_pub_; + + image_transport::Publisher color_mask_pub_; + image_transport::Publisher debug_image_pub_; + + std::unique_ptr> stop_watch_ptr_; + std::unique_ptr debug_publisher_; + + const bool is_publish_color_mask_; + const bool is_publish_debug_image_; + + // Color map to store label and color information. + ColorMap color_map_; + + // Mean and std for normalization of input image. + std::vector mean_; + std::vector std_; + + // If true, apply erosion to the output mask. 
+ bool is_apply_erosion_; +}; +} // namespace autoware::tensorrt_rtmdet + +#endif // AUTOWARE__TENSORRT_RTMDET__TENSORRT_RTMDET_NODE_HPP_ diff --git a/perception/autoware_tensorrt_rtmdet/launch/multiple_rtmdet.launch.xml b/perception/autoware_tensorrt_rtmdet/launch/multiple_rtmdet.launch.xml new file mode 100644 index 0000000000000..9f4e1390ce302 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/launch/multiple_rtmdet.launch.xml @@ -0,0 +1,78 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/perception/autoware_tensorrt_rtmdet/launch/rtmdet.launch.xml b/perception/autoware_tensorrt_rtmdet/launch/rtmdet.launch.xml new file mode 100644 index 0000000000000..c1510cd325963 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/launch/rtmdet.launch.xml @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/perception/autoware_tensorrt_rtmdet/package.xml b/perception/autoware_tensorrt_rtmdet/package.xml new file mode 100644 index 0000000000000..885e561d47506 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/package.xml @@ -0,0 +1,41 @@ + + + + autoware_tensorrt_rtmdet + 0.0.1 + TensorRT library implementation for RTMDet + Barış Zeren + + Apache License 2.0 + + ament_cmake_auto + autoware_cmake + cudnn_cmake_module + tensorrt_cmake_module + + cudnn_cmake_module + tensorrt_cmake_module + + autoware_cuda_utils + autoware_internal_perception_msgs + autoware_object_recognition_utils + autoware_perception_msgs + autoware_tensorrt_common + cv_bridge + image_transport + libopencv-dev + rclcpp + rclcpp_components + sensor_msgs + tier4_perception_msgs + trt_batched_nms + + image_transport_decompressor + + ament_lint_auto + autoware_lint_common + + + ament_cmake + + diff --git a/perception/autoware_tensorrt_rtmdet/schema/rtmdet.schema.json b/perception/autoware_tensorrt_rtmdet/schema/rtmdet.schema.json new file mode 100644 index 0000000000000..132563791a590 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/schema/rtmdet.schema.json @@ -0,0 +1,139 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Parameters for tensorrt_rtmdet Nodes", + "type": "object", + "definitions": { + "rtmdet": { + "type": "object", + "properties": { + "is_publish_debug_image": { + "type": "boolean", + "default": false, + "description": "If true, debug image which contain bounding boxes is published at runtime." + }, + "is_publish_color_mask": { + "type": "boolean", + "default": false, + "description": "If true, debug image which contain colorful masks is published at runtime." + }, + "model_path": { + "type": "string", + "default": "$(var data_path)/tensorrt_rtmdet/$(var model_name).onnx", + "description": "Path to ONNX file." + }, + "color_map_path": { + "type": "string", + "default": "$(var data_path)/tensorrt_rtmdet/color_map.csv", + "description": "Path to color map file." + }, + "plugin_paths": { + "type": "array", + "default": ["./build/tensorrt_rtmdet/libtensorrt_rtmdet_plugin.so"], + "description": "Path to plugin files." + }, + "input_image_mean": { + "type": "array", + "default": [103.53, 116.28, 123.675], + "description": "Mean values for the model." + }, + "input_image_std_dev": { + "type": "array", + "default": [57.375, 57.12, 58.395], + "description": "Standard deviation values for model." 
+ }, + "score_threshold": { + "type": "number", + "default": 0.3, + "minimum": 0.0, + "maximum": 1.0, + "description": "A threshold value of existence probability score, all of objects with score less than this threshold are ignored." + }, + "nms_threshold": { + "type": "number", + "default": 0.0, + "minimum": 0.0, + "maximum": 1.0, + "description": "A threshold value of NMS." + }, + "mask_threshold": { + "type": "number", + "default": 0.7, + "minimum": 0.0, + "maximum": 1.0, + "description": "A threshold value of pixel values in the output mask." + }, + "precision": { + "type": "string", + "default": "fp16", + "description": "Operation precision to be used on inference. Valid value is one of: [fp32, fp16, int8]." + }, + "dla_core_id": { + "type": "number", + "default": -1, + "description": "If positive ID value is specified, the node assign inference task to the DLA core." + }, + "quantize_first_layer": { + "type": "boolean", + "default": false, + "description": "If true, set the operating precision for the first (input) layer to be fp16. This option is valid only when precision==int8." + }, + "quantize_last_layer": { + "type": "boolean", + "default": false, + "description": "If true, set the operating precision for the last (output) layer to be fp16. This option is valid only when precision==int8." + }, + "profile_per_layer": { + "type": "boolean", + "default": false, + "description": "If true, profiler function will be enabled. Since the profile function may affect execution speed, it is recommended to set this flag true only for development purpose." + }, + "clip_value": { + "type": "number", + "default": 0.0, + "description": "If positive value is specified, the value of each layer output will be clipped between [0.0, clip_value]. This option is valid only when precision==int8 and used to manually specify the dynamic range instead of using any calibration." + }, + "apply_erosion": { + "type": "boolean", + "default": false, + "description": "If true, apply erosion to the output mask." + }, + "calibration_image_list_path": { + "type": "string", + "default": "", + "description": "Path to a file which contains path to images. Those images will be used for int8 quantization." + } + }, + "required": [ + "is_publish_debug_image", + "is_publish_color_mask", + "model_path", + "color_map_path", + "plugin_paths", + "input_image_mean", + "input_image_std_dev", + "score_threshold", + "nms_threshold", + "mask_threshold", + "precision", + "dla_core_id", + "quantize_first_layer", + "quantize_last_layer", + "profile_per_layer", + "clip_value", + "apply_erosion" + ] + } + }, + "properties": { + "/**": { + "type": "object", + "properties": { + "ros__parameters": { + "$ref": "#/definitions/rtmdet" + } + }, + "required": ["ros__parameters"] + } + }, + "required": ["/**"] +} diff --git a/perception/autoware_tensorrt_rtmdet/src/preprocess.cu b/perception/autoware_tensorrt_rtmdet/src/preprocess.cu new file mode 100644 index 0000000000000..f5139805c9983 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/src/preprocess.cu @@ -0,0 +1,461 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/tensorrt_rtmdet/preprocess.hpp" + +#include +#include +#include + +namespace autoware::tensorrt_rtmdet +{ +constexpr size_t block = 512; +constexpr size_t max_blocks_per_dim = 65535; +constexpr int C = 3; + +dim3 cuda_gridsize(size_t n) +{ + size_t k = (n - 1) / block + 1; + size_t x = k; + size_t y = 1; + if (x > max_blocks_per_dim) { + x = ceil(sqrt(k)); + y = (n - 1) / (x * block) + 1; + } + dim3 d; + d.x = x; + d.y = y; + d.z = 1; + return d; +} + +__device__ double lerp1d(int a, int b, float w) +{ + return fma(w, (float)b, fma(-w, (float)a, (float)a)); +} + +__device__ float lerp2d(int f00, int f01, int f10, int f11, float centroid_h, float centroid_w) +{ + centroid_w = (1 + lroundf(centroid_w) - centroid_w) / 2; + centroid_h = (1 + lroundf(centroid_h) - centroid_h) / 2; + + float r0, r1, r; + r0 = (float)lround(lerp1d(f00, f01, centroid_w)); + r1 = (float)lround(lerp1d(f10, f11, centroid_w)); + + r = lerp1d((int)r0, (int)r1, centroid_h); + return r; +} + +__global__ void resize_bilinear_kernel( + int N, unsigned char * dst_img, unsigned char * src_img, int dst_w, int src_h, int src_w, + float stride_h, float stride_w) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int W = dst_w; + + int n = 0; + + int w = index % W; + int h = index / W; + + float centroid_h, centroid_w; + centroid_h = stride_h * (float)(h + 0.5); + centroid_w = stride_w * (float)(w + 0.5); + + int f00, f01, f10, f11; + + int src_h_idx = lroundf(centroid_h) - 1; + int src_w_idx = lroundf(centroid_w) - 1; + if (src_h_idx < 0) { + src_h_idx = 0; + } + if (src_w_idx < 0) { + src_w_idx = 0; + } + + index = C * w + C * W * h; +#pragma unroll + for (int c = 0; c < C; c++) { + f00 = n * src_h * src_w * C + src_h_idx * src_w * C + src_w_idx * C + c; + f01 = n * src_h * src_w * C + src_h_idx * src_w * C + (src_w_idx + 1) * C + c; + f10 = n * src_h * src_w * C + (src_h_idx + 1) * src_w * C + src_w_idx * C + c; + f11 = n * src_h * src_w * C + (src_h_idx + 1) * src_w * C + (src_w_idx + 1) * C + c; + + float rs = lroundf(lerp2d( + (int)src_img[f00], (int)src_img[f01], (int)src_img[f10], (int)src_img[f11], centroid_h, + centroid_w)); + dst_img[index + c] = (unsigned char)rs; + } +} + +void resize_bilinear_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h, cudaStream_t stream) +{ + int N = d_w * d_h; + float stride_h = (float)s_h / (float)d_h; + float stride_w = (float)s_w / (float)d_w; + + resize_bilinear_kernel<<>>( + N, dst, src, d_w, s_h, s_w, stride_h, stride_w); +} + +__global__ void letterbox_kernel( + int N, unsigned char * dst_img, unsigned char * src_img, int dst_w, int src_w, int letter_bot, + int letter_right) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int W = dst_w; + + int w = index % W; + int h = index / W; + + index = (C * w) + (C * W * h); + int index2 = (C * w) + (C * src_w * h); +#pragma unroll + for (int c = 0; c < C; c++) { + dst_img[index + c] = + (w >= letter_right || h >= letter_bot) 
? (unsigned int)114 : src_img[index2 + c]; + } +} + +void letterbox_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h, cudaStream_t stream) +{ + int N = d_w * d_h; + const float scale = std::min(d_w / (float)s_w, d_h / (float)s_h); + int r_h = (int)(scale * s_h); + int r_w = (int)(scale * s_w); + + letterbox_kernel<<>>(N, dst, src, d_w, r_w, r_h, r_w); +} + +__global__ void nhwc_to_nchw_kernel( + int N, unsigned char * dst_img, unsigned char * src_img, int height, int width) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int x = index % width; + int y = index / width; + int src_index; + int dst_index; + for (int c = 0; c < C; c++) { + src_index = c + (C * x) + (C * width * y); + dst_index = x + (width * y) + (width * height * c); + dst_img[dst_index] = src_img[src_index]; + } +} + +void nhwc_to_nchw_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, cudaStream_t stream) +{ + int N = d_w * d_h; + nhwc_to_nchw_kernel<<>>(N, dst, src, d_h, d_w); +} + +__global__ void nchw_to_nhwc_kernel( + int N, unsigned char * dst, unsigned char * src, int height, int width) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int x = index % width; + int y = index / width; + int src_index; + int dst_index; + for (int c = 0; c < C; c++) { + // NHWC + dst_index = c + (C * x) + (C * width * y); + // NCHW + src_index = x + (width * y) + (width * height * c); + dst[dst_index] = src[src_index]; + } +} + +void nchw_to_nhwc_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, cudaStream_t stream) +{ + int N = d_w * d_h; + nchw_to_nhwc_kernel<<>>(N, dst, src, d_h, d_w); +} + +__global__ void to_float_kernel(int N, float * dst32, unsigned char * src8, int height, int width) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int x = index % width; + int y = index / width; + int dst_index; + for (int c = 0; c < C; c++) { + // NCHW + dst_index = x + (width * y) + (width * height * c); + dst32[dst_index] = (float)(src8[dst_index]); + } +} + +void to_float_gpu(float * dst32, unsigned char * src, int d_w, int d_h, cudaStream_t stream) +{ + int N = d_w * d_h; + to_float_kernel<<>>(N, dst32, src, d_h, d_w); +} + +__global__ void resize_bilinear_letterbox_kernel( + int N, unsigned char * dst_img, unsigned char * src_img, int dst_w, int src_h, int src_w, + float scale, int letter_bot, int letter_right) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int W = dst_w; + int n = 0; // index / (C*W*H); + + int w = index % W; + int h = index / W; + + float centroid_h, centroid_w; + centroid_h = scale * (float)(h + 0.5); + centroid_w = scale * (float)(w + 0.5); + + int f00, f01, f10, f11; + + int src_h_idx = (int)lroundf(centroid_h) - 1; + int src_w_idx = (int)lroundf(centroid_w) - 1; + if (src_h_idx < 0) { + src_h_idx = 0; + } + if (src_w_idx < 0) { + src_w_idx = 0; + } + if (src_h_idx >= src_h) { + src_h_idx = src_h - 1; + } + if (src_w_idx >= src_w) { + src_w_idx = src_w - 1; + } + + index = (C * w) + (C * W * h); +#pragma unroll + for (int c = 0; c < C; c++) { + f00 = n * src_h * src_w * C + src_h_idx * src_w * C + src_w_idx * C + c; + f01 = n * src_h * src_w * C + src_h_idx * src_w * C + (src_w_idx + 1) * C + c; + f10 = n * src_h * src_w * C + (src_h_idx + 1) * src_w * C + 
src_w_idx * C + c; + f11 = n * src_h * src_w * C + (src_h_idx + 1) * src_w * C + (src_w_idx + 1) * C + c; + + float rs = lroundf(lerp2d( + (int)src_img[f00], (int)src_img[f01], (int)src_img[f10], (int)src_img[f11], centroid_h, + centroid_w)); + dst_img[index + c] = (unsigned char)rs; + dst_img[index + c] = (h >= letter_bot) ? (unsigned int)114 : dst_img[index + c]; + dst_img[index + c] = (w >= letter_right) ? (unsigned int)114 : dst_img[index + c]; + } +} + +void resize_bilinear_letterbox_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h, cudaStream_t stream) +{ + int N = d_w * d_h; + const float scale = std::min(d_w / (float)s_w, d_h / (float)s_h); + int r_h = (int)(scale * s_h); + int r_w = (int)(scale * s_w); + resize_bilinear_letterbox_kernel<<>>( + N, dst, src, d_w, s_h, s_w, 1.0 / scale, r_h, r_w); +} + +__global__ void resize_bilinear_letterbox_nhwc_to_nchw32_kernel( + int N, float * dst_img, unsigned char * src_img, int dst_h, int dst_w, int src_h, int src_w, + float scale, int letter_bot, int letter_right, float norm) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int H = dst_h; + int W = dst_w; + + int w = index % W; + int h = index / W; + + float centroid_h, centroid_w; + centroid_h = scale * (float)(h + 0.5); + centroid_w = scale * (float)(w + 0.5); + + int f00, f01, f10, f11; + + int src_h_idx = lroundf(centroid_h) - 1; + int src_w_idx = lroundf(centroid_w) - 1; + src_h_idx = (src_h_idx < 0) ? 0 : src_h_idx; + src_h_idx = (src_h_idx >= (src_h - 1)) ? src_h - 2 : src_h_idx; + src_w_idx = (src_w_idx < 0) ? 0 : src_w_idx; + src_w_idx = (src_w_idx >= (src_w - 1)) ? src_w - 2 : src_w_idx; + + int stride = src_w * C; + +#pragma unroll + for (int c = 0; c < C; c++) { + f00 = src_h_idx * stride + src_w_idx * C + c; + f01 = src_h_idx * stride + (src_w_idx + 1) * C + c; + f10 = (src_h_idx + 1) * stride + src_w_idx * C + c; + f11 = (src_h_idx + 1) * stride + (src_w_idx + 1) * C + c; + + float rs = lroundf(lerp2d( + (int)src_img[f00], (int)src_img[f01], (int)src_img[f10], (int)src_img[f11], centroid_h, + centroid_w)); + + // NHCW + int dst_index = w + (W * h) + (W * H * c); + + dst_img[dst_index] = (float)rs; + dst_img[dst_index] = (h >= letter_bot) ? 114.0 : dst_img[dst_index]; + dst_img[dst_index] = (w >= letter_right) ? 
114.0 : dst_img[dst_index]; + dst_img[dst_index] *= norm; + } +} + +void resize_bilinear_letterbox_nhwc_to_nchw32_gpu( + float * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h, float norm, + cudaStream_t stream) +{ + int N = d_w * d_h; + const float scale = std::min(d_w / (float)s_w, d_h / (float)s_h); + int r_h = scale * s_h; + int r_w = scale * s_w; + + resize_bilinear_letterbox_nhwc_to_nchw32_kernel<<>>( + N, dst, src, d_h, d_w, s_h, s_w, 1.0 / scale, r_h, r_w, norm); +} + +__global__ void resize_bilinear_letterbox_nhwc_to_nchw32_batch_kernel( + int N, float * dst_img, unsigned char * src_img, int dst_h, int dst_w, int src_h, int src_w, + float scale_h, float scale_w, int letter_bot, int letter_right, int batch, const float * mean, + const float * std) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int H = dst_h; + int W = dst_w; + + int w = index % W; + int h = index / (W); + float centroid_h, centroid_w; + centroid_h = scale_h * (float)(h + 0.5); + centroid_w = scale_w * (float)(w + 0.5); + + int f00, f01, f10, f11; + + int src_h_idx = lroundf(centroid_h) - 1; + int src_w_idx = lroundf(centroid_w) - 1; + src_h_idx = (src_h_idx < 0) ? 0 : src_h_idx; + src_h_idx = (src_h_idx >= (src_h - 1)) ? src_h - 2 : src_h_idx; + src_w_idx = (src_w_idx < 0) ? 0 : src_w_idx; + src_w_idx = (src_w_idx >= (src_w - 1)) ? src_w - 2 : src_w_idx; + + int stride = src_w * C; + int b_stride = src_h * src_w * C; + +#pragma unroll + for (int b = 0; b < batch; b++) { + for (int c = 0; c < C; c++) { + // NHWC + f00 = src_h_idx * stride + src_w_idx * C + c + b * b_stride; + f01 = src_h_idx * stride + (src_w_idx + 1) * C + c + b * b_stride; + f10 = (src_h_idx + 1) * stride + src_w_idx * C + c + b * b_stride; + f11 = (src_h_idx + 1) * stride + (src_w_idx + 1) * C + c + b * b_stride; + + float rs = lroundf(lerp2d( + (int)src_img[f00], (int)src_img[f01], (int)src_img[f10], (int)src_img[f11], centroid_h, + centroid_w)); + + // NCHW + int dst_index = w + (W * h) + (W * H * c) + b * (W * H * C); + + dst_img[dst_index] = (float)rs; + dst_img[dst_index] = (h >= letter_bot) ? 114.0 : dst_img[dst_index]; + dst_img[dst_index] = (w >= letter_right) ? 
114.0 : dst_img[dst_index]; + // dst_img[dst_index] *= norm; + dst_img[dst_index] -= mean[c]; + dst_img[dst_index] /= std[c]; + } + } +} + +void resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( + float * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h, int batch, + const float * mean, const float * std, cudaStream_t stream) +{ + int N = d_w * d_h; + + const float scale_h = d_h / (float)s_h; + const float scale_w = d_w / (float)s_w; + + int r_h = scale_h * s_h; + int r_w = scale_w * s_w; + + float *d_mean, *d_std; + cudaMalloc(&d_mean, 3 * sizeof(float)); + cudaMalloc(&d_std, 3 * sizeof(float)); + cudaMemcpyAsync(d_mean, mean, 3 * sizeof(float), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_std, std, 3 * sizeof(float), cudaMemcpyHostToDevice, stream); + + resize_bilinear_letterbox_nhwc_to_nchw32_batch_kernel<<>>( + N, dst, src, d_h, d_w, s_h, s_w, 1.0 / scale_h, 1.0 / scale_w, r_h, r_w, batch, d_mean, d_std); +} + +__global__ void argmax_gpu_kernel( + int N, unsigned char * dst, float * src, int dst_h, int dst_w, int src_c, int src_h, int src_w, + int batch) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int w = index % dst_w; + int h = index / (dst_w); + + for (int b = 0; b < batch; b++) { + float max_prob = 0.0; + int max_index = 0; + int dst_index = w + dst_w * h + b * dst_h * dst_w; + for (int c = 0; c < src_c; c++) { + int src_index = w + src_w * h + c * src_h * src_w + b * src_c * src_h * src_w; + max_index = max_prob < src[src_index] ? c : max_index; + max_prob = max_prob < src[src_index] ? src[src_index] : max_prob; + } + dst[dst_index] = max_index; + } +} + +void argmax_gpu( + unsigned char * dst, float * src, int d_w, int d_h, int s_w, int s_h, int s_c, int batch, + cudaStream_t stream) +{ + int N = d_w * d_h; + argmax_gpu_kernel<<>>( + N, dst, src, d_h, d_w, s_c, s_h, s_w, batch); +} + +} // namespace autoware::tensorrt_rtmdet diff --git a/perception/autoware_tensorrt_rtmdet/src/tensorrt_rtmdet.cpp b/perception/autoware_tensorrt_rtmdet/src/tensorrt_rtmdet.cpp new file mode 100644 index 0000000000000..54f5aca0d9c1c --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/src/tensorrt_rtmdet.cpp @@ -0,0 +1,368 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/tensorrt_rtmdet/tensorrt_rtmdet.hpp" + +#include "autoware/tensorrt_rtmdet/calibrator.hpp" +#include "autoware/tensorrt_rtmdet/preprocess.hpp" + +#include +#include +#include +#include +#include +#include +#include + +/** + * @brief Load calibration image list from a text file. + * + * @param filename Path to the text file. + * @return List of image paths. + */ +std::vector load_calibration_image_list(const std::string & filename) +{ + if (filename.empty()) { + RCLCPP_ERROR( + rclcpp::get_logger("autoware_tensorrt_rtmdet"), + "Calibration image list is empty. 
Please set calibration_image_list_path."); + return {}; + } + if (!std::experimental::filesystem::exists(std::experimental::filesystem::path(filename))) { + RCLCPP_ERROR( + rclcpp::get_logger("autoware_tensorrt_rtmdet"), "failed to open %s", filename.c_str()); + return {}; + } + + std::ifstream txt_file(filename); + + std::string line; + std::vector image_list; + while (std::getline(txt_file, line)) { + image_list.push_back(line); + } + + txt_file.close(); + return image_list; +} + +namespace autoware::tensorrt_rtmdet +{ +TrtRTMDet::TrtRTMDet( + const std::string & model_path, const std::string & precision, const ColorMap & color_map, + const float score_threshold, const float nms_threshold, const float mask_threshold, + const autoware::tensorrt_common::BuildConfig & build_config, + const std::string & calibration_image_list_file_path, const double norm_factor, + const std::vector & mean, const std::vector & std, + [[maybe_unused]] const std::string & cache_dir, + const autoware::tensorrt_common::BatchConfig & batch_config, const size_t max_workspace_size, + const std::vector & plugin_paths) +: score_threshold_{score_threshold}, + nms_threshold_{nms_threshold}, + mask_threshold_{mask_threshold}, + batch_size_{batch_config.at(1)}, + norm_factor_{norm_factor}, + mean_{mean}, + std_{std}, + color_map_{color_map} +{ + src_width_ = -1; + src_height_ = -1; + scale_width_ = -1; + scale_height_ = -1; + + if (precision == "int8") { + std::vector calibration_images = + load_calibration_image_list(calibration_image_list_file_path); + + int max_batch_size = batch_config.at(2); + nvinfer1::Dims input_dims = autoware::tensorrt_common::get_input_dims(model_path); + tensorrt_rtmdet::ImageStream stream(max_batch_size, input_dims, calibration_images); + + fs::path calibration_table{model_path}; + std::string ext = "EntropyV2-calibration.table"; + calibration_table.replace_extension(ext); + + fs::path histogram_table{model_path}; + ext = "histogram.table"; + histogram_table.replace_extension(ext); + + std::unique_ptr calibrator; + calibrator = std::make_unique( + stream, calibration_table, mean_, std_); + + trt_common_ = std::make_unique( + model_path, precision, std::move(calibrator), batch_config, max_workspace_size, build_config, + plugin_paths); + } else { + trt_common_ = std::make_unique( + model_path, precision, nullptr, batch_config, max_workspace_size, build_config, plugin_paths); + } + trt_common_->setup(); + + if (!trt_common_->isInitialized()) { + return; + } + + const auto input_dims = trt_common_->getBindingDimensions(0); + const auto out_scores_dims = trt_common_->getBindingDimensions(3); + + max_detections_ = out_scores_dims.d[1]; + model_input_height_ = input_dims.d[2]; + model_input_width_ = input_dims.d[3]; + + input_d_ = autoware::cuda_utils::make_unique( + batch_size_ * input_dims.d[1] * input_dims.d[2] * input_dims.d[3]); + out_detections_d_ = autoware::cuda_utils::make_unique(batch_size_ * max_detections_ * 5); + out_labels_d_ = autoware::cuda_utils::make_unique(batch_size_ * max_detections_); + out_masks_d_ = autoware::cuda_utils::make_unique( + batch_size_ * max_detections_ * model_input_width_ * model_input_height_); + + out_detections_h_ = std::make_unique(batch_size_ * max_detections_ * 5); + out_labels_h_ = std::make_unique(batch_size_ * max_detections_); + out_masks_h_ = std::make_unique( + batch_size_ * max_detections_ * model_input_width_ * model_input_height_); + +#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8050 + std::vector buffers = { + 
+
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8050
+  std::vector<void *> buffers = {
+    input_d_.get(), out_detections_d_.get(), out_labels_d_.get(), out_masks_d_.get()};
+  trt_common_->setupBindings(buffers);
+#endif
+}
+
+TrtRTMDet::~TrtRTMDet() noexcept
+{
+  if (image_buf_h_) {
+    image_buf_h_.reset();
+  }
+  if (image_buf_d_) {
+    image_buf_d_.reset();
+  }
+}
+
+void TrtRTMDet::print_profiling()
+{
+  trt_common_->printProfiling();
+}
+
+void TrtRTMDet::preprocess_gpu(const std::vector<cv::Mat> & images)
+{
+  const auto batch_size = images.size();
+  auto input_dims = trt_common_->getBindingDimensions(0);
+
+  input_dims.d[0] = static_cast<int32_t>(batch_size);
+  for (const auto & image : images) {
+    // If the size of the source input has changed, the buffers must be reallocated.
+    int width = image.cols;
+    int height = image.rows;
+    if (src_width_ != -1 || src_height_ != -1) {
+      if (width != src_width_ || height != src_height_) {
+        // Free cuda memory to reallocate
+        if (image_buf_h_) {
+          image_buf_h_.reset();
+        }
+        if (image_buf_d_) {
+          image_buf_d_.reset();
+        }
+      }
+    }
+    src_width_ = width;
+    src_height_ = height;
+  }
+  if (!image_buf_h_) {
+    trt_common_->setBindingDimensions(0, input_dims);
+    scale_width_ = 0;
+    scale_height_ = 0;
+  }
+  const auto input_height = static_cast<float>(input_dims.d[2]);
+  const auto input_width = static_cast<float>(input_dims.d[3]);
+  int b = 0;
+  for (const auto & image : images) {
+    if (!image_buf_h_) {
+      scale_width_ = input_width / static_cast<float>(image.cols);
+      scale_height_ = input_height / static_cast<float>(image.rows);
+      image_buf_h_ = autoware::cuda_utils::make_unique_host<unsigned char[]>(
+        image.cols * image.rows * 3 * batch_size, cudaHostAllocWriteCombined);
+      image_buf_d_ = autoware::cuda_utils::make_unique<unsigned char[]>(
+        image.cols * image.rows * 3 * batch_size);
+    }
+    int index = b * image.cols * image.rows * 3;
+    // Copy into pinned memory
+    memcpy(
+      image_buf_h_.get() + index, &image.data[0],
+      image.cols * image.rows * 3 * sizeof(unsigned char));
+    b++;
+  }
+  // Copy into device memory
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    image_buf_d_.get(), image_buf_h_.get(),
+    images[0].cols * images[0].rows * 3 * batch_size * sizeof(unsigned char),
+    cudaMemcpyHostToDevice, *stream_));
+  // Preprocess on GPU
+  resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
+    input_d_.get(), image_buf_d_.get(), static_cast<int>(input_width),
+    static_cast<int>(input_height), images[0].cols, images[0].rows,
+    static_cast<int>(batch_size), mean_.data(), std_.data(), *stream_);
+}
+
+bool TrtRTMDet::do_inference(
+  const std::vector<cv::Mat> & images, ObjectArrays & objects, cv::Mat & mask,
+  std::vector<uint8_t> & class_ids)
+{
+  if (!trt_common_->isInitialized()) {
+    return false;
+  }
+  preprocess_gpu(images);
+
+  return feedforward(images, objects, mask, class_ids);
+}
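A minimal caller sketch for `do_inference` (illustrative, not part of the package; it assumes an already constructed `TrtRTMDet` and the signatures used in this file):

```cpp
#include "autoware/tensorrt_rtmdet/tensorrt_rtmdet.hpp"

#include <opencv2/opencv.hpp>

#include <cstdint>
#include <cstdio>
#include <vector>

// Run one frame through the detector and walk the per-image results.
void run_single_frame(autoware::tensorrt_rtmdet::TrtRTMDet & detector, const cv::Mat & frame)
{
  autoware::tensorrt_rtmdet::ObjectArrays objects;  // one ObjectArray per input image
  cv::Mat mask;                                     // CV_8UC1, model-input resolution
  std::vector<uint8_t> class_ids;                   // one label id per mask intensity

  if (!detector.do_inference({frame}, objects, mask, class_ids)) {
    return;  // engine not initialized or inference failed
  }
  for (const auto & object : objects.at(0)) {
    // x1/y1/x2/y2 are already rescaled back to the source image resolution.
    std::printf("class %d, score %.2f\n", object.class_id, object.score);
  }
}
```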
+
+bool TrtRTMDet::feedforward(
+  const std::vector<cv::Mat> & images, ObjectArrays & objects, cv::Mat & mask,
+  std::vector<uint8_t> & class_ids)
+{
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8050
+  trt_common_->enqueueV3(*stream_);
+#else
+  std::vector<void *> buffers = {
+    input_d_.get(), out_detections_d_.get(), out_labels_d_.get(), out_masks_d_.get()};
+
+  trt_common_->enqueueV2(buffers.data(), *stream_, nullptr);
+#endif
+
+  const auto batch_size = images.size();
+  out_detections_h_.reset(new float[batch_size_ * max_detections_ * 5]);
+  out_labels_h_.reset(new int32_t[batch_size_ * max_detections_]);
+  out_masks_h_.reset(
+    new float[batch_size_ * max_detections_ * model_input_width_ * model_input_height_]);
+
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_detections_h_.get(), out_detections_d_.get(),
+    sizeof(float) * batch_size_ * max_detections_ * 5, cudaMemcpyDeviceToHost, *stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_labels_h_.get(), out_labels_d_.get(), sizeof(int32_t) * batch_size_ * max_detections_,
+    cudaMemcpyDeviceToHost, *stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_masks_h_.get(), out_masks_d_.get(),
+    sizeof(float) * batch_size_ * max_detections_ * model_input_width_ * model_input_height_,
+    cudaMemcpyDeviceToHost, *stream_));
+
+  cudaStreamSynchronize(*stream_);
+
+  // POST PROCESSING
+  objects.clear();
+  for (size_t batch = 0; batch < batch_size; ++batch) {
+    ObjectArray object_array;
+    for (uint32_t index = 0; index < max_detections_; ++index) {
+      // Detections are sorted by score, so the loop can stop at the first entry
+      // below the threshold.
+      if (
+        out_detections_h_[(batch * max_detections_ * 5) + ((5 * index) + 4)] < score_threshold_) {
+        break;
+      }
+
+      Object object{};
+      object.mask_index = index;
+      object.class_id = out_labels_h_[(batch * max_detections_) + index];
+      object.x1 = static_cast<uint32_t>(
+        out_detections_h_[(batch * max_detections_ * 5) + ((5 * index) + 0)] / scale_width_);
+      object.y1 = static_cast<uint32_t>(
+        out_detections_h_[(batch * max_detections_ * 5) + ((5 * index) + 1)] / scale_height_);
+      object.x2 = static_cast<uint32_t>(
+        out_detections_h_[(batch * max_detections_ * 5) + ((5 * index) + 2)] / scale_width_);
+      object.y2 = static_cast<uint32_t>(
+        out_detections_h_[(batch * max_detections_ * 5) + ((5 * index) + 3)] / scale_height_);
+      object.score = out_detections_h_[(batch * max_detections_ * 5) + ((5 * index) + 4)];
+      object_array.push_back(object);
+    }
+    ObjectArray nms_objects;
+    nms_sorted_bboxes(object_array, nms_objects, nms_threshold_);
+
+    objects.push_back(nms_objects);
+  }
+
+  // Create an instance segmentation mask.
+  // The mask is an image with the same dimensions as the model input image,
+  // where each non-zero pixel holds a 1-based instance intensity that indexes
+  // into the class_ids array, which stores the class IDs.
+  mask = cv::Mat(
+    static_cast<int>(model_input_height_), static_cast<int>(model_input_width_), CV_8UC1,
+    cv::Scalar(0));
+  uint8_t pixel_intensity = 1;  // 0 is reserved for background
+  for (size_t batch = 0; batch < batch_size; ++batch) {
+    for (const auto & object : objects.at(batch)) {
+      cv::Mat object_mask(
+        static_cast<int>(model_input_height_), static_cast<int>(model_input_width_), CV_32F,
+        &out_masks_h_
+          [(batch * max_detections_ * model_input_width_ * model_input_height_) +
+           (object.mask_index * model_input_width_ * model_input_height_)]);
+      double min_val = 0;
+      double max_val = 0;
+      cv::minMaxLoc(object_mask, &min_val, &max_val);
+      object_mask.convertTo(
+        object_mask, CV_8U, 255.0 / (max_val - min_val), -min_val * 255.0 / (max_val - min_val));
+
+      auto process_pixel = [&]([[maybe_unused]] uint8_t & pixel, const int * position) -> void {
+        int i = position[0];
+        int j = position[1];
+
+        if (object_mask.at<uint8_t>(i, j) > static_cast<uint8_t>(255 * mask_threshold_)) {
+          mask.at<uint8_t>(i, j) = pixel_intensity;
+        }
+      };
+      mask.forEach<uint8_t>(process_pixel);
+      class_ids.push_back(color_map_.at(object.class_id).label_id);
+      pixel_intensity++;
+    }
+  }
+  return true;
+}
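The detection tensor copied back above is flat: five floats per detection, `max_detections_` detections per batch entry. A hedged decode sketch of that layout (the struct and function names are illustrative, not package API):

```cpp
#include <cstddef>

// Decode one row of the flattened detection tensor. Layout assumed from the
// indexing in feedforward: [x1, y1, x2, y2, score] per detection, in
// model-input pixel coordinates (before division by the letterbox scale).
struct RawDetection
{
  float x1, y1, x2, y2, score;
};

RawDetection decode_detection(
  const float * out_detections, std::size_t max_detections, std::size_t batch, std::size_t index)
{
  const float * row = out_detections + (batch * max_detections * 5) + (5 * index);
  return RawDetection{row[0], row[1], row[2], row[3], row[4]};
}
```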
+
+float TrtRTMDet::intersection_over_union(const Object & a, const Object & b)
+{
+  uint32_t x_left = std::max(a.x1, b.x1);
+  uint32_t y_top = std::max(a.y1, b.y1);
+  uint32_t x_right = std::min(a.x2, b.x2);
+  uint32_t y_bottom = std::min(a.y2, b.y2);
+
+  if (x_right < x_left || y_bottom < y_top) {
+    return 0.0;
+  }
+
+  float intersection_area =
+    static_cast<float>((x_right - x_left + 1) * (y_bottom - y_top + 1));
+  float a_area = static_cast<float>((a.x2 - a.x1 + 1) * (a.y2 - a.y1 + 1));
+  float b_area = static_cast<float>((b.x2 - b.x1 + 1) * (b.y2 - b.y1 + 1));
+
+  return intersection_area / (a_area + b_area - intersection_area);
+}
+
+void TrtRTMDet::nms_sorted_bboxes(
+  const ObjectArray & input_objects, ObjectArray & output_objects, const float & nms_threshold)
+{
+  std::vector<bool> suppressed(input_objects.size(), false);
+
+  for (size_t i = 0; i < input_objects.size(); ++i) {
+    if (suppressed.at(i)) continue;
+
+    const Object & a = input_objects.at(i);
+    output_objects.push_back(a);
+
+    for (size_t j = i + 1; j < input_objects.size(); ++j) {
+      if (suppressed.at(j)) continue;
+
+      const Object & b = input_objects.at(j);
+      if (intersection_over_union(a, b) > nms_threshold) {
+        suppressed.at(j) = true;
+      }
+    }
+  }
+}
+}  // namespace autoware::tensorrt_rtmdet
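Because the box coordinates are inclusive pixel indices, the areas use the +1 convention. As a worked example with the boxes used in `TestIntersectionOverUnion` below, (10,10,50,50) and (20,20,60,60): intersection = (50 − 20 + 1)² = 961, each box area = 41² = 1681, union = 1681 + 1681 − 961 = 2401, so IoU = 961 / 2401 ≈ 0.40, which is the value the test asserts.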
diff --git a/perception/autoware_tensorrt_rtmdet/src/tensorrt_rtmdet_node.cpp b/perception/autoware_tensorrt_rtmdet/src/tensorrt_rtmdet_node.cpp
new file mode 100644
index 0000000000000..621c39d4fd190
--- /dev/null
+++ b/perception/autoware_tensorrt_rtmdet/src/tensorrt_rtmdet_node.cpp
@@ -0,0 +1,301 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/tensorrt_rtmdet/tensorrt_rtmdet_node.hpp"
+
+#include "autoware_perception_msgs/msg/object_classification.hpp"
+#include "tier4_perception_msgs/msg/detected_object_with_feature.hpp"
+
+#include <experimental/filesystem>
+
+#include <fstream>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace autoware::tensorrt_rtmdet
+{
+TrtRTMDetNode::TrtRTMDetNode(const rclcpp::NodeOptions & node_options)
+: Node("tensorrt_rtmdet", node_options),
+  is_publish_color_mask_(declare_parameter<bool>("is_publish_color_mask")),
+  is_publish_debug_image_(declare_parameter<bool>("is_publish_debug_image")),
+  is_apply_erosion_(declare_parameter<bool>("apply_erosion"))
+{
+  {
+    stop_watch_ptr_ =
+      std::make_unique<autoware::universe_utils::StopWatch<std::chrono::milliseconds>>();
+    debug_publisher_ =
+      std::make_unique<autoware::universe_utils::DebugPublisher>(this, this->get_name());
+    stop_watch_ptr_->tic("cyclic_time");
+    stop_watch_ptr_->tic("processing_time");
+  }
+  using std::placeholders::_1;
+  using std::chrono_literals::operator""ms;
+
+  std::string model_path = declare_parameter<std::string>("model_path");
+  std::string color_map_path = declare_parameter<std::string>("color_map_path");
+  std::string precision = declare_parameter<std::string>("precision");
+  std::vector<double> mean = declare_parameter<std::vector<double>>("input_image_mean");
+  std::vector<double> std = declare_parameter<std::vector<double>>("input_image_std_dev");
+  double score_threshold = declare_parameter<double>("score_threshold");
+  double nms_threshold = declare_parameter<double>("nms_threshold");
+  double mask_threshold = declare_parameter<double>("mask_threshold");
+  int64_t dla_core_id = declare_parameter<int64_t>("dla_core_id");
+  bool quantize_first_layer = declare_parameter<bool>("quantize_first_layer");
+  bool quantize_last_layer = declare_parameter<bool>("quantize_last_layer");
+  bool profile_per_layer = declare_parameter<bool>("profile_per_layer");
+  double clip_value = declare_parameter<double>("clip_value");
+  std::string calibration_image_list_path =
+    declare_parameter<std::string>("calibration_image_list_path");
+  std::vector<std::string> plugin_paths =
+    declare_parameter<std::vector<std::string>>("plugin_paths");
+
+  color_map_ = read_color_map_file(color_map_path);
+
+  autoware::tensorrt_common::BuildConfig build_config(
+    "Entropy", dla_core_id, quantize_first_layer, quantize_last_layer, profile_per_layer,
+    clip_value);
+
+  const double norm_factor = 1.0;
+  const std::string cache_dir;
+  const autoware::tensorrt_common::BatchConfig batch_config{1, 1, 1};
+  const size_t max_workspace_size = (1u << 30u);
+
+  mean_ = std::vector<float>(mean.begin(), mean.end());
+  std_ = std::vector<float>(std.begin(), std.end());
+
+  trt_rtmdet_ = std::make_unique<TrtRTMDet>(
+    model_path, precision, color_map_, score_threshold, nms_threshold, mask_threshold,
+    build_config, calibration_image_list_path, norm_factor, mean_, std_, cache_dir, batch_config,
+    max_workspace_size, plugin_paths);
+
+  timer_ =
+    rclcpp::create_timer(this, get_clock(), 100ms, std::bind(&TrtRTMDetNode::on_connect, this));
+
+  objects_pub_ = this->create_publisher<tier4_perception_msgs::msg::DetectedObjectsWithFeature>(
+    "~/out/objects", 1);
+  mask_pub_ = this->create_publisher<autoware_internal_perception_msgs::msg::SegmentationMask>(
+    "~/out/mask", 1);
+
+  color_mask_pub_ = image_transport::create_publisher(this, "~/out/color_mask");
+  debug_image_pub_ = image_transport::create_publisher(this, "~/out/debug_image");
+
+  if (declare_parameter("build_only", false)) {
+    RCLCPP_INFO(this->get_logger(), "TensorRT engine file was built. Shutting down.");
+    rclcpp::shutdown();
+  }
+}
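The 100 ms timer drives a lazy-subscription pattern: `on_connect` below subscribes to the image topic only while at least one downstream consumer exists, so the node does no inference when nothing listens. A minimal, self-contained sketch of the same pattern (a hypothetical node, not part of this package):

```cpp
#include <rclcpp/rclcpp.hpp>
#include <std_msgs/msg/string.hpp>

using namespace std::chrono_literals;

// Poll subscriber counts on a timer; create the input subscription only while
// the output has consumers, and drop it otherwise.
class LazySubNode : public rclcpp::Node
{
public:
  LazySubNode() : Node("lazy_sub_example")
  {
    pub_ = create_publisher<std_msgs::msg::String>("out", 1);
    timer_ = create_wall_timer(100ms, [this] {
      if (pub_->get_subscription_count() == 0) {
        sub_.reset();  // stop consuming input while nobody listens
      } else if (!sub_) {
        sub_ = create_subscription<std_msgs::msg::String>(
          "in", 1, [this](std_msgs::msg::String::ConstSharedPtr msg) { pub_->publish(*msg); });
      }
    });
  }

private:
  rclcpp::Publisher<std_msgs::msg::String>::SharedPtr pub_;
  rclcpp::Subscription<std_msgs::msg::String>::SharedPtr sub_;
  rclcpp::TimerBase::SharedPtr timer_;
};
```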
+
+void TrtRTMDetNode::on_connect()
+{
+  using std::placeholders::_1;
+  if (
+    debug_image_pub_.getNumSubscribers() == 0 && mask_pub_->get_subscription_count() == 0 &&
+    mask_pub_->get_intra_process_subscription_count() == 0 &&
+    color_mask_pub_.getNumSubscribers() == 0 && objects_pub_->get_subscription_count() == 0 &&
+    objects_pub_->get_intra_process_subscription_count() == 0) {
+    image_sub_.shutdown();
+  } else if (!image_sub_) {
+    image_sub_ = image_transport::create_subscription(
+      this, "~/in/image", std::bind(&TrtRTMDetNode::on_image, this, _1), "raw",
+      rmw_qos_profile_sensor_data);
+  }
+}
+
+void TrtRTMDetNode::on_image(const sensor_msgs::msg::Image::ConstSharedPtr msg)
+{
+  stop_watch_ptr_->toc("processing_time", true);
+
+  cv_bridge::CvImagePtr in_image_ptr;
+  try {
+    in_image_ptr = cv_bridge::toCvCopy(msg, msg->encoding);
+  } catch (cv_bridge::Exception & e) {
+    RCLCPP_ERROR(this->get_logger(), "cv_bridge exception: %s", e.what());
+    return;
+  }
+
+  tensorrt_rtmdet::ObjectArrays objects;
+  cv::Mat mask;
+  std::vector<uint8_t> class_ids;
+  if (!trt_rtmdet_->do_inference({in_image_ptr->image}, objects, mask, class_ids)) {
+    RCLCPP_WARN(this->get_logger(), "Failed to run inference");
+    return;
+  }
+
+  tier4_perception_msgs::msg::DetectedObjectsWithFeature detected_objects_with_feature;
+  for (const auto & object : objects.at(0)) {
+    tier4_perception_msgs::msg::DetectedObjectWithFeature detected_object_with_feature;
+    detected_object_with_feature.feature.roi.width = object.x2 - object.x1;
+    detected_object_with_feature.feature.roi.height = object.y2 - object.y1;
+    detected_object_with_feature.feature.roi.x_offset = object.x1;
+    detected_object_with_feature.feature.roi.y_offset = object.y1;
+    autoware_perception_msgs::msg::ObjectClassification classification;
+    classification.label = color_map_[object.class_id].label_id;
+    classification.probability = object.score;
+    detected_object_with_feature.object.classification =
+      std::vector<autoware_perception_msgs::msg::ObjectClassification>{classification};
+
+    detected_objects_with_feature.feature_objects.push_back(detected_object_with_feature);
+  }
+  detected_objects_with_feature.header = msg->header;
+  objects_pub_->publish(detected_objects_with_feature);
+
+  if (!mask.empty() && !class_ids.empty()) {
+    if (is_apply_erosion_) apply_erosion(mask, 3);
+    sensor_msgs::msg::Image::SharedPtr mask_image =
+      cv_bridge::CvImage(std_msgs::msg::Header(), sensor_msgs::image_encodings::MONO8, mask)
+        .toImageMsg();
+    std::vector<autoware_perception_msgs::msg::ObjectClassification> classification;
+    for (uint8_t class_id : class_ids) {
+      autoware_perception_msgs::msg::ObjectClassification object_classification;
+      object_classification.label = class_id;
+      object_classification.probability = 1.0;
+      classification.push_back(object_classification);
+    }
+    autoware_internal_perception_msgs::msg::SegmentationMask mask_msg;
+    mask_msg.classification = classification;
+    mask_msg.image = *mask_image;
+    mask_msg.header = msg->header;
+    mask_pub_->publish(mask_msg);
+  }
+
+  if (is_publish_color_mask_) {
+    cv::Mat color_mask = cv::Mat::zeros(mask.rows, mask.cols, CV_8UC3);
+    get_colorized_mask(color_map_, mask, color_mask);
+    sensor_msgs::msg::Image::SharedPtr color_mask_msg =
+      cv_bridge::CvImage(std_msgs::msg::Header(), sensor_msgs::image_encodings::BGR8, color_mask)
+        .toImageMsg();
+    color_mask_msg->header = msg->header;
+    color_mask_pub_.publish(color_mask_msg);
+  }
+
+  if (is_publish_debug_image_) {
+    cv::Mat debug_image = in_image_ptr->image.clone();
+    draw_debug_image(debug_image, mask, objects, color_map_);
+    sensor_msgs::msg::Image::SharedPtr debug_image_msg =
+      cv_bridge::CvImage(std_msgs::msg::Header(), sensor_msgs::image_encodings::BGR8, debug_image)
+        .toImageMsg();
+    debug_image_msg->header = msg->header;
+    debug_image_pub_.publish(debug_image_msg);
+  }
+
+  if (debug_publisher_) {
+    const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true);
+    const double cyclic_time_ms = stop_watch_ptr_->toc("cyclic_time", true);
+    const double pipeline_latency_ms =
+      std::chrono::duration<double, std::milli>(
+        std::chrono::nanoseconds((this->get_clock()->now() - msg->header.stamp).nanoseconds()))
+        .count();
+    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
+      "debug/cyclic_time_ms", cyclic_time_ms);
+    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
+      "debug/processing_time_ms", processing_time_ms);
+    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
+      "debug/pipeline_latency_ms", pipeline_latency_ms);
+  }
+}
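A note on consuming the published mask: pixel value 0 is background, and a value k ≥ 1 refers to the k-th entry of the `classification` array attached to the `SegmentationMask` message, because `on_image` pushes one classification per instance in the same order the intensities were assigned. An illustrative decoder (not part of the package):

```cpp
#include <opencv2/core.hpp>

#include <cstdint>
#include <vector>

// Recover the label id of a mask pixel. `labels` mirrors the per-instance
// label ids carried in the SegmentationMask classification array.
int class_of_pixel(
  const cv::Mat & mono8_mask, const std::vector<uint8_t> & labels, int row, int col)
{
  const uint8_t intensity = mono8_mask.at<uint8_t>(row, col);
  if (intensity == 0) {
    return -1;  // background
  }
  return labels.at(intensity - 1);  // label id of the detected instance
}
```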
+
+ColorMap TrtRTMDetNode::read_color_map_file(const std::string & color_map_path)
+{
+  ColorMap color_map;
+
+  if (!std::experimental::filesystem::exists(
+        std::experimental::filesystem::path(color_map_path))) {
+    RCLCPP_ERROR(
+      rclcpp::get_logger("tensorrt_rtmdet_node"), "failed to open %s", color_map_path.c_str());
+    assert(0);
+  }
+
+  std::ifstream file(color_map_path);
+  std::string line;
+
+  // Skip the first line since it is the header
+  std::getline(file, line);
+
+  while (std::getline(file, line)) {
+    if (line.empty()) {
+      continue;
+    }
+
+    auto split_string = [](std::string & str, char delimiter) -> std::vector<std::string> {
+      std::vector<std::string> result;
+      std::stringstream ss(str);
+      std::string item;
+
+      while (std::getline(ss, item, delimiter)) {
+        result.push_back(item);
+      }
+
+      return result;
+    };
+    std::vector<std::string> tokens = split_string(line, ',');
+
+    LabelColor label_color;
+    label_color.class_name = tokens.at(1);
+    label_color.color[0] = std::stoi(tokens.at(2));
+    label_color.color[1] = std::stoi(tokens.at(3));
+    label_color.color[2] = std::stoi(tokens.at(4));
+    label_color.label_id = std::stoi(tokens.at(5));
+    color_map[std::stoi(tokens.at(0))] = label_color;
+  }
+
+  return color_map;
+}
+
+void TrtRTMDetNode::draw_debug_image(
+  cv::Mat & image, [[maybe_unused]] const cv::Mat & mask,
+  const tensorrt_rtmdet::ObjectArrays & objects, const tensorrt_rtmdet::ColorMap & color_map)
+{
+  // TODO(StepTurtle): add mask to debug image
+
+  for (const auto & object : objects.at(0)) {
+    // Draw the bounding box
+    cv::rectangle(
+      image, cv::Point(static_cast<int>(object.x1), static_cast<int>(object.y1)),
+      cv::Point(static_cast<int>(object.x2), static_cast<int>(object.y2)),
+      color_map.at(static_cast<uint8_t>(object.class_id)).color, 2);
+    // Write the class name
+    cv::putText(
+      image, color_map.at(static_cast<uint8_t>(object.class_id)).class_name,
+      cv::Point(static_cast<int>(object.x1), static_cast<int>(object.y1)),
+      cv::FONT_HERSHEY_SIMPLEX, 1, color_map.at(static_cast<uint8_t>(object.class_id)).color, 2);
+  }
+}
+
+void TrtRTMDetNode::get_colorized_mask(
+  const ColorMap & color_map, const cv::Mat & mask, cv::Mat & color_mask)
+{
+  int width = mask.cols;
+  int height = mask.rows;
+  if ((color_mask.cols != mask.cols) || (color_mask.rows != mask.rows)) {
+    throw std::runtime_error("Input and output images have different sizes.");
+  }
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      unsigned char id = mask.at<unsigned char>(y, x);
+      color_mask.at<cv::Vec3b>(y, x) = color_map.at(id).color;
+    }
+  }
+}
+
+void TrtRTMDetNode::apply_erosion(cv::Mat & mask, const int erosion_size)
+{
+  cv::Mat element = cv::getStructuringElement(
+    cv::MORPH_RECT, cv::Size(2 * erosion_size + 1, 2 * erosion_size + 1),
+    cv::Point(erosion_size, erosion_size));
+  cv::erode(mask, mask, element);
+}
+}  // namespace autoware::tensorrt_rtmdet
+
+#include "rclcpp_components/register_node_macro.hpp"
+RCLCPP_COMPONENTS_REGISTER_NODE(autoware::tensorrt_rtmdet::TrtRTMDetNode)
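The `label` column in the color map CSV below holds the Autoware `ObjectClassification` constant that each model (COCO) class index is mapped to, which is why the sample rows end in 7, 6, and 1. A small compile-time check of that correspondence (illustrative, not part of the package):

```cpp
#include <autoware_perception_msgs/msg/object_classification.hpp>

// The three sample rows of test_color_map.csv line up with the message enum:
// person -> PEDESTRIAN, bicycle -> BICYCLE, car -> CAR.
using autoware_perception_msgs::msg::ObjectClassification;
static_assert(ObjectClassification::CAR == 1);
static_assert(ObjectClassification::BICYCLE == 6);
static_assert(ObjectClassification::PEDESTRIAN == 7);
```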
diff --git a/perception/autoware_tensorrt_rtmdet/test/data/test_color_map.csv b/perception/autoware_tensorrt_rtmdet/test/data/test_color_map.csv
new file mode 100644
index 0000000000000..0fb9eaa709f1a
--- /dev/null
+++ b/perception/autoware_tensorrt_rtmdet/test/data/test_color_map.csv
@@ -0,0 +1,4 @@
+id,name,r,g,b,label
+0,person,220,20,60,7
+1,bicycle,0,0,142,6
+2,car,0,0,255,1
diff --git a/perception/autoware_tensorrt_rtmdet/test/data/test_image.jpg b/perception/autoware_tensorrt_rtmdet/test/data/test_image.jpg
new file mode 100644
index 0000000000000..c2f3cba777d43
Binary files /dev/null and b/perception/autoware_tensorrt_rtmdet/test/data/test_image.jpg differ
diff --git a/perception/autoware_tensorrt_rtmdet/test/test_rtmdet.cpp b/perception/autoware_tensorrt_rtmdet/test/test_rtmdet.cpp
new file mode 100644
index 0000000000000..aaa45ca7fb7e0
--- /dev/null
+++ b/perception/autoware_tensorrt_rtmdet/test/test_rtmdet.cpp
@@ -0,0 +1,237 @@
+// Copyright 2024 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// cspell: ignore libtrt
+#include "ament_index_cpp/get_package_prefix.hpp"
+#include "ament_index_cpp/get_package_share_directory.hpp"
+#include "autoware/tensorrt_rtmdet/tensorrt_rtmdet.hpp"
+#include "autoware/tensorrt_rtmdet/tensorrt_rtmdet_node.hpp"
+
+#include <NvInfer.h>
+#include <gtest/gtest.h>
+#include <opencv2/opencv.hpp>
+#include <rclcpp/rclcpp.hpp>
+
+#include <dlfcn.h>
+#include <cstdint>
+
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+template <typename T>
+T get_pixel_value(const cv::Mat & image, int x, int y)
+{
+  if (x < 0 || x >= image.cols || y < 0 || y >= image.rows) {
+    throw std::out_of_range("Coordinates are out of image bounds");
+  }
+
+  if (image.channels() == 1) {
+    return image.at<T>(y, x);
+  }
+  return image.at<T>(y, x);
+}
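An illustrative use of the helper above on the MONO8 instance mask returned by `do_inference` (not part of the test file; a zero value means background):

```cpp
#include <opencv2/core.hpp>

// Query the instance mask with the bounds-checked helper; the caller picks
// the template type to match the image depth (unsigned char for MONO8).
bool is_foreground(const cv::Mat & mono8_mask, int x, int y)
{
  return get_pixel_value<unsigned char>(mono8_mask, x, y) != 0;
}
```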
+
+class TrtRTMDetTest : public ::testing::Test
+{
+protected:
+  void SetUp() override
+  {
+    const auto home_path = std::string(getenv("HOME"));
+    const auto autoware_data_path = home_path + "/autoware_data";
+
+    const auto rtmdet_share_dir =
+      ament_index_cpp::get_package_share_directory("autoware_tensorrt_rtmdet");
+    const auto plugin_dir = ament_index_cpp::get_package_prefix("trt_batched_nms");
+
+    const auto test_image_path = rtmdet_share_dir + "/data/test_image.jpg";
+    const auto test_color_map_path = rtmdet_share_dir + "/data/test_color_map.csv";
+    const auto plugin_path = plugin_dir + "/lib/libtrt_batched_nms_plugin.so";
+    const auto model_path = autoware_data_path + "/tensorrt_rtmdet/rtmdet_x.onnx";
+
+    const double norm_factor = 1.0;
+    const std::string cache_dir;
+    const autoware::tensorrt_common::BatchConfig batch_config{1, 1, 1};
+    const size_t max_workspace_size = (1u << 30u);
+    const std::vector<std::string> plugin_paths{plugin_path};
+
+    test_image_ = cv::imread(test_image_path, cv::IMREAD_COLOR);
+
+    autoware::tensorrt_common::BuildConfig build_config("Entropy", -1, false, false, false, 6.0);
+
+    // The test color map only includes 3 classes, which is enough for the test.
+    color_map_ =
+      autoware::tensorrt_rtmdet::TrtRTMDetNode::read_color_map_file(test_color_map_path);
+
+    trt_rtmdet_ = std::make_unique<autoware::tensorrt_rtmdet::TrtRTMDet>(
+      model_path, precision_, color_map_, score_threshold_, nms_threshold_, mask_threshold_,
+      build_config, "", norm_factor, mean_, std_, cache_dir, batch_config, max_workspace_size,
+      plugin_paths);
+  }
+
+public:
+  /**
+   * @brief Wrapper function to test the autoware::tensorrt_rtmdet::TrtRTMDet::preprocess_gpu
+   * function.
+   *
+   * @param input_images The input images.
+   */
+  void preprocess_gpu(const std::vector<cv::Mat> & input_images) const
+  {
+    trt_rtmdet_->preprocess_gpu(input_images);
+  }
+
+  /**
+   * @brief Wrapper function to get the image buffer on the host.
+   *
+   * @return The image buffer on the host.
+   */
+  [[nodiscard]] const autoware::cuda_utils::CudaUniquePtrHost<unsigned char[]> &
+  get_image_buffer_h() const
+  {
+    return trt_rtmdet_->image_buf_h_;
+  }
+
+  /**
+   * @brief Wrapper function to get the image buffer on the device.
+   *
+   * @return The image buffer on the device.
+   */
+  [[nodiscard]] const autoware::cuda_utils::CudaUniquePtr<unsigned char[]> & get_image_buffer_d()
+    const
+  {
+    return trt_rtmdet_->image_buf_d_;
+  }
+
+  /**
+   * @brief Wrapper function to get the input dimensions.
+   *
+   * @return The input dimensions.
+   */
+  [[nodiscard]] nvinfer1::Dims get_input_dimensions() const
+  {
+    return trt_rtmdet_->trt_common_->getBindingDimensions(0);
+  }
+
+  std::unique_ptr<autoware::tensorrt_rtmdet::TrtRTMDet> trt_rtmdet_;
+
+  // Common parameters to initialize the RTMDet model
+  const std::vector<float> mean_{103.53, 116.28, 123.675};
+  const std::vector<float> std_{57.375, 57.12, 58.395};
+  const float score_threshold_{0.3};
+  const float nms_threshold_{0.3};
+  const float mask_threshold_{0.7};
+  const std::string precision_{"fp16"};
+
+  cv::Mat test_image_;
+
+  autoware::tensorrt_rtmdet::ColorMap color_map_;
+};
+
+TEST_F(TrtRTMDetTest, TestPreprocessGPU)
+{
+  const auto input_image = test_image_.clone();
+  const auto input_images = std::vector<cv::Mat>{input_image};
+
+  ASSERT_NO_THROW(preprocess_gpu(input_images));
+
+  // Validate CUDA memory setup by checking pointers.
+  EXPECT_NE(get_image_buffer_h(), nullptr);
+  EXPECT_NE(get_image_buffer_d(), nullptr);
+}
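The pixel-exact bounding-box assertions in `TestFeedForward` below are tied to one specific test image, model, and engine; fp16 engines built on different GPUs can shift boxes by a pixel or two. If the test proves flaky, a tolerance-based variant is a common alternative. A sketch of such a check, meant to stand in for the exact-match lines inside the test body (the tolerance value is an assumption, not from the source):

```cpp
// Tolerance-based variant of the bounding-box checks; the expected values
// mirror the exact-match assertions in TestFeedForward.
constexpr double kPixelTolerance = 2.0;
EXPECT_NEAR(object_arrays.at(0).at(0).x1, 138, kPixelTolerance);
EXPECT_NEAR(object_arrays.at(0).at(0).y1, 201, kPixelTolerance);
```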
+
+TEST_F(TrtRTMDetTest, TestFeedForward)
+{
+  const auto input_image = test_image_.clone();
+  const auto input_images = std::vector<cv::Mat>{input_image};
+  preprocess_gpu(input_images);
+
+  autoware::tensorrt_rtmdet::ObjectArrays object_arrays;
+  cv::Mat mask;
+  std::vector<uint8_t> class_ids;
+
+  const auto success = trt_rtmdet_->do_inference(input_images, object_arrays, mask, class_ids);
+  const auto object_size = object_arrays[0].size();
+
+  // Check if the inference was successful
+  EXPECT_TRUE(success);
+
+  // Check if the object count is correct
+  EXPECT_EQ(object_size, 1);
+
+  // Check if the output object's class id is correct (the class id is derived from the COCO
+  // dataset.)
+  EXPECT_EQ(object_arrays.at(0).at(0).class_id, 2);
+
+  // Check if the output object's position is correct
+  EXPECT_EQ(object_arrays.at(0).at(0).x1, 138);
+  EXPECT_EQ(object_arrays.at(0).at(0).x2, 1026);
+  EXPECT_EQ(object_arrays.at(0).at(0).y1, 201);
+  EXPECT_EQ(object_arrays.at(0).at(0).y2, 509);
+
+  // Check if the output label id is correct (label_id is the id used to process the
+  // outputs in Autoware.)
+  EXPECT_EQ(class_ids.at(0), 1);
+
+  // Check that a mask pixel inside the object is not zero (zero represents the
+  // background). There is a car in the test image, so the mask pixel value should not be zero.
+  const auto mask_pixel_value = get_pixel_value<unsigned char>(mask, 320, 413);
+  EXPECT_NE(mask_pixel_value, 0);
+}
+
+TEST_F(TrtRTMDetTest, TestIntersectionOverUnion)
+{
+  const autoware::tensorrt_rtmdet::Object obj1 = {10, 10, 50, 50, 1, 0, 0.8};
+  const autoware::tensorrt_rtmdet::Object obj2 = {20, 20, 60, 60, 1, 0, 0.7};
+
+  float iou = autoware::tensorrt_rtmdet::TrtRTMDet::intersection_over_union(obj1, obj2);
+  EXPECT_NEAR(iou, 0.4, 0.01);
+}
+
+TEST_F(TrtRTMDetTest, TestNmsSortedBboxes)
+{
+  const autoware::tensorrt_rtmdet::Object obj1 = {10, 10, 50, 50, 0, 0, 0.8};
+  const autoware::tensorrt_rtmdet::Object obj2 = {10, 10, 50, 50, 1, 0, 0.3};
+  const autoware::tensorrt_rtmdet::Object obj3 = {12, 12, 52, 52, 1, 0, 0.6};
+  const autoware::tensorrt_rtmdet::Object obj4 = {8, 8, 48, 48, 1, 0, 0.5};
+
+  std::vector<autoware::tensorrt_rtmdet::Object> input_objects = {obj1, obj2, obj3, obj4};
+  std::vector<autoware::tensorrt_rtmdet::Object> output_objects;
+  const float nms_threshold = 0.5;
+
+  autoware::tensorrt_rtmdet::TrtRTMDet::nms_sorted_bboxes(
+    input_objects, output_objects, nms_threshold);
+
+  // Check if the number of objects is correct after NMS
+  EXPECT_EQ(output_objects.size(), 1);
+
+  // After NMS, only `obj1` should be kept.
+  EXPECT_EQ(output_objects.at(0).x1, 10);
+  EXPECT_EQ(output_objects.at(0).x2, 50);
+  EXPECT_EQ(output_objects.at(0).y1, 10);
+  EXPECT_EQ(output_objects.at(0).y2, 50);
+  EXPECT_EQ(output_objects.at(0).class_id, 0);
+  EXPECT_EQ(output_objects.at(0).score, static_cast<float>(0.8));
+}
+
+TEST_F(TrtRTMDetTest, TestPluginLoad)
+{
+  const auto plugin_dir = ament_index_cpp::get_package_prefix("trt_batched_nms");
+  const auto plugin_path = plugin_dir + "/lib/libtrt_batched_nms_plugin.so";
+
+  int32_t flags{RTLD_LAZY};
+  void * handle = dlopen(plugin_path.c_str(), flags);
+
+  EXPECT_NE(handle, nullptr);
+  if (handle != nullptr) {
+    dlclose(handle);  // release the handle so the test does not leak it
+  }
+}
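To see why `TestNmsSortedBboxes` keeps exactly one box: obj2 duplicates obj1 (IoU = 1.0); for obj3, intersection = (50 − 12 + 1)² = 39² = 1521 and union = 2 · 41² − 1521 = 1841, so IoU ≈ 0.83; obj4 gives the same value by symmetry. All three exceed the 0.5 threshold and are suppressed by the first box. Note also that `nms_sorted_bboxes` assumes score-sorted input, and this list is not strictly sorted (0.8, 0.3, 0.6, 0.5); the result is unaffected here only because obj1 suppresses everything else.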