diff --git a/build_depends.repos b/build_depends.repos index 4db947b7c26a8..2c19de267f95e 100644 --- a/build_depends.repos +++ b/build_depends.repos @@ -58,3 +58,7 @@ repositories: type: git url: https://github.com/autowarefoundation/ament_cmake.git version: feat/faster_ament_libraries_deduplicate + universe/external/trt_batched_nms: + type: git + url: https://github.com/autowarefoundation/trt_batched_nms.git + version: main diff --git a/perception/autoware_tensorrt_rtmdet/CMakeLists.txt b/perception/autoware_tensorrt_rtmdet/CMakeLists.txt new file mode 100644 index 0000000000000..300b620158d74 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/CMakeLists.txt @@ -0,0 +1,102 @@ +cmake_minimum_required(VERSION 3.8) +project(autoware_tensorrt_rtmdet) + +find_package(autoware_tensorrt_common) +if(NOT ${autoware_tensorrt_common_FOUND}) + message(WARNING "The autoware_tensorrt_common package is not found. Please check its dependencies.") + return() +endif() + +find_package(autoware_cmake REQUIRED) +autoware_package() + +find_package(OpenCV REQUIRED) + +include(CheckLanguage) +check_language(CUDA) +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) +else() + message(WARNING "CUDA is not found. preprocess acceleration using CUDA will not be available.") +endif() + +find_package(OpenMP) +if(OpenMP_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +endif() + +# tensorrt_rtmdet +ament_auto_add_library(${PROJECT_NAME} SHARED + src/tensorrt_rtmdet.cpp +) + +ament_target_dependencies(${PROJECT_NAME} + OpenCV +) + +if(CMAKE_CUDA_COMPILER) + # preprocess + cuda_add_library(${PROJECT_NAME}_gpu_preprocess + SHARED + src/preprocess.cu + ) + + target_include_directories(${PROJECT_NAME}_gpu_preprocess PUBLIC + $ + $ + ) + + target_link_libraries(${PROJECT_NAME} + ${autoware_tensorrt_common_LIBRARIES} + ${PROJECT_NAME}_gpu_preprocess + ) + +else() + target_link_libraries(${PROJECT_NAME} + ${autoware_tensorrt_common_LIBRARIES} + ) +endif() + +target_compile_definitions(${PROJECT_NAME} PRIVATE + TENSORRT_VERSION_MAJOR=${TENSORRT_VERSION_MAJOR} +) + +# tensorrt_rtmdet_node +ament_auto_add_library(${PROJECT_NAME}_node SHARED + src/tensorrt_rtmdet_node.cpp +) + +ament_target_dependencies(${PROJECT_NAME}_node + OpenCV +) + +target_link_libraries(${PROJECT_NAME}_node + ${PROJECT_NAME} +) + +target_compile_definitions(${PROJECT_NAME}_node PRIVATE + TENSORRT_VERSION_MAJOR=${TENSORRT_VERSION_MAJOR} +) + +rclcpp_components_register_node(${PROJECT_NAME}_node + PLUGIN "autoware::tensorrt_rtmdet::TrtRTMDetNode" + EXECUTABLE ${PROJECT_NAME}_node_exe +) + +if(BUILD_TESTING) + find_package(ament_cmake_ros REQUIRED) + find_package(ament_lint_auto REQUIRED) + ament_lint_auto_find_test_dependencies() + + ament_add_ros_isolated_gtest(test_rtmdet test/test_rtmdet.cpp) + set_tests_properties(test_rtmdet PROPERTIES TIMEOUT 60) # It could take a long time on the first run to create the engine + target_link_libraries(test_rtmdet ${PROJECT_NAME} ${PROJECT_NAME}_node) + target_include_directories(test_rtmdet PRIVATE include) +endif() + +ament_auto_package(INSTALL_TO_SHARE + launch + config + test/data +) diff --git a/perception/autoware_tensorrt_rtmdet/README.md b/perception/autoware_tensorrt_rtmdet/README.md new file mode 100644 index 0000000000000..0912fd72e3466 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/README.md @@ -0,0 +1,70 @@ +# autoware_tensorrt_rtmdet + +## Purpose + +RTMDet is a real-time instance segmentation model which can be used 
for detecting objects like cars, pedestrians,
+bicycles, etc. in a scene. This package provides a ROS 2 interface for RTMDet using TensorRT.
+
+## Inner-workings / Algorithms
+
+## Inputs / Outputs
+
+### Input
+
+| Name       | Type                | Description     |
+| ---------- | ------------------- | --------------- |
+| `in/image` | `sensor_msgs/Image` | The input image |
+
+### Output
+
+| Name              | Type                                                 | Description                                                              |
+| ----------------- | ---------------------------------------------------- | ------------------------------------------------------------------------ |
+| `out/objects`     | `tier4_perception_msgs/DetectedObjectsWithFeature`   | The detected objects with 2D bounding boxes and scores                   |
+| `out/mask`        | `autoware_internal_perception_msgs/SegmentationMask` | The instance segmentation mask                                           |
+| `out/color_mask`  | `sensor_msgs/Image`                                  | The colorized image of the instance segmentation mask, for visualization |
+| `out/debug_image` | `sensor_msgs/Image`                                  | The image with 2D bounding boxes, for visualization                      |
+
+## Parameters
+
+{{ json_to_markdown("perception/autoware_tensorrt_rtmdet/schema/rtmdet.schema.json") }}
+
+## Assumptions / Known limits
+
+## Onnx model
+
+A sample model is placed in the `autoware_data` folder by the Ansible script during the environment preparation stage.
+If you cannot find the model, follow the instructions at the
+[link](https://github.com/autowarefoundation/autoware/tree/main/ansible/roles/artifacts) to download it.
+
+The shared model was trained by open-mmlab using the COCO dataset. For more details,
+see the [link](https://github.com/open-mmlab/mmdetection/tree/3.x/configs/rtmdet#instance-segmentation).
+
+### Package acceptable model generation
+
+Users can generate their own model using the official RTMDet repository. Please refer to the
+[official repository](https://github.com/open-mmlab/mmdetection/tree/3.x/configs/rtmdet#rtmdet-an-empirical-study-of-designing-real-time-object-detectors).
+
+## Label file
+
+A sample label file (named `label.txt`) and an instance segmentation color map file (named `color_map.csv`) are also
+downloaded automatically during the environment preparation process.
+
+These files are used to map the class index to the class name and color, respectively.
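+
+As an illustration, the sketch below shows one way a color map file in this format could be parsed. It is not the
+package's actual implementation (that lives in `TrtRTMDetNode::read_color_map_file`); the `LabelColor` struct below
+only mirrors the shape of the types declared in `tensorrt_rtmdet.hpp`, and the sketch assumes the first CSV row is
+the `unique_id, class_name, r, g, b, label_id` header shown above.
+
+```cpp
+// Minimal, illustrative CSV color-map parser (assumptions noted above).
+#include <cstdint>
+#include <fstream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+struct LabelColor
+{
+  std::string class_name;
+  std::uint8_t r, g, b;  // the package itself stores the color as a cv::Vec3b
+  std::uint8_t label_id;
+};
+
+using ColorMap = std::map<int, LabelColor>;
+
+inline ColorMap parse_color_map_csv(const std::string & path)
+{
+  ColorMap color_map;
+  std::ifstream file(path);
+  std::string line;
+  std::getline(file, line);  // assumption: skip the header row
+  while (std::getline(file, line)) {
+    std::stringstream ss(line);
+    std::string field;
+    std::vector<std::string> fields;
+    while (std::getline(ss, field, ',')) {
+      fields.push_back(field);  // note: surrounding whitespace is not trimmed here
+    }
+    if (fields.size() != 6) {
+      continue;  // skip malformed rows
+    }
+    LabelColor entry;
+    entry.class_name = fields[1];
+    entry.r = static_cast<std::uint8_t>(std::stoi(fields[2]));
+    entry.g = static_cast<std::uint8_t>(std::stoi(fields[3]));
+    entry.b = static_cast<std::uint8_t>(std::stoi(fields[4]));
+    entry.label_id = static_cast<std::uint8_t>(std::stoi(fields[5]));
+    color_map[std::stoi(fields[0])] = entry;
+  }
+  return color_map;
+}
+```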
+ +## Reference repositories + +- + +## Citation + +```bibtex +@misc{lyu2022rtmdetempiricalstudydesigning, +title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, +author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, +year={2022}, +eprint={2212.07784}, +archivePrefix={arXiv}, +primaryClass={cs.CV}, +url={https://arxiv.org/abs/2212.07784}, +} +``` diff --git a/perception/autoware_tensorrt_rtmdet/config/rtmdet.param.yaml b/perception/autoware_tensorrt_rtmdet/config/rtmdet.param.yaml new file mode 100644 index 0000000000000..d766e53dab427 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/config/rtmdet.param.yaml @@ -0,0 +1,23 @@ +/**: + ros__parameters: + is_publish_debug_image: false + + is_publish_color_mask: false + + # cspell: ignore libtrt + model_path: "$(var data_path)/tensorrt_rtmdet/$(var model_name).onnx" + color_map_path: "$(var data_path)/tensorrt_rtmdet/color_map.csv" + plugin_paths: [ "$(find-pkg-prefix trt_batched_nms)/lib/libtrt_batched_nms_plugin.so" ] + input_image_mean: [ 103.53, 116.28, 123.675 ] + input_image_std_dev: [ 57.375, 57.12, 58.395 ] + score_threshold: 0.3 + nms_threshold: 0.3 + mask_threshold: 0.7 + precision: "fp16" + dla_core_id: -1 + quantize_first_layer: false + quantize_last_layer: false + profile_per_layer: false + clip_value: 6.0 + apply_erosion: false + calibration_image_list_path: "" diff --git a/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/calibrator.hpp b/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/calibrator.hpp new file mode 100644 index 0000000000000..1481b8099ccd4 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/calibrator.hpp @@ -0,0 +1,257 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// cspell: ignore RTMDET +#ifndef AUTOWARE__TENSORRT_RTMDET__CALIBRATOR_HPP_ +#define AUTOWARE__TENSORRT_RTMDET__CALIBRATOR_HPP_ +#include "autoware/cuda_utils/cuda_check_error.hpp" + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace autoware::tensorrt_rtmdet +{ +class ImageStream +{ +public: + ImageStream( + uint32_t batch_size, nvinfer1::Dims input_dims, + const std::vector & calibration_images) + : batch_size_(batch_size), + calibration_images_(calibration_images), + max_batches_(calibration_images.size() / batch_size_), + input_dims_(input_dims) + { + batch_.resize(batch_size_ * input_dims_.d[1] * input_dims_.d[2] * input_dims_.d[3]); + } + + [[nodiscard]] uint32_t get_batch_size() const { return batch_size_; } + + [[nodiscard]] uint32_t get_max_batches() const { return max_batches_; } + + float * get_batch() { return &batch_.at(0); } + + nvinfer1::Dims get_input_dims() { return input_dims_; } + + /** + * @brief Preprocess in calibration + * @param[in] images input images + * @param[in] input_dims input dimensions + * @param[in] mean mean value for each channel + * @param[in] std std value for each channel + * @return vector preprocessed data + */ + static std::vector preprocess( + const std::vector & images, nvinfer1::Dims input_dims, const std::vector & mean, + const std::vector & std) + { + std::vector input_h; + auto batch_size = static_cast(images.size()); + input_dims.d[0] = static_cast(batch_size); + const auto input_chan = static_cast(input_dims.d[1]); + const auto input_height = static_cast(input_dims.d[2]); + const auto input_width = static_cast(input_dims.d[3]); + std::vector dst_images; + uint32_t volume = batch_size * input_chan * input_height * input_width; + input_h.resize(volume); + for (const auto & image : images) { + cv::Mat dst_image; + cv::resize(image, dst_image, cv::Size(640, 640)); + dst_image.convertTo(dst_image, CV_32F); + dst_image -= cv::Scalar(mean.at(0), mean.at(1), mean.at(2)); + dst_image /= cv::Scalar(std.at(0), std.at(1), std.at(2)); + dst_images.emplace_back(dst_image); + } + + const auto chw_images = + cv::dnn::blobFromImages(dst_images, 1.0, cv::Size(), cv::Scalar(), false, false, CV_32F); + const auto data_length = chw_images.total(); + input_h.reserve(data_length); + const auto flat = chw_images.reshape(1, static_cast(data_length)); + input_h = chw_images.isContinuous() ? 
flat : flat.clone();
+    return input_h;
+  }
+
+  /**
+   * @brief Load and preprocess the next batch of calibration images
+   * @param[in] mean mean value for each channel
+   * @param[in] std std value for each channel
+   * @return true while calibration batches remain, false once all batches are consumed
+   */
+  bool next(const std::vector<float> & mean, const std::vector<float> & std)
+  {
+    if (current_batch_ == max_batches_) {
+      return false;
+    }
+    for (uint32_t i = 0; i < batch_size_; ++i) {
+      auto image =
+        cv::imread(calibration_images_.at(batch_size_ * current_batch_ + i), cv::IMREAD_COLOR);
+      RCLCPP_INFO(
+        rclcpp::get_logger("autoware_tensorrt_rtmdet_calibrator"), "Preprocess %s",
+        calibration_images_.at(batch_size_ * current_batch_ + i).c_str());
+      auto input = preprocess({image}, input_dims_, mean, std);
+      // Overwrite this image's slot in the preallocated batch buffer in place;
+      // std::vector::insert would grow the buffer on every call instead of reusing it.
+      std::copy(
+        input.begin(), input.end(),
+        batch_.begin() + i * input_dims_.d[1] * input_dims_.d[2] * input_dims_.d[3]);
+    }
+
+    ++current_batch_;
+    return true;
+  }
+
+  /**
+   * @brief Reset calibration
+   */
+  void reset() { current_batch_ = 0; }
+
+private:
+  uint32_t batch_size_;
+  std::vector<std::string> calibration_images_;
+  uint32_t current_batch_{0};
+  uint32_t max_batches_;
+  nvinfer1::Dims input_dims_;
+  std::vector<float> batch_;
+};
+
+/**
+ * @class Int8EntropyCalibrator
+ * @brief Calibrator based on TensorRT's entropy calibration (IInt8EntropyCalibrator2)
+ *
+ */
+class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator2
+{
+public:
+  Int8EntropyCalibrator(
+    ImageStream & stream, std::string calibration_cache_file,
+    const std::vector<float> & mean = {103.53, 116.28, 123.675},
+    const std::vector<float> & std = {57.375, 57.12, 58.395}, bool read_cache = true)
+  : stream_(stream),
+    calibration_cache_file_(std::move(calibration_cache_file)),
+    read_cache_(read_cache)
+  {
+    auto d = stream_.get_input_dims();
+    input_count_ = stream_.get_batch_size() * d.d[1] * d.d[2] * d.d[3];
+    m_std_ = std;
+    m_mean_ = mean;
+
+    CHECK_CUDA_ERROR(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
+    auto alg_type = getAlgorithm();
+    switch (alg_type) {
+      case (nvinfer1::CalibrationAlgoType::kLEGACY_CALIBRATION):
+        RCLCPP_INFO(
+          rclcpp::get_logger("autoware_tensorrt_rtmdet_calibrator"),
+          "CalibrationAlgoType : kLEGACY_CALIBRATION");
+        break;
+      case (nvinfer1::CalibrationAlgoType::kENTROPY_CALIBRATION):
+        RCLCPP_INFO(
+          rclcpp::get_logger("autoware_tensorrt_rtmdet_calibrator"),
+          "CalibrationAlgoType : kENTROPY_CALIBRATION");
+        break;
+      case (nvinfer1::CalibrationAlgoType::kENTROPY_CALIBRATION_2):
+        RCLCPP_INFO(
+          rclcpp::get_logger("autoware_tensorrt_rtmdet_calibrator"),
+          "CalibrationAlgoType : kENTROPY_CALIBRATION_2");
+        break;
+      case (nvinfer1::CalibrationAlgoType::kMINMAX_CALIBRATION):
+        RCLCPP_INFO(
+          rclcpp::get_logger("autoware_tensorrt_rtmdet_calibrator"),
+          "CalibrationAlgoType : kMINMAX_CALIBRATION");
+        break;
+      default:
+        RCLCPP_INFO(
+          rclcpp::get_logger("autoware_tensorrt_rtmdet_calibrator"), "No CalibrationAlgoType");
+        break;
+    }
+  }
+
+  [[nodiscard]] int getBatchSize() const noexcept override
+  {
+    return static_cast<int>(stream_.get_batch_size());
+  }
+
+  ~Int8EntropyCalibrator() noexcept override { CHECK_CUDA_ERROR(cudaFree(device_input_)); }
+
+  bool getBatch(void * bindings[], const char * names[], int nb_bindings) noexcept override
+  {
+    (void)names;
+    (void)nb_bindings;
+
+    if (!stream_.next(m_mean_, m_std_)) {
+      return false;
+    }
+    try {
+      CHECK_CUDA_ERROR(cudaMemcpy(
+        device_input_, stream_.get_batch(), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
+    } catch (const std::exception & e) {
+      // Do nothing
+    }
+    bindings[0] = device_input_;
+    return true;
+  }
+
+  const void * readCalibrationCache(size_t & length) noexcept override
+  {
+    calib_cache_.clear();
+
std::ifstream input(calibration_cache_file_, std::ios::binary); + input >> std::noskipws; + if (read_cache_ && input.good()) { + std::copy( + std::istream_iterator(input), std::istream_iterator(), + std::back_inserter(calib_cache_)); + } + + length = calib_cache_.size(); + if (length) { + RCLCPP_INFO( + rclcpp::get_logger("autoware_tensorrt_rtmdet_calibrator"), + "Using cached calibration table to build the engine"); + } else { + RCLCPP_INFO( + rclcpp::get_logger("autoware_tensorrt_rtmdet_calibrator"), + "New calibration table will be created to build the engine"); + } + return length ? &calib_cache_.at(0) : nullptr; + } + + void writeCalibrationCache(const void * cache, size_t length) noexcept override + { + std::ofstream output(calibration_cache_file_, std::ios::binary); + output.write(static_cast(cache), length); + } + +private: + ImageStream stream_; + const std::string calibration_cache_file_; + bool read_cache_{true}; + size_t input_count_; + void * device_input_{nullptr}; + std::vector calib_cache_; + std::vector hist_cache_; + // mean for preprocessing + std::vector m_mean_; + // std for preprocessing + std::vector m_std_; +}; +} // namespace autoware::tensorrt_rtmdet + +#endif // AUTOWARE__TENSORRT_RTMDET__CALIBRATOR_HPP_ diff --git a/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/preprocess.hpp b/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/preprocess.hpp new file mode 100644 index 0000000000000..bf633e07c5833 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/preprocess.hpp @@ -0,0 +1,152 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+// cspell: ignore RTMDET
+#ifndef AUTOWARE__TENSORRT_RTMDET__PREPROCESS_HPP_
+#define AUTOWARE__TENSORRT_RTMDET__PREPROCESS_HPP_
+
+#include
+#include
+#include
+#include
+#include
+
+namespace autoware::tensorrt_rtmdet
+{
+struct Roi
+{
+  int x;
+  int y;
+  int w;
+  int h;
+};
+
+/**
+ * @brief Resize an image using bilinear interpolation on the GPU
+ * @param[out] dst resized image
+ * @param[in] src source image
+ * @param[in] d_w width of the resized image
+ * @param[in] d_h height of the resized image
+ * @param[in] d_c channel count of the resized image
+ * @param[in] s_w width of the input image
+ * @param[in] s_h height of the input image
+ * @param[in] stream cuda stream
+ */
+extern void resize_bilinear_gpu(
+  unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h,
+  cudaStream_t stream);
+
+/**
+ * @brief Letterbox an image on the GPU
+ * @param[out] dst letterboxed image
+ * @param[in] src source image
+ * @param[in] d_w width of the letterboxed image
+ * @param[in] d_h height of the letterboxed image
+ * @param[in] s_w width of the input image
+ * @param[in] s_h height of the input image
+ * @param[in] stream cuda stream
+ */
+extern void letterbox_gpu(
+  unsigned char * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h,
+  cudaStream_t stream);
+
+/**
+ * @brief NCHW to NHWC conversion
+ * @param[out] dst converted image
+ * @param[in] src source image
+ * @param[in] d_w width of the image
+ * @param[in] d_h height of the image
+ * @param[in] stream cuda stream
+ */
+extern void nchw_to_nhwc_gpu(
+  unsigned char * dst, unsigned char * src, int d_w, int d_h, cudaStream_t stream);
+
+/**
+ * @brief Unsigned char to float32 conversion for inference
+ * @param[out] dst32 converted image
+ * @param[in] src source image
+ * @param[in] d_w width of the image
+ * @param[in] d_h height of the image
+ * @param[in] d_c channel count of the image
+ * @param[in] stream cuda stream
+ */
+extern void to_float_gpu(
+  float * dst32, unsigned char * src, int d_w, int d_h, int d_c, cudaStream_t stream);
+
+/**
+ * @brief Resize and letterbox an image using bilinear interpolation on the GPU
+ * @param[out] dst processed image
+ * @param[in] src source image
+ * @param[in] d_w width of the output
+ * @param[in] d_h height of the output
+ * @param[in] s_w width of the input
+ * @param[in] s_h height of the input
+ * @param[in] stream cuda stream
+ */
+extern void resize_bilinear_letterbox_gpu(
+  unsigned char * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h,
+  cudaStream_t stream);
+
+/**
+ * @brief Optimized preprocessing including resize, letterbox, NHWC-to-NCHW conversion, toFloat
+ * and normalization for RTMDet on the GPU
+ * @param[out] dst processed image
+ * @param[in] src source image
+ * @param[in] d_w width of the output
+ * @param[in] d_h height of the output
+ * @param[in] s_w width of the input
+ * @param[in] s_h height of the input
+ * @param[in] norm normalization
+ * @param[in] stream cuda stream
+ */
+extern void resize_bilinear_letterbox_nhwc_to_nchw32_gpu(
+  float * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h, float norm,
+  cudaStream_t stream);
+
+/**
+ * @brief Optimized preprocessing including resize, letterbox, NHWC-to-NCHW conversion, toFloat
+ * and normalization with batching for RTMDet on the GPU
+ * @param[out] dst processed image
+ * @param[in] src source image
+ * @param[in] d_w width of the output
+ * @param[in] d_h height of the output
+ * @param[in] s_w width of the input
+ * @param[in] s_h height of the input
+ * @param[in] batch batch size
+ * @param[in] mean mean value for normalization
+ * @param[in] std standard deviation for normalization
+ * @param[in] stream cuda stream
+ 
*/ +extern void resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( + float * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h, int batch, + const float * mean, const float * std, cudaStream_t stream); + +/** + * @brief Argmax on GPU + * @param[out] dst processed image + * @param[in] src probability map + * @param[in] d_w width for output + * @param[in] d_h height for output + * @param[in] s_w width for input + * @param[in] s_h height for input + * @param[in] s_c channel for input + * @param[in] batch batch size + * @param[in] stream cuda stream + */ +extern void argmax_gpu( + unsigned char * dst, float * src, int d_w, int d_h, int s_w, int s_h, int s_c, int batch, + cudaStream_t stream); +} // namespace autoware::tensorrt_rtmdet + +#endif // AUTOWARE__TENSORRT_RTMDET__PREPROCESS_HPP_ diff --git a/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/tensorrt_rtmdet.hpp b/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/tensorrt_rtmdet.hpp new file mode 100644 index 0000000000000..9bfb7ca77510b --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/tensorrt_rtmdet.hpp @@ -0,0 +1,223 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// cspell: ignore RTMDET +#ifndef AUTOWARE__TENSORRT_RTMDET__TENSORRT_RTMDET_HPP_ +#define AUTOWARE__TENSORRT_RTMDET__TENSORRT_RTMDET_HPP_ + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +class TrtRTMDetTest; + +namespace autoware::tensorrt_rtmdet +{ +using autoware::cuda_utils::CudaUniquePtr; +using autoware::cuda_utils::CudaUniquePtrHost; +using autoware::cuda_utils::makeCudaStream; +using autoware::cuda_utils::StreamUniquePtr; + +/** + * @brief Represents a detected object. + * + * This struct stores bounding box coordinates, class id, mask index, score of detected object. + */ +struct Object +{ + uint32_t x1; + uint32_t y1; + uint32_t x2; + uint32_t y2; + uint32_t class_id; + uint32_t mask_index; + float score; +}; + +using ObjectArray = std::vector; +using ObjectArrays = std::vector; + +/** + * @brief Represents a color for a class. + * + * This struct stores class name, color and label id for a label. + * Read from a color map csv file. 
+ */
+struct LabelColor
+{
+  std::string class_name;
+  cv::Vec3b color;
+  uint8_t label_id;
+};
+
+// key: unique_id from the color map csv
+using ColorMap = std::map<int, LabelColor>;
+
+class TrtRTMDet
+{
+public:
+  TrtRTMDet(
+    const std::string & model_path, const std::string & precision, const ColorMap & color_map,
+    const float score_threshold = 0.3, const float nms_threshold = 0.7,
+    const float mask_threshold = 200.0,
+    const autoware::tensorrt_common::BuildConfig & build_config =
+      autoware::tensorrt_common::BuildConfig(),
+    const std::string & calibration_image_list_file_path = std::string(),
+    const double norm_factor = 1.0, const std::vector<float> & mean = {103.53, 116.28, 123.675},
+    const std::vector<float> & std = {57.375, 57.12, 58.395},
+    [[maybe_unused]] const std::string & cache_dir = "",
+    const autoware::tensorrt_common::BatchConfig & batch_config = {1, 1, 1},
+    const size_t max_workspace_size = (1u << 30u),
+    const std::vector<std::string> & plugin_paths = {});
+
+  ~TrtRTMDet() noexcept;
+
+  /**
+   * @brief Perform inference.
+   *
+   * This function calls the preprocess and feedforward functions to perform inference.
+   *
+   * @param[in] images a vector of input images.
+   * @param[out] objects a vector of detected objects.
+   * @param[out] mask a segmentation mask.
+   * @param[out] class_ids a vector of class ids.
+   * @return true if inference is successful.
+   */
+  bool do_inference(
+    const std::vector<cv::Mat> & images, ObjectArrays & objects, cv::Mat & mask,
+    std::vector<uint8_t> & class_ids);
+
+  /**
+   * @brief Calculate intersection over union.
+   *
+   * This function calculates the intersection over union (IoU) between two bounding boxes.
+   * Each input object contains a bounding box and a class id.
+   *
+   * @param[in] a First object.
+   * @param[in] b Second object.
+   * @return IoU value.
+   */
+  [[nodiscard]] static float intersection_over_union(const Object & a, const Object & b);
+
+  /**
+   * @brief Perform non-maximum suppression.
+   *
+   * This function performs non-maximum suppression (NMS) on the detected objects.
+   * The detected objects are sorted by score and NMS is applied to remove overlapping boxes.
+   *
+   * Since the model architecture has its own NMS layer, this function was added to handle
+   * overlapping boxes that belong to different classes.
+   *
+   * @param[in] input_objects a vector of detected objects.
+   * @param[out] output_objects a vector of detected objects after NMS.
+   * @param[in] nms_threshold IoU threshold used to suppress overlapping boxes.
+   */
+  static void nms_sorted_bboxes(
+    const ObjectArray & input_objects, ObjectArray & output_objects, const float & nms_threshold);
+
+  void print_profiling();
+
+private:
+  /**
+   * @brief Preprocess input images on GPU.
+   *
+   * This function takes a vector of input images and preprocesses them on the GPU.
+   * The images are resized to the input size of the model, normalized and stored in a buffer.
+   *
+   * @param[in] images a vector of input images.
+   */
+  void preprocess_gpu(const std::vector<cv::Mat> & images);
+
+  /**
+   * @brief Inference with TensorRT.
+   *
+   * This function performs inference with TensorRT.
+   * The input images are fed to the model and the output is stored in the output buffers.
+   * The output buffers are then post-processed to get the detected objects and segmentation mask.
+   *
+   * @param[in] images a vector of input images.
+   * @param[out] objects a vector of detected objects.
+   * @param[out] mask a segmentation mask.
+   * @param[out] class_ids a vector of class ids.
+   * @return true if inference is successful.
+ 
+ */ + bool feedforward( + const std::vector & images, ObjectArrays & objects, cv::Mat & mask, + std::vector & class_ids); + + std::unique_ptr trt_common_; + + // Host and device pointers for inputs + std::vector input_h_; + CudaUniquePtr input_d_; + + // Device pointer for outputs + CudaUniquePtr out_detections_d_; + CudaUniquePtr out_labels_d_; + CudaUniquePtr out_masks_d_; + + // Host pointer for outputs + std::unique_ptr out_detections_h_; + std::unique_ptr out_labels_h_; + std::unique_ptr out_masks_h_; + + StreamUniquePtr stream_{makeCudaStream()}; + + // scale factor for input image + float scale_width_; + float scale_height_; + + // size of input image for model + uint32_t model_input_width_; + uint32_t model_input_height_; + + const float score_threshold_; + const float nms_threshold_; + const float mask_threshold_; + const int batch_size_; + + // maximum number of detections, read from model + uint32_t max_detections_; + + // host buffer for preprocessing on GPU + CudaUniquePtrHost image_buf_h_; + // device buffer for preprocessing on GPU + CudaUniquePtr image_buf_d_; + // normalization factor used for preprocessing + const double norm_factor_; + + // size of input image for preprocessing, check if input image size is changed + int32_t src_width_; + int32_t src_height_; + + // mean and std for preprocessing + const std::vector mean_; + const std::vector std_; + + // Segmentation map, stores information for each class + const ColorMap color_map_; + +public: + friend TrtRTMDetTest; +}; +} // namespace autoware::tensorrt_rtmdet + +#endif // AUTOWARE__TENSORRT_RTMDET__TENSORRT_RTMDET_HPP_ diff --git a/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/tensorrt_rtmdet_node.hpp b/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/tensorrt_rtmdet_node.hpp new file mode 100644 index 0000000000000..01ed65091ddd7 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/include/autoware/tensorrt_rtmdet/tensorrt_rtmdet_node.hpp @@ -0,0 +1,134 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// cspell: ignore RTMDET +#ifndef AUTOWARE__TENSORRT_RTMDET__TENSORRT_RTMDET_NODE_HPP_ +#define AUTOWARE__TENSORRT_RTMDET__TENSORRT_RTMDET_NODE_HPP_ + +#include "autoware/tensorrt_rtmdet/tensorrt_rtmdet.hpp" + +#include +#include +#include +#include + +#include "autoware_internal_perception_msgs/msg/segmentation_mask.hpp" +#include "tier4_perception_msgs/msg/detected_objects_with_feature.hpp" + +#include + +#include +#include +#include + +namespace autoware::tensorrt_rtmdet +{ +class TrtRTMDetNode : public rclcpp::Node +{ +public: + explicit TrtRTMDetNode(const rclcpp::NodeOptions & node_options); + + /** + * @brief Read a color map file and return the color map. + * + * The color map file should be a csv file with the following format: + * ``` + * unique_id, class_name, r, g, b, label_id + * ``` + * where the label_id represents the class ID of the object, used to process the outputs in + * Autoware. 
+ * + * @param[in] color_map_path The path to the color map file. + * @return A color map structure with the color information. + */ + static ColorMap read_color_map_file(const std::string & color_map_path); + + /** + * @brief Colorize the output mask. + * + * Take the output mask which includes the class ID of the objects and convert it to a colorized + * mask. + * + * @param[in] color_map The color map structure. + * @param[in] mask The output mask. + * @param[out] color_mask The colorized mask. + */ + static void get_colorized_mask( + const ColorMap & color_map, const cv::Mat & mask, cv::Mat & color_mask); + + /** + * @brief Draw the detected objects on the input image. + * + * @param[out] image + * @param[in] mask + * @param[in] objects + * @param[in] color_map + */ + static void draw_debug_image( + cv::Mat & image, const cv::Mat & mask, const ObjectArrays & objects, + const ColorMap & color_map); + + /** + * @brief Apply erosion to the mask. + * + * With using this function, you can apply erosion to the mask and It overflowing the border. + * + * @param[in,out] mask + * @param[in] erosion_size + */ + static void apply_erosion(cv::Mat & mask, const int erosion_size); + +private: + /** + * @brief Callback function to check are there any subscribers. + */ + void on_connect(); + + /** + * @brief Callback function to run RTMDet model. + * + * @param[in] msg The input image message. + */ + void on_image(const sensor_msgs::msg::Image::ConstSharedPtr msg); + + std::unique_ptr trt_rtmdet_; + + image_transport::Subscriber image_sub_; + rclcpp::TimerBase::SharedPtr timer_; + + rclcpp::Publisher::SharedPtr objects_pub_; + rclcpp::Publisher::SharedPtr mask_pub_; + + image_transport::Publisher color_mask_pub_; + image_transport::Publisher debug_image_pub_; + + std::unique_ptr> stop_watch_ptr_; + std::unique_ptr debug_publisher_; + + const bool is_publish_color_mask_; + const bool is_publish_debug_image_; + + // Color map to store label and color information. + ColorMap color_map_; + + // Mean and std for normalization of input image. + std::vector mean_; + std::vector std_; + + // If true, apply erosion to the output mask. 
+ bool is_apply_erosion_; +}; +} // namespace autoware::tensorrt_rtmdet + +#endif // AUTOWARE__TENSORRT_RTMDET__TENSORRT_RTMDET_NODE_HPP_ diff --git a/perception/autoware_tensorrt_rtmdet/launch/multiple_rtmdet.launch.xml b/perception/autoware_tensorrt_rtmdet/launch/multiple_rtmdet.launch.xml new file mode 100644 index 0000000000000..9f4e1390ce302 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/launch/multiple_rtmdet.launch.xml @@ -0,0 +1,78 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/perception/autoware_tensorrt_rtmdet/launch/rtmdet.launch.xml b/perception/autoware_tensorrt_rtmdet/launch/rtmdet.launch.xml new file mode 100644 index 0000000000000..c1510cd325963 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/launch/rtmdet.launch.xml @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/perception/autoware_tensorrt_rtmdet/package.xml b/perception/autoware_tensorrt_rtmdet/package.xml new file mode 100644 index 0000000000000..885e561d47506 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/package.xml @@ -0,0 +1,41 @@ + + + + autoware_tensorrt_rtmdet + 0.0.1 + TensorRT library implementation for RTMDet + Barış Zeren + + Apache License 2.0 + + ament_cmake_auto + autoware_cmake + cudnn_cmake_module + tensorrt_cmake_module + + cudnn_cmake_module + tensorrt_cmake_module + + autoware_cuda_utils + autoware_internal_perception_msgs + autoware_object_recognition_utils + autoware_perception_msgs + autoware_tensorrt_common + cv_bridge + image_transport + libopencv-dev + rclcpp + rclcpp_components + sensor_msgs + tier4_perception_msgs + trt_batched_nms + + image_transport_decompressor + + ament_lint_auto + autoware_lint_common + + + ament_cmake + + diff --git a/perception/autoware_tensorrt_rtmdet/schema/rtmdet.schema.json b/perception/autoware_tensorrt_rtmdet/schema/rtmdet.schema.json new file mode 100644 index 0000000000000..132563791a590 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/schema/rtmdet.schema.json @@ -0,0 +1,139 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Parameters for tensorrt_rtmdet Nodes", + "type": "object", + "definitions": { + "rtmdet": { + "type": "object", + "properties": { + "is_publish_debug_image": { + "type": "boolean", + "default": false, + "description": "If true, debug image which contain bounding boxes is published at runtime." + }, + "is_publish_color_mask": { + "type": "boolean", + "default": false, + "description": "If true, debug image which contain colorful masks is published at runtime." + }, + "model_path": { + "type": "string", + "default": "$(var data_path)/tensorrt_rtmdet/$(var model_name).onnx", + "description": "Path to ONNX file." + }, + "color_map_path": { + "type": "string", + "default": "$(var data_path)/tensorrt_rtmdet/color_map.csv", + "description": "Path to color map file." + }, + "plugin_paths": { + "type": "array", + "default": ["./build/tensorrt_rtmdet/libtensorrt_rtmdet_plugin.so"], + "description": "Path to plugin files." + }, + "input_image_mean": { + "type": "array", + "default": [103.53, 116.28, 123.675], + "description": "Mean values for the model." + }, + "input_image_std_dev": { + "type": "array", + "default": [57.375, 57.12, 58.395], + "description": "Standard deviation values for model." 
+ }, + "score_threshold": { + "type": "number", + "default": 0.3, + "minimum": 0.0, + "maximum": 1.0, + "description": "A threshold value of existence probability score, all of objects with score less than this threshold are ignored." + }, + "nms_threshold": { + "type": "number", + "default": 0.0, + "minimum": 0.0, + "maximum": 1.0, + "description": "A threshold value of NMS." + }, + "mask_threshold": { + "type": "number", + "default": 0.7, + "minimum": 0.0, + "maximum": 1.0, + "description": "A threshold value of pixel values in the output mask." + }, + "precision": { + "type": "string", + "default": "fp16", + "description": "Operation precision to be used on inference. Valid value is one of: [fp32, fp16, int8]." + }, + "dla_core_id": { + "type": "number", + "default": -1, + "description": "If positive ID value is specified, the node assign inference task to the DLA core." + }, + "quantize_first_layer": { + "type": "boolean", + "default": false, + "description": "If true, set the operating precision for the first (input) layer to be fp16. This option is valid only when precision==int8." + }, + "quantize_last_layer": { + "type": "boolean", + "default": false, + "description": "If true, set the operating precision for the last (output) layer to be fp16. This option is valid only when precision==int8." + }, + "profile_per_layer": { + "type": "boolean", + "default": false, + "description": "If true, profiler function will be enabled. Since the profile function may affect execution speed, it is recommended to set this flag true only for development purpose." + }, + "clip_value": { + "type": "number", + "default": 0.0, + "description": "If positive value is specified, the value of each layer output will be clipped between [0.0, clip_value]. This option is valid only when precision==int8 and used to manually specify the dynamic range instead of using any calibration." + }, + "apply_erosion": { + "type": "boolean", + "default": false, + "description": "If true, apply erosion to the output mask." + }, + "calibration_image_list_path": { + "type": "string", + "default": "", + "description": "Path to a file which contains path to images. Those images will be used for int8 quantization." + } + }, + "required": [ + "is_publish_debug_image", + "is_publish_color_mask", + "model_path", + "color_map_path", + "plugin_paths", + "input_image_mean", + "input_image_std_dev", + "score_threshold", + "nms_threshold", + "mask_threshold", + "precision", + "dla_core_id", + "quantize_first_layer", + "quantize_last_layer", + "profile_per_layer", + "clip_value", + "apply_erosion" + ] + } + }, + "properties": { + "/**": { + "type": "object", + "properties": { + "ros__parameters": { + "$ref": "#/definitions/rtmdet" + } + }, + "required": ["ros__parameters"] + } + }, + "required": ["/**"] +} diff --git a/perception/autoware_tensorrt_rtmdet/src/preprocess.cu b/perception/autoware_tensorrt_rtmdet/src/preprocess.cu new file mode 100644 index 0000000000000..f5139805c9983 --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/src/preprocess.cu @@ -0,0 +1,461 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/tensorrt_rtmdet/preprocess.hpp" + +#include +#include +#include + +namespace autoware::tensorrt_rtmdet +{ +constexpr size_t block = 512; +constexpr size_t max_blocks_per_dim = 65535; +constexpr int C = 3; + +dim3 cuda_gridsize(size_t n) +{ + size_t k = (n - 1) / block + 1; + size_t x = k; + size_t y = 1; + if (x > max_blocks_per_dim) { + x = ceil(sqrt(k)); + y = (n - 1) / (x * block) + 1; + } + dim3 d; + d.x = x; + d.y = y; + d.z = 1; + return d; +} + +__device__ double lerp1d(int a, int b, float w) +{ + return fma(w, (float)b, fma(-w, (float)a, (float)a)); +} + +__device__ float lerp2d(int f00, int f01, int f10, int f11, float centroid_h, float centroid_w) +{ + centroid_w = (1 + lroundf(centroid_w) - centroid_w) / 2; + centroid_h = (1 + lroundf(centroid_h) - centroid_h) / 2; + + float r0, r1, r; + r0 = (float)lround(lerp1d(f00, f01, centroid_w)); + r1 = (float)lround(lerp1d(f10, f11, centroid_w)); + + r = lerp1d((int)r0, (int)r1, centroid_h); + return r; +} + +__global__ void resize_bilinear_kernel( + int N, unsigned char * dst_img, unsigned char * src_img, int dst_w, int src_h, int src_w, + float stride_h, float stride_w) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int W = dst_w; + + int n = 0; + + int w = index % W; + int h = index / W; + + float centroid_h, centroid_w; + centroid_h = stride_h * (float)(h + 0.5); + centroid_w = stride_w * (float)(w + 0.5); + + int f00, f01, f10, f11; + + int src_h_idx = lroundf(centroid_h) - 1; + int src_w_idx = lroundf(centroid_w) - 1; + if (src_h_idx < 0) { + src_h_idx = 0; + } + if (src_w_idx < 0) { + src_w_idx = 0; + } + + index = C * w + C * W * h; +#pragma unroll + for (int c = 0; c < C; c++) { + f00 = n * src_h * src_w * C + src_h_idx * src_w * C + src_w_idx * C + c; + f01 = n * src_h * src_w * C + src_h_idx * src_w * C + (src_w_idx + 1) * C + c; + f10 = n * src_h * src_w * C + (src_h_idx + 1) * src_w * C + src_w_idx * C + c; + f11 = n * src_h * src_w * C + (src_h_idx + 1) * src_w * C + (src_w_idx + 1) * C + c; + + float rs = lroundf(lerp2d( + (int)src_img[f00], (int)src_img[f01], (int)src_img[f10], (int)src_img[f11], centroid_h, + centroid_w)); + dst_img[index + c] = (unsigned char)rs; + } +} + +void resize_bilinear_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h, cudaStream_t stream) +{ + int N = d_w * d_h; + float stride_h = (float)s_h / (float)d_h; + float stride_w = (float)s_w / (float)d_w; + + resize_bilinear_kernel<<>>( + N, dst, src, d_w, s_h, s_w, stride_h, stride_w); +} + +__global__ void letterbox_kernel( + int N, unsigned char * dst_img, unsigned char * src_img, int dst_w, int src_w, int letter_bot, + int letter_right) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int W = dst_w; + + int w = index % W; + int h = index / W; + + index = (C * w) + (C * W * h); + int index2 = (C * w) + (C * src_w * h); +#pragma unroll + for (int c = 0; c < C; c++) { + dst_img[index + c] = + (w >= letter_right || h >= letter_bot) 
? (unsigned int)114 : src_img[index2 + c]; + } +} + +void letterbox_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h, cudaStream_t stream) +{ + int N = d_w * d_h; + const float scale = std::min(d_w / (float)s_w, d_h / (float)s_h); + int r_h = (int)(scale * s_h); + int r_w = (int)(scale * s_w); + + letterbox_kernel<<>>(N, dst, src, d_w, r_w, r_h, r_w); +} + +__global__ void nhwc_to_nchw_kernel( + int N, unsigned char * dst_img, unsigned char * src_img, int height, int width) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int x = index % width; + int y = index / width; + int src_index; + int dst_index; + for (int c = 0; c < C; c++) { + src_index = c + (C * x) + (C * width * y); + dst_index = x + (width * y) + (width * height * c); + dst_img[dst_index] = src_img[src_index]; + } +} + +void nhwc_to_nchw_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, cudaStream_t stream) +{ + int N = d_w * d_h; + nhwc_to_nchw_kernel<<>>(N, dst, src, d_h, d_w); +} + +__global__ void nchw_to_nhwc_kernel( + int N, unsigned char * dst, unsigned char * src, int height, int width) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int x = index % width; + int y = index / width; + int src_index; + int dst_index; + for (int c = 0; c < C; c++) { + // NHWC + dst_index = c + (C * x) + (C * width * y); + // NCHW + src_index = x + (width * y) + (width * height * c); + dst[dst_index] = src[src_index]; + } +} + +void nchw_to_nhwc_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, cudaStream_t stream) +{ + int N = d_w * d_h; + nchw_to_nhwc_kernel<<>>(N, dst, src, d_h, d_w); +} + +__global__ void to_float_kernel(int N, float * dst32, unsigned char * src8, int height, int width) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int x = index % width; + int y = index / width; + int dst_index; + for (int c = 0; c < C; c++) { + // NCHW + dst_index = x + (width * y) + (width * height * c); + dst32[dst_index] = (float)(src8[dst_index]); + } +} + +void to_float_gpu(float * dst32, unsigned char * src, int d_w, int d_h, cudaStream_t stream) +{ + int N = d_w * d_h; + to_float_kernel<<>>(N, dst32, src, d_h, d_w); +} + +__global__ void resize_bilinear_letterbox_kernel( + int N, unsigned char * dst_img, unsigned char * src_img, int dst_w, int src_h, int src_w, + float scale, int letter_bot, int letter_right) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int W = dst_w; + int n = 0; // index / (C*W*H); + + int w = index % W; + int h = index / W; + + float centroid_h, centroid_w; + centroid_h = scale * (float)(h + 0.5); + centroid_w = scale * (float)(w + 0.5); + + int f00, f01, f10, f11; + + int src_h_idx = (int)lroundf(centroid_h) - 1; + int src_w_idx = (int)lroundf(centroid_w) - 1; + if (src_h_idx < 0) { + src_h_idx = 0; + } + if (src_w_idx < 0) { + src_w_idx = 0; + } + if (src_h_idx >= src_h) { + src_h_idx = src_h - 1; + } + if (src_w_idx >= src_w) { + src_w_idx = src_w - 1; + } + + index = (C * w) + (C * W * h); +#pragma unroll + for (int c = 0; c < C; c++) { + f00 = n * src_h * src_w * C + src_h_idx * src_w * C + src_w_idx * C + c; + f01 = n * src_h * src_w * C + src_h_idx * src_w * C + (src_w_idx + 1) * C + c; + f10 = n * src_h * src_w * C + (src_h_idx + 1) * src_w * C + 
src_w_idx * C + c; + f11 = n * src_h * src_w * C + (src_h_idx + 1) * src_w * C + (src_w_idx + 1) * C + c; + + float rs = lroundf(lerp2d( + (int)src_img[f00], (int)src_img[f01], (int)src_img[f10], (int)src_img[f11], centroid_h, + centroid_w)); + dst_img[index + c] = (unsigned char)rs; + dst_img[index + c] = (h >= letter_bot) ? (unsigned int)114 : dst_img[index + c]; + dst_img[index + c] = (w >= letter_right) ? (unsigned int)114 : dst_img[index + c]; + } +} + +void resize_bilinear_letterbox_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h, cudaStream_t stream) +{ + int N = d_w * d_h; + const float scale = std::min(d_w / (float)s_w, d_h / (float)s_h); + int r_h = (int)(scale * s_h); + int r_w = (int)(scale * s_w); + resize_bilinear_letterbox_kernel<<>>( + N, dst, src, d_w, s_h, s_w, 1.0 / scale, r_h, r_w); +} + +__global__ void resize_bilinear_letterbox_nhwc_to_nchw32_kernel( + int N, float * dst_img, unsigned char * src_img, int dst_h, int dst_w, int src_h, int src_w, + float scale, int letter_bot, int letter_right, float norm) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int H = dst_h; + int W = dst_w; + + int w = index % W; + int h = index / W; + + float centroid_h, centroid_w; + centroid_h = scale * (float)(h + 0.5); + centroid_w = scale * (float)(w + 0.5); + + int f00, f01, f10, f11; + + int src_h_idx = lroundf(centroid_h) - 1; + int src_w_idx = lroundf(centroid_w) - 1; + src_h_idx = (src_h_idx < 0) ? 0 : src_h_idx; + src_h_idx = (src_h_idx >= (src_h - 1)) ? src_h - 2 : src_h_idx; + src_w_idx = (src_w_idx < 0) ? 0 : src_w_idx; + src_w_idx = (src_w_idx >= (src_w - 1)) ? src_w - 2 : src_w_idx; + + int stride = src_w * C; + +#pragma unroll + for (int c = 0; c < C; c++) { + f00 = src_h_idx * stride + src_w_idx * C + c; + f01 = src_h_idx * stride + (src_w_idx + 1) * C + c; + f10 = (src_h_idx + 1) * stride + src_w_idx * C + c; + f11 = (src_h_idx + 1) * stride + (src_w_idx + 1) * C + c; + + float rs = lroundf(lerp2d( + (int)src_img[f00], (int)src_img[f01], (int)src_img[f10], (int)src_img[f11], centroid_h, + centroid_w)); + + // NHCW + int dst_index = w + (W * h) + (W * H * c); + + dst_img[dst_index] = (float)rs; + dst_img[dst_index] = (h >= letter_bot) ? 114.0 : dst_img[dst_index]; + dst_img[dst_index] = (w >= letter_right) ? 
114.0 : dst_img[dst_index]; + dst_img[dst_index] *= norm; + } +} + +void resize_bilinear_letterbox_nhwc_to_nchw32_gpu( + float * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h, float norm, + cudaStream_t stream) +{ + int N = d_w * d_h; + const float scale = std::min(d_w / (float)s_w, d_h / (float)s_h); + int r_h = scale * s_h; + int r_w = scale * s_w; + + resize_bilinear_letterbox_nhwc_to_nchw32_kernel<<>>( + N, dst, src, d_h, d_w, s_h, s_w, 1.0 / scale, r_h, r_w, norm); +} + +__global__ void resize_bilinear_letterbox_nhwc_to_nchw32_batch_kernel( + int N, float * dst_img, unsigned char * src_img, int dst_h, int dst_w, int src_h, int src_w, + float scale_h, float scale_w, int letter_bot, int letter_right, int batch, const float * mean, + const float * std) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int H = dst_h; + int W = dst_w; + + int w = index % W; + int h = index / (W); + float centroid_h, centroid_w; + centroid_h = scale_h * (float)(h + 0.5); + centroid_w = scale_w * (float)(w + 0.5); + + int f00, f01, f10, f11; + + int src_h_idx = lroundf(centroid_h) - 1; + int src_w_idx = lroundf(centroid_w) - 1; + src_h_idx = (src_h_idx < 0) ? 0 : src_h_idx; + src_h_idx = (src_h_idx >= (src_h - 1)) ? src_h - 2 : src_h_idx; + src_w_idx = (src_w_idx < 0) ? 0 : src_w_idx; + src_w_idx = (src_w_idx >= (src_w - 1)) ? src_w - 2 : src_w_idx; + + int stride = src_w * C; + int b_stride = src_h * src_w * C; + +#pragma unroll + for (int b = 0; b < batch; b++) { + for (int c = 0; c < C; c++) { + // NHWC + f00 = src_h_idx * stride + src_w_idx * C + c + b * b_stride; + f01 = src_h_idx * stride + (src_w_idx + 1) * C + c + b * b_stride; + f10 = (src_h_idx + 1) * stride + src_w_idx * C + c + b * b_stride; + f11 = (src_h_idx + 1) * stride + (src_w_idx + 1) * C + c + b * b_stride; + + float rs = lroundf(lerp2d( + (int)src_img[f00], (int)src_img[f01], (int)src_img[f10], (int)src_img[f11], centroid_h, + centroid_w)); + + // NCHW + int dst_index = w + (W * h) + (W * H * c) + b * (W * H * C); + + dst_img[dst_index] = (float)rs; + dst_img[dst_index] = (h >= letter_bot) ? 114.0 : dst_img[dst_index]; + dst_img[dst_index] = (w >= letter_right) ? 
114.0 : dst_img[dst_index]; + // dst_img[dst_index] *= norm; + dst_img[dst_index] -= mean[c]; + dst_img[dst_index] /= std[c]; + } + } +} + +void resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( + float * dst, unsigned char * src, int d_w, int d_h, int s_w, int s_h, int batch, + const float * mean, const float * std, cudaStream_t stream) +{ + int N = d_w * d_h; + + const float scale_h = d_h / (float)s_h; + const float scale_w = d_w / (float)s_w; + + int r_h = scale_h * s_h; + int r_w = scale_w * s_w; + + float *d_mean, *d_std; + cudaMalloc(&d_mean, 3 * sizeof(float)); + cudaMalloc(&d_std, 3 * sizeof(float)); + cudaMemcpyAsync(d_mean, mean, 3 * sizeof(float), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_std, std, 3 * sizeof(float), cudaMemcpyHostToDevice, stream); + + resize_bilinear_letterbox_nhwc_to_nchw32_batch_kernel<<>>( + N, dst, src, d_h, d_w, s_h, s_w, 1.0 / scale_h, 1.0 / scale_w, r_h, r_w, batch, d_mean, d_std); +} + +__global__ void argmax_gpu_kernel( + int N, unsigned char * dst, float * src, int dst_h, int dst_w, int src_c, int src_h, int src_w, + int batch) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int w = index % dst_w; + int h = index / (dst_w); + + for (int b = 0; b < batch; b++) { + float max_prob = 0.0; + int max_index = 0; + int dst_index = w + dst_w * h + b * dst_h * dst_w; + for (int c = 0; c < src_c; c++) { + int src_index = w + src_w * h + c * src_h * src_w + b * src_c * src_h * src_w; + max_index = max_prob < src[src_index] ? c : max_index; + max_prob = max_prob < src[src_index] ? src[src_index] : max_prob; + } + dst[dst_index] = max_index; + } +} + +void argmax_gpu( + unsigned char * dst, float * src, int d_w, int d_h, int s_w, int s_h, int s_c, int batch, + cudaStream_t stream) +{ + int N = d_w * d_h; + argmax_gpu_kernel<<>>( + N, dst, src, d_h, d_w, s_c, s_h, s_w, batch); +} + +} // namespace autoware::tensorrt_rtmdet diff --git a/perception/autoware_tensorrt_rtmdet/src/tensorrt_rtmdet.cpp b/perception/autoware_tensorrt_rtmdet/src/tensorrt_rtmdet.cpp new file mode 100644 index 0000000000000..54f5aca0d9c1c --- /dev/null +++ b/perception/autoware_tensorrt_rtmdet/src/tensorrt_rtmdet.cpp @@ -0,0 +1,368 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/tensorrt_rtmdet/tensorrt_rtmdet.hpp" + +#include "autoware/tensorrt_rtmdet/calibrator.hpp" +#include "autoware/tensorrt_rtmdet/preprocess.hpp" + +#include +#include +#include +#include +#include +#include +#include + +/** + * @brief Load calibration image list from a text file. + * + * @param filename Path to the text file. + * @return List of image paths. + */ +std::vector load_calibration_image_list(const std::string & filename) +{ + if (filename.empty()) { + RCLCPP_ERROR( + rclcpp::get_logger("autoware_tensorrt_rtmdet"), + "Calibration image list is empty. 
Please set calibration_image_list_path."); + return {}; + } + if (!std::experimental::filesystem::exists(std::experimental::filesystem::path(filename))) { + RCLCPP_ERROR( + rclcpp::get_logger("autoware_tensorrt_rtmdet"), "failed to open %s", filename.c_str()); + return {}; + } + + std::ifstream txt_file(filename); + + std::string line; + std::vector image_list; + while (std::getline(txt_file, line)) { + image_list.push_back(line); + } + + txt_file.close(); + return image_list; +} + +namespace autoware::tensorrt_rtmdet +{ +TrtRTMDet::TrtRTMDet( + const std::string & model_path, const std::string & precision, const ColorMap & color_map, + const float score_threshold, const float nms_threshold, const float mask_threshold, + const autoware::tensorrt_common::BuildConfig & build_config, + const std::string & calibration_image_list_file_path, const double norm_factor, + const std::vector & mean, const std::vector & std, + [[maybe_unused]] const std::string & cache_dir, + const autoware::tensorrt_common::BatchConfig & batch_config, const size_t max_workspace_size, + const std::vector & plugin_paths) +: score_threshold_{score_threshold}, + nms_threshold_{nms_threshold}, + mask_threshold_{mask_threshold}, + batch_size_{batch_config.at(1)}, + norm_factor_{norm_factor}, + mean_{mean}, + std_{std}, + color_map_{color_map} +{ + src_width_ = -1; + src_height_ = -1; + scale_width_ = -1; + scale_height_ = -1; + + if (precision == "int8") { + std::vector calibration_images = + load_calibration_image_list(calibration_image_list_file_path); + + int max_batch_size = batch_config.at(2); + nvinfer1::Dims input_dims = autoware::tensorrt_common::get_input_dims(model_path); + tensorrt_rtmdet::ImageStream stream(max_batch_size, input_dims, calibration_images); + + fs::path calibration_table{model_path}; + std::string ext = "EntropyV2-calibration.table"; + calibration_table.replace_extension(ext); + + fs::path histogram_table{model_path}; + ext = "histogram.table"; + histogram_table.replace_extension(ext); + + std::unique_ptr calibrator; + calibrator = std::make_unique( + stream, calibration_table, mean_, std_); + + trt_common_ = std::make_unique( + model_path, precision, std::move(calibrator), batch_config, max_workspace_size, build_config, + plugin_paths); + } else { + trt_common_ = std::make_unique( + model_path, precision, nullptr, batch_config, max_workspace_size, build_config, plugin_paths); + } + trt_common_->setup(); + + if (!trt_common_->isInitialized()) { + return; + } + + const auto input_dims = trt_common_->getBindingDimensions(0); + const auto out_scores_dims = trt_common_->getBindingDimensions(3); + + max_detections_ = out_scores_dims.d[1]; + model_input_height_ = input_dims.d[2]; + model_input_width_ = input_dims.d[3]; + + input_d_ = autoware::cuda_utils::make_unique( + batch_size_ * input_dims.d[1] * input_dims.d[2] * input_dims.d[3]); + out_detections_d_ = autoware::cuda_utils::make_unique(batch_size_ * max_detections_ * 5); + out_labels_d_ = autoware::cuda_utils::make_unique(batch_size_ * max_detections_); + out_masks_d_ = autoware::cuda_utils::make_unique( + batch_size_ * max_detections_ * model_input_width_ * model_input_height_); + + out_detections_h_ = std::make_unique(batch_size_ * max_detections_ * 5); + out_labels_h_ = std::make_unique(batch_size_ * max_detections_); + out_masks_h_ = std::make_unique( + batch_size_ * max_detections_ * model_input_width_ * model_input_height_); + +#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8050 + std::vector buffers = { + 
+
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8050
+  std::vector<void *> buffers = {
+    input_d_.get(), out_detections_d_.get(), out_labels_d_.get(), out_masks_d_.get()};
+  trt_common_->setupBindings(buffers);
+#endif
+}
+
+TrtRTMDet::~TrtRTMDet() noexcept
+{
+  if (image_buf_h_) {
+    image_buf_h_.reset();
+  }
+  if (image_buf_d_) {
+    image_buf_d_.reset();
+  }
+}
+
+void TrtRTMDet::print_profiling()
+{
+  trt_common_->printProfiling();
+}
+
+void TrtRTMDet::preprocess_gpu(const std::vector<cv::Mat> & images)
+{
+  const auto batch_size = images.size();
+  auto input_dims = trt_common_->getBindingDimensions(0);
+
+  input_dims.d[0] = static_cast<int32_t>(batch_size);
+  for (const auto & image : images) {
+    // If the size of the source input has changed, the buffers must be reallocated.
+    int width = image.cols;
+    int height = image.rows;
+    if (src_width_ != -1 || src_height_ != -1) {
+      if (width != src_width_ || height != src_height_) {
+        // Free cuda memory to reallocate
+        if (image_buf_h_) {
+          image_buf_h_.reset();
+        }
+        if (image_buf_d_) {
+          image_buf_d_.reset();
+        }
+      }
+    }
+    src_width_ = width;
+    src_height_ = height;
+  }
+  if (!image_buf_h_) {
+    trt_common_->setBindingDimensions(0, input_dims);
+    scale_width_ = 0;
+    scale_height_ = 0;
+  }
+  const auto input_height = static_cast<float>(input_dims.d[2]);
+  const auto input_width = static_cast<float>(input_dims.d[3]);
+  int b = 0;
+  for (const auto & image : images) {
+    if (!image_buf_h_) {
+      scale_width_ = input_width / static_cast<float>(image.cols);
+      scale_height_ = input_height / static_cast<float>(image.rows);
+      image_buf_h_ = autoware::cuda_utils::make_unique_host<unsigned char[]>(
+        image.cols * image.rows * 3 * batch_size, cudaHostAllocWriteCombined);
+      image_buf_d_ = autoware::cuda_utils::make_unique<unsigned char[]>(
+        image.cols * image.rows * 3 * batch_size);
+    }
+    int index = b * image.cols * image.rows * 3;
+    // Copy into pinned memory
+    memcpy(
+      image_buf_h_.get() + index, &image.data[0],
+      image.cols * image.rows * 3 * sizeof(unsigned char));
+    b++;
+  }
+  // Copy into device memory
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    image_buf_d_.get(), image_buf_h_.get(),
+    images[0].cols * images[0].rows * 3 * batch_size * sizeof(unsigned char),
+    cudaMemcpyHostToDevice, *stream_));
+  // Preprocess on GPU
+  resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
+    input_d_.get(), image_buf_d_.get(), static_cast<int>(input_width),
+    static_cast<int>(input_height), images[0].cols, images[0].rows,
+    static_cast<int>(batch_size), mean_.data(), std_.data(), *stream_);
+}
+
+bool TrtRTMDet::do_inference(
+  const std::vector<cv::Mat> & images, ObjectArrays & objects, cv::Mat & mask,
+  std::vector<uint8_t> & class_ids)
+{
+  if (!trt_common_->isInitialized()) {
+    return false;
+  }
+  preprocess_gpu(images);
+
+  return feedforward(images, objects, mask, class_ids);
+}
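A minimal caller sketch for `do_inference` (illustrative, not part of the package; it assumes an already constructed `TrtRTMDet` and the signatures used in this file):

```cpp
#include "autoware/tensorrt_rtmdet/tensorrt_rtmdet.hpp"

#include <opencv2/opencv.hpp>

#include <cstdint>
#include <cstdio>
#include <vector>

// Run one frame through the detector and walk the per-image results.
void run_single_frame(autoware::tensorrt_rtmdet::TrtRTMDet & detector, const cv::Mat & frame)
{
  autoware::tensorrt_rtmdet::ObjectArrays objects;  // one ObjectArray per input image
  cv::Mat mask;                                     // CV_8UC1, model-input resolution
  std::vector<uint8_t> class_ids;                   // one label id per mask intensity

  if (!detector.do_inference({frame}, objects, mask, class_ids)) {
    return;  // engine not initialized or inference failed
  }
  for (const auto & object : objects.at(0)) {
    // x1/y1/x2/y2 are already rescaled back to the source image resolution.
    std::printf("class %d, score %.2f\n", object.class_id, object.score);
  }
}
```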
+
+bool TrtRTMDet::feedforward(
+  const std::vector<cv::Mat> & images, ObjectArrays & objects, cv::Mat & mask,
+  std::vector<uint8_t> & class_ids)
+{
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8050
+  trt_common_->enqueueV3(*stream_);
+#else
+  std::vector<void *> buffers = {
+    input_d_.get(), out_detections_d_.get(), out_labels_d_.get(), out_masks_d_.get()};
+
+  trt_common_->enqueueV2(buffers.data(), *stream_, nullptr);
+#endif
+
+  const auto batch_size = images.size();
+  out_detections_h_.reset(new float[batch_size_ * max_detections_ * 5]);
+  out_labels_h_.reset(new int32_t[batch_size_ * max_detections_]);
+  out_masks_h_.reset(
+    new float[batch_size_ * max_detections_ * model_input_width_ * model_input_height_]);
+
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_detections_h_.get(), out_detections_d_.get(),
+    sizeof(float) * batch_size_ * max_detections_ * 5, cudaMemcpyDeviceToHost, *stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_labels_h_.get(), out_labels_d_.get(), sizeof(int32_t) * batch_size_ * max_detections_,
+    cudaMemcpyDeviceToHost, *stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_masks_h_.get(), out_masks_d_.get(),
+    sizeof(float) * batch_size_ * max_detections_ * model_input_width_ * model_input_height_,
+    cudaMemcpyDeviceToHost, *stream_));
+
+  cudaStreamSynchronize(*stream_);
+
+  // POST PROCESSING
+  objects.clear();
+  for (size_t batch = 0; batch < batch_size; ++batch) {
+    ObjectArray object_array;
+    for (uint32_t index = 0; index < max_detections_; ++index) {
+      // Detections are sorted by score, so the loop can stop at the first entry
+      // below the threshold.
+      if (
+        out_detections_h_[(batch * max_detections_ * 5) + ((5 * index) + 4)] < score_threshold_) {
+        break;
+      }
+
+      Object object{};
+      object.mask_index = index;
+      object.class_id = out_labels_h_[(batch * max_detections_) + index];
+      object.x1 = static_cast<uint32_t>(
+        out_detections_h_[(batch * max_detections_ * 5) + ((5 * index) + 0)] / scale_width_);
+      object.y1 = static_cast<uint32_t>(
+        out_detections_h_[(batch * max_detections_ * 5) + ((5 * index) + 1)] / scale_height_);
+      object.x2 = static_cast<uint32_t>(
+        out_detections_h_[(batch * max_detections_ * 5) + ((5 * index) + 2)] / scale_width_);
+      object.y2 = static_cast<uint32_t>(
+        out_detections_h_[(batch * max_detections_ * 5) + ((5 * index) + 3)] / scale_height_);
+      object.score = out_detections_h_[(batch * max_detections_ * 5) + ((5 * index) + 4)];
+      object_array.push_back(object);
+    }
+    ObjectArray nms_objects;
+    nms_sorted_bboxes(object_array, nms_objects, nms_threshold_);
+
+    objects.push_back(nms_objects);
+  }
+
+  // Create an instance segmentation mask.
+  // The mask is an image with the same dimensions as the model input image,
+  // where each non-zero pixel holds a 1-based instance intensity that indexes
+  // into the class_ids array, which stores the class IDs.
+  mask = cv::Mat(
+    static_cast<int>(model_input_height_), static_cast<int>(model_input_width_), CV_8UC1,
+    cv::Scalar(0));
+  uint8_t pixel_intensity = 1;  // 0 is reserved for background
+  for (size_t batch = 0; batch < batch_size; ++batch) {
+    for (const auto & object : objects.at(batch)) {
+      cv::Mat object_mask(
+        static_cast<int>(model_input_height_), static_cast<int>(model_input_width_), CV_32F,
+        &out_masks_h_
+          [(batch * max_detections_ * model_input_width_ * model_input_height_) +
+           (object.mask_index * model_input_width_ * model_input_height_)]);
+      double min_val = 0;
+      double max_val = 0;
+      cv::minMaxLoc(object_mask, &min_val, &max_val);
+      object_mask.convertTo(
+        object_mask, CV_8U, 255.0 / (max_val - min_val), -min_val * 255.0 / (max_val - min_val));
+
+      auto process_pixel = [&]([[maybe_unused]] uint8_t & pixel, const int * position) -> void {
+        int i = position[0];
+        int j = position[1];
+
+        if (object_mask.at<uint8_t>(i, j) > static_cast<uint8_t>(255 * mask_threshold_)) {
+          mask.at<uint8_t>(i, j) = pixel_intensity;
+        }
+      };
+      mask.forEach<uint8_t>(process_pixel);
+      class_ids.push_back(color_map_.at(object.class_id).label_id);
+      pixel_intensity++;
+    }
+  }
+  return true;
+}
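The detection tensor copied back above is flat: five floats per detection, `max_detections_` detections per batch entry. A hedged decode sketch of that layout (the struct and function names are illustrative, not package API):

```cpp
#include <cstddef>

// Decode one row of the flattened detection tensor. Layout assumed from the
// indexing in feedforward: [x1, y1, x2, y2, score] per detection, in
// model-input pixel coordinates (before division by the letterbox scale).
struct RawDetection
{
  float x1, y1, x2, y2, score;
};

RawDetection decode_detection(
  const float * out_detections, std::size_t max_detections, std::size_t batch, std::size_t index)
{
  const float * row = out_detections + (batch * max_detections * 5) + (5 * index);
  return RawDetection{row[0], row[1], row[2], row[3], row[4]};
}
```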
+
+float TrtRTMDet::intersection_over_union(const Object & a, const Object & b)
+{
+  uint32_t x_left = std::max(a.x1, b.x1);
+  uint32_t y_top = std::max(a.y1, b.y1);
+  uint32_t x_right = std::min(a.x2, b.x2);
+  uint32_t y_bottom = std::min(a.y2, b.y2);
+
+  if (x_right < x_left || y_bottom < y_top) {
+    return 0.0;
+  }
+
+  float intersection_area =
+    static_cast<float>((x_right - x_left + 1) * (y_bottom - y_top + 1));
+  float a_area = static_cast<float>((a.x2 - a.x1 + 1) * (a.y2 - a.y1 + 1));
+  float b_area = static_cast<float>((b.x2 - b.x1 + 1) * (b.y2 - b.y1 + 1));
+
+  return intersection_area / (a_area + b_area - intersection_area);
+}
+
+void TrtRTMDet::nms_sorted_bboxes(
+  const ObjectArray & input_objects, ObjectArray & output_objects, const float & nms_threshold)
+{
+  std::vector<bool> suppressed(input_objects.size(), false);
+
+  for (size_t i = 0; i < input_objects.size(); ++i) {
+    if (suppressed.at(i)) continue;
+
+    const Object & a = input_objects.at(i);
+    output_objects.push_back(a);
+
+    for (size_t j = i + 1; j < input_objects.size(); ++j) {
+      if (suppressed.at(j)) continue;
+
+      const Object & b = input_objects.at(j);
+      if (intersection_over_union(a, b) > nms_threshold) {
+        suppressed.at(j) = true;
+      }
+    }
+  }
+}
+}  // namespace autoware::tensorrt_rtmdet
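Because the box coordinates are inclusive pixel indices, the areas use the +1 convention. As a worked example with the boxes used in `TestIntersectionOverUnion` below, (10,10,50,50) and (20,20,60,60): intersection = (50 − 20 + 1)² = 961, each box area = 41² = 1681, union = 1681 + 1681 − 961 = 2401, so IoU = 961 / 2401 ≈ 0.40, which is the value the test asserts.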
diff --git a/perception/autoware_tensorrt_rtmdet/src/tensorrt_rtmdet_node.cpp b/perception/autoware_tensorrt_rtmdet/src/tensorrt_rtmdet_node.cpp
new file mode 100644
index 0000000000000..621c39d4fd190
--- /dev/null
+++ b/perception/autoware_tensorrt_rtmdet/src/tensorrt_rtmdet_node.cpp
@@ -0,0 +1,301 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/tensorrt_rtmdet/tensorrt_rtmdet_node.hpp"
+
+#include "autoware_perception_msgs/msg/object_classification.hpp"
+#include "tier4_perception_msgs/msg/detected_object_with_feature.hpp"
+
+#include <experimental/filesystem>
+
+#include <fstream>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace autoware::tensorrt_rtmdet
+{
+TrtRTMDetNode::TrtRTMDetNode(const rclcpp::NodeOptions & node_options)
+: Node("tensorrt_rtmdet", node_options),
+  is_publish_color_mask_(declare_parameter<bool>("is_publish_color_mask")),
+  is_publish_debug_image_(declare_parameter<bool>("is_publish_debug_image")),
+  is_apply_erosion_(declare_parameter<bool>("apply_erosion"))
+{
+  {
+    stop_watch_ptr_ =
+      std::make_unique<autoware::universe_utils::StopWatch<std::chrono::milliseconds>>();
+    debug_publisher_ =
+      std::make_unique<autoware::universe_utils::DebugPublisher>(this, this->get_name());
+    stop_watch_ptr_->tic("cyclic_time");
+    stop_watch_ptr_->tic("processing_time");
+  }
+  using std::placeholders::_1;
+  using std::chrono_literals::operator""ms;
+
+  std::string model_path = declare_parameter<std::string>("model_path");
+  std::string color_map_path = declare_parameter<std::string>("color_map_path");
+  std::string precision = declare_parameter<std::string>("precision");
+  std::vector<double> mean = declare_parameter<std::vector<double>>("input_image_mean");
+  std::vector<double> std = declare_parameter<std::vector<double>>("input_image_std_dev");
+  double score_threshold = declare_parameter<double>("score_threshold");
+  double nms_threshold = declare_parameter<double>("nms_threshold");
+  double mask_threshold = declare_parameter<double>("mask_threshold");
+  int64_t dla_core_id = declare_parameter<int64_t>("dla_core_id");
+  bool quantize_first_layer = declare_parameter<bool>("quantize_first_layer");
+  bool quantize_last_layer = declare_parameter<bool>("quantize_last_layer");
+  bool profile_per_layer = declare_parameter<bool>("profile_per_layer");
+  double clip_value = declare_parameter<double>("clip_value");
+  std::string calibration_image_list_path =
+    declare_parameter<std::string>("calibration_image_list_path");
+  std::vector<std::string> plugin_paths =
+    declare_parameter<std::vector<std::string>>("plugin_paths");
+
+  color_map_ = read_color_map_file(color_map_path);
+
+  autoware::tensorrt_common::BuildConfig build_config(
+    "Entropy", dla_core_id, quantize_first_layer, quantize_last_layer, profile_per_layer,
+    clip_value);
+
+  const double norm_factor = 1.0;
+  const std::string cache_dir;
+  const autoware::tensorrt_common::BatchConfig batch_config{1, 1, 1};
+  const size_t max_workspace_size = (1u << 30u);
+
+  mean_ = std::vector<float>(mean.begin(), mean.end());
+  std_ = std::vector<float>(std.begin(), std.end());
+
+  trt_rtmdet_ = std::make_unique<TrtRTMDet>(
+    model_path, precision, color_map_, score_threshold, nms_threshold, mask_threshold,
+    build_config, calibration_image_list_path, norm_factor, mean_, std_, cache_dir, batch_config,
+    max_workspace_size, plugin_paths);
+
+  timer_ =
+    rclcpp::create_timer(this, get_clock(), 100ms, std::bind(&TrtRTMDetNode::on_connect, this));
+
+  objects_pub_ = this->create_publisher<tier4_perception_msgs::msg::DetectedObjectsWithFeature>(
+    "~/out/objects", 1);
+  mask_pub_ = this->create_publisher<autoware_internal_perception_msgs::msg::SegmentationMask>(
+    "~/out/mask", 1);
+
+  color_mask_pub_ = image_transport::create_publisher(this, "~/out/color_mask");
+  debug_image_pub_ = image_transport::create_publisher(this, "~/out/debug_image");
+
+  if (declare_parameter("build_only", false)) {
+    RCLCPP_INFO(this->get_logger(), "TensorRT engine file was built. Shutting down.");
+    rclcpp::shutdown();
+  }
+}
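The 100 ms timer drives a lazy-subscription pattern: `on_connect` below subscribes to the image topic only while at least one downstream consumer exists, so the node does no inference when nothing listens. A minimal, self-contained sketch of the same pattern (a hypothetical node, not part of this package):

```cpp
#include <rclcpp/rclcpp.hpp>
#include <std_msgs/msg/string.hpp>

using namespace std::chrono_literals;

// Poll subscriber counts on a timer; create the input subscription only while
// the output has consumers, and drop it otherwise.
class LazySubNode : public rclcpp::Node
{
public:
  LazySubNode() : Node("lazy_sub_example")
  {
    pub_ = create_publisher<std_msgs::msg::String>("out", 1);
    timer_ = create_wall_timer(100ms, [this] {
      if (pub_->get_subscription_count() == 0) {
        sub_.reset();  // stop consuming input while nobody listens
      } else if (!sub_) {
        sub_ = create_subscription<std_msgs::msg::String>(
          "in", 1, [this](std_msgs::msg::String::ConstSharedPtr msg) { pub_->publish(*msg); });
      }
    });
  }

private:
  rclcpp::Publisher<std_msgs::msg::String>::SharedPtr pub_;
  rclcpp::Subscription<std_msgs::msg::String>::SharedPtr sub_;
  rclcpp::TimerBase::SharedPtr timer_;
};
```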
+
+void TrtRTMDetNode::on_connect()
+{
+  using std::placeholders::_1;
+  if (
+    debug_image_pub_.getNumSubscribers() == 0 && mask_pub_->get_subscription_count() == 0 &&
+    mask_pub_->get_intra_process_subscription_count() == 0 &&
+    color_mask_pub_.getNumSubscribers() == 0 && objects_pub_->get_subscription_count() == 0 &&
+    objects_pub_->get_intra_process_subscription_count() == 0) {
+    image_sub_.shutdown();
+  } else if (!image_sub_) {
+    image_sub_ = image_transport::create_subscription(
+      this, "~/in/image", std::bind(&TrtRTMDetNode::on_image, this, _1), "raw",
+      rmw_qos_profile_sensor_data);
+  }
+}
+
+void TrtRTMDetNode::on_image(const sensor_msgs::msg::Image::ConstSharedPtr msg)
+{
+  stop_watch_ptr_->toc("processing_time", true);
+
+  cv_bridge::CvImagePtr in_image_ptr;
+  try {
+    in_image_ptr = cv_bridge::toCvCopy(msg, msg->encoding);
+  } catch (cv_bridge::Exception & e) {
+    RCLCPP_ERROR(this->get_logger(), "cv_bridge exception: %s", e.what());
+    return;
+  }
+
+  tensorrt_rtmdet::ObjectArrays objects;
+  cv::Mat mask;
+  std::vector<uint8_t> class_ids;
+  if (!trt_rtmdet_->do_inference({in_image_ptr->image}, objects, mask, class_ids)) {
+    RCLCPP_WARN(this->get_logger(), "Failed to run inference");
+    return;
+  }
+
+  tier4_perception_msgs::msg::DetectedObjectsWithFeature detected_objects_with_feature;
+  for (const auto & object : objects.at(0)) {
+    tier4_perception_msgs::msg::DetectedObjectWithFeature detected_object_with_feature;
+    detected_object_with_feature.feature.roi.width = object.x2 - object.x1;
+    detected_object_with_feature.feature.roi.height = object.y2 - object.y1;
+    detected_object_with_feature.feature.roi.x_offset = object.x1;
+    detected_object_with_feature.feature.roi.y_offset = object.y1;
+    autoware_perception_msgs::msg::ObjectClassification classification;
+    classification.label = color_map_[object.class_id].label_id;
+    classification.probability = object.score;
+    detected_object_with_feature.object.classification =
+      std::vector<autoware_perception_msgs::msg::ObjectClassification>{classification};
+
+    detected_objects_with_feature.feature_objects.push_back(detected_object_with_feature);
+  }
+  detected_objects_with_feature.header = msg->header;
+  objects_pub_->publish(detected_objects_with_feature);
+
+  if (!mask.empty() && !class_ids.empty()) {
+    if (is_apply_erosion_) apply_erosion(mask, 3);
+    sensor_msgs::msg::Image::SharedPtr mask_image =
+      cv_bridge::CvImage(std_msgs::msg::Header(), sensor_msgs::image_encodings::MONO8, mask)
+        .toImageMsg();
+    std::vector<autoware_perception_msgs::msg::ObjectClassification> classification;
+    for (uint8_t class_id : class_ids) {
+      autoware_perception_msgs::msg::ObjectClassification object_classification;
+      object_classification.label = class_id;
+      object_classification.probability = 1.0;
+      classification.push_back(object_classification);
+    }
+    autoware_internal_perception_msgs::msg::SegmentationMask mask_msg;
+    mask_msg.classification = classification;
+    mask_msg.image = *mask_image;
+    mask_msg.header = msg->header;
+    mask_pub_->publish(mask_msg);
+  }
+
+  if (is_publish_color_mask_) {
+    cv::Mat color_mask = cv::Mat::zeros(mask.rows, mask.cols, CV_8UC3);
+    get_colorized_mask(color_map_, mask, color_mask);
+    sensor_msgs::msg::Image::SharedPtr color_mask_msg =
+      cv_bridge::CvImage(std_msgs::msg::Header(), sensor_msgs::image_encodings::BGR8, color_mask)
+        .toImageMsg();
+    color_mask_msg->header = msg->header;
+    color_mask_pub_.publish(color_mask_msg);
+  }
+
+  if (is_publish_debug_image_) {
+    cv::Mat debug_image = in_image_ptr->image.clone();
+    draw_debug_image(debug_image, mask, objects, color_map_);
+    sensor_msgs::msg::Image::SharedPtr debug_image_msg =
+      cv_bridge::CvImage(std_msgs::msg::Header(), sensor_msgs::image_encodings::BGR8, debug_image)
+        .toImageMsg();
+    debug_image_msg->header = msg->header;
+    debug_image_pub_.publish(debug_image_msg);
+  }
+
+  if (debug_publisher_) {
+    const double processing_time_ms = stop_watch_ptr_->toc("processing_time", true);
+    const double cyclic_time_ms = stop_watch_ptr_->toc("cyclic_time", true);
+    const double pipeline_latency_ms =
+      std::chrono::duration<double, std::milli>(
+        std::chrono::nanoseconds((this->get_clock()->now() - msg->header.stamp).nanoseconds()))
+        .count();
+    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
+      "debug/cyclic_time_ms", cyclic_time_ms);
+    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
+      "debug/processing_time_ms", processing_time_ms);
+    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
+      "debug/pipeline_latency_ms", pipeline_latency_ms);
+  }
+}
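A note on consuming the published mask: pixel value 0 is background, and a value k ≥ 1 refers to the k-th entry of the `classification` array attached to the `SegmentationMask` message, because `on_image` pushes one classification per instance in the same order the intensities were assigned. An illustrative decoder (not part of the package):

```cpp
#include <opencv2/core.hpp>

#include <cstdint>
#include <vector>

// Recover the label id of a mask pixel. `labels` mirrors the per-instance
// label ids carried in the SegmentationMask classification array.
int class_of_pixel(
  const cv::Mat & mono8_mask, const std::vector<uint8_t> & labels, int row, int col)
{
  const uint8_t intensity = mono8_mask.at<uint8_t>(row, col);
  if (intensity == 0) {
    return -1;  // background
  }
  return labels.at(intensity - 1);  // label id of the detected instance
}
```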
+
+ColorMap TrtRTMDetNode::read_color_map_file(const std::string & color_map_path)
+{
+  ColorMap color_map;
+
+  if (!std::experimental::filesystem::exists(
+        std::experimental::filesystem::path(color_map_path))) {
+    RCLCPP_ERROR(
+      rclcpp::get_logger("tensorrt_rtmdet_node"), "failed to open %s", color_map_path.c_str());
+    assert(0);
+  }
+
+  std::ifstream file(color_map_path);
+  std::string line;
+
+  // Skip the first line since it is the header
+  std::getline(file, line);
+
+  while (std::getline(file, line)) {
+    if (line.empty()) {
+      continue;
+    }
+
+    auto split_string = [](std::string & str, char delimiter) -> std::vector<std::string> {
+      std::vector<std::string> result;
+      std::stringstream ss(str);
+      std::string item;
+
+      while (std::getline(ss, item, delimiter)) {
+        result.push_back(item);
+      }
+
+      return result;
+    };
+    std::vector<std::string> tokens = split_string(line, ',');
+
+    LabelColor label_color;
+    label_color.class_name = tokens.at(1);
+    label_color.color[0] = std::stoi(tokens.at(2));
+    label_color.color[1] = std::stoi(tokens.at(3));
+    label_color.color[2] = std::stoi(tokens.at(4));
+    label_color.label_id = std::stoi(tokens.at(5));
+    color_map[std::stoi(tokens.at(0))] = label_color;
+  }
+
+  return color_map;
+}
+
+void TrtRTMDetNode::draw_debug_image(
+  cv::Mat & image, [[maybe_unused]] const cv::Mat & mask,
+  const tensorrt_rtmdet::ObjectArrays & objects, const tensorrt_rtmdet::ColorMap & color_map)
+{
+  // TODO(StepTurtle): add mask to debug image
+
+  for (const auto & object : objects.at(0)) {
+    // Draw the bounding box
+    cv::rectangle(
+      image, cv::Point(static_cast<int>(object.x1), static_cast<int>(object.y1)),
+      cv::Point(static_cast<int>(object.x2), static_cast<int>(object.y2)),
+      color_map.at(static_cast<uint8_t>(object.class_id)).color, 2);
+    // Write the class name
+    cv::putText(
+      image, color_map.at(static_cast<uint8_t>(object.class_id)).class_name,
+      cv::Point(static_cast<int>(object.x1), static_cast<int>(object.y1)),
+      cv::FONT_HERSHEY_SIMPLEX, 1, color_map.at(static_cast<uint8_t>(object.class_id)).color, 2);
+  }
+}
+
+void TrtRTMDetNode::get_colorized_mask(
+  const ColorMap & color_map, const cv::Mat & mask, cv::Mat & color_mask)
+{
+  int width = mask.cols;
+  int height = mask.rows;
+  if ((color_mask.cols != mask.cols) || (color_mask.rows != mask.rows)) {
+    throw std::runtime_error("Input and output images have different sizes.");
+  }
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      unsigned char id = mask.at<unsigned char>(y, x);
+      color_mask.at<cv::Vec3b>(y, x) = color_map.at(id).color;
+    }
+  }
+}
+
+void TrtRTMDetNode::apply_erosion(cv::Mat & mask, const int erosion_size)
+{
+  cv::Mat element = cv::getStructuringElement(
+    cv::MORPH_RECT, cv::Size(2 * erosion_size + 1, 2 * erosion_size + 1),
+    cv::Point(erosion_size, erosion_size));
+  cv::erode(mask, mask, element);
+}
+}  // namespace autoware::tensorrt_rtmdet
+
+#include "rclcpp_components/register_node_macro.hpp"
+RCLCPP_COMPONENTS_REGISTER_NODE(autoware::tensorrt_rtmdet::TrtRTMDetNode)
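The `label` column in the color map CSV below holds the Autoware `ObjectClassification` constant that each model (COCO) class index is mapped to, which is why the sample rows end in 7, 6, and 1. A small compile-time check of that correspondence (illustrative, not part of the package):

```cpp
#include <autoware_perception_msgs/msg/object_classification.hpp>

// The three sample rows of test_color_map.csv line up with the message enum:
// person -> PEDESTRIAN, bicycle -> BICYCLE, car -> CAR.
using autoware_perception_msgs::msg::ObjectClassification;
static_assert(ObjectClassification::CAR == 1);
static_assert(ObjectClassification::BICYCLE == 6);
static_assert(ObjectClassification::PEDESTRIAN == 7);
```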
diff --git a/perception/autoware_tensorrt_rtmdet/test/data/test_color_map.csv b/perception/autoware_tensorrt_rtmdet/test/data/test_color_map.csv
new file mode 100644
index 0000000000000..0fb9eaa709f1a
--- /dev/null
+++ b/perception/autoware_tensorrt_rtmdet/test/data/test_color_map.csv
@@ -0,0 +1,4 @@
+id,name,r,g,b,label
+0,person,220,20,60,7
+1,bicycle,0,0,142,6
+2,car,0,0,255,1
diff --git a/perception/autoware_tensorrt_rtmdet/test/data/test_image.jpg b/perception/autoware_tensorrt_rtmdet/test/data/test_image.jpg
new file mode 100644
index 0000000000000..c2f3cba777d43
Binary files /dev/null and b/perception/autoware_tensorrt_rtmdet/test/data/test_image.jpg differ
diff --git a/perception/autoware_tensorrt_rtmdet/test/test_rtmdet.cpp b/perception/autoware_tensorrt_rtmdet/test/test_rtmdet.cpp
new file mode 100644
index 0000000000000..aaa45ca7fb7e0
--- /dev/null
+++ b/perception/autoware_tensorrt_rtmdet/test/test_rtmdet.cpp
@@ -0,0 +1,237 @@
+// Copyright 2024 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// cspell: ignore libtrt
+#include "ament_index_cpp/get_package_prefix.hpp"
+#include "ament_index_cpp/get_package_share_directory.hpp"
+#include "autoware/tensorrt_rtmdet/tensorrt_rtmdet.hpp"
+#include "autoware/tensorrt_rtmdet/tensorrt_rtmdet_node.hpp"
+
+#include <NvInfer.h>
+#include <gtest/gtest.h>
+#include <opencv2/opencv.hpp>
+#include <rclcpp/rclcpp.hpp>
+
+#include <dlfcn.h>
+#include <cstdint>
+
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+template <typename T>
+T get_pixel_value(const cv::Mat & image, int x, int y)
+{
+  if (x < 0 || x >= image.cols || y < 0 || y >= image.rows) {
+    throw std::out_of_range("Coordinates are out of image bounds");
+  }
+
+  if (image.channels() == 1) {
+    return image.at<T>(y, x);
+  }
+  return image.at<T>(y, x);
+}
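An illustrative use of the helper above on the MONO8 instance mask returned by `do_inference` (not part of the test file; a zero value means background):

```cpp
#include <opencv2/core.hpp>

// Query the instance mask with the bounds-checked helper; the caller picks
// the template type to match the image depth (unsigned char for MONO8).
bool is_foreground(const cv::Mat & mono8_mask, int x, int y)
{
  return get_pixel_value<unsigned char>(mono8_mask, x, y) != 0;
}
```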
+
+class TrtRTMDetTest : public ::testing::Test
+{
+protected:
+  void SetUp() override
+  {
+    const auto home_path = std::string(getenv("HOME"));
+    const auto autoware_data_path = home_path + "/autoware_data";
+
+    const auto rtmdet_share_dir =
+      ament_index_cpp::get_package_share_directory("autoware_tensorrt_rtmdet");
+    const auto plugin_dir = ament_index_cpp::get_package_prefix("trt_batched_nms");
+
+    const auto test_image_path = rtmdet_share_dir + "/data/test_image.jpg";
+    const auto test_color_map_path = rtmdet_share_dir + "/data/test_color_map.csv";
+    const auto plugin_path = plugin_dir + "/lib/libtrt_batched_nms_plugin.so";
+    const auto model_path = autoware_data_path + "/tensorrt_rtmdet/rtmdet_x.onnx";
+
+    const double norm_factor = 1.0;
+    const std::string cache_dir;
+    const autoware::tensorrt_common::BatchConfig batch_config{1, 1, 1};
+    const size_t max_workspace_size = (1u << 30u);
+    const std::vector<std::string> plugin_paths{plugin_path};
+
+    test_image_ = cv::imread(test_image_path, cv::IMREAD_COLOR);
+
+    autoware::tensorrt_common::BuildConfig build_config("Entropy", -1, false, false, false, 6.0);
+
+    // The test color map only includes 3 classes, which is enough for the test.
+    color_map_ =
+      autoware::tensorrt_rtmdet::TrtRTMDetNode::read_color_map_file(test_color_map_path);
+
+    trt_rtmdet_ = std::make_unique<autoware::tensorrt_rtmdet::TrtRTMDet>(
+      model_path, precision_, color_map_, score_threshold_, nms_threshold_, mask_threshold_,
+      build_config, "", norm_factor, mean_, std_, cache_dir, batch_config, max_workspace_size,
+      plugin_paths);
+  }
+
+public:
+  /**
+   * @brief Wrapper function to test the autoware::tensorrt_rtmdet::TrtRTMDet::preprocess_gpu
+   * function.
+   *
+   * @param input_images The input images.
+   */
+  void preprocess_gpu(const std::vector<cv::Mat> & input_images) const
+  {
+    trt_rtmdet_->preprocess_gpu(input_images);
+  }
+
+  /**
+   * @brief Wrapper function to get the image buffer on the host.
+   *
+   * @return The image buffer on the host.
+   */
+  [[nodiscard]] const autoware::cuda_utils::CudaUniquePtrHost<unsigned char[]> &
+  get_image_buffer_h() const
+  {
+    return trt_rtmdet_->image_buf_h_;
+  }
+
+  /**
+   * @brief Wrapper function to get the image buffer on the device.
+   *
+   * @return The image buffer on the device.
+   */
+  [[nodiscard]] const autoware::cuda_utils::CudaUniquePtr<unsigned char[]> & get_image_buffer_d()
+    const
+  {
+    return trt_rtmdet_->image_buf_d_;
+  }
+
+  /**
+   * @brief Wrapper function to get the input dimensions.
+   *
+   * @return The input dimensions.
+   */
+  [[nodiscard]] nvinfer1::Dims get_input_dimensions() const
+  {
+    return trt_rtmdet_->trt_common_->getBindingDimensions(0);
+  }
+
+  std::unique_ptr<autoware::tensorrt_rtmdet::TrtRTMDet> trt_rtmdet_;
+
+  // Common parameters to initialize the RTMDet model
+  const std::vector<float> mean_{103.53, 116.28, 123.675};
+  const std::vector<float> std_{57.375, 57.12, 58.395};
+  const float score_threshold_{0.3};
+  const float nms_threshold_{0.3};
+  const float mask_threshold_{0.7};
+  const std::string precision_{"fp16"};
+
+  cv::Mat test_image_;
+
+  autoware::tensorrt_rtmdet::ColorMap color_map_;
+};
+
+TEST_F(TrtRTMDetTest, TestPreprocessGPU)
+{
+  const auto input_image = test_image_.clone();
+  const auto input_images = std::vector<cv::Mat>{input_image};
+
+  ASSERT_NO_THROW(preprocess_gpu(input_images));
+
+  // Validate CUDA memory setup by checking pointers.
+  EXPECT_NE(get_image_buffer_h(), nullptr);
+  EXPECT_NE(get_image_buffer_d(), nullptr);
+}
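The pixel-exact bounding-box assertions in `TestFeedForward` below are tied to one specific test image, model, and engine; fp16 engines built on different GPUs can shift boxes by a pixel or two. If the test proves flaky, a tolerance-based variant is a common alternative. A sketch of such a check, meant to stand in for the exact-match lines inside the test body (the tolerance value is an assumption, not from the source):

```cpp
// Tolerance-based variant of the bounding-box checks; the expected values
// mirror the exact-match assertions in TestFeedForward.
constexpr double kPixelTolerance = 2.0;
EXPECT_NEAR(object_arrays.at(0).at(0).x1, 138, kPixelTolerance);
EXPECT_NEAR(object_arrays.at(0).at(0).y1, 201, kPixelTolerance);
```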
+
+TEST_F(TrtRTMDetTest, TestFeedForward)
+{
+  const auto input_image = test_image_.clone();
+  const auto input_images = std::vector<cv::Mat>{input_image};
+  preprocess_gpu(input_images);
+
+  autoware::tensorrt_rtmdet::ObjectArrays object_arrays;
+  cv::Mat mask;
+  std::vector<uint8_t> class_ids;
+
+  const auto success = trt_rtmdet_->do_inference(input_images, object_arrays, mask, class_ids);
+  const auto object_size = object_arrays[0].size();
+
+  // Check if the inference was successful
+  EXPECT_TRUE(success);
+
+  // Check if the object count is correct
+  EXPECT_EQ(object_size, 1);
+
+  // Check if the output object's class id is correct (the class id is derived from the COCO
+  // dataset.)
+  EXPECT_EQ(object_arrays.at(0).at(0).class_id, 2);
+
+  // Check if the output object's position is correct
+  EXPECT_EQ(object_arrays.at(0).at(0).x1, 138);
+  EXPECT_EQ(object_arrays.at(0).at(0).x2, 1026);
+  EXPECT_EQ(object_arrays.at(0).at(0).y1, 201);
+  EXPECT_EQ(object_arrays.at(0).at(0).y2, 509);
+
+  // Check if the output label id is correct (label_id is the id used to process the
+  // outputs in Autoware.)
+  EXPECT_EQ(class_ids.at(0), 1);
+
+  // Check that a mask pixel inside the object is not zero (zero represents the
+  // background). There is a car in the test image, so the mask pixel value should not be zero.
+  const auto mask_pixel_value = get_pixel_value<unsigned char>(mask, 320, 413);
+  EXPECT_NE(mask_pixel_value, 0);
+}
+
+TEST_F(TrtRTMDetTest, TestIntersectionOverUnion)
+{
+  const autoware::tensorrt_rtmdet::Object obj1 = {10, 10, 50, 50, 1, 0, 0.8};
+  const autoware::tensorrt_rtmdet::Object obj2 = {20, 20, 60, 60, 1, 0, 0.7};
+
+  float iou = autoware::tensorrt_rtmdet::TrtRTMDet::intersection_over_union(obj1, obj2);
+  EXPECT_NEAR(iou, 0.4, 0.01);
+}
+
+TEST_F(TrtRTMDetTest, TestNmsSortedBboxes)
+{
+  const autoware::tensorrt_rtmdet::Object obj1 = {10, 10, 50, 50, 0, 0, 0.8};
+  const autoware::tensorrt_rtmdet::Object obj2 = {10, 10, 50, 50, 1, 0, 0.3};
+  const autoware::tensorrt_rtmdet::Object obj3 = {12, 12, 52, 52, 1, 0, 0.6};
+  const autoware::tensorrt_rtmdet::Object obj4 = {8, 8, 48, 48, 1, 0, 0.5};
+
+  std::vector<autoware::tensorrt_rtmdet::Object> input_objects = {obj1, obj2, obj3, obj4};
+  std::vector<autoware::tensorrt_rtmdet::Object> output_objects;
+  const float nms_threshold = 0.5;
+
+  autoware::tensorrt_rtmdet::TrtRTMDet::nms_sorted_bboxes(
+    input_objects, output_objects, nms_threshold);
+
+  // Check if the number of objects is correct after NMS
+  EXPECT_EQ(output_objects.size(), 1);
+
+  // After NMS, only `obj1` should be kept.
+  EXPECT_EQ(output_objects.at(0).x1, 10);
+  EXPECT_EQ(output_objects.at(0).x2, 50);
+  EXPECT_EQ(output_objects.at(0).y1, 10);
+  EXPECT_EQ(output_objects.at(0).y2, 50);
+  EXPECT_EQ(output_objects.at(0).class_id, 0);
+  EXPECT_EQ(output_objects.at(0).score, static_cast<float>(0.8));
+}
+
+TEST_F(TrtRTMDetTest, TestPluginLoad)
+{
+  const auto plugin_dir = ament_index_cpp::get_package_prefix("trt_batched_nms");
+  const auto plugin_path = plugin_dir + "/lib/libtrt_batched_nms_plugin.so";
+
+  int32_t flags{RTLD_LAZY};
+  void * handle = dlopen(plugin_path.c_str(), flags);
+
+  EXPECT_NE(handle, nullptr);
+  if (handle != nullptr) {
+    dlclose(handle);  // release the handle so the test does not leak it
+  }
+}
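To see why `TestNmsSortedBboxes` keeps exactly one box: obj2 duplicates obj1 (IoU = 1.0); for obj3, intersection = (50 − 12 + 1)² = 39² = 1521 and union = 2 · 41² − 1521 = 1841, so IoU ≈ 0.83; obj4 gives the same value by symmetry. All three exceed the 0.5 threshold and are suppressed by the first box. Note also that `nms_sorted_bboxes` assumes score-sorted input, and this list is not strictly sorted (0.8, 0.3, 0.6, 0.5); the result is unaffected here only because obj1 suppresses everything else.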