
Commit

dav2: Get rid of vis topic
marcojob committed Nov 12, 2024
1 parent 2d5a149 commit 063990b
Showing 4 changed files with 43 additions and 79 deletions.
103 changes: 35 additions & 68 deletions include/usb_cam/learning/depth_anything_v2.hpp
@@ -9,110 +9,77 @@

 class DepthAnythingV2 : public LearningInterface {
 public:
-    DepthAnythingV2(ros::NodeHandle* nh, std::string& model_path, std::string& metric_topic, std::string& vis_topic) {
-        _INPUT_SIZE = cv::Size(_HEIGHT, _WIDTH);
+    DepthAnythingV2(ros::NodeHandle* nh, std::string& model_path, std::string& metric_topic) {
         _model_path = model_path;
         _load_model();

-        if (nh != nullptr) {
-            if (!metric_topic.empty()) {
-                _depth_metric_pub = nh->advertise<sensor_msgs::Image>(metric_topic, 1);
-            }
-            if (!vis_topic.empty()) {
-                _depth_vis_pub = nh->advertise<sensor_msgs::Image>(vis_topic, 1);
-            }
+        if (nh != nullptr && !metric_topic.empty()) {
+            _depth_pub = nh->advertise<sensor_msgs::Image>(metric_topic, 1);
         }
     }

     void set_input(sensor_msgs::Image& msg) override {
+        // Keep track of input image size, we want to get the same output image dimensions
+        _output_image_w = msg.width;
+        _output_image_h = msg.height;
+
         cv_bridge::CvImagePtr cv_ptr = cv_bridge::toCvCopy(msg, sensor_msgs::image_encodings::RGB8);
         cv::Mat image = cv_ptr->image;

-        // Change size to 518x518 (still uint8)
         cv::Mat resized_image;
-        cv::resize(image, resized_image, _INPUT_SIZE);
+        cv::resize(image, resized_image, cv::Size(_input_w, _input_h));

         std::vector<float> input = _preprocess(resized_image);
         memcpy(_input_data, input.data(), _input_size_float);
     }

     void publish() override {
-        cv::Mat depth_prediction(_HEIGHT, _WIDTH, CV_32FC1, _output_data);
-        if (_depth_metric_pub.getTopic() != "") {
+        // Raw depth prediction (in meters, CV_32FC1)
+        if (_depth_pub.getTopic() != "") {
+            cv::Mat depth_prediction(_input_w, _input_h, CV_32FC1, _output_data);
+            cv::Mat depth_resized;
+            cv::resize(depth_prediction, depth_resized, cv::Size(_output_image_w, _output_image_h));
             cv_bridge::CvImage depth_image;
             depth_image.header.stamp = ros::Time::now();
             depth_image.header.frame_id = "depth_frame";
             depth_image.encoding = sensor_msgs::image_encodings::TYPE_32FC1;
-            depth_image.image = depth_prediction;
-            _depth_metric_pub.publish(depth_image.toImageMsg());
-        }
-
-        if (_depth_vis_pub.getTopic() != "") {
-            cv::normalize(depth_prediction, depth_prediction, 0, 255, cv::NORM_MINMAX, CV_8UC1);
-            cv::Mat colormap;
-            cv::applyColorMap(depth_prediction, colormap, cv::COLORMAP_INFERNO);
-            cv::resize(colormap, colormap, cv::Size(_WIDTH, _HEIGHT));
-
-            cv_bridge::CvImage visualized_image;
-            visualized_image.header.stamp = ros::Time::now();
-            visualized_image.header.frame_id = "depth_frame";
-            visualized_image.encoding = sensor_msgs::image_encodings::MONO8;
-            visualized_image.image = colormap;
-            _depth_vis_pub.publish(visualized_image.toImageMsg());
+            depth_image.image = depth_resized;
+            _depth_pub.publish(depth_image.toImageMsg());
         }
     }

 private:
-    const size_t _HEIGHT = 518;
-    const size_t _WIDTH = 518;
     // TODO: Not so nice magic numbers from the paper implementation
     const float mean[3] = { 123.675f, 116.28f, 103.53f };
     const float std[3] = { 58.395f, 57.12f, 57.375f };
-    cv::Size _INPUT_SIZE;
-    ros::Publisher _depth_metric_pub;
-    ros::Publisher _depth_vis_pub;
+    ros::Publisher _depth_pub;
+    uint32_t _output_image_w, _output_image_h;

     std::vector<float> _preprocess(cv::Mat& image) {
-        std::tuple<cv::Mat, int, int> resized = _resize_depth(image, _input_w, _input_h);
-        cv::Mat resized_image = std::get<0>(resized);
+        // Determine the resized dimensions
+        const int iw = image.cols;
+        const int ih = image.rows;
+        const float aspect_ratio = static_cast<float>(iw) / ih;
+        const int nw = (aspect_ratio >= 1) ? _input_w : static_cast<int>(_input_w * aspect_ratio);
+        const int nh = (aspect_ratio >= 1) ? static_cast<int>(_input_h / aspect_ratio) : _input_h;
+
+        // Resize image and pad if necessary
+        cv::Mat resized_image;
+        cv::resize(image, resized_image, cv::Size(nw, nh), 0, 0, cv::INTER_LINEAR);
+        cv::Mat padded_image = cv::Mat::ones(cv::Size(_input_w, _input_h), CV_8UC3) * 128;
+        resized_image.copyTo(padded_image(cv::Rect((_input_w - nw) / 2, (_input_h - nh) / 2, nw, nh)));
+
+        // Normalize and flatten the image to a 1D tensor
         std::vector<float> input_tensor;
         for (int k = 0; k < 3; k++) {
-            for (int i = 0; i < resized_image.rows; i++) {
-                for (int j = 0; j < resized_image.cols; j++) {
-                    input_tensor.emplace_back(((float)resized_image.at<cv::Vec3b>(i, j)[k] - mean[k]) / std[k]);
+            for (int i = 0; i < padded_image.rows; i++) {
+                for (int j = 0; j < padded_image.cols; j++) {
+                    input_tensor.emplace_back((padded_image.at<cv::Vec3b>(i, j)[k] - mean[k]) / std[k]);
                 }
             }
         }
-        return input_tensor;
-    }
-
-    std::tuple<cv::Mat, int, int> _resize_depth(cv::Mat& img, int w, int h) {
-        cv::Mat result;
-        int nw, nh;
-        int ih = img.rows;
-        int iw = img.cols;
-        float aspectRatio = (float)img.cols / (float)img.rows;
-
-        if (aspectRatio >= 1) {
-            nw = w;
-            nh = int(h / aspectRatio);
-        } else {
-            nw = int(w * aspectRatio);
-            nh = h;
-        }
-        cv::resize(img, img, cv::Size(nw, nh));
-        result = cv::Mat::ones(cv::Size(w, h), CV_8UC1) * 128;
-        cv::cvtColor(result, result, cv::COLOR_GRAY2RGB);
-        cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
-
-        cv::Mat re(h, w, CV_8UC3);
-        cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
-        cv::Mat out(h, w, CV_8UC3, 0.0);
-        re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)));
-
-        std::tuple<cv::Mat, int, int> res_tuple = std::make_tuple(out, (w - nw) / 2, (h - nh) / 2);
-        return res_tuple;
+        return input_tensor;
     }
 };
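For reference, a minimal standalone sketch of the letterbox preprocessing that the reworked _preprocess() above performs: aspect-ratio preserving resize, centered padding to the 518x518 network input, then channel-wise mean/std normalization into a flat CHW tensor. Function and variable names below are illustrative, not part of the repository, and the padding here fills all three channels with 128 via cv::Scalar instead of the cv::Mat::ones(...) * 128 expression used in the diff.

// Standalone sketch of the letterbox + normalization step (illustrative only).
#include <opencv2/opencv.hpp>
#include <vector>
#include <cstdio>

std::vector<float> letterbox_and_normalize(const cv::Mat& image, int input_w, int input_h) {
    // Mean/std constants taken from the diff above (RGB order).
    const float mean[3]   = { 123.675f, 116.28f, 103.53f };
    const float stddev[3] = { 58.395f, 57.12f, 57.375f };

    // Aspect-ratio preserving target size, e.g. 640x480 -> 518x388 for a 518x518 input.
    const float aspect_ratio = static_cast<float>(image.cols) / image.rows;
    const int nw = (aspect_ratio >= 1) ? input_w : static_cast<int>(input_w * aspect_ratio);
    const int nh = (aspect_ratio >= 1) ? static_cast<int>(input_h / aspect_ratio) : input_h;

    // Resize, then paste centered into a gray canvas: for the 640x480 example
    // this leaves (518 - 388) / 2 = 65 px of padding above and below.
    cv::Mat resized;
    cv::resize(image, resized, cv::Size(nw, nh), 0, 0, cv::INTER_LINEAR);
    cv::Mat padded(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    resized.copyTo(padded(cv::Rect((input_w - nw) / 2, (input_h - nh) / 2, nw, nh)));

    // Normalize channel-wise and flatten to CHW order.
    std::vector<float> tensor;
    tensor.reserve(3 * input_w * input_h);
    for (int k = 0; k < 3; k++)
        for (int i = 0; i < padded.rows; i++)
            for (int j = 0; j < padded.cols; j++)
                tensor.push_back((padded.at<cv::Vec3b>(i, j)[k] - mean[k]) / stddev[k]);
    return tensor;
}

int main() {
    cv::Mat dummy(480, 640, CV_8UC3, cv::Scalar(90, 120, 150));
    std::vector<float> input = letterbox_and_normalize(dummy, 518, 518);
    std::printf("tensor size: %zu (expect 3*518*518 = %d)\n", input.size(), 3 * 518 * 518);
    return 0;
}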

3 changes: 1 addition & 2 deletions include/usb_cam/usb_cam.hpp
@@ -120,8 +120,7 @@ typedef struct
   std::string pixel_format_name;
   std::string av_device_format;
   std::string dav2_file;
-  std::string dav2_metric_topic;
-  std::string dav2_vis_topic;
+  std::string dav2_topic;
   int image_width;
   int image_height;
   int framerate;
9 changes: 4 additions & 5 deletions launch/two_mipi_rgb.launch
@@ -19,8 +19,7 @@
     <param name="wb_red_gain" value="1.18"/>

     <param name="dav2_file" value="/home/asl/Workspace/src/v4l2_camera/test/resources/depth_anything_v2_vits.onnx" />
-    <param name="dav2_metric_topic" value="/depth0/relative" />
-    <param name="dav2_vis_topic" value="/depth0/vis" />
+    <param name="dav2_topic" value="/cam0/depth/relative" />
   </node>

   <node name="cam1" pkg="usb_cam" type="usb_cam_node" output="screen" >
@@ -40,8 +39,8 @@
     <param name="wb_green_gain" value="0.88"/>
     <param name="wb_red_gain" value="1.18"/>

-    <param name="dav2_file" value="" />
-    <param name="dav2_metric_topic" value="/depth1/relative" />
-    <param name="dav2_vis_topic" value="/depth1/vis" />
+    <param name="dav2_file" value="/home/asl/Workspace/src/v4l2_camera/test/resources/depth_anything_v2_vits.onnx" />
+    <!-- Downfacing cam needs no depth prediction (yet) -->
+    <param name="dav2_topic" value="" />
   </node>
 </launch>
7 changes: 3 additions & 4 deletions src/usb_cam_node.cpp
@@ -125,11 +125,10 @@ class UsbCamNode {

         // Setup the network that outputs derivates of the image captured
         m_node.param("dav2_file", m_parameters.dav2_file, std::string(""));
-        m_node.param("dav2_metric_topic", m_parameters.dav2_metric_topic, std::string(""));
-        m_node.param("dav2_vis_topic", m_parameters.dav2_vis_topic, std::string(""));
+        m_node.param("dav2_topic", m_parameters.dav2_topic, std::string(""));

-        if (!m_parameters.dav2_file.empty() && !m_parameters.dav2_metric_topic.empty()) {
-            networks.push_back(std::make_unique<DepthAnythingV2>(&m_node, m_parameters.dav2_file, m_parameters.dav2_metric_topic, m_parameters.dav2_vis_topic));
+        if (!m_parameters.dav2_file.empty() && !m_parameters.dav2_topic.empty()) {
+            networks.push_back(std::make_unique<DepthAnythingV2>(&m_node, m_parameters.dav2_file, m_parameters.dav2_topic));
         }

         ROS_INFO("Starting '%s' (%s) at %dx%d via %s (%s) at %i FPS",
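After this commit each camera node advertises at most one depth topic, published as sensor_msgs/Image with encoding 32FC1 and resized back to the camera resolution, and only when both dav2_file and dav2_topic are non-empty. A hypothetical consumer of that topic might look like the sketch below; the node name, callback, and package wiring are illustrative, and the topic string matches the cam0 value set in the launch file above.

// Illustrative subscriber for the remaining relative-depth topic (not part of the repo).
#include <ros/ros.h>
#include <sensor_msgs/Image.h>
#include <sensor_msgs/image_encodings.h>
#include <cv_bridge/cv_bridge.h>
#include <opencv2/core.hpp>

void depthCallback(const sensor_msgs::ImageConstPtr& msg) {
    // toCvShare avoids a copy since the data is only read here.
    cv_bridge::CvImageConstPtr cv_ptr =
        cv_bridge::toCvShare(msg, sensor_msgs::image_encodings::TYPE_32FC1);
    double min_val, max_val;
    cv::minMaxLoc(cv_ptr->image, &min_val, &max_val);
    ROS_INFO("depth %ux%u, range [%.3f, %.3f]", msg->width, msg->height, min_val, max_val);
}

int main(int argc, char** argv) {
    ros::init(argc, argv, "depth_listener");
    ros::NodeHandle nh;
    ros::Subscriber sub = nh.subscribe("/cam0/depth/relative", 1, depthCallback);
    ros::spin();
    return 0;
}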
