NVIDIA-AI-IOT · xuanlinli17 · Mar 1, 2024 · Mar 6, 2024 · Mar 7, 2024 · Mar 7, 2024
diff --git a/examples/owl_predict.py b/examples/owl_predict.py
@@ -30,10 +30,12 @@
 
     parser = argparse.ArgumentParser()
     parser.add_argument("--image", type=str, default="../assets/owl_glove_small.jpg")
-    parser.add_argument("--prompt", type=str, default="an owl, a glove")
+    parser.add_argument("--prompt", type=str, default="[an owl, a glove]")
     parser.add_argument("--threshold", type=str, default="0.1,0.1")
+    parser.add_argument("--nms_threshold", type=float, default=0.3)
     parser.add_argument("--output", type=str, default="../data/owl_predict_out.jpg")
     parser.add_argument("--model", type=str, default="google/owlvit-base-patch32")
+    parser.add_argument('--no_roi_align', action='store_true')
     parser.add_argument("--image_encoder_engine", type=str, default="../data/owl_image_encoder_patch32.engine")
     parser.add_argument("--profile", action="store_true")
     parser.add_argument("--num_profiling_runs", type=int, default=30)
@@ -45,13 +47,17 @@
 
     thresholds = args.threshold.strip("][()")
     thresholds = thresholds.split(',')
-    thresholds = [float(x) for x in thresholds]
+    if len(thresholds) == 1:
+        thresholds = float(thresholds[0])
+    else:
+        thresholds = [float(x) for x in thresholds]
     print(thresholds)
 
 
     predictor = OwlPredictor(
         args.model,
-        image_encoder_engine=args.image_encoder_engine
+        image_encoder_engine=args.image_encoder_engine,
+        no_roi_align=args.no_roi_align
     )
 
     image = PIL.Image.open(args.image)
@@ -63,6 +69,7 @@
         text=text, 
         text_encodings=text_encodings,
         threshold=thresholds,
+        nms_threshold=args.nms_threshold,
         pad_square=False
     )
 
@@ -75,6 +82,7 @@
                 text=text, 
                 text_encodings=text_encodings,
                 threshold=thresholds,
+                nms_threshold=args.nms_threshold,
                 pad_square=False
             )
         torch.cuda.current_stream().synchronize()

diff --git a/examples/tree_predict.py b/examples/tree_predict.py
@@ -33,13 +33,15 @@
     parser.add_argument("--threshold", type=float, default=0.1)
     parser.add_argument("--output", type=str, default="../data/tree_predict_out.jpg")
     parser.add_argument("--model", type=str, default="google/owlvit-base-patch32")
+    parser.add_argument('--no_roi_align', action='store_true')
     parser.add_argument("--image_encoder_engine", type=str, default="../data/owl_image_encoder_patch32.engine")
     args = parser.parse_args()
 
     predictor = TreePredictor(
         owl_predictor=OwlPredictor(
             args.model,
-            image_encoder_engine=args.image_encoder_engine
+            image_encoder_engine=args.image_encoder_engine,
+            no_roi_align=args.no_roi_align
         )
     )
 

diff --git a/nanoowl/image_preprocessor.py b/nanoowl/image_preprocessor.py
@@ -15,9 +15,10 @@
 
 
 import torch
+import torchvision
 import PIL.Image
 import numpy as np
-from typing import Tuple
+from typing import Tuple, Optional, Union
 
 
 __all__ = [
@@ -44,7 +45,10 @@
 class ImagePreprocessor(torch.nn.Module):
     def __init__(self,
             mean: Tuple[float, float, float] = DEFAULT_IMAGE_PREPROCESSOR_MEAN,
-            std: Tuple[float, float, float] = DEFAULT_IMAGE_PREPROCESSOR_STD
+            std: Tuple[float, float, float] = DEFAULT_IMAGE_PREPROCESSOR_STD,
+            resize: Optional[Union[int, Tuple[int, int]]] = None,
+            resize_by_pad: bool = False,
+            padding_value: Optional[float] = 127.5,
         ):
         super().__init__()
 
@@ -57,8 +61,47 @@ def __init__(self,
             torch.tensor(std)[None, :, None, None]
         )
 
-    def forward(self, image: torch.Tensor, inplace: bool = False):
+        if resize is not None and isinstance(resize, int):
+            resize = (resize, resize)
+        self.resize = resize
+        self.resize_by_pad = resize_by_pad
+        self.padding_value = padding_value
+        if (resize is not None) and (not resize_by_pad):
+            self.resizer = torchvision.transforms.Resize(
+                resize, 
+                interpolation=torchvision.transforms.InterpolationMode.BICUBIC
+            )
+        else:
+            self.resizer = None
 
+    def forward(self, image: torch.Tensor, inplace: bool = False):
+
+        if self.resize:
+            if self.resizer is not None:
+                image = self.resizer(image)
+            if self.resize_by_pad:
+                if image.size(-1) <= self.resize[-1] and image.size(-2) <= self.resize[-2]:
+                    image = torch.nn.functional.pad(
+                        image, 
+                        [0, self.resize[-1] - image.size(-1), 0, self.resize[-2] - image.size(-2)],
+                        "constant",
+                        self.padding_value
+                    )
+                else:
+                    downsample_factor = max(image.size(-2) / self.resize[-2], image.size(-1) / self.resize[-1])
+                    target_size = (round(image.size(-2) / downsample_factor), round(image.size(-1) / downsample_factor))
+                    image = torchvision.transforms.functional.resize(
+                        image,
+                        target_size,
+                        interpolation=torchvision.transforms.InterpolationMode.BILINEAR
+                    )
+                    image = torch.nn.functional.pad(
+                        image, 
+                        [0, self.resize[-1] - image.size(-1), 0, self.resize[-2] - image.size(-2)],
+                        "constant",
+                        self.padding_value
+                    )
+
         if inplace:
             image = image.sub_(self.mean).div_(self.std)
         else:
@@ -67,9 +110,25 @@ def forward(self, image: torch.Tensor, inplace: bool = False):
         return image
 
     @torch.no_grad()
-    def preprocess_pil_image(self, image: PIL.Image.Image):
-        image = torch.from_numpy(np.asarray(image))
+    def preprocess_numpy_array(self, image: np.ndarray):
+        """Normalize an image array and return it as a batched tensor.
+
+        The array's last axis is moved to the front (``permute(2, 0, 1)``)
+        and a leading batch axis is added, so a 3-D input is expected,
+        presumably height x width x channels. The tensor is moved to the
+        device and dtype of ``self.mean`` before ``forward`` is applied.
+        """
+        # torch.tensor() always copies. torch.from_numpy() would share
+        # memory with ``image``, so for an array that already matches the
+        # target device and dtype the in-place normalization requested
+        # below could overwrite the caller's data.
+        image = torch.tensor(image)
         image = image.permute(2, 0, 1)[None, ...]
         image = image.to(self.mean.device)
         image = image.type(self.mean.dtype)
-        return self.forward(image, inplace=True)
+        return self.forward(image, inplace=True)
+
+    @torch.no_grad()
+    def preprocess_pil_image(self, image: PIL.Image.Image):
+        """Preprocess a PIL image via ``preprocess_numpy_array``."""
+        return self.preprocess_numpy_array(np.asarray(image))
diff --git a/nanoowl/owl_drawing.py b/nanoowl/owl_drawing.py
@@ -36,7 +36,7 @@ def get_colors(count: int):
 def draw_owl_output(image, output: OwlDecodeOutput, text: List[str], draw_text=True):
     is_pil = not isinstance(image, np.ndarray)
     if is_pil:
-        image = np.asarray(image)
+        image = np.array(image)
     font = cv2.FONT_HERSHEY_SIMPLEX
     font_scale = 0.75
     colors = get_colors(len(text))
@@ -58,7 +58,7 @@ def draw_owl_output(image, output: OwlDecodeOutput, text: List[str], draw_text=T
         if draw_text:
             offset_y = 12
             offset_x = 0
-            label_text = text[label_index]
+            label_text = text[label_index] + ' ' + f'{output.scores[i]:.2f}'
             cv2.putText(
                 image,
                 label_text,
@@ -71,4 +71,4 @@ def draw_owl_output(image, output: OwlDecodeOutput, text: List[str], draw_text=T
             )
     if is_pil:
         image = PIL.Image.fromarray(image)
-    return image
+    return image