@@ -454,23 +454,73 @@ def build_image_encoder_engine(self,
 
         return self.load_image_encoder_engine(engine_path, max_batch_size)
 
+
     def predict(self,
-            image: PIL.Image,
-            text: List[str],
-            text_encodings: Optional[OwlEncodeTextOutput],
-            threshold: Union[int, float, List[Union[int, float]]] = 0.1,
-            pad_square: bool = True,
-
-        ) -> OwlDecodeOutput:
+            image: Union[PIL.Image.Image, torch.Tensor],
+            text: List[str],
+            text_encodings: Optional[OwlEncodeTextOutput],
+            threshold: Union[int, float, List[Union[int, float]]] = 0.1,
+            pad_square: bool = True,
+        ) -> OwlDecodeOutput:
+
+        if isinstance(image, PIL.Image.Image):
+            image_tensor = self.image_preprocessor.preprocess_pil_image(image)
 
-        image_tensor = self.image_preprocessor.preprocess_pil_image(image)
+            rois = torch.tensor([[0, 0, image.width, image.height]], dtype=image_tensor.dtype, device=image_tensor.device)
+
+        elif isinstance(image, torch.Tensor):
+            image_tensor = self.image_preprocessor.preprocess_tensor_image(image)
 
+            rois = torch.tensor([[0, 0, image.shape[1], image.shape[0]]], dtype=image_tensor.dtype, device=image_tensor.device)
+
+        else:
+            raise ValueError("Input image must be either a PIL Image or a torch.Tensor")
+
 
         if text_encodings is None:
             text_encodings = self.encode_text(text)
+
+        image_encodings = self.encode_rois(image_tensor, rois, pad_square=pad_square)
+
+        return self.decode(image_encodings, text_encodings, threshold)
+
+    # def predict(self,
+    #         image: PIL.Image,
+    #         text: List[str],
+    #         text_encodings: Optional[OwlEncodeTextOutput],
+    #         threshold: Union[int, float, List[Union[int, float]]] = 0.1,
+    #         pad_square: bool = True,
+
+    # ) -> OwlDecodeOutput:
 
-        rois = torch.tensor([[0, 0, image.width, image.height]], dtype=image_tensor.dtype, device=image_tensor.device)
+    #     image_tensor = self.image_preprocessor.preprocess_pil_image(image)
+    #     print(image_tensor)
+    #     if text_encodings is None:
+    #         text_encodings = self.encode_text(text)
 
-        image_encodings = self.encode_rois(image_tensor, rois, pad_square=pad_square)
+    #     rois = torch.tensor([[0, 0, image.width, image.height]], dtype=image_tensor.dtype, device=image_tensor.device)
 
-        return self.decode(image_encodings, text_encodings, threshold)
+    #     image_encodings = self.encode_rois(image_tensor, rois, pad_square=pad_square)
 
+    #     return self.decode(image_encodings, text_encodings, threshold)
+
+    # def predictTensor(self,
+    #         image: torch.Tensor,
+    #         text: List[str],
+    #         text_encodings: Optional[OwlEncodeTextOutput],
+    #         threshold: Union[int, float, List[Union[int, float]]] = 0.1,
+    #         pad_square: bool = True,
+
+    # ) -> OwlDecodeOutput:
+
+    #     image_tensor = self.image_preprocessor.preprocess_tensor_image(image)
+
+    #     if text_encodings is None:
+    #         text_encodings = self.encode_text(text)
+    #     print(image_tensor)
+    #     #print(image.shape[1])
+    #     rois = torch.tensor([[0, 0, image.shape[1], image.shape[0]]], dtype=image_tensor.dtype, device=image_tensor.device)
+
+    #     image_encodings = self.encode_rois(image_tensor, rois, pad_square=pad_square)
+
+    #     return self.decode(image_encodings, text_encodings, threshold)
+
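What follows is a minimal usage sketch of the updated predict(), showing both accepted input types. It is not part of the diff: the import path, the model string, and the stand-in inputs are assumptions for illustration, and the (H, W, C) tensor layout is inferred from the ROI computed above as [0, 0, image.shape[1], image.shape[0]].

# Usage sketch; nanoowl.owl_predictor.OwlPredictor and the
# "google/owlvit-base-patch32" model string are assumed names,
# and the inputs below are synthetic stand-ins.
import PIL.Image
import torch

from nanoowl.owl_predictor import OwlPredictor

predictor = OwlPredictor("google/owlvit-base-patch32")

text = ["an owl", "a glove"]
text_encodings = predictor.encode_text(text)  # encode prompts once, reuse per frame

# PIL path: behavior unchanged from before this commit.
pil_image = PIL.Image.new("RGB", (640, 480))  # stand-in for a real image
detections = predictor.predict(pil_image, text, text_encodings, threshold=0.1)

# Tensor path: new in this commit. The ROI [0, 0, shape[1], shape[0]] implies
# an (H, W, C) frame; dtype and value-range expectations depend on
# preprocess_tensor_image, which this diff does not show.
tensor_image = torch.zeros(480, 640, 3)  # stand-in HWC frame
detections = predictor.predict(tensor_image, text, text_encodings, threshold=0.1)

Dispatching on isinstance inside a single predict() keeps one public entry point, which is presumably why the separate predictTensor() variant is left commented out above rather than kept as a live API.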