diff --git a/test/common_utils.py b/test/common_utils.py index 8c3c9dd58a8..e3fa464b5ea 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -20,7 +20,8 @@ from torch.testing._comparison import BooleanPair, NonePair, not_close_error_metas, NumberPair, TensorLikePair from torchvision import io, tv_tensors from torchvision.transforms._functional_tensor import _max_value as get_max_value -from torchvision.transforms.v2.functional import to_cvcuda_tensor, to_image, to_pil_image +from torchvision.transforms.v2.functional import cvcuda_to_tensor, to_cvcuda_tensor, to_image, to_pil_image +from torchvision.transforms.v2.functional._utils import _is_cvcuda_available, _is_cvcuda_tensor from torchvision.utils import _Image_fromarray @@ -284,8 +285,24 @@ def __init__( mae=False, **other_parameters, ): - if all(isinstance(input, PIL.Image.Image) for input in [actual, expected]): - actual, expected = (to_image(input) for input in [actual, expected]) + # Convert PIL images to tv_tensors.Image (regardless of what the other is) + if isinstance(actual, PIL.Image.Image): + actual = to_image(actual) + if isinstance(expected, PIL.Image.Image): + expected = to_image(expected) + + if _is_cvcuda_available(): + if _is_cvcuda_tensor(actual): + actual = cvcuda_to_tensor(actual) + # Remove batch dimension if it's 1 for easier comparison against 3D PIL images + if actual.shape[0] == 1: + actual = actual[0] + actual = actual.cpu() + if _is_cvcuda_tensor(expected): + expected = cvcuda_to_tensor(expected) + if expected.shape[0] == 1: + expected = expected[0] + expected = expected.cpu() super().__init__(actual, expected, **other_parameters) self.mae = mae @@ -400,8 +417,8 @@ def make_image_pil(*args, **kwargs): return to_pil_image(make_image(*args, **kwargs)) -def make_image_cvcuda(*args, **kwargs): - return to_cvcuda_tensor(make_image(*args, **kwargs)) +def make_image_cvcuda(*args, batch_dims=(1,), **kwargs): + return to_cvcuda_tensor(make_image(*args, batch_dims=batch_dims, **kwargs)) def make_keypoints(canvas_size=DEFAULT_SIZE, *, num_points=4, dtype=None, device="cpu"): @@ -541,5 +558,9 @@ def ignore_jit_no_profile_information_warning(): # with varying `INT1` and `INT2`. Since these are uninteresting for us and only clutter the test summary, we ignore # them. with warnings.catch_warnings(): - warnings.filterwarnings("ignore", message=re.escape("operator() profile_node %"), category=UserWarning) + warnings.filterwarnings( + "ignore", + message=re.escape("operator() profile_node %"), + category=UserWarning, + ) yield diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 978d6e04756..3ce603c3ed2 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -1240,6 +1240,10 @@ def test_kernel_video(self): make_image_tensor, make_image_pil, make_image, + pytest.param( + make_image_cvcuda, + marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"), + ), make_bounding_boxes, make_segmentation_mask, make_video, @@ -1255,6 +1259,11 @@ def test_functional(self, make_input): (F.horizontal_flip_image, torch.Tensor), (F._geometry._horizontal_flip_image_pil, PIL.Image.Image), (F.horizontal_flip_image, tv_tensors.Image), + pytest.param( + F._geometry._horizontal_flip_image_cvcuda, + None, + marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"), + ), (F.horizontal_flip_bounding_boxes, tv_tensors.BoundingBoxes), (F.horizontal_flip_mask, tv_tensors.Mask), (F.horizontal_flip_video, tv_tensors.Video), @@ -1262,6 +1271,8 @@ def test_functional(self, make_input): ], ) def test_functional_signature(self, kernel, input_type): + if kernel is F._geometry._horizontal_flip_image_cvcuda: + input_type = _import_cvcuda().Tensor check_functional_kernel_signature_match(F.horizontal_flip, kernel=kernel, input_type=input_type) @pytest.mark.parametrize( @@ -1270,6 +1281,10 @@ def test_functional_signature(self, kernel, input_type): make_image_tensor, make_image_pil, make_image, + pytest.param( + make_image_cvcuda, + marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"), + ), make_bounding_boxes, make_segmentation_mask, make_video, @@ -1283,13 +1298,23 @@ def test_transform(self, make_input, device): @pytest.mark.parametrize( "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] ) - def test_image_correctness(self, fn): - image = make_image(dtype=torch.uint8, device="cpu") - + @pytest.mark.parametrize( + "make_input", + [ + make_image, + pytest.param( + make_image_cvcuda, + marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"), + ), + ], + ) + def test_image_correctness(self, fn, make_input): + image = make_input() actual = fn(image) - expected = F.to_image(F.horizontal_flip(F.to_pil_image(image))) - - torch.testing.assert_close(actual, expected) + if make_input is make_image_cvcuda: + image = F.cvcuda_to_tensor(image)[0].cpu() + expected = F.horizontal_flip(F.to_pil_image(image)) + assert_equal(actual, expected) def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes: tv_tensors.BoundingBoxes): affine_matrix = np.array( @@ -1345,6 +1370,10 @@ def test_keypoints_correctness(self, fn): make_image_tensor, make_image_pil, make_image, + pytest.param( + make_image_cvcuda, + marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"), + ), make_bounding_boxes, make_segmentation_mask, make_video, @@ -1354,11 +1383,8 @@ def test_keypoints_correctness(self, fn): @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform_noop(self, make_input, device): input = make_input(device=device) - transform = transforms.RandomHorizontalFlip(p=0) - output = transform(input) - assert_equal(output, input) @@ -1856,6 +1882,10 @@ def test_kernel_video(self): make_image_tensor, make_image_pil, make_image, + pytest.param( + make_image_cvcuda, + marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"), + ), make_bounding_boxes, make_segmentation_mask, make_video, @@ -1871,6 +1901,11 @@ def test_functional(self, make_input): (F.vertical_flip_image, torch.Tensor), (F._geometry._vertical_flip_image_pil, PIL.Image.Image), (F.vertical_flip_image, tv_tensors.Image), + pytest.param( + F._geometry._vertical_flip_image_cvcuda, + None, + marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"), + ), (F.vertical_flip_bounding_boxes, tv_tensors.BoundingBoxes), (F.vertical_flip_mask, tv_tensors.Mask), (F.vertical_flip_video, tv_tensors.Video), @@ -1878,6 +1913,8 @@ def test_functional(self, make_input): ], ) def test_functional_signature(self, kernel, input_type): + if kernel is F._geometry._vertical_flip_image_cvcuda: + input_type = _import_cvcuda().Tensor check_functional_kernel_signature_match(F.vertical_flip, kernel=kernel, input_type=input_type) @pytest.mark.parametrize( @@ -1886,6 +1923,10 @@ def test_functional_signature(self, kernel, input_type): make_image_tensor, make_image_pil, make_image, + pytest.param( + make_image_cvcuda, + marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"), + ), make_bounding_boxes, make_segmentation_mask, make_video, @@ -1897,13 +1938,23 @@ def test_transform(self, make_input, device): check_transform(transforms.RandomVerticalFlip(p=1), make_input(device=device)) @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) - def test_image_correctness(self, fn): - image = make_image(dtype=torch.uint8, device="cpu") - + @pytest.mark.parametrize( + "make_input", + [ + make_image, + pytest.param( + make_image_cvcuda, + marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"), + ), + ], + ) + def test_image_correctness(self, fn, make_input): + image = make_input() actual = fn(image) - expected = F.to_image(F.vertical_flip(F.to_pil_image(image))) - - torch.testing.assert_close(actual, expected) + if make_input is make_image_cvcuda: + image = F.cvcuda_to_tensor(image)[0].cpu() + expected = F.vertical_flip(F.to_pil_image(image)) + assert_equal(actual, expected) def _reference_vertical_flip_bounding_boxes(self, bounding_boxes: tv_tensors.BoundingBoxes): affine_matrix = np.array( @@ -1955,6 +2006,10 @@ def test_keypoints_correctness(self, fn): make_image_tensor, make_image_pil, make_image, + pytest.param( + make_image_cvcuda, + marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"), + ), make_bounding_boxes, make_segmentation_mask, make_video, @@ -1964,11 +2019,8 @@ def test_keypoints_correctness(self, fn): @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform_noop(self, make_input, device): input = make_input(device=device) - transform = transforms.RandomVerticalFlip(p=0) - output = transform(input) - assert_equal(output, input) diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index 1418a6b4953..96166e05e9a 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -11,7 +11,7 @@ from torchvision.ops.boxes import box_iou from torchvision.transforms.functional import _get_perspective_coeffs from torchvision.transforms.v2 import functional as F, InterpolationMode, Transform -from torchvision.transforms.v2.functional._utils import _FillType +from torchvision.transforms.v2.functional._utils import _FillType, _is_cvcuda_available, _is_cvcuda_tensor from ._transform import _RandomApplyTransform from ._utils import ( @@ -30,6 +30,8 @@ query_size, ) +CVCUDA_AVAILABLE = _is_cvcuda_available() + class RandomHorizontalFlip(_RandomApplyTransform): """Horizontally flip the input with a given probability. @@ -45,6 +47,9 @@ class RandomHorizontalFlip(_RandomApplyTransform): _v1_transform_cls = _transforms.RandomHorizontalFlip + if CVCUDA_AVAILABLE: + _transformed_types = _RandomApplyTransform._transformed_types + (_is_cvcuda_tensor,) + def transform(self, inpt: Any, params: dict[str, Any]) -> Any: return self._call_kernel(F.horizontal_flip, inpt) @@ -63,6 +68,9 @@ class RandomVerticalFlip(_RandomApplyTransform): _v1_transform_cls = _transforms.RandomVerticalFlip + if CVCUDA_AVAILABLE: + _transformed_types = _RandomApplyTransform._transformed_types + (_is_cvcuda_tensor,) + def transform(self, inpt: Any, params: dict[str, Any]) -> Any: return self._call_kernel(F.vertical_flip, inpt) diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py index 4fcb7fabe0d..0e27218bc89 100644 --- a/torchvision/transforms/v2/functional/_geometry.py +++ b/torchvision/transforms/v2/functional/_geometry.py @@ -2,7 +2,7 @@ import numbers import warnings from collections.abc import Sequence -from typing import Any, Optional, Union +from typing import Any, Optional, TYPE_CHECKING, Union import PIL.Image import torch @@ -26,7 +26,18 @@ from ._meta import _get_size_image_pil, clamp_bounding_boxes, convert_bounding_box_format -from ._utils import _FillTypeJIT, _get_kernel, _register_five_ten_crop_kernel_internal, _register_kernel_internal +from ._utils import ( + _FillTypeJIT, + _get_kernel, + _import_cvcuda, + _is_cvcuda_available, + _register_five_ten_crop_kernel_internal, + _register_kernel_internal, +) + +CVCUDA_AVAILABLE = _is_cvcuda_available() +if TYPE_CHECKING: + import cvcuda # type: ignore[import-not-found] def _check_interpolation(interpolation: Union[InterpolationMode, int]) -> InterpolationMode: @@ -62,6 +73,14 @@ def _horizontal_flip_image_pil(image: PIL.Image.Image) -> PIL.Image.Image: return _FP.hflip(image) +def _horizontal_flip_image_cvcuda(image: "cvcuda.Tensor") -> "cvcuda.Tensor": + return _import_cvcuda().flip(image, flipCode=1) + + +if CVCUDA_AVAILABLE: + _register_kernel_internal(horizontal_flip, _import_cvcuda().Tensor)(_horizontal_flip_image_cvcuda) + + @_register_kernel_internal(horizontal_flip, tv_tensors.Mask) def horizontal_flip_mask(mask: torch.Tensor) -> torch.Tensor: return horizontal_flip_image(mask) @@ -150,6 +169,14 @@ def _vertical_flip_image_pil(image: PIL.Image.Image) -> PIL.Image.Image: return _FP.vflip(image) +def _vertical_flip_image_cvcuda(image: "cvcuda.Tensor") -> "cvcuda.Tensor": + return _import_cvcuda().flip(image, flipCode=0) + + +if CVCUDA_AVAILABLE: + _register_kernel_internal(vertical_flip, _import_cvcuda().Tensor)(_vertical_flip_image_cvcuda) + + @_register_kernel_internal(vertical_flip, tv_tensors.Mask) def vertical_flip_mask(mask: torch.Tensor) -> torch.Tensor: return vertical_flip_image(mask) diff --git a/torchvision/transforms/v2/functional/_utils.py b/torchvision/transforms/v2/functional/_utils.py index ad1eddd258b..11480b30ef9 100644 --- a/torchvision/transforms/v2/functional/_utils.py +++ b/torchvision/transforms/v2/functional/_utils.py @@ -169,3 +169,11 @@ def _is_cvcuda_available(): return True except ImportError: return False + + +def _is_cvcuda_tensor(inpt: Any) -> bool: + try: + cvcuda = _import_cvcuda() + return isinstance(inpt, cvcuda.Tensor) + except ImportError: + return False