
Commit 4b2565e

More forward_intermediates() / FeatureGetterNet work
* include relpos vit
* refactor reduction / size calcs so hybrid vits work and dynamic_img_size works
* fix -ve feature indices when pruning
* fix mvitv2 w/ class token
* refine naming
* add tests
1 parent ef9c6fb commit 4b2565e

11 files changed: +339, -86 lines
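
For orientation, a minimal usage sketch of the two paths this commit touches (not part of the diff; the model name and exact behaviour are assumptions about the installed timm version):

import torch
import timm

model = timm.create_model('vit_base_patch16_224', pretrained=False)
x = torch.randn(1, 3, 224, 224)

# Direct call: final features plus the last 3 intermediate block outputs.
final, intermediates = model.forward_intermediates(x, indices=3)

# features_only=True now routes ViT-style models through the FeatureGetterNet
# wrapper, which calls forward_intermediates() internally.
feat_model = timm.create_model('vit_base_patch16_224', pretrained=False, features_only=True)
feats = feat_model(x)  # list of feature maps, one per selected block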

tests/test_models.py

Lines changed: 71 additions & 0 deletions
@@ -47,6 +47,11 @@
 torch._C._jit_set_profiling_executor(True)
 torch._C._jit_set_profiling_mode(False)

+# models with forward_intermediates() and support for FeatureGetterNet features_only wrapper
+FEAT_INTER_FILTERS = [
+    'vit_*', 'twins_*', 'deit*', 'beit*', 'mvitv2*', 'eva*', 'samvit_*', 'flexivit*'
+]
+
 # transformer models don't support many of the spatial / feature based model functionalities
 NON_STD_FILTERS = [
     'vit_*', 'tnt_*', 'pit_*', 'coat_*', 'cait_*', '*mixer_*', 'gmlp_*', 'resmlp_*', 'twins_*',
@@ -380,6 +385,72 @@ def test_model_forward_features(model_name, batch_size):
         assert not torch.isnan(o).any()


+@pytest.mark.features
+@pytest.mark.timeout(120)
+@pytest.mark.parametrize('model_name', list_models(FEAT_INTER_FILTERS, include_tags=True))
+@pytest.mark.parametrize('batch_size', [1])
+def test_model_forward_intermediates_features(model_name, batch_size):
+    """Run a single forward pass with each model in feature extraction mode"""
+    model = create_model(model_name, pretrained=False, features_only=True)
+    model.eval()
+    print(model.feature_info.out_indices)
+    expected_channels = model.feature_info.channels()
+    expected_reduction = model.feature_info.reduction()
+
+    input_size = _get_input_size(model=model, target=TARGET_FFEAT_SIZE)
+    if max(input_size) > MAX_FFEAT_SIZE:
+        pytest.skip("Fixed input size model > limit.")
+    output_fmt = getattr(model, 'output_fmt', 'NCHW')
+    feat_axis = get_channel_dim(output_fmt)
+    spatial_axis = get_spatial_dim(output_fmt)
+    import math
+
+    outputs = model(torch.randn((batch_size, *input_size)))
+    assert len(expected_channels) == len(outputs)
+    spatial_size = input_size[-2:]
+    for e, r, o in zip(expected_channels, expected_reduction, outputs):
+        print(o.shape)
+        assert e == o.shape[feat_axis]
+        assert o.shape[spatial_axis[0]] <= math.ceil(spatial_size[0] / r) + 1
+        assert o.shape[spatial_axis[1]] <= math.ceil(spatial_size[1] / r) + 1
+        assert o.shape[0] == batch_size
+        assert not torch.isnan(o).any()
+
+
+@pytest.mark.features
+@pytest.mark.timeout(120)
+@pytest.mark.parametrize('model_name', list_models(FEAT_INTER_FILTERS, include_tags=True))
+@pytest.mark.parametrize('batch_size', [1])
+def test_model_forward_intermediates(model_name, batch_size):
+    """Run a single forward pass with each model in feature extraction mode"""
+    model = create_model(model_name, pretrained=False)
+    model.eval()
+    feature_info = timm.models.FeatureInfo(model.feature_info, len(model.feature_info))
+    expected_channels = feature_info.channels()
+    expected_reduction = feature_info.reduction()
+    assert len(expected_channels) >= 4  # all models here should have at least 4 feature levels by default, some 5 or 6
+
+    input_size = _get_input_size(model=model, target=TARGET_FFEAT_SIZE)
+    if max(input_size) > MAX_FFEAT_SIZE:
+        pytest.skip("Fixed input size model > limit.")
+    output_fmt = getattr(model, 'output_fmt', 'NCHW')
+    feat_axis = get_channel_dim(output_fmt)
+    spatial_axis = get_spatial_dim(output_fmt)
+    import math
+
+    output, intermediates = model.forward_intermediates(
+        torch.randn((batch_size, *input_size)),
+    )
+    assert len(expected_channels) == len(intermediates)
+    spatial_size = input_size[-2:]
+    for e, r, o in zip(expected_channels, expected_reduction, intermediates):
+        assert e == o.shape[feat_axis]
+        assert o.shape[spatial_axis[0]] <= math.ceil(spatial_size[0] / r) + 1
+        assert o.shape[spatial_axis[1]] <= math.ceil(spatial_size[1] / r) + 1
+        assert o.shape[0] == batch_size
+        assert not torch.isnan(o).any()
+
+
 def _create_fx_model(model, train=False):
     # This block of code does a bit of juggling to handle any case where there are multiple outputs in train mode
     # So we trace once and look at the graph, and get the indices of the nodes that lead into the original fx output
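
A worked instance of the spatial-size bound asserted above (numbers chosen for illustration): with a 224x224 input and a reduction of 16, each intermediate may span at most math.ceil(224 / 16) + 1 == 15 positions per spatial axis, the +1 presumably allowing slack for dynamic padding or overlapping stems.

import math

spatial, reduction = 224, 16
assert math.ceil(spatial / reduction) + 1 == 15  # 14 + 1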

timm/layers/patch_embed.py

Lines changed: 18 additions & 2 deletions
@@ -9,6 +9,7 @@
 Hacked together by / Copyright 2020 Ross Wightman
 """
 import logging
+import math
 from typing import Callable, List, Optional, Tuple, Union

 import torch
@@ -65,6 +66,21 @@ def __init__(
         self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
         self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

+    def feat_ratio(self, as_scalar=True) -> Union[Tuple[int, int], int]:
+        if as_scalar:
+            return max(self.patch_size)
+        else:
+            return self.patch_size
+
+    def dynamic_feat_size(self, img_size: Tuple[int, int]) -> Tuple[int, int]:
+        """ Get grid (feature) size for given image size taking account of dynamic padding.
+        NOTE: must be torchscript compatible so using fixed tuple indexing
+        """
+        if self.dynamic_img_pad:
+            return math.ceil(img_size[0] / self.patch_size[0]), math.ceil(img_size[1] / self.patch_size[1])
+        else:
+            return img_size[0] // self.patch_size[0], img_size[1] // self.patch_size[1]
+
     def forward(self, x):
         B, C, H, W = x.shape
         if self.img_size is not None:
@@ -127,13 +143,13 @@ def forward(self, x) -> Tuple[torch.Tensor, List[int]]:
         _assert(W % self.patch_size[1] == 0, f"Input image width ({W}) must be divisible by patch size ({self.patch_size[1]}).")

         x = self.proj(x)
-        grid_size = x.shape[-2:]
+        feat_size = x.shape[-2:]
         if self.flatten:
             x = x.flatten(2).transpose(1, 2)  # NCHW -> NLC
         elif self.output_fmt != Format.NCHW:
             x = nchw_to(x, self.output_fmt)
         x = self.norm(x)
-        return x, grid_size
+        return x, feat_size


 def resample_patch_embed(
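
A small numeric sketch of what the two new PatchEmbed helpers compute (values assumed, not taken from the diff): feat_ratio() reports the effective stride, and dynamic_feat_size() switches between ceil and floor division depending on dynamic_img_pad.

import math

patch_size = (16, 16)
img_size = (250, 250)

# feat_ratio(as_scalar=True) -> max(patch_size) == 16
# dynamic_img_pad=True: the input gets padded up, so the grid uses ceil division
print(math.ceil(img_size[0] / patch_size[0]), math.ceil(img_size[1] / patch_size[1]))  # 16 16
# dynamic_img_pad=False: plain integer (floor) division
print(img_size[0] // patch_size[0], img_size[1] // patch_size[1])  # 15 15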

timm/models/_features.py

Lines changed: 21 additions & 8 deletions
@@ -26,7 +26,10 @@
 ]


-def _take_indices(n: Union[int, List[int], Tuple[int]], num_blocks: int) -> Tuple[Set[int], int]:
+def _take_indices(
+        num_blocks: int,
+        n: Optional[Union[int, List[int], Tuple[int]]],
+) -> Tuple[Set[int], int]:
     if isinstance(n, int):
         assert n >= 0
         take_indices = {x for x in range(num_blocks - n, num_blocks)}
@@ -35,7 +38,10 @@ def _take_indices(n: Union[int, List[int], Tuple[int]], num_blocks: int) -> Tupl
     return take_indices, max(take_indices)


-def _take_indices_jit(n: Union[int, List[int], Tuple[int]], num_blocks: int) -> Tuple[List[int], int]:
+def _take_indices_jit(
+        num_blocks: int,
+        n: Union[int, List[int], Tuple[int]],
+) -> Tuple[List[int], int]:
     if isinstance(n, int):
         assert n >= 0
         take_indices = [num_blocks - n + i for i in range(n)]
@@ -47,12 +53,17 @@ def _take_indices_jit(n: Union[int, List[int], Tuple[int]], num_blocks: int) ->
     return take_indices, max(take_indices)


-def feature_take_indices(n: Union[int, List[int], Tuple[int]], num_blocks: int) -> Tuple[List[int], int]:
+def feature_take_indices(
+        num_blocks: int,
+        indices: Optional[Union[int, List[int], Tuple[int]]] = None,
+) -> Tuple[List[int], int]:
+    if indices is None:
+        indices = num_blocks  # all blocks if None
     if torch.jit.is_scripting():
-        return _take_indices_jit(n, num_blocks)
+        return _take_indices_jit(num_blocks, indices)
     else:
         # NOTE non-jit returns Set[int] instead of List[int] but torchscript can't handle that anno
-        return _take_indices(n, num_blocks)
+        return _take_indices(num_blocks, indices)


 def _out_indices_as_tuple(x: Union[int, Tuple[int, ...]]) -> Tuple[int, ...]:
@@ -443,10 +454,12 @@ def __init__(
         """
         super().__init__()
         if prune and hasattr(model, 'prune_intermediate_layers'):
-            model.prune_intermediate_layers(
+            # replace out_indices after they've been normalized, -ve indices will be invalid after prune
+            out_indices = model.prune_intermediate_layers(
                 out_indices,
                 prune_norm=not norm,
             )
+            out_indices = list(out_indices)
         self.feature_info = _get_feature_info(model, out_indices)
         self.model = model
         self.out_indices = out_indices
@@ -458,9 +471,9 @@ def __init__(
     def forward(self, x):
         features = self.model.forward_intermediates(
             x,
-            n=self.out_indices,
+            indices=self.out_indices,
             norm=self.norm,
             output_fmt=self.output_fmt,
-            features_only=True,
+            intermediates_only=True,
         )
         return features
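
A sketch of the new (num_blocks, indices) calling convention (the sequence branch is not shown in this hunk and the eager path returns a set rather than a list, so treat the details as assumptions):

from timm.models._features import feature_take_indices

take, max_index = feature_take_indices(12, 3)           # int: last 3 of 12 blocks -> {9, 10, 11}, max_index 11
take, max_index = feature_take_indices(12, (0, 5, 11))  # sequence: select by matching block indices
take, max_index = feature_take_indices(12, None)        # None: all 12 blocks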

timm/models/beit.py

Lines changed: 12 additions & 15 deletions
@@ -302,6 +302,7 @@ def __init__(
             embed_dim=embed_dim,
         )
         num_patches = self.patch_embed.num_patches
+        r = self.patch_embed.feat_ratio() if hasattr(self.patch_embed, 'feat_ratio') else patch_size

         self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
         # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
@@ -334,7 +335,7 @@ def __init__(
             )
             for i in range(depth)])
         self.feature_info = [
-            dict(module=f'blocks.{i}', num_chs=embed_dim, reduction=patch_size) for i in range(depth)]
+            dict(module=f'blocks.{i}', num_chs=embed_dim, reduction=r) for i in range(depth)]

         use_fc_norm = self.global_pool == 'avg'
         self.norm = nn.Identity() if use_fc_norm else norm_layer(embed_dim)
@@ -403,33 +404,30 @@ def reset_classifier(self, num_classes, global_pool=None):
     def forward_intermediates(
             self,
             x: torch.Tensor,
-            n: Optional[Union[int, List[int], Tuple[int]]] = None,
+            indices: Optional[Union[int, List[int], Tuple[int]]] = None,
             return_prefix_tokens: bool = False,
             norm: bool = False,
             stop_early: bool = True,
             output_fmt: str = 'NCHW',
-            features_only: bool = False,
+            intermediates_only: bool = False,
     ) -> Union[List[torch.Tensor], Tuple[torch.Tensor, List[torch.Tensor]]]:
         """ Forward features that returns intermediates.

         Args:
             x: Input image tensor
-            n: Take last n blocks if n is an int, if in is a sequence, select by matching indices
+            indices: Take last n blocks if an int, if is a sequence, select by matching indices
             return_prefix_tokens: Return both prefix and spatial intermediate tokens
             norm: Apply norm layer to all intermediates
             stop_early: Stop iterating over blocks when last desired intermediate hit
             output_fmt: Shape of intermediate feature outputs
-            features_only: Only return intermediate features
+            intermediates_only: Only return intermediate features
         Returns:

         """
         assert output_fmt in ('NCHW', 'NLC'), 'Output format for ViT features must be one of NCHW or NLC.'
         reshape = output_fmt == 'NCHW'
         intermediates = []
-        num_blocks = len(self.blocks)
-        if n is None:
-            n = num_blocks
-        take_indices, max_index = feature_take_indices(n, num_blocks)
+        take_indices, max_index = feature_take_indices(len(self.blocks), indices)

         # forward pass
         B, _, height, width = x.shape
@@ -455,16 +453,14 @@ def forward_intermediates(
             prefix_tokens = [y[:, 0:self.num_prefix_tokens] for y in intermediates]
             intermediates = [y[:, self.num_prefix_tokens:] for y in intermediates]
         if reshape:
-            # reshape == True => BCHW output format
-            patch_size = self.patch_embed.patch_size
-            H = int(math.ceil(height / patch_size[0]))
-            W = int(math.ceil(width / patch_size[1]))
+            # reshape to BCHW output format
+            H, W = self.patch_embed.dynamic_feat_size((height, width))
             intermediates = [y.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() for y in intermediates]
         if not torch.jit.is_scripting() and return_prefix_tokens:
             # return_prefix not support in torchscript due to poor type handling
             intermediates = list(zip(intermediates, prefix_tokens))

-        if features_only:
+        if intermediates_only:
             return intermediates

         x = self.norm(x)
@@ -479,13 +475,14 @@ def prune_intermediate_layers(
     ):
         """ Prune layers not required for specified intermediates.
         """
-        take_indices, max_index = feature_take_indices(n, len(self.blocks))
+        take_indices, max_index = feature_take_indices(len(self.blocks), n)
         self.blocks = self.blocks[:max_index + 1]  # truncate blocks
         if prune_norm:
             self.norm = nn.Identity()
         if prune_head:
             self.fc_norm = nn.Identity()
             self.head = nn.Identity()
+        return take_indices

     def forward_features(self, x):
         x = self.patch_embed(x)
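
A sketch of how the new return value from prune_intermediate_layers() is meant to be used, mirroring the FeatureGetterNet path above (model name, argument defaults, and the handling of negative indices are assumptions about code not shown in this diff):

import torch
import timm

model = timm.create_model('beit_base_patch16_224', pretrained=False)

# Relative / negative selections are resolved to absolute block indices and
# returned, so they stay valid after the trailing blocks have been truncated.
out_indices = model.prune_intermediate_layers((-2, -1), prune_norm=True)

feats = model.forward_intermediates(
    torch.randn(1, 3, 224, 224),
    indices=list(out_indices),
    intermediates_only=True,
)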

timm/models/eva.py

Lines changed: 12 additions & 15 deletions
@@ -424,6 +424,7 @@ def __init__(
             **embed_args,
         )
         num_patches = self.patch_embed.num_patches
+        r = self.patch_embed.feat_ratio() if hasattr(self.patch_embed, 'feat_ratio') else patch_size

         self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None

@@ -470,7 +471,7 @@ def __init__(
             )
             for i in range(depth)])
         self.feature_info = [
-            dict(module=f'blocks.{i}', num_chs=embed_dim, reduction=patch_size) for i in range(depth)]
+            dict(module=f'blocks.{i}', num_chs=embed_dim, reduction=r) for i in range(depth)]

         use_fc_norm = self.global_pool == 'avg'
         self.norm = nn.Identity() if use_fc_norm else norm_layer(embed_dim)
@@ -564,30 +565,27 @@ def _pos_embed(self, x) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
     def forward_intermediates(
             self,
             x: torch.Tensor,
-            n: Optional[Union[int, List[int], Tuple[int]]] = None,
+            indices: Optional[Union[int, List[int], Tuple[int]]] = None,
             return_prefix_tokens: bool = False,
             norm: bool = False,
             stop_early: bool = True,
             output_fmt: str = 'NCHW',
-            features_only: bool = False,
+            intermediates_only: bool = False,
     ) -> Union[List[torch.Tensor], Tuple[torch.Tensor, List[torch.Tensor]]]:
         """ Forward features that returns intermediates.
         Args:
             x: Input image tensor
-            n: Take last n blocks if n is an int, if in is a sequence, select by matching indices
+            indices: Take last n blocks if an int, if is a sequence, select by matching indices
             return_prefix_tokens: Return both prefix and spatial intermediate tokens
             norm: Apply norm layer to all intermediates
             stop_early: Stop iterating over blocks when last desired intermediate hit
             output_fmt: Shape of intermediate feature outputs
-            features_only: Only return intermediate features
+            intermediates_only: Only return intermediate features
         """
         assert output_fmt in ('NCHW', 'NLC'), 'Output format for EVA-ViT features must be one of NCHW or NLC.'
         reshape = output_fmt == 'NCHW'
         intermediates = []
-        num_blocks = len(self.blocks)
-        if n is None:
-            n = num_blocks
-        take_indices, max_index = feature_take_indices(n, num_blocks)
+        take_indices, max_index = feature_take_indices(len(self.blocks), indices)

         # forward pass
         B, _, height, width = x.shape
@@ -608,16 +606,14 @@ def forward_intermediates(
             prefix_tokens = [y[:, 0:self.num_prefix_tokens] for y in intermediates]
             intermediates = [y[:, self.num_prefix_tokens:] for y in intermediates]
         if reshape:
-            # reshape == True => BCHW output format
-            patch_size = self.patch_embed.patch_size
-            H = int(math.ceil(height / patch_size[0]))
-            W = int(math.ceil(width / patch_size[1]))
+            # reshape to BCHW output format
+            H, W = self.patch_embed.dynamic_feat_size((height, width))
             intermediates = [y.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() for y in intermediates]
         if not torch.jit.is_scripting() and return_prefix_tokens:
             # return_prefix not support in torchscript due to poor type handling
             intermediates = list(zip(intermediates, prefix_tokens))

-        if features_only:
+        if intermediates_only:
             return intermediates

         x = self.norm(x)
@@ -632,13 +628,14 @@ def prune_intermediate_layers(
     ):
         """ Prune layers not required for specified intermediates.
         """
-        take_indices, max_index = feature_take_indices(n, len(self.blocks))
+        take_indices, max_index = feature_take_indices(len(self.blocks), n)
         self.blocks = self.blocks[:max_index + 1]  # truncate blocks
         if prune_norm:
             self.norm = nn.Identity()
         if prune_head:
             self.fc_norm = nn.Identity()
             self.head = nn.Identity()
+        return take_indices

     def forward_features(self, x):
         x = self.patch_embed(x)
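
And a sketch of the dynamic-size path the reshape change feeds into (the model name, the dynamic_img_size flag, and the resulting grid sizes are assumptions, not taken from this diff):

import torch
import timm

model = timm.create_model('eva02_tiny_patch14_224', pretrained=False, dynamic_img_size=True)

# With a 14x14 patch embed, a 196x252 input should give 14x18 feature grids;
# H and W now come from patch_embed.dynamic_feat_size() rather than a
# hard-coded division by patch_size.
feats = model.forward_intermediates(
    torch.randn(1, 3, 196, 252),
    indices=2,
    intermediates_only=True,
)
for f in feats:
    print(f.shape)  # e.g. torch.Size([1, 192, 14, 18])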
