Dealing with encoder outputs with dimension > 3 when using the reshaper neck

Joao-L-S-Almeida · Joao-L-S-Almeida · commit 8218627a31c9 · 2025-03-03T12:17:29.000-03:00
Signed-off-by: João Lucas de Sousa Almeida <joao.l.sa.9.3@gmail.com>
diff --git a/terratorch/models/necks.py b/terratorch/models/necks.py
@@ -141,16 +141,30 @@ def __init__(self, channel_list: list[int], remove_cls_token=True, effective_tim
         self.remove_cls_token = remove_cls_token
         self.effective_time_dim = effective_time_dim
 
+    def collapse_dims(self, x):
+        """Flatten all axes between the first and the last into a single one.
+
+        A tensor of shape (b, d1, ..., dk, e) is returned reshaped to
+        (b, d1 * ... * dk, e); a 3-dimensional input keeps its shape.
+        """
+        batch = x.shape[0]
+        e = x.shape[-1]
+        # np.prod gives a NumPy scalar (a float when no middle axes exist);
+        # cast to a plain int so it is always a valid size for reshape.
+        return x.reshape(batch, int(np.prod(x.shape[1:-1])), e)
+
     def forward(self, features: list[torch.Tensor]) -> list[torch.Tensor]:
         out = []
         for x in features:
             if self.remove_cls_token:
                 x_no_token = x[:, 1:, :]
             else:
                 x_no_token = x
+            x_no_token = self.collapse_dims(x_no_token)
             number_of_tokens = x_no_token.shape[1]
             tokens_per_timestep = number_of_tokens // self.effective_time_dim
             h = int(np.sqrt(tokens_per_timestep))
+
             encoded = rearrange(
                 x_no_token,
                 "batch (t h w) e -> batch (t e) h w",