
Commit fe6bcf2

Author: Nicolas Violante (committed)
[FIX] correct shapes in wrapper and dataset (needs to be checked); stub out image logging with pass; add mask and plucker to VanillaCFG
1 parent 6273420 commit fe6bcf2

File tree

5 files changed, +33 −18 lines


main.py

Lines changed: 6 additions & 5 deletions
@@ -456,20 +456,23 @@ def check_frequency(self, check_idx):
     @rank_zero_only
     def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
         if not self.disabled and (pl_module.global_step > 0 or self.log_first_step):
-            self.log_img(pl_module, batch, batch_idx, split="train")
+            # self.log_img(pl_module, batch, batch_idx, split="train")
+            pass

     @rank_zero_only
     def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
         if self.log_before_first_step and pl_module.global_step == 0:
             print(f"{self.__class__.__name__}: logging before training")
-            self.log_img(pl_module, batch, batch_idx, split="train")
+            # self.log_img(pl_module, batch, batch_idx, split="train")
+            pass

     @rank_zero_only
     def on_validation_batch_end(
         self, trainer, pl_module, outputs, batch, batch_idx, *args, **kwargs
     ):
         if not self.disabled and pl_module.global_step > 0:
-            self.log_img(pl_module, batch, batch_idx, split="val")
+            # self.log_img(pl_module, batch, batch_idx, split="val")
+            pass
         if hasattr(pl_module, "calibrate_grad_norm"):
             if (
                 pl_module.calibrate_grad_norm and batch_idx % 25 == 0
@@ -831,8 +834,6 @@ def init_wandb(save_dir, opt, config, group_name, name_str):
         # NOTE according to https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
         # calling these ourselves should not be necessary but it is.
         # lightning still takes care of proper multiprocessing though
-        print("DATAAAAAAA", data)
-        print("-"*100)
         data.prepare_data()
         # data.setup()
         print("#### Data #####")

sgm/data/dataset.py

Lines changed: 7 additions & 6 deletions
@@ -182,19 +182,19 @@ def __getitem__(self, idx):

         images_files = [images_files[i] for i in images_idxs]

-        frames = np.zeros((self.num_images, self.image_shape[0], self.image_shape[1], 3))
+        frames = np.zeros((self.num_images, self.target_shape[0], self.target_shape[1], 3))
         for i, img_file in enumerate(images_files):
             img_path = os.path.join(images_dir, img_file)
             image = cv.imread(img_path)
             image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
-            # image = cv.resize(image, self.target_shape, interpolation=cv.INTER_LINEAR) # TODO: Crops?
+            image = cv.resize(image, self.target_shape, interpolation=cv.INTER_LINEAR) # TODO: Crops?
             if self.transform:
                 frames[i] = image

         frames = frames.astype(np.float32) / 255.0
         frames = torch.from_numpy(frames).permute(0, 3, 1, 2) # Convert to (N, C, H, W)
         frames = frames * 2.0 - 1.0 # Normalize to [-1, 1]
-        # TODO: reisze to target shape
+

         # Load colmap data
         colmap_scene_path = os.path.join(
@@ -230,8 +230,8 @@ def __getitem__(self, idx):
             extrinsics_src=w2cs[0],
             extrinsics=w2cs,
             intrinsics=Ks.clone(),
-            target_size=(self.image_shape[0] // self.donwsample_factor,
-                         self.image_shape[1] // self.donwsample_factor),
+            target_size=(self.target_shape[0] // self.donwsample_factor,
+                         self.target_shape[1] // self.donwsample_factor),
         )

         concat = torch.cat(
@@ -265,6 +265,7 @@ def __init__(
         colmap_dir,
         batch_size,
         num_workers=0,
+        num_images=21,
         shuffle=True):
         super().__init__()

@@ -276,7 +277,7 @@ def __init__(
         self.train_dataset = DL3DVDataset(
             dataset_dir,
             colmap_dir,
-            num_images=21,
+            num_images=num_images,
         )

     def prepare_data(self):
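
The __getitem__ change re-enables the cv.resize call and sizes the frames buffer from target_shape. One thing worth double-checking: cv.resize expects its dsize argument as (width, height), while the buffer is allocated as (height, width), so the two only agree when target_shape is square or already stored in (width, height) order. A minimal, self-contained sketch of the same preprocessing pipeline with the shapes spelled out (load_frames and target_hw are illustrative names, not the dataset's API):

import cv2 as cv
import numpy as np
import torch


def load_frames(image_paths, target_hw=(256, 256)):
    """Resize frames, scale to [-1, 1], and return them as (N, C, H, W)."""
    h, w = target_hw
    frames = np.zeros((len(image_paths), h, w, 3), dtype=np.float32)
    for i, path in enumerate(image_paths):
        image = cv.imread(path)                        # BGR, uint8
        image = cv.cvtColor(image, cv.COLOR_BGR2RGB)   # to RGB
        # cv.resize takes dsize as (width, height), hence the flipped order
        frames[i] = cv.resize(image, (w, h), interpolation=cv.INTER_LINEAR)
    frames = frames / 255.0                            # [0, 1]
    frames = torch.from_numpy(frames).permute(0, 3, 1, 2)  # (N, C, H, W)
    return frames * 2.0 - 1.0                          # [-1, 1]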

sgm/modules/diffusionmodules/denoiser.py

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ def forward(
         c_noise = self.possibly_quantize_c_noise(c_noise.reshape(sigma_shape))

         if "mask" in cond:
-            mask = cond.pop("mask")[...,None,None,None]
+            mask = cond.pop("mask")[...,None,None,None].to(dtype=input.dtype)
             input = input * (1 - mask) + input * mask

         return (
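
The fix casts the mask to the input's dtype before broadcasting it; besides matching precisions under mixed-precision training, the cast also sidesteps the error PyTorch raises for 1 - mask if the mask ever arrives as a bool tensor. Note that, as written, input * (1 - mask) + input * mask simplifies back to input, so the sketch below blends with a separate hypothetical reference tensor purely to show the intended broadcasting, assuming a latent of shape (B, F, C, H, W) and a per-frame mask of shape (B, F):

import torch

# Shapes are assumptions: a 5D latent (B, F, C, H, W) and a per-frame 0/1 mask (B, F).
B, F, C, H, W = 2, 21, 4, 32, 32
latent = torch.randn(B, F, C, H, W)
reference = torch.randn_like(latent)      # hypothetical tensor to keep where mask == 1
mask = torch.randint(0, 2, (B, F))        # integer mask, as a conditioner might emit

mask = mask[..., None, None, None].to(dtype=latent.dtype)   # (B, F, 1, 1, 1)
blended = latent * (1 - mask) + reference * mask            # keep latent where mask == 0
assert blended.shape == latent.shape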

sgm/modules/diffusionmodules/guiders.py

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ def prepare_inputs(self, x, s, c, uc):
         c_out = dict()

         for k in c:
-            if k in ["vector", "crossattn", "concat"]:
+            if k in ["vector", "crossattn", "concat", "mask", "plucker"]:
                 c_out[k] = torch.cat((uc[k], c[k]), 0)
             else:
                 assert c[k] == uc[k]
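
VanillaCFG batches the unconditional and conditional passes together, so every tensor-valued conditioning entry has to be concatenated along the batch dimension; adding "mask" and "plucker" to the concat list keeps them out of the assert branch, which only works for scalar-like entries. A minimal sketch of that doubling pattern (prepare_inputs_sketch is an illustrative stand-in, not the class method itself):

import torch


def prepare_inputs_sketch(x, sigma, c, uc,
                          cat_keys=("vector", "crossattn", "concat", "mask", "plucker")):
    """Stack uncond and cond batches so the denoiser runs both in one forward pass."""
    c_out = {}
    for k in c:
        if k in cat_keys:
            c_out[k] = torch.cat((uc[k], c[k]), 0)   # uncond first, then cond
        else:
            assert c[k] == uc[k]                     # only safe for non-tensor entries
            c_out[k] = c[k]
    return torch.cat([x] * 2), torch.cat([sigma] * 2), c_out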

sgm/modules/diffusionmodules/wrappers.py

Lines changed: 18 additions & 5 deletions
@@ -1,6 +1,7 @@
 import torch
 import torch.nn as nn
 from packaging import version
+from einops import rearrange, repeat

 OPENAIUNETWRAPPER = "sgm.modules.diffusionmodules.wrappers.OpenAIWrapper"

@@ -50,12 +51,24 @@ def forward(
     ) -> torch.Tensor:
         x = torch.cat((x, c.get("concat", torch.Tensor([]).type_as(x))), dim=2)

+
+        b = x.shape[0]
+        f = x.shape[1]
+        x = rearrange(x, "b f c h w -> (b f) c h w")
+        dense_y=rearrange(c["plucker"], "b f c h w -> (b f) c h w")
+
         #TODO: remove
-        c["crossattn"] = torch.zeros((x.shape[0], 1, 1024)).type_as(x).to(x.device)
-        return self.diffusion_model(
+        c = torch.zeros((b, 1, 1024)).type_as(x).to(x.device)
+        c = repeat(c, "b 1 c -> (b f) 1 c", f=f)
+        t = repeat(t, "b -> (b f)", f=f)
+
+        out = self.diffusion_model(
             x,
             t=t,
-            y=c["crossattn"],
-            dense_y=c["plucker"],
+            y=c, # c["crossattn"]
+            dense_y=dense_y,
+            num_frames=f,
             **kwargs,
-        )
+        )
+        out = rearrange(out, "(b f) c h w -> b f c h w", f=f)
+        return out
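
The wrapper now folds the frame axis into the batch axis before calling the UNet, flattens the plucker embedding the same way, repeats the timestep per frame, and unflattens the output at the end. A minimal sketch of that flatten/repeat/unflatten pattern, assuming x of shape (B, F, C, H, W), a plucker tensor of shape (B, F, C', H, W), t of shape (B,), and a model taking the same keyword arguments as the diff (y, dense_y, num_frames):

import torch
from einops import rearrange, repeat


def call_per_frame(model, x, plucker, t):
    """Flatten (batch, frame) for a frame-wise model call, then restore the frame axis."""
    b, f = x.shape[:2]
    x = rearrange(x, "b f c h w -> (b f) c h w")
    dense_y = rearrange(plucker, "b f c h w -> (b f) c h w")
    t = repeat(t, "b -> (b f)", f=f)                  # one timestep entry per frame
    y = torch.zeros((b * f, 1, 1024)).type_as(x)      # placeholder cross-attn context
    out = model(x, t=t, y=y, dense_y=dense_y, num_frames=f)
    return rearrange(out, "(b f) c h w -> b f c h w", f=f)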
