@@ -1,5 +1,3 @@
-from typing import Tuple
-
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
@@ -211,7 +209,7 @@ def forward(self, x, context):
 class ScreenAI(nn.Module):
     def __init__(
         self,
-        patch_size: Tuple[int, int] = (16, 16),
+        patch_size: int,  # Tuple[int, int] = (16, 16),
         image_size: int = 224,
         dim: int = 512,
         depth: int = 6,
@@ -233,12 +231,6 @@ def __init__(
         self.multi_modal_encoder_depth = multi_modal_encoder_depth
         self.llm_decoder_depth = llm_decoder_depth
 
-        # Aspect-ratio-preserving grid with max 25 patches; split the image into patches
-        self.grid = (
-            image_size // patch_size[0],
-            image_size // patch_size[1],
-        )
-
         # Patch embedding
         self.patch_embedding = nn.Conv2d(
             3, dim, patch_size, patch_size
@@ -285,14 +277,15 @@ def __init__(
             for _ in range(llm_decoder_depth)
         )
 
-    def forward(self, img: Tensor, text: Tensor) -> Tensor:
+    def forward(self, text: Tensor, img: Tensor) -> Tensor:
         # Image patch
         img = rearrange(
             img,
             "b c (h p1) (w p2) -> b (h w) (p1 p2 c)",
             p1=self.patch_size[0],
             p2=self.patch_size[1],
         )
+        print(f"Image patch shape: {img.shape}")
 
         # vit
         img = self.vit(img, return_embeddings=True)
@@ -306,8 +299,8 @@ def forward(self, img: Tensor, text: Tensor) -> Tensor:
 
         # T5 Multimodal encoder
         for attn, ff in self.mme_layers:
-            x, _, _ = attn(x, x, x)
-            x = ff(x)
+            x, _, _ = attn(x, x, x) + x
+            x = ff(x) + x
 
         # Pass the k, v values into the cross attention of llm
         for cross_attn, attn in self.llm_layers:
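
The __init__ hunk above narrows patch_size to a plain int, while the forward hunk still indexes self.patch_size[0] and self.patch_size[1] in the rearrange call. A minimal sketch of one way to reconcile the two, using a hypothetical pair() helper that is not part of this repo and that assumes square patches when an int is passed:

# Hypothetical helper (not in this repo): normalize an int patch size to a
# (height, width) pair so both nn.Conv2d and the rearrange indexing keep working.
from typing import Tuple, Union


def pair(value: Union[int, Tuple[int, int]]) -> Tuple[int, int]:
    # An int means square patches; a tuple is passed through unchanged.
    return value if isinstance(value, tuple) else (value, value)


patch_size = pair(16)  # -> (16, 16)
p1, p2 = patch_size    # usable as self.patch_size[0] and self.patch_size[1]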
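
In the last hunk, attn(x, x, x) + x adds the residual to the value that the left-hand side then unpacks into three names; if the attention module returns a tuple, as the removed lines suggest, that addition raises a TypeError. Below is a minimal sketch of the intended residual pattern, written with torch.nn.MultiheadAttention (which returns two values, not three) purely as a stand-in assumption for the repo's attention layer:

# Sketch of a self-attention + feed-forward block with residual (skip) connections.
# nn.MultiheadAttention is a stand-in assumption for the repo's attn module.
import torch
from torch import nn


class EncoderBlock(nn.Module):
    def __init__(self, dim: int = 512, heads: int = 8):
        super().__init__()
        # batch_first=True -> inputs and outputs are (batch, seq, dim)
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.ff = nn.Sequential(
            nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Unpack the attention output first, then add the residual.
        attn_out, _ = self.attn(x, x, x)
        x = attn_out + x
        x = self.ff(x) + x
        return x


tokens = torch.randn(2, 10, 512)
print(EncoderBlock()(tokens).shape)  # torch.Size([2, 10, 512])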