
Commit d001550

Author: Kye (committed)
Message: cm3leon
1 parent 3db36e5 commit d001550

File tree

6 files changed (+113, -100 lines)

6 files changed

+113
-100
lines changed

README.md

Lines changed: 6 additions & 2 deletions
@@ -38,11 +38,15 @@ To start with CM3Leon in a PyTorch environment:
 import torch
 from cm3.model import CM3
 
+# usage
 img = torch.randn(1, 3, 256, 256)
-text = torch.randint(0, 20000, (1, 1024))
+caption = torch.randint(0, 20000, (1, 1024))
 
 model = CM3()
-output = model(text, img)
+
+output = model(img, caption)
+print(output.shape) # (1, 1024, 20000)
+
 
 ```
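The updated snippet passes the image first and the caption second, and the printed shape suggests the model returns one 20,000-way distribution per caption position. A minimal training-step sketch built on that reading (the cross-entropy wiring is illustrative, not part of the commit, and assumes the forward pass returns raw logits of shape (batch, seq_len, vocab)):

import torch
import torch.nn.functional as F
from cm3.model import CM3

model = CM3()
img = torch.randn(1, 3, 256, 256)
caption = torch.randint(0, 20000, (1, 1024))

# Assumed: logits of shape (1, 1024, 20000), per the README's comment.
logits = model(img, caption)

# Next-token objective: position t predicts caption token t + 1.
loss = F.cross_entropy(
    logits[:, :-1].reshape(-1, logits.size(-1)),
    caption[:, 1:].reshape(-1),
)
loss.backward()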
cm3/model.py

Lines changed: 39 additions & 40 deletions
@@ -11,14 +11,16 @@
     ViTransformerWrapper,
 )
 
-#logging
-logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
+# logging
+logging.basicConfig(
+    level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
+)
 
 
-#main model
+# main model
 class CM3(Module):
     """
-    Andromeda is a transformer-based model architecture. It initializes with 
+    Andromeda is a transformer-based model architecture. It initializes with
     a Transformer and AutoregressiveWrapper with default or user-specified parameters.
 
     Initialize the model with specified or default parameters.
@@ -41,37 +43,33 @@ class CM3(Module):
     - attn_qk_norm: Attention query-key normalization
     - attn_qk_norm_dim_scale: Attention query-key normalization dimension scale
     """
+
     def __init__(
-            self,
-            num_tokens=50432,
-            max_seq_len=8192,
-            dim=2560,
-            depth=32,
-            dim_head=128,
-            heads=24,
-            use_abs_pos_emb=False,
-            alibi_pos_bias=True,
-            alibi_num_heads=12,
-            rotary_xpos=True,
-            attn_flash=True,
-            image_size=256,
-            patch_size=32,
-            attn_one_kv_head=True, # multiquery attention
-            qk_norm=True,
-            attn_qk_norm=True,
-            attn_qk_norm_dim_scale=True,
-    ):
+        self,
+        num_tokens=50432,
+        max_seq_len=8192,
+        dim=2560,
+        depth=32,
+        dim_head=128,
+        heads=24,
+        use_abs_pos_emb=False,
+        alibi_pos_bias=True,
+        alibi_num_heads=12,
+        rotary_xpos=True,
+        attn_flash=True,
+        image_size=256,
+        patch_size=32,
+        attn_one_kv_head=True, # multiquery attention
+        qk_norm=True,
+        attn_qk_norm=True,
+        attn_qk_norm_dim_scale=True,
+    ):
         super().__init__()
 
         self.encoder = ViTransformerWrapper(
             image_size=image_size,
             patch_size=patch_size,
-            attn_layers=Encoder(
-                dim=dim,
-                depth=depth,
-                dim_head=dim_head,
-                heads=heads
-            )
+            attn_layers=Encoder(dim=dim, depth=depth, dim_head=dim_head, heads=heads),
         )
 
         self.transformer = Transformer(
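The constructor change collapses the Encoder arguments onto one line without changing behavior. For orientation, a small sketch of what this encoder produces (shapes assume the defaults above: 256×256 images with 32×32 patches give an 8×8 grid of 64 patch embeddings; the shallow depth here is only to keep the sketch light):

import torch
from x_transformers import Encoder, ViTransformerWrapper

encoder = ViTransformerWrapper(
    image_size=256,
    patch_size=32,
    attn_layers=Encoder(dim=2560, depth=2, dim_head=128, heads=24),
)

img = torch.randn(1, 3, 256, 256)
# return_embeddings=True skips any classification head and yields
# one embedding per patch: (1, 64, 2560).
embeddings = encoder(img, return_embeddings=True)

forward later hands exactly these embeddings to the decoder as cross-attention context.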
@@ -91,30 +89,32 @@ def __init__(
             # qk_norm=qk_norm,
             # attn_qk_norm=attn_qk_norm,
             # attn_qk_norm_dim_scale=attn_qk_norm_dim_scale,
-            cross_attend=True
-        )
+            cross_attend=True,
+        ),
         )
 
         self.decoder = AutoregressiveWrapper(self.transformer)
 
     def mask_and_relocate(self, text_tokens):
-        #mask image span
-        text_tokens = text_tokens.masked_fill(text_tokens==self.im_idx, self.mask_token)
+        # mask image span
+        text_tokens = text_tokens.masked_fill(
+            text_tokens == self.im_idx, self.mask_token
+        )
 
-        #relocate to end
-        image_span = text_tokens[text_tokens==self.im_end_idx].unsqueeze(1)
+        # relocate to end
+        image_span = text_tokens[text_tokens == self.im_end_idx].unsqueeze(1)
         text_tokens = torch.cat([text_tokens, image_span], dim=1)
         return text_tokens
-
+
     def cm3_loss(self, log_probs, labels):
-        #cm3 loss prediction
+        # cm3 loss prediction
         loss = nn.NLLLoss()(log_probs, labels)
         return loss
 
     # def forward(self, text_tokens, img, **kwargs):
     #     try:
     #         encoded_img = self.encoder(img, return_embeddings=True)
-
+
     #         #mask and relocate image span in text tokens
     #         text_tokens = self.mask_and_relocate(text_tokens)
 
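mask_and_relocate follows the CM3 recipe: mask the image span in place, then move it to the end of the sequence so the causal decoder infills it last. A self-contained toy version (the special-token ids are hypothetical; note that the class never defines self.im_idx, self.mask_token, or self.im_end_idx, so the method as committed would raise AttributeError):

import torch

# Hypothetical special-token ids, for illustration only.
IM_IDX, IM_END_IDX, MASK = 7, 8, 9

def mask_and_relocate(text_tokens: torch.Tensor) -> torch.Tensor:
    # Mask the image span in place...
    masked = text_tokens.masked_fill(text_tokens == IM_IDX, MASK)
    # ...then append the relocated span marker so the decoder predicts it last.
    span = masked[masked == IM_END_IDX].unsqueeze(0)
    return torch.cat([masked, span], dim=1)

tokens = torch.tensor([[1, 2, IM_IDX, IM_IDX, IM_END_IDX, 3]])
print(mask_and_relocate(tokens))  # tensor([[1, 2, 9, 9, 8, 3, 8]])

One caveat on cm3_loss: nn.NLLLoss expects log-probabilities (e.g. the output of log_softmax), not raw logits, so callers would need to apply log_softmax first.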
@@ -134,10 +134,9 @@ def cm3_loss(self, log_probs, labels):
     #         raise
 
     def forward(self, img, text):
-        try:
+        try:
             encoded = self.encoder(img, return_embeddings=True)
             return self.decoder(text, context=encoded)
         except Exception as error:
             print(f"Failed in forward method: {error}")
             raise
-
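The reformatted forward encodes the image once and hands the patch embeddings to the AutoregressiveWrapper as cross-attention context while it scores the text. For sampling rather than scoring, the wrapper exposes a generate method; a hedged sketch (the exact generate signature varies across x_transformers versions, so treat this as an assumption):

import torch
from cm3.model import CM3

model = CM3()
img = torch.randn(1, 3, 256, 256)
prompt = torch.randint(0, 20000, (1, 1))  # a single start token

# Encode once, then sample caption tokens conditioned on the image.
encoded = model.encoder(img, return_embeddings=True)
sampled = model.decoder.generate(prompt, 256, context=encoded)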

cm3/tokenizer.py

Lines changed: 13 additions & 18 deletions
@@ -10,10 +10,10 @@
 
 class Tokenizer:
     """
-    A SentencePieceTokenizer is a tokenizer that uses a pretrained SentencePiece model 
-    to convert text into tokens and vice versa. 
+    A SentencePieceTokenizer is a tokenizer that uses a pretrained SentencePiece model
+    to convert text into tokens and vice versa.
 
-    It includes the ability to add special tokens for infilling tasks and provides 
+    It includes the ability to add special tokens for infilling tasks and provides
     functionality to encode and decode text with or without implicit leading spaces.
 
     Parameters:
@@ -32,6 +32,7 @@ class Tokenizer:
 
 
     """
+
     def __init__(self, model_path: str):
         # reload tokenizer
         assert os.path.isfile(model_path), model_path
@@ -49,28 +50,23 @@ def __init__(self, model_path: str):
         self.middle_id: Optional[int] = self.sp_model.piece_to_id("▁<MID>") or None
         self.suffix_id: Optional[int] = self.sp_model.piece_to_id("▁<SUF>") or None
         self.eot_id: Optional[int] = self.sp_model.piece_to_id("▁<EOT>") or None
-
-        #generates text until a modality break token is detected => then img is sampled
+
+        # generates text until a modality break token is detected => then img is sampled
         self.break_id: Optional[int] = self.sp_model.piece_to_id("_<BREAK>") or None
         self.image_id: Optional[int] = self.sp_model.piece_to_id("_<IMG>") or None
         self.infill_id: Optional[int] = self.sp_model.piece_to_id("_<INFILL>") or None
-
-        logger.info(f"BREAK ID: {self.break_id} - IMG ID: {self.image_id} - INFILL ID: {self.infill_id}")
-
 
+        logger.info(
+            f"BREAK ID: {self.break_id} - IMG ID: {self.image_id} - INFILL ID: {self.infill_id}"
+        )
 
         logger.info(
             f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id} "
             f"- PRE ID: {self.prefix_id} - MID ID: {self.middle_id} - SUF ID: {self.suffix_id} - EOT ID: {self.eot_id}"
         )
         assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
 
-    def encode(
-        self,
-        s: str,
-        bos: bool,
-        eos: bool
-    ) -> List[int]:
+    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
         assert type(s) is str
         t = self.sp_model.encode(s)
         if bos:
@@ -89,7 +85,7 @@ def encode_infilling(self, s: str) -> List[int]:
     def decode_infilling(self, t: List[int]) -> str:
         """Decode a string without an implicit leading space."""
         return self.sp_model.decode([self.sp_model.piece_to_id("☺")] + t)[1:]
-
+
 
 # class CM3LeonTokenizer(Tokenizer):
 #     """
@@ -127,7 +123,7 @@ def decode_infilling(self, t: List[int]) -> str:
 #         model_path=model_path,
 #         query_text="A photo of an image segment",
 #     )
-
+
 #     def encode(
 #         self,
 #         s: str = None,
@@ -149,9 +145,8 @@ def decode_infilling(self, t: List[int]) -> str:
 #     )
 
 #     #combine text, tokens and image embeddings
-#     #starting with a <break> token followed by img embeds 
+#     #starting with a <break> token followed by img embeds
 #     # and ending with a eos token
 
 #     seq = text + [self.break_id] + img + [self.eos_id]
 #     return seq
-
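The tokenizer diff is formatting only: docstring whitespace, a collapsed encode signature, and wrapped logger calls. For reference, a usage sketch (the model path is hypothetical; Tokenizer asserts the file exists and it must point at a trained SentencePiece model):

from cm3.tokenizer import Tokenizer

# Hypothetical path to a trained SentencePiece model file.
tokenizer = Tokenizer(model_path="checkpoints/tokenizer.model")

ids = tokenizer.encode("A photo of a dog", bos=False, eos=False)

# decode_infilling decodes without the implicit leading space that
# SentencePiece would otherwise introduce.
text = tokenizer.decode_infilling(ids)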
