
Commit 50e45b0: Finalizing

1 parent 4280a4a

File tree: 6 files changed, +95 -63 lines


eval.py (+3 -3)

@@ -18,7 +18,7 @@
 torch._inductor.config.triton.cudagraphs = True
 torch._dynamo.config.cache_size_limit = 100000
 
-from sentencepiece import SentencePieceProcessor
+from tokenizer import get_tokenizer
 
 from model import Transformer
 
@@ -217,7 +217,7 @@ def main(
     assert checkpoint_path.is_file(), checkpoint_path
 
     tokenizer_path = checkpoint_path.parent / "tokenizer.model"
-    assert tokenizer_path.is_file(), tokenizer_path
+    assert tokenizer_path.is_file(), str(tokenizer_path)
 
     device = 'cuda'
     precision = torch.bfloat16
@@ -231,7 +231,7 @@ def main(
 
     model.eval()
 
-    tokenizer = SentencePieceProcessor(model_file=str(tokenizer_path))
+    tokenizer = get_tokenizer(tokenizer_path, checkpoint_path)
 
     torch.manual_seed(1234)
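The recurring change across eval.py, generate.py, and quantize.py is the swap from a hard-coded SentencePieceProcessor to a get_tokenizer helper that also receives the checkpoint path. tokenizer.py itself is not touched by this commit, so the following is only a sketch of how such a dispatcher could work, assuming Llama 3 checkpoints carry a tiktoken-format vocabulary and everything else carries a SentencePiece model:

# Hypothetical sketch of tokenizer.get_tokenizer; the real implementation
# lives in tokenizer.py, which this commit does not modify.
from pathlib import Path

def get_tokenizer(tokenizer_path: Path, checkpoint_path: Path):
    if "Llama-3" in str(checkpoint_path):
        # Llama 3: tokenizer.model is a tiktoken BPE ranks file (the
        # conversion script copies it out of original/tokenizer.model).
        import tiktoken
        from tiktoken.load import load_tiktoken_bpe
        return tiktoken.Encoding(
            name=tokenizer_path.name,
            pat_str=r"\S+|\s+",  # placeholder split regex, not Llama 3's real one
            mergeable_ranks=load_tiktoken_bpe(str(tokenizer_path)),
            special_tokens={},
        )
    # Default path: exactly what the replaced inline code constructed.
    from sentencepiece import SentencePieceProcessor
    return SentencePieceProcessor(model_file=str(tokenizer_path))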

generate.py (+2 -8)

@@ -32,8 +32,6 @@ def device_sync(device):
 wd = Path(__file__).parent.parent.resolve()
 sys.path.append(str(wd))
 
-from sentencepiece import SentencePieceProcessor
-
 from model import Transformer
 from tokenizer import get_tokenizer
 
@@ -268,11 +266,8 @@ def main(
     """
     assert checkpoint_path.is_file(), checkpoint_path
 
-    if "Llama-3" in str(checkpoint_path):
-        tokenizer_path = checkpoint_path.parent / "original/tokenizer.model"
-    else:
-        tokenizer_path = checkpoint_path.parent / "tokenizer.model"
-    assert tokenizer_path.is_file(), tokenizer_path
+    tokenizer_path = checkpoint_path.parent / "tokenizer.model"
+    assert tokenizer_path.is_file(), str(tokenizer_path)
 
     global print
     from tp import maybe_init_dist
@@ -302,7 +297,6 @@ def main(
 
     tokenizer = get_tokenizer(tokenizer_path, checkpoint_path)
 
-    #tokenizer = SentencePieceProcessor(model_file=str(tokenizer_path))
     encoded = encode_tokens(tokenizer, prompt, bos=True, device=device)
     prompt_length = encoded.size(0)
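generate.py can drop its Llama 3 special case for the tokenizer path because the conversion script below now copies original/tokenizer.model up to the checkpoint root. The encode_tokens call in the last hunk is defined elsewhere in generate.py; as a rough sketch of what it does, assuming both tokenizer backends expose encode() and a bos id (the helper's body is not in this diff):

# Hypothetical sketch of encode_tokens as invoked above; the real helper
# is defined earlier in generate.py and is unchanged by this commit.
import torch

def encode_tokens(tokenizer, string, bos=True, device="cuda"):
    tokens = tokenizer.encode(string)           # token ids from either backend
    if bos:
        tokens = [tokenizer.bos_id()] + tokens  # prepend beginning-of-sequence id
    return torch.tensor(tokens, dtype=torch.int, device=device)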

mixtral-moe/generate.py (+1 -1)

@@ -175,7 +175,7 @@ def main(
     assert checkpoint_path.is_file(), checkpoint_path
 
     tokenizer_path = checkpoint_path.parent / "tokenizer.model"
-    assert tokenizer_path.is_file(), tokenizer_path
+    assert tokenizer_path.is_file(), str(tokenizer_path)
 
     global print
     rank = maybe_init_dist()

model.py (-1)

@@ -66,7 +66,6 @@ def from_name(cls, name: str):
     "stories15M": dict(n_layer=6, n_head=6, dim=288),
     "stories110M": dict(n_layer=12, n_head=12, dim=768),
     "Llama-3-8B": dict(block_size=8192, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=128256),
-    "Llama-3-70B": dict(block_size=8192, n_layer=80, n_head=64, n_local_heads=8, dim=8192, intermediate_size=28672, vocab_size=128256),
 }
 
 class KVCache(nn.Module):
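The Llama-3-70B entry is removed to match the conversion script below, which cannot merge the 70B model's multiple consolidated.NN.pth shards and therefore only supports the 8B variant. A usage sketch of how the surviving entry resolves, assuming from_name matches on the checkpoint directory name as the conversion script's ModelArgs.from_name(model_name) call suggests:

# Usage sketch: the config table above is keyed by model name, so a
# checkpoint directory named Llama-3-8B resolves to the 8B hyperparameters.
from model import ModelArgs

config = ModelArgs.from_name("Llama-3-8B")
print(config.n_layer, config.n_head, config.dim)  # 32 32 4096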

quantize.py (+3 -3)

@@ -9,7 +9,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from sentencepiece import SentencePieceProcessor
+from tokenizer import get_tokenizer
 
 try:
     from GPTQ import GenericGPTQRunner, InputRecorder
@@ -578,8 +578,8 @@ def quantize(
         quant_handler = WeightOnlyInt4GPTQQuantHandler(model, groupsize)
 
         tokenizer_path = checkpoint_path.parent / "tokenizer.model"
-        assert tokenizer_path.is_file(), tokenizer_path
-        tokenizer = SentencePieceProcessor(model_file=str(tokenizer_path))
+        assert tokenizer_path.is_file(), str(tokenizer_path)
+        tokenizer = get_tokenizer(tokenizer_path, checkpoint_path)
 
         quantized_state_dict = quant_handler.create_quantized_state_dict(
             tokenizer,

scripts/convert_hf_checkpoint.py (+86 -47)

@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 import json
 import re
+import shutil
 import sys
 from pathlib import Path
 from typing import Optional
@@ -27,33 +28,62 @@ def convert_hf_checkpoint(
     if model_name is None:
         model_name = checkpoint_dir.name
 
+    # Llama 3 8B doesn't need conversion; instead, the original/consolidated.NN.pth
+    # files need to be copied into model.pth.
+    # Llama 3 70B can't easily be merged into one model.pth file, though, since the
+    # names of the weights in the state dict are the same in each consolidated.NN.pth
+    # file. Thus, it is not currently supported.
+    # Along with this, we need to copy original/tokenizer.model (a tiktoken-format
+    # file) to tokenizer.model in the checkpoint directory.
+    is_llama3 = "Llama-3" in model_name
+    if is_llama3:
+        # Check whether there are multiple original/consolidated.NN.pth files and
+        # report an error if so, since merging them is not supported for Llama 3.
+        original_dir = checkpoint_dir / "original"
+        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
+        bin_files = [bin for bin in original_dir.iterdir() if pattern.match(bin.name)]
+        if len(bin_files) > 1:
+            raise ValueError(
+                f"Multiple consolidated.NN.pth files found in {original_dir}. "
+                "Merging them into one model.pth file is not supported for Llama 3.")
+
+
     config = ModelArgs.from_name(model_name)
     print(f"Model config {config.__dict__}")
 
     # Load the json file containing weight mapping
-    model_map_json = checkpoint_dir / "pytorch_model.bin.index.json"
-
-    assert model_map_json.is_file()
-
-    with open(model_map_json) as json_map:
-        bin_index = json.load(json_map)
-
-    weight_map = {
-        "model.embed_tokens.weight": "tok_embeddings.weight",
-        "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
-        "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
-        "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
-        "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
-        'model.layers.{}.self_attn.rotary_emb.inv_freq': None,
-        'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight',
-        "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
-        "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
-        "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
-        "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
-        "model.norm.weight": "norm.weight",
-        "lm_head.weight": "output.weight",
-    }
-    bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
+    if not is_llama3:
+        model_map_json = checkpoint_dir / "pytorch_model.bin.index.json"
+
+        assert model_map_json.is_file()
+
+        with open(model_map_json) as json_map:
+            bin_index = json.load(json_map)
+
+        weight_map = {
+            "model.embed_tokens.weight": "tok_embeddings.weight",
+            "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
+            "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
+            "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
+            "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
+            'model.layers.{}.self_attn.rotary_emb.inv_freq': None,
+            'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight',
+            "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
+            "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
+            "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
+            "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
+            "model.norm.weight": "norm.weight",
+            "lm_head.weight": "output.weight",
+        }
+        bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
+    else:
+        # There is no pytorch_model.bin.index.json file for Llama 3; instead, we
+        # just load all original/consolidated.NN.pth files directly.
+        weight_map = None
+        original_dir = checkpoint_dir / "original"
+        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
+        bin_files = {bin for bin in original_dir.iterdir() if pattern.match(bin.name)}
 
     def permute(w, n_head):
         dim = config.dim
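The shard-name pattern introduced above is anchored and requires exactly two digits. A quick standalone check of its behavior (the file names here are illustrative):

# Demonstrates the consolidated.NN.pth pattern from the hunk above:
# exactly two digits are required, so two-digit shard names match
# while anything else falls through.
import re

pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
for name in ["consolidated.00.pth", "consolidated.01.pth",
             "consolidated.0.pth", "consolidated.000.pth", "tokenizer.model"]:
    print(f"{name}: {bool(pattern.match(name))}")
# consolidated.00.pth and consolidated.01.pth print True; the rest print False.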
@@ -68,32 +98,41 @@ def permute(w, n_head):
         state_dict = torch.load(str(file), map_location="cpu", mmap=True, weights_only=True)
         merged_result.update(state_dict)
     final_result = {}
-    for key, value in merged_result.items():
-        if "layers" in key:
-            abstract_key = re.sub(r'(\d+)', '{}', key)
-            layer_num = re.search(r'\d+', key).group(0)
-            new_key = weight_map[abstract_key]
-            if new_key is None:
-                continue
-            new_key = new_key.format(layer_num)
-        else:
-            new_key = weight_map[key]
-
-        final_result[new_key] = value
-
-    for key in tuple(final_result.keys()):
-        if "wq" in key:
-            q = final_result[key]
-            k = final_result[key.replace("wq", "wk")]
-            v = final_result[key.replace("wq", "wv")]
-            q = permute(q, config.n_head)
-            k = permute(k, config.n_local_heads)
-            final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
-            del final_result[key]
-            del final_result[key.replace("wq", "wk")]
-            del final_result[key.replace("wq", "wv")]
+    if weight_map is not None:
+        for key, value in merged_result.items():
+            if "layers" in key:
+                abstract_key = re.sub(r'(\d+)', '{}', key)
+                layer_num = re.search(r'\d+', key).group(0)
+                new_key = weight_map[abstract_key]
+                if new_key is None:
+                    continue
+                new_key = new_key.format(layer_num)
+            else:
+                new_key = weight_map[key]
+
+            final_result[new_key] = value
+
+        for key in tuple(final_result.keys()):
+            if "wq" in key:
+                q = final_result[key]
+                k = final_result[key.replace("wq", "wk")]
+                v = final_result[key.replace("wq", "wv")]
+                q = permute(q, config.n_head)
+                k = permute(k, config.n_local_heads)
+                final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
+                del final_result[key]
+                del final_result[key.replace("wq", "wk")]
+                del final_result[key.replace("wq", "wv")]
+    else:
+        final_result = merged_result
     print(f"Saving checkpoint to {checkpoint_dir / 'model.pth'}")
     torch.save(final_result, checkpoint_dir / "model.pth")
+    if is_llama3:
+        original_dir = checkpoint_dir / "original"
+        tokenizer_model = original_dir / "tokenizer.model"
+        tokenizer_model_tiktoken = checkpoint_dir / "tokenizer.model"
+        print(f"Copying {tokenizer_model} to {tokenizer_model_tiktoken}")
+        shutil.copy(tokenizer_model, tokenizer_model_tiktoken)
 
 if __name__ == '__main__':
     import argparse
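For non-Llama 3 checkpoints, the loop retained above fuses each layer's separate wq/wk/wv projections into a single wqkv tensor. Below is a standalone illustration of that concatenation with the Llama-3-8B shapes from model.py (n_head=32, n_local_heads=8, dim=4096); permute is stubbed as identity, since its full body (a rotary-embedding row reordering) is only partially visible in this diff:

# Illustrates the wqkv fusion from the hunk above with 8B-sized weights.
import torch

n_head, n_local_heads, dim = 32, 8, 4096
head_dim = dim // n_head  # 128

def permute(w, n_head):
    return w  # stand-in; the real permute reorders rows for rotary embeddings

q = torch.randn(n_head * head_dim, dim)         # wq weight: [4096, 4096]
k = torch.randn(n_local_heads * head_dim, dim)  # wk weight: [1024, 4096]
v = torch.randn(n_local_heads * head_dim, dim)  # wv weight: [1024, 4096]

wqkv = torch.cat([permute(q, n_head), permute(k, n_local_heads), v])
print(wqkv.shape)  # torch.Size([6144, 4096]): 4096 + 1024 + 1024 rows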
