
Commit 62b51c3

models : change convert-pt-to-ggml to use .tiktoken tokenizer files (ggml-org#725)
Parent commit: 6112887

1 file changed: +7 −11 lines


models/convert-pt-to-ggml.py

@@ -39,6 +39,7 @@
 import code
 import torch
 import numpy as np
+import base64
 
 #from transformers import GPTJForCausalLM
 #from transformers import GPT2TokenizerFast
@@ -224,18 +225,14 @@ def bytes_to_unicode():
 #code.interact(local=locals())
 
 multilingual = hparams["n_vocab"] == 51865
-dir_tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual" or "gpt2")
-
-#tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
-#print(tokenizer)
-#print(tokenizer.name_or_path)
-#print(len(tokenizer.additional_special_tokens))
+tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
 
 # output in the same directory as the model
 fname_out = dir_out + "/ggml-model.bin"
 
-with open(dir_tokenizer + "/vocab.json", "r", encoding="utf8") as f:
-    tokens = json.load(f)
+with open(tokenizer, "rb") as f:
+    contents = f.read()
+    tokens = {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)}
 
 # use 16-bit or 32-bit floats
 use_f16 = True
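
For reference, a minimal standalone sketch of the .tiktoken parsing the hunk above introduces: each non-empty line of a .tiktoken file holds a base64-encoded token and its integer rank, separated by whitespace. The function name and file path here are illustrative, not part of the commit.

    import base64

    def load_tiktoken_vocab(path):
        # Map raw token bytes -> integer rank, one "base64-token rank" pair per line.
        tokens = {}
        with open(path, "rb") as f:
            for line in f.read().splitlines():
                if not line:  # skip blank lines, as the diff's comprehension does
                    continue
                token, rank = line.split()
                tokens[base64.b64decode(token)] = int(rank)
        return tokens

    # Illustrative usage:
    # vocab = load_tiktoken_vocab("whisper/assets/gpt2.tiktoken")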
@@ -271,9 +268,8 @@ def bytes_to_unicode():
 fout.write(struct.pack("i", len(tokens)))
 
 for key in tokens:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
+    fout.write(struct.pack("i", len(key)))
+    fout.write(key)
 
 for name in list_vars.keys():
     data = list_vars[name].squeeze().numpy()
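
The last hunk can drop the byte_decoder round-trip because base64-decoding already yields raw token bytes, so they are written out directly. Below is a sketch of the resulting serialization, matching the struct.pack("i", ...) calls above: a native-endian int32 count, then a length-prefixed byte string per token. The helper name is illustrative.

    import struct

    def write_vocab(fout, tokens):
        # tokens maps raw bytes -> rank; keys are written in insertion order.
        fout.write(struct.pack("i", len(tokens)))   # number of tokens
        for key in tokens:
            fout.write(struct.pack("i", len(key)))  # token length in bytes
            fout.write(key)                         # raw token bytes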
