39 | 39 | import code
40 | 40 | import torch
41 | 41 | import numpy as np
   | 42 | +import base64
42 | 43 |
43 | 44 | #from transformers import GPTJForCausalLM
44 | 45 | #from transformers import GPT2TokenizerFast
@@ -224,18 +225,14 @@ def bytes_to_unicode():
224 | 225 | #code.interact(local=locals())
225 | 226 |
226 | 227 | multilingual = hparams["n_vocab"] == 51865
227 |     | -dir_tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual" or "gpt2")
228 |     | -
229 |     | -#tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
230 |     | -#print(tokenizer)
231 |     | -#print(tokenizer.name_or_path)
232 |     | -#print(len(tokenizer.additional_special_tokens))
    | 228 | +tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
233 | 229 |
234 | 230 | # output in the same directory as the model
235 | 231 | fname_out = dir_out + "/ggml-model.bin"
236 | 232 |
237 |     | -with open(dir_tokenizer + "/vocab.json", "r", encoding="utf8") as f:
238 |     | -    tokens = json.load(f)
    | 233 | +with open(tokenizer, "rb") as f:
    | 234 | +    contents = f.read()
    | 235 | +    tokens = {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)}
239 | 236 |
240 | 237 | # use 16-bit or 32-bit floats
241 | 238 | use_f16 = True
@@ -271,9 +268,8 @@ def bytes_to_unicode():
271 | 268 | fout.write(struct.pack("i", len(tokens)))
272 | 269 |
273 | 270 | for key in tokens:
274 |     | -    text = bytearray([byte_decoder[c] for c in key])
275 |     | -    fout.write(struct.pack("i", len(text)))
276 |     | -    fout.write(text)
    | 271 | +    fout.write(struct.pack("i", len(key)))
    | 272 | +    fout.write(key)
277 | 273 |
278 | 274 | for name in list_vars.keys():
279 | 275 |     data = list_vars[name].squeeze().numpy()
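The byte_decoder round-trip is no longer needed here: vocab.json stored each token as GPT-2's printable-unicode escaping of its bytes, while the tiktoken loader already yields raw bytes, so each key can be written out directly. The on-disk vocab layout is unchanged: a 32-bit token count, then a 32-bit length followed by the raw bytes for each token. A hedged round-trip sketch of that layout over an in-memory buffer (token values are hypothetical, and this omits the rest of the ggml file header):

import io
import struct

# Hypothetical token -> rank map, as produced by the tiktoken loader
tokens = {b"!": 0, b'"': 1, b"hello": 2}

# Writer side, mirroring the patch: count, then (length, raw bytes) per token
buf = io.BytesIO()
buf.write(struct.pack("i", len(tokens)))
for key in tokens:
    buf.write(struct.pack("i", len(key)))
    buf.write(key)

# Reader side: recover the byte strings in order
buf.seek(0)
(n_vocab,) = struct.unpack("i", buf.read(4))
vocab = []
for _ in range(n_vocab):
    (length,) = struct.unpack("i", buf.read(4))
    vocab.append(buf.read(length))

print(n_vocab, vocab)  # 3 [b'!', b'"', b'hello']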