-
Notifications
You must be signed in to change notification settings - Fork 121
Expand file tree
/
Copy pathindex.ts
More file actions
81 lines (69 loc) · 2.39 KB
/
index.ts
File metadata and controls
81 lines (69 loc) · 2.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import { Tokenizer } from "@mlc-ai/web-tokenizers";
/**
 * Smoke-tests a tokenizer built from a `tokenizer.json` file.
 *
 * Downloads the tokenizer definition of `openai/clip-vit-large-patch14`,
 * builds a tokenizer with `Tokenizer.fromJSON`, logs an encode/decode pass and
 * the vocabulary size, then checks the id <-> token mapping in both
 * directions for ids 0 and 49407.
 *
 * @returns {Promise<void>} Resolves once every check has passed.
 * @throws {Error} If the download fails or any mapping check does not hold.
 */
async function testJSONTokenizer() {
  console.log("JSON Tokenizer");
  const response = await fetch(
    "https://huggingface.co/openai/clip-vit-large-patch14/raw/main/tokenizer.json"
  );
  // Fail early on an HTTP error instead of handing an error body to the parser.
  if (!response.ok) {
    throw new Error(`Failed to fetch tokenizer.json: HTTP ${response.status}`);
  }
  const jsonBuffer = await response.arrayBuffer();
  const tok = await Tokenizer.fromJSON(jsonBuffer);

  const text = "What is the capital of Canada?";
  const ids = tok.encode(text);
  console.log(`ids=${ids}`);
  const decodedText = tok.decode(ids);
  console.log(`decoded=${decodedText}`);
  const vocabSize = tok.getVocabSize();
  console.log(`vocabSize=${vocabSize}`);

  // id -> token direction.
  const tok0 = tok.idToToken(0);
  console.log(`tok0=${tok0}`);
  if (tok0 !== "!") {
    throw new Error("Expect token 0 to be !");
  }
  const tok49407 = tok.idToToken(49407);
  console.log(`tok49407=${tok49407}`);
  if (tok49407 !== "<|endoftext|>") {
    throw new Error("Expect token 49407 to be <|endoftext|>");
  }

  // token -> id direction; the messages name the lookup that actually failed.
  const id0 = tok.tokenToId("!");
  console.log(`id0=${id0}`);
  if (id0 !== 0) {
    throw new Error("Expect id of ! to be 0");
  }
  const id49407 = tok.tokenToId("<|endoftext|>");
  console.log(`id49407=${id49407}`);
  if (id49407 !== 49407) {
    throw new Error("Expect id of <|endoftext|> to be 49407");
  }
}
/**
 * Smoke-tests a tokenizer built from a `tokenizer.model` file.
 *
 * Downloads the model file, builds a tokenizer with
 * `Tokenizer.fromSentencePiece`, logs an encode/decode pass, and checks that
 * the reported vocabulary size is 32000.
 *
 * @returns {Promise<void>} Resolves once the vocabulary size check has passed.
 * @throws {Error} If the download fails or the vocabulary size is not 32000.
 */
async function testLlamaTokenizer() {
  console.log("Llama Tokenizer");
  const response = await fetch(
    "https://huggingface.co/hongyij/web-llm-test-model/resolve/main/tokenizer.model"
  );
  // Fail early on an HTTP error instead of handing an error body to the parser.
  if (!response.ok) {
    throw new Error(`Failed to fetch tokenizer.model: HTTP ${response.status}`);
  }
  const modelBuffer = await response.arrayBuffer();
  const tok = await Tokenizer.fromSentencePiece(modelBuffer);

  const text = "What is the capital of Canada?";
  const ids = tok.encode(text);
  console.log(`ids=${ids}`);
  const decodedText = tok.decode(ids);
  console.log(`decoded=${decodedText}`);

  const vocabSize = tok.getVocabSize();
  console.log(`vocabSize=${vocabSize}`);
  if (vocabSize !== 32000) {
    throw new Error("Expect Llama to have vocab size 32000");
  }
}
/**
 * Smoke-tests encoding with a tokenizer built from the `tokenizer.json` of
 * `Snowflake/snowflake-arctic-embed-l`, and logs the resulting ids.
 *
 * Note kept from the original author: without COMPILE_WASM_RUNTIME, this
 * triggers parallel processing, leading to an error.
 *
 * @returns {Promise<void>} Resolves once the ids have been logged.
 * @throws {Error} If the download fails.
 */
async function testBertTokenizer() {
  console.log("Bert Tokenizer");
  const response = await fetch(
    "https://huggingface.co/Snowflake/snowflake-arctic-embed-l/raw/main/tokenizer.json"
  );
  // Fail early on an HTTP error instead of handing an error body to the parser.
  if (!response.ok) {
    throw new Error(`Failed to fetch tokenizer.json: HTTP ${response.status}`);
  }
  const jsonBuffer = await response.arrayBuffer();
  const tok = await Tokenizer.fromJSON(jsonBuffer);

  const text = "What is the capital of Canada?";
  const ids = tok.encode(text);
  console.log(ids);
}
/**
 * Runs the tokenizer smoke tests strictly one after another:
 * JSON first, then Bert, then Llama. Each test is awaited before the next
 * one starts, so the first rejection ends the run.
 *
 * @returns {Promise<void>} Resolves after the last test finishes.
 */
async function main() {
  const suites = [testJSONTokenizer, testBertTokenizer, testLlamaTokenizer];
  for (const runSuite of suites) {
    await runSuite();
  }
}
// Script entry point: start the test run. The promise returned by main() is
// not awaited or given a rejection handler here.
main()