forked from facebookresearch/code-prediction-transformer
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathconvert.py
24 lines (18 loc) · 786 Bytes
/
convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import json
from transformers.tokenization_utils import PreTrainedTokenizer
import utils
from transformers import PreTrainedTokenizerFast
# Convert raw AST records into training text: every input line is a JSON
# object whose "nodes" list is joined with spaces, run through the tokenizer,
# and written out as one line of space-separated token ids.
#
# tokenizer.encode() is called with its defaults, so the tokenizer's special
# tokens are added (add_special_tokens defaults to True in transformers).
# TODO: left unspecified by the original author.
ast_tok = "<ast>"
tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer/code-tokenizer.json")

# Explicit UTF-8 so that decoding the token text does not depend on the
# platform's default locale encoding.
with open("output/new_ast_raw.json", "r", encoding="utf-8") as fin, \
        open("output/converted_train.txt", "w", encoding="utf-8") as fout:
    for line in utils.file_tqdm(fin):
        json_line = json.loads(line)
        json_tokens = json_line["nodes"]
        # NOTE(review): "ext" presumably marks a record that continues a
        # previous one (so it must not get a fresh <ast> marker) - confirm
        # against the script that produces new_ast_raw.json.
        is_ext = json_line["ext"]
        text = " ".join(json_tokens)
        if not is_ext:
            # Records with a falsy "ext" are prefixed with the <ast> token.
            text = ast_tok + " " + text
        encoded = tokenizer.encode(text)
        # Each output line keeps the original format: ids separated by single
        # spaces, followed by a trailing space and a newline.
        fout.write(" ".join(str(e) for e in encoded) + " \n")