#!/bin/bash
# wmt2017_zhen.sh: the subword algorithm to use is passed as the first argument.
SUBWORD_ALGO=$1
SRC=zh
TGT=en
SAVE_PATH=wmt2017_zhen
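# Usage sketch (assumption: "yttm" is one of the algorithms accepted by
# `nlp_process learn_subword --model`; substitute whichever algorithm you use):
#
#   bash wmt2017_zhen.sh yttm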
# Fetch the raw text
nlp_data prepare_wmt \
--dataset wmt2017 \
--lang-pair ${SRC}-${TGT} \
--save-path ${SAVE_PATH}
# We use sacrebleu to fetch the dev and test sets of WMT17
sacrebleu -t wmt17/dev -l ${SRC}-${TGT} --echo src > ${SAVE_PATH}/dev.raw.${SRC}
sacrebleu -t wmt17/dev -l ${SRC}-${TGT} --echo ref > ${SAVE_PATH}/dev.raw.${TGT}
sacrebleu -t wmt17 -l ${SRC}-${TGT} --echo src > ${SAVE_PATH}/test.raw.${SRC}
sacrebleu -t wmt17 -l ${SRC}-${TGT} --echo ref > ${SAVE_PATH}/test.raw.${TGT}
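# Optional sanity check (not part of the original pipeline): sacrebleu emits
# one segment per line, so the source and reference sides of each split
# should have identical line counts.
wc -l ${SAVE_PATH}/dev.raw.${SRC} ${SAVE_PATH}/dev.raw.${TGT} \
      ${SAVE_PATH}/test.raw.${SRC} ${SAVE_PATH}/test.raw.${TGT}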
# Clean and tokenize the training and dev corpora
cd ${SAVE_PATH}
nlp_process clean_tok_para_corpus --src-lang ${SRC} \
--tgt-lang ${TGT} \
--src-corpus train.raw.${SRC} \
--tgt-corpus train.raw.${TGT} \
--src-tokenizer jieba \
--tgt-tokenizer moses \
--max-ratio 1.3 \
--min-num-words 3 \
--max-num-words 70 \
--src-save-path train.tok.${SRC} \
--tgt-save-path train.tok.${TGT}
nlp_process clean_tok_para_corpus --src-lang ${SRC} \
--tgt-lang ${TGT} \
--src-corpus dev.raw.${SRC} \
--tgt-corpus dev.raw.${TGT} \
--src-tokenizer jieba \
--tgt-tokenizer moses \
--max-ratio 1.3 \
--min-num-words 3 \
--max-num-words 70 \
--src-save-path dev.tok.${SRC} \
--tgt-save-path dev.tok.${TGT}
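# Optional check (not part of the original pipeline): cleaning drops pairs
# that violate the length-ratio and sentence-length limits above, so the
# tokenized training file will typically have fewer lines than the raw one.
wc -l train.raw.${SRC} train.tok.${SRC}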
# For the test corpus, we only tokenize the data; no length-based filtering is applied
nlp_process clean_tok_para_corpus --src-lang ${SRC} \
--tgt-lang ${TGT} \
--src-corpus test.raw.${SRC} \
--tgt-corpus test.raw.${TGT} \
--src-tokenizer jieba \
--tgt-tokenizer moses \
--src-save-path test.tok.${SRC} \
--tgt-save-path test.tok.${TGT}
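# Optional check (assumption: tokenization alone never drops lines): the
# tokenized test files should line up 1:1 with the raw files fetched above.
wc -l test.raw.${SRC} test.tok.${SRC}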
# Learn subword units from the training data. We learn independent source and target vocabularies
nlp_process learn_subword --corpus train.tok.${SRC} \
--model ${SUBWORD_ALGO} \
--save-dir ./${SRC}_model \
--vocab-size 44000
nlp_process learn_subword --corpus train.tok.${TGT} \
--model ${SUBWORD_ALGO} \
--save-dir ./${TGT}_model \
--vocab-size 33000
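# Optional inspection (not part of the original pipeline): each model
# directory should now contain the ${SUBWORD_ALGO}.model and
# ${SUBWORD_ALGO}.vocab files consumed by apply_subword below; their exact
# format depends on the chosen algorithm.
ls ${SRC}_model ${TGT}_model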
# Apply the learned codes to the training set
for LANG in ${SRC} ${TGT}
do
nlp_process apply_subword --model ${SUBWORD_ALGO} \
--output-type subword \
--model-path ${LANG}_model/${SUBWORD_ALGO}.model \
--vocab-path ${LANG}_model/${SUBWORD_ALGO}.vocab \
--corpus train.tok.${LANG} \
--save-path train.tok.${SUBWORD_ALGO}.${LANG}
done
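# Optional peek (not part of the original pipeline) at the segmented output;
# the subword boundary markers (e.g. "@@" or "▁") depend on the algorithm
# chosen above.
head -n 2 train.tok.${SUBWORD_ALGO}.${SRC} train.tok.${SUBWORD_ALGO}.${TGT}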
# Apply the learned codes to the dev and test sets
for LANG in ${SRC} ${TGT}
do
for SPLIT in dev test
do
nlp_process apply_subword --model ${SUBWORD_ALGO} \
--output-type subword \
--model-path ${LANG}_model/${SUBWORD_ALGO}.model \
--vocab-path ${LANG}_model/${SUBWORD_ALGO}.vocab \
--corpus ${SPLIT}.tok.${LANG} \
--save-path ${SPLIT}.tok.${SUBWORD_ALGO}.${LANG}
done
done
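# Final optional sanity check (not part of the original pipeline): every
# split should remain parallel, i.e. the source and target sides must have
# equal line counts.
for SPLIT in train dev test
do
    echo "${SPLIT}: $(wc -l < ${SPLIT}.tok.${SUBWORD_ALGO}.${SRC}) ${SRC} lines," \
         "$(wc -l < ${SPLIT}.tok.${SUBWORD_ALGO}.${TGT}) ${TGT} lines"
done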