From 9d53546976fe94fec6107b0b854f3fd923b3a824 Mon Sep 17 00:00:00 2001 From: Chris Ha Date: Mon, 17 Jul 2023 19:36:17 +0900 Subject: [PATCH] Update train_all.sh The hardcoded wiki dates are outdated. This updates the date and stores it in a variable (`DEFAULT_WIKI_DATE`) so it can be easily changed as necessary. --- kenlm_training/train_all.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kenlm_training/train_all.sh b/kenlm_training/train_all.sh index 6988a01b..80ac2eac 100755 --- a/kenlm_training/train_all.sh +++ b/kenlm_training/train_all.sh @@ -5,6 +5,7 @@ set -e LANGUAGES_WIKIPEDIA=( "es" "af" "ar" "arz" "as" "bn" "fr" "sw" "eu" "ca" "zh" "en" "hi" "ur" "id" "pt" "vi" "gu" "kn" "ml" "mr" "ta" "te" "yo" ) LANGUAGES_OSCAR=( "es" "af" "ar" "arz" "as" "bn" "fr" "sw" "eu" "ca" "zh" "en" "hi" "ur" "id" "pt" "vi" "gu" "kn" "ml" "mr" "te" ) +DEFAULT_WIKI_DATE=20230710 NDOC_FOR_LM=1_000_000 VOCAB_SIZE=65536 SMALL_VOCAB_SIZE=40000 @@ -29,7 +30,7 @@ train_language_and_dataset () { else echo "Downloading ${lang}" mkdir -p "data/${dataset}/cirrus/gz/" - python cc_net/get_wiki_cirrus.py dl --lang "${lang}" --output_dir "data/${dataset}/cirrus/gz" --date 20211115 + python cc_net/get_wiki_cirrus.py dl --lang "${lang}" --output_dir "data/${dataset}/cirrus/gz" --date ${DEFAULT_WIKI_DATE} echo "Downloaded Wikipedia cirrus for ${lang}" fi