-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtranslate.sh
executable file
·204 lines (171 loc) · 6.55 KB
/
translate.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/bin/bash
# @file translate.sh
# @brief Simple end-to-end translation.
#
# @author Darlene Stewart
#
# Technologies langagieres interactives / Interactive Language Technologies
# Inst. de technologie de l'information / Institute for Information Technology
# Conseil national de recherches Canada / National Research Council Canada
# Copyright 2010, Sa Majeste la Reine du Chef du Canada
# Copyright 2010, Her Majesty in Right of Canada
# Include NRC's bash library.
# sh_utils.sh is looked up through PATH.  This script calls error_exit, warn,
# verbose, run_cmd and print_nrc_copyright without defining them, so they are
# presumably provided by that library.
SH_UTILS=$(command -v sh_utils.sh)
if [[ -z "${SH_UTILS}" ]]; then
   echo "ERROR: Unable to find sh_utils.sh. Is your PATH set correctly for PortageII?" >&2
   exit 1
fi
# Quoted so that an installation path containing spaces is sourced correctly.
source "${SH_UTILS}"

print_nrc_copyright translate.sh 2010

# NOTE(review): presumably signals to the programs started from here that they
# are being run by another Portage script -- confirm against sh_utils.sh.
export PORTAGE_INTERNAL_CALL=1
# usage [MESSAGE...]
# Print each MESSAGE on its own line, then the help text, all on STDERR, and
# terminate the script with exit status 1.
usage() {
   local msg
   for msg in "$@"; do
      # Print the message verbatim: an unquoted echo would collapse runs of
      # whitespace and expand glob characters such as "*".
      printf '%s\n' "$msg" >&2
   done
   cat <<==EOF== >&2
Usage: translate.sh [OPTIONS] [SOURCE_FILE]
Translate input source text from SOURCE_FILE (or STDIN) according to the
currently trained models (relative to the current directory). The complete
translation pipeline is run: pre-processing, tokenization, translation,
truecasing (if models available), detokenization, post-processing.
If SOURCE_FILE is not provided, input is read from STDIN. The input text
is assumed to contain paragraphs separated by blank lines.
This script must be located in the root directory of the framework
along with the Makefile.params file from which it obtains settings.
Options:
-do|-decode-only translate without rescoring and confidence estimation
-wr|-with-rescoring translate with rescoring
-wc|-with-ce translate with confidence estimation
-v(erbose) increment the verbosity level by 1 (may be repeated)
-q(uiet) make terminal output as quiet as possible
-d(ebug) print debugging information
-n(otreally) just print the commands to execute, don't run them.
-h(elp) print this help message
==EOF==
   exit 1
}
# Command line processing
# Every recognized switch sets a flag variable.  Parsing stops at "--" (which
# is consumed) or at the first argument that does not start with "-".
VERBOSE=0
while [[ $# -gt 0 ]]; do
   case "$1" in
   -do|-decode-only)    DECODE_ONLY=1;;
   -wr|-with-rescoring) WITH_RESCORING=1;;
   -wc|-with-ce)        WITH_CE=1;;
   -nomode)             NOMODE=1;;
   -v|-verbose)         VERBOSE=$(( VERBOSE + 1 ));;
   -q|-quiet)           QUIET=1;;
   -d|-debug)           DEBUG=1;;
   -n|-notreally)       NOTREALLY=1;;
   -h|-help)            usage;;
   --)                  shift; break;;
   -*)                  error_exit "Unknown option $1.";;
   *)                   break;;
   esac
   shift
done

# Makefile.params is expected next to this script, whatever the current
# directory is.  $0 is quoted so a script path containing spaces still works.
ROOTDIR=$(dirname "$0")
MAKEFILE_PARAMS="${ROOTDIR}/Makefile.params"
[[ -e ${MAKEFILE_PARAMS} ]] \
   || error_exit "Makefile.params not found in directory with translate.sh."

# The input file is optional; at most one positional argument is accepted.
if [[ $# -gt 0 ]]; then
   SOURCE_FILE=$1; shift
   # "$*" keeps the whole message in a single argument; "$@" inside a string
   # would split it into one argument per leftover word.
   [[ $# -eq 0 ]] || error_exit "Unexpected argument(s): $*" >&2
fi
# get_param PARAM_NAME
# Read the framework's top-level Makefile.params to find the value of PARAM_NAME
#
# Prints the first whitespace-delimited word of the value on STDOUT.  Prints
# nothing when PARAM_NAME is not defined, and calls warn when it is defined
# with an empty value.  Reads the global MAKEFILE_PARAMS.
get_param() {
   local var=$1
   local text
   # The pattern has to be expanded from an unquoted variable: since bash 3.2
   # a quoted right-hand side of =~ is matched as a literal string, which
   # made this test fail for every parameter.
   local value_re='= *([^ ][^ ]*)'

   # "make -p" dumps its variable database as "NAME = value" / "NAME := value"
   # lines.  Its diagnostics are discarded on purpose; only the dump matters.
   # The name must be followed by an assignment operator so that, e.g.,
   # SRC_LANG does not also pick up SRC_LANGS, and only the first matching
   # line is kept so the captured value cannot run into a following line.
   if text=$(make -pn --file="${MAKEFILE_PARAMS}" 2> /dev/null \
                | grep -E -m 1 "^ *${var}[[:space:]]*[:?+!]*="); then
      if [[ ${text} =~ ${value_re} ]]; then
         echo "${BASH_REMATCH[1]}"
      else
         warn "Parameter $var found but has no value" >&2
      fi
   fi
}
# The three translation modes are mutually exclusive.  The concatenation of
# the flag variables is compared numerically, so it exceeds 1 as soon as two
# of them hold "1".
if [[ "${DECODE_ONLY}${WITH_RESCORING}${WITH_CE}" -gt "1" ]]; then
   error_exit "Specify only one of -decode-only, -with-rescoring, -with-ce."
fi

# Choose the mode switch placed on the translate.pl command line.  Highest
# precedence first: -nomode, then -with-ce, then -with-rescoring; decoding
# only is the default.
if [[ -n ${NOMODE} ]]; then
   MODE=""
elif [[ -n ${WITH_CE} ]]; then
   MODE="-with-ce"
elif [[ -n ${WITH_RESCORING} ]]; then
   MODE="-with-rescoring"
else
   MODE="-decode-only"
fi

# Rescoring requested: warn unless Makefile.params says its model is built.
if [[ -n ${WITH_RESCORING} ]]; then
   if [[ $(get_param DO_RESCORING) != 1 ]]; then
      warn "DO_RESCORING not enabled; rescoring model may not have been built."
   fi
fi

# Confidence estimation requested: same check for its model.
if [[ -n ${WITH_CE} ]]; then
   if [[ $(get_param DO_CE) != 1 ]]; then
      warn "DO_CE not enabled; confidence estimation model may not have been built."
   fi
fi
# Source and target languages, as set in Makefile.params.  An option is only
# generated for a parameter that has a value.
SRC_LANG=$(get_param SRC_LANG)
if [[ -n ${SRC_LANG} ]]; then
   SRC_OPT="-src=${SRC_LANG}"
fi

TGT_LANG=$(get_param TGT_LANG)
if [[ -n ${TGT_LANG} ]]; then
   TGT_OPT="-tgt=${TGT_LANG}"
fi

# Locale country codes.  Only the source one gets an option of its own; both
# are used below to build the default TMX language codes.
SRC_LOCALE_COUNTRY=$(get_param SRC_LOCALE_COUNTRY)
if [[ -n ${SRC_LOCALE_COUNTRY} ]]; then
   SRC_CC_OPT="-src-country=${SRC_LOCALE_COUNTRY}"
fi

TGT_LOCALE_COUNTRY=$(get_param TGT_LOCALE_COUNTRY)

# TMX source language code: TMX_SRC when it is set, otherwise the upper-cased
# source language followed by "-" and the source country code.
TMX_SRC=$(get_param TMX_SRC)
if [[ -n ${TMX_SRC} ]]; then
   TMX_SRC_OPT="-xsrc=${TMX_SRC}"
fi
if [[ -z ${TMX_SRC_OPT} && -n ${SRC_LANG} ]]; then
   TMX_SRC_OPT="-xsrc=$(echo -n $SRC_LANG | tr 'a-z' 'A-Z')-${SRC_LOCALE_COUNTRY}"
fi

# TMX target language code, derived the same way from the target settings.
TMX_TGT=$(get_param TMX_TGT)
if [[ -n ${TMX_TGT} ]]; then
   TMX_TGT_OPT="-xtgt=${TMX_TGT}"
fi
if [[ -z ${TMX_TGT_OPT} && -n ${TGT_LANG} ]]; then
   TMX_TGT_OPT="-xtgt=$(echo -n $TGT_LANG | tr 'a-z' 'A-Z')-${TGT_LOCALE_COUNTRY}"
fi
# PortageLive parallelism level from Makefile.params.  The parallel options
# are only generated when the level is set and greater than 1.
PAR_LEVEL=$(get_param PARALLELISM_LEVEL_PORTAGELIVE)
if [[ -n ${PAR_LEVEL} ]] && [[ ${PAR_LEVEL} -gt 1 ]]; then
   PARALLEL_OPT="-w=3 -n=${PAR_LEVEL}"
fi
# Locate the canoe.ini.cow file.
# We assume that this translate.sh script is at the root of the framework.
# A canoe.ini.cow in the current directory is used as is, with no -f option.
# Otherwise ${ROOTDIR}/translate/canoe.ini.cow is used; when it does not
# exist it is created as a symbolic link to ../models/decode/canoe.ini.cow.
CANOE_INI="canoe.ini.cow"
if [[ ! -e ${CANOE_INI} ]]; then
   CANOE_INI="${ROOTDIR}/translate/canoe.ini.cow"
   if [[ ! -e ${CANOE_INI} ]]; then
      # The link name is quoted so that a framework path containing spaces
      # is not split into several arguments to ln.
      ln -s "../models/decode/canoe.ini.cow" "${CANOE_INI}"
   fi
   # The escaped quotes end up inside the command string given to run_cmd.
   CANOE_INI_OPT="-f=\"${CANOE_INI}\""
fi
# count_tplm_models DIR
# Print the number of entries in DIR whose name ends in .tplm, leaving out
# log files (names starting with "log").
count_tplm_models() {
   local dir=$1
   local entry
   local count=0
   for entry in "${dir}"/*.tplm; do
      # When nothing matches, the glob is left as the literal pattern; that
      # is not a model and must not be counted.
      [[ -e ${entry} ]] || continue
      # Shell pattern rather than a quoted regex: since bash 3.2 a quoted
      # right-hand side of =~ is a literal string and never excluded anything.
      [[ ${entry##*/} == log* ]] || count=$(( count + 1 ))
   done
   echo "${count}"
}

# Determine if truecasing
# With DO_TRUECASING enabled, a tightly packed model (*.tplm) found in
# models/tc next to canoe.ini.cow selects -tctp; otherwise -tc is used.
if [[ $(get_param DO_TRUECASING) == 1 ]]; then
   TPLM_CNT=$(count_tplm_models "$(dirname "${CANOE_INI}")/models/tc")
   if [[ ${TPLM_CNT} -gt 0 ]]; then
      TC_OPT="-tctp"
      verbose 1 "Using tightly packed truecasing model (-tctp)."
   else
      TC_OPT="-tc"
      verbose 1 "Using text truecasing model (-tc)."
   fi
else
   verbose 1 "Not truecasing."
fi
# Pass on one -v per verbosity level requested on the command line.
for (( V = VERBOSE; V > 0; V-- )); do
   V_OPT="$V_OPT -v"
done

# Determine if we need to skip lowercasing of the input.
[[ $(get_param DONT_LOWERCASE_SRC) == 1 ]] && NOLC="-nolc"

[[ $QUIET ]] && Q_OPT="-quiet"

# Make sure plugins in the plugins directory in the framework will be used by translate.pl.
export PATH="${ROOTDIR}/plugins:$PATH"

# Escape the input file name before splicing it into the command string, so
# that a name containing spaces or shell metacharacters stays one argument.
# %q leaves ordinary names unchanged.  Nothing is added when no file was
# given, so translate.pl is then started without a file argument.
# NOTE(review): this assumes run_cmd re-parses its argument as shell text, as
# the escaped quotes in CANOE_INI_OPT suggest -- confirm against sh_utils.sh.
SOURCE_ARG=""
if [[ -n ${SOURCE_FILE} ]]; then
   SOURCE_ARG=$(printf '%q' "${SOURCE_FILE}")
fi

run_cmd "translate.pl $MODE $NOLC $SRC_OPT $TGT_OPT $SRC_CC_OPT $TMX_SRC_OPT $TMX_TGT_OPT $PARALLEL_OPT $TC_OPT $CANOE_INI_OPT $V_OPT $Q_OPT $SOURCE_ARG"

# A bare exit returns the status of the last command, i.e. that of run_cmd.
exit