\documentclass[11pt,letterpaper]{article}
\usepackage[letterpaper, top=1in, bottom=1in, left=1in, right=1.50in]{geometry}
\usepackage{xspace}
\usepackage{pifont}
\usepackage{url}
\usepackage{rotating}
\usepackage{tabularx}
% Enter the 21st century! Let's use utf8 and type accented characters directly.
\usepackage[utf8]{inputenc}
% T1 is a saner font encoding: keeps <, > and | looking right, and has code
% points for accented characters (replaces the default OT1)
\usepackage[T1]{fontenc}
% Modern font that allows ligatures to be encoded in such a way as to preserve
% searchability and cut-and-paste ability, e.g., "different" and "finally" have
% "ff" and "fi" each as two letters rather than a special font character.
\usepackage{lmodern}
% Need straight quotes in verbatim text.
\usepackage{upquote}
\usepackage{textcomp}
\newcommand\upquote[1]{\textquotesingle#1\textquotesingle}
\usepackage{alltt}
% This is needed to get bold in tt.
\renewcommand{\ttdefault}{txtt}
\newcommand{\bs}{\textbackslash{}}
% My own todo highlighting command
\usepackage{color}
\newcommand{\TODO}[1]{\emph{\textbf{\textcolor{red}{<TODO> #1 </TODO>}}}}
\newcommand{\New}{\textcolor{red}{[New]}\xspace}
\newcommand{\Changed}{\textcolor{red}{[Changed]}\xspace}
% Official typesetting of PORTAGEshared, now called Portage II
\newcommand{\PS}{PortageII\xspace}
% Try generating a PDF with coloured hyperlinks.
\newif\ifcolourlinks
%\colourlinksfalse
\colourlinkstrue
\ifcolourlinks
\usepackage{color}
\usepackage[colorlinks=true,
pdftex,
linktocpage,
backref=page,
pdftitle={PortageII 3.0 Tutorial},
pdfauthor={Joanis, Stewart, Larkin and Foster},
urlcolor=black % we use \url for code, not actual URLs
]{hyperref}
\else
\usepackage{hyperref}
\fi
% \code formats an inline code snippet without line breaking; it treats the
% underscore as a normal character. Use \url to format a code snippet with
% automatic line breaking, but then underscores are not rendered as characters
% on which copy/paste works.
% \code breaks for text containing underscores when used inside a footnote;
% for \code calls inside footnotes, use \us{} to specify an underscore.
% Use \upquote{quote-text} for straight single quotes around text within a \code
% call.
\def\code{\begingroup\catcode`\_=12 \codex}
\newcommand{\codex}[1]{\texttt{#1}\endgroup}
\chardef\us=`\_
\newcommand{\phs}{\tild{s}} % source phrase
\newcommand{\pht}{\tild{t}} % target phrase
\newcommand{\tip}{\textbf{Useful Tip \large{\ding{43}} }}
\newcommand{\margintip}{\marginpar[{\textbf{Tip \large{\ding{43}}}}]{\textbf{\reflectbox{\large{\ding{43}}} Tip}}}
\newcommand{\tipsummary}{\noindent\textbf{Tip summary \large{\ding{43}} }}
\newcommand{\tipend}{\textbf{ \reflectbox{\large{\ding{43}}}}}
\usepackage{ifpdf}
\ifpdf
\setlength{\pdfpagewidth}{8.5in}
\setlength{\pdfpageheight}{11in}
\else
\fi
\title{\PS Tutorial: \\
A detailed walk-through of the \\
experimental framework}
\date{Last updated June 2018}
\author{Eric Joanis, Darlene Stewart, Samuel Larkin, George Foster}
\begin{document}
\vfill
\maketitle
\vfill
\begin{center}
An adaptation of George Foster's \emph{Running Portage: A Toy Example} \\
to Samuel Larkin's experimental framework,\\
%updated to reflect recommended usage of \PS.
updated to reflect recommended usage of PortageII 4.0
\end{center}
\vfill
\vfill
\begin{center}
{~} \\ \footnotesize
Traitement multilingue de textes / Multilingual Text Processing \\
Centre de recherche en technologies numériques / Digital Technologies Research Centre \\
Conseil national de recherches Canada / National Research Council Canada \\
Copyright \copyright\ 2008, 2009, 2010, 2011, 2012, 2013, 2016, 2018, Sa Majesté la Reine du Chef du Canada
\\ Copyright \copyright\ 2008, 2009, 2010, 2011, 2012, 2013, 2016, 2018, Her Majesty in Right of Canada
\end{center}
\vfill
\newpage
%\vfill
\tableofcontents
%\vfill
\newpage
\section{Introduction}
This document describes how to run an experiment from end to end using the \PS
experimental framework. It is intended as a tutorial on using \PS, as well as a
starting point for further experiments. Although the framework automates most
of the steps described below, we go through them one by one to
better explain how to use the \PS software suite.
\PS can be viewed as a set of programs for turning a bilingual corpus into a
translation system. Here this process is illustrated with a small ``toy''
example of French to English translation, using text from the Hansard corpus.
The training corpus is too small for good translation, but is used the same way
a more realistic setup would be. Total running time is one to several hours.
\subsection{Making Sure \PS is Installed}
To begin, you must build or obtain the \PS executables and ensure that they are
in your path, by sourcing the \code{SETUP.bash} file as
customized for your environment during installation of \PS.\footnote{There is
also a \code{SETUP.tcsh} for users of that shell, but we strongly recommend
using bash. The examples in this document assume the use of bash.}
\code{SETUP.bash} also sets environment variable \code{\$PORTAGE} to the
directory where \PS is installed. Follow the
instructions in \code{INSTALL} before you proceed with this document.
To make sure \PS is installed properly, run:\footnote{At NRC, replace this
instance of \code{\$PORTAGE} by your sandbox.}
\begin{small}
\begin{alltt}
> \textbf{make -C \$PORTAGE/test-suite/unit-testing/check-installation}
\end{alltt}
\end{small}
You should see the message ``Installation successful'' near the end.
You can also try \code{canoe -h}, \code{tune.py -h}, \code{utokenize.pl -h},
\code{ce.pl -h}, and \code{filter-nc1.py -h}.
You should see usage information for each of these programs.
If you get error messages, then some part of your installation is incomplete.
See the section \emph{Verifying your installation} in \code{INSTALL} for
troubleshooting suggestions. Otherwise, you should be ready to proceed.
\subsection{Running the Toy Experiment}
Once \PS is installed, you should make a complete copy of the framework
directory hierarchy, because it is designed to work in place, creating the
models within the hierarchy itself. The philosophy of the framework is that
each experiment is done in a separate copy, where you might do various
customizations depending on what each experiment is intended to test.
For example:
\begin{small}
\begin{alltt}
> \textbf{mkdir experiments}
> \textbf{cd experiments}
> \textbf{cp -pr \$PORTAGE/framework toy.experiment}
> \textbf{cd toy.experiment}
\end{alltt}
\end{small}
Alternatively, you can clone the framework repository directly from GitHub:
\begin{small}
\begin{alltt}
> \textbf{mkdir experiments}
> \textbf{cd experiments}
> \textbf{git clone https://github.com/nrc-cnrc/PortageTrainingFramework.git toy.experiment}
> \textbf{cd toy.experiment}
\end{alltt}
\end{small}
All commands provided in the rest of this document assume they are being run in
the \code{toy.experiment} directory or in a subdirectory thereof. Whenever we
quote a \code{cd} command, we repeat the \code{toy.experiment} directory to
show explicitly where you should end up.
As you work through the example, the commands that you should type\footnote{This
PDF document was generated in such a way that you can copy and paste commands
from here onto the command line of your interactive shell if you wish.}
are shown in bold and preceded by a prompt, \code{>}, and the system's response
is not. System output is not usually fully reproduced here, for brevity's
sake. When it is, results (especially numbers) may vary from the ones
shown, due to platform and random-number generation differences. Note that many results are
truncated in precision for presentation purposes.
Many of the commands are expressed as \code{make} targets. This has the
advantage of requiring less typing, while still allowing you to see the actual
commands executed by the system because they are always echoed by \code{make}.
(You could also type them directly.) \code{make} also lets you skip
sections of this document (except for the first one, since it is done
manually). For example, if you are not interested in any steps before decoder
weight optimization, you can go directly to \S\ref{COW} and type
\code{make tune} to begin at that point. \code{make} will automatically run all
the commands required from previous sections before doing the step you
specifically requested. Here are some other useful \code{make} commands:
\begin{itemize}
\item \code{make all}: run all remaining steps at any point.
\item \code{make clean}: clean up and return the directory to its initial state
\item \code{make -j} \emph{target} or \code{make -j} \emph{N target}: build
\emph{target} by running commands in parallel whenever possible (up to
\emph{N} ways parallel if \emph{N} is specified). This lets you take
advantage of a computing cluster if you have one. If you use a single
multi-core computer, \code{-j} has no effect since most commands in the
framework are internally parallelized instead, as discussed in
\S\ref{FrameworkParams}.
\item \code{make help}: display some help and the main targets available in
the makefile.
\item \code{make summary}: display the resources used by the framework: time
and memory used, as well as disk space for the runtime models (most
informative once training has been completed; discussed further in
\S\ref{FrameworkParams} and \S\ref{ResourceSummary}).
\end{itemize}
\tip\margintip When you run the whole process using \code{make all}, you should
also 1) save the output in a log file, 2) use \code{nohup}, and 3) background
the job. This way, if your terminal is disconnected, your job will continue and
you will not lose your logs:
\begin{small}
\begin{alltt}
> \textbf{nohup make all >& log.make-all &}
\end{alltt}
\end{small}
To follow a job run this way as it is running, you can use \code{tail -f
log.make-all}.\tipend
\tip\margintip You can use \code{disown} to retroactively ``nohup'' a process:
Ctrl-Z and \code{bg} put it in the background; \code{jobs} tells you its job
number, typically \code{1}; \code{disown \%1} disconnects it from the shell,
protecting it from hangup signals.\tipend
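For example, if you had started \code{make all} in the foreground without
\code{nohup}, the recovery sequence might look like this (the job number shown
is only an example; check the output of \code{jobs} in your own shell):
\begin{small}
\begin{alltt}
\emph{[press Ctrl-Z to suspend the foreground job]}
> \textbf{bg}          \emph{# resume it in the background}
> \textbf{jobs}        \emph{# note its job number, e.g., [1]}
> \textbf{disown %1}   \emph{# detach job 1 from the shell}
\end{alltt}
\end{small}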
\subsection{Overview of the Process}
%\TODO{Hide or remove rescoring stuff from here.}
Here is an overview of the \PS process, as described in the following
sections:
\begin{enumerate}
\item Corpus preparation (\S\ref{CorpusPreparation}) includes
alignment,
corpus splitting (\S\ref{Splitting}),
tokenization (\S\ref{Tokenizing}),
and
lowercasing (\S\ref{Lowercasing}).
\item Model training (\S\ref{Training}) includes
language model (\S\ref{LM}),
coarse language model (\S\ref{coarseLM}),
truecasing model (\S\ref{TC}),
translation model (\S\ref{TM}),
hierarchical lexicalized distortion model (\S\ref{LDM}),
sparse model (\S\ref{sparse}),
NNJMs (\S\ref{NNJM}),
mixture models (\S\ref{MIX}),
decoder weight optimization (\S\ref{COW}),
confidence estimation model training (\S\ref{CE}),
and
rescoring model training (\S\ref{RAT}).
\item Translating and testing (\S\ref{TranslatingTesting}) includes
decoding (\S\ref{Decoding}),
confidence estimation (\S\ref{CETrans}),
rescoring (\S\ref{RATTrans}),
truecasing (\S\ref{Truecasing}),
and
testing (\S\ref{Testing}).
\end{enumerate}
These steps are fairly standard, but there are many variants on the
sample process illustrated here, which can be tuned for particular situations.
The \PS user manual (\code{\$PORTAGE/doc/user-manual.html})
has additional information, including a Background section with general
information on statistical machine translation and an annotated bibliography.
However, the technical details in the user manual are outdated in many ways; we
keep this tutorial up to date with each release and try to make it thorough, so
it is the best source of information. For detailed information about any
individual program in \PS, run
the program with the \code{-h} flag (or see \code{\$PORTAGE/doc/usage.html}).
\section{Corpus Preparation} \label{CorpusPreparation}
To begin, let's go over some definitions of text formats:
\begin{description}
\item[Plain text] is just normal text in a flat file without formatting.
\item[Tokenized] text has spaces separating tokens, as in \texttt{I 'm green
, you 're yellow !} Words and punctuation marks are tokens.
\item[One-paragraph-per-line] (OPPL) text is fairly standard since line-wrapping is
typically automatic nowadays.
\item[One-sentence-per-line] (OSPL) text has been segmented into sentences, one per line.
\item[Sentence-aligned] corpora are pairs of files in two languages where
each line in one file is the translation of that same line in the other file.
\item[Truecase] text has normal capitalization of proper nouns, the first word of
the sentence, etc.
\item[Lowercase] text drops all casing information.
\end{description}
Corpus preparation involves converting raw document pairs into tokenized,
sentence-aligned files in the format expected by \PS. We provide a
tokenizer, a sentence aligner and sample corpus pre-processing scripts with
\PS, which can help you with these steps. For details, see section \emph{Text
Processing} in the user manual.
If your data exists in the form of a translation memory or an aligned bitext,
our \code{tmx-prepro} module (see \code{\$PORTAGE/tmx-prepro} or
\url{https://github.com/nrc-cnrc/PortageTMXPrepro}) can help you extract
your data and prepare it for training \PS.
This tutorial starts from sentence-aligned plain text, as you would get from
\code{tmx-prepro} extracting data from a TMX file.
We don't perform data clean up and sentence alignment here,
because they are highly dependent on your actual data and what other tools you
already have. You should plan to invest some time in preprocessing your data
well if you want to obtain good results with \PS.
\subsection{Encoding: UTF-8}
The \PS framework only supports the UTF-8 encoding. If you use a different encoding,
please use \code{iconv} or \code{uconv} to convert your data to UTF-8, and to
convert the \PS output back to the encoding you need.
We previously supported latin1, cp-1252 and GB-2312 (simplified Chinese), but
UTF-8 can be used to represent all text, and its systematic use allows
us to both simplify the framework and make it more robust at the same time.
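For example, if your files were exported in Latin-1, converting them with
\code{iconv} might look like this (the file names here are hypothetical):
\begin{small}
\begin{alltt}
> \textbf{iconv -f ISO-8859-1 -t UTF-8 < corpus_fr.latin1.txt > corpus_fr.txt}
> \textbf{iconv -f UTF-8 -t ISO-8859-1 < output_en.txt > output_en.latin1.txt}
\end{alltt}
\end{small}
The second command shows the reverse conversion you would apply to \PS output if
your downstream tools still expect the original encoding.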
\subsection{Splitting the Corpus} \label{Splitting}
The corpus, which we assume you have sentence aligned, as
discussed above, must be split into separate portions to run
experiments. Distinct, non-overlapping sub-corpora are required for model
training (see \S\ref{Training}), for tuning decoder weights (\S\ref{COW}) and
rescoring weights (\S\ref{RAT}), for confidence estimation (if you use it;
\S\ref{CE}), and for testing (\S\ref{Testing}).\footnote{In this example, we
use separate dev sets for tuning decoder and rescoring weights, but this is not
necessary. However, confidence estimation must absolutely have its own
separate tuning set, which can be reused as a test set, but not as a decoder or
rescoring tuning set.} Typically, the tuning (or ``dev'', for development) and
testing sets contain around 2000 segments each. If the corpus is chronological,
then it is a good idea to choose these sets from the most recent material,
which is likely to resemble future text more closely.\footnote{Our
\code{tmx-prepro} module can automate splitting your corpus if you're
starting from a TMX file. It uses random sampling by default for your dev and
test sets.}
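As a rough illustration only, a chronological split of a hypothetical
sentence-aligned pair \code{full_fr.txt}/\code{full_en.txt} (most recent
material last) can be done with standard GNU tools, reserving the last 2000
segments for testing and the previous 2000 for tuning:
\begin{small}
\begin{alltt}
> \textbf{tail -n 2000 full_fr.txt > test1_fr.raw}
> \textbf{tail -n 2000 full_en.txt > test1_en.raw}
> \textbf{head -n -2000 full_fr.txt | tail -n 2000 > dev1_fr.raw}
> \textbf{head -n -2000 full_en.txt | tail -n 2000 > dev1_en.raw}
> \textbf{head -n -4000 full_fr.txt | gzip > tm-train_fr.raw.gz}
> \textbf{head -n -4000 full_en.txt | gzip > tm-train_en.raw.gz}
\end{alltt}
\end{small}
Use the same line ranges on both sides so the files stay sentence-aligned, and
adapt the naming to the sets you configure in \code{Makefile.params}.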
Ideally, the splitting of a corpus should take into account its structure and
nature, so these steps are not handled by the experimental framework. For the
toy experiment, we provide small data sets that can be found here:
\code{\$PORTAGE/test-suite/tutorial-data}. These sets are ridiculously small,
to minimize running time, so the resulting translations are of poor quality.
To drop these sets into the framework, copy them (or make symbolic
links) into your copy's corpora directory:
\begin{small}
\begin{alltt}
> \textbf{cd toy.experiment}
> \textbf{cp \$PORTAGE/test-suite/tutorial-data/*.raw* corpora/}
> \textbf{wc_stats corpora/*.raw* | expand-auto.pl}
#Lines #Words #Char [...] filename
100 1912 11284 [...] corpora/dev1_en.raw
100 2026 13276 [...] corpora/dev1_fr.raw
100 1912 11176 [...] corpora/dev2_en.raw
100 2116 13474 [...] corpora/dev2_fr.raw
100 2156 12733 [...] corpora/dev3_en.raw
100 2461 15728 [...] corpora/dev3_fr.raw
8896 163417 954012 [...] corpora/lm-train_en.raw.gz
8893 178680 1139422 [...] corpora/lm-train_fr.raw.gz
100 2174 12981 [...] corpora/test1_en.raw
100 2267 15171 [...] corpora/test1_fr.raw
100 2156 12733 [...] corpora/test2_en.raw
100 2461 15728 [...] corpora/test2_fr.raw
8892 163338 953554 [...] corpora/tm-train_en.raw.gz
8892 178677 1139392 [...] corpora/tm-train_fr.raw.gz
36573 705753 4320664 [...] TOTAL
\end{alltt}
\end{small}
In your own experiments, the files you need to copy into \code{corpora}
should be plain text, truecase, sentence-split and sentence-aligned, just like
the ones we provide here. If your corpora are already tokenized, call your
files \code{*.al} and \code{*.al.gz} instead of \code{*.raw} and
\code{*.raw.gz}: the framework will automatically skip tokenization with those
file extensions.
Although the framework does not support compressed dev and test files, the
training files can and should be compressed, as shown here. Most programs and
modules in \PS transparently handle compressed files, compressing and
decompressing them on the fly as needed.
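If your own training files are not compressed yet, you can compress them in
place with \code{gzip} before training (the file names below are hypothetical):
\begin{small}
\begin{alltt}
> \textbf{gzip corpora/my-train_fr.al corpora/my-train_en.al}
\end{alltt}
\end{small}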
If you inspect them, you might notice that the \code{lm-train*} and
\code{tm-train*} files are almost identical, but that the \code{lm-train*} files have
more text. When training the language model, it is a good idea to add
available monolingual material to the target side of the parallel corpus, as we
simulated doing here. In practice, your LM training data might be a lot larger
than your parallel data: everything you have can help improve quality!
\subsection{Setting Framework Parameters} \label{FrameworkParams}
Now you need to edit \code{Makefile.params} to set some global parameters:
\begin{itemize}
\item swap the values of \code{SRC_LANG} and
\code{TGT_LANG}, to select translation from French to English, rather than
the other way around, which is the default;
\item \Changed\footnote{
Starting with \PS 3.0, set your LM training corpus using \url{PRIMARY_LM} to
get the recommended default use of the generic LM whenever possible, or
\url{TRAIN_LM} or \code{MIXLM} if you want to override the default
recommendations.
} \code{PRIMARY_LM} and \code{TRAIN_TM} already have the right values, so
they don't need to be changed;
\item \New \code{TRAIN_SPARSE} already defaults to the first corpus listed in
\code{TRAIN_TM}, in this case \code{tm-train}, so the new sparse models will be
trained on the same corpus as the phrase tables;
\item set \code{TUNE_RESCORE} to \code{dev2} and \code{TUNE_CE} to \code{dev3}
by uncommenting the lines defining them, i.e., by removing the \code{\#} at the
beginning of these lines;
\item \code{TEST_SET} already points to our two test sets, so no change is
needed;
\item select a language modeling option (see below for more info about this
choice): set the \code{LM_TOOLKIT} variable to \code{MIT} or \code{SRI} to
use MITLM or SRILM, respectively.
%\item set \code{DO_RESCORING} to \code{1} by uncommenting the relevant line.
\end{itemize}
For now, keep the default value for all other parameters.
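Putting these settings together, the relevant assignments in
\code{Makefile.params} should end up looking roughly like this (a sketch only;
the exact layout and surrounding comments in the file will differ):
\begin{small}
\begin{alltt}
SRC_LANG = fr
TGT_LANG = en
PRIMARY_LM = lm-train
TRAIN_TM = tm-train
TUNE_RESCORE = dev2
TUNE_CE = dev3
LM_TOOLKIT = MIT
\end{alltt}
\end{small}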
While in \code{Makefile.params}, you should read through the variables in
the \emph{User definable variables} section of the file. This is where most of
the configurable behaviours are set, such as whether to do rescoring and/or
truecasing, which optional models to use, the levels of parallelism, etc.
We recommend you use MITLM (\url{https://github.com/mit-nlp/mitlm}) or SRILM
(\url{http://www.speech.sri.com/projects/srilm/}) as your language modeling
toolkit. Although not recommended, IRSTLM is another supported
option.\footnote{These recommendations are based on our
empirical results: we get similar BLEU scores when using MITLM and SRILM, but
lower ones when using IRSTLM.} See \code{Makefile.params} and type \code{make
help} for more details.
Another set of parameters to look at is the various
\code{PARALLELISM_LEVEL_*} variables. \PS takes
advantage of multi-processor computers and/or multi-node computing clusters,
running tasks in parallel where possible. On a non-clustered computer,
the number of CPUs is the default for all these variables;
explicitly set \code{NCPUS} to restrict the number of CPUs used globally.
On a cluster, the framework uses
\code{qsub} to submit jobs, via the \code{run-parallel.sh} and \code{psub}
scripts, and you can set these variables according to resources available to
you.
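For example, on a single non-clustered machine you could cap the framework at
four CPUs either by setting \code{NCPUS} in \code{Makefile.params} or by
overriding it on the \code{make} command line (a sketch; the value 4 is
arbitrary):
\begin{small}
\begin{alltt}
> \textbf{make all NCPUS=4}
\end{alltt}
\end{small}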
When running this framework, many commands are preceded
by control variables \code{RP_PSUB_OPTS=...} or \code{_LOCAL=1}. These strings
only have an impact on a cluster, and are ignored otherwise.
On a cluster, commands preceded by \code{_LOCAL=1} are inexpensive ones
that get run directly instead of being submitted to the queueing system, while
\code{RP_PSUB_OPTS=...} specifies additional options to \code{psub},
which encapsulates the invocation of \code{qsub}. If your
cluster has specific usage rules or if you require additional parameters to
\code{qsub}, customize \code{psub} itself or add options as
required in this framework.
\tip\margintip Many commands run in this framework are also preceded by
\code{time-mem}, a utility script that measures the time and memory usage of a
command. At any time, type \code{make summary} to get a summary of the
resources used by all components of the framework so far, and \code{make
time-mem} to see which steps are taking the most resources. These reports can
help you determine whether you have enough computing resources to process your
corpora, and what the various choices you can make in \PS cost. \code{make
summary} reports the \code{time-mem} information as well as the space on
disk of the models needed at runtime, such as would be deployed on a
translation server.\tipend
Most commands in the framework produce logs called
\code{log.\emph{output-file-name}}. If you encounter errors, look for
explanations in these log files; that is usually where the error messages
end up.
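For example, after a failed run you can scan every log file in the experiment
directory for error messages with standard tools (a sketch; adjust the pattern
as needed):
\begin{small}
\begin{alltt}
> \textbf{find . -name 'log.*' | xargs grep -li error}
\end{alltt}
\end{small}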
For the sake of brevity, commands quoted in this document
leave out \code{time-mem}, the control
variables mentioned above, and the log files.
\subsection{Tokenization} \label{Tokenizing}
This step is skipped if you're using tokenized data (with \code{.al} and
\code{.al.gz} extensions), but the
framework automatically recognizes that it needs to tokenize the raw text
corpora we provided for this tutorial.
By default, the \PS tokenizer is used (\code{utokenize.pl})\footnote{Look for
and set \code{TOKENIZER\us{}\emph{ll}} in \code{Makefile.params} to choose a
different tokenizer for language \emph{ll}.}, along with a
preprocessing script that separates slash-separated words
(\code{fix-slashes.pl}) because those are rarely meant to stay together.
\begin{small}
\begin{alltt}
> \textbf{cd toy.experiment/corpora}
> \textbf{make tok}
\{ fix-slashes.pl | utokenize.pl -noss -lang=fr; \} < dev1_fr.raw > dev1_fr.al
[...]
parallelize.pl -nolocal -psub -1 -w 100000 -n 16 \bs
"\{ fix-slashes.pl | utokenize.pl -noss -lang=en; \} < tm-train_en.raw.gz > tm-train_en.al.gz"
\end{alltt}
\end{small}
\subsection{Lowercasing and Escaping Special Characters} \label{Lowercasing}
To reduce data sparseness, we convert all files to lowercase.
We keep the lowercase and truecase
versions separate, because we'll use the lowercase version to train language
and translation models, while the truecase version will be used to train a
truecasing model.
\begin{small}
\begin{alltt}
> \textbf{cd toy.experiment/corpora}
> \textbf{make lc}
cat dev1_fr.al | utf8_casemap -c l > dev1_fr.lc
[...]
zcat lm-train_fr.al.gz | utf8_casemap -c l | gzip > lm-train_fr.lc.gz
\end{alltt}
\end{small}
The decoder, \code{canoe}, treats \code{<}, \code{>} and \verb*X\X % *
as special characters to support markup for special translation rules.
We won't use markup in this tutorial, but we must still escape
the special characters in the input files
to \code{canoe}: the source side of the dev and test files.
\begin{small}
\begin{alltt}
> \textbf{make rule}
canoe-escapes.pl -add < dev1_fr.lc > dev1_fr.rule
canoe-escapes.pl -add < dev2_fr.lc > dev2_fr.rule
canoe-escapes.pl -add < dev3_fr.lc > dev3_fr.rule
canoe-escapes.pl -add < test1_fr.lc > test1_fr.rule
canoe-escapes.pl -add < test2_fr.lc > test2_fr.rule
\end{alltt}
\end{small}
\section{Training} \label{Training}
This step creates various models and parameter files that are required for
translation. There are many steps in training. Three are mandatory: creating a
language model (\S\ref{LM}), creating a translation model (\S\ref{TM}), and
optimizing decoder weights (\S\ref{COW}). Several are optional: creating coarse
language models (\S\ref{coarseLM}), creating a truecasing model (\S\ref{TC}),
creating a (possibly hierarchical) lexicalized distortion model (\S\ref{LDM}),
creating a sparse model (\S\ref{sparse}),
creating or fine-tuning an NNJM (\S\ref{NNJM}),
using mixture language models and mixture
translation models for domain adaptation (\S\ref{MIX}), training a confidence
estimation (CE) model (\S\ref{CE}), and training a rescoring model
(\S\ref{RAT}).
\subsection{Creating a Language Model} \label{LM}
\PS does not come with language model training software. However, it
accepts models in the widely-used ``ARPA'' format, which is supported by most
language modelling toolkits. In this tutorial, we assume
you are using MITLM. If you use SRILM, the procedure will be the same, but the
commands that get executed will be different.
By default, we train language models of order five, a
good compromise between translation quality and size of the models. Higher
order language models might sometimes be useful, but only for very large
corpora, and at a cost in space and decoding speed.
In this toy example, we manually added a few sentences to the target language
part of the parallel training corpus to illustrate using more text to train the
language model than the translation model. If you have access to relatively
small amounts of additional monolingual text, adding it to your main LM
training corpus is the simplest option. If you have access to large amounts of
monolingual text, you can use it to train additional
language models or mixture language models. To train separate language
models, drop the corpora into \code{corpora} and list all the corpus stems in
\code{TRAIN\us{}LM}; if an LM is trained externally, add its name to
\code{LM\us{}PRETRAINED\us{}TGT\us{}LMS} instead. But for best results, use
mixture language models (see \S\ref{MIXLM}).
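For instance, a hypothetical \code{Makefile.params} fragment adding a second,
purely monolingual corpus named \code{news-extra} (a made-up name for
illustration) would list both stems:
\begin{small}
\begin{alltt}
TRAIN_LM = lm-train news-extra
\end{alltt}
\end{small}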
\subsubsection{Using a regular LM} \label{regularLM}
Since \PS 3.0, the recommendation and default is to use a MixLM of your
corpus and our generic language model (see \S\ref{LM+generic-default}).
But you can still use a single, regular LM, by setting
\code{TRAIN_LM=lm-train} instead of \code{PRIMARY_LM=lm-train} in
\code{Makefile.params}. We show what happens in that case here for
completeness.
Here is the command used to produce
\code{lm-train_en-kn-5g.tplm},
the main language model:
\begin{small}
\begin{alltt}
> \textbf{cd toy.experiment}
> \textbf{make lm}
make -C models lm
make -C lm all LM_LANG=en
Creating ARPA text format lm-train_en-kn-5g.lm.gz
zcat ../../corpora/lm-train_en.lc.gz \bs
| perl -ple 's/^{\bs}s+//; s/{\bs}s+\$//; s/{\bs}s+/ /g;' | fold --bytes --spaces --width=4095 \bs
| estimate-ngram -order 5 -smoothing ModKN -text /dev/stdin \bs
-opt-perp ../../corpora/dev1_en.lc \bs
-write-lm lm-train_en-kn-5g.lm.gz
arpalm2binlm lm-train_en-kn-5g.lm.gz lm-train_en-kn-5g.binlm.gz
arpalm2tplm.sh lm-train_en-kn-5g.lm.gz lm-train_en-kn-5g
\end{alltt}
\end{small}
The main command executed, MITLM's \code{estimate-ngram}, produces the language
model in ``ARPA'' format. The \code{perl} filter that precedes it
removes extraneous whitespace, because \code{estimate-ngram} is picky about its
input. The \code{fold} command wraps lines that are longer than 4KB, to avoid
breaking a word (or worse, a UTF-8 character) apart at MITLM's internal buffer
boundary. The \code{arpalm2binlm} and \code{arpalm2tplm.sh} commands
convert the model into the \PS binary language model and
tightly-packed language model formats, respectively, for fast loading and use.
Since version 3.0 of \PS, we take advantage of MITLM's ability to tune the LM
metaparameters on our dev set, yielding slightly better language models. The
tuning set is specified to \code{estimate-ngram} using \code{-opt-perp}, and is
configured in the framework via the \code{TUNE_LM} parameter in
\code{Makefile.params}.
Before continuing the tutorial, don't forget to revert your
\code{Makefile.params} to have \code{PRIMARY_LM=lm-train} and \code{TRAIN_LM}
undefined.
\subsubsection{In-domain LM plus the generic LM} \label{LM+generic-default}
Starting with \PS 3.0, we strongly recommend that you use the generic LM from
Portage Generic Model 2.0 (or more recent) to improve the quality of your translations.
Without introducing out-of-domain vocabulary into your translations, this model
helps the decoder choose the best way to use your in-domain phrase
table.
With \code{PRIMARY_LM=lm-train} defined, as is now the default, you can issue
this command to train a mixture language model (MixLM) combining your
in-domain LM and the generic one:
\begin{small}
\begin{alltt}
> \textbf{cd toy.experiment}
> \textbf{make mixlm}
make[2]: Entering directory `.../toy.experiment/models/mixlm'
[commands to make lm-train_fr-kn-5g.lm.gz and .tplm]
echo "`basename models/mixlm/lm-train_fr*.tplm`" "generic-2.0_fr.tplm" \bs
| tr " " "{\bs}n" > components_fr
mx-calc-distances.sh -v em components_fr ../../corpora/dev1_fr.lc > dev1.distances
mx-dist2weights -v normalize dev1.distances > dev1.weights
[commands to make lm-train_en-kn-5g.lm.gz and .tplm]
echo "`basename models/mixlm/lm-train_en*.tplm`" "generic-2.0_en.tplm" \bs
| tr " " "{\bs}n" > components_en
mx-mix-models.sh mixlm dev1.weights components_en ../../corpora/dev1_fr.lc \bs
> dev1.mixlm
\end{alltt}
\end{small}
The above commands are explained in more detail in \S\ref{MIXLM}, so we won't
describe what they do here. The important point here is that the mixture LM is
created by combining the primary LM created from your in-domain training data
with NRC's generic model created on 43 million sentence pairs. Because of your
in-domain data, this model is good at recognizing text that sounds right for
your domain. Because of the very large data set used to train the generic model,
this model is good at recognizing text that sounds right in English (or French)
in general. The combination helps the decoder choose better translations than
either component model could on its own.
\subsection{Creating a Coarse Language Model} \label{coarseLM}
In \PS 3.0, we added the capacity to train and use coarse language models:
language models that are trained and queried not on sequences of words, but
rather on sequences of classes of words. These models have a useful capacity for
abstraction: similar words can be expected to occur in
similar contexts.\footnote{See Stewart et al.\ (AMTA 2014) and the annotated
bibliography included with the \PS user manual for more details on coarse LMs.}
Empirically, we get the best results when we combine a coarse LM trained
on 200 word classes with one trained on 800 classes. These models capture
abstractions at different levels of granularity. With only 200 classes, the
first one is modelling something that probably resembles part-of-speech tag
sequences, while 800 classes gives the second model a chance to capture some
semantic categories as well. As for the order of these models, coarse models do
not suffer from the data sparsity that affects regular LMs, so we train 8-gram
coarse LMs, whereas we only go up to 5-grams for the regular LMs.
Our recommendation, and the framework default, is to train two 8-gram coarse
LMs, one with 200 classes and one with 800 classes.
Before training the coarse LMs themselves, we need to learn word classes on the
target language vocabulary. We use Google's free, open source \code{word2vec}
tool to learn these classes from the corpora.
\begin{small}
\begin{alltt}
> \textbf{cd toy.experiment}
> \textbf{make wcl}
make[2]: Entering directory `.../toy.experiment/models/wcl'
zcat -f ../../corpora/lm-train_fr.lc.gz ../../corpora/tm-train_fr.lc.gz > all-200_fr.lc
word2vec -cbow 0 -size 100 -window 1 -negative 0 -hs 1 -sample 0 -threads 1 \bs
-min-count 1 -classes 200 -output fr.200.classes -train all-200_fr.lc
sed -i -e 's/ /{\bs}t/' fr.200.classes
rm -r all-200_fr.lc
[commands to train en.200.classes, fr.800.classes and en.800.classes]
\end{alltt}
\end{small}
For each class file to create, we first concatenate all the input files into a temporary
uncompressed file, as required by \code{word2vec}.\footnote{For coarse
LM classes, we should only need \code{lm-train\us{}en.lc.gz}, but we instead
use all target-language corpora because we want these classes to cover the
target-side vocabulary of the training files used for any model. Although not
needed for coarse LMs, classes are also trained on the source side because other
models need them.} Then we call \code{word2vec} with carefully tuned parameters.
Finally, the resulting \code{.classes} files are
reformatted to respect the standard word-tab-class format on each line, and the
temporary input file is deleted.
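For reference, each line of a \code{.classes} file pairs one word with its
class number, separated by a tab; a few hypothetical lines (made-up words and
class numbers) would look like this:
\begin{small}
\begin{alltt}
house    17
blue     42
car      17
london   103
\end{alltt}
\end{small}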
Now that we have our word-class files, we can train the LMs themselves.
Breaking with the convention in the rest of this document, we'll
interleave the commands executed by \code{make} with our comments about them, to
improve readability.
\begin{small}
\begin{alltt}
> \textbf{cd toy.experiment}
> \textbf{make coarselm}
make[2]: Entering directory `.../toy.experiment/models/coarselm'
word2class -no-error ../../corpora/lm-train_en.lc.gz ../wcl/en.200.classes | \bs
gzip > lm-train_en-200.lc.gz
182281 of 182281 words mapped to word classes.
word2class -no-error ../../corpora/dev1_en.lc ../wcl/en.200.classes > dev1-200_en.lc
57 word mapping errors.
2058 of 2115 words mapped to word classes.
Warning: Output contains 57 word class mapping errors (<unk>).
[...]
\end{alltt}
\end{small}
Using \code{word2class}, we create a copy of the input training corpus with
all the words replaced by their word classes. A message tells us that all
182281 words were mapped to their classes. Since we're using the dev set for
metaparameter tuning with MITLM, we also need to apply the same mapping to the dev
set, but now we see 57 mapping errors. These are due to words in the dev set that
do not appear in the training corpus, which is perfectly normal: we want to train \PS and
all its models to deal with unseen words.
Then the command we launched above proceeds to use our chosen LM toolkit to
train a language model on this sequence of classes, instead of the usual
sequence of tokens:
\begin{small}
\begin{alltt}
Creating ARPA text format lm-train_en-200-ukn-8g.lm.gz
zcat lm-train_en-200.lc.gz \bs
| perl -ple 's/^{\bs}s+//; s/{\bs}s+\$//; s/{\bs}s+/ /g;' | fold --bytes --spaces --width=4095 \bs
| estimate-ngram -order 8 -smoothing KN -text /dev/stdin \bs
-opt-perp dev1-200_en.lc -write-lm lm-train_en-200-ukn-8g.lm.gz
arpalm2binlm lm-train_en-200-ukn-8g.lm.gz lm-train_en-200-ukn-8g.binlm.gz
arpalm2tplm.sh lm-train_en-200-ukn-8g.lm.gz lm-train_en-200-ukn-8g
\end{alltt}
\end{small}
The whole sequence above, starting at \code{word2class}, is repeated with 800
classes, since we train both a 200-class and an 800-class coarse LM.
When the decoder uses a coarse model, it applies the same mapping from tokens
to classes before querying the LM, giving us a probability for class
sequences. This helps with sequences that were not seen in training, but where
words are somehow similar to those in sequences that were seen. For example,
maybe ``the blue car'' was never seen in training, but ``the \emph{colour}
car'' was seen for several colours which ended up in the same class: the coarse
LM will consider all these variants equally likely if they map to the same
class sequence.
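If you are curious about what the decoder actually sees, you can map a sample
sentence through the same class file used above, from the
\code{models/coarselm} directory, using the same \code{word2class} invocation
pattern shown earlier (the class numbers in the output will vary with your
trained classes):
\begin{small}
\begin{alltt}
> \textbf{echo "the blue car" > sample_en.lc}
> \textbf{word2class -no-error sample_en.lc ../wcl/en.200.classes}
\end{alltt}
\end{small}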
\subsection{Creating a Truecasing Model} \label{TC}
Decoding is done in lowercase to reduce data sparseness, but at the end of the
translation process, you will need to restore proper mixed case to
your output. We call this step truecasing. To train a truecasing model, you
need both a lowercased version and the original ``truecase'' version of the
training corpus in the target language.
The basic truecasing model consists of two different models: a ``casemap'',
which maps each lower case word to its possible truecase variants, as observed
in the training corpus, and a standard language model trained on the truecase
corpus.
The improved version of truecasing carries casing information from the source
sentence to the target sentence, including the casing for the first word in the
sentence, the casing of out-of-vocabulary words (OOVs), and unusual casing for
sequences of several words (such as all caps).
Truecasing with source information is the default.\footnote{If you want to
use basic truecasing, comment out the line \code{TC\_USE\_SRC\_MODELS=1} in the
advanced configuration section of \code{Makefile.params}.} It requires three
language models: source-side and target-side
case-normalizing\footnote{``Normalized case'' means that the first character of
the sentence is in lowercase unless the word inherently requires the upper
case. E.g., ``his flat is in London.'' and ``London is on the Thames.'' are in
normalized case. The acronym ``nc1'' stands for
``normalized-cased first word'', i.e., normalized case.} language models, as
well as the main truecasing LM. The target-side case-normalizing LM is used
only for generating a normalized-case target-side training corpus; it is not
used in translation. The main truecasing LM and the ``casemap'' are trained on
the normalized-case target-side training corpus. Together, they are used to
determine the case for the output of the decoder. Then case patterns observed
in the source sentence, including the first word, are transferred to the
truecased output of the decoder.
%\TODO{Think about whether this whole section should be simplified with only the
%final model files listed, but not the commands used.}
Running \code{make tc} at the root of the framework will build all necessary
files and models. As with coarse LMs, we'll
interleave the commands executed by \code{make} with our comments about them, to
improve readability.
\begin{small}
\begin{alltt}
> \textbf{cd toy.experiment}
> \textbf{make tc}
make[2]: Entering directory `.../toy.experiment/models/tc'
zcat -f ../../corpora/lm-train_en.tc.gz \bs
| perl -ne '\emph{set UTF-8 encoding} s/^[^[:lower:]]+\$/{\bs}n/; print \$_ unless /^\$/;' \bs
| utokenize.pl -pretok -paraline -ss -p -lang=en \bs
| gzip > lm-train_en.tokss.gz
\end{alltt}
\end{small}
We first generate \code{lm-train_en.tokss.gz}, a variant of the target-side
training corpus with all-caps sentences (which are misleading for training
truecasing models) removed and sentence splitting re-done. Sentence splitting
is re-applied because corpus text from some sources such as TMX files may
contain lines with multiple sentences, and beginning-of-sentence detection is
important for case normalization.
Second, we generate \code{lm-train_en.revtokss.gz}, an inversion of the
target-side training corpus needed to train the target-side case-normalizing
LM:
\begin{small}
\begin{alltt}
zcat -f lm-train_en.tokss.gz | filter-nc1.py -enc UTF-8 \bs
| reverse.pl | gzip > lm-train_en.revtokss.gz
\end{alltt}
\end{small}
Third, we train a language model on this reversed corpus, producing
\code{lm-train_en.nc1.binlm.gz}, the target-side case-normalizing language model.
The actual commands used depend on your LM toolkit:
\begin{small}
\begin{alltt}
Creating ARPA text format lm-train_en.nc1.lm.gz
[commands to train case-normalizing target-language LM lm-train_en.nc1.lm.gz]
arpalm2binlm lm-train_en.nc1.lm.gz lm-train_en.nc1.binlm.gz
\end{alltt}
\end{small}
Next, the \code{normc1} program uses the case-normalizing LM to produce
\code{lm-train_en.nc1.gz}, the nor\-malized-case target-side training corpus:
\begin{small}
\begin{alltt}
normc1 -ignore 1 -extended -notitle -loc en_CA.UTF-8 lm-train_en.nc1.binlm.gz \bs
lm-train_en.tokss.gz \bs
| perl -pe 's/(.)$/$1 /; s/(.){\bs}n/\$1/' | gzip > lm-train_en.nc1.gz
\end{alltt}
\end{small}
Here we generate the casemap for the main target-side truecasing model,
\code{lm-train_en.map}, using \code{compile_truecase_map}, which compiles the
casemap by processing the normalized-case and lowercase versions of the corpus
simultaneously, and recording, for each lower case word, all the cased variants
found in the normalized-case file, along with their distribution:
\begin{small}
\begin{alltt}
zcat -f lm-train_en.nc1.gz | utf8_casemap -c l \bs
| compile_truecase_map lm-train_en.nc1.gz - > lm-train_en.map
\end{alltt}
\end{small}
Then we produce \code{lm-train_en-kn-3g.binlm.gz}, the main target-side
truecasing LM, trained on the normalized-case corpus:
\begin{small}
\begin{alltt}
Creating ARPA text format lm-train_en-kn-3g.lm.gz
[commands to train truecasing LM lm-train_en-kn-3g.lm.gz]
arpalm2binlm lm-train_en-kn-3g.lm.gz lm-train_en-kn-3g.binlm.gz
\end{alltt}
\end{small}
The final block of commands does the same thing as the first block, this time to
produce the source-side case-normalizing language model needed by the new
truecasing workflow, \code{lm-train_fr.nc1.binlm.gz}:
\begin{small}
\begin{alltt}
zcat -f ../../corpora/lm-train_fr.al.gz \bs
| utokenize.pl -pretok -paraline -ss -lang=fr \bs
| perl -pe '\emph{set UTF-8 encoding} s/^[^[:lower:]]+(\$|( : ))//;' \bs
| gzip > lm-train_fr.tokss.gz
zcat -f lm-train_fr.tokss.gz | filter-nc1.py -enc UTF-8 \bs
| reverse.pl | gzip > lm-train_fr.revtokss.gz
Creating ARPA text format lm-train_fr.nc1.lm.gz
[commands to train case-normalizing source-language LM lm-train_fr.nc1.lm.gz]
arpalm2binlm lm-train_fr.nc1.lm.gz lm-train_fr.nc1.binlm.gz
\end{alltt}
\end{small}
\subsection{Creating a Translation Model} \label{TM}
Creating a translation model involves two main steps: 1) training IBM2, HMM and
IBM4 word alignment models in both directions, then 2) using them to
extract phrase pairs from the corpus.
There are many ways to combine the counts obtained from different alignments.
Through ongoing experimentation, our recommendations have changed in the past
and will likely change again; the best setup depends on your data and
resources, but we try to maintain a good general setup as the default in the
framework.
Here we illustrate the method we currently recommend: merge the counts from all
the alignment methods to estimate the main probability feature, using one
alignment method to estimate a lexical smoothing feature.
You can do all this by typing \code{make tm} in your \code{toy.experiment}
directory, but we will break it down into several steps.
By default, \code{make tm} will train IBM2 (\S\ref{IBM2}), HMM (\S\ref{HMM}),
and IBM4 (\S\ref{IBM4}) word-alignment models, and tally their counts
together into the final phrase table (\S\ref{CPT}).
In \S\ref{PI}, we'll show how you can also produce alignment indicator features
telling the system which aligner produced which phrase pairs. The alignment
indicator features can be helpful because the different alignments make different
kinds of errors in the phrase pairs they produce; the diversity of phrase
pairs obtained from two or three separate alignment methods helps the system,
while the indicator features allow the system to learn to give more weight to
alignments suggested by the more reliable alignment method. We don't
explicitly estimate how reliable each method is; instead, we let decoder tuning
learn the indicator feature weights (see \S\ref{COW}).
Later, we'll look at interpolating probability estimates
coming from various corpora, in a way that is adapted to your in-domain
material (\S\ref{MIXTM}).
\subsubsection{Creating a Translation Model Using IBM2 Alignments} \label{IBM2}
%\TODO{Think about whether I should reorganize this code around the directory
%structure instead of the alighment model: a IBM section, a WAL section, a JPT
%section, and then the TM section. This would probably be easier to follow.}
\subsubsection*{Training IBM2 Models}
First we train IBM2 word alignment models, which requires training IBM1 models
as a prerequisite. We do this for both directions in \code{models/ibm/}.
\begin{small}
\begin{alltt}
> \textbf{cd toy.experiment/models/ibm}
> \textbf{make ibm2_model}
cat.sh -n 4 -pn 4 -v -n1 5 -n2 0 -bin ibm1.tm-train.en_given_fr.gz \bs
../../corpora/tm-train_fr.lc.gz ../../corpora/tm-train_en.lc.gz
cat.sh -n 4 -pn 4 -v -n1 0 -n2 5 -slen 20 -tlen 20 -bksize 20 \bs
-bin -i ibm1.tm-train.en_given_fr.gz ibm2.tm-train.en_given_fr.gz \bs
../../corpora/tm-train_fr.lc.gz ../../corpora/tm-train_en.lc.gz
cat.sh -n 4 -pn 4 -v -r -n1 5 -n2 0 -bin ibm1.tm-train.fr_given_en.gz \bs
../../corpora/tm-train_fr.lc.gz ../../corpora/tm-train_en.lc.gz
cat.sh -n 4 -pn 4 -v -r -n1 0 -n2 5 -slen 20 -tlen 20 -bksize 20 \bs
-bin -i ibm1.tm-train.fr_given_en.gz ibm2.tm-train.fr_given_en.gz \bs
../../corpora/tm-train_fr.lc.gz ../../corpora/tm-train_en.lc.gz
\end{alltt}