18
18
19
19
import java .io .BufferedInputStream ;
20
20
import java .io .BufferedReader ;
21
+ import java .io .ByteArrayInputStream ;
21
22
import java .io .IOException ;
22
23
import java .io .InputStream ;
23
24
import java .io .InputStreamReader ;
24
25
import java .io .LineNumberReader ;
25
- import java .io .OutputStream ;
26
26
import java .nio .charset .Charset ;
27
27
import java .nio .charset .CharsetDecoder ;
28
28
import java .nio .charset .CodingErrorAction ;
70
70
71
71
/** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */
72
72
public class Dictionary {
73
+ // Derived from woorm/ openoffice dictionaries.
74
+ // See TestAllDictionaries.testMaxPrologueNeeded.
75
+ static final int MAX_PROLOGUE_SCAN_WINDOW = 30 * 1024 ;
73
76
74
77
static final char [] NOFLAGS = new char [0 ];
75
78
@@ -227,26 +230,37 @@ public Dictionary(
227
230
this .needsInputCleaning = ignoreCase ;
228
231
this .needsOutputCleaning = false ; // set if we have an OCONV
229
232
230
- Path tempPath = getDefaultTempDir (); // TODO: make this configurable?
231
- Path aff = Files .createTempFile (tempPath , "affix" , "aff" );
232
-
233
- BufferedInputStream aff1 = null ;
234
- InputStream aff2 = null ;
235
- boolean success = false ;
236
- try {
237
- // Copy contents of the affix stream to a temp file.
238
- try (OutputStream os = Files .newOutputStream (aff )) {
239
- affix .transferTo (os );
233
+ try (BufferedInputStream affixStream =
234
+ new BufferedInputStream (affix , MAX_PROLOGUE_SCAN_WINDOW ) {
235
+ @ Override
236
+ public void close () throws IOException {
237
+ // TODO: maybe we should consume and close it? Why does it need to stay open?
238
+ // Don't close the affix stream as per javadoc.
239
+ }
240
+ }) {
241
+ // I assume we don't support other BOMs (utf16, etc.)? We trivially could,
242
+ // by adding maybeConsume() with a proper bom... but I don't see hunspell repo to have
243
+ // any such exotic examples.
244
+ Charset streamCharset ;
245
+ if (maybeConsume (affixStream , BOM_UTF8 )) {
246
+ streamCharset = StandardCharsets .UTF_8 ;
247
+ } else {
248
+ streamCharset = DEFAULT_CHARSET ;
240
249
}
241
250
242
- // pass 1: get encoding & flag
243
- aff1 = new BufferedInputStream (Files .newInputStream (aff ));
244
- readConfig (aff1 );
251
+ /*
252
+ * pass 1: look for encoding & flag. This is simple but works. We just prefetch
253
+ * a large enough chunk of the input and scan through it. The buffered data will
254
+ * be subsequently reused anyway so nothing is wasted.
255
+ */
256
+ affixStream .mark (MAX_PROLOGUE_SCAN_WINDOW );
257
+ byte [] prologue = affixStream .readNBytes (MAX_PROLOGUE_SCAN_WINDOW - 1 );
258
+ affixStream .reset ();
259
+ readConfig (new ByteArrayInputStream (prologue ), streamCharset );
245
260
246
261
// pass 2: parse affixes
247
- aff2 = new BufferedInputStream (Files .newInputStream (aff ));
248
262
FlagEnumerator flagEnumerator = new FlagEnumerator ();
249
- readAffixFile (aff2 , decoder , flagEnumerator );
263
+ readAffixFile (affixStream , decoder , flagEnumerator );
250
264
251
265
// read dictionary entries
252
266
IndexOutput unsorted = mergeDictionaries (tempDir , tempFileNamePrefix , dictionaries , decoder );
@@ -255,14 +269,6 @@ public Dictionary(
255
269
flagLookup = flagEnumerator .finish ();
256
270
aliases = null ; // no longer needed
257
271
morphAliases = null ; // no longer needed
258
- success = true ;
259
- } finally {
260
- IOUtils .closeWhileHandlingException (aff1 , aff2 );
261
- if (success ) {
262
- Files .delete (aff );
263
- } else {
264
- IOUtils .deleteFilesIgnoringExceptions (aff );
265
- }
266
272
}
267
273
}
268
274
@@ -349,6 +355,7 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
349
355
if (line .isEmpty ()) continue ;
350
356
351
357
String firstWord = line .split ("\\ s" )[0 ];
358
+ // TODO: convert to a switch?
352
359
if ("AF" .equals (firstWord )) {
353
360
parseAlias (line );
354
361
} else if ("AM" .equals (firstWord )) {
@@ -458,6 +465,12 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
458
465
checkCompoundPatterns .add (
459
466
new CheckCompoundPattern (reader .readLine (), flagParsingStrategy , this ));
460
467
}
468
+ } else if ("SET" .equals (firstWord )) {
469
+ // We could add some sanity-checking whether set command is identical to what was
470
+ // parsed in readConfig. This would handle cases of flags too far in the file or
471
+ // duplicated (both are incorrect, I assume).
472
+ } else if ("FLAG" .equals (firstWord )) {
473
+ // Similar for FLAG.
461
474
}
462
475
}
463
476
@@ -791,31 +804,36 @@ private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
791
804
private static final byte [] BOM_UTF8 = {(byte ) 0xef , (byte ) 0xbb , (byte ) 0xbf };
792
805
793
806
/**
 * Parses the encoding ({@code SET}) and flag format ({@code FLAG}) directives from the leading
 * portion (prologue) of an affix (.aff) stream.
 *
 * <p>Only these two directives are consumed; everything else is skipped. The scan stops early
 * once both have been seen, since the caller only hands us a bounded prologue window anyway.
 *
 * @param stream the affix data (typically a bounded in-memory prologue slice); read as text in
 *     {@code streamCharset} — NOTE(review): the SET directive may switch {@code decoder} to a
 *     different charset for the rest of the file, but this prologue itself is decoded with the
 *     charset chosen from the BOM (or the default)
 * @param streamCharset charset used to decode this prologue (UTF-8 if a BOM was consumed,
 *     otherwise the dictionary default)
 * @throws IOException if reading fails
 * @throws ParseException if a directive's argument is malformed
 */
private void readConfig(InputStream stream, Charset streamCharset)
    throws IOException, ParseException {
  LineNumberReader reader = new LineNumberReader(new InputStreamReader(stream, streamCharset));
  String line;
  String flagLine = null;
  boolean charsetFound = false;
  boolean flagFound = false;
  while ((line = reader.readLine()) != null) {
    if (line.isBlank()) continue;

    // Directives are identified by the first whitespace-delimited token on the line.
    String firstWord = line.split("\\s")[0];
    if ("SET".equals(firstWord)) {
      // Switch the decoder used for the rest of the affix/dic parsing.
      decoder = getDecoder(singleArgument(reader, line));
      charsetFound = true;
    } else if ("FLAG".equals(firstWord)) {
      // Preserve the flag line for parsing later since we need the decoder's charset
      // and just in case they come out of order.
      flagLine = line;
      flagFound = true;
    } else {
      // Not a directive we care about here; keep scanning without the early-exit check below.
      continue;
    }

    // Both directives found — no need to scan the rest of the prologue.
    if (charsetFound && flagFound) {
      break;
    }
  }

  if (flagFound) {
    // Deferred until now so the FLAG line is interpreted with the final (possibly
    // SET-updated) charset, regardless of SET/FLAG ordering in the file.
    flagParsingStrategy = getFlagParsingStrategy(flagLine, decoder.charset());
  }
}
821
839
0 commit comments