Commit 061b3f2

LUCENE-9740: scan affix stream once. (apache#2327)
1 parent f93cbb3

2 files changed, +61 -39 lines changed

Diff for: lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (+57 -39)

```diff
@@ -18,11 +18,11 @@
 
 import java.io.BufferedInputStream;
 import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
-import java.io.OutputStream;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
@@ -70,6 +70,9 @@
 
 /** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */
 public class Dictionary {
+  // Derived from woorm/ openoffice dictionaries.
+  // See TestAllDictionaries.testMaxPrologueNeeded.
+  static final int MAX_PROLOGUE_SCAN_WINDOW = 30 * 1024;
 
   static final char[] NOFLAGS = new char[0];
 
@@ -227,26 +230,37 @@ public Dictionary(
     this.needsInputCleaning = ignoreCase;
     this.needsOutputCleaning = false; // set if we have an OCONV
 
-    Path tempPath = getDefaultTempDir(); // TODO: make this configurable?
-    Path aff = Files.createTempFile(tempPath, "affix", "aff");
-
-    BufferedInputStream aff1 = null;
-    InputStream aff2 = null;
-    boolean success = false;
-    try {
-      // Copy contents of the affix stream to a temp file.
-      try (OutputStream os = Files.newOutputStream(aff)) {
-        affix.transferTo(os);
+    try (BufferedInputStream affixStream =
+        new BufferedInputStream(affix, MAX_PROLOGUE_SCAN_WINDOW) {
+          @Override
+          public void close() throws IOException {
+            // TODO: maybe we should consume and close it? Why does it need to stay open?
+            // Don't close the affix stream as per javadoc.
+          }
+        }) {
+      // I assume we don't support other BOMs (utf16, etc.)? We trivially could,
+      // by adding maybeConsume() with a proper bom... but I don't see hunspell repo to have
+      // any such exotic examples.
+      Charset streamCharset;
+      if (maybeConsume(affixStream, BOM_UTF8)) {
+        streamCharset = StandardCharsets.UTF_8;
+      } else {
+        streamCharset = DEFAULT_CHARSET;
       }
 
-      // pass 1: get encoding & flag
-      aff1 = new BufferedInputStream(Files.newInputStream(aff));
-      readConfig(aff1);
+      /*
+       * pass 1: look for encoding & flag. This is simple but works. We just prefetch
+       * a large enough chunk of the input and scan through it. The buffered data will
+       * be subsequently reused anyway so nothing is wasted.
+       */
+      affixStream.mark(MAX_PROLOGUE_SCAN_WINDOW);
+      byte[] prologue = affixStream.readNBytes(MAX_PROLOGUE_SCAN_WINDOW - 1);
+      affixStream.reset();
+      readConfig(new ByteArrayInputStream(prologue), streamCharset);
 
       // pass 2: parse affixes
-      aff2 = new BufferedInputStream(Files.newInputStream(aff));
       FlagEnumerator flagEnumerator = new FlagEnumerator();
-      readAffixFile(aff2, decoder, flagEnumerator);
+      readAffixFile(affixStream, decoder, flagEnumerator);
 
       // read dictionary entries
       IndexOutput unsorted = mergeDictionaries(tempDir, tempFileNamePrefix, dictionaries, decoder);
@@ -255,14 +269,6 @@ public Dictionary(
       flagLookup = flagEnumerator.finish();
       aliases = null; // no longer needed
       morphAliases = null; // no longer needed
-      success = true;
-    } finally {
-      IOUtils.closeWhileHandlingException(aff1, aff2);
-      if (success) {
-        Files.delete(aff);
-      } else {
-        IOUtils.deleteFilesIgnoringExceptions(aff);
-      }
     }
   }
 
```
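The heart of the commit is the hunk above: rather than spooling the affix stream into a temp file and opening it twice, the constructor now marks the buffered stream, prefetches the prologue, and resets before the full parse. A minimal standalone sketch of that mark/readNBytes/reset pattern, with illustrative (non-Lucene) names:

```java
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

class PrologueScanSketch {
  // Mirrors Dictionary.MAX_PROLOGUE_SCAN_WINDOW (30 * 1024 in the commit).
  static final int WINDOW = 30 * 1024;

  static void readTwice(InputStream raw) throws IOException {
    // Sizing the buffer to the scan window up front avoids the buffer being
    // regrown while the mark is held; mark(WINDOW) keeps reset() valid until
    // WINDOW bytes have been read past the mark.
    BufferedInputStream in = new BufferedInputStream(raw, WINDOW);

    // Pass 1: prefetch the prologue and rewind. The bytes stay in the buffer,
    // so pass 2 re-reads them for free instead of hitting the source again.
    in.mark(WINDOW);
    byte[] prologue = in.readNBytes(WINDOW - 1);
    in.reset();
    scanPrologue(new ByteArrayInputStream(prologue)); // stand-in for readConfig

    // Pass 2: parse the whole stream from the beginning.
    parseAll(in); // stand-in for readAffixFile
  }

  static void scanPrologue(InputStream in) throws IOException { /* ... */ }

  static void parseAll(InputStream in) throws IOException { /* ... */ }
}
```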
```diff
@@ -349,6 +355,7 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
       if (line.isEmpty()) continue;
 
       String firstWord = line.split("\\s")[0];
+      // TODO: convert to a switch?
       if ("AF".equals(firstWord)) {
         parseAlias(line);
       } else if ("AM".equals(firstWord)) {
@@ -458,6 +465,12 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
           checkCompoundPatterns.add(
               new CheckCompoundPattern(reader.readLine(), flagParsingStrategy, this));
         }
+      } else if ("SET".equals(firstWord)) {
+        // We could add some sanity-checking whether set command is identical to what was
+        // parsed in readConfig. This would handle cases of flags too far in the file or
+        // duplicated (both are incorrect, I assume).
+      } else if ("FLAG".equals(firstWord)) {
+        // Similar for FLAG.
       }
     }
 
```
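The `// TODO: convert to a switch?` note invites a string switch over firstWord. A sketch of what that might look like; every handler name except parseAlias is a placeholder rather than an actual Dictionary method:

```java
// Sketch only: a switch-based dispatch for readAffixFile's directive handling.
void handleDirective(String firstWord, String line) {
  switch (firstWord) {
    case "AF":
      parseAlias(line);
      break;
    case "AM":
      parseMorphAlias(line); // placeholder for the real AM handler
      break;
    case "SET":
    case "FLAG":
      // Already consumed by readConfig during the prologue scan; ignored here.
      break;
    default:
      // Unrecognized or unhandled directives are skipped.
      break;
  }
}
```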
```diff
@@ -791,31 +804,36 @@ private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
   private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
 
   /** Parses the encoding and flag format specified in the provided InputStream */
-  private void readConfig(BufferedInputStream stream) throws IOException, ParseException {
-    // I assume we don't support other BOMs (utf16, etc.)? We trivially could,
-    // by adding maybeConsume() with a proper bom... but I don't see hunspell repo to have
-    // any such exotic examples.
-    Charset streamCharset;
-    if (maybeConsume(stream, BOM_UTF8)) {
-      streamCharset = StandardCharsets.UTF_8;
-    } else {
-      streamCharset = DEFAULT_CHARSET;
-    }
-
-    // TODO: can these flags change throughout the file? If not then we can abort sooner. And
-    // then we wouldn't even need to create a temp file for the affix stream - a large enough
-    // leading buffer (BufferedInputStream) would be sufficient?
+  private void readConfig(InputStream stream, Charset streamCharset)
+      throws IOException, ParseException {
     LineNumberReader reader = new LineNumberReader(new InputStreamReader(stream, streamCharset));
     String line;
+    String flagLine = null;
+    boolean charsetFound = false;
+    boolean flagFound = false;
     while ((line = reader.readLine()) != null) {
       if (line.isBlank()) continue;
 
       String firstWord = line.split("\\s")[0];
       if ("SET".equals(firstWord)) {
         decoder = getDecoder(singleArgument(reader, line));
+        charsetFound = true;
       } else if ("FLAG".equals(firstWord)) {
-        flagParsingStrategy = getFlagParsingStrategy(line, decoder.charset());
+        // Preserve the flag line for parsing later since we need the decoder's charset
+        // and just in case they come out of order.
+        flagLine = line;
+        flagFound = true;
+      } else {
+        continue;
       }
+
+      if (charsetFound && flagFound) {
+        break;
+      }
+    }
+
+    if (flagFound) {
+      flagParsingStrategy = getFlagParsingStrategy(flagLine, decoder.charset());
     }
   }
 
```
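maybeConsume(affixStream, BOM_UTF8) is a private Dictionary helper whose body is not part of this diff. Assuming it peeks at the next bytes and consumes them only on a full match, a mark/reset-based implementation could look roughly like this (a sketch, not the committed code):

```java
import java.io.BufferedInputStream;
import java.io.IOException;

final class BomSketch {
  /**
   * Consumes {@code prefix} from the stream if it is the next run of bytes;
   * otherwise rewinds so the caller sees the stream untouched.
   */
  static boolean maybeConsume(BufferedInputStream in, byte[] prefix) throws IOException {
    in.mark(prefix.length);
    for (byte expected : prefix) {
      int b = in.read(); // 0..255, or -1 at end of stream
      if (b != (expected & 0xff)) {
        in.reset(); // not a match: leave the stream where it started
        return false;
      }
    }
    return true; // full match: the prefix stays consumed
  }
}
```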
Diff for: lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java (+4)

```diff
@@ -45,6 +45,7 @@
 import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
 import org.apache.lucene.util.NamedThreadFactory;
 import org.apache.lucene.util.RamUsageTester;
+import org.junit.Assert;
 import org.junit.Assume;
 import org.junit.Ignore;
 
@@ -135,6 +136,9 @@ public void testMaxPrologueNeeded() throws Exception {
         (flag, positions) -> {
           long max = positions.stream().mapToLong(v -> v).max().orElse(0);
           System.out.printf(Locale.ROOT, "Flag %s at maximum offset %s%n", flag, max);
+          Assert.assertTrue(
+              "Flags beyond max prologue scan window: " + max,
+              max < Dictionary.MAX_PROLOGUE_SCAN_WINDOW);
         });
 
     if (failTest.get()) {
```
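The new assertion ties the dictionary corpus back to MAX_PROLOGUE_SCAN_WINDOW: every flag directive offset observed by the test must fall inside the 30 KB prologue. As a rough illustration, a hypothetical helper for measuring where a directive line starts in a single .aff file (the real test aggregates offsets per flag across all sampled dictionaries):

```java
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;

final class DirectiveOffsetSketch {
  /**
   * Returns the byte offset at which a directive line such as "SET" or "FLAG"
   * starts, or -1 if the file has none. ISO-8859-1 maps bytes 1:1 to chars,
   * so String indices equal byte offsets here.
   */
  static long directiveOffset(Path aff, String directive) throws IOException {
    String text = new String(Files.readAllBytes(aff), StandardCharsets.ISO_8859_1);
    if (text.startsWith(directive + " ")) {
      return 0;
    }
    int pos = text.indexOf("\n" + directive + " ");
    return pos < 0 ? -1 : pos + 1;
  }
}
```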