Commit 061b3f2

LUCENE-9740: scan affix stream once. (apache#2327)
1 parent f93cbb3

2 files changed, +61 -39 lines changed

Diff for: lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (+57 -39)

```diff
@@ -18,11 +18,11 @@
 
 import java.io.BufferedInputStream;
 import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
-import java.io.OutputStream;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
@@ -70,6 +70,9 @@
 
 /** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */
 public class Dictionary {
+  // Derived from woorm/ openoffice dictionaries.
+  // See TestAllDictionaries.testMaxPrologueNeeded.
+  static final int MAX_PROLOGUE_SCAN_WINDOW = 30 * 1024;
 
   static final char[] NOFLAGS = new char[0];
 
@@ -227,26 +230,37 @@ public Dictionary(
     this.needsInputCleaning = ignoreCase;
     this.needsOutputCleaning = false; // set if we have an OCONV
 
-    Path tempPath = getDefaultTempDir(); // TODO: make this configurable?
-    Path aff = Files.createTempFile(tempPath, "affix", "aff");
-
-    BufferedInputStream aff1 = null;
-    InputStream aff2 = null;
-    boolean success = false;
-    try {
-      // Copy contents of the affix stream to a temp file.
-      try (OutputStream os = Files.newOutputStream(aff)) {
-        affix.transferTo(os);
+    try (BufferedInputStream affixStream =
+        new BufferedInputStream(affix, MAX_PROLOGUE_SCAN_WINDOW) {
+          @Override
+          public void close() throws IOException {
+            // TODO: maybe we should consume and close it? Why does it need to stay open?
+            // Don't close the affix stream as per javadoc.
+          }
+        }) {
+      // I assume we don't support other BOMs (utf16, etc.)? We trivially could,
+      // by adding maybeConsume() with a proper bom... but I don't see hunspell repo to have
+      // any such exotic examples.
+      Charset streamCharset;
+      if (maybeConsume(affixStream, BOM_UTF8)) {
+        streamCharset = StandardCharsets.UTF_8;
+      } else {
+        streamCharset = DEFAULT_CHARSET;
       }
 
-      // pass 1: get encoding & flag
-      aff1 = new BufferedInputStream(Files.newInputStream(aff));
-      readConfig(aff1);
+      /*
+       * pass 1: look for encoding & flag. This is simple but works. We just prefetch
+       * a large enough chunk of the input and scan through it. The buffered data will
+       * be subsequently reused anyway so nothing is wasted.
+       */
+      affixStream.mark(MAX_PROLOGUE_SCAN_WINDOW);
+      byte[] prologue = affixStream.readNBytes(MAX_PROLOGUE_SCAN_WINDOW - 1);
+      affixStream.reset();
+      readConfig(new ByteArrayInputStream(prologue), streamCharset);
 
       // pass 2: parse affixes
-      aff2 = new BufferedInputStream(Files.newInputStream(aff));
       FlagEnumerator flagEnumerator = new FlagEnumerator();
-      readAffixFile(aff2, decoder, flagEnumerator);
+      readAffixFile(affixStream, decoder, flagEnumerator);
 
       // read dictionary entries
       IndexOutput unsorted = mergeDictionaries(tempDir, tempFileNamePrefix, dictionaries, decoder);
@@ -255,14 +269,6 @@ public Dictionary(
       flagLookup = flagEnumerator.finish();
       aliases = null; // no longer needed
       morphAliases = null; // no longer needed
-      success = true;
-    } finally {
-      IOUtils.closeWhileHandlingException(aff1, aff2);
-      if (success) {
-        Files.delete(aff);
-      } else {
-        IOUtils.deleteFilesIgnoringExceptions(aff);
-      }
     }
   }
 
```
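The heart of the commit is the hunk above: rather than spooling the affix stream into a temp file and opening it twice, the constructor now marks the buffered stream, prefetches the prologue, and resets before the full parse. A minimal standalone sketch of that mark/readNBytes/reset pattern, with illustrative (non-Lucene) names:

```java
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

class PrologueScanSketch {
  // Mirrors Dictionary.MAX_PROLOGUE_SCAN_WINDOW (30 * 1024 in the commit).
  static final int WINDOW = 30 * 1024;

  static void readTwice(InputStream raw) throws IOException {
    // Sizing the buffer to the scan window up front avoids the buffer being
    // regrown while the mark is held; mark(WINDOW) keeps reset() valid until
    // WINDOW bytes have been read past the mark.
    BufferedInputStream in = new BufferedInputStream(raw, WINDOW);

    // Pass 1: prefetch the prologue and rewind. The bytes stay in the buffer,
    // so pass 2 re-reads them for free instead of hitting the source again.
    in.mark(WINDOW);
    byte[] prologue = in.readNBytes(WINDOW - 1);
    in.reset();
    scanPrologue(new ByteArrayInputStream(prologue)); // stand-in for readConfig

    // Pass 2: parse the whole stream from the beginning.
    parseAll(in); // stand-in for readAffixFile
  }

  static void scanPrologue(InputStream in) throws IOException { /* ... */ }

  static void parseAll(InputStream in) throws IOException { /* ... */ }
}
```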
```diff
@@ -349,6 +355,7 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
       if (line.isEmpty()) continue;
 
       String firstWord = line.split("\\s")[0];
+      // TODO: convert to a switch?
       if ("AF".equals(firstWord)) {
         parseAlias(line);
       } else if ("AM".equals(firstWord)) {
@@ -458,6 +465,12 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
           checkCompoundPatterns.add(
               new CheckCompoundPattern(reader.readLine(), flagParsingStrategy, this));
         }
+      } else if ("SET".equals(firstWord)) {
+        // We could add some sanity-checking whether set command is identical to what was
+        // parsed in readConfig. This would handle cases of flags too far in the file or
+        // duplicated (both are incorrect, I assume).
+      } else if ("FLAG".equals(firstWord)) {
+        // Similar for FLAG.
       }
     }
 
```
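The `// TODO: convert to a switch?` note invites a string switch over firstWord. A sketch of what that might look like; every handler name except parseAlias is a placeholder rather than an actual Dictionary method:

```java
// Sketch only: a switch-based dispatch for readAffixFile's directive handling.
void handleDirective(String firstWord, String line) {
  switch (firstWord) {
    case "AF":
      parseAlias(line);
      break;
    case "AM":
      parseMorphAlias(line); // placeholder for the real AM handler
      break;
    case "SET":
    case "FLAG":
      // Already consumed by readConfig during the prologue scan; ignored here.
      break;
    default:
      // Unrecognized or unhandled directives are skipped.
      break;
  }
}
```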
```diff
@@ -791,31 +804,36 @@ private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
   private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
 
   /** Parses the encoding and flag format specified in the provided InputStream */
-  private void readConfig(BufferedInputStream stream) throws IOException, ParseException {
-    // I assume we don't support other BOMs (utf16, etc.)? We trivially could,
-    // by adding maybeConsume() with a proper bom... but I don't see hunspell repo to have
-    // any such exotic examples.
-    Charset streamCharset;
-    if (maybeConsume(stream, BOM_UTF8)) {
-      streamCharset = StandardCharsets.UTF_8;
-    } else {
-      streamCharset = DEFAULT_CHARSET;
-    }
-
-    // TODO: can these flags change throughout the file? If not then we can abort sooner. And
-    // then we wouldn't even need to create a temp file for the affix stream - a large enough
-    // leading buffer (BufferedInputStream) would be sufficient?
+  private void readConfig(InputStream stream, Charset streamCharset)
+      throws IOException, ParseException {
     LineNumberReader reader = new LineNumberReader(new InputStreamReader(stream, streamCharset));
     String line;
+    String flagLine = null;
+    boolean charsetFound = false;
+    boolean flagFound = false;
     while ((line = reader.readLine()) != null) {
       if (line.isBlank()) continue;
 
       String firstWord = line.split("\\s")[0];
       if ("SET".equals(firstWord)) {
         decoder = getDecoder(singleArgument(reader, line));
+        charsetFound = true;
       } else if ("FLAG".equals(firstWord)) {
-        flagParsingStrategy = getFlagParsingStrategy(line, decoder.charset());
+        // Preserve the flag line for parsing later since we need the decoder's charset
+        // and just in case they come out of order.
+        flagLine = line;
+        flagFound = true;
+      } else {
+        continue;
       }
+
+      if (charsetFound && flagFound) {
+        break;
+      }
+    }
+
+    if (flagFound) {
+      flagParsingStrategy = getFlagParsingStrategy(flagLine, decoder.charset());
     }
   }
 
```
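maybeConsume(affixStream, BOM_UTF8) is a private Dictionary helper whose body is not part of this diff. Assuming it peeks at the next bytes and consumes them only on a full match, a mark/reset-based implementation could look roughly like this (a sketch, not the committed code):

```java
import java.io.BufferedInputStream;
import java.io.IOException;

final class BomSketch {
  /**
   * Consumes {@code prefix} from the stream if it is the next run of bytes;
   * otherwise rewinds so the caller sees the stream untouched.
   */
  static boolean maybeConsume(BufferedInputStream in, byte[] prefix) throws IOException {
    in.mark(prefix.length);
    for (byte expected : prefix) {
      int b = in.read(); // 0..255, or -1 at end of stream
      if (b != (expected & 0xff)) {
        in.reset(); // not a match: leave the stream where it started
        return false;
      }
    }
    return true; // full match: the prefix stays consumed
  }
}
```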
Diff for: lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java (+4)

```diff
@@ -45,6 +45,7 @@
 import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
 import org.apache.lucene.util.NamedThreadFactory;
 import org.apache.lucene.util.RamUsageTester;
+import org.junit.Assert;
 import org.junit.Assume;
 import org.junit.Ignore;
 
@@ -135,6 +136,9 @@ public void testMaxPrologueNeeded() throws Exception {
         (flag, positions) -> {
           long max = positions.stream().mapToLong(v -> v).max().orElse(0);
           System.out.printf(Locale.ROOT, "Flag %s at maximum offset %s%n", flag, max);
+          Assert.assertTrue(
+              "Flags beyond max prologue scan window: " + max,
+              max < Dictionary.MAX_PROLOGUE_SCAN_WINDOW);
         });
 
     if (failTest.get()) {
```
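The new assertion ties the dictionary corpus back to MAX_PROLOGUE_SCAN_WINDOW: every flag directive offset observed by the test must fall inside the 30 KB prologue. As a rough illustration, a hypothetical helper for measuring where a directive line starts in a single .aff file (the real test aggregates offsets per flag across all sampled dictionaries):

```java
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;

final class DirectiveOffsetSketch {
  /**
   * Returns the byte offset at which a directive line such as "SET" or "FLAG"
   * starts, or -1 if the file has none. ISO-8859-1 maps bytes 1:1 to chars,
   * so String indices equal byte offsets here.
   */
  static long directiveOffset(Path aff, String directive) throws IOException {
    String text = new String(Files.readAllBytes(aff), StandardCharsets.ISO_8859_1);
    if (text.startsWith(directive + " ")) {
      return 0;
    }
    int pos = text.indexOf("\n" + directive + " ");
    return pos < 0 ? -1 : pos + 1;
  }
}
```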