Skip to content

Commit 03af992

Browse files
feat: Optimize SmartCn Dictionaries and Add Dictionary Loading Tests (#1154)
* feat: Optimize SmartCn dictionaries and add loading tests - Replaced ByteBuffer with BinaryReader for efficiency. - Used ReadOnlySpan<char> in BigramDictionary. - Added tests for dictionary loading from embedded resources. - Embedded bigramDict.dct and coreDict.dct. * refactor: apply review suggestions for SmartCn dictionary classes * Fix casing for bigramdict.dct and coredict.dct to lowercase for case-sensitive OSes * Revert breaking changes and restore compatibility; update tests for Bigram and WordDictionary * Improve SmartCN tests: Replace file existence checks with asserts, refine maxlength usage * Optimize dictionary loading: skip unused handle with Stream.Seek * Fix: add final newline and remove trailing whitespace in multiple files * Update SmartCn dictionary tests and BigramDictionary loading * Update BigramDictionary, i.e., update LoadFromFile to throw IOException * Lucene.Net.Analysis.Cn.Smart.Hhmm.TestBuildDictionary: Modified the test data with known frequency values to verify the custom data set is loaded. * Revert LoadFromFile length check to match upstream Lucene behavior --------- Co-authored-by: Shad Storhaug <[email protected]>
1 parent 0976c0c commit 03af992

File tree

14 files changed

+384
-62
lines changed

14 files changed

+384
-62
lines changed

src/Lucene.Net.Analysis.SmartCn/AnalyzerProfile.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ private static void Init()
8181
return;
8282
}
8383

84+
8485
try
8586
{
8687
while (new DirectoryInfo(currentPath).Parent != null)

src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,9 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
3333
internal abstract class AbstractDictionary
3434
{
3535
// LUCENENET specific: cached GB2312 encoding to avoid repeated calls to Encoding.GetEncoding("GB2312")
36-
protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312");
36+
protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312",
37+
EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);
38+
3739

3840
/// <summary>
3941
/// First Chinese Character in GB2312 (15 * 94)
@@ -162,7 +164,7 @@ public virtual long Hash1(char c)
162164
/// </summary>
163165
/// <param name="carray">character array</param>
164166
/// <returns>hashcode</returns>
165-
public virtual long Hash1(char[] carray)
167+
public virtual long Hash1(ReadOnlySpan<char> carray)
166168
{
167169
long p = 1099511628211L;
168170
long hash = unchecked((long)0xcbf29ce484222325L);
@@ -210,7 +212,7 @@ public virtual int Hash2(char c)
210212
/// </summary>
211213
/// <param name="carray">character array</param>
212214
/// <returns>hashcode</returns>
213-
public virtual int Hash2(char[] carray)
215+
public virtual int Hash2(ReadOnlySpan<char> carray)
214216
{
215217
int hash = 5381;
216218

src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs

Lines changed: 38 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -254,30 +254,45 @@ private void Load(string dictRoot)
254254
/// <summary>
255255
/// Load the datafile into this <see cref="BigramDictionary"/>
256256
/// </summary>
257-
/// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param>
257+
/// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param>
258258
/// <exception cref="IOException">If there is a low-level I/O error</exception>
259259
public virtual void LoadFromFile(string dctFilePath)
260260
{
261261
int i, cnt, length, total = 0;
262+
262263
// The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
263264
// The 3756th is used (as a header) to store information.
264-
int[]
265-
buffer = new int[3];
266-
byte[] intBuffer = new byte[4];
265+
266+
Span<int> buffer = stackalloc int[3];
267267
string tmpword;
268+
269+
// LUCENENET: Removed intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way.
270+
// LUCENENET specific - refactored constants for clarity
271+
272+
// The 3756th position (using 1-based counting) corresponds to index 3755 (using 0-based indexing)
273+
// This matches the original Java implementation which used 3755 + GB2312_FIRST_CHAR in the condition
274+
const int HEADER_POSITION = 3755;
275+
268276
//using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r"))
269277
using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read);
278+
using var reader = new BinaryReader(dctFile);
270279

271280
// GB2312 characters 0 - 6768
272281
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
273282
{
283+
274284
string currentStr = GetCCByGB2312Id(i);
275285
// if (i == 5231)
276286
// System.out.println(i);
287+
try
288+
{
289+
cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
290+
}
291+
catch (EndOfStreamException ex)
292+
{
293+
throw new IOException($"Bigram dictionary file is incomplete at character index {i}.", ex);
294+
}
277295

278-
dctFile.Read(intBuffer, 0, intBuffer.Length);
279-
// the dictionary was developed for C, and byte order must be converted to work with Java
280-
cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
281296
if (cnt <= 0)
282297
{
283298
continue;
@@ -286,37 +301,37 @@ public virtual void LoadFromFile(string dctFilePath)
286301
int j = 0;
287302
while (j < cnt)
288303
{
289-
dctFile.Read(intBuffer, 0, intBuffer.Length);
290-
buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
291-
.GetInt32();// frequency
292-
dctFile.Read(intBuffer, 0, intBuffer.Length);
293-
buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
294-
.GetInt32();// length
295-
dctFile.Read(intBuffer, 0, intBuffer.Length);
296-
// buffer[2] = ByteBuffer.wrap(intBuffer).order(
297-
// ByteOrder.LITTLE_ENDIAN).getInt();// handle
304+
// LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
305+
buffer[0] = reader.ReadInt32(); // frequency
306+
buffer[1] = reader.ReadInt32(); // length
307+
reader.BaseStream.Seek(4, SeekOrigin.Current); // Skip handle value (unused)
298308

299309
length = buffer[1];
300310
if (length > 0)
301311
{
302-
byte[] lchBuffer = new byte[length];
303-
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
312+
byte[] lchBuffer = reader.ReadBytes(length); // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
313+
304314
//tmpword = new String(lchBuffer, "GB2312");
305315
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
306316
//tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
307-
if (i != 3755 + GB2312_FIRST_CHAR)
317+
318+
319+
if (i != HEADER_POSITION + GB2312_FIRST_CHAR)
308320
{
309321
tmpword = currentStr + tmpword;
310322
}
311-
char[] carray = tmpword.ToCharArray();
323+
324+
ReadOnlySpan<char> carray = tmpword.AsSpan();
312325
long hashId = Hash1(carray);
313326
int index = GetAvaliableIndex(hashId, carray);
327+
314328
if (index != -1)
315329
{
316330
if (bigramHashTable[index] == 0)
317331
{
318332
bigramHashTable[index] = hashId;
319333
// bigramStringTable[index] = tmpword;
334+
320335
}
321336
frequencyTable[index] += buffer[0];
322337
}
@@ -326,8 +341,7 @@ public virtual void LoadFromFile(string dctFilePath)
326341
}
327342
// log.info("load dictionary done! " + dctFilePath + " total:" + total);
328343
}
329-
330-
private int GetAvaliableIndex(long hashId, char[] carray)
344+
private int GetAvaliableIndex(long hashId, ReadOnlySpan<char> carray)
331345
{
332346
int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
333347
int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH;
@@ -357,7 +371,7 @@ private int GetAvaliableIndex(long hashId, char[] carray)
357371
/// <summary>
358372
/// lookup the index into the frequency array.
359373
/// </summary>
360-
private int GetBigramItemIndex(char[] carray)
374+
private int GetBigramItemIndex(ReadOnlySpan<char> carray)
361375
{
362376
long hashId = Hash1(carray);
363377
int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
@@ -388,7 +402,7 @@ private int GetBigramItemIndex(char[] carray)
388402
return -1;
389403
}
390404

391-
public int GetFrequency(char[] carray)
405+
public int GetFrequency(ReadOnlySpan<char> carray)
392406
{
393407
int index = GetBigramItemIndex(carray);
394408
if (index != -1)

src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -340,62 +340,62 @@ private void SaveToObj(FileInfo serialObj)
340340
/// <summary>
341341
/// Load the datafile into this <see cref="WordDictionary"/>
342342
/// </summary>
343-
/// <param name="dctFilePath">path to word dictionary (coredict.dct)</param>
344-
/// <returns>number of words read</returns>
343+
/// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param>
344+
/// <returns>Number of words read</returns>
345345
/// <exception cref="IOException">If there is a low-level I/O error.</exception>
346346
private int LoadMainDataFromFile(string dctFilePath)
347347
{
348348
int i, cnt, length, total = 0;
349-
// The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
349+
350+
// The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760).
350351
// The 3756th is used (as a header) to store information.
351-
int[]
352-
buffer = new int[3];
353-
byte[] intBuffer = new byte[4];
352+
353+
Span<int> buffer = stackalloc int[3];
354354
string tmpword;
355+
356+
// LUCENENET: Removed intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way.
357+
// LUCENENET: Use BinaryReader to simplify endian conversion and stream reading.
358+
355359
using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
360+
using (var reader = new BinaryReader(dctFile))
356361
{
357-
358362
// GB2312 characters 0 - 6768
359363
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
360364
{
361365
// if (i == 5231)
362366
// System.out.println(i);
363367

364-
dctFile.Read(intBuffer, 0, intBuffer.Length);
365-
// the dictionary was developed for C, and byte order must be converted to work with Java
366-
cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
368+
cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
369+
367370
if (cnt <= 0)
368371
{
369372
wordItem_charArrayTable[i] = null;
370373
wordItem_frequencyTable[i] = null;
371374
continue;
372375
}
376+
373377
wordItem_charArrayTable[i] = new char[cnt][];
374378
wordItem_frequencyTable[i] = new int[cnt];
375379
total += cnt;
376380
int j = 0;
377381
while (j < cnt)
378382
{
379383
// wordItemTable[i][j] = new WordItem();
380-
dctFile.Read(intBuffer, 0, intBuffer.Length);
381-
buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
382-
.GetInt32();// frequency
383-
dctFile.Read(intBuffer, 0, intBuffer.Length);
384-
buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
385-
.GetInt32();// length
386-
dctFile.Read(intBuffer, 0, intBuffer.Length);
387-
buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
388-
.GetInt32();// handle
384+
385+
// LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
386+
buffer[0] = reader.ReadInt32(); // frequency
387+
buffer[1] = reader.ReadInt32(); // length
388+
reader.BaseStream.Seek(4, SeekOrigin.Current); // Skip handle value (unused)
389389

390390
// wordItemTable[i][j].frequency = buffer[0];
391+
391392
wordItem_frequencyTable[i][j] = buffer[0];
392393

393394
length = buffer[1];
394395
if (length > 0)
395396
{
396-
byte[] lchBuffer = new byte[length];
397-
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
398-
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
397+
byte[] lchBuffer = reader.ReadBytes(length);
398+
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET: Use cached encoding instance from base class
399399
wordItem_charArrayTable[i][j] = tmpword.ToCharArray();
400400
}
401401
else
@@ -411,6 +411,7 @@ private int LoadMainDataFromFile(string dctFilePath)
411411
SetTableIndex(str[0], i);
412412
}
413413
}
414+
414415
return total;
415416
}
416417

src/Lucene.Net.Analysis.SmartCn/Hhmm/package.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,4 @@ summary: *content
2121
-->
2222

2323
SmartChineseAnalyzer Hidden Markov Model package.
24-
@lucene.experimental
24+
@lucene.experimental

src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,6 @@
3838
<NoWarn>$(NoWarn);1591;1573</NoWarn>
3939
</PropertyGroup>
4040

41-
42-
4341
<ItemGroup>
4442
<EmbeddedResource Include="Hhmm/*.mem" Label="Dict Test Data" />
4543
<EmbeddedResource Include="**/*.txt" Exclude="bin/**/*;obj/**/*" Label="Text Test Data" />
@@ -64,4 +62,8 @@
6462
<PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" />
6563
</ItemGroup>
6664

65+
<ItemGroup>
66+
<InternalsVisibleTo Include="Lucene.Net.Tests.Analysis.SmartCn" />
67+
</ItemGroup>
68+
6769
</Project>

src/Lucene.Net.Analysis.SmartCn/Properties/AssemblyInfo.cs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,5 +34,3 @@
3434

3535
// The following GUID is for the ID of the typelib if this project is exposed to COM
3636
[assembly: Guid("dba35edf-a0ff-4df7-ae4f-a103b01cd488")]
37-
38-

src/Lucene.Net.Analysis.SmartCn/overview.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@ summary: *content
2222

2323
Analyzer for Simplified Chinese, which indexes words.
2424

25-
For an introduction to Lucene's analysis API, see the [Lucene.Net.Analysis](../core/Lucene.Net.Analysis.html) namespace documentation.
25+
For an introduction to Lucene's analysis API, see the [Lucene.Net.Analysis](../core/Lucene.Net.Analysis.html) namespace documentation.

src/Lucene.Net.Analysis.SmartCn/package.md

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,11 @@ Analyzer for Simplified Chinese, which indexes words.
2525

2626
Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
2727

28-
* StandardAnalyzer: Index unigrams (individual Chinese characters) as a token.
28+
- StandardAnalyzer: Index unigrams (individual Chinese characters) as a token.
2929

30-
* CJKAnalyzer (in the <xref:Lucene.Net.Analysis.Cjk> namespace of <xref:Lucene.Net.Analysis.Common>): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
31-
32-
* SmartChineseAnalyzer (in this package): Index words (attempt to segment Chinese text into words) as tokens.
30+
- CJKAnalyzer (in the <xref:Lucene.Net.Analysis.Cjk> namespace of <xref:Lucene.Net.Analysis.Common>): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
3331

32+
- SmartChineseAnalyzer (in this package): Index words (attempt to segment Chinese text into words) as tokens.
3433

3534
Example phrase: "我是中国人"
3635

src/Lucene.Net.Replicator/Http/package.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@ uid: Lucene.Net.Replicator.Http
33
summary: *content
44
---
55

6-
7-
<!--
6+
<!--
87
Licensed to the Apache Software Foundation (ASF) under one or more
98
contributor license agreements. See the NOTICE file distributed with
109
this work for additional information regarding copyright ownership.

0 commit comments

Comments
 (0)