feat: Optimize SmartCn Dictionaries and Add Dictionary Loading Tests (#1154)

NehanPathan · NightOwl888 · web-flow · commit 03af9924f5cd · 2025-10-08T14:37:18.000+05:30
* feat: Optimize SmartCn dictionaries and add loading tests

- Replaced ByteBuffer with BinaryReader for efficiency.
- Used ReadOnlySpan<char> in BigramDictionary.
- Added tests for dictionary loading from embedded resources.
- Embedded bigramDict.dct and coreDict.dct.

* refactor: apply review suggestions for SmartCn dictionary classes

* Fix casing for bigramdict.dct and coredict.dct to lowercase for case-sensitive OSes

* Revert breaking changes and restore compatibility; update tests for Bigram and WordDictionary

* Improve SmartCN tests: Replace file existence checks with asserts, refine maxlength usage

* Optimize dictionary loading: skip unused handle with Stream.Seek

* Fix: add final newline and remove trailing whitespace in multiple files

* Update SmartCn dictionary tests and BigramDictionary loading

* Update BigramDictionary, i.e., update LoadFromFile to throw IOException

* Lucene.Net.Analysis.Cn.Smart.Hhmm.TestBuildDictionary: Modified the test data with known frequency values to verify the custom data set is loaded.

* Revert LoadFromFile length check to match upstream Lucene behavior

---------

Co-authored-by: Shad Storhaug <shad@shadstorhaug.com>
diff --git a/src/Lucene.Net.Analysis.SmartCn/AnalyzerProfile.cs b/src/Lucene.Net.Analysis.SmartCn/AnalyzerProfile.cs
@@ -81,6 +81,7 @@ private static void Init()
                 return;
             }
 
+
             try
             {
                 while (new DirectoryInfo(currentPath).Parent != null)
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
@@ -33,7 +33,9 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
     internal abstract class AbstractDictionary
     {
         // LUCENENET specific: cached GB2312 encoding to avoid repeated calls to Encoding.GetEncoding("GB2312")
-        protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312");
+        protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312",
+    EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);
+
 
         /// <summary>
         /// First Chinese Character in GB2312 (15 * 94)
@@ -162,7 +164,7 @@ public virtual long Hash1(char c)
         /// </summary>
         /// <param name="carray">character array</param>
         /// <returns>hashcode</returns>
-        public virtual long Hash1(char[] carray)
+        public virtual long Hash1(ReadOnlySpan<char> carray)
         {
             long p = 1099511628211L;
             long hash = unchecked((long)0xcbf29ce484222325L);
@@ -210,7 +212,7 @@ public virtual int Hash2(char c)
         /// </summary>
         /// <param name="carray">character array</param>
         /// <returns>hashcode</returns>
-        public virtual int Hash2(char[] carray)
+        public virtual int Hash2(ReadOnlySpan<char> carray)
         {
             int hash = 5381;
 
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
@@ -254,30 +254,45 @@ private void Load(string dictRoot)
         /// <summary>
         /// Load the datafile into this <see cref="BigramDictionary"/>
         /// </summary>
-        /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param>
+        /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param>
         /// <exception cref="IOException">If there is a low-level I/O error</exception>
         public virtual void LoadFromFile(string dctFilePath)
         {
             int i, cnt, length, total = 0;
+
             // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
             // The 3756th is used (as a header) to store information.
-            int[]
-            buffer = new int[3];
-            byte[] intBuffer = new byte[4];
+
+            Span<int> buffer = stackalloc int[3];
             string tmpword;
+
+            // LUCENENET: Removed intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way.
+            // LUCENENET specific - refactored constants for clarity
+
+            // The 3756th position (using 1-based counting) corresponds to index 3755 (using 0-based indexing)
+            // This matches the original Java implementation which used 3755 + GB2312_FIRST_CHAR in the condition
+            const int HEADER_POSITION = 3755;
+
             //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r"))
             using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read);
+            using var reader = new BinaryReader(dctFile);
 
             // GB2312 characters 0 - 6768
             for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
             {
+
                 string currentStr = GetCCByGB2312Id(i);
                 // if (i == 5231)
                 // System.out.println(i);
+                try
+                {
+                    cnt = reader.ReadInt32();  // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
+                }
+                catch (EndOfStreamException ex)
+                {
+                    throw new IOException($"Bigram dictionary file is incomplete at character index {i}.", ex);
+                }
 
-                dctFile.Read(intBuffer, 0, intBuffer.Length);
-                // the dictionary was developed for C, and byte order must be converted to work with Java
-                cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
                 if (cnt <= 0)
                 {
                     continue;
@@ -286,37 +301,37 @@ public virtual void LoadFromFile(string dctFilePath)
                 int j = 0;
                 while (j < cnt)
                 {
-                    dctFile.Read(intBuffer, 0, intBuffer.Length);
-                    buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
-                        .GetInt32();// frequency
-                    dctFile.Read(intBuffer, 0, intBuffer.Length);
-                    buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
-                        .GetInt32();// length
-                    dctFile.Read(intBuffer, 0, intBuffer.Length);
-                    // buffer[2] = ByteBuffer.wrap(intBuffer).order(
-                    // ByteOrder.LITTLE_ENDIAN).getInt();// handle
+                    // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
+                    buffer[0] = reader.ReadInt32(); // frequency
+                    buffer[1] = reader.ReadInt32(); // length
+                    reader.BaseStream.Seek(4, SeekOrigin.Current); // Skip handle value (unused)
 
                     length = buffer[1];
                     if (length > 0)
                     {
-                        byte[] lchBuffer = new byte[length];
-                        dctFile.Read(lchBuffer, 0, lchBuffer.Length);
+                        byte[] lchBuffer = reader.ReadBytes(length);  // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
+
                         //tmpword = new String(lchBuffer, "GB2312");
                         tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
                         //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
-                        if (i != 3755 + GB2312_FIRST_CHAR)
+
+
+                        if (i != HEADER_POSITION + GB2312_FIRST_CHAR)
                         {
                             tmpword = currentStr + tmpword;
                         }
-                        char[] carray = tmpword.ToCharArray();
+
+                        ReadOnlySpan<char> carray = tmpword.AsSpan();
                         long hashId = Hash1(carray);
                         int index = GetAvaliableIndex(hashId, carray);
+
                         if (index != -1)
                         {
                             if (bigramHashTable[index] == 0)
                             {
                                 bigramHashTable[index] = hashId;
                                 // bigramStringTable[index] = tmpword;
+
                             }
                             frequencyTable[index] += buffer[0];
                         }
@@ -326,8 +341,7 @@ public virtual void LoadFromFile(string dctFilePath)
             }
             // log.info("load dictionary done! " + dctFilePath + " total:" + total);
         }
-
-        private int GetAvaliableIndex(long hashId, char[] carray)
+        private int GetAvaliableIndex(long hashId, ReadOnlySpan<char> carray)
         {
             int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
             int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH;
@@ -357,7 +371,7 @@ private int GetAvaliableIndex(long hashId, char[] carray)
         /// <summary>
         /// lookup the index into the frequency array.
         /// </summary>
-        private int GetBigramItemIndex(char[] carray)
+        private int GetBigramItemIndex(ReadOnlySpan<char> carray)
         {
             long hashId = Hash1(carray);
             int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
@@ -388,7 +402,7 @@ private int GetBigramItemIndex(char[] carray)
                 return -1;
         }
 
-        public int GetFrequency(char[] carray)
+        public int GetFrequency(ReadOnlySpan<char> carray)
         {
             int index = GetBigramItemIndex(carray);
             if (index != -1)
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs
@@ -340,62 +340,62 @@ private void SaveToObj(FileInfo serialObj)
         /// <summary>
         /// Load the datafile into this <see cref="WordDictionary"/>
         /// </summary>
-        /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param>
-        /// <returns>number of words read</returns>
+        /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param>
+        /// <returns>Number of words read</returns>
         /// <exception cref="IOException">If there is a low-level I/O error.</exception>
         private int LoadMainDataFromFile(string dctFilePath)
         {
             int i, cnt, length, total = 0;
-            // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
+
+            // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760).
             // The 3756th is used (as a header) to store information.
-            int[]
-            buffer = new int[3];
-            byte[] intBuffer = new byte[4];
+
+            Span<int> buffer = stackalloc int[3];
             string tmpword;
+
+            // LUCENENET: Removed intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way.
+            // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading.
+
             using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
+            using (var reader = new BinaryReader(dctFile))
             {
-
                 // GB2312 characters 0 - 6768
                 for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
                 {
                     // if (i == 5231)
                     // System.out.println(i);
 
-                    dctFile.Read(intBuffer, 0, intBuffer.Length);
-                    // the dictionary was developed for C, and byte order must be converted to work with Java
-                    cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
+                    cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
+
                     if (cnt <= 0)
                     {
                         wordItem_charArrayTable[i] = null;
                         wordItem_frequencyTable[i] = null;
                         continue;
                     }
+
                     wordItem_charArrayTable[i] = new char[cnt][];
                     wordItem_frequencyTable[i] = new int[cnt];
                     total += cnt;
                     int j = 0;
                     while (j < cnt)
                     {
                         // wordItemTable[i][j] = new WordItem();
-                        dctFile.Read(intBuffer, 0, intBuffer.Length);
-                        buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
-                            .GetInt32();// frequency
-                        dctFile.Read(intBuffer, 0, intBuffer.Length);
-                        buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
-                            .GetInt32();// length
-                        dctFile.Read(intBuffer, 0, intBuffer.Length);
-                        buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
-                            .GetInt32();// handle
+
+                        // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
+                        buffer[0] = reader.ReadInt32(); // frequency
+                        buffer[1] = reader.ReadInt32(); // length
+                        reader.BaseStream.Seek(4, SeekOrigin.Current); // Skip handle value (unused)
 
                         // wordItemTable[i][j].frequency = buffer[0];
+
                         wordItem_frequencyTable[i][j] = buffer[0];
 
                         length = buffer[1];
                         if (length > 0)
                         {
-                            byte[] lchBuffer = new byte[length];
-                            dctFile.Read(lchBuffer, 0, lchBuffer.Length);
-                            tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
+                            byte[] lchBuffer = reader.ReadBytes(length);
+                            tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET: Use cached encoding instance from base class
                             wordItem_charArrayTable[i][j] = tmpword.ToCharArray();
                         }
                         else
@@ -411,6 +411,7 @@ private int LoadMainDataFromFile(string dctFilePath)
                     SetTableIndex(str[0], i);
                 }
             }
+
             return total;
         }
 
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/package.md b/src/Lucene.Net.Analysis.SmartCn/Hhmm/package.md
@@ -21,4 +21,4 @@ summary: *content
 -->
 
 SmartChineseAnalyzer Hidden Markov Model package.
-@lucene.experimental
+@lucene.experimental
diff --git a/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj b/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj
@@ -38,8 +38,6 @@
     <NoWarn>$(NoWarn);1591;1573</NoWarn>
   </PropertyGroup>
 
-
-
   <ItemGroup>
     <EmbeddedResource Include="Hhmm/*.mem" Label="Dict Test Data" />
     <EmbeddedResource Include="**/*.txt" Exclude="bin/**/*;obj/**/*" Label="Text Test Data" />
@@ -64,4 +62,8 @@
     <PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" />
   </ItemGroup>
 
+  <ItemGroup>
+    <InternalsVisibleTo Include="Lucene.Net.Tests.Analysis.SmartCn" />
+  </ItemGroup>
+
 </Project>
diff --git a/src/Lucene.Net.Analysis.SmartCn/Properties/AssemblyInfo.cs b/src/Lucene.Net.Analysis.SmartCn/Properties/AssemblyInfo.cs
@@ -34,5 +34,3 @@
 
 // The following GUID is for the ID of the typelib if this project is exposed to COM
 [assembly: Guid("dba35edf-a0ff-4df7-ae4f-a103b01cd488")]
-
-
diff --git a/src/Lucene.Net.Analysis.SmartCn/overview.md b/src/Lucene.Net.Analysis.SmartCn/overview.md
@@ -22,4 +22,4 @@ summary: *content
 
 Analyzer for Simplified Chinese, which indexes words.
 
-For an introduction to Lucene's analysis API, see the [Lucene.Net.Analysis](../core/Lucene.Net.Analysis.html) namespace documentation. 
+For an introduction to Lucene's analysis API, see the [Lucene.Net.Analysis](../core/Lucene.Net.Analysis.html) namespace documentation.
diff --git a/src/Lucene.Net.Analysis.SmartCn/package.md b/src/Lucene.Net.Analysis.SmartCn/package.md
@@ -25,12 +25,11 @@ Analyzer for Simplified Chinese, which indexes words.
 
 Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
 
-*   StandardAnalyzer: Index unigrams (individual Chinese characters) as a token.
+- StandardAnalyzer: Index unigrams (individual Chinese characters) as a token.
 
-*   CJKAnalyzer (in the <xref:Lucene.Net.Analysis.Cjk> namespace of <xref:Lucene.Net.Analysis.Common>): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
-
-*   SmartChineseAnalyzer (in this package): Index words (attempt to segment Chinese text into words) as tokens.
+- CJKAnalyzer (in the <xref:Lucene.Net.Analysis.Cjk> namespace of <xref:Lucene.Net.Analysis.Common>): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
 
+- SmartChineseAnalyzer (in this package): Index words (attempt to segment Chinese text into words) as tokens.
 
 Example phrase： "我是中国人"
 
diff --git a/src/Lucene.Net.Replicator/Http/package.md b/src/Lucene.Net.Replicator/Http/package.md
@@ -3,8 +3,7 @@ uid: Lucene.Net.Replicator.Http
 summary: *content
 ---
 
-
-<!-- 
+<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements.  See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
diff --git a/src/Lucene.Net.Replicator/overview.md b/src/Lucene.Net.Replicator/overview.md
@@ -20,4 +20,4 @@ summary: *content
  limitations under the License.
 -->
 
-Provides index files replication capabilities.
+Provides index files replication capabilities.
diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/Hhmm/TestBuildDictionary.cs b/src/Lucene.Net.Tests.Analysis.SmartCn/Hhmm/TestBuildDictionary.cs
diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/Hhmm/custom-dictionary-input.zip b/src/Lucene.Net.Tests.Analysis.SmartCn/Hhmm/custom-dictionary-input.zip
diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj b/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj

Original file line number	Diff line number	Diff line change
`@@ -81,6 +81,7 @@ private static void Init()`
`81`	`81`	`return;`
`82`	`82`	`}`
`83`	`83`
	`84`	`+`
`84`	`85`	`try`
`85`	`86`	`{`
`86`	`87`	`while (new DirectoryInfo(currentPath).Parent != null)`
Original file line number	Diff line number	Diff line change
`@@ -22,4 +22,4 @@ summary: *content`
`22`	`22`
`23`	`23`	`Analyzer for Simplified Chinese, which indexes words.`
`24`	`24`
`25`		`-For an introduction to Lucene's analysis API, see the [Lucene.Net.Analysis](../core/Lucene.Net.Analysis.html) namespace documentation.`
	`25`	`+For an introduction to Lucene's analysis API, see the [Lucene.Net.Analysis](../core/Lucene.Net.Analysis.html) namespace documentation.`