
Commit 12223a4

feat: Optimize SmartCn dictionaries and add loading tests
- Replaced ByteBuffer with BinaryReader for efficiency.
- Used ReadOnlySpan<char> in BigramDictionary.
- Added tests for dictionary loading from embedded resources.
- Embedded bigramDict.dct and coreDict.dct.
1 parent a0578d6 commit 12223a4

8 files changed (+181, -102 lines)


src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs

Lines changed: 2 additions & 2 deletions
@@ -162,7 +162,7 @@ public virtual long Hash1(char c)
        /// </summary>
        /// <param name="carray">character array</param>
        /// <returns>hashcode</returns>
-        public virtual long Hash1(char[] carray)
+        public virtual long Hash1(ReadOnlySpan<char> carray)
        {
            long p = 1099511628211L;
            long hash = unchecked((long)0xcbf29ce484222325L);
@@ -210,7 +210,7 @@ public virtual int Hash2(char c)
        /// </summary>
        /// <param name="carray">character array</param>
        /// <returns>hashcode</returns>
-        public virtual int Hash2(char[] carray)
+        public virtual int Hash2(ReadOnlySpan<char> carray)
        {
            int hash = 5381;
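As a side note on the signature change above: char[] converts implicitly to ReadOnlySpan<char> and strings expose AsSpan(), so existing array-based callers keep compiling while string-based callers avoid a ToCharArray() copy. A minimal standalone sketch (the CountNonAscii helper is hypothetical, standing in for Hash1/Hash2):

using System;

class SpanCallerSketch
{
    // Hypothetical helper with the same parameter shape as the new Hash1/Hash2 overloads.
    static int CountNonAscii(ReadOnlySpan<char> carray)
    {
        int n = 0;
        foreach (char c in carray)
            if (c > 0x7F) n++;
        return n;
    }

    static void Main()
    {
        char[] asArray = "啊hello".ToCharArray();
        Console.WriteLine(CountNonAscii(asArray));            // char[] converts implicitly to ReadOnlySpan<char>
        Console.WriteLine(CountNonAscii("啊hello".AsSpan())); // string callers avoid an extra array allocation
    }
}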

src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs

Lines changed: 63 additions & 59 deletions
@@ -254,80 +254,84 @@ private void Load(string dictRoot)
        /// <summary>
        /// Load the datafile into this <see cref="BigramDictionary"/>
        /// </summary>
-        /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param>
+        /// <param name="dctFilePath">Path to the Bigramdictionary (bigramDict.dct)</param>
        /// <exception cref="IOException">If there is a low-level I/O error</exception>
        public virtual void LoadFromFile(string dctFilePath)
        {
-            int i, cnt, length, total = 0;
-            // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
-            // The 3756th is used (as a header) to store information.
-            int[]
-            buffer = new int[3];
-            byte[] intBuffer = new byte[4];
-            string tmpword;
-            //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r"))
+            // Position of special header entry in the file structure
+            const int HEADER_POSITION = 3755;
+            // Maximum valid length for word entries to prevent loading corrupted data
+            const int MAX_VALID_LENGTH = 1000;
+
+            // Open file for reading in binary mode
            using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read);
+            using var reader = new BinaryReader(dctFile);

-            // GB2312 characters 0 - 6768
-            for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
+            try
            {
-                string currentStr = GetCCByGB2312Id(i);
-                // if (i == 5231)
-                //     System.out.println(i);
-
-                dctFile.Read(intBuffer, 0, intBuffer.Length);
-                // the dictionary was developed for C, and byte order must be converted to work with Java
-                cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
-                if (cnt <= 0)
-                {
-                    continue;
-                }
-                total += cnt;
-                int j = 0;
-                while (j < cnt)
+                // Iterate through all GB2312 characters in the valid range
+                for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
                {
-                    dctFile.Read(intBuffer, 0, intBuffer.Length);
-                    buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
-                        .GetInt32();// frequency
-                    dctFile.Read(intBuffer, 0, intBuffer.Length);
-                    buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
-                        .GetInt32();// length
-                    dctFile.Read(intBuffer, 0, intBuffer.Length);
-                    // buffer[2] = ByteBuffer.wrap(intBuffer).order(
-                    // ByteOrder.LITTLE_ENDIAN).getInt();// handle
-
-                    length = buffer[1];
-                    if (length > 0)
+                    // Get the current Chinese character
+                    string currentStr = GetCCByGB2312Id(i);
+                    // Read the count of words starting with this character
+                    int cnt = reader.ReadInt32();
+
+                    // Skip if no words start with this character
+                    if (cnt <= 0) continue;
+
+                    // Process all words for the current character
+                    for (int j = 0; j < cnt; j++)
                    {
-                        byte[] lchBuffer = new byte[length];
-                        dctFile.Read(lchBuffer, 0, lchBuffer.Length);
-                        //tmpword = new String(lchBuffer, "GB2312");
-                        tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
-                        //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
-                        if (i != 3755 + GB2312_FIRST_CHAR)
-                        {
-                            tmpword = currentStr + tmpword;
-                        }
-                        char[] carray = tmpword.ToCharArray();
-                        long hashId = Hash1(carray);
-                        int index = GetAvaliableIndex(hashId, carray);
-                        if (index != -1)
+                        // Read word metadata
+                        int frequency = reader.ReadInt32(); // How often this word appears
+                        int length = reader.ReadInt32(); // Length of the word in bytes
+                        reader.ReadInt32(); // Skip handle value (unused)
+
+                        // Validate word length and ensure we don't read past the file end
+                        if (length > 0 && length <= MAX_VALID_LENGTH && dctFile.Position + length <= dctFile.Length)
                        {
-                            if (bigramHashTable[index] == 0)
+                            // Read the word bytes and convert to string
+                            byte[] lchBuffer = reader.ReadBytes(length);
+                            string tmpword = gb2312Encoding.GetString(lchBuffer);
+
+                            // For regular entries (not header entries), prepend the current character
+                            if (i != HEADER_POSITION + GB2312_FIRST_CHAR)
+                            {
+                                tmpword = currentStr + tmpword;
+                            }
+
+                            // Create a span for efficient string handling
+                            ReadOnlySpan<char> carray = tmpword.AsSpan();
+                            // Generate hash for the word
+                            long hashId = Hash1(carray);
+                            // Find available slot in hash table
+                            int index = GetAvaliableIndex(hashId, carray);
+
+                            // Store word if a valid index was found
+                            if (index != -1)
                            {
-                                bigramHashTable[index] = hashId;
-                                // bigramStringTable[index] = tmpword;
+                                // Set hash ID if slot is empty
+                                if (bigramHashTable[index] == 0)
+                                {
+                                    bigramHashTable[index] = hashId;
+                                }
+                                // Add word frequency to the table
+                                frequencyTable[index] += frequency;
                            }
-                            frequencyTable[index] += buffer[0];
                        }
                    }
-                    j++;
                }
            }
+            // Handle expected end-of-file condition silently
+            catch (EndOfStreamException) { /* Reached end of file */ }
+            // Re-throw IO exceptions as required by contract
+            catch (IOException) { /* Re-throw as per method contract */ throw; }
+
+            // Note: Commented out logging statement
            // log.info("load dictionary done! " + dctFilePath + " total:" + total);
        }
-
-        private int GetAvaliableIndex(long hashId, char[] carray)
+        private int GetAvaliableIndex(long hashId, ReadOnlySpan<char> carray)
        {
            int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
            int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH;
@@ -357,7 +361,7 @@ private int GetAvaliableIndex(long hashId, char[] carray)
        /// <summary>
        /// lookup the index into the frequency array.
        /// </summary>
-        private int GetBigramItemIndex(char[] carray)
+        private int GetBigramItemIndex(ReadOnlySpan<char> carray)
        {
            long hashId = Hash1(carray);
            int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
@@ -388,7 +392,7 @@ private int GetBigramItemIndex(char[] carray)
            return -1;
        }

-        public int GetFrequency(char[] carray)
+        public int GetFrequency(ReadOnlySpan<char> carray)
        {
            int index = GetBigramItemIndex(carray);
            if (index != -1)
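Regarding the ByteBuffer-to-BinaryReader switch in this file (and in WordDictionary.cs below): BinaryReader.ReadInt32 reads a little-endian 32-bit value directly from the stream, which matches the byte order the .dct files were written in, so the manual intBuffer/ByteBuffer conversion is no longer needed. A small self-contained sketch of that behavior:

using System;
using System.IO;

class LittleEndianReadSketch
{
    static void Main()
    {
        // 10 encoded as a little-endian 32-bit integer, the same layout the old
        // ByteBuffer.Wrap(...).SetOrder(ByteOrder.LittleEndian).GetInt32() call decoded.
        byte[] data = { 0x0A, 0x00, 0x00, 0x00 };
        using var reader = new BinaryReader(new MemoryStream(data));
        Console.WriteLine(reader.ReadInt32()); // prints 10
    }
}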

src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs

Lines changed: 31 additions & 41 deletions
@@ -340,80 +340,70 @@ private void SaveToObj(FileInfo serialObj)
        /// <summary>
        /// Load the datafile into this <see cref="WordDictionary"/>
        /// </summary>
-        /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param>
+        /// <param name="dctFilePath">path to word dictionary (coreDict.dct)</param>
        /// <returns>number of words read</returns>
        /// <exception cref="IOException">If there is a low-level I/O error.</exception>
        private int LoadMainDataFromFile(string dctFilePath)
        {
-            int i, cnt, length, total = 0;
-            // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
-            // The 3756th is used (as a header) to store information.
-            int[]
-            buffer = new int[3];
-            byte[] intBuffer = new byte[4];
-            string tmpword;
+            // Counter for total number of words loaded
+            int total = 0;
+
+            // Open the dictionary file for binary reading
            using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
+            using (var reader = new BinaryReader(dctFile))
            {
-
-                // GB2312 characters 0 - 6768
-                for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
+                // Process each Chinese character in the GB2312 encoding range
+                for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
                {
-                    // if (i == 5231)
-                    //     System.out.println(i);
+                    // Read number of words starting with this character
+                    int cnt = reader.ReadInt32();

-                    dctFile.Read(intBuffer, 0, intBuffer.Length);
-                    // the dictionary was developed for C, and byte order must be converted to work with Java
-                    cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
+                    // If no words start with this character, set arrays to null and skip
                    if (cnt <= 0)
                    {
                        wordItem_charArrayTable[i] = null;
                        wordItem_frequencyTable[i] = null;
                        continue;
                    }
+
+                    // Initialize arrays to store words and their frequencies
                    wordItem_charArrayTable[i] = new char[cnt][];
                    wordItem_frequencyTable[i] = new int[cnt];
                    total += cnt;
-                    int j = 0;
-                    while (j < cnt)
+
+                    // Process each word for the current character
+                    for (int j = 0; j < cnt; j++)
                    {
-                        // wordItemTable[i][j] = new WordItem();
-                        dctFile.Read(intBuffer, 0, intBuffer.Length);
-                        buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
-                            .GetInt32();// frequency
-                        dctFile.Read(intBuffer, 0, intBuffer.Length);
-                        buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
-                            .GetInt32();// length
-                        dctFile.Read(intBuffer, 0, intBuffer.Length);
-                        buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
-                            .GetInt32();// handle
-
-                        // wordItemTable[i][j].frequency = buffer[0];
-                        wordItem_frequencyTable[i][j] = buffer[0];
-
-                        length = buffer[1];
+                        // Read word metadata
+                        int frequency = reader.ReadInt32(); // Word usage frequency
+                        int length = reader.ReadInt32(); // Length of word in bytes
+                        reader.ReadInt32(); // Skip handle (unused)
+
+                        // Store word frequency
+                        wordItem_frequencyTable[i][j] = frequency;
+
+                        // Process word data if it exists
                        if (length > 0)
                        {
-                            byte[] lchBuffer = new byte[length];
-                            dctFile.Read(lchBuffer, 0, lchBuffer.Length);
-                            tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
+                            // Read word bytes and convert to character array
+                            byte[] lchBuffer = reader.ReadBytes(length);
+                            string tmpword = gb2312Encoding.GetString(lchBuffer);
                            wordItem_charArrayTable[i][j] = tmpword.ToCharArray();
                        }
                        else
                        {
-                            // wordItemTable[i][j].charArray = null;
+                            // No word data, set to null
                            wordItem_charArrayTable[i][j] = null;
                        }
-                        // System.out.println(indexTable[i].wordItems[j]);
-                        j++;
                    }

+                    // Map the character to its index in the lookup tables
                    string str = GetCCByGB2312Id(i);
                    SetTableIndex(str[0], i);
                }
            }
-            return total;
+            return total; // Return total number of words loaded
        }
-
        /// <summary>
        /// The original lexicon puts all information with punctuation into a
        /// chart (from 1 to 3755). Here it then gets expanded, separately being

src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj

Lines changed: 5 additions & 0 deletions
@@ -64,4 +64,9 @@
    <PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" />
  </ItemGroup>

+  <ItemGroup>
+    <InternalsVisibleTo Include="Lucene.Net.Tests.Analysis.SmartCn" />
+  </ItemGroup>
+
+
 </Project>
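The InternalsVisibleTo item above is presumably what lets the new test fixture call the internal Hhmm dictionary types directly; at build time the SDK is expected to turn the item into the usual assembly-level attribute, roughly:

// Approximate generated output (assumption: the repo's SDK/targets emit this into an AssemblyInfo file).
using System.Runtime.CompilerServices;

[assembly: InternalsVisibleTo("Lucene.Net.Tests.Analysis.SmartCn")]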
Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+using Lucene.Net.Util;
+using Lucene.Net.Analysis.Cn.Smart.Hhmm;
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Reflection;
+
+[TestFixture]
+public class DictionaryTests : LuceneTestCase
+{
+    private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramDict.dct";
+
+    [Test, Category("Dictionary")]
+    public void TestBigramDictionary()
+    {
+        // Extract embedded resource
+        using var resourceStream = GetResourceStream(BigramResourceName);
+
+        // Copy to temp file
+        FileInfo _tempFile = CreateTempFile("bigramDict", ".dct");
+        CopyStreamToFile(resourceStream, _tempFile);
+
+        // Verify temp file
+        Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty.");
+
+        // Test BigramDictionary functionality
+        BigramDictionary bigramDict = BigramDictionary.GetInstance();
+        bigramDict.LoadFromFile(_tempFile.FullName);
+
+        Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()), "Frequency for '啊hello' is incorrect.");
+        Assert.AreEqual(20, bigramDict.GetFrequency("阿world".AsSpan()), "Frequency for '阿world' is incorrect.");
+    }
+
+    [Test, Category("Dictionary")]
+    public void TestWordDictionaryGetInstance()
+    {
+        // Test WordDictionary functionality using GetInstance()
+        WordDictionary wordDict = WordDictionary.GetInstance();
+
+        // Verify that the WordDictionary instance is not null
+        Assert.NotNull(wordDict, "WordDictionary.GetInstance() returned null.");
+
+    }
+
+    private Stream GetResourceStream(string resourceName)
+    {
+        var stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName);
+        Assert.NotNull(stream, $"Resource '{resourceName}' not found!");
+        Assert.IsTrue(stream.Length > 0, "Resource Stream is empty");
+        return stream;
+    }
+
+    private void CopyStreamToFile(Stream stream, FileInfo file)
+    {
+        try
+        {
+            stream.Position = 0;
+            using var outputStream = File.Create(file.FullName);
+            stream.CopyTo(outputStream);
+        }
+        catch (Exception ex)
+        {
+            Assert.Fail($"Failed to copy stream to file: {ex.Message}");
+        }
+    }
+
+    private new FileInfo CreateTempFile(string prefix, string extension)
+    {
+        string tempFileName = Path.Combine(
+            Path.GetTempPath(),
+            $"{prefix}_{Guid.NewGuid():N}{extension}"
+        );
+        return new FileInfo(tempFileName);
+    }
+}
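With the Category attribute on these tests, they can presumably be run in isolation through the standard VSTest category filter, for example:

dotnet test --filter "Category=Dictionary" src/Lucene.Net.Tests.Analysis.SmartCn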

src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj

Lines changed: 5 additions & 0 deletions
@@ -62,5 +62,10 @@
  <ItemGroup Condition=" '$(TargetFramework)' == 'net472' ">
    <PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" />
  </ItemGroup>
+  <ItemGroup>
+    <EmbeddedResource Include="Resources\bigramDict.dct" />
+    <EmbeddedResource Include="Resources\coreDict.dct" />
+  </ItemGroup>
+

 </Project>
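The embedded resource names these items produce follow the usual <RootNamespace>.<Folder>.<FileName> convention, which is why the test above looks up "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramDict.dct" (assuming the project's root namespace matches the assembly name). A quick way to check the generated names from inside the test assembly:

using System;
using System.Reflection;

class ListEmbeddedResources
{
    static void Main()
    {
        // Prints every manifest resource name in the executing assembly,
        // e.g. "...Resources.bigramDict.dct" and "...Resources.coreDict.dct".
        foreach (string name in Assembly.GetExecutingAssembly().GetManifestResourceNames())
            Console.WriteLine(name);
    }
}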
Binary files (not shown): the two embedded dictionary resources, Resources\bigramDict.dct and Resources\coreDict.dct (42 Bytes and 26.5 KB).
