refactor: apply review suggestions for SmartCn dictionary classes

NehanPathan · NehanPathan · commit fa8fc77b1598 · 2025-04-08T14:47:21.000+05:30
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
@@ -254,82 +254,81 @@ private void Load(string dictRoot)
         /// <summary>
         /// Load the datafile into this <see cref="BigramDictionary"/>
         /// </summary>
-        /// <param name="dctFilePath">Path to the Bigramdictionary (bigramDict.dct)</param>
+        /// <param name="dctFilePath">Path to the bigram dictionary (bigramdict.dct)</param>
         /// <exception cref="IOException">If there is a low-level I/O error</exception>
         public virtual void LoadFromFile(string dctFilePath)
         {
-            // Position of special header entry in the file structure
+            // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760).
+            // The 3756th is used (as a header) to store information.
+
+            // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way.
+            // LUCENENET specific - refactored constants for clarity
             const int HEADER_POSITION = 3755;
-            // Maximum valid length for word entries to prevent loading corrupted data
             const int MAX_VALID_LENGTH = 1000;
 
-            // Open file for reading in binary mode
+            //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r"))
             using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read);
             using var reader = new BinaryReader(dctFile);
 
-            try
+            // GB2312 characters 0 - 6768
+            for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
             {
-                // Iterate through all GB2312 characters in the valid range
-                for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
+
+                string currentStr = GetCCByGB2312Id(i); 
+                int cnt;
+                try
+                {
+                   cnt = reader.ReadInt32();  // LUCENENET: Use BinaryReader methods instead of ByteBuffer
+                }
+                catch (EndOfStreamException)
                 {
-                    // Get the current Chinese character
-                    string currentStr = GetCCByGB2312Id(i);
-                    // Read the count of words starting with this character
-                    int cnt = reader.ReadInt32();
+                    // Reached end of file
+                    break;
+                }
+
+                if (cnt <= 0)
+                {
+                    continue;
+                }
 
-                    // Skip if no words start with this character
-                    if (cnt <= 0) continue;
+                for (int j = 0; j < cnt; j++)
+                {
+                    // LUCENENET: Use BinaryReader methods instead of ByteBuffer
+                    int frequency = reader.ReadInt32();
+                    int length = reader.ReadInt32();
+                    reader.ReadInt32();  // Skip handle value (unused)
 
-                    // Process all words for the current character
-                    for (int j = 0; j < cnt; j++)
+                    if (length > 0 && length <= MAX_VALID_LENGTH && dctFile.Position + length <= dctFile.Length)
                     {
-                        // Read word metadata
-                        int frequency = reader.ReadInt32();  // How often this word appears
-                        int length = reader.ReadInt32();     // Length of the word in bytes
-                        reader.ReadInt32();                  // Skip handle value (unused)
+                        byte[] lchBuffer = reader.ReadBytes(length);  // LUCENENET: Use BinaryReader methods instead of ByteBuffer
 
-                        // Validate word length and ensure we don't read past the file end
-                        if (length > 0 && length <= MAX_VALID_LENGTH && dctFile.Position + length <= dctFile.Length)
-                        {
-                            // Read the word bytes and convert to string
-                            byte[] lchBuffer = reader.ReadBytes(length);
-                            string tmpword = gb2312Encoding.GetString(lchBuffer);
+                        //tmpword = new String(lchBuffer, "GB2312");
+                        string tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
+                        //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
 
-                            // For regular entries (not header entries), prepend the current character
-                            if (i != HEADER_POSITION + GB2312_FIRST_CHAR)
-                            {
-                                tmpword = currentStr + tmpword;
-                            }
 
-                            // Create a span for efficient string handling
-                            ReadOnlySpan<char> carray = tmpword.AsSpan();
-                            // Generate hash for the word
-                            long hashId = Hash1(carray);
-                            // Find available slot in hash table
-                            int index = GetAvaliableIndex(hashId, carray);
+                        if (i != HEADER_POSITION + GB2312_FIRST_CHAR)
+                        {
+                            tmpword = currentStr + tmpword;
+                        }
+
+                        ReadOnlySpan<char> carray = tmpword.AsSpan();
+                        long hashId = Hash1(carray);
+                        int index = GetAvaliableIndex(hashId, carray);
 
-                            // Store word if a valid index was found
-                            if (index != -1)
+                        if (index != -1)
+                        {
+                            if (bigramHashTable[index] == 0)
                             {
-                                // Set hash ID if slot is empty
-                                if (bigramHashTable[index] == 0)
-                                {
-                                    bigramHashTable[index] = hashId;
-                                }
-                                // Add word frequency to the table
-                                frequencyTable[index] += frequency;
+                                bigramHashTable[index] = hashId;
+                                // bigramStringTable[index] = tmpword;
+
                             }
+                            frequencyTable[index] += frequency;
                         }
                     }
                 }
             }
-            // Handle expected end-of-file condition silently
-            catch (EndOfStreamException) { /* Reached end of file */ }
-            // Re-throw IO exceptions as required by contract
-            catch (IOException) { /* Re-throw as per method contract */ throw; }
-
-            // Note: Commented out logging statement
-            // log.info("load dictionary done! " + dctFilePath + " total:" + total);
         }
         private int GetAvaliableIndex(long hashId, ReadOnlySpan<char> carray)
         {
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs
@@ -340,70 +340,67 @@ private void SaveToObj(FileInfo serialObj)
         /// <summary>
         /// Load the datafile into this <see cref="WordDictionary"/>
         /// </summary>
-        /// <param name="dctFilePath">path to word dictionary (coreDict.dct)</param>
-        /// <returns>number of words read</returns>
+        /// <param name="dctFilePath">Path to the word dictionary (coredict.dct)</param>
+        /// <returns>Number of words read</returns>
         /// <exception cref="IOException">If there is a low-level I/O error.</exception>
         private int LoadMainDataFromFile(string dctFilePath)
         {
-            // Counter for total number of words loaded
             int total = 0;
 
-            // Open the dictionary file for binary reading
+            // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760).
+            // The 3756th is used (as a header) to store information.
+
+            // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way.
+            // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading.
+
             using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
             using (var reader = new BinaryReader(dctFile))
             {
-                // Process each Chinese character in the GB2312 encoding range
+                // GB2312 characters 0 - 6768
                 for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
                 {
-                    // Read number of words starting with this character
-                    int cnt = reader.ReadInt32();
+                    int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer
 
-                    // If no words start with this character, set arrays to null and skip
                     if (cnt <= 0)
                     {
                         wordItem_charArrayTable[i] = null;
                         wordItem_frequencyTable[i] = null;
                         continue;
                     }
 
-                    // Initialize arrays to store words and their frequencies
                     wordItem_charArrayTable[i] = new char[cnt][];
                     wordItem_frequencyTable[i] = new int[cnt];
                     total += cnt;
 
-                    // Process each word for the current character
                     for (int j = 0; j < cnt; j++)
                     {
-                        // Read word metadata
-                        int frequency = reader.ReadInt32();  // Word usage frequency
-                        int length = reader.ReadInt32();     // Length of word in bytes
-                        reader.ReadInt32();                  // Skip handle (unused)
+                        // LUCENENET: Use BinaryReader methods instead of ByteBuffer
+                        int frequency = reader.ReadInt32();
+                        int length = reader.ReadInt32();
+                        reader.ReadInt32(); // Skip handle (unused)
 
-                        // Store word frequency
                         wordItem_frequencyTable[i][j] = frequency;
 
-                        // Process word data if it exists
                         if (length > 0)
                         {
-                            // Read word bytes and convert to character array
                             byte[] lchBuffer = reader.ReadBytes(length);
-                            string tmpword = gb2312Encoding.GetString(lchBuffer);
+                            string tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET: Use cached encoding instance from base class
                             wordItem_charArrayTable[i][j] = tmpword.ToCharArray();
                         }
                         else
                         {
-                            // No word data, set to null
                             wordItem_charArrayTable[i][j] = null;
                         }
                     }
 
-                    // Map the character to its index in the lookup tables
                     string str = GetCCByGB2312Id(i);
                     SetTableIndex(str[0], i);
                 }
             }
-            return total;  // Return total number of words loaded
+
+            return total;
         }
+
         /// <summary>
         /// The original lexicon puts all information with punctuation into a
         /// chart (from 1 to 3755). Here it then gets expanded, separately being
diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs b/src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs
@@ -1,29 +1,28 @@
 using Lucene.Net.Util;
 using Lucene.Net.Analysis.Cn.Smart.Hhmm;
+using Lucene.Net.Attributes;
 using NUnit.Framework;
 using System;
 using System.IO;
 using System.Reflection;
 
+
 [TestFixture]
+[LuceneNetSpecific]
 public class DictionaryTests : LuceneTestCase
 {
-    private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramDict.dct";
+    private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct";
 
     [Test, Category("Dictionary")]
     public void TestBigramDictionary()
     {
-        // Extract embedded resource
         using var resourceStream = GetResourceStream(BigramResourceName);
 
-        // Copy to temp file
-        FileInfo _tempFile = CreateTempFile("bigramDict", ".dct");
+        FileInfo _tempFile = CreateTempFile("bigramdict", ".dct");
         CopyStreamToFile(resourceStream, _tempFile);
 
-        // Verify temp file
         Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty.");
 
-        // Test BigramDictionary functionality
         BigramDictionary bigramDict = BigramDictionary.GetInstance();
         bigramDict.LoadFromFile(_tempFile.FullName);
 
@@ -34,10 +33,8 @@ public void TestBigramDictionary()
     [Test, Category("Dictionary")]
     public void TestWordDictionaryGetInstance()
     {
-        // Test WordDictionary functionality using GetInstance()
         WordDictionary wordDict = WordDictionary.GetInstance();
 
-        // Verify that the WordDictionary instance is not null
         Assert.NotNull(wordDict, "WordDictionary.GetInstance() returned null.");
 
     }
diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj b/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj
@@ -63,8 +63,8 @@
     <PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" />
   </ItemGroup>
   <ItemGroup>
-  <EmbeddedResource Include="Resources\bigramDict.dct" />
-  <EmbeddedResource Include="Resources\coreDict.dct" />
+  <EmbeddedResource Include="Resources\bigramdict.dct" />
+  <EmbeddedResource Include="Resources\coredict.dct" />
 </ItemGroup>