
Commit 76d55f6

refactor: apply review suggestions for SmartCn dictionary classes
1 parent 12223a4 commit 76d55f6

File tree (4 files changed, +75 -83 lines):

src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs
src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs
src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj


src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs

Lines changed: 50 additions & 52 deletions
@@ -254,82 +254,80 @@ private void Load(string dictRoot)
         /// <summary>
         /// Load the datafile into this <see cref="BigramDictionary"/>
         /// </summary>
-        /// <param name="dctFilePath">Path to the Bigramdictionary (bigramDict.dct)</param>
+        /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param>
         /// <exception cref="IOException">If there is a low-level I/O error</exception>
         public virtual void LoadFromFile(string dctFilePath)
         {
-            // Position of special header entry in the file structure
+            // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
+            // The 3756th is used (as a header) to store information.
+
+            // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way.
+            // LUCENENET specific - refactored constants for clarity
             const int HEADER_POSITION = 3755;
-            // Maximum valid length for word entries to prevent loading corrupted data
             const int MAX_VALID_LENGTH = 1000;

-            // Open file for reading in binary mode
+            //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r"))
             using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read);
             using var reader = new BinaryReader(dctFile);

-            try
+            // GB2312 characters 0 - 6768
+            for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
             {
-                // Iterate through all GB2312 characters in the valid range
-                for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
+
+                string currentStr = GetCCByGB2312Id(i);
+                int cnt;
+                try
+                {
+                    cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer
+                }
+                catch (EndOfStreamException)
                 {
-                    // Get the current Chinese character
-                    string currentStr = GetCCByGB2312Id(i);
-                    // Read the count of words starting with this character
-                    int cnt = reader.ReadInt32();
+                    break;
+                }
+
+                if (cnt <= 0)
+                {
+                    continue;
+                }

-                    // Skip if no words start with this character
-                    if (cnt <= 0) continue;
+                for (int j = 0; j < cnt; j++)
+                {
+                    // LUCENENET: Use BinaryReader methods instead of ByteBuffer
+                    int frequency = reader.ReadInt32();
+                    int length = reader.ReadInt32();
+                    reader.ReadInt32(); // Skip handle value (unused)

-                    // Process all words for the current character
-                    for (int j = 0; j < cnt; j++)
+                    if (length > 0 && length <= MAX_VALID_LENGTH && dctFile.Position + length <= dctFile.Length)
                     {
-                        // Read word metadata
-                        int frequency = reader.ReadInt32(); // How often this word appears
-                        int length = reader.ReadInt32(); // Length of the word in bytes
-                        reader.ReadInt32(); // Skip handle value (unused)
+                        byte[] lchBuffer = reader.ReadBytes(length); // LUCENENET: Use BinaryReader methods instead of ByteBuffer

-                        // Validate word length and ensure we don't read past the file end
-                        if (length > 0 && length <= MAX_VALID_LENGTH && dctFile.Position + length <= dctFile.Length)
-                        {
-                            // Read the word bytes and convert to string
-                            byte[] lchBuffer = reader.ReadBytes(length);
-                            string tmpword = gb2312Encoding.GetString(lchBuffer);
+                        //tmpword = new String(lchBuffer, "GB2312");
+                        string tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
+                        //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);

-                            // For regular entries (not header entries), prepend the current character
-                            if (i != HEADER_POSITION + GB2312_FIRST_CHAR)
-                            {
-                                tmpword = currentStr + tmpword;
-                            }

-                            // Create a span for efficient string handling
-                            ReadOnlySpan<char> carray = tmpword.AsSpan();
-                            // Generate hash for the word
-                            long hashId = Hash1(carray);
-                            // Find available slot in hash table
-                            int index = GetAvaliableIndex(hashId, carray);
+                        if (i != HEADER_POSITION + GB2312_FIRST_CHAR)
+                        {
+                            tmpword = currentStr + tmpword;
+                        }
+
+                        ReadOnlySpan<char> carray = tmpword.AsSpan();
+                        long hashId = Hash1(carray);
+                        int index = GetAvaliableIndex(hashId, carray);

-                            // Store word if a valid index was found
-                            if (index != -1)
+                        if (index != -1)
+                        {
+                            if (bigramHashTable[index] == 0)
                             {
-                                // Set hash ID if slot is empty
-                                if (bigramHashTable[index] == 0)
-                                {
-                                    bigramHashTable[index] = hashId;
-                                }
-                                // Add word frequency to the table
-                                frequencyTable[index] += frequency;
+                                bigramHashTable[index] = hashId;
+                                // bigramStringTable[index] = tmpword;
+
                             }
+                            frequencyTable[index] += frequency;
                         }
                     }
                 }
             }
-            // Handle expected end-of-file condition silently
-            catch (EndOfStreamException) { /* Reached end of file */ }
-            // Re-throw IO exceptions as required by contract
-            catch (IOException) { /* Re-throw as per method contract */ throw; }
-
-            // Note: Commented out logging statement
-            // log.info("load dictionary done! " + dctFilePath + " total:" + total);
         }
         private int GetAvaliableIndex(long hashId, ReadOnlySpan<char> carray)
         {
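
Note on the new control flow above: the refactor drops the method-wide try/catch and instead treats end-of-file as a normal stop only when reading the per-character count, while each record is consumed with plain BinaryReader calls. The sketch below is a standalone illustration of that record layout (per character: an int count, then per entry an int frequency, an int length, a skipped int handle, and length bytes of GB2312 text); the class and method names are hypothetical and this is not the Lucene.NET source. It assumes the System.Text.Encoding.CodePages package, which the test project already references.

using System;
using System.IO;
using System.Text;

internal static class DctReadSketch
{
    // Illustrative reader for the .dct record layout described above; names are hypothetical.
    public static void DumpEntries(string dctFilePath, int charCount)
    {
        // GB2312 is only available after registering the CodePages provider.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
        Encoding gb2312 = Encoding.GetEncoding("GB2312");

        using var file = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read);
        using var reader = new BinaryReader(file);

        for (int i = 0; i < charCount; i++)
        {
            int cnt;
            try
            {
                cnt = reader.ReadInt32();   // number of entries recorded for this character
            }
            catch (EndOfStreamException)
            {
                break;                      // truncated file: stop, as the patched loop does
            }

            for (int j = 0; j < cnt; j++)
            {
                int frequency = reader.ReadInt32();
                int length = reader.ReadInt32();
                reader.ReadInt32();         // handle value, unused
                if (length > 0)
                {
                    string word = gb2312.GetString(reader.ReadBytes(length));
                    Console.WriteLine($"[{i}] {word} ({frequency})");
                }
            }
        }
    }
}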

src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs

Lines changed: 18 additions & 21 deletions
@@ -340,70 +340,67 @@ private void SaveToObj(FileInfo serialObj)
         /// <summary>
         /// Load the datafile into this <see cref="WordDictionary"/>
         /// </summary>
-        /// <param name="dctFilePath">path to word dictionary (coreDict.dct)</param>
-        /// <returns>number of words read</returns>
+        /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param>
+        /// <returns>Number of words read</returns>
         /// <exception cref="IOException">If there is a low-level I/O error.</exception>
         private int LoadMainDataFromFile(string dctFilePath)
         {
-            // Counter for total number of words loaded
             int total = 0;

-            // Open the dictionary file for binary reading
+            // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760).
+            // The 3756th is used (as a header) to store information.
+
+            // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way.
+            // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading.
+
            using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
            using (var reader = new BinaryReader(dctFile))
            {
-                // Process each Chinese character in the GB2312 encoding range
+                // GB2312 characters 0 - 6768
                for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
                {
-                    // Read number of words starting with this character
-                    int cnt = reader.ReadInt32();
+                    int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer

-                    // If no words start with this character, set arrays to null and skip
                    if (cnt <= 0)
                    {
                        wordItem_charArrayTable[i] = null;
                        wordItem_frequencyTable[i] = null;
                        continue;
                    }

-                    // Initialize arrays to store words and their frequencies
                    wordItem_charArrayTable[i] = new char[cnt][];
                    wordItem_frequencyTable[i] = new int[cnt];
                    total += cnt;

-                    // Process each word for the current character
                    for (int j = 0; j < cnt; j++)
                    {
-                        // Read word metadata
-                        int frequency = reader.ReadInt32(); // Word usage frequency
-                        int length = reader.ReadInt32(); // Length of word in bytes
-                        reader.ReadInt32(); // Skip handle (unused)
+                        // LUCENENET: Use BinaryReader methods instead of ByteBuffer
+                        int frequency = reader.ReadInt32();
+                        int length = reader.ReadInt32();
+                        reader.ReadInt32(); // Skip handle (unused)

-                        // Store word frequency
                        wordItem_frequencyTable[i][j] = frequency;

-                        // Process word data if it exists
                        if (length > 0)
                        {
-                            // Read word bytes and convert to character array
                            byte[] lchBuffer = reader.ReadBytes(length);
-                            string tmpword = gb2312Encoding.GetString(lchBuffer);
+                            string tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET: Use cached encoding instance from base class
                            wordItem_charArrayTable[i][j] = tmpword.ToCharArray();
                        }
                        else
                        {
-                            // No word data, set to null
                            wordItem_charArrayTable[i][j] = null;
                        }
                    }

-                    // Map the character to its index in the lookup tables
                    string str = GetCCByGB2312Id(i);
                    SetTableIndex(str[0], i);
                }
            }
-            return total; // Return total number of words loaded
+
+            return total;
        }
+
        /// <summary>
        /// The original lexicon puts all information with punctuation into a
        /// chart (from 1 to 3755). Here it then gets expanded, separately being
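
Both loaders walk ids from GB2312_FIRST_CHAR for CHAR_NUM_IN_FILE characters and turn each id back into a character via GetCCByGB2312Id, whose body is outside this hunk. As background only, the sketch below shows how a linear id over the standard 94x94 GB2312 code table maps to a two-byte GB2312 sequence; it illustrates the encoding itself, not the library's exact helper, and the names are hypothetical.

using System;
using System.Text;

internal static class Gb2312Sketch
{
    // GB2312 arranges characters in 94 rows ("qu") of 94 cells ("wei");
    // each byte is the 1-based position plus 0xA0, so values run 0xA1..0xFE.
    public static string CharFromLinearId(int id, Encoding gb2312)
    {
        int row = id / 94;
        int col = id % 94;
        byte[] bytes = { (byte)(0xA1 + row), (byte)(0xA1 + col) };
        return gb2312.GetString(bytes);
    }

    public static void Main()
    {
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
        Encoding gb2312 = Encoding.GetEncoding("GB2312");

        // Row 16 is where the level-1 hanzi start: id 15 * 94 decodes as 0xB0A1, i.e. "啊".
        Console.WriteLine(CharFromLinearId(15 * 94, gb2312));
    }
}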

src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs

Lines changed: 5 additions & 8 deletions
@@ -1,29 +1,28 @@
 using Lucene.Net.Util;
 using Lucene.Net.Analysis.Cn.Smart.Hhmm;
+using Lucene.Net.Attributes;
 using NUnit.Framework;
 using System;
 using System.IO;
 using System.Reflection;

+
 [TestFixture]
+[LuceneNetSpecific]
 public class DictionaryTests : LuceneTestCase
 {
-    private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramDict.dct";
+    private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct";

     [Test, Category("Dictionary")]
     public void TestBigramDictionary()
     {
-        // Extract embedded resource
        using var resourceStream = GetResourceStream(BigramResourceName);

-        // Copy to temp file
-        FileInfo _tempFile = CreateTempFile("bigramDict", ".dct");
+        FileInfo _tempFile = CreateTempFile("bigramdict", ".dct");
        CopyStreamToFile(resourceStream, _tempFile);

-        // Verify temp file
        Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty.");

-        // Test BigramDictionary functionality
        BigramDictionary bigramDict = BigramDictionary.GetInstance();
        bigramDict.LoadFromFile(_tempFile.FullName);

@@ -34,10 +33,8 @@ public void TestBigramDictionary()
     [Test, Category("Dictionary")]
     public void TestWordDictionaryGetInstance()
     {
-        // Test WordDictionary functionality using GetInstance()
        WordDictionary wordDict = WordDictionary.GetInstance();

-        // Verify that the WordDictionary instance is not null
        Assert.NotNull(wordDict, "WordDictionary.GetInstance() returned null.");

    }
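
The test relies on base-class helpers (GetResourceStream, CopyStreamToFile, CreateTempFile) whose implementations are not part of this diff. A generic, BCL-only version of the extract-to-temp-file step could look like the sketch below; the class and method names are hypothetical and not part of the test framework.

using System.IO;
using System.Reflection;

internal static class ResourceExtractionSketch
{
    // Copies an embedded resource (e.g. "...Resources.bigramdict.dct") to a temp file
    // so file-based APIs such as BigramDictionary.LoadFromFile can read it.
    public static FileInfo ExtractToTempFile(Assembly assembly, string resourceName)
    {
        using Stream source = assembly.GetManifestResourceStream(resourceName)
            ?? throw new FileNotFoundException($"Embedded resource not found: {resourceName}");

        string path = Path.Combine(Path.GetTempPath(), Path.GetRandomFileName() + ".dct");
        using (FileStream target = File.Create(path))
        {
            source.CopyTo(target);
        }
        return new FileInfo(path);
    }
}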

src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj

Lines changed: 2 additions & 2 deletions
@@ -63,8 +63,8 @@
     <PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" />
   </ItemGroup>
   <ItemGroup>
-    <EmbeddedResource Include="Resources\bigramDict.dct" />
-    <EmbeddedResource Include="Resources\coreDict.dct" />
+    <EmbeddedResource Include="Resources\bigramdict.dct" />
+    <EmbeddedResource Include="Resources\coredict.dct" />
   </ItemGroup>

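Manifest resource names are case-sensitive, so renaming the .dct files to lowercase in the project file has to be mirrored by the BigramResourceName constant in DictionaryTests.cs, as the diffs above do. If the final names are ever in doubt, they can be listed at runtime; a small, hypothetical check:

using System;
using System.Reflection;

// Prints every embedded resource name in the current assembly, e.g.
// "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct".
foreach (string name in Assembly.GetExecutingAssembly().GetManifestResourceNames())
{
    Console.WriteLine(name);
}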

0 commit comments
