
Commit 12223a4

feat: Optimize SmartCn dictionaries and add loading tests
- Replaced ByteBuffer with BinaryReader for efficiency.
- Used ReadOnlySpan<char> in BigramDictionary.
- Added tests for dictionary loading from embedded resources.
- Embedded bigramDict.dct and coreDict.dct.
1 parent a0578d6 commit 12223a4

8 files changed (+181, -102 lines)


src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs

Lines changed: 2 additions & 2 deletions
@@ -162,7 +162,7 @@ public virtual long Hash1(char c)
        /// </summary>
        /// <param name="carray">character array</param>
        /// <returns>hashcode</returns>
-        public virtual long Hash1(char[] carray)
+        public virtual long Hash1(ReadOnlySpan<char> carray)
        {
            long p = 1099511628211L;
            long hash = unchecked((long)0xcbf29ce484222325L);
@@ -210,7 +210,7 @@ public virtual int Hash2(char c)
        /// </summary>
        /// <param name="carray">character array</param>
        /// <returns>hashcode</returns>
-        public virtual int Hash2(char[] carray)
+        public virtual int Hash2(ReadOnlySpan<char> carray)
        {
            int hash = 5381;
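As a side note on the signature change above: char[] converts implicitly to ReadOnlySpan<char> and strings expose AsSpan(), so existing array-based callers keep compiling while string-based callers avoid a ToCharArray() copy. A minimal standalone sketch (the CountNonAscii helper is hypothetical, standing in for Hash1/Hash2):

using System;

class SpanCallerSketch
{
    // Hypothetical helper with the same parameter shape as the new Hash1/Hash2 overloads.
    static int CountNonAscii(ReadOnlySpan<char> carray)
    {
        int n = 0;
        foreach (char c in carray)
            if (c > 0x7F) n++;
        return n;
    }

    static void Main()
    {
        char[] asArray = "啊hello".ToCharArray();
        Console.WriteLine(CountNonAscii(asArray));            // char[] converts implicitly to ReadOnlySpan<char>
        Console.WriteLine(CountNonAscii("啊hello".AsSpan())); // string callers avoid an extra array allocation
    }
}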

src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs

Lines changed: 63 additions & 59 deletions
@@ -254,80 +254,84 @@ private void Load(string dictRoot)
        /// <summary>
        /// Load the datafile into this <see cref="BigramDictionary"/>
        /// </summary>
-        /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param>
+        /// <param name="dctFilePath">Path to the Bigramdictionary (bigramDict.dct)</param>
        /// <exception cref="IOException">If there is a low-level I/O error</exception>
        public virtual void LoadFromFile(string dctFilePath)
        {
-            int i, cnt, length, total = 0;
-            // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
-            // The 3756th is used (as a header) to store information.
-            int[]
-            buffer = new int[3];
-            byte[] intBuffer = new byte[4];
-            string tmpword;
-            //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r"))
+            // Position of special header entry in the file structure
+            const int HEADER_POSITION = 3755;
+            // Maximum valid length for word entries to prevent loading corrupted data
+            const int MAX_VALID_LENGTH = 1000;
+
+            // Open file for reading in binary mode
            using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read);
+            using var reader = new BinaryReader(dctFile);

-            // GB2312 characters 0 - 6768
-            for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
+            try
            {
-                string currentStr = GetCCByGB2312Id(i);
-                // if (i == 5231)
-                //     System.out.println(i);
-
-                dctFile.Read(intBuffer, 0, intBuffer.Length);
-                // the dictionary was developed for C, and byte order must be converted to work with Java
-                cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
-                if (cnt <= 0)
-                {
-                    continue;
-                }
-                total += cnt;
-                int j = 0;
-                while (j < cnt)
+                // Iterate through all GB2312 characters in the valid range
+                for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
                {
-                    dctFile.Read(intBuffer, 0, intBuffer.Length);
-                    buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
-                        .GetInt32();// frequency
-                    dctFile.Read(intBuffer, 0, intBuffer.Length);
-                    buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
-                        .GetInt32();// length
-                    dctFile.Read(intBuffer, 0, intBuffer.Length);
-                    // buffer[2] = ByteBuffer.wrap(intBuffer).order(
-                    // ByteOrder.LITTLE_ENDIAN).getInt();// handle
-
-                    length = buffer[1];
-                    if (length > 0)
+                    // Get the current Chinese character
+                    string currentStr = GetCCByGB2312Id(i);
+                    // Read the count of words starting with this character
+                    int cnt = reader.ReadInt32();
+
+                    // Skip if no words start with this character
+                    if (cnt <= 0) continue;
+
+                    // Process all words for the current character
+                    for (int j = 0; j < cnt; j++)
                    {
-                        byte[] lchBuffer = new byte[length];
-                        dctFile.Read(lchBuffer, 0, lchBuffer.Length);
-                        //tmpword = new String(lchBuffer, "GB2312");
-                        tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
-                        //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
-                        if (i != 3755 + GB2312_FIRST_CHAR)
-                        {
-                            tmpword = currentStr + tmpword;
-                        }
-                        char[] carray = tmpword.ToCharArray();
-                        long hashId = Hash1(carray);
-                        int index = GetAvaliableIndex(hashId, carray);
-                        if (index != -1)
+                        // Read word metadata
+                        int frequency = reader.ReadInt32(); // How often this word appears
+                        int length = reader.ReadInt32(); // Length of the word in bytes
+                        reader.ReadInt32(); // Skip handle value (unused)
+
+                        // Validate word length and ensure we don't read past the file end
+                        if (length > 0 && length <= MAX_VALID_LENGTH && dctFile.Position + length <= dctFile.Length)
                        {
-                            if (bigramHashTable[index] == 0)
+                            // Read the word bytes and convert to string
+                            byte[] lchBuffer = reader.ReadBytes(length);
+                            string tmpword = gb2312Encoding.GetString(lchBuffer);
+
+                            // For regular entries (not header entries), prepend the current character
+                            if (i != HEADER_POSITION + GB2312_FIRST_CHAR)
+                            {
+                                tmpword = currentStr + tmpword;
+                            }
+
+                            // Create a span for efficient string handling
+                            ReadOnlySpan<char> carray = tmpword.AsSpan();
+                            // Generate hash for the word
+                            long hashId = Hash1(carray);
+                            // Find available slot in hash table
+                            int index = GetAvaliableIndex(hashId, carray);
+
+                            // Store word if a valid index was found
+                            if (index != -1)
                            {
-                                bigramHashTable[index] = hashId;
-                                // bigramStringTable[index] = tmpword;
+                                // Set hash ID if slot is empty
+                                if (bigramHashTable[index] == 0)
+                                {
+                                    bigramHashTable[index] = hashId;
+                                }
+                                // Add word frequency to the table
+                                frequencyTable[index] += frequency;
                            }
-                            frequencyTable[index] += buffer[0];
                        }
                    }
-                    j++;
                }
            }
+            // Handle expected end-of-file condition silently
+            catch (EndOfStreamException) { /* Reached end of file */ }
+            // Re-throw IO exceptions as required by contract
+            catch (IOException) { /* Re-throw as per method contract */ throw; }
+
+            // Note: Commented out logging statement
            // log.info("load dictionary done! " + dctFilePath + " total:" + total);
        }
-
-        private int GetAvaliableIndex(long hashId, char[] carray)
+        private int GetAvaliableIndex(long hashId, ReadOnlySpan<char> carray)
        {
            int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
            int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH;
@@ -357,7 +361,7 @@ private int GetAvaliableIndex(long hashId, char[] carray)
        /// <summary>
        /// lookup the index into the frequency array.
        /// </summary>
-        private int GetBigramItemIndex(char[] carray)
+        private int GetBigramItemIndex(ReadOnlySpan<char> carray)
        {
            long hashId = Hash1(carray);
            int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
@@ -388,7 +392,7 @@ private int GetBigramItemIndex(char[] carray)
            return -1;
        }

-        public int GetFrequency(char[] carray)
+        public int GetFrequency(ReadOnlySpan<char> carray)
        {
            int index = GetBigramItemIndex(carray);
            if (index != -1)
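Regarding the ByteBuffer-to-BinaryReader switch in this file (and in WordDictionary.cs below): BinaryReader.ReadInt32 reads a little-endian 32-bit value directly from the stream, which matches the byte order the .dct files were written in, so the manual intBuffer/ByteBuffer conversion is no longer needed. A small self-contained sketch of that behavior:

using System;
using System.IO;

class LittleEndianReadSketch
{
    static void Main()
    {
        // 10 encoded as a little-endian 32-bit integer, the same layout the old
        // ByteBuffer.Wrap(...).SetOrder(ByteOrder.LittleEndian).GetInt32() call decoded.
        byte[] data = { 0x0A, 0x00, 0x00, 0x00 };
        using var reader = new BinaryReader(new MemoryStream(data));
        Console.WriteLine(reader.ReadInt32()); // prints 10
    }
}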

src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs

Lines changed: 31 additions & 41 deletions
@@ -340,80 +340,70 @@ private void SaveToObj(FileInfo serialObj)
        /// <summary>
        /// Load the datafile into this <see cref="WordDictionary"/>
        /// </summary>
-        /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param>
+        /// <param name="dctFilePath">path to word dictionary (coreDict.dct)</param>
        /// <returns>number of words read</returns>
        /// <exception cref="IOException">If there is a low-level I/O error.</exception>
        private int LoadMainDataFromFile(string dctFilePath)
        {
-            int i, cnt, length, total = 0;
-            // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
-            // The 3756th is used (as a header) to store information.
-            int[]
-            buffer = new int[3];
-            byte[] intBuffer = new byte[4];
-            string tmpword;
+            // Counter for total number of words loaded
+            int total = 0;
+
+            // Open the dictionary file for binary reading
            using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
+            using (var reader = new BinaryReader(dctFile))
            {
-
-                // GB2312 characters 0 - 6768
-                for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
+                // Process each Chinese character in the GB2312 encoding range
+                for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
                {
-                    // if (i == 5231)
-                    //     System.out.println(i);
+                    // Read number of words starting with this character
+                    int cnt = reader.ReadInt32();

-                    dctFile.Read(intBuffer, 0, intBuffer.Length);
-                    // the dictionary was developed for C, and byte order must be converted to work with Java
-                    cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
+                    // If no words start with this character, set arrays to null and skip
                    if (cnt <= 0)
                    {
                        wordItem_charArrayTable[i] = null;
                        wordItem_frequencyTable[i] = null;
                        continue;
                    }
+
+                    // Initialize arrays to store words and their frequencies
                    wordItem_charArrayTable[i] = new char[cnt][];
                    wordItem_frequencyTable[i] = new int[cnt];
                    total += cnt;
-                    int j = 0;
-                    while (j < cnt)
+
+                    // Process each word for the current character
+                    for (int j = 0; j < cnt; j++)
                    {
-                        // wordItemTable[i][j] = new WordItem();
-                        dctFile.Read(intBuffer, 0, intBuffer.Length);
-                        buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
-                            .GetInt32();// frequency
-                        dctFile.Read(intBuffer, 0, intBuffer.Length);
-                        buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
-                            .GetInt32();// length
-                        dctFile.Read(intBuffer, 0, intBuffer.Length);
-                        buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
-                            .GetInt32();// handle
-
-                        // wordItemTable[i][j].frequency = buffer[0];
-                        wordItem_frequencyTable[i][j] = buffer[0];
-
-                        length = buffer[1];
+                        // Read word metadata
+                        int frequency = reader.ReadInt32(); // Word usage frequency
+                        int length = reader.ReadInt32(); // Length of word in bytes
+                        reader.ReadInt32(); // Skip handle (unused)
+
+                        // Store word frequency
+                        wordItem_frequencyTable[i][j] = frequency;
+
+                        // Process word data if it exists
                        if (length > 0)
                        {
-                            byte[] lchBuffer = new byte[length];
-                            dctFile.Read(lchBuffer, 0, lchBuffer.Length);
-                            tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
+                            // Read word bytes and convert to character array
+                            byte[] lchBuffer = reader.ReadBytes(length);
+                            string tmpword = gb2312Encoding.GetString(lchBuffer);
                            wordItem_charArrayTable[i][j] = tmpword.ToCharArray();
                        }
                        else
                        {
-                            // wordItemTable[i][j].charArray = null;
+                            // No word data, set to null
                            wordItem_charArrayTable[i][j] = null;
                        }
-                        // System.out.println(indexTable[i].wordItems[j]);
-                        j++;
                    }

+                    // Map the character to its index in the lookup tables
                    string str = GetCCByGB2312Id(i);
                    SetTableIndex(str[0], i);
                }
            }
-            return total;
+            return total; // Return total number of words loaded
        }
-
        /// <summary>
        /// The original lexicon puts all information with punctuation into a
        /// chart (from 1 to 3755). Here it then gets expanded, separately being

src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj

Lines changed: 5 additions & 0 deletions
@@ -64,4 +64,9 @@
    <PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" />
  </ItemGroup>

+  <ItemGroup>
+    <InternalsVisibleTo Include="Lucene.Net.Tests.Analysis.SmartCn" />
+  </ItemGroup>
+
+
 </Project>
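The InternalsVisibleTo item above is presumably what lets the new test fixture call the internal Hhmm dictionary types directly; at build time the SDK is expected to turn the item into the usual assembly-level attribute, roughly:

// Approximate generated output (assumption: the repo's SDK/targets emit this into an AssemblyInfo file).
using System.Runtime.CompilerServices;

[assembly: InternalsVisibleTo("Lucene.Net.Tests.Analysis.SmartCn")]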
Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+using Lucene.Net.Util;
+using Lucene.Net.Analysis.Cn.Smart.Hhmm;
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Reflection;
+
+[TestFixture]
+public class DictionaryTests : LuceneTestCase
+{
+    private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramDict.dct";
+
+    [Test, Category("Dictionary")]
+    public void TestBigramDictionary()
+    {
+        // Extract embedded resource
+        using var resourceStream = GetResourceStream(BigramResourceName);
+
+        // Copy to temp file
+        FileInfo _tempFile = CreateTempFile("bigramDict", ".dct");
+        CopyStreamToFile(resourceStream, _tempFile);
+
+        // Verify temp file
+        Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty.");
+
+        // Test BigramDictionary functionality
+        BigramDictionary bigramDict = BigramDictionary.GetInstance();
+        bigramDict.LoadFromFile(_tempFile.FullName);
+
+        Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()), "Frequency for '啊hello' is incorrect.");
+        Assert.AreEqual(20, bigramDict.GetFrequency("阿world".AsSpan()), "Frequency for '阿world' is incorrect.");
+    }
+
+    [Test, Category("Dictionary")]
+    public void TestWordDictionaryGetInstance()
+    {
+        // Test WordDictionary functionality using GetInstance()
+        WordDictionary wordDict = WordDictionary.GetInstance();
+
+        // Verify that the WordDictionary instance is not null
+        Assert.NotNull(wordDict, "WordDictionary.GetInstance() returned null.");
+
+    }
+
+    private Stream GetResourceStream(string resourceName)
+    {
+        var stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName);
+        Assert.NotNull(stream, $"Resource '{resourceName}' not found!");
+        Assert.IsTrue(stream.Length > 0, "Resource Stream is empty");
+        return stream;
+    }
+
+    private void CopyStreamToFile(Stream stream, FileInfo file)
+    {
+        try
+        {
+            stream.Position = 0;
+            using var outputStream = File.Create(file.FullName);
+            stream.CopyTo(outputStream);
+        }
+        catch (Exception ex)
+        {
+            Assert.Fail($"Failed to copy stream to file: {ex.Message}");
+        }
+    }
+
+    private new FileInfo CreateTempFile(string prefix, string extension)
+    {
+        string tempFileName = Path.Combine(
+            Path.GetTempPath(),
+            $"{prefix}_{Guid.NewGuid():N}{extension}"
+        );
+        return new FileInfo(tempFileName);
+    }
+}
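With the Category attribute on these tests, they can presumably be run in isolation through the standard VSTest category filter, for example:

dotnet test --filter "Category=Dictionary" src/Lucene.Net.Tests.Analysis.SmartCn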

src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj

Lines changed: 5 additions & 0 deletions
@@ -62,5 +62,10 @@
  <ItemGroup Condition=" '$(TargetFramework)' == 'net472' ">
    <PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" />
  </ItemGroup>
+  <ItemGroup>
+    <EmbeddedResource Include="Resources\bigramDict.dct" />
+    <EmbeddedResource Include="Resources\coreDict.dct" />
+  </ItemGroup>
+

 </Project>
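The embedded resource names these items produce follow the usual <RootNamespace>.<Folder>.<FileName> convention, which is why the test above looks up "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramDict.dct" (assuming the project's root namespace matches the assembly name). A quick way to check the generated names from inside the test assembly:

using System;
using System.Reflection;

class ListEmbeddedResources
{
    static void Main()
    {
        // Prints every manifest resource name in the executing assembly,
        // e.g. "...Resources.bigramDict.dct" and "...Resources.coreDict.dct".
        foreach (string name in Assembly.GetExecutingAssembly().GetManifestResourceNames())
            Console.WriteLine(name);
    }
}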
Binary files (not shown): the two embedded dictionary resources, Resources\bigramDict.dct and Resources\coreDict.dct (42 Bytes and 26.5 KB).
