Skip to content

Commit cdcc306

Browse files
committed
feat: Optimize SmartCn dictionaries and add loading tests
- Replaced ByteBuffer with BinaryReader for efficiency. - Used ReadOnlySpan<char> in BigramDictionary. - Added tests for dictionary loading from embedded resources. - Embedded bigramDict.dct and coreDict.dct.
1 parent a0578d6 commit cdcc306

File tree

8 files changed

+145
-102
lines changed

8 files changed

+145
-102
lines changed

src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ public virtual long Hash1(char c)
162162
/// </summary>
163163
/// <param name="carray">character array</param>
164164
/// <returns>hashcode</returns>
165-
public virtual long Hash1(char[] carray)
165+
public virtual long Hash1(ReadOnlySpan<char> carray)
166166
{
167167
long p = 1099511628211L;
168168
long hash = unchecked((long)0xcbf29ce484222325L);
@@ -210,7 +210,7 @@ public virtual int Hash2(char c)
210210
/// </summary>
211211
/// <param name="carray">character array</param>
212212
/// <returns>hashcode</returns>
213-
public virtual int Hash2(char[] carray)
213+
public virtual int Hash2(ReadOnlySpan<char> carray)
214214
{
215215
int hash = 5381;
216216

src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs

Lines changed: 41 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -254,80 +254,61 @@ private void Load(string dictRoot)
254254
/// <summary>
255255
/// Load the datafile into this <see cref="BigramDictionary"/>
256256
/// </summary>
257-
/// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param>
257+
/// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param>
258258
/// <exception cref="IOException">If there is a low-level I/O error</exception>
259259
public virtual void LoadFromFile(string dctFilePath)
260260
{
261-
int i, cnt, length, total = 0;
262-
// The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
263-
// The 3756th is used (as a header) to store information.
264-
int[]
265-
buffer = new int[3];
266-
byte[] intBuffer = new byte[4];
267-
string tmpword;
268-
//using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r"))
261+
const int HEADER_POSITION = 3755;
262+
const int MAX_VALID_LENGTH = 1000;
263+
269264
using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read);
265+
using var reader = new BinaryReader(dctFile);
270266

271-
// GB2312 characters 0 - 6768
272-
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
267+
try
273268
{
274-
string currentStr = GetCCByGB2312Id(i);
275-
// if (i == 5231)
276-
// System.out.println(i);
277-
278-
dctFile.Read(intBuffer, 0, intBuffer.Length);
279-
// the dictionary was developed for C, and byte order must be converted to work with Java
280-
cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
281-
if (cnt <= 0)
282-
{
283-
continue;
284-
}
285-
total += cnt;
286-
int j = 0;
287-
while (j < cnt)
269+
for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
288270
{
289-
dctFile.Read(intBuffer, 0, intBuffer.Length);
290-
buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
291-
.GetInt32();// frequency
292-
dctFile.Read(intBuffer, 0, intBuffer.Length);
293-
buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
294-
.GetInt32();// length
295-
dctFile.Read(intBuffer, 0, intBuffer.Length);
296-
// buffer[2] = ByteBuffer.wrap(intBuffer).order(
297-
// ByteOrder.LITTLE_ENDIAN).getInt();// handle
298-
299-
length = buffer[1];
300-
if (length > 0)
271+
string currentStr = GetCCByGB2312Id(i);
272+
int cnt = reader.ReadInt32();
273+
274+
if (cnt <= 0) continue;
275+
276+
for (int j = 0; j < cnt; j++)
301277
{
302-
byte[] lchBuffer = new byte[length];
303-
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
304-
//tmpword = new String(lchBuffer, "GB2312");
305-
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
306-
//tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
307-
if (i != 3755 + GB2312_FIRST_CHAR)
308-
{
309-
tmpword = currentStr + tmpword;
310-
}
311-
char[] carray = tmpword.ToCharArray();
312-
long hashId = Hash1(carray);
313-
int index = GetAvaliableIndex(hashId, carray);
314-
if (index != -1)
278+
int frequency = reader.ReadInt32();
279+
int length = reader.ReadInt32();
280+
reader.ReadInt32(); // Skip handle
281+
282+
if (length > 0 && length <= MAX_VALID_LENGTH && dctFile.Position + length <= dctFile.Length)
315283
{
316-
if (bigramHashTable[index] == 0)
284+
byte[] lchBuffer = reader.ReadBytes(length);
285+
string tmpword = gb2312Encoding.GetString(lchBuffer);
286+
287+
if (i != HEADER_POSITION + GB2312_FIRST_CHAR)
317288
{
318-
bigramHashTable[index] = hashId;
319-
// bigramStringTable[index] = tmpword;
289+
tmpword = currentStr + tmpword;
290+
}
291+
292+
ReadOnlySpan<char> carray = tmpword.AsSpan();
293+
long hashId = Hash1(carray);
294+
int index = GetAvaliableIndex(hashId, carray);
295+
296+
if (index != -1)
297+
{
298+
if (bigramHashTable[index] == 0)
299+
{
300+
bigramHashTable[index] = hashId;
301+
}
302+
frequencyTable[index] += frequency;
320303
}
321-
frequencyTable[index] += buffer[0];
322304
}
323305
}
324-
j++;
325306
}
326307
}
327-
// log.info("load dictionary done! " + dctFilePath + " total:" + total);
308+
catch (EndOfStreamException) { /* Reached end of file */ }
309+
catch (IOException) { /* Re-throw as per method contract */ throw; }
328310
}
329-
330-
private int GetAvaliableIndex(long hashId, char[] carray)
311+
private int GetAvaliableIndex(long hashId, ReadOnlySpan<char> carray)
331312
{
332313
int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
333314
int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH;
@@ -357,7 +338,7 @@ private int GetAvaliableIndex(long hashId, char[] carray)
357338
/// <summary>
358339
/// lookup the index into the frequency array.
359340
/// </summary>
360-
private int GetBigramItemIndex(char[] carray)
341+
private int GetBigramItemIndex(ReadOnlySpan<char> carray)
361342
{
362343
long hashId = Hash1(carray);
363344
int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
@@ -388,7 +369,7 @@ private int GetBigramItemIndex(char[] carray)
388369
return -1;
389370
}
390371

391-
public int GetFrequency(char[] carray)
372+
public int GetFrequency(ReadOnlySpan<char> carray)
392373
{
393374
int index = GetBigramItemIndex(carray);
394375
if (index != -1)

src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs

Lines changed: 17 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -343,68 +343,46 @@ private void SaveToObj(FileInfo serialObj)
343343
/// <param name="dctFilePath">path to word dictionary (coredict.dct)</param>
344344
/// <returns>number of words read</returns>
345345
/// <exception cref="IOException">If there is a low-level I/O error.</exception>
346-
private int LoadMainDataFromFile(string dctFilePath)
346+
internal int LoadMainDataFromFile(string dctFilePath)
347347
{
348-
int i, cnt, length, total = 0;
349-
// The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
350-
// The 3756th is used (as a header) to store information.
351-
int[]
352-
buffer = new int[3];
353-
byte[] intBuffer = new byte[4];
354-
string tmpword;
348+
int total = 0;
349+
355350
using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
351+
using (var reader = new BinaryReader(dctFile))
356352
{
357-
358-
// GB2312 characters 0 - 6768
359-
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
353+
for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
360354
{
361-
// if (i == 5231)
362-
// System.out.println(i);
355+
int cnt = reader.ReadInt32();
363356

364-
dctFile.Read(intBuffer, 0, intBuffer.Length);
365-
// the dictionary was developed for C, and byte order must be converted to work with Java
366-
cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
367357
if (cnt <= 0)
368358
{
369359
wordItem_charArrayTable[i] = null;
370360
wordItem_frequencyTable[i] = null;
371361
continue;
372362
}
363+
373364
wordItem_charArrayTable[i] = new char[cnt][];
374365
wordItem_frequencyTable[i] = new int[cnt];
375366
total += cnt;
376-
int j = 0;
377-
while (j < cnt)
367+
368+
for (int j = 0; j < cnt; j++)
378369
{
379-
// wordItemTable[i][j] = new WordItem();
380-
dctFile.Read(intBuffer, 0, intBuffer.Length);
381-
buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
382-
.GetInt32();// frequency
383-
dctFile.Read(intBuffer, 0, intBuffer.Length);
384-
buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
385-
.GetInt32();// length
386-
dctFile.Read(intBuffer, 0, intBuffer.Length);
387-
buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
388-
.GetInt32();// handle
389-
390-
// wordItemTable[i][j].frequency = buffer[0];
391-
wordItem_frequencyTable[i][j] = buffer[0];
392-
393-
length = buffer[1];
370+
int frequency = reader.ReadInt32();
371+
int length = reader.ReadInt32();
372+
reader.ReadInt32(); // Skip handle
373+
374+
wordItem_frequencyTable[i][j] = frequency;
375+
394376
if (length > 0)
395377
{
396-
byte[] lchBuffer = new byte[length];
397-
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
398-
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
378+
byte[] lchBuffer = reader.ReadBytes(length);
379+
string tmpword = gb2312Encoding.GetString(lchBuffer);
399380
wordItem_charArrayTable[i][j] = tmpword.ToCharArray();
400381
}
401382
else
402383
{
403-
// wordItemTable[i][j].charArray = null;
404384
wordItem_charArrayTable[i][j] = null;
405385
}
406-
// System.out.println(indexTable[i].wordItems[j]);
407-
j++;
408386
}
409387

410388
string str = GetCCByGB2312Id(i);
@@ -413,7 +391,6 @@ private int LoadMainDataFromFile(string dctFilePath)
413391
}
414392
return total;
415393
}
416-
417394
/// <summary>
418395
/// The original lexicon puts all information with punctuation into a
419396
/// chart (from 1 to 3755). Here it then gets expanded, separately being

src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,4 +64,9 @@
6464
<PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" />
6565
</ItemGroup>
6666

67+
<ItemGroup>
68+
<InternalsVisibleTo Include="Lucene.Net.Tests.Analysis.SmartCn" />
69+
</ItemGroup>
70+
71+
6772
</Project>
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
using Lucene.Net.Util;
2+
using Lucene.Net.Analysis.Cn.Smart.Hhmm;
3+
using NUnit.Framework;
4+
using System;
5+
using System.IO;
6+
using System.Reflection;
7+
8+
[TestFixture]
9+
public class DictionaryTests : LuceneTestCase
10+
{
11+
private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramDict.dct";
12+
13+
[Test, Category("Dictionary")]
14+
public void TestBigramDictionary()
15+
{
16+
// Extract embedded resource
17+
using var resourceStream = GetResourceStream(BigramResourceName);
18+
19+
// Copy to temp file
20+
FileInfo _tempFile = CreateTempFile("bigramDict", ".dct");
21+
CopyStreamToFile(resourceStream, _tempFile);
22+
23+
// Verify temp file
24+
Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty.");
25+
26+
// Test BigramDictionary functionality
27+
BigramDictionary bigramDict = BigramDictionary.GetInstance();
28+
bigramDict.LoadFromFile(_tempFile.FullName);
29+
30+
Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()), "Frequency for '啊hello' is incorrect.");
31+
Assert.AreEqual(20, bigramDict.GetFrequency("阿world".AsSpan()), "Frequency for '阿world' is incorrect.");
32+
}
33+
34+
[Test, Category("Dictionary")]
35+
public void TestWordDictionaryGetInstance()
36+
{
37+
// Test WordDictionary functionality using GetInstance()
38+
WordDictionary wordDict = WordDictionary.GetInstance();
39+
40+
// Verify that the WordDictionary instance is not null
41+
Assert.NotNull(wordDict, "WordDictionary.GetInstance() returned null.");
42+
43+
}
44+
45+
private Stream GetResourceStream(string resourceName)
46+
{
47+
var stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName);
48+
Assert.NotNull(stream, $"Resource '{resourceName}' not found!");
49+
Assert.IsTrue(stream.Length > 0, "Resource Stream is empty");
50+
return stream;
51+
}
52+
53+
private void CopyStreamToFile(Stream stream, FileInfo file)
54+
{
55+
try
56+
{
57+
stream.Position = 0;
58+
using var outputStream = File.Create(file.FullName);
59+
stream.CopyTo(outputStream);
60+
}
61+
catch (Exception ex)
62+
{
63+
Assert.Fail($"Failed to copy stream to file: {ex.Message}");
64+
}
65+
}
66+
67+
private new FileInfo CreateTempFile(string prefix, string extension)
68+
{
69+
string tempFileName = Path.Combine(
70+
Path.GetTempPath(),
71+
$"{prefix}_{Guid.NewGuid():N}{extension}"
72+
);
73+
return new FileInfo(tempFileName);
74+
}
75+
}

src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,5 +62,10 @@
6262
<ItemGroup Condition=" '$(TargetFramework)' == 'net472' ">
6363
<PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" />
6464
</ItemGroup>
65+
<ItemGroup>
66+
<EmbeddedResource Include="Resources\bigramDict.dct" />
67+
<EmbeddedResource Include="Resources\coreDict.dct" />
68+
</ItemGroup>
69+
6570

6671
</Project>
42 Bytes
Binary file not shown.
26.5 KB
Binary file not shown.

0 commit comments

Comments
 (0)