Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Lucene.Net.Analysis.SmartCn/AnalyzerProfile.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ private static void Init()
return;
}


try
{
while (new DirectoryInfo(currentPath).Parent != null)
Expand Down
8 changes: 5 additions & 3 deletions src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
internal abstract class AbstractDictionary
{
// LUCENENET specific: cached GB2312 encoding to avoid repeated calls to Encoding.GetEncoding("GB2312")
protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312");
protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312",
EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);


/// <summary>
/// First Chinese Character in GB2312 (15 * 94)
Expand Down Expand Up @@ -162,7 +164,7 @@ public virtual long Hash1(char c)
/// </summary>
/// <param name="carray">character array</param>
/// <returns>hashcode</returns>
public virtual long Hash1(char[] carray)
public virtual long Hash1(ReadOnlySpan<char> carray)
{
long p = 1099511628211L;
long hash = unchecked((long)0xcbf29ce484222325L);
Expand Down Expand Up @@ -210,7 +212,7 @@ public virtual int Hash2(char c)
/// </summary>
/// <param name="carray">character array</param>
/// <returns>hashcode</returns>
public virtual int Hash2(char[] carray)
public virtual int Hash2(ReadOnlySpan<char> carray)
{
int hash = 5381;

Expand Down
62 changes: 38 additions & 24 deletions src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -254,30 +254,45 @@ private void Load(string dictRoot)
/// <summary>
/// Load the datafile into this <see cref="BigramDictionary"/>
/// </summary>
/// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param>
/// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param>
/// <exception cref="IOException">If there is a low-level I/O error</exception>
public virtual void LoadFromFile(string dctFilePath)
{
int i, cnt, length, total = 0;

// The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
// The 3756th is used (as a header) to store information.
int[]
buffer = new int[3];
byte[] intBuffer = new byte[4];

Span<int> buffer = stackalloc int[3];
string tmpword;

// LUCENENET: Removed intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way.
// LUCENENET specific - refactored constants for clarity

// The 3756th position (using 1-based counting) corresponds to index 3755 (using 0-based indexing)
// This matches the original Java implementation which used 3755 + GB2312_FIRST_CHAR in the condition
const int HEADER_POSITION = 3755;

//using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r"))
using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read);
using var reader = new BinaryReader(dctFile);

// GB2312 characters 0 - 6768
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
{

string currentStr = GetCCByGB2312Id(i);
// if (i == 5231)
// System.out.println(i);
try
{
cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
}
catch (EndOfStreamException ex)
{
throw new IOException($"Bigram dictionary file is incomplete at character index {i}.", ex);
}

dctFile.Read(intBuffer, 0, intBuffer.Length);
// the dictionary was developed for C, and byte order must be converted to work with Java
cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
if (cnt <= 0)
{
continue;
Expand All @@ -286,37 +301,37 @@ public virtual void LoadFromFile(string dctFilePath)
int j = 0;
while (j < cnt)
{
dctFile.Read(intBuffer, 0, intBuffer.Length);
buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
.GetInt32();// frequency
dctFile.Read(intBuffer, 0, intBuffer.Length);
buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
.GetInt32();// length
dctFile.Read(intBuffer, 0, intBuffer.Length);
// buffer[2] = ByteBuffer.wrap(intBuffer).order(
// ByteOrder.LITTLE_ENDIAN).getInt();// handle
// LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
buffer[0] = reader.ReadInt32(); // frequency
buffer[1] = reader.ReadInt32(); // length
reader.BaseStream.Seek(4, SeekOrigin.Current); // Skip handle value (unused)

length = buffer[1];
if (length > 0)
{
byte[] lchBuffer = new byte[length];
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
byte[] lchBuffer = reader.ReadBytes(length); // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET

//tmpword = new String(lchBuffer, "GB2312");
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
//tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
if (i != 3755 + GB2312_FIRST_CHAR)


if (i != HEADER_POSITION + GB2312_FIRST_CHAR)
{
tmpword = currentStr + tmpword;
}
char[] carray = tmpword.ToCharArray();

ReadOnlySpan<char> carray = tmpword.AsSpan();
long hashId = Hash1(carray);
int index = GetAvaliableIndex(hashId, carray);

if (index != -1)
{
if (bigramHashTable[index] == 0)
{
bigramHashTable[index] = hashId;
// bigramStringTable[index] = tmpword;

}
frequencyTable[index] += buffer[0];
}
Expand All @@ -326,8 +341,7 @@ public virtual void LoadFromFile(string dctFilePath)
}
// log.info("load dictionary done! " + dctFilePath + " total:" + total);
}

private int GetAvaliableIndex(long hashId, char[] carray)
private int GetAvaliableIndex(long hashId, ReadOnlySpan<char> carray)
{
int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH;
Expand Down Expand Up @@ -357,7 +371,7 @@ private int GetAvaliableIndex(long hashId, char[] carray)
/// <summary>
/// lookup the index into the frequency array.
/// </summary>
private int GetBigramItemIndex(char[] carray)
private int GetBigramItemIndex(ReadOnlySpan<char> carray)
{
long hashId = Hash1(carray);
int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
Expand Down Expand Up @@ -388,7 +402,7 @@ private int GetBigramItemIndex(char[] carray)
return -1;
}

public int GetFrequency(char[] carray)
public int GetFrequency(ReadOnlySpan<char> carray)
{
int index = GetBigramItemIndex(carray);
if (index != -1)
Expand Down
45 changes: 23 additions & 22 deletions src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -340,62 +340,62 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
/// <param name="dctFilePath">path to word dictionary (coredict.dct)</param>
/// <returns>number of words read</returns>
/// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param>
/// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
int i, cnt, length, total = 0;
// The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.

// The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760).
// The 3756th is used (as a header) to store information.
int[]
buffer = new int[3];
byte[] intBuffer = new byte[4];

Span<int> buffer = stackalloc int[3];
string tmpword;

// LUCENENET: Removed intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way.
// LUCENENET: Use BinaryReader to simplify endian conversion and stream reading.

using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
using (var reader = new BinaryReader(dctFile))
{

// GB2312 characters 0 - 6768
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
{
// if (i == 5231)
// System.out.println(i);

dctFile.Read(intBuffer, 0, intBuffer.Length);
// the dictionary was developed for C, and byte order must be converted to work with Java
cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET

if (cnt <= 0)
{
wordItem_charArrayTable[i] = null;
wordItem_frequencyTable[i] = null;
continue;
}

wordItem_charArrayTable[i] = new char[cnt][];
wordItem_frequencyTable[i] = new int[cnt];
total += cnt;
int j = 0;
while (j < cnt)
{
// wordItemTable[i][j] = new WordItem();
dctFile.Read(intBuffer, 0, intBuffer.Length);
buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
.GetInt32();// frequency
dctFile.Read(intBuffer, 0, intBuffer.Length);
buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
.GetInt32();// length
dctFile.Read(intBuffer, 0, intBuffer.Length);
buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
.GetInt32();// handle

// LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
buffer[0] = reader.ReadInt32(); // frequency
buffer[1] = reader.ReadInt32(); // length
reader.BaseStream.Seek(4, SeekOrigin.Current); // Skip handle value (unused)

// wordItemTable[i][j].frequency = buffer[0];

wordItem_frequencyTable[i][j] = buffer[0];

length = buffer[1];
if (length > 0)
{
byte[] lchBuffer = new byte[length];
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
byte[] lchBuffer = reader.ReadBytes(length);
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET: Use cached encoding instance from base class
wordItem_charArrayTable[i][j] = tmpword.ToCharArray();
}
else
Expand All @@ -411,6 +411,7 @@ private int LoadMainDataFromFile(string dctFilePath)
SetTableIndex(str[0], i);
}
}

return total;
}

Expand Down
2 changes: 1 addition & 1 deletion src/Lucene.Net.Analysis.SmartCn/Hhmm/package.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@ summary: *content
-->

SmartChineseAnalyzer Hidden Markov Model package.
@lucene.experimental
@lucene.experimental
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@
<NoWarn>$(NoWarn);1591;1573</NoWarn>
</PropertyGroup>



<ItemGroup>
<EmbeddedResource Include="Hhmm/*.mem" Label="Dict Test Data" />
<EmbeddedResource Include="**/*.txt" Exclude="bin/**/*;obj/**/*" Label="Text Test Data" />
Expand All @@ -64,4 +62,8 @@
<PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" />
</ItemGroup>

<ItemGroup>
<InternalsVisibleTo Include="Lucene.Net.Tests.Analysis.SmartCn" />
</ItemGroup>

</Project>
2 changes: 0 additions & 2 deletions src/Lucene.Net.Analysis.SmartCn/Properties/AssemblyInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,3 @@

// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("dba35edf-a0ff-4df7-ae4f-a103b01cd488")]


2 changes: 1 addition & 1 deletion src/Lucene.Net.Analysis.SmartCn/overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ summary: *content

Analyzer for Simplified Chinese, which indexes words.

For an introduction to Lucene's analysis API, see the [Lucene.Net.Analysis](../core/Lucene.Net.Analysis.html) namespace documentation.
For an introduction to Lucene's analysis API, see the [Lucene.Net.Analysis](../core/Lucene.Net.Analysis.html) namespace documentation.
7 changes: 3 additions & 4 deletions src/Lucene.Net.Analysis.SmartCn/package.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,11 @@ Analyzer for Simplified Chinese, which indexes words.

Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.

* StandardAnalyzer: Index unigrams (individual Chinese characters) as a token.
- StandardAnalyzer: Index unigrams (individual Chinese characters) as a token.

* CJKAnalyzer (in the <xref:Lucene.Net.Analysis.Cjk> namespace of <xref:Lucene.Net.Analysis.Common>): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.

* SmartChineseAnalyzer (in this package): Index words (attempt to segment Chinese text into words) as tokens.
- CJKAnalyzer (in the <xref:Lucene.Net.Analysis.Cjk> namespace of <xref:Lucene.Net.Analysis.Common>): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.

- SmartChineseAnalyzer (in this package): Index words (attempt to segment Chinese text into words) as tokens.

Example phrase: "我是中国人"

Expand Down
3 changes: 1 addition & 2 deletions src/Lucene.Net.Replicator/Http/package.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@ uid: Lucene.Net.Replicator.Http
summary: *content
---


<!--
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
Expand Down
2 changes: 1 addition & 1 deletion src/Lucene.Net.Replicator/overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ summary: *content
limitations under the License.
-->

Provides index files replication capabilities.
Provides index files replication capabilities.
Loading