@@ -254,82 +254,80 @@ private void Load(string dictRoot)
254254 /// <summary>
255255 /// Load the datafile into this <see cref="BigramDictionary"/>
256256 /// </summary>
257- /// <param name="dctFilePath">Path to the Bigramdictionary (bigramDict .dct)</param>
257+ /// <param name="dctFilePath">Path to the bigram dictionary file (bigramdict .dct)</param>
258258 /// <exception cref="IOException">If there is a low-level I/O error</exception>
259259 public virtual void LoadFromFile ( string dctFilePath )
260260 {
261- // Position of special header entry in the file structure
261+ // The file only contains 6763 Chinese characters plus 5 reserved slots (3756~3760).
262+ // The 3756th slot (the first reserved one) is used as a header to store information.
263+
264+ // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way.
265+ // LUCENENET specific - refactored constants for clarity
262266 const int HEADER_POSITION = 3755 ;
263- // Maximum valid length for word entries to prevent loading corrupted data
264267 const int MAX_VALID_LENGTH = 1000 ;
265268
266- // Open file for reading in binary mode
269+ //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r"))
267270 using var dctFile = new FileStream ( dctFilePath , FileMode . Open , FileAccess . Read ) ;
268271 using var reader = new BinaryReader ( dctFile ) ;
269272
270- try
273+ // GB2312 characters 0 - 6768
274+ for ( int i = GB2312_FIRST_CHAR ; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE ; i ++ )
271275 {
272- // Iterate through all GB2312 characters in the valid range
273- for ( int i = GB2312_FIRST_CHAR ; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE ; i ++ )
276+
277+ string currentStr = GetCCByGB2312Id ( i ) ;
278+ int cnt ;
279+ try
280+ {
281+ cnt = reader . ReadInt32 ( ) ; // LUCENENET: Use BinaryReader methods instead of ByteBuffer
282+ }
283+ catch ( EndOfStreamException )
274284 {
275- // Get the current Chinese character
276- string currentStr = GetCCByGB2312Id ( i ) ;
277- // Read the count of words starting with this character
278- int cnt = reader . ReadInt32 ( ) ;
285+ break ;
286+ }
287+
288+ if ( cnt <= 0 )
289+ {
290+ continue ;
291+ }
279292
280- // Skip if no words start with this character
281- if ( cnt <= 0 ) continue ;
293+ for ( int j = 0 ; j < cnt ; j ++ )
294+ {
295+ // LUCENENET: Use BinaryReader methods instead of ByteBuffer
296+ int frequency = reader . ReadInt32 ( ) ;
297+ int length = reader . ReadInt32 ( ) ;
298+ reader . ReadInt32 ( ) ; // Skip handle value (unused)
282299
283- // Process all words for the current character
284- for ( int j = 0 ; j < cnt ; j ++ )
300+ if ( length > 0 && length <= MAX_VALID_LENGTH && dctFile . Position + length <= dctFile . Length )
285301 {
286- // Read word metadata
287- int frequency = reader . ReadInt32 ( ) ; // How often this word appears
288- int length = reader . ReadInt32 ( ) ; // Length of the word in bytes
289- reader . ReadInt32 ( ) ; // Skip handle value (unused)
302+ byte [ ] lchBuffer = reader . ReadBytes ( length ) ; // LUCENENET: Use BinaryReader methods instead of ByteBuffer
290303
291- // Validate word length and ensure we don't read past the file end
292- if ( length > 0 && length <= MAX_VALID_LENGTH && dctFile . Position + length <= dctFile . Length )
293- {
294- // Read the word bytes and convert to string
295- byte [ ] lchBuffer = reader . ReadBytes ( length ) ;
296- string tmpword = gb2312Encoding . GetString ( lchBuffer ) ;
304+ //tmpword = new String(lchBuffer, "GB2312");
305+ string tmpword = gb2312Encoding . GetString ( lchBuffer ) ; // LUCENENET specific: use cached encoding instance from base class
306+ //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
297307
298- // For regular entries (not header entries), prepend the current character
299- if ( i != HEADER_POSITION + GB2312_FIRST_CHAR )
300- {
301- tmpword = currentStr + tmpword ;
302- }
303308
304- // Create a span for efficient string handling
305- ReadOnlySpan < char > carray = tmpword . AsSpan ( ) ;
306- // Generate hash for the word
307- long hashId = Hash1 ( carray ) ;
308- // Find available slot in hash table
309- int index = GetAvaliableIndex ( hashId , carray ) ;
309+ if ( i != HEADER_POSITION + GB2312_FIRST_CHAR )
310+ {
311+ tmpword = currentStr + tmpword ;
312+ }
313+
314+ ReadOnlySpan < char > carray = tmpword . AsSpan ( ) ;
315+ long hashId = Hash1 ( carray ) ;
316+ int index = GetAvaliableIndex ( hashId , carray ) ;
310317
311- // Store word if a valid index was found
312- if ( index != - 1 )
318+ if ( index != - 1 )
319+ {
320+ if ( bigramHashTable [ index ] == 0 )
313321 {
314- // Set hash ID if slot is empty
315- if ( bigramHashTable [ index ] == 0 )
316- {
317- bigramHashTable [ index ] = hashId ;
318- }
319- // Add word frequency to the table
320- frequencyTable [ index ] += frequency ;
322+ bigramHashTable [ index ] = hashId ;
323+ // bigramStringTable[index] = tmpword;
324+
321325 }
326+ frequencyTable [ index ] += frequency ;
322327 }
323328 }
324329 }
325330 }
326- // Handle expected end-of-file condition silently
327- catch ( EndOfStreamException ) { /* Reached end of file */ }
328- // Re-throw IO exceptions as required by contract
329- catch ( IOException ) { /* Re-throw as per method contract */ throw ; }
330-
331- // Note: Commented out logging statement
332- // log.info("load dictionary done! " + dctFilePath + " total:" + total);
333331 }
334332 private int GetAvaliableIndex ( long hashId , ReadOnlySpan < char > carray )
335333 {
0 commit comments