@@ -254,82 +254,81 @@ private void Load(string dictRoot)
254254 /// <summary>
255255 /// Load the datafile into this <see cref="BigramDictionary"/>
256256 /// </summary>
257- /// <param name="dctFilePath">Path to the Bigramdictionary (bigramDict .dct)</param>
257+ /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict .dct)</param>
258258 /// <exception cref="IOException">If there is a low-level I/O error</exception>
259259 public virtual void LoadFromFile ( string dctFilePath )
260260 {
// NOTE(review): this text is a rendered diff, not plain source. A line number followed by '-'
// marks the pre-change version, '+' marks the post-change version, and a doubled number
// (e.g. "260260") marks context shared by both versions.
//
// Both versions walk ids from GB2312_FIRST_CHAR up to, but excluding,
// GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE. For each id they read an Int32 record count, then for
// each record three Int32 values (frequency, length, and a handle that is discarded) followed by
// `length` bytes decoded with gb2312Encoding. The decoded text is hashed with Hash1, placed with
// GetAvaliableIndex, and its frequency is added to frequencyTable at the returned index.
261- // Position of special header entry in the file structure
261+ // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
262+ // The 3756th is used (as a header) to store information.
263+
264+ // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way.
265+ // LUCENENET specific - refactored constants for clarity
262266 const int HEADER_POSITION = 3755 ;
263- // Maximum valid length for word entries to prevent loading corrupted data
264267 const int MAX_VALID_LENGTH = 1000 ;
265268
266- // Open file for reading in binary mode
269+ //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r"))
267270 using var dctFile = new FileStream ( dctFilePath , FileMode . Open , FileAccess . Read ) ;
268271 using var reader = new BinaryReader ( dctFile ) ;
269272
270- try
273+ // GB2312 characters 0 - 6768
274+ for ( int i = GB2312_FIRST_CHAR ; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE ; i ++ )
271275 {
272- // Iterate through all GB2312 characters in the valid range
273- for ( int i = GB2312_FIRST_CHAR ; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE ; i ++ )
276+
277+ string currentStr = GetCCByGB2312Id ( i ) ;
278+ int cnt ;
// Post-change version: only the count read is guarded, and reaching end-of-stream here ends
// the outer loop.
// NOTE(review): the per-record ReadInt32 calls further down sit outside this try, so an
// EndOfStreamException raised mid-record would propagate to the caller, whereas the pre-change
// version caught it around the whole loop. Confirm this narrowing is intended.
279+ try
280+ {
281+ cnt = reader . ReadInt32 ( ) ; // LUCENENET: Use BinaryReader methods instead of ByteBuffer
282+ }
283+ catch ( EndOfStreamException )
274284 {
275- // Get the current Chinese character
276- string currentStr = GetCCByGB2312Id ( i ) ;
277- // Read the count of words starting with this character
278- int cnt = reader . ReadInt32 ( ) ;
285+ // Reached end of file
286+ break ;
287+ }
288+
289+ if ( cnt <= 0 )
290+ {
291+ continue ;
292+ }
279293
280- // Skip if no words start with this character
281- if ( cnt <= 0 ) continue ;
294+ for ( int j = 0 ; j < cnt ; j ++ )
295+ {
296+ // LUCENENET: Use BinaryReader methods instead of ByteBuffer
297+ int frequency = reader . ReadInt32 ( ) ;
298+ int length = reader . ReadInt32 ( ) ;
299+ reader . ReadInt32 ( ) ; // Skip handle value (unused)
282300
283- // Process all words for the current character
284- for ( int j = 0 ; j < cnt ; j ++ )
// The word bytes are consumed only when length is positive, at most MAX_VALID_LENGTH, and fits
// in what remains of the file. Otherwise nothing more is read for this record and the loop
// continues with the next record from the current stream position.
301+ if ( length > 0 && length <= MAX_VALID_LENGTH && dctFile . Position + length <= dctFile . Length )
285302 {
286- // Read word metadata
287- int frequency = reader . ReadInt32 ( ) ; // How often this word appears
288- int length = reader . ReadInt32 ( ) ; // Length of the word in bytes
289- reader . ReadInt32 ( ) ; // Skip handle value (unused)
303+ byte [ ] lchBuffer = reader . ReadBytes ( length ) ; // LUCENENET: Use BinaryReader methods instead of ByteBuffer
290304
291- // Validate word length and ensure we don't read past the file end
292- if ( length > 0 && length <= MAX_VALID_LENGTH && dctFile . Position + length <= dctFile . Length )
293- {
294- // Read the word bytes and convert to string
295- byte [ ] lchBuffer = reader . ReadBytes ( length ) ;
296- string tmpword = gb2312Encoding . GetString ( lchBuffer ) ;
305+ //tmpword = new String(lchBuffer, "GB2312");
306+ string tmpword = gb2312Encoding . GetString ( lchBuffer ) ; // LUCENENET specific: use cached encoding instance from base class
307+ //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
297308
298- // For regular entries (not header entries), prepend the current character
299- if ( i != HEADER_POSITION + GB2312_FIRST_CHAR )
300- {
301- tmpword = currentStr + tmpword ;
302- }
303309
304- // Create a span for efficient string handling
305- ReadOnlySpan < char > carray = tmpword . AsSpan ( ) ;
306- // Generate hash for the word
307- long hashId = Hash1 ( carray ) ;
308- // Find available slot in hash table
309- int index = GetAvaliableIndex ( hashId , carray ) ;
// Entries read under the header slot are kept as decoded; every other entry gets the current
// character prepended.
310+ if ( i != HEADER_POSITION + GB2312_FIRST_CHAR )
311+ {
312+ tmpword = currentStr + tmpword ;
313+ }
314+
315+ ReadOnlySpan < char > carray = tmpword . AsSpan ( ) ;
316+ long hashId = Hash1 ( carray ) ;
317+ int index = GetAvaliableIndex ( hashId , carray ) ;
310318
311- // Store word if a valid index was found
312- if ( index != - 1 )
319+ if ( index != - 1 )
320+ {
// The hash id is written only when the slot currently holds 0; the frequency is added to the
// slot's running total in either case.
321+ if ( bigramHashTable [ index ] == 0 )
313322 {
314- // Set hash ID if slot is empty
315- if ( bigramHashTable [ index ] == 0 )
316- {
317- bigramHashTable [ index ] = hashId ;
318- }
319- // Add word frequency to the table
320- frequencyTable [ index ] += frequency ;
323+ bigramHashTable [ index ] = hashId ;
324+ // bigramStringTable[index] = tmpword;
325+
321326 }
327+ frequencyTable [ index ] += frequency ;
322328 }
323329 }
324330 }
325331 }
326- // Handle expected end-of-file condition silently
327- catch ( EndOfStreamException ) { /* Reached end of file */ }
328- // Re-throw IO exceptions as required by contract
329- catch ( IOException ) { /* Re-throw as per method contract */ throw ; }
330-
331- // Note: Commented out logging statement
332- // log.info("load dictionary done! " + dctFilePath + " total:" + total);
333332 }
334333 private int GetAvaliableIndex ( long hashId , ReadOnlySpan < char > carray )
335334 {
0 commit comments