@@ -418,27 +418,16 @@ where
418418 where
419419 W : Write + Seek ,
420420 {
421- // Chunk size: vocab size (u64), minimum n-gram length (u32),
422- // maximum n-gram length (u32), bucket exponent (u32), for
423- // each word: word length in bytes (u32), word bytes
424- // (variable-length).
425- let chunk_len = size_of :: < u64 > ( )
426- + size_of :: < u32 > ( )
427- + size_of :: < u32 > ( )
428- + size_of :: < u32 > ( )
429- + self
430- . words ( )
431- . iter ( )
432- . map ( |w| w. len ( ) + size_of :: < u32 > ( ) )
433- . sum :: < usize > ( ) ;
434-
435421 write
436422 . write_u32 :: < LittleEndian > ( chunk_identifier as u32 )
437423 . map_err ( |e| {
438424 Error :: write_error ( "Cannot write subword vocabulary chunk identifier" , e)
439425 } ) ?;
426+
427+ let remaining_chunk_len = self . chunk_len_ ( ) - ( size_of :: < u32 > ( ) + size_of :: < u64 > ( ) ) as u64 ;
428+
440429 write
441- . write_u64 :: < LittleEndian > ( chunk_len as u64 )
430+ . write_u64 :: < LittleEndian > ( remaining_chunk_len )
442431 . map_err ( |e| Error :: write_error ( "Cannot write subword vocabulary chunk length" , e) ) ?;
443432 write
444433 . write_u64 :: < LittleEndian > ( self . words . len ( ) as u64 )
@@ -563,34 +552,15 @@ impl ExplicitSubwordVocab {
563552 where
564553 W : Write + Seek ,
565554 {
566- // Chunk size: word vocab size (u64), ngram vocab size (u64),
567- // minimum n-gram length (u32), maximum n-gram length (u32),
568- // for each word and ngram:
569- // length in bytes (u32), number of bytes (variable-length).
570- // each ngram is followed by its index (u64)
571- let chunk_len = size_of :: < u64 > ( )
572- + size_of :: < u64 > ( )
573- + size_of :: < u32 > ( )
574- + size_of :: < u32 > ( )
575- + self
576- . words ( )
577- . iter ( )
578- . map ( |w| w. len ( ) + size_of :: < u32 > ( ) )
579- . sum :: < usize > ( )
580- + self
581- . indexer
582- . ngrams ( )
583- . iter ( )
584- . map ( |ngram| ngram. len ( ) + size_of :: < u32 > ( ) + size_of :: < u64 > ( ) )
585- . sum :: < usize > ( ) ;
555+ let remaining_chunk_len = self . chunk_len_ ( ) - ( size_of :: < u32 > ( ) + size_of :: < u64 > ( ) ) as u64 ;
586556
587557 write
588558 . write_u32 :: < LittleEndian > ( chunk_identifier as u32 )
589559 . map_err ( |e| {
590560 Error :: write_error ( "Cannot write subword vocabulary chunk identifier" , e)
591561 } ) ?;
592562 write
593- . write_u64 :: < LittleEndian > ( chunk_len as u64 )
563+ . write_u64 :: < LittleEndian > ( remaining_chunk_len )
594564 . map_err ( |e| Error :: write_error ( "Cannot write subword vocabulary chunk length" , e) ) ?;
595565 write
596566 . write_u64 :: < LittleEndian > ( self . words . len ( ) as u64 )
@@ -685,27 +655,15 @@ impl FloretSubwordVocab {
685655 where
686656 W : Write + Seek ,
687657 {
688- // Chunk size: minimum n-gram length (u32), maximum n-gram length (u32),
689- // number of buckets (u64), number of hashes (u32), hash seed (u32),
690- // bow and eow (variable length).
691-
692- let chunk_len = size_of :: < u32 > ( )
693- + size_of :: < u32 > ( )
694- + size_of :: < u64 > ( )
695- + size_of :: < u32 > ( )
696- + size_of :: < u32 > ( )
697- + self . bow . len ( )
698- + size_of :: < u32 > ( )
699- + self . eow . len ( )
700- + size_of :: < u32 > ( ) ;
658+ let remaining_chunk_len = self . chunk_len_ ( ) - ( size_of :: < u32 > ( ) + size_of :: < u64 > ( ) ) as u64 ;
701659
702660 write
703661 . write_u32 :: < LittleEndian > ( chunk_identifier as u32 )
704662 . map_err ( |e| {
705663 Error :: write_error ( "Cannot write subword vocabulary chunk identifier" , e)
706664 } ) ?;
707665 write
708- . write_u64 :: < LittleEndian > ( chunk_len as u64 )
666+ . write_u64 :: < LittleEndian > ( remaining_chunk_len )
709667 . map_err ( |e| Error :: write_error ( "Cannot write subword vocabulary chunk length" , e) ) ?;
710668 write
711669 . write_u32 :: < LittleEndian > ( self . min_n )
0 commit comments