|
| 1 | +use std::fmt::{self, Display}; |
| 2 | +use std::fs::File; |
| 3 | +use std::io::{BufReader, Read, Seek, Write}; |
| 4 | + |
| 5 | +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; |
| 6 | + |
| 7 | +use crate::io::{Error, ErrorKind, Result}; |
| 8 | + |
/// Version number stored in the header; `Header::read_chunk` rejects any
/// other value.
const MODEL_VERSION: u32 = 0;

/// Magic bytes (`FiFu`) that open the header.
const MAGIC: [u8; 4] = *b"FiFu";
| 12 | + |
/// Identifier of a chunk type.
///
/// The explicit discriminants are the `u32` values used in the serialized
/// form (identifiers are written with `identifier as u32`), so changing a
/// discriminant changes the format.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
#[repr(u32)]
pub enum ChunkIdentifier {
    Header = 0,
    SimpleVocab = 1,
    NdArray = 2,
    FinalfusionSubwordVocab = 3,
    QuantizedArray = 4,
    Metadata = 5,
    NdNorms = 6,
    FastTextSubwordVocab = 7,
}
| 25 | + |
| 26 | +impl ChunkIdentifier { |
| 27 | + pub fn try_from(identifier: u32) -> Option<Self> { |
| 28 | + use self::ChunkIdentifier::*; |
| 29 | + |
| 30 | + match identifier { |
| 31 | + 1 => Some(SimpleVocab), |
| 32 | + 2 => Some(NdArray), |
| 33 | + 3 => Some(FinalfusionSubwordVocab), |
| 34 | + 4 => Some(QuantizedArray), |
| 35 | + 5 => Some(Metadata), |
| 36 | + 6 => Some(NdNorms), |
| 37 | + 7 => Some(FastTextSubwordVocab), |
| 38 | + _ => None, |
| 39 | + } |
| 40 | + } |
| 41 | + |
| 42 | + /// Read and ensure that the chunk has the given identifier. |
| 43 | + pub fn ensure_chunk_type<R>(read: &mut R, identifier: ChunkIdentifier) -> Result<()> |
| 44 | + where |
| 45 | + R: Read, |
| 46 | + { |
| 47 | + let chunk_id = read |
| 48 | + .read_u32::<LittleEndian>() |
| 49 | + .map_err(|e| ErrorKind::io_error("Cannot read chunk identifier", e))?; |
| 50 | + let chunk_id = ChunkIdentifier::try_from(chunk_id) |
| 51 | + .ok_or_else(|| ErrorKind::Format(format!("Unknown chunk identifier: {}", chunk_id))) |
| 52 | + .map_err(Error::from)?; |
| 53 | + if chunk_id != identifier { |
| 54 | + return Err(ErrorKind::Format(format!( |
| 55 | + "Invalid chunk identifier, expected: {}, got: {}", |
| 56 | + identifier, chunk_id |
| 57 | + )) |
| 58 | + .into()); |
| 59 | + } |
| 60 | + |
| 61 | + Ok(()) |
| 62 | + } |
| 63 | +} |
| 64 | + |
| 65 | +impl Display for ChunkIdentifier { |
| 66 | + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 67 | + use self::ChunkIdentifier::*; |
| 68 | + |
| 69 | + match self { |
| 70 | + Header => write!(f, "Header"), |
| 71 | + SimpleVocab => write!(f, "SimpleVocab"), |
| 72 | + NdArray => write!(f, "NdArray"), |
| 73 | + FastTextSubwordVocab => write!(f, "FastTextSubwordVocab"), |
| 74 | + FinalfusionSubwordVocab => write!(f, "FinalfusionSubwordVocab"), |
| 75 | + QuantizedArray => write!(f, "QuantizedArray"), |
| 76 | + Metadata => write!(f, "Metadata"), |
| 77 | + NdNorms => write!(f, "NdNorms"), |
| 78 | + } |
| 79 | + } |
| 80 | +} |
| 81 | + |
| 82 | +pub trait TypeId { |
| 83 | + /// Read and ensure that the data type is equal to `Self`. |
| 84 | + fn ensure_data_type<R>(read: &mut R) -> Result<()> |
| 85 | + where |
| 86 | + R: Read; |
| 87 | + |
| 88 | + fn type_id() -> u32; |
| 89 | +} |
| 90 | + |
| 91 | +macro_rules! typeid_impl { |
| 92 | + ($type:ty, $id:expr) => { |
| 93 | + impl TypeId for $type { |
| 94 | + fn ensure_data_type<R>(read: &mut R) -> Result<()> |
| 95 | + where |
| 96 | + R: Read, |
| 97 | + { |
| 98 | + let type_id = read |
| 99 | + .read_u32::<LittleEndian>() |
| 100 | + .map_err(|e| ErrorKind::io_error("Cannot read type identifier", e))?; |
| 101 | + if type_id != Self::type_id() { |
| 102 | + return Err(ErrorKind::Format(format!( |
| 103 | + "Invalid type, expected: {}, got: {}", |
| 104 | + Self::type_id(), |
| 105 | + type_id |
| 106 | + )) |
| 107 | + .into()); |
| 108 | + } |
| 109 | + |
| 110 | + Ok(()) |
| 111 | + } |
| 112 | + |
| 113 | + fn type_id() -> u32 { |
| 114 | + $id |
| 115 | + } |
| 116 | + } |
| 117 | + }; |
| 118 | +} |
| 119 | + |
| 120 | +typeid_impl!(f32, 10); |
| 121 | +typeid_impl!(u8, 1); |
| 122 | + |
| 123 | +pub trait ReadChunk |
| 124 | +where |
| 125 | + Self: Sized, |
| 126 | +{ |
| 127 | + fn read_chunk<R>(read: &mut R) -> Result<Self> |
| 128 | + where |
| 129 | + R: Read + Seek; |
| 130 | +} |
| 131 | + |
| 132 | +/// Memory-mappable chunks. |
| 133 | +pub trait MmapChunk |
| 134 | +where |
| 135 | + Self: Sized, |
| 136 | +{ |
| 137 | + /// Memory map a chunk. |
| 138 | + /// |
| 139 | + /// The given `File` object should be positioned at the start of the chunk. |
| 140 | + fn mmap_chunk(read: &mut BufReader<File>) -> Result<Self>; |
| 141 | +} |
| 142 | + |
| 143 | +pub trait WriteChunk { |
| 144 | + /// Get the identifier of a chunk. |
| 145 | + fn chunk_identifier(&self) -> ChunkIdentifier; |
| 146 | + |
| 147 | + fn write_chunk<W>(&self, write: &mut W) -> Result<()> |
| 148 | + where |
| 149 | + W: Write + Seek; |
| 150 | +} |
| 151 | + |
| 152 | +#[derive(Debug, Eq, PartialEq)] |
| 153 | +pub(crate) struct Header { |
| 154 | + chunk_identifiers: Vec<ChunkIdentifier>, |
| 155 | +} |
| 156 | + |
| 157 | +impl Header { |
| 158 | + pub fn new(chunk_identifiers: impl Into<Vec<ChunkIdentifier>>) -> Self { |
| 159 | + Header { |
| 160 | + chunk_identifiers: chunk_identifiers.into(), |
| 161 | + } |
| 162 | + } |
| 163 | + |
| 164 | + pub fn chunk_identifiers(&self) -> &[ChunkIdentifier] { |
| 165 | + &self.chunk_identifiers |
| 166 | + } |
| 167 | +} |
| 168 | + |
| 169 | +impl WriteChunk for Header { |
| 170 | + fn chunk_identifier(&self) -> ChunkIdentifier { |
| 171 | + ChunkIdentifier::Header |
| 172 | + } |
| 173 | + |
| 174 | + fn write_chunk<W>(&self, write: &mut W) -> Result<()> |
| 175 | + where |
| 176 | + W: Write + Seek, |
| 177 | + { |
| 178 | + write |
| 179 | + .write_all(&MAGIC) |
| 180 | + .map_err(|e| ErrorKind::io_error("Cannot write magic", e))?; |
| 181 | + write |
| 182 | + .write_u32::<LittleEndian>(MODEL_VERSION) |
| 183 | + .map_err(|e| ErrorKind::io_error("Cannot write model version", e))?; |
| 184 | + write |
| 185 | + .write_u32::<LittleEndian>(self.chunk_identifiers.len() as u32) |
| 186 | + .map_err(|e| ErrorKind::io_error("Cannot write chunk identifiers length", e))?; |
| 187 | + |
| 188 | + for &identifier in &self.chunk_identifiers { |
| 189 | + write |
| 190 | + .write_u32::<LittleEndian>(identifier as u32) |
| 191 | + .map_err(|e| ErrorKind::io_error("Cannot write chunk identifier", e))?; |
| 192 | + } |
| 193 | + |
| 194 | + Ok(()) |
| 195 | + } |
| 196 | +} |
| 197 | + |
| 198 | +impl ReadChunk for Header { |
| 199 | + fn read_chunk<R>(read: &mut R) -> Result<Self> |
| 200 | + where |
| 201 | + R: Read + Seek, |
| 202 | + { |
| 203 | + // Magic and version ceremony. |
| 204 | + let mut magic = [0u8; 4]; |
| 205 | + read.read_exact(&mut magic) |
| 206 | + .map_err(|e| ErrorKind::io_error("Cannot read magic", e))?; |
| 207 | + |
| 208 | + if magic != MAGIC { |
| 209 | + return Err(ErrorKind::Format(format!( |
| 210 | + "Expected 'FiFu' as magic, got: {}", |
| 211 | + String::from_utf8_lossy(&magic).into_owned() |
| 212 | + )) |
| 213 | + .into()); |
| 214 | + } |
| 215 | + |
| 216 | + let version = read |
| 217 | + .read_u32::<LittleEndian>() |
| 218 | + .map_err(|e| ErrorKind::io_error("Cannot read model version", e))?; |
| 219 | + if version != MODEL_VERSION { |
| 220 | + return Err( |
| 221 | + ErrorKind::Format(format!("Unknown finalfusion version: {}", version)).into(), |
| 222 | + ); |
| 223 | + } |
| 224 | + |
| 225 | + // Read chunk identifiers. |
| 226 | + let chunk_identifiers_len = read |
| 227 | + .read_u32::<LittleEndian>() |
| 228 | + .map_err(|e| ErrorKind::io_error("Cannot read chunk identifiers length", e))? |
| 229 | + as usize; |
| 230 | + let mut chunk_identifiers = Vec::with_capacity(chunk_identifiers_len); |
| 231 | + for _ in 0..chunk_identifiers_len { |
| 232 | + let identifier = read |
| 233 | + .read_u32::<LittleEndian>() |
| 234 | + .map_err(|e| ErrorKind::io_error("Cannot read chunk identifier", e))?; |
| 235 | + let chunk_identifier = ChunkIdentifier::try_from(identifier) |
| 236 | + .ok_or_else(|| { |
| 237 | + ErrorKind::Format(format!("Unknown chunk identifier: {}", identifier)) |
| 238 | + }) |
| 239 | + .map_err(Error::from)?; |
| 240 | + chunk_identifiers.push(chunk_identifier); |
| 241 | + } |
| 242 | + |
| 243 | + Ok(Header { chunk_identifiers }) |
| 244 | + } |
| 245 | +} |
| 246 | + |
#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use super::{ChunkIdentifier, Header, ReadChunk, WriteChunk};

    /// A header that is written and then read back compares equal to the
    /// original.
    #[test]
    fn header_write_read_roundtrip() {
        let expected = Header::new(vec![ChunkIdentifier::SimpleVocab, ChunkIdentifier::NdArray]);

        let mut buffer = Cursor::new(Vec::new());
        expected.write_chunk(&mut buffer).unwrap();

        // Rewind to the start before reading the header back.
        buffer.set_position(0);

        let read_back = Header::read_chunk(&mut buffer).unwrap();
        assert_eq!(read_back, expected);
    }
}
0 commit comments