diff --git a/Cargo.toml b/Cargo.toml index 2e98610..7e4ea9c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ env_logger = { version = "0.9.0", optional = true } [dev-dependencies] rust-lzma = "0.5" seq-macro = "0.3" +xz2 = "0.1.6" [features] enable_logging = ["env_logger", "log"] diff --git a/src/decode/delta.rs b/src/decode/delta.rs new file mode 100644 index 0000000..08d07ed --- /dev/null +++ b/src/decode/delta.rs @@ -0,0 +1,60 @@ +use crate::{error, decode::lzbuffer::{self, LzBuffer}}; +use byteorder::ReadBytesExt; +use std::{num::Wrapping, io}; + +#[derive(Debug)] +/// Decoder for XZ delta-encoded blocks (filter 3). +pub struct DeltaDecoder { + distance: Wrapping, + pos: Wrapping, + delta: [Wrapping; 256], +} + +impl DeltaDecoder { + /// Creates a new object ready for transforming data that it's given. + pub fn new(property_distance: u8) -> Self { + DeltaDecoder { + distance: Wrapping(property_distance) + Wrapping(1), + pos: Wrapping(0u8), + delta: [Wrapping(0u8); 256], + } + } + + /// Performs the equivalent of replacing this decompression state with a + /// freshly allocated copy. + /// + /// This function may not allocate memory and will attempt to reuse any + /// previously allocated resources. + #[cfg(feature = "raw_decoder")] + pub fn reset(&mut self) { + self.pos = Wrapping(0u8); + self.delta = [Wrapping(0u8); 256]; + } + + /// Decompresses the input data into the output, consuming only as much + /// input as needed and writing as much output as possible. + pub fn decompress( + &mut self, + input: &mut R, + output: &mut W, + ) -> error::Result<()> { + let mut accum = lzbuffer::LzAccumBuffer::from_stream(output, usize::MAX); + + // See xz-file-format.txt for the C pseudocode this is implementing. + loop { + let byte = if let Ok(byte) = input.read_u8() { + Wrapping(byte) + } else { + lzma_info!("Delta end of input"); + break; + }; + + self.delta[self.pos.0 as usize] = byte + self.delta[(self.pos - self.distance).0 as usize]; + accum.append_literal(self.delta[self.pos.0 as usize].0)?; + self.pos += 1; + } + + accum.finish()?; + Ok(()) + } +} diff --git a/src/decode/mod.rs b/src/decode/mod.rs index 2a7b0b8..703717e 100644 --- a/src/decode/mod.rs +++ b/src/decode/mod.rs @@ -1,5 +1,6 @@ //! Decoding logic. +pub mod delta; pub mod lzbuffer; pub mod lzma; pub mod lzma2; diff --git a/src/decode/xz.rs b/src/decode/xz.rs index 95aeca0..acd2756 100644 --- a/src/decode/xz.rs +++ b/src/decode/xz.rs @@ -1,5 +1,6 @@ //! Decoder for the `.xz` file format. +use crate::decode::delta::DeltaDecoder; use crate::decode::lzma2::Lzma2Decoder; use crate::decode::util; use crate::error; @@ -173,15 +174,18 @@ where #[derive(Debug)] enum FilterId { Lzma2, + Delta, } fn get_filter_id(id: u64) -> error::Result { match id { 0x21 => Ok(FilterId::Lzma2), + 0x03 => Ok(FilterId::Delta), _ => Err(error::Error::XzError(format!("Unknown filter id {}", id))), } } +#[derive(Debug)] struct Filter { filter_id: FilterId, props: Vec, @@ -204,6 +208,13 @@ where R: io::BufRead, W: io::Write, { + // We use each block's "unpacked size" to pre-allocate tmpbuf to avoid the + // need to resize. The unpacked size is not a required field on the block, + // however, so we need to pick a default when it is not present. A default + // size of 4KiB (2^12) is a common page size and strikes a balance between + // over-allocating and creating many small allocations. + const DEFAULT_TMPBUF_SIZE: u64 = 4_096; + let mut digest = CRC32.digest(); digest.update(&[header_size]); let header_size = ((header_size as u64) << 2) - 1; @@ -223,31 +234,24 @@ where ))); } - let mut tmpbuf: Vec = Vec::new(); - let filters = block_header.filters; - for (i, filter) in filters.iter().enumerate() { - if i == 0 { - // TODO: use SubBufRead on input if packed_size is known? - let packed_size = decode_filter(count_input, &mut tmpbuf, filter)?; - if let Some(expected_packed_size) = block_header.packed_size { - if (packed_size as u64) != expected_packed_size { - return Err(error::Error::XzError(format!( - "Invalid compressed size: expected {} but got {}", - expected_packed_size, packed_size - ))); - } - } - } else { - let mut newbuf: Vec = Vec::new(); - decode_filter( - &mut io::BufReader::new(tmpbuf.as_slice()), - &mut newbuf, - filter, - )?; - // TODO: does this move or copy? - tmpbuf = newbuf; + let mut decompress_filters = block_header.filters.iter().rev(); + // In read_block_header, num_filters is always at least 1, so it is safe to unwrap here. + let first_filter = decompress_filters.next().expect("num_filters is at least 1"); + let mut tmpbuf = Vec::with_capacity(block_header.unpacked_size.unwrap_or(DEFAULT_TMPBUF_SIZE) as usize); + let packed_size = decode_filter(count_input, &mut tmpbuf, first_filter)?; + if let Some(expected_packed_size) = block_header.packed_size { + if (packed_size as u64) != expected_packed_size { + return Err(error::Error::XzError(format!( + "Invalid compressed size: expected {} but got {}", + expected_packed_size, packed_size + ))); } } + for filter in decompress_filters { + let mut succ = Vec::with_capacity(block_header.unpacked_size.unwrap_or(DEFAULT_TMPBUF_SIZE) as usize); + decode_filter(&mut (tmpbuf.as_slice()), &mut succ, filter)?; + tmpbuf = succ; + } let unpacked_size = tmpbuf.len(); lzma_info!("XZ block decompressed to {} byte(s)", tmpbuf.len()); @@ -350,6 +354,16 @@ where Lzma2Decoder::new().decompress(&mut count_input, output)?; Ok(count_input.count()) } + FilterId::Delta => { + if filter.props.len() != 1 { + return Err(error::Error::XzError(format!( + "Invalid properties for filter {:?}", + filter.filter_id + ))); + } + DeltaDecoder::new(filter.props[0]).decompress(&mut count_input, output)?; + Ok(count_input.count()) + } } } diff --git a/tests/files/delta-filter-3.dat b/tests/files/delta-filter-3.dat new file mode 100644 index 0000000..6c35fbf Binary files /dev/null and b/tests/files/delta-filter-3.dat differ diff --git a/tests/files/delta-filter-3.dat.xz b/tests/files/delta-filter-3.dat.xz new file mode 100644 index 0000000..998af51 Binary files /dev/null and b/tests/files/delta-filter-3.dat.xz differ diff --git a/tests/xz.rs b/tests/xz.rs index d624185..443ee52 100644 --- a/tests/xz.rs +++ b/tests/xz.rs @@ -1,13 +1,7 @@ #[cfg(feature = "enable_logging")] use log::{debug, info}; -use std::io::{BufReader, Cursor, Read}; - -/// Utility function to read a file into memory -fn read_all_file(filename: &str) -> std::io::Result> { - let mut data = Vec::new(); - std::fs::File::open(filename).and_then(|mut file| file.read_to_end(&mut data))?; - Ok(data) -} +use std::{fs, io::{BufRead, BufReader, Cursor, Read}}; +use xz2::stream; fn round_trip(x: &[u8]) { let mut compressed: Vec = Vec::new(); @@ -23,7 +17,7 @@ fn round_trip(x: &[u8]) { } fn round_trip_file(filename: &str) { - let x = read_all_file(filename).unwrap(); + let x = fs::read(filename).unwrap(); round_trip(x.as_slice()); } @@ -51,9 +45,26 @@ fn round_trip_files() { round_trip_file("tests/files/foo.txt"); } +fn decode_xz_xz2(f: R) -> Vec { + // create new XZ decompression stream with 8Gb memory limit and checksum + // verification disabled + let xz_stream = + stream::Stream::new_stream_decoder(8 * 1024 * 1024 * 1024, stream::IGNORE_CHECK) + .expect("Failed to create stream"); + let mut decomp: Vec = Vec::new(); + xz2::bufread::XzDecoder::new_stream(f, xz_stream).read_to_end(&mut decomp).unwrap(); + decomp +} + fn decomp_big_file(compfile: &str, plainfile: &str) { - let expected = read_all_file(plainfile).unwrap(); - let mut f = BufReader::new(std::fs::File::open(compfile).unwrap()); + let expected = fs::read(plainfile).unwrap(); + + // Decode with the reference implementation to ensure our test case is accurate + let mut f = BufReader::new(fs::File::open(compfile).unwrap()); + let decomp = decode_xz_xz2(f); + assert!(decomp == expected); + + let mut f = BufReader::new(fs::File::open(compfile).unwrap()); let mut decomp: Vec = Vec::new(); lzma_rs::xz_decompress(&mut f, &mut decomp).unwrap(); assert!(decomp == expected) @@ -126,7 +137,7 @@ fn test_xz_block_check_crc32_invalid() { let testcase = "tests/files/block-check-crc32.txt.xz"; let mut corrupted = { - let mut buf = read_all_file(testcase).unwrap(); + let mut buf = fs::read(testcase).unwrap(); // Mangle the "Block Check" field. buf[0x54] = 0x67; buf[0x55] = 0x45; @@ -144,3 +155,14 @@ fn test_xz_block_check_crc32_invalid() { "xz error: Invalid footer CRC32: expected 0x01234567 but got 0x8b0d303e" ) } + +#[test] +fn test_xz_delta_filter() { + #[cfg(feature = "enable_logging")] + let _ = env_logger::try_init(); + + decomp_big_file( + "tests/files/delta-filter-3.dat.xz", + "tests/files/delta-filter-3.dat", + ); +}