Skip to content

Commit 6a7102e

Browse files
committed
Created a new struct, `EscapedStr`, which borrows the still-escaped string from the input when deserializing and exposes an iterator over its unescaped fragments. Useful for zero-copy deserialization.
1 parent 0141836 commit 6a7102e

File tree

4 files changed

+395
-77
lines changed

4 files changed

+395
-77
lines changed

src/de/mod.rs

+65-72
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,14 @@ pub enum Error {
9595

9696
impl serde::de::StdError for Error {}
9797

98+
impl From<crate::str::StringUnescapeError> for Error {
99+
fn from(error: crate::str::StringUnescapeError) -> Self {
100+
match error {
101+
crate::str::StringUnescapeError::InvalidEscapeSequence => Self::InvalidEscapeSequence,
102+
}
103+
}
104+
}
105+
98106
/// A structure that deserializes Rust values from JSON in a buffer.
99107
pub struct Deserializer<'b, 's> {
100108
slice: &'b [u8],
@@ -485,89 +493,43 @@ impl<'a, 'de, 's> de::Deserializer<'de> for &'a mut Deserializer<'de, 's> {
485493
where
486494
V: Visitor<'de>,
487495
{
488-
let s = self.parse_str()?;
496+
let escaped_string = self.parse_str()?;
489497

490498
if let Some(string_unescape_buffer) = self.string_unescape_buffer.as_deref_mut() {
491-
if s.as_bytes().contains(&b'\\') {
492-
let mut string_unescape_buffer_slots = string_unescape_buffer.iter_mut();
499+
if escaped_string.as_bytes().contains(&b'\\') {
500+
let mut string_unescape_buffer_write_position = 0;
493501

494-
// We've already checked that the string is valid UTF-8, so the only b'\\' is the start of escape sequence
495-
let mut escaped_string_bytes = s.as_bytes().iter();
502+
for fragment in crate::str::unescape_fragments(escaped_string) {
503+
let char_encode_buffer = &mut [0; 4];
496504

497-
loop {
498-
match escaped_string_bytes.next().copied() {
499-
None => break,
500-
Some(b'\\') => {
501-
let unescaped_byte = match escaped_string_bytes.next() {
502-
Some(b'"') => b'"',
503-
Some(b'\\') => b'\\',
504-
Some(b'/') => b'/',
505-
Some(b'b') => 0x8,
506-
Some(b'f') => 0xC,
507-
Some(b'n') => b'\n',
508-
Some(b'r') => b'\r',
509-
Some(b't') => b'\t',
510-
Some(b'u') => {
511-
// TODO - Replace with `<[u8]>::split_first_chunk::<4>` once MSRV >= 1.77
512-
fn split_first_slice(
513-
bytes: &[u8],
514-
len: usize,
515-
) -> Option<(&[u8], &[u8])>
516-
{
517-
Some((bytes.get(..len)?, bytes.get(len..)?))
518-
}
519-
520-
let (escape_sequence, remaining_escaped_string_bytes) =
521-
split_first_slice(escaped_string_bytes.as_slice(), 4)
522-
.ok_or(Error::InvalidEscapeSequence)?;
523-
524-
escaped_string_bytes = remaining_escaped_string_bytes.iter();
525-
526-
let unescaped_char = core::str::from_utf8(escape_sequence)
527-
.ok()
528-
.and_then(|escape_sequence| {
529-
u32::from_str_radix(escape_sequence, 16).ok()
530-
})
531-
.and_then(char::from_u32)
532-
.ok_or(Error::InvalidEscapeSequence)?;
533-
534-
for &unescaped_byte in
535-
unescaped_char.encode_utf8(&mut [0; 4]).as_bytes()
536-
{
537-
*string_unescape_buffer_slots
538-
.next()
539-
.ok_or(Error::EscapedStringIsTooLong)? = unescaped_byte;
540-
}
541-
542-
continue;
543-
}
544-
_ => return Err(Error::InvalidEscapeSequence),
545-
};
546-
547-
*string_unescape_buffer_slots
548-
.next()
549-
.ok_or(Error::EscapedStringIsTooLong)? = unescaped_byte;
505+
let unescaped_bytes = match fragment? {
506+
crate::str::EscapedStringFragment::NotEscaped(fragment) => {
507+
fragment.as_bytes()
550508
}
551-
Some(c) => {
552-
*string_unescape_buffer_slots
553-
.next()
554-
.ok_or(Error::EscapedStringIsTooLong)? = c;
509+
crate::str::EscapedStringFragment::Escaped(c) => {
510+
c.encode_utf8(char_encode_buffer).as_bytes()
555511
}
556-
}
557-
}
512+
};
513+
514+
string_unescape_buffer[string_unescape_buffer_write_position..]
515+
.get_mut(..unescaped_bytes.len())
516+
.ok_or(Error::EscapedStringIsTooLong)?
517+
.copy_from_slice(unescaped_bytes);
558518

559-
let remaining_length = string_unescape_buffer_slots.len();
560-
let unescaped_string_length = string_unescape_buffer.len() - remaining_length;
519+
string_unescape_buffer_write_position += unescaped_bytes.len();
520+
}
561521

562522
visitor.visit_str(
563-
str::from_utf8(&string_unescape_buffer[..unescaped_string_length])
564-
.map_err(|_| Error::InvalidUnicodeCodePoint)?,
523+
str::from_utf8(
524+
&string_unescape_buffer[..string_unescape_buffer_write_position],
525+
)
526+
.map_err(|_| Error::InvalidUnicodeCodePoint)?,
565527
)
566528
} else {
567-
visitor.visit_borrowed_str(s)
529+
visitor.visit_borrowed_str(escaped_string)
568530
}
569531
} else {
570-
visitor.visit_borrowed_str(s)
532+
visitor.visit_borrowed_str(escaped_string)
571533
}
572534
}
573535

@@ -638,11 +600,34 @@ impl<'a, 'de, 's> de::Deserializer<'de> for &'a mut Deserializer<'de, 's> {
638600
}
639601

640602
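/// Deserializes a newtype struct by handing the deserializer to the visitor. A newtype named `EscapedStr::NAME` instead receives the raw, still-escaped string borrowed from the input.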
641-
fn deserialize_newtype_struct<V>(self, _name: &'static str, visitor: V) -> Result<V::Value>
603+
fn deserialize_newtype_struct<V>(self, name: &'static str, visitor: V) -> Result<V::Value>
642604
where
643605
V: Visitor<'de>,
644606
{
645-
visitor.visit_newtype_struct(self)
607+
if name == crate::str::EscapedStr::NAME {
608+
struct EscapedStringDeserializer<'a, 'de, 's>(&'a mut Deserializer<'de, 's>);
609+
610+
impl<'a, 'de, 's> serde::Deserializer<'de> for EscapedStringDeserializer<'a, 'de, 's> {
611+
type Error = Error;
612+
613+
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value>
614+
where
615+
V: Visitor<'de>,
616+
{
617+
visitor.visit_borrowed_str(self.0.parse_str()?)
618+
}
619+
620+
serde::forward_to_deserialize_any! {
621+
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
622+
bytes byte_buf option unit unit_struct newtype_struct seq tuple
623+
tuple_struct map struct enum identifier ignored_any
624+
}
625+
}
626+
627+
visitor.visit_newtype_struct(EscapedStringDeserializer(self))
628+
} else {
629+
visitor.visit_newtype_struct(self)
630+
}
646631
}
647632

648633
fn deserialize_seq<V>(self, visitor: V) -> Result<V::Value>
@@ -1058,6 +1043,14 @@ mod tests {
10581043
);
10591044
}
10601045

1046+
#[test]
1047+
fn escaped_str() {
1048+
assert_eq!(
1049+
crate::from_str(r#""Hello\nWorld""#),
1050+
Ok((crate::str::EscapedStr::new(r#"Hello\nWorld"#).unwrap(), 14))
1051+
);
1052+
}
1053+
10611054
#[test]
10621055
fn struct_bool() {
10631056
#[derive(Debug, Deserialize, PartialEq)]

src/lib.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -55,14 +55,15 @@
5555
//! This crate is guaranteed to compile on stable Rust 1.62.0 and up. It *might* compile with older
5656
//! versions but that may change in any new patch release.
5757
58-
#![deny(missing_docs)]
58+
// #![deny(missing_docs)]
5959
#![deny(rust_2018_compatibility)]
6060
#![deny(rust_2018_idioms)]
61-
#![deny(warnings)]
61+
// #![deny(warnings)]
6262
#![cfg_attr(not(feature = "std"), no_std)]
6363

6464
pub mod de;
6565
pub mod ser;
66+
pub mod str;
6667

6768
#[doc(inline)]
6869
pub use self::de::{from_slice, from_slice_escaped, from_str, from_str_escaped};

0 commit comments

Comments
 (0)