diff --git a/Cargo.toml b/Cargo.toml index bbf9218089c..7925fc7024a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,7 +25,7 @@ exclude = [ [workspace.package] edition = "2024" version = "0.20.0" -rust-version = "1.88.0" +rust-version = "1.90.0" authors = ["boa-dev"] repository = "https://github.com/boa-dev/boa" license = "Unlicense OR MIT" diff --git a/core/engine/src/builtins/intl/number_format/options.rs b/core/engine/src/builtins/intl/number_format/options.rs index 89cf0b8b223..f43883b8284 100644 --- a/core/engine/src/builtins/intl/number_format/options.rs +++ b/core/engine/src/builtins/intl/number_format/options.rs @@ -5,19 +5,18 @@ use fixed_decimal::{ UnsignedRoundingMode, }; -use boa_macros::js_str; -use icu_decimal::preferences::NumberingSystem; -use icu_locale::extensions::unicode::Value; -use tinystr::TinyAsciiStr; - use crate::{ - Context, JsNativeError, JsObject, JsResult, JsStr, JsString, JsValue, + Context, JsNativeError, JsObject, JsResult, JsString, JsValue, builtins::{ intl::options::{default_number_option, get_number_option}, options::{OptionType, ParsableOptionType, get_option}, }, js_string, }; +use boa_string::Latin1JsStringBuilder; +use icu_decimal::preferences::NumberingSystem; +use icu_locale::extensions::unicode::Value; +use tinystr::TinyAsciiStr; impl OptionType for SignedRoundingMode { fn from_value(value: JsValue, context: &mut Context) -> JsResult { @@ -285,9 +284,9 @@ impl ParsableOptionType for Currency {} #[derive(Debug, Eq, PartialEq)] pub(crate) struct Unit { // INVARIANT: `numerator` must only contain ASCII lowercase alphabetic letters or `-`. 
- numerator: JsStr<'static>, + numerator: &'static str, // INVARIANT: if `denominator` is not empty, it must only contain ASCII lowercase alphabetic letters or `-` - denominator: JsStr<'static>, + denominator: &'static str, } impl Unit { @@ -296,9 +295,15 @@ impl Unit { if self.denominator.is_empty() { js_string!(self.numerator) } else { - // TODO: this is not optimal for now, but the new JS strings should - // allow us to optimize this to simple casts from ASCII to JsString. - js_string!(self.numerator, js_str!("-per-"), self.denominator) + let mut builder = Latin1JsStringBuilder::with_capacity( + self.numerator.len() + self.denominator.len() + 5, + ); + builder.extend_from_slice(self.numerator.as_bytes()); + builder.extend_from_slice(b"-per-"); + builder.extend_from_slice(self.denominator.as_bytes()); + builder + .build() + .expect("Builder failed, this should not happen") } } } @@ -377,17 +382,13 @@ impl std::str::FromStr for Unit { .map(|i| SANCTIONED_UNITS[i]) .map_err(|_| ParseUnitError)?; - let num = JsStr::latin1(num.as_bytes()); - - let den = if den.is_empty() { - JsStr::EMPTY + let den: &'static str = if den.is_empty() { + "" } else { - let value = SANCTIONED_UNITS + SANCTIONED_UNITS .binary_search(&den) .map(|i| SANCTIONED_UNITS[i]) - .map_err(|_| ParseUnitError)?; - - JsStr::latin1(value.as_bytes()) + .map_err(|_| ParseUnitError)? }; Ok(Self { diff --git a/core/engine/src/builtins/string/mod.rs b/core/engine/src/builtins/string/mod.rs index 1a6e90aaa29..af339542372 100644 --- a/core/engine/src/builtins/string/mod.rs +++ b/core/engine/src/builtins/string/mod.rs @@ -803,7 +803,7 @@ impl String { Ok(js_string!().into()) } else { // 13. Return the substring of S from from to to. - Ok(js_string!(string.get_expect(from..to)).into()) + Ok(unsafe { JsString::slice_unchecked(string.clone(), from, to).into() }) } } @@ -1906,7 +1906,8 @@ impl String { let to = max(final_start, final_end); // 10. Return the substring of S from from to to. 
- Ok(js_string!(string.get_expect(from..to)).into()) + // SAFETY: NOTE(review): relies on `from <= to <= string.len()`, as the replaced `get_expect(from..to)` call did; confirm. + Ok(unsafe { JsString::slice_unchecked(string.clone(), from, to).into() }) } /// `String.prototype.split ( separator, limit )` @@ -2002,7 +2003,9 @@ impl String { while let Some(index) = j { // a. Let T be the substring of S from i to j. // b. Append T as the last element of substrings. - substrings.push(this_str.get_expect(i..index).into()); + // SAFETY: we already checked that i and index are within range. + let sliced = unsafe { JsString::slice_unchecked(this_str.clone(), i, index) }; + substrings.push(sliced.into()); // c. If the number of elements of substrings is lim, return ! CreateArrayFromList(substrings). if substrings.len() == lim { diff --git a/core/engine/src/string.rs b/core/engine/src/string.rs index b91c5d3ccae..db660106088 100644 --- a/core/engine/src/string.rs +++ b/core/engine/src/string.rs @@ -57,7 +57,7 @@ macro_rules! js_string { ($s:literal) => {{ const LITERAL: &$crate::string::JsStr<'static> = &$crate::js_str!($s); - $crate::string::JsString::from_static_js_str(LITERAL) + $crate::string::JsString::from_static(LITERAL) }}; ($s:expr) => { $crate::string::JsString::from($s) diff --git a/core/engine/src/value/inner/nan_boxed.rs b/core/engine/src/value/inner/nan_boxed.rs index 2d7cd4c49a9..b92280f87c3 100644 --- a/core/engine/src/value/inner/nan_boxed.rs +++ b/core/engine/src/value/inner/nan_boxed.rs @@ -1,6 +1,6 @@ //! A NaN-boxed inner value for JavaScript values. //! -//! This [`JsValue`] is a float using `NaN` values to represent inner +//! This [`JsValue`] is a float using `NaN` values to represent an inner //! JavaScript value. //! //! 
# Assumptions @@ -111,7 +111,7 @@ use crate::{ symbol::RawJsSymbol, }; use boa_gc::{Finalize, GcBox, Trace, custom_trace}; -use boa_string::{JsString, RawJsString}; +use boa_string::JsString; use core::fmt; use static_assertions::const_assert; use std::{ @@ -676,7 +676,7 @@ impl NanBoxedValue { // SAFETY: the inner address must hold a valid, non-null JsString. unsafe { ManuallyDrop::new(JsString::from_raw(NonNull::new_unchecked( - self.ptr.with_addr(addr).cast::(), + self.ptr.with_addr(addr).cast(), ))) } } diff --git a/core/string/src/builder.rs b/core/string/src/builder.rs index b83d81c7708..616040dc5a5 100644 --- a/core/string/src/builder.rs +++ b/core/string/src/builder.rs @@ -1,4 +1,7 @@ -use crate::{DATA_OFFSET, JsStr, JsStrVariant, JsString, RawJsString, TaggedLen, alloc_overflow}; +use crate::{ + DATA_OFFSET, InnerStringKind, JsStr, JsStrVariant, JsString, SeqString, TaggedLen, + alloc_overflow, +}; use std::{ alloc::{Layout, alloc, dealloc, realloc}, @@ -14,7 +17,7 @@ use std::{ pub struct JsStringBuilder { cap: usize, len: usize, - inner: NonNull, + inner: NonNull, phantom_data: PhantomData, } @@ -170,7 +173,7 @@ impl JsStringBuilder { // the length of the string and the reference count. unsafe { alloc(new_layout) } }; - let Some(new_ptr) = NonNull::new(new_ptr.cast::()) else { + let Some(new_ptr) = NonNull::new(new_ptr.cast::()) else { std::alloc::handle_alloc_error(new_layout) }; self.inner = new_ptr; @@ -221,7 +224,7 @@ impl JsStringBuilder { fn new_layout(cap: usize) -> Layout { let new_layout = Layout::array::(cap) - .and_then(|arr| Layout::new::().extend(arr)) + .and_then(|arr| Layout::new::().extend(arr)) .map(|(layout, offset)| (layout.pad_to_align(), offset)) .map_err(|_| None); match new_layout { @@ -276,7 +279,7 @@ impl JsStringBuilder { } /// Allocates memory to the inner `RawJsString` by the given capacity. - /// Capacity calculation is from [`std::vec::Vec::reserve`]. + /// Capacity calculation is from [`Vec::reserve`]. 
fn allocate(&mut self, cap: usize) { let cap = std::cmp::max(self.capacity() * 2, cap); let cap = std::cmp::max(Self::MIN_NON_ZERO_CAP, cap); @@ -367,7 +370,7 @@ impl JsStringBuilder { // `NonNull` verified for us that the pointer returned by `alloc` is valid, // meaning we can write to its pointed memory. unsafe { - inner.as_ptr().write(RawJsString { + inner.as_ptr().write(SeqString { tagged_len: TaggedLen::new(len, latin1), refcount: Cell::new(1), data: [0; 0], @@ -375,10 +378,10 @@ impl JsStringBuilder { } // Tell the compiler not to call the destructor of `JsStringBuilder`, - // becuase we move inner `RawJsString` to `JsString`. + // because we move inner `SeqString` to `JsString`. std::mem::forget(self); - JsString { ptr: inner } + JsString::from_inner(inner, InnerStringKind::Sequence) } } diff --git a/core/string/src/code_point.rs b/core/string/src/code_point.rs new file mode 100644 index 00000000000..ef46adbf68a --- /dev/null +++ b/core/string/src/code_point.rs @@ -0,0 +1,78 @@ +use std::fmt::Write; + +/// Represents a Unicode codepoint within a [`JsString`], which could be a valid +/// '[Unicode scalar value]', or an unpaired surrogate. +/// +/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum CodePoint { + /// A valid Unicode scalar value. + Unicode(char), + + /// An unpaired surrogate. + UnpairedSurrogate(u16), +} + +impl CodePoint { + /// Get the number of UTF-16 code units needed to encode this code point. + #[inline] + #[must_use] + pub const fn code_unit_count(self) -> usize { + match self { + Self::Unicode(c) => c.len_utf16(), + Self::UnpairedSurrogate(_) => 1, + } + } + + /// Convert the code point to its [`u32`] representation. 
+ #[inline] + #[must_use] + pub fn as_u32(self) -> u32 { + match self { + Self::Unicode(c) => u32::from(c), + Self::UnpairedSurrogate(surr) => u32::from(surr), + } + } + + /// If the code point represents a valid 'Unicode scalar value', returns its [`char`] + /// representation, otherwise returns [`None`] on unpaired surrogates. + #[inline] + #[must_use] + pub const fn as_char(self) -> Option { + match self { + Self::Unicode(c) => Some(c), + Self::UnpairedSurrogate(_) => None, + } + } + + /// Encodes this code point as UTF-16 into the provided u16 buffer, and then returns the subslice + /// of the buffer that contains the encoded character. + /// + /// # Panics + /// + /// Panics if the buffer is not large enough. A buffer of length 2 is large enough to encode any + /// code point. + #[inline] + #[must_use] + pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] { + match self { + Self::Unicode(c) => c.encode_utf16(dst), + Self::UnpairedSurrogate(surr) => { + dst[0] = surr; + &mut dst[0..=0] + } + } + } +} + +impl std::fmt::Display for CodePoint { + #[inline] + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CodePoint::Unicode(c) => f.write_char(*c), + CodePoint::UnpairedSurrogate(c) => { + write!(f, "\\u{c:04X}") + } + } + } +} diff --git a/core/string/src/common.rs b/core/string/src/common.rs index da110526414..aed7de904e6 100644 --- a/core/string/src/common.rs +++ b/core/string/src/common.rs @@ -12,7 +12,7 @@ macro_rules! well_known_statics { paste!{ #[doc = "Gets the static `JsString` for `\"" $string "\"`."] pub const $name: JsString = const { - JsString::from_static_js_str(Self::find_static_js_string($string)) + JsString::from_static(Self::find_static_js_string($string)) }; } )+ @@ -73,7 +73,7 @@ impl StaticJsStrings { // SAFETY: Type of T in is `&'static JsStr<'static>`, so this is safe. 
let ptr = unsafe { std::mem::transmute::<&JsStr<'_>, &'static JsStr<'static>>(str) }; - Some(JsString::from_static_js_str(ptr)) + Some(JsString::from_static(ptr)) } // Some consts are only used on certain features, which triggers the unused lint. diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index 1c4e9a82028..3b993e0fdc4 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -13,6 +13,7 @@ #![allow(clippy::module_name_repetitions)] mod builder; +mod code_point; mod common; mod display; mod iter; @@ -26,11 +27,13 @@ use crate::display::{JsStrDisplayEscaped, JsStrDisplayLossy}; #[doc(inline)] pub use crate::{ builder::{CommonJsStringBuilder, Latin1JsStringBuilder, Utf16JsStringBuilder}, + code_point::CodePoint, common::StaticJsStrings, iter::Iter, str::{JsStr, JsStrVariant}, }; -use std::fmt::Write; +use std::num::NonZero; +use std::ops::BitAnd; use std::{ alloc::{Layout, alloc, dealloc}, cell::Cell, @@ -83,83 +86,6 @@ pub(crate) const fn is_trimmable_whitespace_latin1(c: u8) -> bool { ) } -/// Represents a Unicode codepoint within a [`JsString`], which could be a valid -/// '[Unicode scalar value]', or an unpaired surrogate. -/// -/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum CodePoint { - /// A valid Unicode scalar value. - Unicode(char), - - /// An unpaired surrogate. - UnpairedSurrogate(u16), -} - -impl CodePoint { - /// Get the number of UTF-16 code units needed to encode this code point. - #[inline] - #[must_use] - pub const fn code_unit_count(self) -> usize { - match self { - Self::Unicode(c) => c.len_utf16(), - Self::UnpairedSurrogate(_) => 1, - } - } - - /// Convert the code point to its [`u32`] representation. 
- #[inline] - #[must_use] - pub fn as_u32(self) -> u32 { - match self { - Self::Unicode(c) => u32::from(c), - Self::UnpairedSurrogate(surr) => u32::from(surr), - } - } - - /// If the code point represents a valid 'Unicode scalar value', returns its [`char`] - /// representation, otherwise returns [`None`] on unpaired surrogates. - #[inline] - #[must_use] - pub const fn as_char(self) -> Option { - match self { - Self::Unicode(c) => Some(c), - Self::UnpairedSurrogate(_) => None, - } - } - - /// Encodes this code point as UTF-16 into the provided u16 buffer, and then returns the subslice - /// of the buffer that contains the encoded character. - /// - /// # Panics - /// - /// Panics if the buffer is not large enough. A buffer of length 2 is large enough to encode any - /// code point. - #[inline] - #[must_use] - pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] { - match self { - Self::Unicode(c) => c.encode_utf16(dst), - Self::UnpairedSurrogate(surr) => { - dst[0] = surr; - &mut dst[0..=0] - } - } - } -} - -impl std::fmt::Display for CodePoint { - #[inline] - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - CodePoint::Unicode(c) => f.write_char(*c), - CodePoint::UnpairedSurrogate(c) => { - write!(f, "\\u{c:04X}") - } - } - } -} - /// A `usize` contains a flag and the length of Latin1/UTF-16 . /// ```text /// ┌────────────────────────────────────┐ @@ -188,32 +114,42 @@ impl TaggedLen { } } -/// The raw representation of a [`JsString`] in the heap. -#[repr(C)] -#[allow(missing_debug_implementations)] -pub struct RawJsString { +/// A sequential memory array of strings. +#[repr(C, align(8))] +struct SeqString { tagged_len: TaggedLen, refcount: Cell, data: [u8; 0], } -impl RawJsString { - const fn is_latin1(&self) -> bool { - self.tagged_len.is_latin1() - } - - const fn len(&self) -> usize { - self.tagged_len.len() - } +/// A slice of an existing string. 
+#[repr(C, align(8))] +struct SliceString { + data: JsString, + start: usize, + end: usize, } -const DATA_OFFSET: usize = size_of::(); +/// A static constant string, without reference counting. +#[repr(transparent)] +struct StaticString(JsStr<'static>); + +/// Strings can be represented by multiple kinds. This is used as the +/// tag for the tagged pointer in [`JsString`]. +#[derive(Clone, Copy, Eq, PartialEq)] +enum InnerStringKind { + /// A sequential memory slice of either Latin1 or UTF-16. See [`SeqString`]. + Sequence = 0, -enum Unwrapped<'a> { - Heap(NonNull), - Static(&'a JsStr<'static>), + /// A slice of an existing string. See [`SliceString`]. + Slice = 1, + + /// A static string that is valid for `'static` lifetime. + Static = 2, } +const DATA_OFFSET: usize = size_of::(); + /// A Latin1 or UTF-16–encoded, reference counted, immutable string. /// /// This is pretty similar to a [Rc][std::rc::Rc]\<[\[u16\]][slice]\>, but without the @@ -226,7 +162,9 @@ enum Unwrapped<'a> { /// memory on the heap to reduce the overhead of memory allocation and reference counting. #[allow(clippy::module_name_repetitions)] pub struct JsString { - ptr: NonNull, + /// A tagged pointer with alignment at least 8. This pointer cannot be NULL so we + /// use a `NonNull` instance, but it can point to different types. + tagged_pointer: NonNull<()>, } // JsString should always be pointer sized. @@ -240,8 +178,8 @@ impl<'a> From<&'a JsString> for JsStr<'a> { } impl<'a> IntoIterator for &'a JsString { - type IntoIter = Iter<'a>; type Item = u16; + type IntoIter = Iter<'a>; #[inline] fn into_iter(self) -> Self::IntoIter { @@ -477,17 +415,17 @@ impl JsString { self.as_str().display_lossy() } - /// Consumes the [`JsString`], returning a pointer to `RawJsString`. + /// Consumes the [`JsString`], returning the internal tagged pointer. /// /// To avoid a memory leak the pointer must be converted back to a `JsString` using /// [`JsString::from_raw`]. 
#[inline] #[must_use] - pub fn into_raw(self) -> NonNull { - ManuallyDrop::new(self).ptr + pub fn into_raw(self) -> NonNull<()> { + ManuallyDrop::new(self).tagged_pointer } - /// Constructs a `JsString` from a pointer to `RawJsString`. + /// Constructs a `JsString` from the internal tagged pointer. /// /// The raw pointer must have been previously returned by a call to /// [`JsString::into_raw`]. @@ -498,32 +436,136 @@ impl JsString { /// even if the returned `JsString` is never accessed. #[inline] #[must_use] - pub unsafe fn from_raw(ptr: NonNull) -> Self { - Self { ptr } + pub unsafe fn from_raw(ptr: NonNull<()>) -> Self { + Self { + tagged_pointer: ptr, + } } } // `&JsStr<'static>` must always be aligned so it can be taggged. static_assertions::const_assert!(align_of::<*const JsStr<'static>>() >= 2); +/// Dealing with inner types. impl JsString { - /// Create a [`JsString`] from a static js string. + /// Create a [`JsString`] `StaticString` from a static js string. + #[inline] #[must_use] - pub const fn from_static_js_str(src: &'static JsStr<'static>) -> Self { - let src = ptr::from_ref(src); - + pub const fn from_static(src: &'static JsStr<'static>) -> Self { // SAFETY: A reference cannot be null, so this is safe. - // - // TODO: Replace once `NonNull::from_ref()` is stabilized. - let ptr = unsafe { NonNull::new_unchecked(src.cast_mut()) }; + let ptr = NonNull::from_ref(src); - // SAFETY: - // - Adding one to an aligned pointer will tag the pointer's last bit. - // - The pointer's provenance remains unchanged, so this is safe. - let tagged_ptr = unsafe { ptr.byte_add(1) }; + Self::from_inner(ptr, InnerStringKind::Static) + } + + /// Create a [`JsString`] from an existing `JsString` and start, end + /// range. `end` is 1 past the last character (or `== data.len()` + /// for the last character). + /// + /// # Safety + /// It is the responsibility of the caller to ensure: + /// - start <= end. If start == end, the string is empty. + /// - end <= data.len(). 
+ #[inline] + #[must_use] + pub unsafe fn slice_unchecked(data: JsString, start: usize, end: usize) -> Self { + let ptr = Box::into_raw(Box::new(SliceString { data, start, end })); + + // SAFETY: Allocation worked. + Self::from_inner( + unsafe { NonNull::new_unchecked(ptr) }, + InnerStringKind::Slice, + ) + } + + /// Create a [`JsString`] from an existing `JsString` and start, end + /// range. Returns None if the start/end are invalid. + #[inline] + #[must_use] + pub fn slice(&self, p1: usize, p2: usize) -> Option { + if p1 > p2 || p2 > self.len() { + None + } else if p1 == p2 { + Some(StaticJsStrings::EMPTY_STRING) + } else { + // SAFETY: We just checked the conditions. + Some(unsafe { Self::slice_unchecked(self.clone(), p1, p2) }) + } + } + + /// Returns the [`InnerStringKind`] stored in the low tag bits of the inner pointer. + #[inline] + #[must_use] + fn kind(&self) -> InnerStringKind { + match self.tagged_pointer.addr().get() & 0x07 { + 0 => InnerStringKind::Sequence, + 1 => InnerStringKind::Slice, + 2 => InnerStringKind::Static, + // SAFETY: We never create other variants, so this is unreachable. + _ => unsafe { std::hint::unreachable_unchecked() }, + } + } + + /// Create a new [`JsString`] with a pointer and a kind. + #[inline] + #[must_use] + const fn from_inner(ptr: NonNull, kind: InnerStringKind) -> Self { + Self { + // SAFETY: Kind is a known quantity that cannot surpass the alignment + // of the pointed to structure. + tagged_pointer: unsafe { ptr.cast::<()>().byte_add(kind as usize) }, + } + } + + /// Get the inner pointer's destination as a reference of type T. + /// + /// # Safety + /// This should only be used when the inner type has been validated. Using + /// an unvalidated inner type is undefined behaviour. + #[inline] + #[must_use] + unsafe fn as_inner(&self) -> &T { + // SAFETY: The outer function is unsafe and the condition should be respected. 
+ unsafe { + self.tagged_pointer + .cast::() + .map_addr(|x| NonZero::new_unchecked(x.get().bitand(!0x7))) + .as_ref() + } + } - JsString { - ptr: tagged_ptr.cast::(), + /// Get the inner pointer's destination as a pointer of type T. + /// + /// # Safety + /// This should only be used when the inner type has been validated. Using + /// an unvalidated inner type is undefined behaviour. + #[inline] + #[must_use] + unsafe fn as_inner_ptr(&self) -> NonNull { + // SAFETY: The outer function is unsafe and the condition should be respected. + unsafe { + self.tagged_pointer + .cast::() + .map_addr(|x| NonZero::new_unchecked(x.get().bitand(!0x7))) + } + } + + #[inline] + fn on_kind_ref( + &self, + if_seq: impl FnOnce(&SeqString) -> T, + if_slice: impl FnOnce(&SliceString) -> T, + if_static: impl FnOnce(&StaticString) -> T, + ) -> T { + match self.tagged_pointer.addr().get() & 0x07 { + // SAFETY: This is safe as long as [`InnerStringKind::Sequence`] is 0. + 0 => if_seq(unsafe { self.tagged_pointer.cast::().as_ref() }), + // SAFETY: We're matching on the pointer tag and validated the type of the pointer. + 1 => if_slice(unsafe { self.as_inner() }), + // SAFETY: We're matching on the pointer tag and validated the type of the pointer. + 2 => if_static(unsafe { self.as_inner() }), + // SAFETY: This cannot happen as it's built by one of our constructors. + _ => unsafe { std::hint::unreachable_unchecked() }, } } @@ -531,39 +573,58 @@ impl JsString { #[inline] #[must_use] pub fn is_static(&self) -> bool { - self.ptr.addr().get() & 1 != 0 + self.kind() == InnerStringKind::Static } - pub(crate) fn unwrap(&self) -> Unwrapped<'_> { - if self.is_static() { - // SAFETY: Static pointer is tagged and already checked, so this is safe. - let ptr = unsafe { self.ptr.byte_sub(1) }; + /// Check if the [`JsString`] is a [`SeqString`]. 
+ #[inline] + #[must_use] + pub fn is_seq(&self) -> bool { + self.kind() == InnerStringKind::Sequence + } - // SAFETY: A static pointer always points to a valid JsStr, so this is safe. - Unwrapped::Static(unsafe { ptr.cast::>().as_ref() }) - } else { - Unwrapped::Heap(self.ptr) - } + /// Check if the [`JsString`] is a [`SliceString`]. + #[inline] + #[must_use] + pub fn is_slice(&self) -> bool { + self.kind() == InnerStringKind::Slice + } } +impl JsString { /// Obtains the underlying [`&[u16]`][slice] slice of a [`JsString`] #[inline] #[must_use] pub fn as_str(&self) -> JsStr<'_> { - let ptr = match self.unwrap() { - Unwrapped::Heap(ptr) => ptr.as_ptr(), - Unwrapped::Static(js_str) => return *js_str, + let (len, is_latin1, ptr) = match self.kind() { + InnerStringKind::Sequence => { + // SAFETY: Already checked the kind. + let str = unsafe { self.as_inner::() }; + let len = str.tagged_len.len(); + let is_latin1 = str.tagged_len.is_latin1(); + let ptr = (&raw const str.data).cast::(); + (len, is_latin1, ptr) + } + InnerStringKind::Slice => { + // SAFETY: Already checked the kind. + let inner_str = unsafe { self.as_inner::() }; + let str = inner_str.data.as_str(); + let len = inner_str.end - inner_str.start; + let is_latin1 = str.is_latin1(); + // SAFETY: We check at creation that `start` < `len`. NOTE(review): `str.ptr()` is a `*const u8`, so `add` offsets in bytes; confirm this is correct when the data is UTF-16. + let ptr = unsafe { str.ptr().add(inner_str.start) }; + (len, is_latin1, ptr) + } + InnerStringKind::Static => { + // SAFETY: Already checked the kind. + return unsafe { self.as_inner::() }.0; + } }; // SAFETY: - // - Unwrapped heap ptr is always a valid heap allocated RawJsString. + // - Unwrapped heap ptr is always a valid heap allocated SeqString. // - Length of a heap allocated string always contains the correct size of the string. 
unsafe { - let tagged_len = (*ptr).tagged_len; - let len = tagged_len.len(); - let is_latin1 = tagged_len.is_latin1(); - let ptr = (&raw const (*ptr).data).cast::(); - if is_latin1 { JsStr::latin1(std::slice::from_raw_parts(ptr, len)) } else { @@ -598,7 +659,7 @@ impl JsString { full_count = sum; } - let ptr = Self::allocate_inner(full_count, latin1_encoding); + let ptr = Self::allocate_seq(full_count, latin1_encoding); let string = { // SAFETY: `allocate_inner` guarantees that `ptr` is a valid pointer. @@ -641,22 +702,20 @@ impl JsString { } } } - Self { - // SAFETY: We already know it's a valid heap pointer. - ptr: unsafe { NonNull::new_unchecked(ptr.as_ptr()) }, - } + + Self::from_inner(ptr, InnerStringKind::Sequence) }; StaticJsStrings::get_string(&string.as_str()).unwrap_or(string) } - /// Allocates a new [`RawJsString`] with an internal capacity of `str_len` chars. + /// Allocates a new [`SeqString`] with an internal capacity of `str_len` chars. /// /// # Panics /// /// Panics if `try_allocate_inner` returns `Err`. - fn allocate_inner(str_len: usize, latin1: bool) -> NonNull { - match Self::try_allocate_inner(str_len, latin1) { + fn allocate_seq(str_len: usize, latin1: bool) -> NonNull { + match Self::try_allocate_seq(str_len, latin1) { Ok(v) => v, Err(None) => alloc_overflow(), Err(Some(layout)) => std::alloc::handle_alloc_error(layout), @@ -665,32 +724,33 @@ impl JsString { // This is marked as safe because it is always valid to call this function to request any number // of `u16`, since this function ought to fail on an OOM error. - /// Allocates a new [`RawJsString`] with an internal capacity of `str_len` chars. + /// Allocates a new [`SeqString`] with an internal capacity of `str_len` chars. /// /// # Errors /// /// Returns `Err(None)` on integer overflows `usize::MAX`. /// Returns `Err(Some(Layout))` on allocation error. 
- fn try_allocate_inner( + fn try_allocate_seq( str_len: usize, latin1: bool, - ) -> Result, Option> { + ) -> Result, Option> { let (layout, offset) = if latin1 { Layout::array::(str_len) } else { Layout::array::(str_len) } - .and_then(|arr| Layout::new::().extend(arr)) + .and_then(|arr| Layout::new::().extend(arr)) .map(|(layout, offset)| (layout.pad_to_align(), offset)) .map_err(|_| None)?; debug_assert_eq!(offset, DATA_OFFSET); + debug_assert_eq!(layout.align(), align_of::()); #[allow(clippy::cast_ptr_alignment)] // SAFETY: - // The layout size of `RawJsString` is never zero, since it has to store + // The layout size of `SeqString` is never zero, since it has to store // the length of the string and the reference count. - let inner = unsafe { alloc(layout).cast::() }; + let inner = unsafe { alloc(layout).cast::() }; // We need to verify that the pointer returned by `alloc` is not null, otherwise // we should abort, since an allocation error is pretty unrecoverable for us @@ -701,8 +761,8 @@ impl JsString { // `NonNull` verified for us that the pointer returned by `alloc` is valid, // meaning we can write to its pointed memory. unsafe { - // Write the first part, the `RawJsString`. - inner.as_ptr().write(RawJsString { + // Write the first part, the `SeqString`. + inner.as_ptr().write(SeqString { tagged_len: TaggedLen::new(str_len, latin1), refcount: Cell::new(1), data: [0; 0], @@ -713,9 +773,9 @@ impl JsString { let inner = inner.as_ptr(); // SAFETY: // - `inner` must be a valid pointer, since it comes from a `NonNull`, - // meaning we can safely dereference it to `RawJsString`. + // meaning we can safely dereference it to `SeqString`. // - `offset` should point us to the beginning of the array, - // and since we requested an `RawJsString` layout with a trailing + // and since we requested an `SeqString` layout with a trailing // `[u16; str_len]`, the memory of the array must be in the `usize` // range for the allocation to succeed. 
unsafe { @@ -732,7 +792,7 @@ impl JsString { /// Creates a new [`JsString`] from `data`, without checking if the string is in the interner. fn from_slice_skip_interning(string: JsStr<'_>) -> Self { let count = string.len(); - let ptr = Self::allocate_inner(count, string.is_latin1()); + let ptr = Self::allocate_seq(count, string.is_latin1()); // SAFETY: `allocate_inner` guarantees that `ptr` is a valid pointer. let data = unsafe { (&raw mut (*ptr.as_ptr()).data).cast::() }; @@ -757,11 +817,12 @@ impl JsString { } } } - Self { ptr } + + Self::from_inner(ptr, InnerStringKind::Sequence) } /// Creates a new [`JsString`] from `data`. - fn from_slice(string: JsStr<'_>) -> Self { + fn from_js_str(string: JsStr<'_>) -> Self { if let Some(s) = StaticJsStrings::get_string(&string) { return s; } @@ -772,35 +833,42 @@ impl JsString { #[inline] #[must_use] pub fn refcount(&self) -> Option { - if self.is_static() { - return None; + match self.kind() { + InnerStringKind::Sequence => { + // SAFETY: We are guaranteed a valid kind of string. + Some(unsafe { self.as_inner::() }.refcount.get()) + } + InnerStringKind::Slice => { + // SAFETY: We are guaranteed a valid kind of string. + unsafe { self.as_inner::() }.data.refcount() + } + InnerStringKind::Static => None, } - - // SAFETY: - // `NonNull` and the constructions of `JsString` guarantee that `inner` is always valid. - let rc = unsafe { self.ptr.as_ref().refcount.get() }; - Some(rc) } } impl Clone for JsString { #[inline] fn clone(&self) -> Self { - if self.is_static() { - return Self { ptr: self.ptr }; - } - - // SAFETY: `NonNull` and the constructions of `JsString` guarantee that `inner` is always valid. 
- let inner = unsafe { self.ptr.as_ref() }; - - let strong = inner.refcount.get().wrapping_add(1); - if strong == 0 { - abort() - } - - inner.refcount.set(strong); - - Self { ptr: self.ptr } + self.on_kind_ref( + |seq| { + let strong = seq.refcount.get().wrapping_add(1); + if strong == 0 { + abort() + } + seq.refcount.set(strong); + Self { + tagged_pointer: self.tagged_pointer, + } + }, + |slice| { + // SAFETY: If this string is valid, the new one will be too. + unsafe { Self::slice_unchecked(slice.data.clone(), slice.start, slice.end) } + }, + |_| Self { + tagged_pointer: self.tagged_pointer, + }, + ) } } @@ -815,43 +883,52 @@ impl Drop for JsString { #[inline] fn drop(&mut self) { // See https://doc.rust-lang.org/src/alloc/sync.rs.html#1672 for details. + match self.kind() { + InnerStringKind::Sequence => { + // SAFETY: This is safe as long as [`InnerStringKind::Sequence`] is 0. + let inner = unsafe { self.tagged_pointer.cast::().as_ref() }; + let new = inner.refcount.get() - 1; + inner.refcount.set(new); + if new != 0 { + return; + } - if self.is_static() { - return; - } - - // SAFETY: `NonNull` and the constructions of `JsString` guarantees that `raw` is always valid. - let inner = unsafe { self.ptr.as_ref() }; - - inner.refcount.set(inner.refcount.get() - 1); - if inner.refcount.get() != 0 { - return; - } + // SAFETY: + // All the checks for the validity of the layout have already been made on `alloc_inner`, + // so we can skip the unwrap. + let layout = unsafe { + if inner.tagged_len.is_latin1() { + Layout::for_value(inner) + .extend(Layout::array::(inner.tagged_len.len()).unwrap_unchecked()) + .unwrap_unchecked() + .0 + .pad_to_align() + } else { + Layout::for_value(inner) + .extend(Layout::array::(inner.tagged_len.len()).unwrap_unchecked()) + .unwrap_unchecked() + .0 + .pad_to_align() + } + }; - // SAFETY: - // All the checks for the validity of the layout have already been made on `alloc_inner`, - // so we can skip the unwrap. 
- let layout = unsafe { - if inner.is_latin1() { - Layout::for_value(inner) - .extend(Layout::array::(inner.len()).unwrap_unchecked()) - .unwrap_unchecked() - .0 - .pad_to_align() - } else { - Layout::for_value(inner) - .extend(Layout::array::(inner.len()).unwrap_unchecked()) - .unwrap_unchecked() - .0 - .pad_to_align() + // SAFETY: + // If refcount is 0 and we call drop, that means this is the last `JsString` which + // points to this memory allocation, so deallocating it is safe. + unsafe { + dealloc(self.as_inner_ptr::().as_ptr().cast(), layout); + } + } + InnerStringKind::Slice => { + // Drop the original data, that's it. + // SAFETY: This is always guaranteed to be the right kind of pointer. + unsafe { + drop(Box::from_raw(self.as_inner_ptr::().as_ptr())); + } + } + InnerStringKind::Static => { + // Do nothing on static strings. } - }; - - // SAFETY: - // If refcount is 0 and we call drop, that means this is the last `JsString` which - // points to this memory allocation, so deallocating it is safe. - unsafe { - dealloc(self.ptr.cast().as_ptr(), layout); } } } @@ -890,7 +967,7 @@ impl_from_number_for_js_string!( impl From<&[u16]> for JsString { #[inline] fn from(s: &[u16]) -> Self { - JsString::from_slice(JsStr::utf16(s)) + JsString::from_js_str(JsStr::utf16(s)) } } diff --git a/core/string/src/str.rs b/core/string/src/str.rs index f3cdec1d214..09b2bc6caa3 100644 --- a/core/string/src/str.rs +++ b/core/string/src/str.rs @@ -1,5 +1,5 @@ use crate::{ - CodePoint, Iter, TaggedLen, + CodePoint, Iter, JsString, TaggedLen, display::{JsStrDisplayEscaped, JsStrDisplayLossy}, is_trimmable_whitespace, is_trimmable_whitespace_latin1, }; @@ -60,6 +60,7 @@ struct Inner<'a> { /// This is equivalent to Rust's `&str`. 
#[derive(Clone, Copy)] +#[repr(align(8))] pub struct JsStr<'a> { inner: Inner<'a>, } @@ -108,6 +109,10 @@ impl<'a> JsStr<'a> { self.inner.tagged_len.len() } + pub(crate) const fn ptr(&self) -> *const u8 { + self.inner.ptr + } + /// Return the inner [`JsStrVariant`] varient of the [`JsStr`]. #[inline] #[must_use] @@ -660,6 +665,8 @@ impl std::fmt::Debug for JsStr<'_> { pub trait JsSliceIndex<'a>: SliceIndex<[u8]> + SliceIndex<[u16]> { type Value; + fn get_from_string(str: &JsString, index: Self) -> Option; + fn get(_: JsStr<'a>, index: Self) -> Option; unsafe fn get_unchecked(value: JsStr<'a>, index: Self) -> Self::Value; @@ -668,6 +675,11 @@ pub trait JsSliceIndex<'a>: SliceIndex<[u8]> + SliceIndex<[u16]> { impl<'a> JsSliceIndex<'a> for usize { type Value = u16; + #[inline] + fn get_from_string(str: &JsString, index: Self) -> Option { + str.slice(index, index + 1) + } + #[inline] fn get(value: JsStr<'a>, index: Self) -> Option { match value.variant() { @@ -694,6 +706,11 @@ impl<'a> JsSliceIndex<'a> for usize { impl<'a> JsSliceIndex<'a> for std::ops::Range { type Value = JsStr<'a>; + #[inline] + fn get_from_string(str: &JsString, index: Self) -> Option { + str.slice(index.start, index.end) + } + #[inline] fn get(value: JsStr<'a>, index: Self) -> Option { match value.variant() { @@ -720,6 +737,10 @@ impl<'a> JsSliceIndex<'a> for std::ops::Range { impl<'a> JsSliceIndex<'a> for std::ops::RangeInclusive { type Value = JsStr<'a>; + fn get_from_string(str: &JsString, index: Self) -> Option { + str.slice(*index.start(), *index.end() + 1) + } + #[inline] fn get(value: JsStr<'a>, index: Self) -> Option { match value.variant() { @@ -746,6 +767,10 @@ impl<'a> JsSliceIndex<'a> for std::ops::RangeInclusive { impl<'a> JsSliceIndex<'a> for std::ops::RangeFrom { type Value = JsStr<'a>; + fn get_from_string(str: &JsString, index: Self) -> Option { + str.slice(index.start, str.len()) + } + #[inline] fn get(value: JsStr<'a>, index: Self) -> Option { match value.variant() { @@ -772,6 
+797,10 @@ impl<'a> JsSliceIndex<'a> for std::ops::RangeFrom { impl<'a> JsSliceIndex<'a> for std::ops::RangeTo { type Value = JsStr<'a>; + fn get_from_string(str: &JsString, index: Self) -> Option { + str.slice(0, index.end) + } + #[inline] fn get(value: JsStr<'a>, index: Self) -> Option { match value.variant() { @@ -798,6 +827,10 @@ impl<'a> JsSliceIndex<'a> for std::ops::RangeTo { impl<'a> JsSliceIndex<'a> for std::ops::RangeFull { type Value = JsStr<'a>; + fn get_from_string(str: &JsString, _index: Self) -> Option { + Some(str.clone()) + } + #[inline] fn get(value: JsStr<'a>, _index: Self) -> Option { Some(value) diff --git a/core/string/src/tests.rs b/core/string/src/tests.rs index 533333883c7..ce69a88e2af 100644 --- a/core/string/src/tests.rs +++ b/core/string/src/tests.rs @@ -74,11 +74,11 @@ fn ptr_eq() { assert!(!x.is_static()); - assert_eq!(x.ptr.addr(), y.ptr.addr()); + assert_eq!(x.tagged_pointer.addr(), y.tagged_pointer.addr()); let z = JsString::from("Hello"); - assert_ne!(x.ptr.addr(), z.ptr.addr()); - assert_ne!(y.ptr.addr(), z.ptr.addr()); + assert_ne!(x.tagged_pointer.addr(), z.tagged_pointer.addr()); + assert_ne!(y.tagged_pointer.addr(), z.tagged_pointer.addr()); } #[test] @@ -88,11 +88,11 @@ fn static_ptr_eq() { assert!(x.is_static()); - assert_eq!(x.ptr.addr(), y.ptr.addr()); + assert_eq!(x.tagged_pointer.addr(), y.tagged_pointer.addr()); let z = StaticJsStrings::EMPTY_STRING; - assert_eq!(x.ptr.addr(), z.ptr.addr()); - assert_eq!(y.ptr.addr(), z.ptr.addr()); + assert_eq!(x.tagged_pointer.addr(), z.tagged_pointer.addr()); + assert_eq!(y.tagged_pointer.addr(), z.tagged_pointer.addr()); } #[test] @@ -203,8 +203,8 @@ fn from_static_js_string() { static STATIC_EMOJIS: JsStr<'static> = JsStr::utf16(&[0xD83C, 0xDFB9, 0xD83C, 0xDFB6, 0xD83C, 0xDFB5]); // 🎹🎶🎵 - let latin1 = JsString::from_static_js_str(&STATIC_HELLO_WORLD); - let utf16 = JsString::from_static_js_str(&STATIC_EMOJIS); + let latin1 = JsString::from_static(&STATIC_HELLO_WORLD); + let utf16 = 
JsString::from_static(&STATIC_EMOJIS); // content compare assert_eq!(latin1, "hello world"); @@ -233,8 +233,8 @@ fn compare_static_and_dynamic_js_string() { static STATIC_EMOJIS: JsStr<'static> = JsStr::utf16(&[0xD83C, 0xDFB9, 0xD83C, 0xDFB6, 0xD83C, 0xDFB5]); // 🎹🎶🎵 - let static_latin1 = JsString::from_static_js_str(&STATIC_HELLO_WORLD); - let static_utf16 = JsString::from_static_js_str(&STATIC_EMOJIS); + let static_latin1 = JsString::from_static(&STATIC_HELLO_WORLD); + let static_utf16 = JsString::from_static(&STATIC_EMOJIS); let dynamic_latin1 = JsString::from(JsStr::latin1("hello world".as_bytes())); let dynamic_utf16 = JsString::from(&[0xD83C, 0xDFB9, 0xD83C, 0xDFB6, 0xD83C, 0xDFB5]);