diff --git a/cli/src/debug/string.rs b/cli/src/debug/string.rs index 479227011ae..edf3e7abe75 100644 --- a/cli/src/debug/string.rs +++ b/cli/src/debug/string.rs @@ -1,6 +1,6 @@ use boa_engine::{ Context, JsNativeError, JsObject, JsResult, JsValue, NativeFunction, js_string, - object::ObjectInitializer, property::Attribute, string::JsStrVariant, + object::ObjectInitializer, property::Attribute, string::JsStr, }; fn storage(_: &JsValue, args: &[JsValue], _: &mut Context) -> JsResult { @@ -34,9 +34,10 @@ fn encoding(_: &JsValue, args: &[JsValue], _: &mut Context) -> JsResult }; let str = string.as_str(); - let encoding = match str.variant() { - JsStrVariant::Latin1(_) => "latin1", - JsStrVariant::Utf16(_) => "utf16", + let encoding = match str { + JsStr::Latin1(_) => "latin1", + JsStr::Utf16(_) => "utf16", + JsStr::Rope(_) => "rope", }; Ok(js_string!(encoding).into()) } @@ -55,9 +56,10 @@ fn summary(_: &JsValue, args: &[JsValue], context: &mut Context) -> JsResult "latin1", - JsStrVariant::Utf16(_) => "utf16", + let encoding = match string.as_str() { + JsStr::Latin1(_) => "latin1", + JsStr::Utf16(_) => "utf16", + JsStr::Rope(_) => "rope", }; let summary = ObjectInitializer::new(context) diff --git a/core/engine/src/builtins/date/utils.rs b/core/engine/src/builtins/date/utils.rs index 964c63e541f..addf6b19a48 100644 --- a/core/engine/src/builtins/date/utils.rs +++ b/core/engine/src/builtins/date/utils.rs @@ -1,6 +1,5 @@ use crate::{JsStr, JsString, context::HostHooks, js_string, value::IntegerOrInfinity}; use boa_macros::js_str; -use boa_string::JsStrVariant; use std::slice::Iter; use std::str; use std::{borrow::Cow, iter::Peekable}; @@ -753,22 +752,22 @@ pub(super) fn pad_six(t: u32, output: &mut [u8; 6]) -> JsStr<'_> { /// [spec-format]: https://tc39.es/ecma262/#sec-date-time-string-format pub(super) fn parse_date(date: &JsString, hooks: &dyn HostHooks) -> Option { // All characters must be ASCII so we can return early if we find a non-ASCII character. - let owned_js_str = date.as_str(); - let date = match owned_js_str.variant() { - JsStrVariant::Latin1(s) => { + let date = match date.as_flat_str() { + JsStr::Latin1(s) => { if !s.is_ascii() { return None; } // SAFETY: Since all characters are ASCII we can safely convert this into str. Cow::Borrowed(unsafe { str::from_utf8_unchecked(s) }) } - JsStrVariant::Utf16(s) => { + JsStr::Utf16(s) => { let date = String::from_utf16(s).ok()?; if !date.is_ascii() { return None; } Cow::Owned(date) } + JsStr::Rope(_) => unreachable!("rope should be flattened by as_flat_str"), }; // Date Time String Format: 'YYYY-MM-DDTHH:mm:ss.sssZ' diff --git a/core/engine/src/builtins/intl/segmenter/iterator.rs b/core/engine/src/builtins/intl/segmenter/iterator.rs index ceb7c1222f2..f677664ee15 100644 --- a/core/engine/src/builtins/intl/segmenter/iterator.rs +++ b/core/engine/src/builtins/intl/segmenter/iterator.rs @@ -128,7 +128,7 @@ impl SegmentIterator { .downcast_ref::() .js_expect("segment iterator object should contain a segmenter") .ok()?; - let mut segments = segmenter.native.segment(string.variant()); + let mut segments = segmenter.native.segment(string.as_str()); // the first elem is always 0. segments.next(); segments diff --git a/core/engine/src/builtins/intl/segmenter/mod.rs b/core/engine/src/builtins/intl/segmenter/mod.rs index 459a385f6ed..b1134bc3074 100644 --- a/core/engine/src/builtins/intl/segmenter/mod.rs +++ b/core/engine/src/builtins/intl/segmenter/mod.rs @@ -15,7 +15,7 @@ use crate::{ string::StaticJsStrings, }; use boa_gc::{Finalize, Trace}; -use boa_string::JsStrVariant; +use boa_string::JsStr; use icu_collator::provider::CollationDiacriticsV1; use icu_locale::Locale; use icu_segmenter::{ @@ -63,12 +63,9 @@ impl NativeSegmenter { /// Segment the passed string, returning an iterator with the index boundaries /// of the segments. - pub(crate) fn segment<'l, 's>( - &'l self, - input: JsStrVariant<'s>, - ) -> NativeSegmentIterator<'l, 's> { + pub(crate) fn segment<'l, 's>(&'l self, input: JsStr<'s>) -> NativeSegmentIterator<'l, 's> { match input { - JsStrVariant::Latin1(input) => match self { + JsStr::Latin1(input) => match self { Self::Grapheme(g) => { NativeSegmentIterator::GraphemeLatin1(g.as_borrowed().segment_latin1(input)) } @@ -79,7 +76,7 @@ impl NativeSegmenter { NativeSegmentIterator::SentenceLatin1(s.as_borrowed().segment_latin1(input)) } }, - JsStrVariant::Utf16(input) => match self { + JsStr::Utf16(input) => match self { Self::Grapheme(g) => { NativeSegmentIterator::GraphemeUtf16(g.as_borrowed().segment_utf16(input)) } @@ -90,6 +87,10 @@ impl NativeSegmenter { NativeSegmentIterator::SentenceUtf16(s.as_borrowed().segment_utf16(input)) } }, + JsStr::Rope(rope) => { + let flat = boa_string::vtable::rope::flatten_rope(rope.header); + self.segment(flat.as_str()) + } } } } diff --git a/core/engine/src/builtins/intl/segmenter/segments.rs b/core/engine/src/builtins/intl/segmenter/segments.rs index f489dce1247..ea4170ee66f 100644 --- a/core/engine/src/builtins/intl/segmenter/segments.rs +++ b/core/engine/src/builtins/intl/segmenter/segments.rs @@ -89,7 +89,7 @@ impl Segments { // 8. Let startIndex be ! FindBoundary(segmenter, string, n, before). // 9. Let endIndex be ! FindBoundary(segmenter, string, n, after). let (range, is_word_like) = { - let mut segments = segmenter.native.segment(segments.string.variant()); + let mut segments = segmenter.native.segment(segments.string.as_str()); std::iter::from_fn(|| segments.next().map(|i| (i, segments.is_word_like()))) .tuple_windows() .find(|((i, _), (j, _))| (*i..*j).contains(&n)) diff --git a/core/engine/src/builtins/number/globals.rs b/core/engine/src/builtins/number/globals.rs index 39c8c92b611..31308fb6bc5 100644 --- a/core/engine/src/builtins/number/globals.rs +++ b/core/engine/src/builtins/number/globals.rs @@ -8,7 +8,6 @@ use crate::{ }; use boa_macros::js_str; -use boa_string::JsStrVariant; /// Builtin javascript 'isFinite(number)' function. /// @@ -359,13 +358,14 @@ pub(crate) fn parse_float( return Ok(JsValue::nan()); } - let value = match trimmed_string.variant() { - JsStrVariant::Latin1(s) => fast_float2::parse_partial::(s), - JsStrVariant::Utf16(s) => { + let value = match input_string.as_flat_str() { + JsStr::Latin1(s) => fast_float2::parse_partial::(s), + JsStr::Utf16(s) => { // TODO: Explore adding direct UTF-16 parsing support to fast_float2. let s = String::from_utf16_lossy(s); fast_float2::parse_partial::(s.as_bytes()) } + JsStr::Rope(_) => unreachable!("rope should be flattened by as_flat_str"), }; Ok(value.map_or_else( diff --git a/core/engine/src/builtins/regexp/mod.rs b/core/engine/src/builtins/regexp/mod.rs index 037eed6eb19..1764d6d0621 100644 --- a/core/engine/src/builtins/regexp/mod.rs +++ b/core/engine/src/builtins/regexp/mod.rs @@ -18,7 +18,7 @@ use crate::{ object::{CONSTRUCTOR, JsObject, internal_methods::get_prototype_from_constructor}, property::Attribute, realm::Realm, - string::{CodePoint, CommonJsStringBuilder, JsStrVariant, StaticJsStrings}, + string::{CodePoint, CommonJsStringBuilder, JsStr, StaticJsStrings}, symbol::JsSymbol, value::JsValue, }; @@ -1157,20 +1157,20 @@ impl RegExp { // 13.b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S. // 13.c. Let r be matcher(input, inputIndex). - let r: Option = match (full_unicode, input.as_str().variant()) { - (true | false, JsStrVariant::Latin1(_)) => { + let r: Option = match (full_unicode, input.as_flat_str()) { + (true | false, JsStr::Latin1(input)) => { // TODO: Currently regress does not support latin1 encoding. - let input = input.to_vec(); - + let input = input.iter().map(|&b| u16::from(b)).collect::>(); // NOTE: We can use the faster ucs2 variant since there will never be two byte unicode. matcher.find_from_ucs2(&input, last_index as usize).next() } - (true, JsStrVariant::Utf16(input)) => { + (true, JsStr::Utf16(input)) => { matcher.find_from_utf16(input, last_index as usize).next() } - (false, JsStrVariant::Utf16(input)) => { + (false, JsStr::Utf16(input)) => { matcher.find_from_ucs2(input, last_index as usize).next() } + (_, JsStr::Rope(_)) => unreachable!("rope should be flattened by as_flat_str"), }; let Some(match_value) = r else { diff --git a/core/engine/src/builtins/string/mod.rs b/core/engine/src/builtins/string/mod.rs index 9421b4654d7..f96ba385437 100644 --- a/core/engine/src/builtins/string/mod.rs +++ b/core/engine/src/builtins/string/mod.rs @@ -653,18 +653,19 @@ impl String { let this = this.require_object_coercible()?; // 2. Let S be ? ToString(O). - let mut string = this.to_string(context)?; + let mut strings = Vec::with_capacity(args.len() + 1); + strings.push(this.to_string(context)?); // 3. Let R be S. // 4. For each element next of args, do for arg in args { // a. Let nextString be ? ToString(next). // b. Set R to the string-concatenation of R and nextString. - string = js_string!(&string, &arg.to_string(context)?); + strings.push(arg.to_string(context)?); } // 5. Return R. - Ok(JsValue::new(string)) + Ok(JsValue::new(JsString::concat_array_strings(&strings))) } /// `String.prototype.repeat( count )` diff --git a/core/engine/src/module/loader/mod.rs b/core/engine/src/module/loader/mod.rs index 6c6e4b89c23..f0495800874 100644 --- a/core/engine/src/module/loader/mod.rs +++ b/core/engine/src/module/loader/mod.rs @@ -3,6 +3,8 @@ use std::cell::RefCell; use std::path::{Component, Path, PathBuf}; use std::rc::Rc; +#[cfg(target_family = "windows")] +use cow_utils::CowUtils; use dynify::dynify; use rustc_hash::FxHashMap; @@ -62,7 +64,7 @@ pub fn resolve_module_specifier( // On Windows, also replace `/` with `\`. JavaScript imports use `/` as path separator. #[cfg(target_family = "windows")] - let specifier = cow_utils::CowUtils::cow_replace(&*specifier, '/', "\\"); + let specifier = specifier.cow_replace('/', "\\"); let short_path = Path::new(&*specifier); diff --git a/core/engine/src/string.rs b/core/engine/src/string.rs index 35c40c8c245..4a19f8ae225 100644 --- a/core/engine/src/string.rs +++ b/core/engine/src/string.rs @@ -62,10 +62,10 @@ macro_rules! js_string { $crate::string::JsString::from($s) }; ( $x:expr, $y:expr ) => { - $crate::string::JsString::concat($crate::string::JsStr::from($x), $crate::string::JsStr::from($y)) + $crate::string::JsString::concat(&$crate::string::JsString::from($x), &$crate::string::JsString::from($y)) }; ( $( $s:expr ),+ ) => { - $crate::string::JsString::concat_array(&[ $( $crate::string::JsStr::from($s) ),+ ]) + $crate::string::JsString::concat_array_strings(&[ $( $crate::string::JsString::from($s) ),+ ]) }; } @@ -166,15 +166,15 @@ mod tests { let x = js_string!("hello"); let z = js_string!("world"); - let xy = js_string!(&x, &JsString::from(Y)); + let xy = js_string!(x.clone(), JsString::from(Y)); assert_eq!(&xy, utf16!("hello, ")); assert_eq!(xy.refcount(), Some(1)); - let xyz = js_string!(&xy, &z); + let xyz = js_string!(xy.clone(), z.clone()); assert_eq!(&xyz, utf16!("hello, world")); assert_eq!(xyz.refcount(), Some(1)); - let xyzw = js_string!(&xyz, &JsString::from(W)); + let xyzw = js_string!(xyz.clone(), JsString::from(W)); assert_eq!(&xyzw, utf16!("hello, world!")); assert_eq!(xyzw.refcount(), Some(1)); } diff --git a/core/engine/src/vm/opcode/concat/mod.rs b/core/engine/src/vm/opcode/concat/mod.rs index eee34e20fa4..67fec65236d 100644 --- a/core/engine/src/vm/opcode/concat/mod.rs +++ b/core/engine/src/vm/opcode/concat/mod.rs @@ -21,7 +21,7 @@ impl ConcatToString { let val = context.vm.get_register(value.into()).clone(); strings.push(val.to_string(context)?); } - let s = JsString::concat_array(&strings.iter().map(JsString::as_str).collect::>()); + let s = JsString::concat_array_strings(&strings); context.vm.set_register(string.into(), s.into()); Ok(()) } diff --git a/core/runtime/src/text/encodings.rs b/core/runtime/src/text/encodings.rs index cbe961fdc8e..31850063130 100644 --- a/core/runtime/src/text/encodings.rs +++ b/core/runtime/src/text/encodings.rs @@ -22,8 +22,17 @@ pub(crate) mod utf8 { } pub(crate) mod utf16le { + use boa_engine::string::JsStr; use boa_engine::{JsString, js_string}; + #[allow(dead_code)] + pub(crate) fn encode(input: &JsString) -> Vec { + match input.as_str() { + JsStr::Latin1(l) => l.iter().flat_map(|c| [*c, 0]).collect(), + JsStr::Utf16(s) => bytemuck::cast_slice(s).to_vec(), + JsStr::Rope(_) => input.iter().flat_map(u16::to_le_bytes).collect(), + } + } pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString { if strip_bom { input = input.strip_prefix(&[0xFF, 0xFE]).unwrap_or(input); @@ -48,8 +57,17 @@ pub(crate) mod utf16le { } pub(crate) mod utf16be { + use boa_engine::string::JsStr; use boa_engine::{JsString, js_string}; + #[allow(dead_code)] + pub(crate) fn encode(input: &JsString) -> Vec { + match input.as_str() { + JsStr::Latin1(l) => l.iter().flat_map(|c| [0, *c]).collect(), + JsStr::Utf16(s) => s.iter().flat_map(|b| b.to_be_bytes()).collect::>(), + JsStr::Rope(_) => input.iter().flat_map(u16::to_be_bytes).collect(), + } + } pub(crate) fn decode(mut input: Vec, strip_bom: bool) -> JsString { if strip_bom && input.starts_with(&[0xFE, 0xFF]) { input.drain(..2); diff --git a/core/string/src/builder.rs b/core/string/src/builder.rs index e245ac7a894..ec5a2c07a3c 100644 --- a/core/string/src/builder.rs +++ b/core/string/src/builder.rs @@ -1,5 +1,5 @@ use crate::r#type::{InternalStringType, Latin1, Utf16}; -use crate::{JsStr, JsStrVariant, JsString, SequenceString, alloc_overflow}; +use crate::{JsStr, JsString, SequenceString, alloc_overflow}; use std::{ alloc::{Layout, alloc, dealloc, realloc}, marker::PhantomData, @@ -41,7 +41,7 @@ impl JsStringBuilder { } } - /// Returns the number of elements that inner `RawJsString` holds. + /// Returns the number of elements that inner `JsStringHeader` holds. #[inline] #[must_use] pub const fn len(&self) -> usize { @@ -68,7 +68,7 @@ impl JsStringBuilder { self.cap } - /// Returns the allocated byte of inner `RawJsString`'s data. + /// Returns the allocated byte of `JsStringHeader`'s data. #[must_use] const fn allocated_data_byte_len(&self) -> usize { self.len() * Self::DATA_SIZE @@ -90,8 +90,8 @@ impl JsStringBuilder { let layout = Self::new_layout(cap); #[allow(clippy::cast_ptr_alignment)] // SAFETY: - // The layout size of `RawJsString` is never zero, since it has to store - // the length of the string and the reference count. + // The layout size of `JsStringHeader` is never zero, since it has to store + // basic string metadata like length and refcount. let ptr = unsafe { alloc(layout) }; let Some(ptr) = NonNull::new(ptr.cast()) else { @@ -105,7 +105,7 @@ impl JsStringBuilder { } } - /// Checks if the inner `RawJsString` is allocated. + /// Checks if the inner `JsStringHeader` is allocated. #[must_use] fn is_allocated(&self) -> bool { self.inner != NonNull::dangling() @@ -163,12 +163,12 @@ impl JsStringBuilder { // SAFETY: // Valid pointer is required by `realloc` and pointer is checked above to be valid. // The layout size of the sequence string is never zero, since it has to store - // the length of the string and the reference count. + // the `JsStringHeader` header. unsafe { realloc(old_ptr.cast(), old_layout, new_layout.size()) } } else { // SAFETY: // The layout size of the sequence string is never zero, since it has to store - // the length of the string and the reference count. + // the `JsStringHeader` header. unsafe { alloc(new_layout) } }; @@ -179,7 +179,7 @@ impl JsStringBuilder { self.cap = Self::capacity_from_layout(new_layout); } - /// Appends an element to the inner `RawJsString` of `JsStringBuilder`. + /// Appends an element to the inner `JsStringHeader` of `JsStringBuilder`. #[inline] pub fn push(&mut self, v: D::Byte) { let required_cap = self.len() + 1; @@ -277,7 +277,7 @@ impl JsStringBuilder { } } - /// Allocates memory to the inner `RawJsString` by the given capacity. + /// Allocates memory to the inner `JsStringHeader` by the given capacity. /// Capacity calculation is from [`Vec::reserve`]. fn allocate(&mut self, cap: usize) { let cap = std::cmp::max(self.capacity() * 2, cap); @@ -285,7 +285,7 @@ impl JsStringBuilder { self.allocate_inner(Self::new_layout(cap)); } - /// Appends an element to the inner `RawJsString` of `JsStringBuilder` without doing bounds check. + /// Appends an element to the inner `JsStringHeader` of `JsStringBuilder` without doing bounds check. /// # Safety /// /// Caller should ensure the capacity is large enough to hold elements. @@ -305,7 +305,7 @@ impl JsStringBuilder { self.len() == 0 } - /// Checks if all bytes in inner `RawJsString`'s data are ascii. + /// Checks if all bytes in inner `JsStringHeader`'s data are ascii. #[inline] #[must_use] pub fn is_ascii(&self) -> bool { @@ -318,20 +318,20 @@ impl JsStringBuilder { data.is_ascii() } - /// Extracts a slice containing the elements in the inner `RawJsString`. + /// Extracts a slice containing the elements in the inner `JsStringHeader`. #[inline] #[must_use] pub fn as_slice(&self) -> &[D::Byte] { if self.is_allocated() { // SAFETY: - // The inner `RawJsString` is allocated which means it is not null. + // The inner `JsStringHeader` is allocated which means it is not null. unsafe { std::slice::from_raw_parts(self.data(), self.len()) } } else { &[] } } - /// Extracts a mutable slice containing the elements in the inner `RawJsString`. + /// Extracts a mutable slice containing the elements in the inner `JsStringHeader`. /// /// # Safety /// The caller must ensure that the content of the slice is valid encoding before the borrow ends. @@ -341,7 +341,7 @@ impl JsStringBuilder { pub unsafe fn as_mut_slice(&mut self) -> &mut [D::Byte] { if self.is_allocated() { // SAFETY: - // The inner `RawJsString` is allocated which means it is not null. + // The inner `JsStringHeader` is allocated which means it is not null. unsafe { std::slice::from_raw_parts_mut(self.data(), self.len()) } } else { &mut [] @@ -810,14 +810,16 @@ impl<'seg, 'ref_str: 'seg> CommonJsStringBuilder<'seg> { match seg { Segment::String(s) => { let js_str = s.as_str(); - match js_str.variant() { - JsStrVariant::Latin1(s) => builder.extend(s.iter().copied().map(u16::from)), - JsStrVariant::Utf16(s) => builder.extend_from_slice(s), + match js_str { + JsStr::Latin1(s) => builder.extend(s.iter().copied().map(u16::from)), + JsStr::Utf16(s) => builder.extend_from_slice(s), + JsStr::Rope(_) => builder.extend(js_str.iter()), } } - Segment::Str(s) => match s.variant() { - JsStrVariant::Latin1(s) => builder.extend(s.iter().copied().map(u16::from)), - JsStrVariant::Utf16(s) => builder.extend_from_slice(s), + Segment::Str(s) => match s { + JsStr::Latin1(s) => builder.extend(s.iter().copied().map(u16::from)), + JsStr::Utf16(s) => builder.extend_from_slice(s), + JsStr::Rope(_) => builder.extend(s.iter()), }, Segment::Latin1(latin1) => builder.push(u16::from(latin1)), Segment::CodePoint(code_point) => { diff --git a/core/string/src/common.rs b/core/string/src/common.rs index 132c1c97afe..57f7e67b5bc 100644 --- a/core/string/src/common.rs +++ b/core/string/src/common.rs @@ -232,7 +232,7 @@ static RAW_STATICS_CACHE: LazyLock, &'static StaticStri LazyLock::new(|| RAW_STATICS.iter().map(|s| (s.str, s)).collect()); /// Array of raw static strings that aren't reference counted. -const RAW_STATICS: &[StaticString] = &[ +pub(crate) const RAW_STATICS: &[StaticString] = &[ StaticString::new(JsStr::latin1("".as_bytes())), // Well known symbols StaticString::new(JsStr::latin1("Symbol.asyncIterator".as_bytes())), diff --git a/core/string/src/display.rs b/core/string/src/display.rs index 17e585db03a..e1e7fc36498 100644 --- a/core/string/src/display.rs +++ b/core/string/src/display.rs @@ -1,6 +1,6 @@ //! Display implementations for [`JsString`]. -use crate::{CodePoint, JsStr, JsStrVariant, JsString, JsStringKind, SliceString}; +use crate::{CodePoint, JsStr, JsString, JsStringKind, SliceString}; use std::cell::RefCell; use std::fmt; use std::fmt::Write; @@ -14,19 +14,21 @@ pub struct JsStrDisplayEscaped<'a> { impl fmt::Display for JsStrDisplayEscaped<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self.inner.variant() { - // SAFETY: `JsStrVariant::Latin1` does not contain any unpaired surrogates, so need to check. - JsStrVariant::Latin1(v) => v + match self.inner.as_str() { + // SAFETY: `JsStr::Latin1` does not contain any unpaired surrogates, so need to check. + JsStr::Latin1(v) => v .iter() .copied() .map(char::from) .try_for_each(|c| f.write_char(c)), - JsStrVariant::Utf16(_) => self.inner.code_points().try_for_each(|r| match r { - CodePoint::Unicode(c) => f.write_char(c), - CodePoint::UnpairedSurrogate(c) => { - write!(f, "\\u{c:04X}") - } - }), + JsStr::Utf16(_) | JsStr::Rope(_) => { + self.inner.code_points().try_for_each(|r| match r { + CodePoint::Unicode(c) => f.write_char(c), + CodePoint::UnpairedSurrogate(c) => { + write!(f, "\\u{c:04X}") + } + }) + } } } } @@ -92,7 +94,7 @@ impl fmt::Debug for JsStringDebugInfo<'_> { // Show kind specific fields from string. match self.inner.kind() { - JsStringKind::Latin1Sequence | JsStringKind::Utf16Sequence => { + JsStringKind::Latin1Sequence | JsStringKind::Utf16Sequence | JsStringKind::Rope => { if let Some(rc) = self.inner.refcount() { dbg.borrow_mut().field("refcount", &rc); } diff --git a/core/string/src/iter.rs b/core/string/src/iter.rs index d327b06187d..848f068de26 100644 --- a/core/string/src/iter.rs +++ b/core/string/src/iter.rs @@ -1,13 +1,22 @@ use std::iter::FusedIterator; +use crate::str::RopeSlice; use crate::{CodePoint, JsStr}; -use super::JsStrVariant; - #[derive(Debug, Clone)] enum IterInner<'a> { U8(std::iter::Copied>), U16(std::iter::Copied>), + Rope(Box>), +} + +#[derive(Debug, Clone)] +struct RopeIter<'a> { + front_stack: Vec>, + front_current: Option>, + back_stack: Vec>, + back_current: Option>, + len: usize, } /// Iterator over a [`JsStr`]. @@ -19,9 +28,17 @@ pub struct Iter<'a> { impl<'a> Iter<'a> { #[inline] pub(crate) fn new(s: JsStr<'a>) -> Self { - let inner = match s.variant() { - JsStrVariant::Latin1(s) => IterInner::U8(s.iter().copied()), - JsStrVariant::Utf16(s) => IterInner::U16(s.iter().copied()), + let len = s.len(); + let inner = match s { + JsStr::Latin1(s) => IterInner::U8(s.iter().copied()), + JsStr::Utf16(s) => IterInner::U16(s.iter().copied()), + JsStr::Rope(r) => IterInner::Rope(Box::new(RopeIter { + front_stack: vec![r], + front_current: None, + back_stack: Vec::new(), + back_current: None, + len, + })), }; Iter { inner } } @@ -33,8 +50,171 @@ impl Iterator for Iter<'_> { #[inline] fn next(&mut self) -> Option { match &mut self.inner { - IterInner::U8(iter) => iter.map(u16::from).next(), + IterInner::U8(iter) => iter.next().map(u16::from), IterInner::U16(iter) => iter.next(), + IterInner::Rope(rope) => rope.next(), + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + (self.len(), Some(self.len())) + } +} + +impl DoubleEndedIterator for Iter<'_> { + #[inline] + fn next_back(&mut self) -> Option { + match &mut self.inner { + IterInner::U8(iter) => iter.next_back().map(u16::from), + IterInner::U16(iter) => iter.next_back(), + IterInner::Rope(rope) => rope.next_back(), + } + } +} + +impl Iterator for RopeIter<'_> { + type Item = u16; + + fn next(&mut self) -> Option { + if self.len == 0 { + return None; + } + loop { + if let Some(ref mut iter) = self.front_current { + if let Some(cu) = iter.next() { + self.len -= 1; + return Some(cu); + } + self.front_current = None; + } + + if let Some(slice) = self.front_stack.pop() { + if slice.header.vtable.kind == crate::JsStringKind::Rope { + // SAFETY: The header is guaranteed to be a `RopeString` because the kind is `Rope`. + let r = unsafe { + &*std::ptr::from_ref(slice.header).cast::() + }; + let left_len = r.left.len(); + + if slice.start < left_len { + let left_end = std::cmp::min(slice.end, left_len); + if slice.end > left_len { + self.front_stack.push(RopeSlice { + // SAFETY: The child pointers in a `RopeString` are always valid `JsStringHeader` pointers. + header: unsafe { &*r.right.ptr.as_ptr().cast() }, + start: 0, + end: slice.end - left_len, + }); + } + self.front_stack.push(RopeSlice { + // SAFETY: The child pointers in a `RopeString` are always valid `JsStringHeader` pointers. + header: unsafe { &*r.left.ptr.as_ptr().cast() }, + start: slice.start, + end: left_end, + }); + } else { + self.front_stack.push(RopeSlice { + // SAFETY: The child pointers in a `RopeString` are always valid `JsStringHeader` pointers. + header: unsafe { &*r.right.ptr.as_ptr().cast() }, + start: slice.start - left_len, + end: slice.end - left_len, + }); + } + } else { + let s = (slice.header.vtable.as_str)(slice.header) + .get(slice.start..slice.end) + .expect("valid slice"); + self.front_current = Some(s.iter()); + } + continue; + } + + if let Some(cu) = self.back_current.as_mut().and_then(Iterator::next) { + self.len -= 1; + return Some(cu); + } + if let Some(slice) = self.back_stack.pop() { + self.front_stack.push(slice); + continue; + } + return None; + } + } +} + +impl DoubleEndedIterator for RopeIter<'_> { + fn next_back(&mut self) -> Option { + if self.len == 0 { + return None; + } + loop { + if let Some(ref mut iter) = self.back_current { + if let Some(cu) = iter.next_back() { + self.len -= 1; + return Some(cu); + } + self.back_current = None; + } + + if let Some(slice) = self.back_stack.pop() { + if slice.header.vtable.kind == crate::JsStringKind::Rope { + // SAFETY: The header is guaranteed to be a `RopeString` because the kind is `Rope`. + let r = unsafe { + &*std::ptr::from_ref(slice.header).cast::() + }; + let left_len = r.left.len(); + + if slice.end > left_len { + let right_start = std::cmp::max( + 0, + (slice.start.cast_signed() - left_len.cast_signed()).max(0), + ) + .cast_unsigned(); + if slice.start < left_len { + self.back_stack.push(RopeSlice { + // SAFETY: The child pointers in a `RopeString` are always valid `JsStringHeader` pointers. + header: unsafe { &*r.left.ptr.as_ptr().cast() }, + start: slice.start, + end: left_len, + }); + } + self.back_stack.push(RopeSlice { + // SAFETY: The child pointers in a `RopeString` are always valid `JsStringHeader` pointers. + header: unsafe { &*r.right.ptr.as_ptr().cast() }, + start: right_start, + end: slice.end - left_len, + }); + } else { + self.back_stack.push(RopeSlice { + // SAFETY: The child pointers in a `RopeString` are always valid `JsStringHeader` pointers. + header: unsafe { &*r.left.ptr.as_ptr().cast() }, + start: slice.start, + end: slice.end, + }); + } + } else { + let s = (slice.header.vtable.as_str)(slice.header) + .get(slice.start..slice.end) + .expect("valid slice"); + self.back_current = Some(s.iter()); + } + continue; + } + + if let Some(cu) = self + .front_current + .as_mut() + .and_then(DoubleEndedIterator::next_back) + { + self.len -= 1; + return Some(cu); + } + if let Some(slice) = self.front_stack.pop() { + self.back_stack.push(slice); + continue; + } + return None; } } } @@ -47,6 +227,7 @@ impl ExactSizeIterator for Iter<'_> { match &self.inner { IterInner::U8(v) => v.len(), IterInner::U16(v) => v.len(), + IterInner::Rope(rope) => rope.len, } } } @@ -55,6 +236,11 @@ impl ExactSizeIterator for Iter<'_> { enum WindowsInner<'a> { U8(std::slice::Windows<'a, u8>), U16(std::slice::Windows<'a, u16>), + Rope { + slice: RopeSlice<'a>, + size: usize, + index: usize, + }, } /// An iterator over overlapping subslices of length size. @@ -68,9 +254,14 @@ pub struct Windows<'a> { impl<'a> Windows<'a> { #[inline] pub(crate) fn new(string: JsStr<'a>, size: usize) -> Self { - let inner = match string.variant() { - JsStrVariant::Latin1(v) => WindowsInner::U8(v.windows(size)), - JsStrVariant::Utf16(v) => WindowsInner::U16(v.windows(size)), + let inner = match string { + JsStr::Latin1(v) => WindowsInner::U8(v.windows(size)), + JsStr::Utf16(v) => WindowsInner::U16(v.windows(size)), + JsStr::Rope(r) => WindowsInner::Rope { + slice: r, + size, + index: 0, + }, }; Self { inner } } @@ -84,6 +275,19 @@ impl<'a> Iterator for Windows<'a> { match &mut self.inner { WindowsInner::U8(iter) => iter.next().map(JsStr::latin1), WindowsInner::U16(iter) => iter.next().map(JsStr::utf16), + WindowsInner::Rope { slice, size, index } => { + if *index + *size <= slice.len() { + let res = JsStr::Rope(RopeSlice { + header: slice.header, + start: slice.start + *index, + end: slice.start + *index + *size, + }); + *index += 1; + Some(res) + } else { + None + } + } } } } @@ -96,6 +300,13 @@ impl ExactSizeIterator for Windows<'_> { match &self.inner { WindowsInner::U8(v) => v.len(), WindowsInner::U16(v) => v.len(), + WindowsInner::Rope { slice, size, index } => { + if *index + *size <= slice.len() { + slice.len() - *index - *size + 1 + } else { + 0 + } + } } } } @@ -104,6 +315,13 @@ impl ExactSizeIterator for Windows<'_> { enum CodePointsIterInner<'a> { Latin1(std::iter::Copied>), Utf16(std::char::DecodeUtf16>>), + Rope(Box>), +} + +#[derive(Debug, Clone)] +pub(crate) struct RopeCodePointsIter<'a> { + stack: Vec>, + current: Option<(Option, CodePointsIter<'a>)>, } #[derive(Debug, Clone)] @@ -114,14 +332,34 @@ pub struct CodePointsIter<'a> { impl<'a> CodePointsIter<'a> { #[inline] pub(crate) fn new(s: JsStr<'a>) -> Self { - let inner = match s.variant() { - JsStrVariant::Latin1(s) => CodePointsIterInner::Latin1(s.iter().copied()), - JsStrVariant::Utf16(s) => { - CodePointsIterInner::Utf16(char::decode_utf16(s.iter().copied())) - } + let inner = match s { + JsStr::Latin1(s) => CodePointsIterInner::Latin1(s.iter().copied()), + JsStr::Utf16(s) => CodePointsIterInner::Utf16(char::decode_utf16(s.iter().copied())), + JsStr::Rope(r) => CodePointsIterInner::Rope(Box::new(RopeCodePointsIter { + stack: vec![r], + current: None, + })), }; CodePointsIter { inner } } + + #[inline] + pub(super) fn rope(s: &crate::JsString) -> Self { + let header = + // SAFETY: The pointer in a `JsString` is always valid. + unsafe { s.ptr.as_ref() }; + CodePointsIter { + inner: CodePointsIterInner::Rope(Box::new(RopeCodePointsIter { + stack: vec![RopeSlice { + // SAFETY: The pointer in a `JsString` is always valid. + header: unsafe { &*s.ptr.as_ptr().cast() }, + start: 0, + end: header.len, + }], + current: None, + })), + } + } } impl Iterator for CodePointsIter<'_> { @@ -137,6 +375,66 @@ impl Iterator for CodePointsIter<'_> { Ok(c) => CodePoint::Unicode(c), Err(e) => CodePoint::UnpairedSurrogate(e.unpaired_surrogate()), }), + CodePointsIterInner::Rope(rope) => rope.next(), + } + } +} + +impl<'a> Iterator for RopeCodePointsIter<'a> { + type Item = CodePoint; + + fn next(&mut self) -> Option { + loop { + if let Some((_, ref mut iter)) = self.current { + if let Some(cp) = iter.next() { + return Some(cp); + } + self.current = None; + } + + let slice = self.stack.pop()?; + + if slice.header.vtable.kind == crate::JsStringKind::Rope { + // SAFETY: The header is guaranteed to be a `RopeString` because the kind is `Rope`. + let r = unsafe { + &*std::ptr::from_ref(slice.header).cast::() + }; + let left_len = r.left.len(); + + if slice.start < left_len { + let left_end = std::cmp::min(slice.end, left_len); + if slice.end > left_len { + self.stack.push(RopeSlice { + // SAFETY: The child pointers in a `RopeString` are always valid `JsStringHeader` pointers. + header: unsafe { &*r.right.ptr.as_ptr().cast() }, + start: 0, + end: slice.end - left_len, + }); + } + self.stack.push(RopeSlice { + // SAFETY: The child pointers in a `RopeString` are always valid `JsStringHeader` pointers. + header: unsafe { &*r.left.ptr.as_ptr().cast() }, + start: slice.start, + end: left_end, + }); + } else { + self.stack.push(RopeSlice { + // SAFETY: The child pointers in a `RopeString` are always valid `JsStringHeader` pointers. + header: unsafe { &*r.right.ptr.as_ptr().cast() }, + start: slice.start - left_len, + end: slice.end - left_len, + }); + } + } else { + let s = (slice.header.vtable.as_str)(slice.header) + .get(slice.start..slice.end) + .expect("valid slice"); + let iter = s.code_points(); + // SAFETY: The lifetime of the code points is tied to the rope string, which is guaranteed to live for 'a. + let iter = + unsafe { std::mem::transmute::, CodePointsIter<'a>>(iter) }; + self.current = Some((None, iter)); + } } } } diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index 507a1e4b921..e5f6bea49f0 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -19,7 +19,8 @@ mod display; mod iter; mod str; mod r#type; -mod vtable; +/// VTable-based string implementation details. +pub mod vtable; #[cfg(test)] mod tests; @@ -29,21 +30,22 @@ use crate::display::{JsStrDisplayEscaped, JsStrDisplayLossy, JsStringDebugInfo}; use crate::iter::CodePointsIter; use crate::r#type::{Latin1, Utf16}; pub use crate::vtable::StaticString; -use crate::vtable::{SequenceString, SliceString}; +pub(crate) use crate::vtable::{JsStringHeader, RopeString, SequenceString, SliceString}; #[doc(inline)] pub use crate::{ builder::{CommonJsStringBuilder, Latin1JsStringBuilder, Utf16JsStringBuilder}, code_point::CodePoint, common::StaticJsStrings, iter::Iter, - str::{JsStr, JsStrVariant}, + str::JsStr, }; -use std::marker::PhantomData; +use std::ptr::NonNull; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::{borrow::Cow, mem::ManuallyDrop}; use std::{ convert::Infallible, hash::{Hash, Hasher}, - ptr::{self, NonNull}, + ptr::{self}, str::FromStr, }; use vtable::JsStringVTable; @@ -91,18 +93,11 @@ pub(crate) const fn is_trimmable_whitespace_latin1(c: u8) -> bool { ) } -/// Opaque type of a raw string pointer. -#[allow(missing_copy_implementations, missing_debug_implementations)] -pub struct RawJsString { - // Make this non-send, non-sync, invariant and unconstructable. - phantom_data: PhantomData<*mut ()>, -} - /// Strings can be represented internally by multiple kinds. This is used to identify /// the storage kind of string. #[derive(Debug, Clone, Copy, Eq, PartialEq)] #[repr(u8)] -pub(crate) enum JsStringKind { +pub enum JsStringKind { /// A sequential memory slice of Latin1 bytes. See [`SequenceString`]. Latin1Sequence = 0, @@ -114,6 +109,9 @@ pub(crate) enum JsStringKind { /// A static string that is valid for `'static` lifetime. Static = 3, + + /// A rope string that is a tree of other strings. See [`RopeString`]. + Rope = 4, } /// A Latin1 or UTF-16–encoded, reference counted, immutable string. @@ -136,9 +134,8 @@ pub(crate) enum JsStringKind { /// type to allow for better optimization (and simpler code). #[allow(clippy::module_name_repetitions)] pub struct JsString { - /// Pointer to the string data. Always points to a struct whose first field is - /// `JsStringVTable`. - ptr: NonNull, + /// Pointer to the string data. Always points to a `JsStringHeader` header. + pub(crate) ptr: NonNull, } // `JsString` should always be thin-pointer sized. @@ -207,9 +204,7 @@ impl JsString { /// errors. #[inline] #[allow(clippy::missing_panics_doc)] - pub fn to_std_string_with_surrogates( - &self, - ) -> impl Iterator> + use<'_> { + pub fn to_std_string_with_surrogates(&self) -> impl Iterator> + '_ { let mut iter = self.code_points().peekable(); std::iter::from_fn(move || { @@ -261,14 +256,38 @@ impl JsString { #[inline] #[must_use] pub fn code_points(&self) -> CodePointsIter<'_> { - (self.vtable().code_points)(self.ptr) + // SAFETY: The pointer `self.ptr` is always valid and points to a `JsStringHeader` header. + let header = unsafe { self.ptr.as_ref() }; + (header.vtable.code_points)(header) } /// Get the variant of this string. #[inline] #[must_use] - pub fn variant(&self) -> JsStrVariant<'_> { - self.as_str().variant() + pub fn variant(&self) -> JsStr<'_> { + // SAFETY: The pointer `self.ptr` is always valid and points to a `JsStringHeader` header. + let header = unsafe { self.ptr.as_ref() }; + match header.vtable.kind { + JsStringKind::Latin1Sequence => { + // SAFETY: `header.vtable.kind` is `Latin1Sequence`, so `self.ptr` can be safely cast to `SequenceString`. + let seq: &SequenceString = unsafe { self.ptr.cast().as_ref() }; + // SAFETY: `seq.data()` is a valid pointer to the Latin1 data, and `header.len` is the correct length. + unsafe { + let slice = std::slice::from_raw_parts(seq.data(), header.len); + JsStr::Latin1(slice) + } + } + JsStringKind::Utf16Sequence => { + // SAFETY: `header.vtable.kind` is `Utf16Sequence`, so `self.ptr` can be safely cast to `SequenceString`. + let seq: &SequenceString = unsafe { self.ptr.cast().as_ref() }; + // SAFETY: `seq.data()` is a valid pointer to the UTF-16 data, and `header.len` is the correct length. + let slice = unsafe { std::slice::from_raw_parts(seq.data().cast(), header.len) }; + JsStr::Utf16(slice) + } + // For Static, Slice, and Rope, the `as_str()` method handles the variant conversion. + // This avoids redundant logic and ensures consistency. + JsStringKind::Static | JsStringKind::Slice | JsStringKind::Rope => self.as_str(), + } } /// Abstract operation `StringIndexOf ( string, searchValue, fromIndex )` @@ -324,7 +343,8 @@ impl JsString { #[inline] #[must_use] pub fn len(&self) -> usize { - self.vtable().len + // SAFETY: The pointer `self.ptr` is always valid and points to a `JsStringHeader` header. + unsafe { self.ptr.as_ref().len } } /// Return true if the [`JsString`] is empty. @@ -354,7 +374,7 @@ impl JsString { pub fn trim(&self) -> JsString { // Calculate both bounds directly to avoid intermediate allocations. let (start, end) = match self.variant() { - JsStrVariant::Latin1(v) => { + JsStr::Latin1(v) => { let Some(start) = v.iter().position(|c| !is_trimmable_whitespace_latin1(*c)) else { return StaticJsStrings::EMPTY_STRING; }; @@ -364,7 +384,7 @@ impl JsString { .unwrap_or(start); (start, end) } - JsStrVariant::Utf16(v) => { + JsStr::Utf16(v) => { let Some(start) = v.iter().copied().position(|r| { !char::from_u32(u32::from(r)).is_some_and(is_trimmable_whitespace) }) else { @@ -379,9 +399,24 @@ impl JsString { .unwrap_or(start); (start, end) } + JsStr::Rope(_) => { + let Some(start) = self.iter().position(|r| { + !char::from_u32(u32::from(r)).is_some_and(is_trimmable_whitespace) + }) else { + return StaticJsStrings::EMPTY_STRING; + }; + let end = self + .iter() + .rposition(|r| { + !char::from_u32(u32::from(r)).is_some_and(is_trimmable_whitespace) + }) + .unwrap_or(start); + (start, end) + } }; - // SAFETY: `position(...)` and `rposition(...)` cannot exceed the length of the string. + // SAFETY: `start` and `end` are calculated from valid indices within the string, + // ensuring `start <= end` and `end <= self.len()`. unsafe { Self::slice_unchecked(self, start, end + 1) } } @@ -390,16 +425,19 @@ impl JsString { #[must_use] pub fn trim_start(&self) -> JsString { let Some(start) = (match self.variant() { - JsStrVariant::Latin1(v) => v.iter().position(|c| !is_trimmable_whitespace_latin1(*c)), - JsStrVariant::Utf16(v) => v + JsStr::Latin1(v) => v.iter().position(|c| !is_trimmable_whitespace_latin1(*c)), + JsStr::Utf16(v) => v .iter() .copied() .position(|r| !char::from_u32(u32::from(r)).is_some_and(is_trimmable_whitespace)), + JsStr::Rope(_) => self + .iter() + .position(|r| !char::from_u32(u32::from(r)).is_some_and(is_trimmable_whitespace)), }) else { return StaticJsStrings::EMPTY_STRING; }; - // SAFETY: `position(...)` cannot exceed the length of the string. + // SAFETY: `start` is a valid index within the string, ensuring `start <= self.len()`. unsafe { Self::slice_unchecked(self, start, self.len()) } } @@ -408,17 +446,19 @@ impl JsString { #[must_use] pub fn trim_end(&self) -> JsString { let Some(end) = (match self.variant() { - JsStrVariant::Latin1(v) => v.iter().rposition(|c| !is_trimmable_whitespace_latin1(*c)), - JsStrVariant::Utf16(v) => v + JsStr::Latin1(v) => v.iter().rposition(|c| !is_trimmable_whitespace_latin1(*c)), + JsStr::Utf16(v) => v .iter() .copied() .rposition(|r| !char::from_u32(u32::from(r)).is_some_and(is_trimmable_whitespace)), + JsStr::Rope(_) => self + .iter() + .rposition(|r| !char::from_u32(u32::from(r)).is_some_and(is_trimmable_whitespace)), }) else { return StaticJsStrings::EMPTY_STRING; }; - // SAFETY: `rposition(...)` cannot exceed the length of the string. `end` is the first - // character that is not trimmable, therefore we need to add 1 to it. + // SAFETY: `end` is a valid index within the string, ensuring `end + 1 <= self.len()`. unsafe { Self::slice_unchecked(self, 0, end + 1) } } @@ -437,15 +477,7 @@ impl JsString { // We check the size, so this should never panic. #[allow(clippy::missing_panics_doc)] pub fn ends_with(&self, needle: JsStr<'_>) -> bool { - self.as_str().starts_with(needle) - } - - /// Get the `u16` code unit at index. This does not parse any characters if there - /// are pairs, it is simply the index of the `u16` elements. - #[inline] - #[must_use] - pub fn code_unit_at(&self, index: usize) -> Option { - self.as_str().get(index) + self.as_str().ends_with(needle) } /// Get the element at the given index, or [`None`] if the index is out of range. @@ -499,10 +531,9 @@ impl JsString { /// /// To avoid a memory leak the pointer must be converted back to a `JsString` using /// [`JsString::from_raw`]. - #[inline] #[must_use] - pub fn into_raw(self) -> NonNull { - ManuallyDrop::new(self).ptr.cast() + pub fn into_raw(self) -> NonNull { + ManuallyDrop::new(self).ptr } /// Constructs a `JsString` from the internal pointer. @@ -516,19 +547,7 @@ impl JsString { /// even if the returned `JsString` is never accessed. #[inline] #[must_use] - pub const unsafe fn from_raw(ptr: NonNull) -> Self { - Self { ptr: ptr.cast() } - } - - /// Constructs a `JsString` from a reference to a `VTable`. - /// - /// # Safety - /// - /// This function is unsafe because improper use may lead to memory unsafety, - /// even if the returned `JsString` is never accessed. - #[inline] - #[must_use] - pub(crate) const unsafe fn from_ptr(ptr: NonNull) -> Self { + pub const unsafe fn from_raw(ptr: NonNull) -> Self { Self { ptr } } } @@ -542,16 +561,16 @@ impl JsString { #[inline] #[must_use] pub fn is_static(&self) -> bool { - // Check the vtable kind tag - self.vtable().kind == JsStringKind::Static + self.kind() == JsStringKind::Static } /// Get the vtable for this string. #[inline] #[must_use] + #[allow(dead_code)] const fn vtable(&self) -> &JsStringVTable { - // SAFETY: All JsString variants have vtable as the first field (embedded directly). - unsafe { self.ptr.as_ref() } + // SAFETY: The pointer `self.ptr` is always valid and points to a `JsStringHeader` header. + unsafe { self.ptr.as_ref().vtable } } /// Create a [`JsString`] from a [`StaticString`] instance. This is assumed that the @@ -560,8 +579,12 @@ impl JsString { #[inline] #[must_use] pub const fn from_static(str: &'static StaticString) -> Self { + // SAFETY: `str` is a reference to a `StaticString`, which is guaranteed to have a valid `header` field. + // The address of `str.header` is valid and non-null, and casting to `*mut JsStringHeader` is safe + // because `JsStringHeader` is the common header for all string types. Self { - ptr: NonNull::from_ref(str).cast(), + // SAFETY: The address of `str.header` is guaranteed to be non-null. + ptr: unsafe { NonNull::new_unchecked((&raw const str.header).cast_mut()) }, } } @@ -576,7 +599,8 @@ impl JsString { #[inline] #[must_use] pub unsafe fn slice_unchecked(data: &JsString, start: usize, end: usize) -> Self { - // Safety: invariant stated by this whole function. + // Safety: The caller guarantees `start <= end` and `end <= data.len()`. + // `SliceString::new` creates a valid slice, and `Box::leak` correctly manages memory. let slice = Box::new(unsafe { SliceString::new(data, start, end) }); Self { @@ -584,6 +608,30 @@ impl JsString { } } + /// Casts the string to an inner type. + /// + /// # Safety + /// The caller must ensure that the string is of the correct kind. + #[inline] + #[must_use] + pub(crate) unsafe fn as_inner(&self) -> &T { + // SAFETY: The caller must ensure that the string is of the correct kind, + // allowing the `self.ptr` to be safely cast to `NonNull` and then dereferenced. + unsafe { self.ptr.cast().as_ref() } + } + + /// Get the depth of the rope (0 if not a rope). + #[inline] + #[must_use] + pub fn depth(&self) -> u8 { + if self.kind() == JsStringKind::Rope { + // SAFETY: `self.kind()` is checked to be `Rope`, so `self.as_inner::()` is safe. + unsafe { self.as_inner::().depth() } + } else { + 0 + } + } + /// Create a [`JsString`] from an existing `JsString` and start, end /// range. Returns None if the start/end is invalid. #[inline] @@ -595,45 +643,217 @@ impl JsString { if p1 >= p2 { StaticJsStrings::EMPTY_STRING } else { - // SAFETY: We just checked the conditions. + // SAFETY: The conditions `p1 <= p2` and `p2 <= self.len()` are ensured by the `if` blocks above. unsafe { Self::slice_unchecked(self, p1, p2) } } } + /// Returns the string as a contiguous [`JsStr`]. + /// + /// If the string is a rope, this flattens it into a contiguous buffer and returns a view + /// into that buffer. Subsequent calls will return the cached buffer instantly. + #[inline] + #[must_use] + pub fn as_flat_str(&self) -> JsStr<'_> { + let view = self.as_str(); + if let JsStr::Rope(rope) = view { + let flat = vtable::rope::flatten_rope(rope.header); + flat.as_str() + } else { + view + } + } + /// Get the kind of this string (for debugging/introspection). #[inline] #[must_use] pub(crate) fn kind(&self) -> JsStringKind { - self.vtable().kind + // SAFETY: The pointer `self.ptr` is always valid and points to a `JsStringHeader` header. + unsafe { self.ptr.as_ref().vtable.kind } } - /// Get the inner pointer as a reference of type T. - /// - /// # Safety - /// This should only be used when the inner type has been validated via `kind()`. - /// Using an unvalidated inner type is undefined behaviour. + /// Returns the string as a [`JsStr`]. #[inline] - pub(crate) unsafe fn as_inner(&self) -> &T { - // SAFETY: Caller must ensure the type matches. - unsafe { self.ptr.cast::().as_ref() } + #[must_use] + pub fn as_str(&self) -> JsStr<'_> { + // SAFETY: The pointer `self.ptr` is always valid and points to a `JsStringHeader` header. + let header = unsafe { self.ptr.as_ref() }; + // FAST PATH: Devirtualize common kinds. + match header.vtable.kind { + JsStringKind::Latin1Sequence => { + // SAFETY: `header.vtable.kind` is `Latin1Sequence`, so `self.ptr` can be safely cast to `SequenceString`. + let seq: &SequenceString = unsafe { self.ptr.cast().as_ref() }; + // SAFETY: `seq.data()` returns a valid pointer to the Latin1 data, and `header.len` is the correct length. + // The lifetime is bound to `self` via the function signature. + unsafe { + let slice = std::slice::from_raw_parts(seq.data(), header.len); + JsStr::latin1(slice) + } + } + JsStringKind::Utf16Sequence => { + // SAFETY: `header.vtable.kind` is `Utf16Sequence`, so `self.ptr` can be safely cast to `SequenceString`. + let seq: &SequenceString = unsafe { self.ptr.cast().as_ref() }; + // SAFETY: `seq.data()` returns a valid pointer to the UTF-16 data, and `header.len` is the correct length. + // The lifetime is bound to `self` via the function signature. + unsafe { + let slice = std::slice::from_raw_parts(seq.data().cast(), header.len); + JsStr::utf16(slice) + } + } + JsStringKind::Static => { + // SAFETY: `header.vtable.kind` is `Static`, so `self.ptr` can be safely cast to `StaticString`. + let s: &StaticString = unsafe { self.ptr.cast().as_ref() }; + s.str + } + JsStringKind::Slice => { + // SAFETY: `header.vtable.kind` is `Slice`, so `self.ptr` can be safely cast to `SliceString`. + let s: &SliceString = unsafe { self.ptr.cast().as_ref() }; + s.inner + } + JsStringKind::Rope => { + // SAFETY: The `vtable.as_str` function is guaranteed to be a valid function pointer + // for the specific string type, and `header` is a valid reference to the header. + (header.vtable.as_str)(header) + } + } + } + + /// Returns the code unit at `index`. + #[inline] + #[must_use] + pub fn code_unit_at(&self, index: usize) -> Option { + // SAFETY: The pointer `self.ptr` is always valid and points to a `JsStringHeader` header. + let header = unsafe { self.ptr.as_ref() }; + if index >= header.len { + return None; + } + // FAST PATH: Devirtualize common kinds. + match header.vtable.kind { + JsStringKind::Latin1Sequence => { + // SAFETY: `header.vtable.kind` is `Latin1Sequence`, so `self.ptr` can be safely cast to `SequenceString`. + let seq: &SequenceString = unsafe { self.ptr.cast().as_ref() }; + // SAFETY: `seq.data()` returns a valid pointer to the Latin1 data. + // `index` is checked to be within `header.len`, so `add(index)` is in bounds. + Some(u16::from(unsafe { *seq.data().add(index) })) + } + JsStringKind::Utf16Sequence => { + // SAFETY: `header.vtable.kind` is `Utf16Sequence`, so `self.ptr` can be safely cast to `SequenceString`. + let seq: &SequenceString = unsafe { self.ptr.cast().as_ref() }; + // SAFETY: `seq.data()` returns a valid pointer to the UTF-16 data. + // `index` is checked to be within `header.len`, so `add(index)` is in bounds. + // The pointer is aligned because `JsStringHeader` has an alignment of 8 and a size that is a multiple of 2. + #[allow(clippy::cast_ptr_alignment)] + unsafe { + Some(*seq.data().cast::().add(index)) + } + } + JsStringKind::Static | JsStringKind::Slice => { + // All these have a direct JsStr representation. + self.as_str().get(index) + } + JsStringKind::Rope => (header.vtable.code_unit_at)(header, index), + } + } + + /// Returns a reference to the reference count cell, if it exists. + #[inline] + fn refcount_cell(&self) -> Option<&AtomicUsize> { + // SAFETY: The pointer is always valid. + // We use raw pointer projection to avoid creating a shared reference to the header, + // which would mark the memory as read-only (Frozen) in Miri's borrow model. + unsafe { + let header_ptr = self.ptr.as_ptr(); + let kind = (*header_ptr).vtable.kind; + if kind == JsStringKind::Static { + None + } else { + // Deriving the pointer to refcount directly from the raw pointer ensures + // it's not restricted by a temporary shared reference to the whole struct. + let refcount_ptr = &raw const (*header_ptr).refcount; + Some(&*refcount_ptr.cast::()) + } + } } } impl JsString { - /// Obtains the underlying [`&[u16]`][slice] slice of a [`JsString`] + /// Creates a new [`JsString`] from the concatenation of `x` and `y`. #[inline] #[must_use] - pub fn as_str(&self) -> JsStr<'_> { - (self.vtable().as_str)(self.ptr) + pub fn concat(x: &Self, y: &Self) -> Self { + Self::concat_array_strings(&[x.clone(), y.clone()]) } - /// Creates a new [`JsString`] from the concatenation of `x` and `y`. + /// Creates a new [`JsString`] from the concatenation of two slices `x` and `y`. #[inline] #[must_use] - pub fn concat(x: JsStr<'_>, y: JsStr<'_>) -> Self { + pub fn concat_slices(x: JsStr<'_>, y: JsStr<'_>) -> Self { Self::concat_array(&[x, y]) } + /// Creates a new [`JsString`] from the concatenation of every element of + /// `strings`. + /// + /// This will use a rope representation if the concatenation is large enough to + /// warrant it. + #[inline] + #[must_use] + pub fn concat_array_strings(strings: &[Self]) -> Self { + if strings.is_empty() { + return StaticJsStrings::EMPTY_STRING; + } + if strings.len() == 1 { + return strings[0].clone(); + } + + let full_count: usize = strings.iter().map(Self::len).sum(); + + // Hybrid Strategy: Use ropes for large concatenations. + if full_count > 1024 { + if strings.len() == 2 { + return RopeString::create(strings[0].clone(), strings[1].clone()); + } + return Self::concat_strings_balanced(strings); + } + + let slices: Vec<_> = strings.iter().map(Self::as_str).collect(); + Self::concat_array(&slices) + } + + /// Internal helper to build a balanced rope tree from a slice of strings. + pub(crate) fn concat_strings_balanced(strings: &[Self]) -> Self { + match strings.len() { + 0 => StaticJsStrings::EMPTY_STRING, + 1 => strings[0].clone(), + 2 => RopeString::create(strings[0].clone(), strings[1].clone()), + _ => { + // To build a truly balanced tree, we first collect all leaves if the input + // contains ropes. This prevents the "unbalanced ropes in array" problem. + let mut leaves = Vec::with_capacity(strings.len() * 2); + for s in strings { + RopeString::collect_leaves(s, &mut leaves); + } + + Self::concat_leaves_balanced(&leaves) + } + } + } + + /// Recursively builds a balanced rope tree from a flat list of leaves. + pub(crate) fn concat_leaves_balanced(leaves: &[Self]) -> Self { + match leaves.len() { + 0 => StaticJsStrings::EMPTY_STRING, + 1 => leaves[0].clone(), + 2 => RopeString::create(leaves[0].clone(), leaves[1].clone()), + _ => { + let mid = leaves.len() / 2; + let left = Self::concat_leaves_balanced(&leaves[..mid]); + let right = Self::concat_leaves_balanced(&leaves[mid..]); + RopeString::create(left, right) + } + } + } + /// Creates a new [`JsString`] from the concatenation of every element of /// `strings`. #[inline] @@ -670,41 +890,46 @@ impl JsString { // The sum of all `count` for each `string` equals `full_count`, and since we're // iteratively writing each of them to `data`, `copy_non_overlapping` always stays // in-bounds for `count` reads of each string and `full_count` writes to `data`. - // // Each `string` must be properly aligned to be a valid slice, and `data` must be // properly aligned by `allocate_seq`. - // - // `allocate_seq` must return a valid pointer to newly allocated memory, meaning - // `ptr` and all `string`s should never overlap. unsafe { // NOTE: The alignment is checked when we allocate the array. #[allow(clippy::cast_ptr_alignment)] - match (latin1_encoding, string.variant()) { - (true, JsStrVariant::Latin1(s)) => { + match (latin1_encoding, string) { + (true, JsStr::Latin1(s)) => { let count = s.len(); ptr::copy_nonoverlapping(s.as_ptr(), data.cast::(), count); data = data.cast::().add(count).cast::(); } - (false, JsStrVariant::Latin1(s)) => { + (false, JsStr::Latin1(s)) => { let count = s.len(); for (i, byte) in s.iter().enumerate() { *data.cast::().add(i) = u16::from(*byte); } data = data.cast::().add(count).cast::(); } - (false, JsStrVariant::Utf16(s)) => { + (false, JsStr::Utf16(s)) => { let count = s.len(); ptr::copy_nonoverlapping(s.as_ptr(), data.cast::(), count); data = data.cast::().add(count).cast::(); } - (true, JsStrVariant::Utf16(_)) => { + (false, JsStr::Rope(r)) => { + let count = r.len(); + for (i, cu) in string.iter().enumerate() { + *data.cast::().add(i) = cu; + } + data = data.cast::().add(count).cast::(); + } + (true, JsStr::Utf16(_) | JsStr::Rope(_)) => { unreachable!("Already checked that it's latin1 encoding") } } } } - Self { ptr: ptr.cast() } + Self { + ptr: ptr.cast::(), + } }; StaticJsStrings::get_string(&string.as_str()).unwrap_or(string) @@ -714,31 +939,39 @@ impl JsString { fn from_slice_skip_interning(string: JsStr<'_>) -> Self { let count = string.len(); - // SAFETY: - // - We read `count = data.len()` elements from `data`, which is within the bounds of the slice. - // - `allocate_*_seq` must allocate at least `count` elements, which allows us to safely - // write at least `count` elements. - // - `allocate_*_seq` should already take care of the alignment of `ptr`, and `data` must be - // aligned to be a valid slice. - // - `allocate_*_seq` must return a valid pointer to newly allocated memory, meaning `ptr` - // and `data` should never overlap. + // SAFETY: `allocate_*_seq` must return a valid pointer to newly allocated memory. unsafe { // NOTE: The alignment is checked when we allocate the array. #[allow(clippy::cast_ptr_alignment)] - match string.variant() { - JsStrVariant::Latin1(s) => { + match string { + JsStr::Latin1(s) => { let ptr = SequenceString::::allocate(count); let data = (&raw mut (*ptr.as_ptr()).data) .cast::<::Byte>(); ptr::copy_nonoverlapping(s.as_ptr(), data, count); - Self { ptr: ptr.cast() } + Self { + ptr: ptr.cast::(), + } } - JsStrVariant::Utf16(s) => { + JsStr::Utf16(s) => { let ptr = SequenceString::::allocate(count); let data = (&raw mut (*ptr.as_ptr()).data) .cast::<::Byte>(); ptr::copy_nonoverlapping(s.as_ptr(), data, count); - Self { ptr: ptr.cast() } + Self { + ptr: ptr.cast::(), + } + } + JsStr::Rope(_) => { + let ptr = SequenceString::::allocate(count); + let data = (&raw mut (*ptr.as_ptr()).data) + .cast::<::Byte>(); + for (i, cu) in string.iter().enumerate() { + ptr::write(data.add(i), cu); + } + Self { + ptr: ptr.cast::(), + } } } } @@ -756,14 +989,17 @@ impl JsString { #[inline] #[must_use] pub fn refcount(&self) -> Option { - (self.vtable().refcount)(self.ptr) + self.refcount_cell().map(|rc| rc.load(Ordering::Relaxed)) } } impl Clone for JsString { #[inline] fn clone(&self) -> Self { - (self.vtable().clone)(self.ptr) + if let Some(refcount) = self.refcount_cell() { + refcount.fetch_add(1, Ordering::Relaxed); + } + Self { ptr: self.ptr } } } @@ -777,7 +1013,18 @@ impl Default for JsString { impl Drop for JsString { #[inline] fn drop(&mut self) { - (self.vtable().drop)(self.ptr); + if self + .refcount_cell() + .is_some_and(|rc| rc.fetch_sub(1, Ordering::AcqRel) > 1) + { + return; + } + // SAFETY: The pointer is always valid and this is the last reference. + // We call the deallocator from the vtable to correctly clean up the specific string kind. + unsafe { + let header = self.ptr.as_ref(); + (header.vtable.dealloc)(self.ptr); + } } } @@ -849,17 +1096,24 @@ impl From> for JsString { } } +impl From<&JsString> for JsString { + #[inline] + fn from(value: &JsString) -> Self { + value.clone() + } +} + impl From<&[JsString]> for JsString { #[inline] fn from(value: &[JsString]) -> Self { - Self::concat_array(&value.iter().map(Self::as_str).collect::>()[..]) + Self::concat_array_strings(value) } } impl From<&[JsString; N]> for JsString { #[inline] fn from(value: &[JsString; N]) -> Self { - Self::concat_array(&value.iter().map(Self::as_str).collect::>()[..]) + Self::concat_array_strings(value) } } @@ -890,7 +1144,25 @@ impl From<&[u16; N]> for JsString { impl Hash for JsString { #[inline] fn hash(&self, state: &mut H) { - self.as_str().hash(state); + // SAFETY: The pointer is always valid. + // We use raw pointer projection to avoid creating a read-only shared reference to the header + // that covers the hash field, as we intend to perform interior mutation (atomic store) on it. + unsafe { + let header_ptr = self.ptr.as_ptr(); + let hash_ptr = (&raw const (*header_ptr).hash).cast::(); + let mut hash = (*hash_ptr).load(Ordering::Relaxed); + if hash == 0 { + hash = self.as_str().content_hash(); + if hash == 0 { + hash = 1; + } + if (*header_ptr).vtable.kind != JsStringKind::Static { + // SAFETY: Not a static string. Alignment and size match. + (*hash_ptr).store(hash, Ordering::Relaxed); + } + } + state.write_u64(hash); + } } } @@ -911,6 +1183,13 @@ impl Ord for JsString { impl PartialEq for JsString { #[inline] fn eq(&self, other: &Self) -> bool { + if self.ptr == other.ptr { + return true; + } + if self.len() != other.len() { + return false; + } + self.as_str() == other.as_str() } } @@ -1030,7 +1309,7 @@ macro_rules! impl_js_string_slice_index { None } else { // SAFETY: we just checked the indices. - Some(unsafe { JsString::slice_unchecked(str, start, end) }) + unsafe { Some(JsString::slice_unchecked(str, start, end)) } } } } diff --git a/core/string/src/str.rs b/core/string/src/str.rs index 59e98d9d57d..1f0589083f6 100644 --- a/core/string/src/str.rs +++ b/core/string/src/str.rs @@ -5,33 +5,37 @@ use std::{ slice::SliceIndex, }; -/// Inner representation of a [`JsStr`]. +/// A view into a rope string. #[derive(Debug, Clone, Copy)] -pub enum JsStrVariant<'a> { - /// Latin1 string representation. - Latin1(&'a [u8]), - - /// U16 string representation. - Utf16(&'a [u16]), +pub struct RopeSlice<'a> { + pub header: &'a crate::vtable::JsStringHeader, + pub(crate) start: usize, + pub(crate) end: usize, } -impl JsStrVariant<'_> { - pub(crate) const fn len(&self) -> usize { - match self { - JsStrVariant::Latin1(data) => data.len(), - JsStrVariant::Utf16(data) => data.len(), - } +impl RopeSlice<'_> { + /// Get the length of the slice. + #[inline] + #[must_use] + pub const fn len(&self) -> usize { + self.end - self.start } } /// This is equivalent to Rust's `&str`. #[derive(Clone, Copy)] -#[repr(align(8))] -pub struct JsStr<'a> { - inner: JsStrVariant<'a>, +pub enum JsStr<'a> { + /// Latin1 string representation. + Latin1(&'a [u8]), + + /// U16 string representation. + Utf16(&'a [u16]), + + /// A view into a rope string. + Rope(RopeSlice<'a>), } -// SAFETY: Inner<'_> has only immutable references to Sync types (u8/u16), so this is safe. +// SAFETY: JsStr<'_> has only immutable references to Sync types, so this is safe. unsafe impl Sync for JsStr<'_> {} // SAFETY: It's read-only, sending this reference to another thread doesn't @@ -40,54 +44,61 @@ unsafe impl Send for JsStr<'_> {} impl<'a> JsStr<'a> { /// This represents an empty string. - pub const EMPTY: Self = Self::latin1("".as_bytes()); + pub const EMPTY: Self = Self::Latin1("".as_bytes()); /// Creates a [`JsStr`] from codepoints that can fit in a `u8`. #[inline] #[must_use] pub const fn latin1(value: &'a [u8]) -> Self { - Self { - inner: JsStrVariant::Latin1(value), - } + Self::Latin1(value) } /// Creates a [`JsStr`] from utf16 encoded string. #[inline] #[must_use] pub const fn utf16(value: &'a [u16]) -> Self { - Self { - inner: JsStrVariant::Utf16(value), - } + Self::Utf16(value) } - /// Get the length of the [`JsStr`]. + /// Creates a [`JsStr`] from a rope slice. #[inline] #[must_use] - pub const fn len(&self) -> usize { - self.inner.len() + pub const fn rope(slice: RopeSlice<'a>) -> Self { + Self::Rope(slice) } - /// Return the inner [`JsStrVariant`] variant of the [`JsStr`]. + /// Return the inner variant of the [`JsStr`]. #[inline] #[must_use] - pub const fn variant(self) -> JsStrVariant<'a> { - self.inner + pub const fn variant(self) -> Self { + self + } + + /// Get the length of the [`JsStr`]. + #[inline] + #[must_use] + pub const fn len(&self) -> usize { + match self { + Self::Latin1(data) => data.len(), + Self::Utf16(data) => data.len(), + Self::Rope(rope) => rope.len(), + } } /// Check if the [`JsStr`] is latin1 encoded. #[inline] #[must_use] pub const fn is_latin1(&self) -> bool { - matches!(self.inner, JsStrVariant::Latin1(_)) + matches!(self, Self::Latin1(_)) } /// Returns [`u8`] slice if the [`JsStr`] is latin1 encoded, otherwise [`None`]. #[inline] #[must_use] pub const fn as_latin1(&self) -> Option<&[u8]> { - match &self.inner { - JsStrVariant::Latin1(v) => Some(v), - JsStrVariant::Utf16(_) => None, + match self { + Self::Latin1(v) => Some(v), + _ => None, } } @@ -99,21 +110,29 @@ impl<'a> JsStr<'a> { #[inline] #[must_use] pub unsafe fn as_static(self) -> JsStr<'static> { - let inner: JsStrVariant<'static> = match self.inner { - JsStrVariant::Latin1(v) => { + match self { + Self::Latin1(v) => { // SAFETY: Caller is responsible for ensuring the lifetime of this slice. let static_v: &'static [u8] = unsafe { std::slice::from_raw_parts(v.as_ptr(), v.len()) }; - JsStrVariant::<'static>::Latin1(static_v) + JsStr::<'static>::Latin1(static_v) } - JsStrVariant::Utf16(v) => { + Self::Utf16(v) => { // SAFETY: Caller is responsible for ensuring the lifetime of this slice. let static_v: &'static [u16] = unsafe { std::slice::from_raw_parts(v.as_ptr(), v.len()) }; - JsStrVariant::<'static>::Utf16(static_v) + JsStr::<'static>::Utf16(static_v) } - }; - JsStr::<'static> { inner } + Self::Rope(r) => { + JsStr::<'static>::Rope(RopeSlice { + // SAFETY: The `JsStr` is a view into a string that is guaranteed to stay alive for 'static + // if the caller ensures the lifetime requirements. + header: unsafe { &*std::ptr::from_ref(r.header) }, + start: r.start, + end: r.end, + }) + } + } } /// Iterate over the codepoints of the string. @@ -172,7 +191,7 @@ impl<'a> JsStr<'a> { where I: JsSliceIndex<'a>, { - // Safety: Caller must ensure the index is not out of bounds + // SAFETY: Caller must ensure the index is not out of bounds unsafe { JsSliceIndex::get_unchecked(self, index) } } @@ -180,9 +199,10 @@ impl<'a> JsStr<'a> { #[inline] #[must_use] pub fn to_vec(&self) -> Vec { - match self.variant() { - JsStrVariant::Latin1(v) => v.iter().copied().map(u16::from).collect(), - JsStrVariant::Utf16(v) => v.to_vec(), + match self { + Self::Latin1(v) => v.iter().copied().map(u16::from).collect(), + Self::Utf16(v) => v.to_vec(), + Self::Rope(_) => self.iter().collect(), } } @@ -270,8 +290,8 @@ impl<'a> JsStr<'a> { // position >= 0 ensured by position: usize assert!(position < size); - match self.variant() { - JsStrVariant::Latin1(v) => { + match self { + Self::Latin1(v) => { let code_point = v.get(position).expect("Already checked the size"); CodePoint::Unicode(*code_point as char) } @@ -285,7 +305,7 @@ impl<'a> JsStr<'a> { // 8. If second is not a trailing surrogate, then // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }. // 9. Set cp to ! UTF16SurrogatePairToCodePoint(first, second). - JsStrVariant::Utf16(v) => { + Self::Utf16(v) => { // We can skip the checks and instead use the `char::decode_utf16` function to take care of that for us. let code_point = v .get(position..=position + 1) @@ -299,6 +319,10 @@ impl<'a> JsStr<'a> { Err(e) => CodePoint::UnpairedSurrogate(e.unpaired_surrogate()), } } + Self::Rope(_) => self + .code_points() + .nth(position) + .expect("Already checked the size"), } } @@ -376,9 +400,10 @@ impl<'a> JsStr<'a> { #[inline] #[must_use] pub fn contains(&self, element: u8) -> bool { - match self.variant() { - JsStrVariant::Latin1(v) => v.contains(&element), - JsStrVariant::Utf16(v) => v.contains(&u16::from(element)), + match self { + Self::Latin1(v) => v.contains(&element), + Self::Utf16(v) => v.contains(&u16::from(element)), + Self::Rope(_) => self.iter().any(|u| u == u16::from(element)), } } @@ -397,9 +422,10 @@ impl<'a> JsStr<'a> { /// [`FromUtf16Error`][std::string::FromUtf16Error] if it contains any invalid data. #[inline] pub fn to_std_string(&self) -> Result { - match self.variant() { - JsStrVariant::Latin1(v) => Ok(v.iter().copied().map(char::from).collect()), - JsStrVariant::Utf16(v) => String::from_utf16(v), + match self { + Self::Latin1(v) => Ok(v.iter().copied().map(char::from).collect()), + Self::Utf16(v) => String::from_utf16(v), + Self::Rope(_) => String::from_utf16(&self.to_vec()), } } @@ -425,31 +451,46 @@ impl<'a> JsStr<'a> { impl Hash for JsStr<'_> { #[inline] fn hash(&self, state: &mut H) { - // NOTE: The hash function has been inlined to ensure that a hash of latin1 and U16 - // encoded strings remains the same if they have the same characters - match self.variant() { - JsStrVariant::Latin1(s) => { - state.write_usize(s.len()); + state.write_u64(self.content_hash()); + } +} + +impl JsStr<'_> { + /// Computes the hash of the string content. + #[inline] + #[must_use] + pub fn content_hash(&self) -> u64 { + let mut h = rustc_hash::FxHasher::default(); + match *self { + Self::Latin1(s) => { + h.write_usize(s.len()); for elem in s { - state.write_u16(u16::from(*elem)); + h.write_u16(u16::from(*elem)); } } - JsStrVariant::Utf16(s) => { - state.write_usize(s.len()); + Self::Utf16(s) => { + h.write_usize(s.len()); for elem in s { - state.write_u16(*elem); + h.write_u16(*elem); + } + } + Self::Rope(_) => { + h.write_usize(self.len()); + for elem in self.iter() { + h.write_u16(elem); } } } + h.finish() } } impl Ord for JsStr<'_> { #[inline] fn cmp(&self, other: &Self) -> std::cmp::Ordering { - match (self.variant(), other.variant()) { - (JsStrVariant::Latin1(x), JsStrVariant::Latin1(y)) => x.cmp(y), - (JsStrVariant::Utf16(x), JsStrVariant::Utf16(y)) => x.cmp(y), + match (self, other) { + (Self::Latin1(x), Self::Latin1(y)) => x.cmp(y), + (Self::Utf16(x), Self::Utf16(y)) => x.cmp(y), _ => self.iter().cmp(other.iter()), } } @@ -460,9 +501,9 @@ impl Eq for JsStr<'_> {} impl PartialEq for JsStr<'_> { #[inline] fn eq(&self, other: &Self) -> bool { - match (self.variant(), other.variant()) { - (JsStrVariant::Latin1(lhs), JsStrVariant::Latin1(rhs)) => return lhs == rhs, - (JsStrVariant::Utf16(lhs), JsStrVariant::Utf16(rhs)) => return lhs == rhs, + match (self, other) { + (Self::Latin1(lhs), Self::Latin1(rhs)) => return lhs == rhs, + (Self::Utf16(lhs), Self::Utf16(rhs)) => return lhs == rhs, _ => {} } if self.len() != other.len() { @@ -480,9 +521,10 @@ impl PartialEq for JsStr<'_> { impl PartialEq for JsStr<'_> { #[inline] fn eq(&self, other: &str) -> bool { - match self.variant() { - JsStrVariant::Latin1(v) => v == other.as_bytes(), - JsStrVariant::Utf16(v) => other.encode_utf16().zip(v).all(|(a, b)| a == *b), + match *self { + Self::Latin1(v) => v == other.as_bytes(), + Self::Utf16(v) => other.encode_utf16().zip(v).all(|(a, b)| a == *b), + Self::Rope(_) => other.encode_utf16().zip(self.iter()).all(|(a, b)| a == b), } } } @@ -529,9 +571,16 @@ impl<'a> JsSliceIndex<'a> for usize { #[inline] fn get(value: JsStr<'a>, index: Self) -> Option { - match value.variant() { - JsStrVariant::Latin1(v) => v.get(index).copied().map(u16::from), - JsStrVariant::Utf16(v) => v.get(index).copied(), + match value { + JsStr::Latin1(v) => v.get(index).copied().map(u16::from), + JsStr::Utf16(v) => v.get(index).copied(), + JsStr::Rope(_) => { + if index < value.len() { + Some(value.iter().nth(index).expect("Already checked the size")) + } else { + None + } + } } } @@ -540,11 +589,12 @@ impl<'a> JsSliceIndex<'a> for usize { /// Caller must ensure the index is not out of bounds #[inline] unsafe fn get_unchecked(value: JsStr<'a>, index: Self) -> Self::Value { - // Safety: Caller must ensure the index is not out of bounds + // SAFETY: Caller must ensure the index is not out of bounds. unsafe { - match value.variant() { - JsStrVariant::Latin1(v) => u16::from(*v.get_unchecked(index)), - JsStrVariant::Utf16(v) => *v.get_unchecked(index), + match value { + JsStr::Latin1(v) => u16::from(*v.get_unchecked(index)), + JsStr::Utf16(v) => *v.get_unchecked(index), + JsStr::Rope(_) => value.iter().nth(index).expect("Already checked the size"), } } } @@ -555,9 +605,20 @@ impl<'a> JsSliceIndex<'a> for std::ops::Range { #[inline] fn get(value: JsStr<'a>, index: Self) -> Option { - match value.variant() { - JsStrVariant::Latin1(v) => v.get(index).map(JsStr::latin1), - JsStrVariant::Utf16(v) => v.get(index).map(JsStr::utf16), + match value { + JsStr::Latin1(v) => v.get(index).map(JsStr::latin1), + JsStr::Utf16(v) => v.get(index).map(JsStr::utf16), + JsStr::Rope(r) => { + if index.start <= index.end && index.end <= r.len() { + Some(JsStr::Rope(RopeSlice { + header: r.header, + start: r.start + index.start, + end: r.start + index.end, + })) + } else { + None + } + } } } @@ -566,12 +627,16 @@ impl<'a> JsSliceIndex<'a> for std::ops::Range { /// Caller must ensure the index is not out of bounds #[inline] unsafe fn get_unchecked(value: JsStr<'a>, index: Self) -> Self::Value { - // Safety: Caller must ensure the index is not out of bounds - unsafe { - match value.variant() { - JsStrVariant::Latin1(v) => JsStr::latin1(v.get_unchecked(index)), - JsStrVariant::Utf16(v) => JsStr::utf16(v.get_unchecked(index)), - } + match value { + // SAFETY: Caller must ensure the index is not out of bounds. + JsStr::Latin1(v) => unsafe { JsStr::latin1(v.get_unchecked(index)) }, + // SAFETY: Caller must ensure the index is not out of bounds. + JsStr::Utf16(v) => unsafe { JsStr::utf16(v.get_unchecked(index)) }, + JsStr::Rope(r) => JsStr::Rope(RopeSlice { + header: r.header, + start: r.start + index.start, + end: r.start + index.end, + }), } } } @@ -581,10 +646,10 @@ impl<'a> JsSliceIndex<'a> for std::ops::RangeInclusive { #[inline] fn get(value: JsStr<'a>, index: Self) -> Option { - match value.variant() { - JsStrVariant::Latin1(v) => v.get(index).map(JsStr::latin1), - JsStrVariant::Utf16(v) => v.get(index).map(JsStr::utf16), + if *index.end() == usize::MAX { + return None; } + JsSliceIndex::get(value, *index.start()..*index.end() + 1) } /// # Safety @@ -593,12 +658,7 @@ impl<'a> JsSliceIndex<'a> for std::ops::RangeInclusive { #[inline] unsafe fn get_unchecked(value: JsStr<'a>, index: Self) -> Self::Value { // Safety: Caller must ensure the index is not out of bounds - unsafe { - match value.variant() { - JsStrVariant::Latin1(v) => JsStr::latin1(v.get_unchecked(index)), - JsStrVariant::Utf16(v) => JsStr::utf16(v.get_unchecked(index)), - } - } + unsafe { JsSliceIndex::get_unchecked(value, *index.start()..*index.end() + 1) } } } @@ -607,10 +667,7 @@ impl<'a> JsSliceIndex<'a> for std::ops::RangeFrom { #[inline] fn get(value: JsStr<'a>, index: Self) -> Option { - match value.variant() { - JsStrVariant::Latin1(v) => v.get(index).map(JsStr::latin1), - JsStrVariant::Utf16(v) => v.get(index).map(JsStr::utf16), - } + JsSliceIndex::get(value, index.start..value.len()) } /// # Safety @@ -619,12 +676,7 @@ impl<'a> JsSliceIndex<'a> for std::ops::RangeFrom { #[inline] unsafe fn get_unchecked(value: JsStr<'a>, index: Self) -> Self::Value { // Safety: Caller must ensure the index is not out of bounds - unsafe { - match value.variant() { - JsStrVariant::Latin1(v) => JsStr::latin1(v.get_unchecked(index)), - JsStrVariant::Utf16(v) => JsStr::utf16(v.get_unchecked(index)), - } - } + unsafe { JsSliceIndex::get_unchecked(value, index.start..value.len()) } } } @@ -633,10 +685,7 @@ impl<'a> JsSliceIndex<'a> for std::ops::RangeTo { #[inline] fn get(value: JsStr<'a>, index: Self) -> Option { - match value.variant() { - JsStrVariant::Latin1(v) => v.get(index).map(JsStr::latin1), - JsStrVariant::Utf16(v) => v.get(index).map(JsStr::utf16), - } + JsSliceIndex::get(value, 0..index.end) } /// # Safety @@ -645,12 +694,7 @@ impl<'a> JsSliceIndex<'a> for std::ops::RangeTo { #[inline] unsafe fn get_unchecked(value: JsStr<'a>, index: Self) -> Self::Value { // Safety: Caller must ensure the index is not out of bounds - unsafe { - match value.variant() { - JsStrVariant::Latin1(v) => JsStr::latin1(v.get_unchecked(index)), - JsStrVariant::Utf16(v) => JsStr::utf16(v.get_unchecked(index)), - } - } + unsafe { JsSliceIndex::get_unchecked(value, 0..index.end) } } } diff --git a/core/string/src/tests.rs b/core/string/src/tests.rs index 43d0d3268d9..353ed3defd7 100644 --- a/core/string/src/tests.rs +++ b/core/string/src/tests.rs @@ -127,15 +127,15 @@ fn concat() { let x = JsString::from("hello"); let z = JsString::from("world"); - let xy = JsString::concat(x.as_str(), JsString::from(Y).as_str()); + let xy = JsString::concat(&x, &JsString::from(Y)); assert_eq!(&xy, &ascii_to_utf16(b"hello, ")); assert_eq!(xy.refcount(), Some(1)); - let xyz = JsString::concat(xy.as_str(), z.as_str()); + let xyz = JsString::concat(&xy, &z); assert_eq!(&xyz, &ascii_to_utf16(b"hello, world")); assert_eq!(xyz.refcount(), Some(1)); - let xyzw = JsString::concat(xyz.as_str(), JsString::from(W).as_str()); + let xyzw = JsString::concat(&xyz, &JsString::from(W)); assert_eq!(&xyzw, &ascii_to_utf16(b"hello, world!")); assert_eq!(xyzw.refcount(), Some(1)); } @@ -553,3 +553,216 @@ fn trim() { let base_str = JsString::from(" \u{000B} Hello World \t "); assert_eq!(base_str.trim(), JsString::from("Hello World")); } +#[test] +fn rope_basic() { + let s_large = JsString::from("a".repeat(1025)); // 1025 chars + let s3 = JsString::from("!"); + let rope = JsString::concat_array_strings(&[s_large.clone(), s3.clone()]); + + assert_eq!(rope.kind(), JsStringKind::Rope); + assert_eq!(rope.len(), 1026); + assert_eq!(rope.code_unit_at(1025), Some(u16::from(b'!'))); +} + +#[test] +fn rope_balanced_tree() { + let strings: Vec = (0..8) + .map(|i| JsString::from("a".repeat(1025) + &format!("{i:03}"))) + .collect(); + + let rope = JsString::concat_array_strings(&strings); + assert_eq!(rope.kind(), JsStringKind::Rope); + assert_eq!(rope.len(), 8 * 1028); + + // With 8 strings, balanced tree should have depth 3. + assert_eq!(rope.depth(), 3); +} + +#[test] +fn rope_rebalancing() { + let mut s = JsString::from("a".repeat(1025)); + // Highly unbalanced incremental concatenation. + for _ in 0..100 { + s = JsString::concat_array_strings(&[s, JsString::from("b")]); + } + + // Without rebalancing, depth would be 100. + // With Fibonacci rebalancing and hysteresis, depth should be kept small (e.g. < 45). + assert_eq!(s.kind(), JsStringKind::Rope); + assert!( + s.depth() < 45, + "Depth should be balanced (was {})", + s.depth() + ); + + // Verify it still works. + assert_eq!(s.code_unit_at(0), Some(u16::from(b'a'))); + assert_eq!(s.code_unit_at(1025), Some(u16::from(b'b'))); + assert_eq!(s.code_unit_at(1025 + 99), Some(u16::from(b'b'))); +} + +#[test] +fn pathological_batch_rebalancing() { + // Create a very deep (unbalanced) rope manually (if possible) or by bypassing create if needed. + // Actually, we can just create strings that are just at the threshold of rebalancing. + // But since concat_strings_balanced now collects leaves, it's inherently safe. + let strings: Vec = (0..50).map(|_| JsString::from("a".repeat(200))).collect(); + + // Batch concat should produce a balanced tree. + let rope = JsString::concat_array_strings(&strings); + assert_eq!(rope.kind(), JsStringKind::Rope); + // log2(50) is ~6. + assert!( + rope.depth() <= 7, + "Batch concat should be perfectly balanced (was {})", + rope.depth() + ); +} + +#[test] +fn test_rope_fibonacci_rebalancing() { + let mut s1 = JsString::from("a".repeat(20)); // Base length to bypass 512 flat threshold quickly + let mut s2 = JsString::from("b".repeat(20)); + + // Skew right + for _ in 0..200 { + s1 = JsString::concat(&s1, &JsString::from("c")); + } + // Skew left + for _ in 0..200 { + s2 = JsString::concat(&JsString::from("d"), &s2); + } + + assert_eq!(s1.len(), 20 + 200); + assert_eq!(s2.len(), 20 + 200); + + // Despite 10,000 skewed concatenations, the Fibonacci heuristic + // ensures the rope depth does not exceed moderate bounds (e.g. < 45). + assert!( + s1.depth() < 45, + "Right-skewed rope should have logarithmic depth via Fibonacci rebalancing, got: {}", + s1.depth() + ); + assert!( + s2.depth() < 45, + "Left-skewed rope should have logarithmic depth via Fibonacci rebalancing, got: {}", + s2.depth() + ); + + // Verify traversal is still accurate across the rebalanced structure + assert_eq!(s1.code_unit_at(0), Some(u16::from(b'a'))); + assert_eq!(s1.code_unit_at(219), Some(u16::from(b'c'))); + + assert_eq!(s2.code_unit_at(0), Some(u16::from(b'd'))); + assert_eq!(s2.code_unit_at(219), Some(u16::from(b'b'))); +} + +#[test] +fn rope_dag_sharing_stress() { + // Create a DAG via s = s + s. + // Length grows as 2^n. + let mut s = JsString::from("abc"); + for _ in 0..30 { + s = JsString::concat(&s, &s); + } + + // Depth should be reasonable (22). + // Note: iterations where length <= 1024 produce SequenceStrings (depth 0), + // and s="abc" (3) * 2^8 = 768. 3 * 2^9 = 1536. + // So depth only starts increasing at iter 8. 30 - 8 = 22. + assert_eq!(s.depth(), 22); + assert_eq!(s.len(), (1 << 30) * 3); + + // Flattening would take ~3GB, which might be too much for some CI environments. + // But we can check code_unit_at which is O(depth). + assert_eq!(s.code_unit_at(0), Some(u16::from(b'a'))); + assert_eq!(s.code_unit_at(s.len() - 1), Some(u16::from(b'c'))); +} + +#[test] +fn rope_dag_rebalance_explosion_prevented() { + // To trigger rebalance on a DAG, we need len < Fib(depth+2). + // So we need a very deep tree with very little content, but with sharing. + // Pathological case: s = "a", then s = s + empty. + let mut s = JsString::from("a"); + let empty = JsString::from(""); + for _ in 0..35 { + s = JsString::concat(&s, &empty); + } + // Now s has depth 35 and len 1. + // Fib(37) is 24 million. 1 < 24 million, so it rebalances. + // If we share the s: + let shared = JsString::concat(&s, &s); + assert_eq!(shared.len(), 2); +} + +#[test] +fn test_reentrancy_oncecell() { + let mut s = JsString::from("a"); + // Create a deeply shared DAG + for i in 0..10 { + s = JsString::concat(&s.clone(), &s.clone()); + println!("Depth step {}: len = {}", i, s.len()); + } + + println!("Triggering lazy flattening on DAG..."); + // Our iterative flattening naturally avoids OnceCell reentrancy panics. + // In a DAG where `left == right`, a recursive implementation would call `as_str()` + // on the same node twice, triggering `get_or_init` while already inside it. + // However, our iterative `rope_as_str` avoids calling `as_str()` on sub-ropes, + // instead manually traversing them using `flattened.get()`. This ensures + // we never trigger a recursive initialization of the same OnceCell. + let flat = s.as_str(); + println!("Flattened successfully! Len: {}", flat.len()); +} + +#[test] +fn deep_rope_stress() { + let mut s = JsString::from("a"); + + for _ in 0..10000 { + s = JsString::concat(&s.clone(), &"b".into()); + } + + let _ = s.as_str(); +} + +#[test] +fn shared_rope_stress() { + let base = JsString::from("x"); + + let mut ropes = Vec::new(); + + for _ in 0..1000 { + ropes.push(JsString::concat(&base.clone(), &base.clone())); + } + + for r in ropes { + let _ = r.as_str(); + } +} + +#[test] +fn rope_flatten_cache() { + let a = JsString::from("a".repeat(2000)); + let b = JsString::from("b".repeat(2000)); + + let rope = JsString::concat(&a, &b); + + let first = rope.as_str(); + let second = rope.as_str(); + + assert_eq!(first, second); +} + +#[test] +fn flatten_shared_subtree() { + let base = JsString::from("x".repeat(2000)); + + let a = JsString::concat(&base, &base); + let b = JsString::concat(&base, &base); + + let root = JsString::concat(&a, &b); + + assert_eq!(root.as_str().len(), 8000); +} diff --git a/core/string/src/type.rs b/core/string/src/type.rs index f19999ebf87..e7196ed9614 100644 --- a/core/string/src/type.rs +++ b/core/string/src/type.rs @@ -1,6 +1,6 @@ //! Module containing string types public and crate-specific. +use crate::JsStr; use crate::vtable::SequenceString; -use crate::{JsStr, JsStringKind}; use std::alloc::Layout; mod sealed { @@ -11,12 +11,12 @@ mod sealed { /// Internal trait for crate-specific usage. Contains implementation details /// that should not leak through the API. #[allow(private_interfaces)] -pub(crate) trait InternalStringType: StringType { +pub trait InternalStringType: StringType { /// The offset to the data field in the sequence string struct. const DATA_OFFSET: usize; - /// The kind of string produced by this string type. - const KIND: JsStringKind; + /// The static vtable for this string type. + const VTABLE: &'static crate::vtable::JsStringVTable; /// Create the base layout for the sequence string header. fn base_layout() -> Layout; @@ -45,7 +45,7 @@ impl StringType for Latin1 { impl InternalStringType for Latin1 { const DATA_OFFSET: usize = size_of::>(); - const KIND: JsStringKind = JsStringKind::Latin1Sequence; + const VTABLE: &'static crate::vtable::JsStringVTable = &crate::vtable::LATIN1_VTABLE; fn base_layout() -> Layout { Layout::new::>() @@ -69,7 +69,7 @@ impl StringType for Utf16 { impl InternalStringType for Utf16 { const DATA_OFFSET: usize = size_of::>(); - const KIND: JsStringKind = JsStringKind::Utf16Sequence; + const VTABLE: &'static crate::vtable::JsStringVTable = &crate::vtable::UTF16_VTABLE; fn base_layout() -> Layout { Layout::new::>() diff --git a/core/string/src/vtable/mod.rs b/core/string/src/vtable/mod.rs index 3e5f9151426..30288b38d5d 100644 --- a/core/string/src/vtable/mod.rs +++ b/core/string/src/vtable/mod.rs @@ -1,39 +1,73 @@ -//! Module defining the [`JsString`] `VTable` and kinds of strings. use crate::iter::CodePointsIter; -use crate::{JsStr, JsString, JsStringKind}; +use crate::{JsStr, JsStringKind}; use std::ptr::NonNull; -mod sequence; -pub(crate) use sequence::SequenceString; +/// Sequential string implementation. +pub mod sequence; +pub use sequence::SequenceString; +pub(crate) use sequence::{LATIN1_VTABLE, UTF16_VTABLE}; -pub(crate) mod slice; -pub(crate) use slice::SliceString; +/// Slice string implementation. +pub mod slice; +pub use slice::SliceString; pub(crate) mod r#static; pub use r#static::StaticString; -/// Embedded vtable for `JsString` operations. This is stored directly in each string -/// struct (not as a reference) to eliminate one level of indirection on hot paths. -#[derive(Debug, Copy, Clone)] +/// Rope string implementation. +pub mod rope; +pub use rope::RopeString; + +/// Header for all `JsString` allocations. +/// +/// This is stored at the beginning of every string allocation. +/// By using a reference to a static vtable, we reduce the header size +/// and improve cache locality for common operations. +#[repr(C, align(8))] +#[derive(Debug, Clone, Copy)] +pub struct JsStringHeader { + /// Reference to the static vtable for this string kind. + pub vtable: &'static JsStringVTable, + /// Length of the string in code units. + pub(crate) len: usize, + /// Reference count for this string. + pub(crate) refcount: usize, + /// Explicit padding to ensure `hash` is 8-byte aligned on 32-bit platforms. + #[cfg(target_pointer_width = "32")] + pub(crate) _padding: u32, + /// Cached hash of the string content. + pub(crate) hash: u64, +} + +impl JsStringHeader { + /// Create a new `JsStringHeader`. + pub(crate) const fn new(vtable: &'static JsStringVTable, len: usize, refcount: usize) -> Self { + Self { + vtable, + len, + refcount, + #[cfg(target_pointer_width = "32")] + _padding: 0, + hash: 0, + } + } +} + +/// Static vtable for `JsString` operations. +/// +/// This contains function pointers for polymorphic operations and static metadata. #[repr(C)] -pub(crate) struct JsStringVTable { - /// Clone the string, incrementing the refcount. - pub clone: fn(NonNull) -> JsString, - /// Drop the string, decrementing the refcount and freeing if needed. - pub drop: fn(NonNull), - /// Get the string as a `JsStr`. Although this is marked as `'static`, this is really - /// of the lifetime of the string itself. This is conveyed by the [`JsString`] API - /// itself rather than this vtable. - pub as_str: fn(NonNull) -> JsStr<'static>, - /// Get an iterator of code points. This is the basic form of character access. - /// Although this is marked as `'static`, this is really of the lifetime of the string - /// itself. This is conveyed by the [`JsString`] API itself rather than this vtable. - pub code_points: fn(NonNull) -> CodePointsIter<'static>, - /// Get the refcount, if applicable. - pub refcount: fn(NonNull) -> Option, - /// Get the length of the string. Since a string is immutable, this does not need - /// to be a call, can be calculated at construction. - pub len: usize, - /// Kind tag to identify the string type. +#[derive(Debug, Copy, Clone)] +pub struct JsStringVTable { + /// Get the string as a `JsStr`. + pub as_str: for<'a> fn(&'a JsStringHeader) -> JsStr<'a>, + /// Get an iterator of code points. + pub code_points: for<'a> fn(&'a JsStringHeader) -> CodePointsIter<'a>, + /// Get the code unit at the given index. + pub code_unit_at: fn(&JsStringHeader, usize) -> Option, + /// Deallocate the string. + pub dealloc: fn(NonNull), + + /// Kind tag to identify the string type. Shared across all strings of this vtable. pub kind: JsStringKind, } diff --git a/core/string/src/vtable/rope.rs b/core/string/src/vtable/rope.rs new file mode 100644 index 00000000000..d61fc2915c8 --- /dev/null +++ b/core/string/src/vtable/rope.rs @@ -0,0 +1,436 @@ +use crate::vtable::{JsStringHeader, JsStringVTable}; +use crate::{JsStr, JsString, JsStringKind}; +use std::cell::OnceCell; +use std::ptr::{self, NonNull}; + +/// Fibonacci numbers for rope balancing thresholds. +/// `F(n) = Fib(n + 2)`. A rope of depth `n` is balanced if its length >= `F(n)`. +static FIBONACCI_THRESHOLDS: [usize; 46] = [ + 1, + 2, + 3, + 5, + 8, + 13, + 21, + 34, + 55, + 89, + 144, + 233, + 377, + 610, + 987, + 1597, + 2584, + 4181, + 6765, + 10946, + 17711, + 28657, + 46368, + 75025, + 121_393, + 196_418, + 317_811, + 514_229, + 832_040, + 1_346_269, + 2_178_309, + 3_524_578, + 5_702_887, + 9_227_465, + 14_930_352, + 24_157_817, + 39_088_169, + 63_245_986, + 102_334_155, + 165_580_141, + 267_914_296, + 433_494_437, + 701_408_733, + 1_134_903_170, + 1_836_311_903, + 2_971_215_073, +]; + +/// Static vtable for rope strings. +pub(crate) static ROPE_VTABLE: JsStringVTable = JsStringVTable { + as_str: rope_as_str, + code_points: rope_code_points, + code_unit_at: rope_code_unit_at, + dealloc: rope_dealloc, + kind: JsStringKind::Rope, +}; + +/// The flattened representation of a rope. +#[allow(dead_code)] +#[derive(Debug)] +pub enum Flattened { + /// Latin1 encoded contiguous buffer. + Latin1(Box<[u8]>), + /// UTF-16 encoded contiguous buffer. + Utf16(Box<[u16]>), +} + +impl Flattened { + /// Gets the flattened string as a `JsStr`. + #[must_use] + pub fn as_str(&self) -> JsStr<'_> { + match self { + Self::Latin1(b) => JsStr::Latin1(b), + Self::Utf16(b) => JsStr::Utf16(b), + } + } +} + +/// A rope string that is a tree of other strings. +#[repr(C)] +#[derive(Debug)] +pub struct RopeString { + /// Standardized header for all strings. + pub(crate) header: JsStringHeader, + pub(crate) left: JsString, + pub(crate) right: JsString, + // Using a raw buffer cache instead of `JsString` in `OnceCell` solves the "refcount aliasing" + // problem. Storing a refcounted `JsString` inside a `OnceCell` could lead to ownership cycles + // or use-after-free errors if shared nodes were dropped while still cached. + // By storing raw `Box<[u8]>` or `Box<[u16]>`, we decouple the cache from the refcounting system. + flattened: OnceCell, + pub(crate) depth: u8, +} + +impl RopeString { + /// Create a new rope string. + /// + /// This will rebalance if the rope becomes too deep relative to its length. + #[inline] + #[must_use] + pub(crate) fn create(left: JsString, right: JsString) -> JsString { + let depth = std::cmp::max(left.depth(), right.depth()) + 1; + let len = left.len() + right.len(); + + let d = depth as usize; + + // Leaf Consolidation: If the result is small, always flatten to avoid depth build-up for small concatenations. + if len < 512 { + return JsString::concat_array(&[left.as_str(), right.as_str()]); + } + + // Leaf Sinking: If we're appending a small string and the left side is a rope, + // try to sink the append into the rightmost leaf. + // This keeps the tree shallow and avoids frequent rebalancing for small repeated appends. + if right.len() < 256 && left.kind() == JsStringKind::Rope { + // SAFETY: kind() check ensures this is a rope. + let left_rope = unsafe { Self::from_vtable(left.ptr) }; + if left_rope.right.len() + right.len() < 512 { + let new_right = JsString::concat_slices(left_rope.right.as_str(), right.as_str()); + return Self::create(left_rope.left.clone(), new_right); + } + } + + // Classical Fibonacci Weight Invariant: + // A rope of depth `d` is considered balanced if its length is at least `Fib(d + 2)`. + // If the current length is less than the threshold for the current depth, we rebalance. + // This alone guarantees logarithmic depth while ensuring rebalancing happens only O(log n) times. + if d >= FIBONACCI_THRESHOLDS.len() || len < FIBONACCI_THRESHOLDS[d] { + // Otherwise, collect leaves and rebuild a balanced tree. + let mut leaves = Vec::with_capacity(std::cmp::max(depth as usize * 2, 16)); + Self::collect_leaves(&left, &mut leaves); + Self::collect_leaves(&right, &mut leaves); + return JsString::concat_leaves_balanced(&leaves); + } + + let rope = Box::new(Self { + header: JsStringHeader::new(&ROPE_VTABLE, len, 1), + left, + right, + flattened: OnceCell::new(), + depth, + }); + + // SAFETY: The `rope` is leaked as a raw pointer and wrapped in `NonNull`. + // The `JsStringHeader` header is at the start of `RopeString`. + unsafe { JsString::from_raw(NonNull::from(Box::leak(rope)).cast()) } + } + + /// Internal helper to collect all leaf strings of a rope. + pub(crate) fn collect_leaves(s: &JsString, leaves: &mut Vec) { + let mut stack = vec![s.clone()]; + while let Some(current) = stack.pop() { + if current.kind() == JsStringKind::Rope { + // SAFETY: We know the kind is `Rope`, so it's safe to cast the pointer to `RopeString`. + let r = unsafe { Self::from_vtable(current.ptr) }; + + // If the child is already flattened, don't descend into it; just use its cached result. + // This prevents exponential traversal of shared subtrees (DAGs) during rebalancing. + if let Some(flat) = r.flattened.get() { + let s = match flat { + Flattened::Latin1(b) => JsString::from(JsStr::latin1(b)), + Flattened::Utf16(b) => JsString::from(JsStr::utf16(b)), + }; + leaves.push(s); + } else { + stack.push(r.right.clone()); + stack.push(r.left.clone()); + } + } else if !current.is_empty() { + leaves.push(current); + } + } + } + + #[inline] + pub(crate) fn depth(&self) -> u8 { + self.depth + } + + /// Casts a `NonNull` to `&Self`. + /// + /// # Safety + /// The caller must ensure the pointer is valid and of the correct kind. + #[inline] + pub(crate) unsafe fn from_vtable<'a>(ptr: NonNull) -> &'a Self { + // SAFETY: The caller must ensure the pointer is valid and of the correct kind. + unsafe { ptr.cast().as_ref() } + } +} + +#[inline] +fn rope_dealloc(ptr: NonNull) { + // We use a stack to iteratively drop rope nodes and avoid stack overflow. + let mut stack = vec![ptr]; + while let Some(current_ptr) = stack.pop() { + // SAFETY: The pointer is guaranteed to be a valid `NonNull` pointing to a `RopeString` + // that is ready to be deallocated (refcount reached 0). + unsafe { + // SAFETY: The pointer was created from a Box in `create` and hasn't been freed yet. + let rope_ptr = current_ptr.cast::(); + // SAFETY: We own this pointer now conceptually. + let mut rope_box = Box::from_raw(rope_ptr.as_ptr()); + + // Check children. If they are ropes and we are the last reference, defer their deallocation. + // This prevents the recursive drop of fields. + let left = std::mem::replace(&mut rope_box.left, crate::StaticJsStrings::EMPTY_STRING); + if left.kind() == JsStringKind::Rope && left.refcount() == Some(1) { + stack.push(left.ptr); + std::mem::forget(left); + } + let right = + std::mem::replace(&mut rope_box.right, crate::StaticJsStrings::EMPTY_STRING); + if right.kind() == JsStringKind::Rope && right.refcount() == Some(1) { + stack.push(right.ptr); + std::mem::forget(right); + } + // rope_box is dropped here. Its remaining fields (depth, OnceCell, and the empty JsStrings) are dropped normally. + } + } +} + +#[inline] +fn rope_as_str(header: &JsStringHeader) -> JsStr<'_> { + // SAFETY: The header is part of a RopeString and it's aligned. + let this: &RopeString = unsafe { &*ptr::from_ref(header).cast::() }; + + if let Some(flattened) = this.flattened.get() { + return match flattened { + Flattened::Latin1(b) => JsStr::latin1(b), + Flattened::Utf16(b) => JsStr::utf16(b), + }; + } + + JsStr::rope(crate::str::RopeSlice { + header, + start: 0, + end: header.len, + }) +} + +/// Flattens a rope string into a contiguous buffer. +/// +/// # Panics +/// +/// Panics if the string contains non-Latin1 characters but was expected to be Latin1. +#[must_use] +pub fn flatten_rope(header: &JsStringHeader) -> &Flattened { + // SAFETY: The header is part of a RopeString and it's aligned. + let this: &RopeString = unsafe { &*ptr::from_ref(header).cast::() }; + + this.flattened.get_or_init(|| { + // SAFETY: We manually increment the refcount to ensure the `JsString` handle is valid for traversal. + let root_handle = unsafe { + let rc_ptr = (&raw const header.refcount).cast::(); + (*rc_ptr).fetch_add(1, std::sync::atomic::Ordering::Relaxed); + JsString::from_raw(NonNull::new_unchecked(ptr::from_ref(header).cast_mut())) + }; + + // Pass 1: Determine encoding using a seen-set to handle DAGs in O(Nodes). + // This prevents exponential traversal (O(2^depth)) in highly shared trees. + // By checking `flattened.get()` first, we effectively prune the traversal + // using previously computed results, ensuring each internal node is visited + // effectively only once per flattening call. + let mut is_latin1 = true; + let mut stack = vec![root_handle.clone()]; + let mut seen = rustc_hash::FxHashSet::default(); + while let Some(current) = stack.pop() { + if current.kind() == JsStringKind::Rope { + // SAFETY: We know the kind is `Rope`, so it's safe to cast the pointer to `RopeString`. + let r: &RopeString = unsafe { &*current.ptr.as_ptr().cast::() }; + + // If the child is already flattened, use its cached result to determine encoding. + // This is a major optimization for DAGs. + if let Some(flat) = r.flattened.get() { + match flat { + Flattened::Utf16(_) => { + is_latin1 = false; + break; + } + Flattened::Latin1(_) => {} + } + } else if seen.insert(current.ptr.as_ptr()) { + stack.push(r.right.clone()); + stack.push(r.left.clone()); + } + } else if !current.as_str().is_latin1() { + is_latin1 = false; + break; + } + } + + let len = header.len; + if is_latin1 { + let mut buffer = Vec::with_capacity(len); + let mut stack = vec![root_handle]; + while let Some(current) = stack.pop() { + if current.kind() == JsStringKind::Rope { + // SAFETY: We know the kind is `Rope`, so it's safe to cast the pointer to `RopeString`. + let r: &RopeString = unsafe { &*current.ptr.as_ptr().cast::() }; + if let Some(Flattened::Latin1(b)) = r.flattened.get() { + buffer.extend_from_slice(b); + } else { + stack.push(r.right.clone()); + stack.push(r.left.clone()); + } + } else { + // SAFETY: The initial pass already verified that all components of this rope are Latin1. + buffer.extend_from_slice( + current + .as_str() + .as_latin1() + .expect("initial pass verified encoding"), + ); + } + } + Flattened::Latin1(buffer.into_boxed_slice()) + } else { + let mut buffer = Vec::with_capacity(len); + let mut stack = vec![root_handle]; + while let Some(current) = stack.pop() { + if current.kind() == JsStringKind::Rope { + // SAFETY: We know the kind is `Rope`, so it's safe to cast the pointer to `RopeString`. + let r: &RopeString = unsafe { &*current.ptr.as_ptr().cast::() }; + if let Some(flat) = r.flattened.get() { + match flat { + Flattened::Latin1(b) => buffer.extend(b.iter().copied().map(u16::from)), + Flattened::Utf16(b) => buffer.extend_from_slice(b), + } + } else { + stack.push(r.right.clone()); + stack.push(r.left.clone()); + } + } else { + let s = current.as_str(); + match s { + JsStr::Latin1(s) => buffer.extend(s.iter().copied().map(u16::from)), + JsStr::Utf16(s) => buffer.extend_from_slice(s), + // SAFETY: We skip recursion for the current node to avoid stack overflow, + // and `flatten_rope` logic ensures we eventually reach leaves. + JsStr::Rope(_) => buffer.extend(s.iter()), + } + } + } + Flattened::Utf16(buffer.into_boxed_slice()) + } + }) +} + +#[inline] +fn rope_code_points(header: &JsStringHeader) -> crate::iter::CodePointsIter<'_> { + // SAFETY: The header is guaranteed to be a `RopeString` and it's aligned. + let this: &RopeString = unsafe { &*ptr::from_ref(header).cast::() }; + + // CACHE-AWARE: If already flattened, use the fast contiguous iterator. + if let Some(flattened) = this.flattened.get() { + return match flattened { + Flattened::Latin1(b) => JsStr::latin1(b).code_points(), + Flattened::Utf16(b) => JsStr::utf16(b).code_points(), + }; + } + + // SAFETY: The header is guaranteed to be a `RopeString` and alive for the duration of the iterator + // as it is bound by the lifetime of the input reference. + unsafe { + let header_ptr: *const JsStringHeader = header; + // We create a temporary handle to pass to the iterator. + // Since the iterator only takes a reference, we don't need to increment refcount. + let s = JsString::from_raw(NonNull::new_unchecked(header_ptr.cast_mut())); + let iter = crate::iter::CodePointsIter::rope(&s); + std::mem::forget(s); + iter + } +} + +#[inline] +fn rope_code_unit_at(header: &JsStringHeader, mut index: usize) -> Option { + // SAFETY: This is part of the correct vtable which is validated on construction. + // The pointer is guaranteed to be a valid `NonNull` pointing to a `RopeString`. + let mut current: &RopeString = unsafe { &*ptr::from_ref(header).cast::() }; + + // CACHE-AWARE: If the root is already flattened, return the unit from the buffer in O(1). + if let Some(flattened) = current.flattened.get() { + return match flattened { + Flattened::Latin1(b) => b.get(index).copied().map(u16::from), + Flattened::Utf16(b) => b.get(index).copied(), + }; + } + + loop { + if index >= current.header.len { + return None; + } + + // If the current node we are traversing is already flattened, we can finish early. + if let Some(flattened) = current.flattened.get() { + return match flattened { + Flattened::Latin1(b) => b.get(index).copied().map(u16::from), + Flattened::Utf16(b) => b.get(index).copied(), + }; + } + + let left_len = current.left.len(); + if index < left_len { + match current.left.kind() { + JsStringKind::Rope => { + // SAFETY: current.left is a Rope. + current = unsafe { current.left.ptr.cast().as_ref() }; + } + _ => { + // SAFETY: `current.left` is not a `Rope`, so we can safely get the code unit. + return current.left.code_unit_at(index); + } + } + } else { + index -= left_len; + match current.right.kind() { + JsStringKind::Rope => { + // SAFETY: current.right is a Rope. + current = unsafe { current.right.ptr.cast().as_ref() }; + } + _ => { + // SAFETY: `current.right` is not a `Rope`, so we can safely get the code unit. + return current.right.code_unit_at(index); + } + } + } + } +} diff --git a/core/string/src/vtable/sequence.rs b/core/string/src/vtable/sequence.rs index dea448c2d16..a90520b5e3c 100644 --- a/core/string/src/vtable/sequence.rs +++ b/core/string/src/vtable/sequence.rs @@ -1,26 +1,39 @@ -//! `VTable` implementations for [`SequenceString`]. -use crate::iter::CodePointsIter; -use crate::r#type::InternalStringType; -use crate::vtable::JsStringVTable; -use crate::{JsStr, JsString, alloc_overflow}; use std::alloc::{Layout, alloc, dealloc}; -use std::cell::Cell; use std::marker::PhantomData; -use std::process::abort; -use std::ptr; -use std::ptr::NonNull; +use std::ptr::{self, NonNull}; + +use crate::iter::CodePointsIter; +use crate::r#type::{InternalStringType, Latin1, Utf16}; +use crate::vtable::{JsStringHeader, JsStringVTable}; +use crate::{JsStr, JsStringKind, alloc_overflow}; +pub(crate) static LATIN1_VTABLE: JsStringVTable = JsStringVTable { + as_str: seq_as_str::, + code_points: seq_code_points::, + code_unit_at: seq_code_unit_at::, + dealloc: seq_dealloc::, + kind: JsStringKind::Latin1Sequence, +}; + +/// Static vtable for UTF-16 sequence strings. +pub(crate) static UTF16_VTABLE: JsStringVTable = JsStringVTable { + as_str: seq_as_str::, + code_points: seq_code_points::, + code_unit_at: seq_code_unit_at::, + dealloc: seq_dealloc::, + kind: JsStringKind::Utf16Sequence, +}; /// A sequential memory array of `T::Char` elements. /// /// # Notes -/// A [`SequenceString`] is `!Sync` (using [`Cell`]) and invariant over `T` (strings +/// A [`SequenceString`] is `!Sync` (using [`std::cell::Cell`]) and invariant over `T` (strings /// of various types cannot be used interchangeably). The string, however, could be /// `Send`, although within Boa this does not make sense. #[repr(C)] -pub(crate) struct SequenceString { - /// Embedded `VTable` - must be the first field for vtable dispatch. - vtable: JsStringVTable, - refcount: Cell, +#[derive(Debug)] +pub struct SequenceString { + /// Standardized header for all strings. + pub(crate) header: JsStringHeader, // Forces invariant contract. _marker: PhantomData T>, pub(crate) data: [u8; 0], @@ -33,16 +46,7 @@ impl SequenceString { #[must_use] pub(crate) fn new(len: usize) -> Self { SequenceString { - vtable: JsStringVTable { - clone: seq_clone::, - drop: seq_drop::, - as_str: seq_as_str::, - code_points: seq_code_points::, - refcount: seq_refcount::, - len, - kind: T::KIND, - }, - refcount: Cell::new(1), + header: JsStringHeader::new(T::VTABLE, len, 1), _marker: PhantomData, data: [0; 0], } @@ -77,9 +81,7 @@ impl SequenceString { debug_assert_eq!(layout.align(), align_of::()); #[allow(clippy::cast_ptr_alignment)] - // SAFETY: - // The layout size of `SequenceString` is never zero, since it has to store - // the length of the string and the reference count. + // SAFETY: The layout size of `SequenceString` is never zero. let inner = unsafe { alloc(layout).cast::() }; // We need to verify that the pointer returned by `alloc` is not null, otherwise @@ -87,9 +89,7 @@ impl SequenceString { // right now. let inner = NonNull::new(inner).ok_or(Some(layout))?; - // SAFETY: - // `NonNull` verified for us that the pointer returned by `alloc` is valid, - // meaning we can write to its pointed memory. + // SAFETY: `NonNull` verified that the pointer is valid. unsafe { // Write the first part, the `SequenceString`. inner.as_ptr().write(Self::new(len)); @@ -97,13 +97,7 @@ impl SequenceString { debug_assert!({ let inner = inner.as_ptr(); - // SAFETY: - // - `inner` must be a valid pointer, since it comes from a `NonNull`, - // meaning we can safely dereference it to `SequenceString`. - // - `offset` should point us to the beginning of the array, - // and since we requested a `SequenceString` layout with a trailing - // `[T::Byte; str_len]`, the memory of the array must be in the `usize` - // range for the allocation to succeed. + // SAFETY: `inner` is a valid pointer and `offset` points to the array start. unsafe { // This is `` as the offset is in bytes. ptr::eq( @@ -125,68 +119,46 @@ impl SequenceString { } #[inline] -fn seq_clone(vtable: NonNull) -> JsString { - // SAFETY: This is part of the correct vtable which is validated on construction. - let this: &SequenceString = unsafe { vtable.cast().as_ref() }; - let Some(strong) = this.refcount.get().checked_add(1) else { - abort(); - }; - this.refcount.set(strong); - // SAFETY: validated the string outside this function. - unsafe { JsString::from_ptr(vtable) } -} - -#[inline] -fn seq_drop(vtable: NonNull) { - // SAFETY: This is part of the correct vtable which is validated on construction. - let this: &SequenceString = unsafe { vtable.cast().as_ref() }; - let Some(new) = this.refcount.get().checked_sub(1) else { - abort(); - }; - this.refcount.set(new); - if new != 0 { - return; - } - - // SAFETY: All the checks for the validity of the layout have already been made on allocation. +fn seq_dealloc(ptr: NonNull) { + // SAFETY: The vtable ensures that the pointer is valid and points to a + // SequenceString of the correct type. + let header = unsafe { ptr.as_ref() }; + // SAFETY: Layout was validated on allocation. The `len` field is guaranteed + // to be valid for the string type `T`. let layout = unsafe { - Layout::for_value(this) - .extend(Layout::array::(this.vtable.len).unwrap_unchecked()) + T::base_layout() + .extend(Layout::array::(header.len).unwrap_unchecked()) .unwrap_unchecked() .0 .pad_to_align() }; - - // SAFETY: If refcount is 0, this is the last reference, so deallocating is safe. + // SAFETY: The `ptr` is a valid `NonNull` pointer to the allocated memory. + // The `layout` correctly describes the allocation. unsafe { - dealloc(vtable.as_ptr().cast(), layout); + dealloc(ptr.as_ptr().cast(), layout); } } #[inline] -fn seq_as_str(vtable: NonNull) -> JsStr<'static> { - // SAFETY: This is part of the correct vtable which is validated on construction. - let this: &SequenceString = unsafe { vtable.cast().as_ref() }; - let len = this.vtable.len; +fn seq_as_str(header: &JsStringHeader) -> JsStr<'_> { + // SAFETY: The header is part of a SequenceString and it's aligned. + let this: &SequenceString = unsafe { &*ptr::from_ref(header).cast::>() }; + let len = header.len; let data_ptr = (&raw const this.data).cast::(); // SAFETY: SequenceString data is always valid and properly aligned. + // `data_ptr` points to the start of the character data, and `len` is the + // number of characters, which is guaranteed to be within the allocated bounds. let slice = unsafe { std::slice::from_raw_parts(data_ptr, len) }; T::str_ctor(slice) } #[inline] -fn seq_code_points( - vtable: NonNull, -) -> CodePointsIter<'static> { - CodePointsIter::new(seq_as_str::(vtable)) +fn seq_code_points(header: &JsStringHeader) -> CodePointsIter<'_> { + CodePointsIter::new(seq_as_str::(header)) } -/// `VTable` function for refcount, need to return an `Option`. #[inline] -#[allow(clippy::unnecessary_wraps)] -fn seq_refcount(vtable: NonNull) -> Option { - // SAFETY: This is part of the correct vtable which is validated on construction. - let this: &SequenceString = unsafe { vtable.cast().as_ref() }; - Some(this.refcount.get()) +fn seq_code_unit_at(header: &JsStringHeader, index: usize) -> Option { + seq_as_str::(header).get(index) } diff --git a/core/string/src/vtable/slice.rs b/core/string/src/vtable/slice.rs index c433f0968e5..0622c1ffe52 100644 --- a/core/string/src/vtable/slice.rs +++ b/core/string/src/vtable/slice.rs @@ -1,22 +1,28 @@ use crate::iter::CodePointsIter; -use crate::vtable::JsStringVTable; +use crate::vtable::{JsStringHeader, JsStringVTable}; use crate::{JsStr, JsString, JsStringKind}; -use std::cell::Cell; -use std::process::abort; -use std::ptr::NonNull; +use std::ptr::{self, NonNull}; + +/// Static vtable for slice strings. +pub(crate) static SLICE_VTABLE: JsStringVTable = JsStringVTable { + as_str: slice_as_str, + code_points: slice_code_points, + code_unit_at: slice_code_unit_at, + dealloc: slice_dealloc, // Slice strings are now correctly deallocated. + kind: JsStringKind::Slice, +}; /// A slice of an existing string. #[repr(C)] -pub(crate) struct SliceString { - /// Embedded `VTable` - must be the first field for vtable dispatch. - vtable: JsStringVTable, +#[derive(Debug)] +pub struct SliceString { + /// Standardized header for all strings. + pub(crate) header: JsStringHeader, // Keep this for refcounting the original string. - owned: JsString, + pub(crate) owned: JsString, // Pointer to the data itself. This is guaranteed to be safe as long as `owned` is // owned. - inner: JsStr<'static>, - // Refcount for this string as we need to clone/drop it as well. - refcount: Cell, + pub(crate) inner: JsStr<'static>, } impl SliceString { @@ -28,22 +34,15 @@ impl SliceString { #[inline] #[must_use] pub(crate) unsafe fn new(owned: &JsString, start: usize, end: usize) -> Self { - // SAFETY: invariant stated for this whole function. + // SAFETY: The caller is responsible for ensuring start and end are safe (`start` <= `end`, + // `start` >= 0, `end` <= `owned.len()`). let inner = unsafe { owned.as_str().get_unchecked(start..end) }; SliceString { - vtable: JsStringVTable { - clone: slice_clone, - drop: slice_drop, - as_str: slice_as_str, - code_points: slice_code_points, - refcount: slice_refcount, - len: end - start, - kind: JsStringKind::Slice, - }, + header: JsStringHeader::new(&SLICE_VTABLE, end - start, 1), owned: owned.clone(), // SAFETY: this inner's lifetime is tied to the owned string above. + // We transmute the lifetime to 'static to satisfy the long-lived nature of the string vtable. inner: unsafe { inner.as_static() }, - refcount: Cell::new(1), } } @@ -56,53 +55,54 @@ impl SliceString { } #[inline] -pub(super) fn slice_clone(vtable: NonNull) -> JsString { - // SAFETY: This is part of the correct vtable which is validated on construction. - let this: &SliceString = unsafe { vtable.cast().as_ref() }; - let Some(strong) = this.refcount.get().checked_add(1) else { - abort(); - }; - this.refcount.set(strong); - // SAFETY: validated the string outside this function. - unsafe { JsString::from_ptr(vtable) } +fn slice_as_str(header: &JsStringHeader) -> JsStr<'_> { + // SAFETY: The header is part of a SliceString and it's aligned. + let this: &SliceString = unsafe { &*ptr::from_ref(header).cast::() }; + this.inner } #[inline] -fn slice_drop(vtable: NonNull) { - // SAFETY: This is part of the correct vtable which is validated on construction. - let this: &SliceString = unsafe { vtable.cast().as_ref() }; - let Some(new) = this.refcount.get().checked_sub(1) else { - abort(); - }; - this.refcount.set(new); - if new != 0 { - return; - } +fn slice_dealloc(ptr: NonNull) { + // Iteratively destroy slice chains to avoid recursive drop stack overflow. + let mut current = ptr; + + loop { + // SAFETY: the vtable ensures this pointer refers to `SliceString`. + unsafe { + // Take ownership of the slice node. + let slice_ptr = current.cast::(); + let mut slice_box = Box::from_raw(slice_ptr.as_ptr()); + + // Extract the parent string. + let parent = + std::mem::replace(&mut slice_box.owned, crate::StaticJsStrings::EMPTY_STRING); + + // Drop the slice node itself. + drop(slice_box); - // SAFETY: This is the last reference, so we can deallocate. - // The vtable pointer is actually pointing to a SliceString, so cast it correctly. - unsafe { - drop(Box::from_raw(vtable.cast::().as_ptr())); + // If the parent is another slice and we are the last reference, + // continue iteratively instead of recursing. + if parent.kind() == JsStringKind::Slice && parent.refcount() == Some(1) { + current = parent.ptr; + std::mem::forget(parent); + continue; + } + + // Otherwise drop normally and finish. + drop(parent); + break; + } } } #[inline] -fn slice_as_str(vtable: NonNull) -> JsStr<'static> { - // SAFETY: This is part of the correct vtable which is validated on construction. - let this: &SliceString = unsafe { vtable.cast().as_ref() }; - this.inner +fn slice_code_points(header: &JsStringHeader) -> CodePointsIter<'_> { + CodePointsIter::new(slice_as_str(header)) } #[inline] -fn slice_code_points(vtable: NonNull) -> CodePointsIter<'static> { - CodePointsIter::new(slice_as_str(vtable)) +fn slice_code_unit_at(header: &JsStringHeader, index: usize) -> Option { + slice_as_str(header).get(index) } -/// `VTable` function for refcount, need to return an `Option`. -#[inline] -#[allow(clippy::unnecessary_wraps)] -fn slice_refcount(vtable: NonNull) -> Option { - // SAFETY: This is part of the correct vtable which is validated on construction. - let this: &SliceString = unsafe { vtable.cast().as_ref() }; - Some(this.refcount.get()) -} +// Unused refcount method removed. diff --git a/core/string/src/vtable/static.rs b/core/string/src/vtable/static.rs index de57b051903..791f8a1de99 100644 --- a/core/string/src/vtable/static.rs +++ b/core/string/src/vtable/static.rs @@ -1,15 +1,24 @@ use crate::iter::CodePointsIter; -use crate::vtable::JsStringVTable; -use crate::{JsStr, JsString, JsStringKind}; +use crate::vtable::{JsStringHeader, JsStringVTable}; +use crate::{JsStr, JsStringKind}; use std::hash::{Hash, Hasher}; -use std::ptr::NonNull; +use std::ptr::{self}; + +/// Static vtable for static strings. +pub(crate) static STATIC_VTABLE: JsStringVTable = JsStringVTable { + as_str: static_as_str, + code_points: static_code_points, + code_unit_at: static_code_unit_at, + dealloc: |_| {}, // Static strings are never deallocated. + kind: JsStringKind::Static, +}; /// A static string with vtable for uniform dispatch. #[derive(Debug, Clone, Copy)] #[repr(C)] pub struct StaticString { - /// Embedded `VTable` - must be the first field for vtable dispatch. - vtable: JsStringVTable, + /// Standardized header for all strings. + pub(crate) header: JsStringHeader, /// The actual string data. pub(crate) str: JsStr<'static>, } @@ -19,15 +28,7 @@ impl StaticString { #[must_use] pub const fn new(str: JsStr<'static>) -> Self { Self { - vtable: JsStringVTable { - clone: static_clone, - drop: static_drop, - as_str: static_as_str, - code_points: static_code_points, - refcount: static_refcount, - len: str.len(), - kind: JsStringKind::Static, - }, + header: JsStringHeader::new(&STATIC_VTABLE, str.len(), 0), str, } } @@ -53,32 +54,23 @@ impl std::borrow::Borrow> for &'static StaticString { } } -#[inline] -pub(crate) fn static_clone(this: NonNull) -> JsString { - // Static strings don't need ref counting, just copy the pointer. - // SAFETY: validated the string outside this function. - unsafe { JsString::from_ptr(this) } -} +// Unused static_clone removed. #[inline] -fn static_drop(_ptr: NonNull) { - // Static strings don't need cleanup. -} - -#[inline] -fn static_as_str(this: NonNull) -> JsStr<'static> { - // SAFETY: validated the string outside this function. - let this: &StaticString = unsafe { this.cast().as_ref() }; +fn static_as_str(header: &JsStringHeader) -> JsStr<'_> { + // SAFETY: The header is part of a StaticString and it's aligned. + let this: &StaticString = unsafe { &*ptr::from_ref(header).cast::() }; this.str } #[inline] -fn static_code_points(this: NonNull) -> CodePointsIter<'static> { - CodePointsIter::new(static_as_str(this)) +fn static_code_points(header: &JsStringHeader) -> CodePointsIter<'_> { + CodePointsIter::new(static_as_str(header)) } #[inline] -fn static_refcount(_ptr: NonNull) -> Option { - // Static strings don't have refcount. - None +fn static_code_unit_at(header: &JsStringHeader, index: usize) -> Option { + static_as_str(header).get(index) } + +// Unused static_refcount removed.