Skip to content

[stdlib] fix utf8Span accessors for small strings #82077

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 94 additions & 18 deletions stdlib/public/core/UTF8Span.swift
Original file line number Diff line number Diff line change
Expand Up @@ -201,36 +201,112 @@ extension String {
}

@available(SwiftStdlib 6.2, *)
public var utf8Span: UTF8Span {
private var _span: Span<UTF8.CodeUnit> {
@lifetime(borrow self)
borrowing get {
let isKnownASCII = _guts.isASCII
let utf8 = self.utf8
let span = utf8.span
let result = unsafe UTF8Span(
unchecked: span,
isKnownASCII: isKnownASCII)
return unsafe _overrideLifetime(result, borrowing: self)
#if _runtime(_ObjC)
// handle non-UTF8 Objective-C bridging cases here
if !_guts.isFastUTF8, _guts._object.hasObjCBridgeableObject {
let storage = _guts._getOrAllocateAssociatedStorage()
let (start, count) = unsafe (storage.start, storage.count)
let span = unsafe Span(_unsafeStart: start, count: count)
return unsafe _overrideLifetime(span, borrowing: self)
}
#endif
let count = _guts.count
if _guts.isSmall {
let a = Builtin.addressOfBorrow(self)
let address = unsafe UnsafePointer<UTF8.CodeUnit>(a)
let span = unsafe Span(_unsafeStart: address, count: count)
return unsafe _overrideLifetime(span, borrowing: self)
}
let isFastUTF8 = _guts.isFastUTF8
_precondition(isFastUTF8, "String must be contiguous UTF8")
let buffer = unsafe _guts._object.fastUTF8
let span = unsafe Span(_unsafeElements: buffer)
return unsafe _overrideLifetime(span, borrowing: self)
}
}
}

extension Substring {
/// A UTF8Span over the code units that make up this string.
///
/// - Note: In the case of bridged UTF16 String instances (on Apple
/// platforms), this property transcodes the code units the first time
/// it is called. The transcoded buffer is cached, and subsequent calls
/// to `utf8Span` can reuse the buffer.
///
/// - Returns: A `UTF8Span` over the code units of this String.
///
/// - Complexity: O(1) for native UTF8 Strings,
///   amortized O(1) for bridged UTF16 Strings.
@available(SwiftStdlib 6.2, *)
public var utf8Span: UTF8Span {
@lifetime(borrow self)
borrowing get {
let isKnownASCII = base._guts.isASCII
let utf8 = self.utf8
let span = utf8.span
let result = unsafe UTF8Span(
unchecked: span,
isKnownASCII: isKnownASCII)
return unsafe _overrideLifetime(result, borrowing: self)
unsafe UTF8Span(unchecked: _span, isKnownASCII: _guts.isASCII)
}
}
}

extension Substring {

/// The UTF-8 code units of this substring as a `Span` whose lifetime is
/// tied to a borrow of `self`.
///
/// Three representations are handled in turn: bridged Objective-C strings
/// that are not fast UTF-8 (only when the Objective-C runtime is available),
/// small strings, and strings with a contiguous fast-UTF-8 buffer. Any other
/// representation fails the precondition at the end of the getter.
@available(SwiftStdlib 6.2, *)
private var _span: Span<UTF8.CodeUnit> {
@lifetime(borrow self)
borrowing get {
#if _runtime(_ObjC)
// Handle non-UTF8 Objective-C bridging cases here: take the span of the
// whole base string's UTF-8 view, then extract the sub-range that
// corresponds to `startIndex..<endIndex`.
if !_wholeGuts.isFastUTF8, _wholeGuts._object.hasObjCBridgeableObject {
let base: String.UTF8View = _slice._base.utf8
// Offsets are measured with `_foreignDistance` on the base UTF-8 view.
let first = base._foreignDistance(from: base.startIndex, to: startIndex)
let count = base._foreignDistance(from: startIndex, to: endIndex)
let span = base.span._extracting(first..<(first &+ count))
return unsafe _overrideLifetime(span, borrowing: self)
}
#endif
// Encoded offsets of the slice bounds within the base string.
let first = _slice._startIndex._encodedOffset
let end = _slice._endIndex._encodedOffset
if _wholeGuts.isSmall {
// Small strings: point directly into the borrowed `self` value.
let a = Builtin.addressOfBorrow(self)
// Skip two `String.Index` strides, then advance to the slice's start.
// NOTE(review): presumably the two indices are the slice's start and end
// indices, stored ahead of the base string's bytes — confirm against the
// in-memory layout of `Substring`.
let offset = first &+ (2 &* MemoryLayout<String.Index>.stride)
let start = unsafe UnsafePointer<UTF8.CodeUnit>(a).advanced(by: offset)
let span = unsafe Span(_unsafeStart: start, count: end &- first)
return unsafe _overrideLifetime(span, borrowing: self)
}
// Remaining case: the whole string must expose a contiguous UTF-8 buffer.
let isFastUTF8 = _wholeGuts.isFastUTF8
_precondition(isFastUTF8, "Substring must be contiguous UTF8")
// Span over the whole buffer, narrowed to this substring's bounds.
var span = unsafe Span(_unsafeElements: _wholeGuts._object.fastUTF8)
span = span._extracting(first..<end)
return unsafe _overrideLifetime(span, borrowing: self)
}
}


/// A UTF8Span over the code units that make up this substring.
///
/// - Note: In the case of bridged UTF16 String instances (on Apple
///   platforms), this property needs to transcode the code units every time
///   it is called.
///   For example, if `string` has the bridged UTF16 representation,
///
///       for word in string.split(separator: " ") {
///         useSpan(word.utf8Span)
///       }
///
///   is accidentally quadratic because of this issue. A workaround is to
///   explicitly convert the string into its native UTF8 representation:
///
///       var nativeString = consume string
///       nativeString.makeContiguousUTF8()
///       for word in nativeString.split(separator: " ") {
///         useSpan(word.utf8Span)
///       }
///
///   This second option has linear time complexity, as expected.
///
/// - Returns: A `UTF8Span` over the code units of this Substring.
///
/// - Complexity: O(1) for native UTF8 Strings, O(n) for bridged UTF16 Strings.
@available(SwiftStdlib 6.2, *)
public var utf8Span: UTF8Span {
@lifetime(borrow self)
borrowing get {
// The span is wrapped without validation (`unchecked:`); the ASCII flag
// is read from the base string's guts.
unsafe UTF8Span(unchecked: _span, isKnownASCII: base._guts.isASCII)
}
}
}
42 changes: 42 additions & 0 deletions test/stdlib/Span/StringUTF8SpanProperty.swift
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,45 @@ suite.test("Span from Large Native String's Substring")
expectEqual(span[i], u[i])
}
}

suite.test("Span from String.utf8Span")
.require(.stdlib_6_2).code {
  guard #available(SwiftStdlib 6.2, *) else { return }

  // `String(200)` produces the three-character string "200".
  let string = String(200)

  // Span under test: the storage wrapped by `String.utf8Span`.
  let wrapped = string.utf8Span
  let actual = wrapped.span

  // Reference span: the one vended by the string's UTF-8 view.
  let view = string.utf8
  let expected = view.span

  // Both spans must have the same length and the same code units.
  expectEqual(actual.count, expected.count)
  for (lhs, rhs) in zip(actual.indices, expected.indices) {
    expectEqual(actual[lhs], expected[rhs])
  }
}

suite.test("UTF8Span from Span")
.require(.stdlib_6_2).code {
  guard #available(SwiftStdlib 6.2, *) else { return }

  // Start from the raw span of the UTF-8 view of "200".
  let view = String(200).utf8
  let original = view.span

  // Validation must succeed; a failure is reported by `expectNotNil`.
  guard
    let validated = expectNotNil(try? UTF8Span(validating: original))
  else {
    return
  }

  // The span read back from the `UTF8Span` must be identical to the
  // span it was created from.
  let roundTripped = validated.span
  expectTrue(original.isIdentical(to: roundTripped))
}

suite.test("Span from Substring.utf8Span")
.require(.stdlib_6_2).code {
  guard #available(SwiftStdlib 6.2, *) else { return }

  // "22000" with its first and last characters dropped: the substring "200".
  let substring = String(22000).dropFirst().dropLast()

  // Span under test: the storage wrapped by `Substring.utf8Span`.
  let wrapped = substring.utf8Span
  let actual = wrapped.span

  // Reference span: the one vended by the substring's UTF-8 view.
  let view = substring.utf8
  let expected = view.span

  // Both spans must have the same length and the same code units.
  expectEqual(actual.count, expected.count)
  for (lhs, rhs) in zip(actual.indices, expected.indices) {
    expectEqual(actual[lhs], expected[rhs])
  }
}