core/
wtf8.rs

1//! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
2//!
3//! This library uses Rust’s type system to maintain
4//! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed),
5//! like the `String` and `&str` types do for UTF-8.
6//!
7//! Since [WTF-8 must not be used
8//! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience),
9//! this library deliberately does not provide access to the underlying bytes
10//! of WTF-8 strings,
11//! nor can it decode WTF-8 from arbitrary bytes.
12//! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points.
13#![unstable(
14    feature = "wtf8_internals",
15    issue = "none",
16    reason = "this is internal code for representing OsStr on some platforms and not a public API"
17)]
18// rustdoc bug: doc(hidden) on the module won't stop types in the module from showing up in trait
19// implementations, so, we'll have to add more doc(hidden)s anyway
20#![doc(hidden)]
21
22use crate::char::{MAX_LEN_UTF16, encode_utf16_raw};
23use crate::clone::CloneToUninit;
24use crate::fmt::{self, Write};
25use crate::hash::{Hash, Hasher};
26use crate::iter::FusedIterator;
27use crate::num::niche_types::CodePointInner;
28use crate::str::next_code_point;
29use crate::{ops, slice, str};
30
31/// A Unicode code point: from U+0000 to U+10FFFF.
32///
33/// Compares with the `char` type,
34/// which represents a Unicode scalar value:
35/// a code point that is not a surrogate (U+D800 to U+DFFF).
36#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)]
37#[doc(hidden)]
38pub struct CodePoint(CodePointInner);
39
40/// Format the code point as `U+` followed by four to six hexadecimal digits.
41/// Example: `U+1F4A9`
42impl fmt::Debug for CodePoint {
43    #[inline]
44    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
45        write!(formatter, "U+{:04X}", self.0.as_inner())
46    }
47}
48
49impl CodePoint {
50    /// Unsafely creates a new `CodePoint` without checking the value.
51    ///
52    /// Only use when `value` is known to be less than or equal to 0x10FFFF.
53    #[inline]
54    pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
55        // SAFETY: Guaranteed by caller.
56        CodePoint(unsafe { CodePointInner::new_unchecked(value) })
57    }
58
59    /// Creates a new `CodePoint` if the value is a valid code point.
60    ///
61    /// Returns `None` if `value` is above 0x10FFFF.
62    #[inline]
63    pub fn from_u32(value: u32) -> Option<CodePoint> {
64        Some(CodePoint(CodePointInner::new(value)?))
65    }
66
67    /// Creates a new `CodePoint` from a `char`.
68    ///
69    /// Since all Unicode scalar values are code points, this always succeeds.
70    #[inline]
71    pub fn from_char(value: char) -> CodePoint {
72        // SAFETY: All char are valid for this type.
73        unsafe { CodePoint::from_u32_unchecked(value as u32) }
74    }
75
76    /// Returns the numeric value of the code point.
77    #[inline]
78    pub fn to_u32(&self) -> u32 {
79        self.0.as_inner()
80    }
81
82    /// Returns the numeric value of the code point if it is a leading surrogate.
83    #[inline]
84    pub fn to_lead_surrogate(&self) -> Option<u16> {
85        match self.to_u32() {
86            lead @ 0xD800..=0xDBFF => Some(lead as u16),
87            _ => None,
88        }
89    }
90
91    /// Returns the numeric value of the code point if it is a trailing surrogate.
92    #[inline]
93    pub fn to_trail_surrogate(&self) -> Option<u16> {
94        match self.to_u32() {
95            trail @ 0xDC00..=0xDFFF => Some(trail as u16),
96            _ => None,
97        }
98    }
99
100    /// Optionally returns a Unicode scalar value for the code point.
101    ///
102    /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
103    #[inline]
104    pub fn to_char(&self) -> Option<char> {
105        match self.to_u32() {
106            0xD800..=0xDFFF => None,
107            // SAFETY: We explicitly check that the char is valid.
108            valid => Some(unsafe { char::from_u32_unchecked(valid) }),
109        }
110    }
111
112    /// Returns a Unicode scalar value for the code point.
113    ///
114    /// Returns `'\u{FFFD}'` (the replacement character “�”)
115    /// if the code point is a surrogate (from U+D800 to U+DFFF).
116    #[inline]
117    pub fn to_char_lossy(&self) -> char {
118        self.to_char().unwrap_or(char::REPLACEMENT_CHARACTER)
119    }
120}
121
122/// A borrowed slice of well-formed WTF-8 data.
123///
124/// Similar to `&str`, but can additionally contain surrogate code points
125/// if they’re not in a surrogate pair.
126#[derive(Eq, Ord, PartialEq, PartialOrd)]
127#[repr(transparent)]
128#[rustc_has_incoherent_inherent_impls]
129#[doc(hidden)]
130pub struct Wtf8 {
131    bytes: [u8],
132}
133
134impl AsRef<[u8]> for Wtf8 {
135    #[inline]
136    fn as_ref(&self) -> &[u8] {
137        &self.bytes
138    }
139}
140
141/// Formats the string in double quotes, with characters escaped according to
142/// [`char::escape_debug`] and unpaired surrogates represented as `\u{xxxx}`,
143/// where each `x` is a hexadecimal digit.
144impl fmt::Debug for Wtf8 {
145    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
146        fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result {
147            use crate::fmt::Write;
148            for c in s.chars().flat_map(|c| c.escape_debug()) {
149                f.write_char(c)?
150            }
151            Ok(())
152        }
153
154        formatter.write_str("\"")?;
155        let mut pos = 0;
156        while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) {
157            // SAFETY: next_surrogate provides an index for a range of valid UTF-8 bytes.
158            write_str_escaped(formatter, unsafe {
159                str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos])
160            })?;
161            write!(formatter, "\\u{{{:x}}}", surrogate)?;
162            pos = surrogate_pos + 3;
163        }
164
165        // SAFETY: after next_surrogate returns None, the remainder is valid UTF-8.
166        write_str_escaped(formatter, unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?;
167        formatter.write_str("\"")
168    }
169}
170
171/// Formats the string with unpaired surrogates substituted with the replacement
172/// character, U+FFFD.
173impl fmt::Display for Wtf8 {
174    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
175        let wtf8_bytes = &self.bytes;
176        let mut pos = 0;
177        loop {
178            match self.next_surrogate(pos) {
179                Some((surrogate_pos, _)) => {
180                    // SAFETY: next_surrogate provides an index for a range of valid UTF-8 bytes.
181                    formatter.write_str(unsafe {
182                        str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos])
183                    })?;
184                    formatter.write_char(char::REPLACEMENT_CHARACTER)?;
185                    pos = surrogate_pos + 3;
186                }
187                None => {
188                    // SAFETY: after next_surrogate returns None, the remainder is valid UTF-8.
189                    let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) };
190                    if pos == 0 { return s.fmt(formatter) } else { return formatter.write_str(s) }
191                }
192            }
193        }
194    }
195}
196
197impl Wtf8 {
198    /// Creates a WTF-8 slice from a UTF-8 `&str` slice.
199    #[inline]
200    pub fn from_str(value: &str) -> &Wtf8 {
201        // SAFETY: Since WTF-8 is a superset of UTF-8, this always is valid.
202        unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) }
203    }
204
205    /// Creates a WTF-8 slice from a WTF-8 byte slice.
206    ///
207    /// Since the byte slice is not checked for valid WTF-8, this functions is
208    /// marked unsafe.
209    #[inline]
210    pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
211        // SAFETY: start with &[u8], end with fancy &[u8]
212        unsafe { &*(value as *const [u8] as *const Wtf8) }
213    }
214
215    /// Creates a mutable WTF-8 slice from a mutable WTF-8 byte slice.
216    ///
217    /// Since the byte slice is not checked for valid WTF-8, this functions is
218    /// marked unsafe.
219    #[inline]
220    pub unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 {
221        // SAFETY: start with &mut [u8], end with fancy &mut [u8]
222        unsafe { &mut *(value as *mut [u8] as *mut Wtf8) }
223    }
224
225    /// Returns the length, in WTF-8 bytes.
226    #[inline]
227    pub fn len(&self) -> usize {
228        self.bytes.len()
229    }
230
231    #[inline]
232    pub fn is_empty(&self) -> bool {
233        self.bytes.is_empty()
234    }
235
236    /// Returns the code point at `position` if it is in the ASCII range,
237    /// or `b'\xFF'` otherwise.
238    ///
239    /// # Panics
240    ///
241    /// Panics if `position` is beyond the end of the string.
242    #[inline]
243    pub fn ascii_byte_at(&self, position: usize) -> u8 {
244        match self.bytes[position] {
245            ascii_byte @ 0x00..=0x7F => ascii_byte,
246            _ => 0xFF,
247        }
248    }
249
250    /// Returns an iterator for the string’s code points.
251    #[inline]
252    pub fn code_points(&self) -> Wtf8CodePoints<'_> {
253        Wtf8CodePoints { bytes: self.bytes.iter() }
254    }
255
256    /// Access raw bytes of WTF-8 data
257    #[inline]
258    pub fn as_bytes(&self) -> &[u8] {
259        &self.bytes
260    }
261
262    /// Tries to convert the string to UTF-8 and return a `&str` slice.
263    ///
264    /// Returns `None` if the string contains surrogates.
265    ///
266    /// This does not copy the data.
267    #[inline]
268    pub fn as_str(&self) -> Result<&str, str::Utf8Error> {
269        str::from_utf8(&self.bytes)
270    }
271
272    /// Converts the WTF-8 string to potentially ill-formed UTF-16
273    /// and return an iterator of 16-bit code units.
274    ///
275    /// This is lossless:
276    /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units
277    /// would always return the original WTF-8 string.
278    #[inline]
279    pub fn encode_wide(&self) -> EncodeWide<'_> {
280        EncodeWide { code_points: self.code_points(), extra: 0 }
281    }
282
283    #[inline]
284    pub fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
285        let mut iter = self.bytes[pos..].iter();
286        loop {
287            let b = *iter.next()?;
288            if b < 0x80 {
289                pos += 1;
290            } else if b < 0xE0 {
291                iter.next();
292                pos += 2;
293            } else if b == 0xED {
294                match (iter.next(), iter.next()) {
295                    (Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
296                        return Some((pos, decode_surrogate(b2, b3)));
297                    }
298                    _ => pos += 3,
299                }
300            } else if b < 0xF0 {
301                iter.next();
302                iter.next();
303                pos += 3;
304            } else {
305                iter.next();
306                iter.next();
307                iter.next();
308                pos += 4;
309            }
310        }
311    }
312
313    #[inline]
314    pub fn final_lead_surrogate(&self) -> Option<u16> {
315        match self.bytes {
316            [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)),
317            _ => None,
318        }
319    }
320
321    #[inline]
322    pub fn initial_trail_surrogate(&self) -> Option<u16> {
323        match self.bytes {
324            [0xED, b2 @ 0xB0..=0xBF, b3, ..] => Some(decode_surrogate(b2, b3)),
325            _ => None,
326        }
327    }
328
329    #[inline]
330    pub fn make_ascii_lowercase(&mut self) {
331        self.bytes.make_ascii_lowercase()
332    }
333
334    #[inline]
335    pub fn make_ascii_uppercase(&mut self) {
336        self.bytes.make_ascii_uppercase()
337    }
338
339    #[inline]
340    pub fn is_ascii(&self) -> bool {
341        self.bytes.is_ascii()
342    }
343
344    #[inline]
345    pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
346        self.bytes.eq_ignore_ascii_case(&other.bytes)
347    }
348}
349
350/// Returns a slice of the given string for the byte range \[`begin`..`end`).
351///
352/// # Panics
353///
354/// Panics when `begin` and `end` do not point to code point boundaries,
355/// or point beyond the end of the string.
356impl ops::Index<ops::Range<usize>> for Wtf8 {
357    type Output = Wtf8;
358
359    #[inline]
360    fn index(&self, range: ops::Range<usize>) -> &Wtf8 {
361        if range.start <= range.end
362            && self.is_code_point_boundary(range.start)
363            && self.is_code_point_boundary(range.end)
364        {
365            // SAFETY: is_code_point_boundary checks that the index is valid
366            unsafe { slice_unchecked(self, range.start, range.end) }
367        } else {
368            slice_error_fail(self, range.start, range.end)
369        }
370    }
371}
372
373/// Returns a slice of the given string from byte `begin` to its end.
374///
375/// # Panics
376///
377/// Panics when `begin` is not at a code point boundary,
378/// or is beyond the end of the string.
379impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
380    type Output = Wtf8;
381
382    #[inline]
383    fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 {
384        if self.is_code_point_boundary(range.start) {
385            // SAFETY: is_code_point_boundary checks that the index is valid
386            unsafe { slice_unchecked(self, range.start, self.len()) }
387        } else {
388            slice_error_fail(self, range.start, self.len())
389        }
390    }
391}
392
393/// Returns a slice of the given string from its beginning to byte `end`.
394///
395/// # Panics
396///
397/// Panics when `end` is not at a code point boundary,
398/// or is beyond the end of the string.
399impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
400    type Output = Wtf8;
401
402    #[inline]
403    fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 {
404        if self.is_code_point_boundary(range.end) {
405            // SAFETY: is_code_point_boundary checks that the index is valid
406            unsafe { slice_unchecked(self, 0, range.end) }
407        } else {
408            slice_error_fail(self, 0, range.end)
409        }
410    }
411}
412
413impl ops::Index<ops::RangeFull> for Wtf8 {
414    type Output = Wtf8;
415
416    #[inline]
417    fn index(&self, _range: ops::RangeFull) -> &Wtf8 {
418        self
419    }
420}
421
/// Decodes the value of a surrogate from the second and third byte of its
/// three-byte encoding.
///
/// The first byte is not passed in; it is assumed to be 0xED. Only the low
/// six bits of each argument contribute to the result.
#[inline]
fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
    let high_bits = u16::from(second_byte & 0x3F) << 6;
    let low_bits = u16::from(third_byte & 0x3F);
    0xD800 | high_bits | low_bits
}
427
428impl Wtf8 {
429    /// Copied from str::is_char_boundary
430    #[inline]
431    pub fn is_code_point_boundary(&self, index: usize) -> bool {
432        if index == 0 {
433            return true;
434        }
435        match self.bytes.get(index) {
436            None => index == self.len(),
437            Some(&b) => (b as i8) >= -0x40,
438        }
439    }
440
441    /// Verify that `index` is at the edge of either a valid UTF-8 codepoint
442    /// (i.e. a codepoint that's not a surrogate) or of the whole string.
443    ///
444    /// These are the cases currently permitted by `OsStr::self_encoded_bytes`.
445    /// Splitting between surrogates is valid as far as WTF-8 is concerned, but
446    /// we do not permit it in the public API because WTF-8 is considered an
447    /// implementation detail.
448    #[track_caller]
449    #[inline]
450    pub fn check_utf8_boundary(&self, index: usize) {
451        if index == 0 {
452            return;
453        }
454        match self.bytes.get(index) {
455            Some(0xED) => (), // Might be a surrogate
456            Some(&b) if (b as i8) >= -0x40 => return,
457            Some(_) => panic!("byte index {index} is not a codepoint boundary"),
458            None if index == self.len() => return,
459            None => panic!("byte index {index} is out of bounds"),
460        }
461        if self.bytes[index + 1] >= 0xA0 {
462            // There's a surrogate after index. Now check before index.
463            if index >= 3 && self.bytes[index - 3] == 0xED && self.bytes[index - 2] >= 0xA0 {
464                panic!("byte index {index} lies between surrogate codepoints");
465            }
466        }
467    }
468}
469
470/// Copied from core::str::raw::slice_unchecked
471#[inline]
472unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
473    // SAFETY: memory layout of a &[u8] and &Wtf8 are the same
474    unsafe {
475        let len = end - begin;
476        let start = s.as_bytes().as_ptr().add(begin);
477        Wtf8::from_bytes_unchecked(slice::from_raw_parts(start, len))
478    }
479}
480
481/// Copied from core::str::raw::slice_error_fail
482#[inline(never)]
483fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
484    assert!(begin <= end);
485    panic!("index {begin} and/or {end} in `{s:?}` do not lie on character boundary");
486}
487
488/// Iterator for the code points of a WTF-8 string.
489///
490/// Created with the method `.code_points()`.
491#[derive(Clone)]
492#[doc(hidden)]
493pub struct Wtf8CodePoints<'a> {
494    bytes: slice::Iter<'a, u8>,
495}
496
497impl Iterator for Wtf8CodePoints<'_> {
498    type Item = CodePoint;
499
500    #[inline]
501    fn next(&mut self) -> Option<CodePoint> {
502        // SAFETY: `self.bytes` has been created from a WTF-8 string
503        unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint::from_u32_unchecked(c)) }
504    }
505
506    #[inline]
507    fn size_hint(&self) -> (usize, Option<usize>) {
508        let len = self.bytes.len();
509        (len.saturating_add(3) / 4, Some(len))
510    }
511}
512
513impl fmt::Debug for Wtf8CodePoints<'_> {
514    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
515        f.debug_tuple("Wtf8CodePoints")
516            // SAFETY: We always leave the string in a valid state after each iteration.
517            .field(&unsafe { Wtf8::from_bytes_unchecked(self.bytes.as_slice()) })
518            .finish()
519    }
520}
521
522/// Generates a wide character sequence for potentially ill-formed UTF-16.
523#[stable(feature = "rust1", since = "1.0.0")]
524#[derive(Clone)]
525#[doc(hidden)]
526pub struct EncodeWide<'a> {
527    code_points: Wtf8CodePoints<'a>,
528    extra: u16,
529}
530
531// Copied from libunicode/u_str.rs
532#[stable(feature = "rust1", since = "1.0.0")]
533impl Iterator for EncodeWide<'_> {
534    type Item = u16;
535
536    #[inline]
537    fn next(&mut self) -> Option<u16> {
538        if self.extra != 0 {
539            let tmp = self.extra;
540            self.extra = 0;
541            return Some(tmp);
542        }
543
544        let mut buf = [0; MAX_LEN_UTF16];
545        self.code_points.next().map(|code_point| {
546            let n = encode_utf16_raw(code_point.to_u32(), &mut buf).len();
547            if n == 2 {
548                self.extra = buf[1];
549            }
550            buf[0]
551        })
552    }
553
554    #[inline]
555    fn size_hint(&self) -> (usize, Option<usize>) {
556        let (low, high) = self.code_points.size_hint();
557        let ext = (self.extra != 0) as usize;
558        // every code point gets either one u16 or two u16,
559        // so this iterator is between 1 or 2 times as
560        // long as the underlying iterator.
561        (low + ext, high.and_then(|n| n.checked_mul(2)).and_then(|n| n.checked_add(ext)))
562    }
563}
564
565impl fmt::Debug for EncodeWide<'_> {
566    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
567        f.debug_struct("EncodeWide").finish_non_exhaustive()
568    }
569}
570
571#[stable(feature = "encode_wide_fused_iterator", since = "1.62.0")]
572impl FusedIterator for EncodeWide<'_> {}
573
574impl Hash for CodePoint {
575    #[inline]
576    fn hash<H: Hasher>(&self, state: &mut H) {
577        self.0.hash(state)
578    }
579}
580
581impl Hash for Wtf8 {
582    #[inline]
583    fn hash<H: Hasher>(&self, state: &mut H) {
584        state.write(&self.bytes);
585        0xfeu8.hash(state)
586    }
587}
588
589#[unstable(feature = "clone_to_uninit", issue = "126799")]
590unsafe impl CloneToUninit for Wtf8 {
591    #[inline]
592    #[cfg_attr(debug_assertions, track_caller)]
593    unsafe fn clone_to_uninit(&self, dst: *mut u8) {
594        // SAFETY: we're just a transparent wrapper around [u8]
595        unsafe { self.bytes.clone_to_uninit(dst) }
596    }
597}