core/str/
lossy.rs

1use super::from_utf8_unchecked;
2use super::validations::utf8_char_width;
3use crate::fmt;
4use crate::fmt::{Formatter, Write};
5use crate::iter::FusedIterator;
6
7impl [u8] {
8    /// Creates an iterator over the contiguous valid UTF-8 ranges of this
9    /// slice, and the non-UTF-8 fragments in between.
10    ///
11    /// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
12    ///
13    /// # Examples
14    ///
15    /// This function formats arbitrary but mostly-UTF-8 bytes into Rust source
16    /// code in the form of a C-string literal (`c"..."`).
17    ///
18    /// ```
19    /// use std::fmt::Write as _;
20    ///
21    /// pub fn cstr_literal(bytes: &[u8]) -> String {
22    ///     let mut repr = String::new();
23    ///     repr.push_str("c\"");
24    ///     for chunk in bytes.utf8_chunks() {
25    ///         for ch in chunk.valid().chars() {
26    ///             // Escapes \0, \t, \r, \n, \\, \', \", and uses \u{...} for non-printable characters.
27    ///             write!(repr, "{}", ch.escape_debug()).unwrap();
28    ///         }
29    ///         for byte in chunk.invalid() {
30    ///             write!(repr, "\\x{:02X}", byte).unwrap();
31    ///         }
32    ///     }
33    ///     repr.push('"');
34    ///     repr
35    /// }
36    ///
37    /// fn main() {
38    ///     let lit = cstr_literal(b"\xferris the \xf0\x9f\xa6\x80\x07");
39    ///     let expected = stringify!(c"\xFErris the 🦀\u{7}");
40    ///     assert_eq!(lit, expected);
41    /// }
42    /// ```
43    #[stable(feature = "utf8_chunks", since = "1.79.0")]
44    pub fn utf8_chunks(&self) -> Utf8Chunks<'_> {
45        Utf8Chunks { source: self }
46    }
47}
48
49/// An item returned by the [`Utf8Chunks`] iterator.
50///
51/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
52/// when decoding a UTF-8 string.
53///
54/// # Examples
55///
56/// ```
57/// // An invalid UTF-8 string
58/// let bytes = b"foo\xF1\x80bar";
59///
60/// // Decode the first `Utf8Chunk`
61/// let chunk = bytes.utf8_chunks().next().unwrap();
62///
63/// // The first three characters are valid UTF-8
64/// assert_eq!("foo", chunk.valid());
65///
66/// // The fourth character is broken
67/// assert_eq!(b"\xF1\x80", chunk.invalid());
68/// ```
69#[stable(feature = "utf8_chunks", since = "1.79.0")]
70#[derive(Clone, Debug, PartialEq, Eq)]
71pub struct Utf8Chunk<'a> {
72    valid: &'a str,
73    invalid: &'a [u8],
74}
75
76impl<'a> Utf8Chunk<'a> {
77    /// Returns the next validated UTF-8 substring.
78    ///
79    /// This substring can be empty at the start of the string or between
80    /// broken UTF-8 characters.
81    #[must_use]
82    #[stable(feature = "utf8_chunks", since = "1.79.0")]
83    pub fn valid(&self) -> &'a str {
84        self.valid
85    }
86
87    /// Returns the invalid sequence that caused a failure.
88    ///
89    /// The returned slice will have a maximum length of 3 and starts after the
90    /// substring given by [`valid`]. Decoding will resume after this sequence.
91    ///
92    /// If empty, this is the last chunk in the string. If non-empty, an
93    /// unexpected byte was encountered or the end of the input was reached
94    /// unexpectedly.
95    ///
96    /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT
97    /// CHARACTER`].
98    ///
99    /// [`valid`]: Self::valid
100    /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
101    #[must_use]
102    #[stable(feature = "utf8_chunks", since = "1.79.0")]
103    pub fn invalid(&self) -> &'a [u8] {
104        self.invalid
105    }
106}
107
108#[must_use]
109#[unstable(feature = "str_internals", issue = "none")]
110pub struct Debug<'a>(&'a [u8]);
111
112#[unstable(feature = "str_internals", issue = "none")]
113impl fmt::Debug for Debug<'_> {
114    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
115        f.write_char('"')?;
116
117        for chunk in self.0.utf8_chunks() {
118            // Valid part.
119            // Here we partially parse UTF-8 again which is suboptimal.
120            {
121                let valid = chunk.valid();
122                let mut from = 0;
123                for (i, c) in valid.char_indices() {
124                    let esc = c.escape_debug();
125                    // If char needs escaping, flush backlog so far and write, else skip
126                    if esc.len() != 1 {
127                        f.write_str(&valid[from..i])?;
128                        for c in esc {
129                            f.write_char(c)?;
130                        }
131                        from = i + c.len_utf8();
132                    }
133                }
134                f.write_str(&valid[from..])?;
135            }
136
137            // Broken parts of string as hex escape.
138            for &b in chunk.invalid() {
139                write!(f, "\\x{:02X}", b)?;
140            }
141        }
142
143        f.write_char('"')
144    }
145}
146
147/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
148/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
149///
150/// This struct is created by the [`utf8_chunks`] method on bytes slices.
151/// If you want a simple conversion from UTF-8 byte slices to string slices,
152/// [`from_utf8`] is easier to use.
153///
154/// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
155///
156/// [byteslice]: slice
157/// [`utf8_chunks`]: slice::utf8_chunks
158/// [`from_utf8`]: super::from_utf8
159///
160/// # Examples
161///
162/// This can be used to create functionality similar to
163/// [`String::from_utf8_lossy`] without allocating heap memory:
164///
165/// ```
166/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
167///     for chunk in input.utf8_chunks() {
168///         push(chunk.valid());
169///
170///         if !chunk.invalid().is_empty() {
171///             push("\u{FFFD}");
172///         }
173///     }
174/// }
175/// ```
176///
177/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
178#[must_use = "iterators are lazy and do nothing unless consumed"]
179#[stable(feature = "utf8_chunks", since = "1.79.0")]
180#[derive(Clone)]
181pub struct Utf8Chunks<'a> {
182    source: &'a [u8],
183}
184
185impl<'a> Utf8Chunks<'a> {
186    #[doc(hidden)]
187    #[unstable(feature = "str_internals", issue = "none")]
188    pub fn debug(&self) -> Debug<'_> {
189        Debug(self.source)
190    }
191}
192
193#[stable(feature = "utf8_chunks", since = "1.79.0")]
194impl<'a> Iterator for Utf8Chunks<'a> {
195    type Item = Utf8Chunk<'a>;
196
197    fn next(&mut self) -> Option<Utf8Chunk<'a>> {
198        if self.source.is_empty() {
199            return None;
200        }
201
202        const TAG_CONT_U8: u8 = 128;
203        fn safe_get(xs: &[u8], i: usize) -> u8 {
204            *xs.get(i).unwrap_or(&0)
205        }
206
207        let mut i = 0;
208        let mut valid_up_to = 0;
209        while i < self.source.len() {
210            // SAFETY: `i < self.source.len()` per previous line.
211            // For some reason the following are both significantly slower:
212            // while let Some(&byte) = self.source.get(i) {
213            // while let Some(byte) = self.source.get(i).copied() {
214            let byte = unsafe { *self.source.get_unchecked(i) };
215            i += 1;
216
217            if byte < 128 {
218                // This could be a `1 => ...` case in the match below, but for
219                // the common case of all-ASCII inputs, we bypass loading the
220                // sizeable UTF8_CHAR_WIDTH table into cache.
221            } else {
222                let w = utf8_char_width(byte);
223
224                match w {
225                    2 => {
226                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
227                            break;
228                        }
229                        i += 1;
230                    }
231                    3 => {
232                        match (byte, safe_get(self.source, i)) {
233                            (0xE0, 0xA0..=0xBF) => (),
234                            (0xE1..=0xEC, 0x80..=0xBF) => (),
235                            (0xED, 0x80..=0x9F) => (),
236                            (0xEE..=0xEF, 0x80..=0xBF) => (),
237                            _ => break,
238                        }
239                        i += 1;
240                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
241                            break;
242                        }
243                        i += 1;
244                    }
245                    4 => {
246                        match (byte, safe_get(self.source, i)) {
247                            (0xF0, 0x90..=0xBF) => (),
248                            (0xF1..=0xF3, 0x80..=0xBF) => (),
249                            (0xF4, 0x80..=0x8F) => (),
250                            _ => break,
251                        }
252                        i += 1;
253                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
254                            break;
255                        }
256                        i += 1;
257                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
258                            break;
259                        }
260                        i += 1;
261                    }
262                    _ => break,
263                }
264            }
265
266            valid_up_to = i;
267        }
268
269        // SAFETY: `i <= self.source.len()` because it is only ever incremented
270        // via `i += 1` and in between every single one of those increments, `i`
271        // is compared against `self.source.len()`. That happens either
272        // literally by `i < self.source.len()` in the while-loop's condition,
273        // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
274        // loop is terminated as soon as the latest `i += 1` has made `i` no
275        // longer less than `self.source.len()`, which means it'll be at most
276        // equal to `self.source.len()`.
277        let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
278        self.source = remaining;
279
280        // SAFETY: `valid_up_to <= i` because it is only ever assigned via
281        // `valid_up_to = i` and `i` only increases.
282        let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) };
283
284        Some(Utf8Chunk {
285            // SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
286            valid: unsafe { from_utf8_unchecked(valid) },
287            invalid,
288        })
289    }
290}
291
292#[stable(feature = "utf8_chunks", since = "1.79.0")]
293impl FusedIterator for Utf8Chunks<'_> {}
294
295#[stable(feature = "utf8_chunks", since = "1.79.0")]
296impl fmt::Debug for Utf8Chunks<'_> {
297    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
298        f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
299    }
300}