core/str/lossy.rs
1use super::from_utf8_unchecked;
2use super::validations::utf8_char_width;
3use crate::fmt;
4use crate::fmt::{Formatter, Write};
5use crate::iter::FusedIterator;
6
7impl [u8] {
8 /// Creates an iterator over the contiguous valid UTF-8 ranges of this
9 /// slice, and the non-UTF-8 fragments in between.
10 ///
11 /// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
12 ///
13 /// # Examples
14 ///
15 /// This function formats arbitrary but mostly-UTF-8 bytes into Rust source
16 /// code in the form of a C-string literal (`c"..."`).
17 ///
18 /// ```
19 /// use std::fmt::Write as _;
20 ///
21 /// pub fn cstr_literal(bytes: &[u8]) -> String {
22 /// let mut repr = String::new();
23 /// repr.push_str("c\"");
24 /// for chunk in bytes.utf8_chunks() {
25 /// for ch in chunk.valid().chars() {
26 /// // Escapes \0, \t, \r, \n, \\, \', \", and uses \u{...} for non-printable characters.
27 /// write!(repr, "{}", ch.escape_debug()).unwrap();
28 /// }
29 /// for byte in chunk.invalid() {
30 /// write!(repr, "\\x{:02X}", byte).unwrap();
31 /// }
32 /// }
33 /// repr.push('"');
34 /// repr
35 /// }
36 ///
37 /// fn main() {
38 /// let lit = cstr_literal(b"\xferris the \xf0\x9f\xa6\x80\x07");
39 /// let expected = stringify!(c"\xFErris the 🦀\u{7}");
40 /// assert_eq!(lit, expected);
41 /// }
42 /// ```
43 #[stable(feature = "utf8_chunks", since = "1.79.0")]
44 pub fn utf8_chunks(&self) -> Utf8Chunks<'_> {
45 Utf8Chunks { source: self }
46 }
47}
48
49/// An item returned by the [`Utf8Chunks`] iterator.
50///
51/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
52/// when decoding a UTF-8 string.
53///
54/// # Examples
55///
56/// ```
57/// // An invalid UTF-8 string
58/// let bytes = b"foo\xF1\x80bar";
59///
60/// // Decode the first `Utf8Chunk`
61/// let chunk = bytes.utf8_chunks().next().unwrap();
62///
63/// // The first three characters are valid UTF-8
64/// assert_eq!("foo", chunk.valid());
65///
66/// // The fourth character is broken
67/// assert_eq!(b"\xF1\x80", chunk.invalid());
68/// ```
69#[stable(feature = "utf8_chunks", since = "1.79.0")]
70#[derive(Clone, Debug, PartialEq, Eq)]
71pub struct Utf8Chunk<'a> {
72 valid: &'a str,
73 invalid: &'a [u8],
74}
75
76impl<'a> Utf8Chunk<'a> {
77 /// Returns the next validated UTF-8 substring.
78 ///
79 /// This substring can be empty at the start of the string or between
80 /// broken UTF-8 characters.
81 #[must_use]
82 #[stable(feature = "utf8_chunks", since = "1.79.0")]
83 pub fn valid(&self) -> &'a str {
84 self.valid
85 }
86
87 /// Returns the invalid sequence that caused a failure.
88 ///
89 /// The returned slice will have a maximum length of 3 and starts after the
90 /// substring given by [`valid`]. Decoding will resume after this sequence.
91 ///
92 /// If empty, this is the last chunk in the string. If non-empty, an
93 /// unexpected byte was encountered or the end of the input was reached
94 /// unexpectedly.
95 ///
96 /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT
97 /// CHARACTER`].
98 ///
99 /// [`valid`]: Self::valid
100 /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
101 #[must_use]
102 #[stable(feature = "utf8_chunks", since = "1.79.0")]
103 pub fn invalid(&self) -> &'a [u8] {
104 self.invalid
105 }
106}
107
108#[must_use]
109#[unstable(feature = "str_internals", issue = "none")]
110pub struct Debug<'a>(&'a [u8]);
111
112#[unstable(feature = "str_internals", issue = "none")]
113impl fmt::Debug for Debug<'_> {
114 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
115 f.write_char('"')?;
116
117 for chunk in self.0.utf8_chunks() {
118 // Valid part.
119 // Here we partially parse UTF-8 again which is suboptimal.
120 {
121 let valid = chunk.valid();
122 let mut from = 0;
123 for (i, c) in valid.char_indices() {
124 let esc = c.escape_debug();
125 // If char needs escaping, flush backlog so far and write, else skip
126 if esc.len() != 1 {
127 f.write_str(&valid[from..i])?;
128 for c in esc {
129 f.write_char(c)?;
130 }
131 from = i + c.len_utf8();
132 }
133 }
134 f.write_str(&valid[from..])?;
135 }
136
137 // Broken parts of string as hex escape.
138 for &b in chunk.invalid() {
139 write!(f, "\\x{:02X}", b)?;
140 }
141 }
142
143 f.write_char('"')
144 }
145}
146
147/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
148/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
149///
150/// If you want a simple conversion from UTF-8 byte slices to string slices,
151/// [`from_utf8`] is easier to use.
152///
153/// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
154///
155/// [byteslice]: slice
156/// [`from_utf8`]: super::from_utf8
157///
158/// # Examples
159///
160/// This can be used to create functionality similar to
161/// [`String::from_utf8_lossy`] without allocating heap memory:
162///
163/// ```
164/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
165/// for chunk in input.utf8_chunks() {
166/// push(chunk.valid());
167///
168/// if !chunk.invalid().is_empty() {
169/// push("\u{FFFD}");
170/// }
171/// }
172/// }
173/// ```
174///
175/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
176#[must_use = "iterators are lazy and do nothing unless consumed"]
177#[stable(feature = "utf8_chunks", since = "1.79.0")]
178#[derive(Clone)]
179pub struct Utf8Chunks<'a> {
180 source: &'a [u8],
181}
182
183impl<'a> Utf8Chunks<'a> {
184 #[doc(hidden)]
185 #[unstable(feature = "str_internals", issue = "none")]
186 pub fn debug(&self) -> Debug<'_> {
187 Debug(self.source)
188 }
189}
190
191#[stable(feature = "utf8_chunks", since = "1.79.0")]
192impl<'a> Iterator for Utf8Chunks<'a> {
193 type Item = Utf8Chunk<'a>;
194
195 fn next(&mut self) -> Option<Utf8Chunk<'a>> {
196 if self.source.is_empty() {
197 return None;
198 }
199
200 const TAG_CONT_U8: u8 = 128;
201 fn safe_get(xs: &[u8], i: usize) -> u8 {
202 *xs.get(i).unwrap_or(&0)
203 }
204
205 let mut i = 0;
206 let mut valid_up_to = 0;
207 while i < self.source.len() {
208 // SAFETY: `i < self.source.len()` per previous line.
209 // For some reason the following are both significantly slower:
210 // while let Some(&byte) = self.source.get(i) {
211 // while let Some(byte) = self.source.get(i).copied() {
212 let byte = unsafe { *self.source.get_unchecked(i) };
213 i += 1;
214
215 if byte < 128 {
216 // This could be a `1 => ...` case in the match below, but for
217 // the common case of all-ASCII inputs, we bypass loading the
218 // sizeable UTF8_CHAR_WIDTH table into cache.
219 } else {
220 let w = utf8_char_width(byte);
221
222 match w {
223 2 => {
224 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
225 break;
226 }
227 i += 1;
228 }
229 3 => {
230 match (byte, safe_get(self.source, i)) {
231 (0xE0, 0xA0..=0xBF) => (),
232 (0xE1..=0xEC, 0x80..=0xBF) => (),
233 (0xED, 0x80..=0x9F) => (),
234 (0xEE..=0xEF, 0x80..=0xBF) => (),
235 _ => break,
236 }
237 i += 1;
238 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
239 break;
240 }
241 i += 1;
242 }
243 4 => {
244 match (byte, safe_get(self.source, i)) {
245 (0xF0, 0x90..=0xBF) => (),
246 (0xF1..=0xF3, 0x80..=0xBF) => (),
247 (0xF4, 0x80..=0x8F) => (),
248 _ => break,
249 }
250 i += 1;
251 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
252 break;
253 }
254 i += 1;
255 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
256 break;
257 }
258 i += 1;
259 }
260 _ => break,
261 }
262 }
263
264 valid_up_to = i;
265 }
266
267 // SAFETY: `i <= self.source.len()` because it is only ever incremented
268 // via `i += 1` and in between every single one of those increments, `i`
269 // is compared against `self.source.len()`. That happens either
270 // literally by `i < self.source.len()` in the while-loop's condition,
271 // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
272 // loop is terminated as soon as the latest `i += 1` has made `i` no
273 // longer less than `self.source.len()`, which means it'll be at most
274 // equal to `self.source.len()`.
275 let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
276 self.source = remaining;
277
278 // SAFETY: `valid_up_to <= i` because it is only ever assigned via
279 // `valid_up_to = i` and `i` only increases.
280 let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) };
281
282 Some(Utf8Chunk {
283 // SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
284 valid: unsafe { from_utf8_unchecked(valid) },
285 invalid,
286 })
287 }
288}
289
290#[stable(feature = "utf8_chunks", since = "1.79.0")]
291impl FusedIterator for Utf8Chunks<'_> {}
292
293#[stable(feature = "utf8_chunks", since = "1.79.0")]
294impl fmt::Debug for Utf8Chunks<'_> {
295 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
296 f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
297 }
298}