core/str/lossy.rs
1use super::from_utf8_unchecked;
2use super::validations::utf8_char_width;
3use crate::fmt;
4use crate::fmt::{Formatter, Write};
5use crate::iter::FusedIterator;
6
7impl [u8] {
8 /// Creates an iterator over the contiguous valid UTF-8 ranges of this
9 /// slice, and the non-UTF-8 fragments in between.
10 ///
11 /// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
12 ///
13 /// # Examples
14 ///
15 /// This function formats arbitrary but mostly-UTF-8 bytes into Rust source
16 /// code in the form of a C-string literal (`c"..."`).
17 ///
18 /// ```
19 /// use std::fmt::Write as _;
20 ///
21 /// pub fn cstr_literal(bytes: &[u8]) -> String {
22 /// let mut repr = String::new();
23 /// repr.push_str("c\"");
24 /// for chunk in bytes.utf8_chunks() {
25 /// for ch in chunk.valid().chars() {
26 /// // Escapes \0, \t, \r, \n, \\, \', \", and uses \u{...} for non-printable characters.
27 /// write!(repr, "{}", ch.escape_debug()).unwrap();
28 /// }
29 /// for byte in chunk.invalid() {
30 /// write!(repr, "\\x{:02X}", byte).unwrap();
31 /// }
32 /// }
33 /// repr.push('"');
34 /// repr
35 /// }
36 ///
37 /// fn main() {
38 /// let lit = cstr_literal(b"\xferris the \xf0\x9f\xa6\x80\x07");
39 /// let expected = stringify!(c"\xFErris the 🦀\u{7}");
40 /// assert_eq!(lit, expected);
41 /// }
42 /// ```
43 #[stable(feature = "utf8_chunks", since = "1.79.0")]
44 pub fn utf8_chunks(&self) -> Utf8Chunks<'_> {
45 Utf8Chunks { source: self }
46 }
47}
48
49/// An item returned by the [`Utf8Chunks`] iterator.
50///
51/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
52/// when decoding a UTF-8 string.
53///
54/// # Examples
55///
56/// ```
57/// // An invalid UTF-8 string
58/// let bytes = b"foo\xF1\x80bar";
59///
60/// // Decode the first `Utf8Chunk`
61/// let chunk = bytes.utf8_chunks().next().unwrap();
62///
63/// // The first three characters are valid UTF-8
64/// assert_eq!("foo", chunk.valid());
65///
66/// // The fourth character is broken
67/// assert_eq!(b"\xF1\x80", chunk.invalid());
68/// ```
69#[stable(feature = "utf8_chunks", since = "1.79.0")]
70#[derive(Clone, Debug, PartialEq, Eq)]
71pub struct Utf8Chunk<'a> {
72 valid: &'a str,
73 invalid: &'a [u8],
74}
75
76impl<'a> Utf8Chunk<'a> {
77 /// Returns the next validated UTF-8 substring.
78 ///
79 /// This substring can be empty at the start of the string or between
80 /// broken UTF-8 characters.
81 #[must_use]
82 #[stable(feature = "utf8_chunks", since = "1.79.0")]
83 pub fn valid(&self) -> &'a str {
84 self.valid
85 }
86
87 /// Returns the invalid sequence that caused a failure.
88 ///
89 /// The returned slice will have a maximum length of 3 and starts after the
90 /// substring given by [`valid`]. Decoding will resume after this sequence.
91 ///
92 /// If empty, this is the last chunk in the string. If non-empty, an
93 /// unexpected byte was encountered or the end of the input was reached
94 /// unexpectedly.
95 ///
96 /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT
97 /// CHARACTER`].
98 ///
99 /// [`valid`]: Self::valid
100 /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
101 #[must_use]
102 #[stable(feature = "utf8_chunks", since = "1.79.0")]
103 pub fn invalid(&self) -> &'a [u8] {
104 self.invalid
105 }
106}
107
108#[must_use]
109#[unstable(feature = "str_internals", issue = "none")]
110pub struct Debug<'a>(&'a [u8]);
111
112#[unstable(feature = "str_internals", issue = "none")]
113impl fmt::Debug for Debug<'_> {
114 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
115 f.write_char('"')?;
116
117 for chunk in self.0.utf8_chunks() {
118 // Valid part.
119 // Here we partially parse UTF-8 again which is suboptimal.
120 {
121 let valid = chunk.valid();
122 let mut from = 0;
123 for (i, c) in valid.char_indices() {
124 let esc = c.escape_debug();
125 // If char needs escaping, flush backlog so far and write, else skip
126 if esc.len() != 1 {
127 f.write_str(&valid[from..i])?;
128 for c in esc {
129 f.write_char(c)?;
130 }
131 from = i + c.len_utf8();
132 }
133 }
134 f.write_str(&valid[from..])?;
135 }
136
137 // Broken parts of string as hex escape.
138 for &b in chunk.invalid() {
139 write!(f, "\\x{:02X}", b)?;
140 }
141 }
142
143 f.write_char('"')
144 }
145}
146
147/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
148/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
149///
150/// This struct is created by the [`utf8_chunks`] method on bytes slices.
151/// If you want a simple conversion from UTF-8 byte slices to string slices,
152/// [`from_utf8`] is easier to use.
153///
154/// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
155///
156/// [byteslice]: slice
157/// [`utf8_chunks`]: slice::utf8_chunks
158/// [`from_utf8`]: super::from_utf8
159///
160/// # Examples
161///
162/// This can be used to create functionality similar to
163/// [`String::from_utf8_lossy`] without allocating heap memory:
164///
165/// ```
166/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
167/// for chunk in input.utf8_chunks() {
168/// push(chunk.valid());
169///
170/// if !chunk.invalid().is_empty() {
171/// push("\u{FFFD}");
172/// }
173/// }
174/// }
175/// ```
176///
177/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
178#[must_use = "iterators are lazy and do nothing unless consumed"]
179#[stable(feature = "utf8_chunks", since = "1.79.0")]
180#[derive(Clone)]
181pub struct Utf8Chunks<'a> {
182 source: &'a [u8],
183}
184
185impl<'a> Utf8Chunks<'a> {
186 #[doc(hidden)]
187 #[unstable(feature = "str_internals", issue = "none")]
188 pub fn debug(&self) -> Debug<'_> {
189 Debug(self.source)
190 }
191}
192
193#[stable(feature = "utf8_chunks", since = "1.79.0")]
194impl<'a> Iterator for Utf8Chunks<'a> {
195 type Item = Utf8Chunk<'a>;
196
197 fn next(&mut self) -> Option<Utf8Chunk<'a>> {
198 if self.source.is_empty() {
199 return None;
200 }
201
202 const TAG_CONT_U8: u8 = 128;
203 fn safe_get(xs: &[u8], i: usize) -> u8 {
204 *xs.get(i).unwrap_or(&0)
205 }
206
207 let mut i = 0;
208 let mut valid_up_to = 0;
209 while i < self.source.len() {
210 // SAFETY: `i < self.source.len()` per previous line.
211 // For some reason the following are both significantly slower:
212 // while let Some(&byte) = self.source.get(i) {
213 // while let Some(byte) = self.source.get(i).copied() {
214 let byte = unsafe { *self.source.get_unchecked(i) };
215 i += 1;
216
217 if byte < 128 {
218 // This could be a `1 => ...` case in the match below, but for
219 // the common case of all-ASCII inputs, we bypass loading the
220 // sizeable UTF8_CHAR_WIDTH table into cache.
221 } else {
222 let w = utf8_char_width(byte);
223
224 match w {
225 2 => {
226 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
227 break;
228 }
229 i += 1;
230 }
231 3 => {
232 match (byte, safe_get(self.source, i)) {
233 (0xE0, 0xA0..=0xBF) => (),
234 (0xE1..=0xEC, 0x80..=0xBF) => (),
235 (0xED, 0x80..=0x9F) => (),
236 (0xEE..=0xEF, 0x80..=0xBF) => (),
237 _ => break,
238 }
239 i += 1;
240 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
241 break;
242 }
243 i += 1;
244 }
245 4 => {
246 match (byte, safe_get(self.source, i)) {
247 (0xF0, 0x90..=0xBF) => (),
248 (0xF1..=0xF3, 0x80..=0xBF) => (),
249 (0xF4, 0x80..=0x8F) => (),
250 _ => break,
251 }
252 i += 1;
253 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
254 break;
255 }
256 i += 1;
257 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
258 break;
259 }
260 i += 1;
261 }
262 _ => break,
263 }
264 }
265
266 valid_up_to = i;
267 }
268
269 // SAFETY: `i <= self.source.len()` because it is only ever incremented
270 // via `i += 1` and in between every single one of those increments, `i`
271 // is compared against `self.source.len()`. That happens either
272 // literally by `i < self.source.len()` in the while-loop's condition,
273 // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
274 // loop is terminated as soon as the latest `i += 1` has made `i` no
275 // longer less than `self.source.len()`, which means it'll be at most
276 // equal to `self.source.len()`.
277 let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
278 self.source = remaining;
279
280 // SAFETY: `valid_up_to <= i` because it is only ever assigned via
281 // `valid_up_to = i` and `i` only increases.
282 let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) };
283
284 Some(Utf8Chunk {
285 // SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
286 valid: unsafe { from_utf8_unchecked(valid) },
287 invalid,
288 })
289 }
290}
291
292#[stable(feature = "utf8_chunks", since = "1.79.0")]
293impl FusedIterator for Utf8Chunks<'_> {}
294
295#[stable(feature = "utf8_chunks", since = "1.79.0")]
296impl fmt::Debug for Utf8Chunks<'_> {
297 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
298 f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
299 }
300}