core/str/
validations.rs

1//! Operations related to UTF-8 validation.
2
3use super::Utf8Error;
4use crate::intrinsics::const_eval_select;
5use crate::mem;
6
7/// Returns the initial codepoint accumulator for the first byte.
8/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
9/// for width 3, and 3 bits for width 4.
10#[inline]
11const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
12    (byte & (0x7F >> width)) as u32
13}
14
15/// Returns the value of `ch` updated with continuation byte `byte`.
16#[inline]
17const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
18    (ch << 6) | (byte & CONT_MASK) as u32
19}
20
21/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
22/// bits `10`).
23#[inline]
24pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
25    (byte as i8) < -64
26}
27
28/// Reads the next code point out of a byte iterator (assuming a
29/// UTF-8-like encoding).
30///
31/// # Safety
32///
33/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
34#[unstable(feature = "str_internals", issue = "none")]
35#[inline]
36pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
37    // Decode UTF-8
38    let x = *bytes.next()?;
39    if x < 128 {
40        return Some(x as u32);
41    }
42
43    // Multibyte case follows
44    // Decode from a byte combination out of: [[[x y] z] w]
45    // NOTE: Performance is sensitive to the exact formulation here
46    let init = utf8_first_byte(x, 2);
47    // SAFETY: `bytes` produces an UTF-8-like string,
48    // so the iterator must produce a value here.
49    let y = unsafe { *bytes.next().unwrap_unchecked() };
50    let mut ch = utf8_acc_cont_byte(init, y);
51    if x >= 0xE0 {
52        // [[x y z] w] case
53        // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
54        // SAFETY: `bytes` produces an UTF-8-like string,
55        // so the iterator must produce a value here.
56        let z = unsafe { *bytes.next().unwrap_unchecked() };
57        let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
58        ch = init << 12 | y_z;
59        if x >= 0xF0 {
60            // [x y z w] case
61            // use only the lower 3 bits of `init`
62            // SAFETY: `bytes` produces an UTF-8-like string,
63            // so the iterator must produce a value here.
64            let w = unsafe { *bytes.next().unwrap_unchecked() };
65            ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
66        }
67    }
68
69    Some(ch)
70}
71
72/// Reads the last code point out of a byte iterator (assuming a
73/// UTF-8-like encoding).
74///
75/// # Safety
76///
77/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
78#[inline]
79pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
80where
81    I: DoubleEndedIterator<Item = &'a u8>,
82{
83    // Decode UTF-8
84    let w = match *bytes.next_back()? {
85        next_byte if next_byte < 128 => return Some(next_byte as u32),
86        back_byte => back_byte,
87    };
88
89    // Multibyte case follows
90    // Decode from a byte combination out of: [x [y [z w]]]
91    let mut ch;
92    // SAFETY: `bytes` produces an UTF-8-like string,
93    // so the iterator must produce a value here.
94    let z = unsafe { *bytes.next_back().unwrap_unchecked() };
95    ch = utf8_first_byte(z, 2);
96    if utf8_is_cont_byte(z) {
97        // SAFETY: `bytes` produces an UTF-8-like string,
98        // so the iterator must produce a value here.
99        let y = unsafe { *bytes.next_back().unwrap_unchecked() };
100        ch = utf8_first_byte(y, 3);
101        if utf8_is_cont_byte(y) {
102            // SAFETY: `bytes` produces an UTF-8-like string,
103            // so the iterator must produce a value here.
104            let x = unsafe { *bytes.next_back().unwrap_unchecked() };
105            ch = utf8_first_byte(x, 4);
106            ch = utf8_acc_cont_byte(ch, y);
107        }
108        ch = utf8_acc_cont_byte(ch, z);
109    }
110    ch = utf8_acc_cont_byte(ch, w);
111
112    Some(ch)
113}
114
115const NONASCII_MASK: usize = usize::repeat_u8(0x80);
116
117/// Returns `true` if any byte in the word `x` is nonascii (>= 128).
118#[inline]
119const fn contains_nonascii(x: usize) -> bool {
120    (x & NONASCII_MASK) != 0
121}
122
123/// Walks through `v` checking that it's a valid UTF-8 sequence,
124/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
125#[inline(always)]
126#[rustc_allow_const_fn_unstable(const_eval_select)] // fallback impl has same behavior
127pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
128    let mut index = 0;
129    let len = v.len();
130
131    const USIZE_BYTES: usize = mem::size_of::<usize>();
132
133    let ascii_block_size = 2 * USIZE_BYTES;
134    let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };
135    // Below, we safely fall back to a slower codepath if the offset is `usize::MAX`,
136    // so the end-to-end behavior is the same at compiletime and runtime.
137    let align = const_eval_select!(
138        @capture { v: &[u8] } -> usize:
139        if const {
140            usize::MAX
141        } else {
142            v.as_ptr().align_offset(USIZE_BYTES)
143        }
144    );
145
146    while index < len {
147        let old_offset = index;
148        macro_rules! err {
149            ($error_len: expr) => {
150                return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len })
151            };
152        }
153
154        macro_rules! next {
155            () => {{
156                index += 1;
157                // we needed data, but there was none: error!
158                if index >= len {
159                    err!(None)
160                }
161                v[index]
162            }};
163        }
164
165        let first = v[index];
166        if first >= 128 {
167            let w = utf8_char_width(first);
168            // 2-byte encoding is for codepoints  \u{0080} to  \u{07ff}
169            //        first  C2 80        last DF BF
170            // 3-byte encoding is for codepoints  \u{0800} to  \u{ffff}
171            //        first  E0 A0 80     last EF BF BF
172            //   excluding surrogates codepoints  \u{d800} to  \u{dfff}
173            //               ED A0 80 to       ED BF BF
174            // 4-byte encoding is for codepoints \u{10000} to \u{10ffff}
175            //        first  F0 90 80 80  last F4 8F BF BF
176            //
177            // Use the UTF-8 syntax from the RFC
178            //
179            // https://tools.ietf.org/html/rfc3629
180            // UTF8-1      = %x00-7F
181            // UTF8-2      = %xC2-DF UTF8-tail
182            // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
183            //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
184            // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
185            //               %xF4 %x80-8F 2( UTF8-tail )
186            match w {
187                2 => {
188                    if next!() as i8 >= -64 {
189                        err!(Some(1))
190                    }
191                }
192                3 => {
193                    match (first, next!()) {
194                        (0xE0, 0xA0..=0xBF)
195                        | (0xE1..=0xEC, 0x80..=0xBF)
196                        | (0xED, 0x80..=0x9F)
197                        | (0xEE..=0xEF, 0x80..=0xBF) => {}
198                        _ => err!(Some(1)),
199                    }
200                    if next!() as i8 >= -64 {
201                        err!(Some(2))
202                    }
203                }
204                4 => {
205                    match (first, next!()) {
206                        (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
207                        _ => err!(Some(1)),
208                    }
209                    if next!() as i8 >= -64 {
210                        err!(Some(2))
211                    }
212                    if next!() as i8 >= -64 {
213                        err!(Some(3))
214                    }
215                }
216                _ => err!(Some(1)),
217            }
218            index += 1;
219        } else {
220            // Ascii case, try to skip forward quickly.
221            // When the pointer is aligned, read 2 words of data per iteration
222            // until we find a word containing a non-ascii byte.
223            if align != usize::MAX && align.wrapping_sub(index) % USIZE_BYTES == 0 {
224                let ptr = v.as_ptr();
225                while index < blocks_end {
226                    // SAFETY: since `align - index` and `ascii_block_size` are
227                    // multiples of `USIZE_BYTES`, `block = ptr.add(index)` is
228                    // always aligned with a `usize` so it's safe to dereference
229                    // both `block` and `block.add(1)`.
230                    unsafe {
231                        let block = ptr.add(index) as *const usize;
232                        // break if there is a nonascii byte
233                        let zu = contains_nonascii(*block);
234                        let zv = contains_nonascii(*block.add(1));
235                        if zu || zv {
236                            break;
237                        }
238                    }
239                    index += ascii_block_size;
240                }
241                // step from the point where the wordwise loop stopped
242                while index < len && v[index] < 128 {
243                    index += 1;
244                }
245            } else {
246                index += 1;
247            }
248        }
249    }
250
251    Ok(())
252}
253
254// https://tools.ietf.org/html/rfc3629
255const UTF8_CHAR_WIDTH: &[u8; 256] = &[
256    // 1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
257    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0
258    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
259    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
260    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
261    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
262    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
263    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
264    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
265    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
266    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
267    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
268    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
269    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C
270    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
271    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E
272    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F
273];
274
275/// Given a first byte, determines how many bytes are in this UTF-8 character.
276#[unstable(feature = "str_internals", issue = "none")]
277#[must_use]
278#[inline]
279pub const fn utf8_char_width(b: u8) -> usize {
280    UTF8_CHAR_WIDTH[b as usize] as usize
281}
282
283/// Mask of the value bits of a continuation byte.
284const CONT_MASK: u8 = 0b0011_1111;