// rustc_ast/util/unicode.rs

/// Unicode bidirectional formatting characters: the explicit embeddings, overrides and
/// isolates, followed by the two characters that terminate them.
///
/// Element order is significant to anyone indexing or iterating this slice, so it is kept
/// exactly as originally declared (openers first, terminators last).
pub const TEXT_FLOW_CONTROL_CHARS: &[char] = &[
    '\u{202A}', // LEFT-TO-RIGHT EMBEDDING (LRE)
    '\u{202B}', // RIGHT-TO-LEFT EMBEDDING (RLE)
    '\u{202D}', // LEFT-TO-RIGHT OVERRIDE (LRO)
    '\u{202E}', // RIGHT-TO-LEFT OVERRIDE (RLO)
    '\u{2066}', // LEFT-TO-RIGHT ISOLATE (LRI)
    '\u{2067}', // RIGHT-TO-LEFT ISOLATE (RLI)
    '\u{2068}', // FIRST STRONG ISOLATE (FSI)
    '\u{202C}', // POP DIRECTIONAL FORMATTING (PDF)
    '\u{2069}', // POP DIRECTIONAL ISOLATE (PDI)
];

/// Returns `true` if `s` contains at least one of the bidirectional text-flow control
/// characters U+202A..=U+202E or U+2066..=U+2069, and `false` otherwise.
///
/// The search works on the raw UTF-8 bytes rather than decoding `char`s: every one of the
/// searched code points starts with the byte `0xE2`, so `memchr` is used to jump from one
/// candidate to the next.
#[inline]
pub fn contains_text_flow_control_chars(s: &str) -> bool {
    // UTF-8 encodings of the characters being searched for:
    //   U+202A..=U+202E  ->  E2 80 AA..=AE
    //   U+2066..=U+2069  ->  E2 81 A6..=A9
    let mut rest = s.as_bytes();
    while let Some(pos) = memchr::memchr(0xE2, rest) {
        // `s` is valid UTF-8 and 0xE2 opens a three-byte sequence, so the two
        // continuation bytes after it are always present.
        let tail = &rest[pos + 1..pos + 3];
        if matches!(tail, [0x80, 0xAA..=0xAE] | [0x81, 0xA6..=0xA9]) {
            return true;
        }
        // Not one of ours: skip the whole three-byte character and keep looking.
        rest = &rest[pos + 3..];
    }
    false
}