// rustc_lexer/lib.rs

1//! Low-level Rust lexer.
2//!
3//! The idea with `rustc_lexer` is to make a reusable library,
4//! by separating out pure lexing and rustc-specific concerns, like spans,
5//! error reporting, and interning. So, rustc_lexer operates directly on `&str`,
6//! produces simple tokens which are a pair of type-tag and a bit of original text,
7//! and does not report errors, instead storing them as flags on the token.
8//!
9//! Tokens produced by this lexer are not yet ready for parsing the Rust syntax.
10//! For that see [`rustc_parse::lexer`], which converts this basic token stream
//! into wide tokens used by the actual parser.
12//!
13//! The purpose of this crate is to convert raw sources into a labeled sequence
14//! of well-known token types, so building an actual Rust token stream will
15//! be easier.
16//!
17//! The main entity of this crate is the [`TokenKind`] enum which represents common
18//! lexeme types.
19//!
20//! [`rustc_parse::lexer`]: ../rustc_parse/lexer/index.html
21
22// tidy-alphabetical-start
23// We want to be able to build this crate with a stable compiler,
24// so no `#![feature]` attributes should be added.
25#![deny(unstable_features)]
26#![warn(unreachable_pub)]
27// tidy-alphabetical-end
28
29mod cursor;
30pub mod unescape;
31
32#[cfg(test)]
33mod tests;
34
35use unicode_properties::UnicodeEmoji;
36pub use unicode_xid::UNICODE_VERSION as UNICODE_XID_VERSION;
37
38use self::LiteralKind::*;
39use self::TokenKind::*;
40pub use crate::cursor::Cursor;
41use crate::cursor::EOF_CHAR;
42
/// Parsed token.
/// It doesn't contain information about data that has been parsed,
/// only the type of the token and its size.
#[derive(Debug)]
pub struct Token {
    /// The kind of lexeme this token represents.
    pub kind: TokenKind,
    /// Length of the token in the source text, as reported by
    /// `Cursor::pos_within_token`.
    // NOTE(review): appears to be a byte length — confirm against `cursor.rs`.
    pub len: u32,
}
51
52impl Token {
53    fn new(kind: TokenKind, len: u32) -> Token {
54        Token { kind, len }
55    }
56}
57
/// Enum representing common lexeme types.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
    /// A line comment, e.g. `// comment`.
    LineComment { doc_style: Option<DocStyle> },

    /// A block comment, e.g. `/* block comment */`.
    ///
    /// Block comments can be recursive, so a sequence like `/* /* */`
    /// will not be considered terminated and will result in a parsing error.
    BlockComment { doc_style: Option<DocStyle>, terminated: bool },

    /// Any whitespace character sequence.
    Whitespace,

    /// An identifier or keyword, e.g. `ident` or `continue`.
    Ident,

    /// An identifier that is invalid because it contains emoji.
    InvalidIdent,

    /// A raw identifier, e.g. `r#ident`.
    RawIdent,

    /// An unknown literal prefix, like `foo#`, `foo'`, `foo"`. Excludes
    /// literal prefixes that contain emoji, which are considered "invalid".
    ///
    /// Note that only the
    /// prefix (`foo`) is included in the token, not the separator (which is
    /// lexed as its own distinct token). In Rust 2021 and later, reserved
    /// prefixes are reported as errors; in earlier editions, they result in a
    /// (allowed by default) lint, and are treated as regular identifier
    /// tokens.
    UnknownPrefix,

    /// An unknown prefix in a lifetime, like `'foo#`.
    ///
    /// Like `UnknownPrefix`, only the `'` and prefix are included in the token
    /// and not the separator.
    UnknownPrefixLifetime,

    /// A raw lifetime, e.g. `'r#foo`. In edition < 2021 it will be split into
    /// several tokens: `'r` and `#` and `foo`.
    RawLifetime,

    /// Guarded string literal prefix: `#"` or `##`.
    ///
    /// Used for reserving "guarded strings" (RFC 3598) in edition 2024.
    /// Split into the component tokens on older editions.
    GuardedStrPrefix,

    /// Literals, e.g. `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
    /// suffix, but may be present here on string and float literals. Users of
    /// this type will need to check for and reject that case.
    ///
    /// See [LiteralKind] for more details.
    Literal { kind: LiteralKind, suffix_start: u32 },

    /// A lifetime, e.g. `'a`.
    Lifetime { starts_with_number: bool },

    // One-character punctuation tokens follow.
    /// `;`
    Semi,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `(`
    OpenParen,
    /// `)`
    CloseParen,
    /// `{`
    OpenBrace,
    /// `}`
    CloseBrace,
    /// `[`
    OpenBracket,
    /// `]`
    CloseBracket,
    /// `@`
    At,
    /// `#`
    Pound,
    /// `~`
    Tilde,
    /// `?`
    Question,
    /// `:`
    Colon,
    /// `$`
    Dollar,
    /// `=`
    Eq,
    /// `!`
    Bang,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `-`
    Minus,
    /// `&`
    And,
    /// `|`
    Or,
    /// `+`
    Plus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `^`
    Caret,
    /// `%`
    Percent,

    /// Unknown token, not expected by the lexer, e.g. "№"
    Unknown,

    /// End of input.
    Eof,
}
180
/// Placement of a doc comment relative to the item it documents.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DocStyle {
    /// `///` or `/** ... */`: documents the item that follows.
    Outer,
    /// `//!` or `/*! ... */`: documents the enclosing item.
    Inner,
}
186
/// Enum representing the literal types supported by the lexer.
///
/// Note that the suffix is *not* considered when deciding the `LiteralKind` in
/// this type. This means that float literals like `1f32` are classified by this
/// type as `Int`. (Compare against `rustc_ast::token::LitKind` and
/// `rustc_ast::ast::LitKind`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
    /// `12_u8`, `0o100`, `0b120i99`, `1f32`.
    Int { base: Base, empty_int: bool },
    /// `12.34f32`, `1e3`, but not `1f32`.
    Float { base: Base, empty_exponent: bool },
    /// `'a'`, `'\\'`, `'''`, `';`
    Char { terminated: bool },
    /// `b'a'`, `b'\\'`, `b'''`, `b';`
    Byte { terminated: bool },
    /// `"abc"`, `"abc`
    Str { terminated: bool },
    /// `b"abc"`, `b"abc`
    ByteStr { terminated: bool },
    /// `c"abc"`, `c"abc`
    CStr { terminated: bool },
    /// `r"abc"`, `r#"abc"#`, `r####"ab"###"c"####`, `r#"a`. `None` indicates
    /// an invalid literal.
    RawStr { n_hashes: Option<u8> },
    /// `br"abc"`, `br#"abc"#`, `br####"ab"###"c"####`, `br#"a`. `None`
    /// indicates an invalid literal.
    RawByteStr { n_hashes: Option<u8> },
    /// `cr"abc"`, `cr#"abc"#`, `cr#"a`. `None` indicates an invalid literal.
    RawCStr { n_hashes: Option<u8> },
}
218
/// `#"abc"#`, `##"a"` (fewer closing), or even `#"a` (unterminated).
///
/// Can capture fewer closing hashes than starting hashes,
/// for more efficient lexing and better backwards diagnostics.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct GuardedStr {
    /// Number of leading `#` characters before the opening `"`.
    pub n_hashes: u32,
    /// Whether the string body reached a closing `"`.
    pub terminated: bool,
    /// Total length of the guarded string token.
    pub token_len: u32,
}
229
/// Errors that can occur while lexing a raw string literal.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum RawStrError {
    /// Non `#` characters exist between `r` and `"`, e.g. `r##~"abcde"##`
    InvalidStarter { bad_char: char },
    /// The string was not terminated, e.g. `r###"abcde"##`.
    /// `possible_terminator_offset` is the number of characters after `r` or
    /// `br` where they may have intended to terminate it.
    NoTerminator { expected: u32, found: u32, possible_terminator_offset: Option<u32> },
    /// More than 255 `#`s exist.
    TooManyDelimiters { found: u32 },
}
241
/// Base of numeric literal encoding according to its prefix.
///
/// The discriminant of each variant is the radix itself.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Base {
    /// Literal starts with `0b`.
    Binary = 2,
    /// Literal starts with `0o`.
    Octal = 8,
    /// Literal doesn't contain a prefix.
    Decimal = 10,
    /// Literal starts with `0x`.
    Hexadecimal = 16,
}
254
255/// `rustc` allows files to have a shebang, e.g. "#!/usr/bin/rustrun",
256/// but shebang isn't a part of rust syntax.
257pub fn strip_shebang(input: &str) -> Option<usize> {
258    // Shebang must start with `#!` literally, without any preceding whitespace.
259    // For simplicity we consider any line starting with `#!` a shebang,
260    // regardless of restrictions put on shebangs by specific platforms.
261    if let Some(input_tail) = input.strip_prefix("#!") {
262        // Ok, this is a shebang but if the next non-whitespace token is `[`,
263        // then it may be valid Rust code, so consider it Rust code.
264        let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok| {
265            !matches!(
266                tok,
267                TokenKind::Whitespace
268                    | TokenKind::LineComment { doc_style: None }
269                    | TokenKind::BlockComment { doc_style: None, .. }
270            )
271        });
272        if next_non_whitespace_token != Some(TokenKind::OpenBracket) {
273            // No other choice than to consider this a shebang.
274            return Some(2 + input_tail.lines().next().unwrap_or_default().len());
275        }
276    }
277    None
278}
279
280/// Validates a raw string literal. Used for getting more information about a
281/// problem with a `RawStr`/`RawByteStr` with a `None` field.
282#[inline]
283pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> {
284    debug_assert!(!input.is_empty());
285    let mut cursor = Cursor::new(input);
286    // Move past the leading `r` or `br`.
287    for _ in 0..prefix_len {
288        cursor.bump().unwrap();
289    }
290    cursor.raw_double_quoted_string(prefix_len).map(|_| ())
291}
292
293/// Creates an iterator that produces tokens from the input string.
294pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
295    let mut cursor = Cursor::new(input);
296    std::iter::from_fn(move || {
297        let token = cursor.advance_token();
298        if token.kind != TokenKind::Eof { Some(token) } else { None }
299    })
300}
301
/// True if `c` is considered a whitespace according to Rust language definition.
/// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for definitions of these classes.
pub fn is_whitespace(c: char) -> bool {
    // This is Pattern_White_Space.
    //
    // Note that this set is stable (ie, it doesn't change with different
    // Unicode versions), so it's ok to just hard-code the values.
    match c {
        // Usual ASCII suspects: \t \n vertical-tab form-feed \r space.
        '\u{0009}' | '\u{000A}' | '\u{000B}' | '\u{000C}' | '\u{000D}' | '\u{0020}' => true,
        // NEXT LINE from latin1.
        '\u{0085}' => true,
        // Bidi markers: LEFT-TO-RIGHT MARK and RIGHT-TO-LEFT MARK.
        '\u{200E}' | '\u{200F}' => true,
        // Dedicated whitespace characters from Unicode:
        // LINE SEPARATOR and PARAGRAPH SEPARATOR.
        '\u{2028}' | '\u{2029}' => true,
        _ => false,
    }
}
333
334/// True if `c` is valid as a first character of an identifier.
335/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
336/// a formal definition of valid identifier name.
337pub fn is_id_start(c: char) -> bool {
338    // This is XID_Start OR '_' (which formally is not a XID_Start).
339    c == '_' || unicode_xid::UnicodeXID::is_xid_start(c)
340}
341
/// True if `c` is valid as a non-first character of an identifier.
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
/// a formal definition of valid identifier name.
pub fn is_id_continue(c: char) -> bool {
    // Unlike `is_id_start`, no special case for `_` is needed here.
    unicode_xid::UnicodeXID::is_xid_continue(c)
}
348
349/// The passed string is lexically an identifier.
350pub fn is_ident(string: &str) -> bool {
351    let mut chars = string.chars();
352    if let Some(start) = chars.next() {
353        is_id_start(start) && chars.all(is_id_continue)
354    } else {
355        false
356    }
357}
358
359impl Cursor<'_> {
    /// Parses a token from the input string.
    ///
    /// Returns a token of kind [`TokenKind::Eof`] with length 0 once the
    /// input is exhausted. The cursor's within-token position counter is
    /// reset before returning, so consecutive calls yield adjacent tokens.
    pub fn advance_token(&mut self) -> Token {
        let first_char = match self.bump() {
            Some(c) => c,
            None => return Token::new(TokenKind::Eof, 0),
        };
        let token_kind = match first_char {
            // Slash, comment or block comment.
            '/' => match self.first() {
                '/' => self.line_comment(),
                '*' => self.block_comment(),
                _ => Slash,
            },

            // Whitespace sequence.
            c if is_whitespace(c) => self.whitespace(),

            // Raw identifier, raw string literal or identifier.
            'r' => match (self.first(), self.second()) {
                ('#', c1) if is_id_start(c1) => self.raw_ident(),
                ('#', _) | ('"', _) => {
                    let res = self.raw_double_quoted_string(1);
                    let suffix_start = self.pos_within_token();
                    // Only a successfully-lexed raw string can carry a suffix.
                    if res.is_ok() {
                        self.eat_literal_suffix();
                    }
                    let kind = RawStr { n_hashes: res.ok() };
                    Literal { kind, suffix_start }
                }
                _ => self.ident_or_unknown_prefix(),
            },

            // Byte literal, byte string literal, raw byte string literal or identifier.
            'b' => self.c_or_byte_string(
                |terminated| ByteStr { terminated },
                |n_hashes| RawByteStr { n_hashes },
                Some(|terminated| Byte { terminated }),
            ),

            // c-string literal, raw c-string literal or identifier.
            // (No single-quoted form exists for `c`.)
            'c' => self.c_or_byte_string(
                |terminated| CStr { terminated },
                |n_hashes| RawCStr { n_hashes },
                None,
            ),

            // Identifier (this should be checked after other variant that can
            // start as identifier).
            c if is_id_start(c) => self.ident_or_unknown_prefix(),

            // Numeric literal.
            c @ '0'..='9' => {
                let literal_kind = self.number(c);
                let suffix_start = self.pos_within_token();
                self.eat_literal_suffix();
                TokenKind::Literal { kind: literal_kind, suffix_start }
            }

            // Guarded string literal prefix: `#"` or `##`
            '#' if matches!(self.first(), '"' | '#') => {
                self.bump();
                TokenKind::GuardedStrPrefix
            }

            // One-symbol tokens.
            ';' => Semi,
            ',' => Comma,
            '.' => Dot,
            '(' => OpenParen,
            ')' => CloseParen,
            '{' => OpenBrace,
            '}' => CloseBrace,
            '[' => OpenBracket,
            ']' => CloseBracket,
            '@' => At,
            '#' => Pound,
            '~' => Tilde,
            '?' => Question,
            ':' => Colon,
            '$' => Dollar,
            '=' => Eq,
            '!' => Bang,
            '<' => Lt,
            '>' => Gt,
            '-' => Minus,
            '&' => And,
            '|' => Or,
            '+' => Plus,
            '*' => Star,
            '^' => Caret,
            '%' => Percent,

            // Lifetime or character literal.
            '\'' => self.lifetime_or_char(),

            // String literal.
            '"' => {
                let terminated = self.double_quoted_string();
                let suffix_start = self.pos_within_token();
                if terminated {
                    self.eat_literal_suffix();
                }
                let kind = Str { terminated };
                Literal { kind, suffix_start }
            }
            // Identifier starting with an emoji. Only lexed for graceful error recovery.
            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
            _ => Unknown,
        };
        let res = Token::new(token_kind, self.pos_within_token());
        self.reset_pos_within_token();
        res
    }
473
474    fn line_comment(&mut self) -> TokenKind {
475        debug_assert!(self.prev() == '/' && self.first() == '/');
476        self.bump();
477
478        let doc_style = match self.first() {
479            // `//!` is an inner line doc comment.
480            '!' => Some(DocStyle::Inner),
481            // `////` (more than 3 slashes) is not considered a doc comment.
482            '/' if self.second() != '/' => Some(DocStyle::Outer),
483            _ => None,
484        };
485
486        self.eat_until(b'\n');
487        LineComment { doc_style }
488    }
489
490    fn block_comment(&mut self) -> TokenKind {
491        debug_assert!(self.prev() == '/' && self.first() == '*');
492        self.bump();
493
494        let doc_style = match self.first() {
495            // `/*!` is an inner block doc comment.
496            '!' => Some(DocStyle::Inner),
497            // `/***` (more than 2 stars) is not considered a doc comment.
498            // `/**/` is not considered a doc comment.
499            '*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer),
500            _ => None,
501        };
502
503        let mut depth = 1usize;
504        while let Some(c) = self.bump() {
505            match c {
506                '/' if self.first() == '*' => {
507                    self.bump();
508                    depth += 1;
509                }
510                '*' if self.first() == '/' => {
511                    self.bump();
512                    depth -= 1;
513                    if depth == 0 {
514                        // This block comment is closed, so for a construction like "/* */ */"
515                        // there will be a successfully parsed block comment "/* */"
516                        // and " */" will be processed separately.
517                        break;
518                    }
519                }
520                _ => (),
521            }
522        }
523
524        BlockComment { doc_style, terminated: depth == 0 }
525    }
526
527    fn whitespace(&mut self) -> TokenKind {
528        debug_assert!(is_whitespace(self.prev()));
529        self.eat_while(is_whitespace);
530        Whitespace
531    }
532
    /// Lexes a raw identifier such as `r#ident`. The leading `r` has already
    /// been consumed; `#` is next, followed by a valid identifier start.
    fn raw_ident(&mut self) -> TokenKind {
        debug_assert!(self.prev() == 'r' && self.first() == '#' && is_id_start(self.second()));
        // Eat "#" symbol.
        self.bump();
        // Eat the identifier part of RawIdent.
        self.eat_identifier();
        RawIdent
    }
541
542    fn ident_or_unknown_prefix(&mut self) -> TokenKind {
543        debug_assert!(is_id_start(self.prev()));
544        // Start is already eaten, eat the rest of identifier.
545        self.eat_while(is_id_continue);
546        // Known prefixes must have been handled earlier. So if
547        // we see a prefix here, it is definitely an unknown prefix.
548        match self.first() {
549            '#' | '"' | '\'' => UnknownPrefix,
550            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
551            _ => Ident,
552        }
553    }
554
555    fn invalid_ident(&mut self) -> TokenKind {
556        // Start is already eaten, eat the rest of identifier.
557        self.eat_while(|c| {
558            const ZERO_WIDTH_JOINER: char = '\u{200d}';
559            is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER
560        });
561        // An invalid identifier followed by '#' or '"' or '\'' could be
562        // interpreted as an invalid literal prefix. We don't bother doing that
563        // because the treatment of invalid identifiers and invalid prefixes
564        // would be the same.
565        InvalidIdent
566    }
567
    /// Lexes the remainder of a `b`- or `c`-prefixed literal; the prefix
    /// character itself has already been consumed by the caller.
    ///
    /// * `mk_kind` — builds the literal kind for a double-quoted string.
    /// * `mk_kind_raw` — builds the literal kind for a raw string.
    /// * `single_quoted` — when `Some`, builds the kind for a single-quoted
    ///   literal (used for `b'...'`; callers pass `None` for `c`, which has
    ///   no single-quoted form).
    ///
    /// Falls back to identifier/unknown-prefix lexing when no literal follows.
    fn c_or_byte_string(
        &mut self,
        mk_kind: fn(bool) -> LiteralKind,
        mk_kind_raw: fn(Option<u8>) -> LiteralKind,
        single_quoted: Option<fn(bool) -> LiteralKind>,
    ) -> TokenKind {
        match (self.first(), self.second(), single_quoted) {
            ('\'', _, Some(single_quoted)) => {
                self.bump();
                let terminated = self.single_quoted_string();
                let suffix_start = self.pos_within_token();
                // Only a terminated literal may carry a suffix.
                if terminated {
                    self.eat_literal_suffix();
                }
                let kind = single_quoted(terminated);
                Literal { kind, suffix_start }
            }
            ('"', _, _) => {
                self.bump();
                let terminated = self.double_quoted_string();
                let suffix_start = self.pos_within_token();
                if terminated {
                    self.eat_literal_suffix();
                }
                let kind = mk_kind(terminated);
                Literal { kind, suffix_start }
            }
            ('r', '"', _) | ('r', '#', _) => {
                self.bump();
                // prefix_len is 2: the `b`/`c` plus the `r`.
                let res = self.raw_double_quoted_string(2);
                let suffix_start = self.pos_within_token();
                if res.is_ok() {
                    self.eat_literal_suffix();
                }
                let kind = mk_kind_raw(res.ok());
                Literal { kind, suffix_start }
            }
            _ => self.ident_or_unknown_prefix(),
        }
    }
608
    /// Lexes a numeric literal; `first_digit` is the already-consumed first
    /// character. Returns `Int` or `Float`. The literal suffix (e.g. `u8`,
    /// `f32`) is *not* consumed here — the caller handles it.
    fn number(&mut self, first_digit: char) -> LiteralKind {
        debug_assert!('0' <= self.prev() && self.prev() <= '9');
        let mut base = Base::Decimal;
        if first_digit == '0' {
            // Attempt to parse encoding base.
            match self.first() {
                'b' => {
                    base = Base::Binary;
                    self.bump();
                    // Decimal digits are deliberately accepted here (cf. the
                    // `0b120i99` example on `LiteralKind::Int`); digit validity
                    // for the base is presumably checked by consumers — TODO confirm.
                    if !self.eat_decimal_digits() {
                        return Int { base, empty_int: true };
                    }
                }
                'o' => {
                    base = Base::Octal;
                    self.bump();
                    if !self.eat_decimal_digits() {
                        return Int { base, empty_int: true };
                    }
                }
                'x' => {
                    base = Base::Hexadecimal;
                    self.bump();
                    if !self.eat_hexadecimal_digits() {
                        return Int { base, empty_int: true };
                    }
                }
                // Not a base prefix; consume additional digits.
                '0'..='9' | '_' => {
                    self.eat_decimal_digits();
                }

                // Also not a base prefix; nothing more to do here.
                '.' | 'e' | 'E' => {}

                // Just a 0.
                _ => return Int { base, empty_int: false },
            }
        } else {
            // No base prefix, parse number in the usual way.
            self.eat_decimal_digits();
        };

        match self.first() {
            // Don't be greedy if this is actually an
            // integer literal followed by field/method access or a range pattern
            // (`0..2` and `12.foo()`)
            '.' if self.second() != '.' && !is_id_start(self.second()) => {
                // might have stuff after the ., and if it does, it needs to start
                // with a number
                self.bump();
                let mut empty_exponent = false;
                if self.first().is_ascii_digit() {
                    self.eat_decimal_digits();
                    match self.first() {
                        'e' | 'E' => {
                            self.bump();
                            empty_exponent = !self.eat_float_exponent();
                        }
                        _ => (),
                    }
                }
                Float { base, empty_exponent }
            }
            'e' | 'E' => {
                self.bump();
                let empty_exponent = !self.eat_float_exponent();
                Float { base, empty_exponent }
            }
            _ => Int { base, empty_int: false },
        }
    }
681
    /// Lexes a token that starts with `'` (already consumed): a lifetime,
    /// raw lifetime, unknown lifetime prefix, or character literal.
    fn lifetime_or_char(&mut self) -> TokenKind {
        debug_assert!(self.prev() == '\'');

        let can_be_a_lifetime = if self.second() == '\'' {
            // It's surely not a lifetime.
            false
        } else {
            // If the first symbol is valid for identifier, it can be a lifetime.
            // Also check if it's a number for a better error reporting (so '0 will
            // be reported as invalid lifetime and not as unterminated char literal).
            is_id_start(self.first()) || self.first().is_ascii_digit()
        };

        if !can_be_a_lifetime {
            let terminated = self.single_quoted_string();
            let suffix_start = self.pos_within_token();
            // Only a terminated char literal may carry a suffix.
            if terminated {
                self.eat_literal_suffix();
            }
            let kind = Char { terminated };
            return Literal { kind, suffix_start };
        }

        // `'r#ident` is a raw lifetime.
        if self.first() == 'r' && self.second() == '#' && is_id_start(self.third()) {
            // Eat "r" and `#`, and identifier start characters.
            self.bump();
            self.bump();
            self.bump();
            self.eat_while(is_id_continue);
            return RawLifetime;
        }

        // Either a lifetime or a character literal with
        // length greater than 1.
        let starts_with_number = self.first().is_ascii_digit();

        // Skip the literal contents.
        // First symbol can be a number (which isn't a valid identifier start),
        // so skip it without any checks.
        self.bump();
        self.eat_while(is_id_continue);

        match self.first() {
            // Check if after skipping literal contents we've met a closing
            // single quote (which means that user attempted to create a
            // string with single quotes).
            '\'' => {
                self.bump();
                let kind = Char { terminated: true };
                Literal { kind, suffix_start: self.pos_within_token() }
            }
            '#' if !starts_with_number => UnknownPrefixLifetime,
            _ => Lifetime { starts_with_number },
        }
    }
737
    /// Eats a single-quoted (char/byte) literal body — the opening `'` has
    /// already been consumed — and returns true if it is terminated by a
    /// closing `'`.
    fn single_quoted_string(&mut self) -> bool {
        debug_assert!(self.prev() == '\'');
        // Check if it's a one-symbol literal.
        if self.second() == '\'' && self.first() != '\\' {
            self.bump();
            self.bump();
            return true;
        }

        // Literal has more than one symbol.

        // Parse until either quotes are terminated or error is detected.
        loop {
            match self.first() {
                // Quotes are terminated, finish parsing.
                '\'' => {
                    self.bump();
                    return true;
                }
                // Probably beginning of the comment, which we don't want to include
                // to the error report.
                '/' => break,
                // Newline without following '\'' means unclosed quote, stop parsing.
                '\n' if self.second() != '\'' => break,
                // End of file, stop parsing.
                EOF_CHAR if self.is_eof() => break,
                // Escaped slash is considered one character, so bump twice.
                '\\' => {
                    self.bump();
                    self.bump();
                }
                // Skip the character.
                _ => {
                    self.bump();
                }
            }
        }
        // String was not terminated.
        false
    }
778
779    /// Eats double-quoted string and returns true
780    /// if string is terminated.
781    fn double_quoted_string(&mut self) -> bool {
782        debug_assert!(self.prev() == '"');
783        while let Some(c) = self.bump() {
784            match c {
785                '"' => {
786                    return true;
787                }
788                '\\' if self.first() == '\\' || self.first() == '"' => {
789                    // Bump again to skip escaped character.
790                    self.bump();
791                }
792                _ => (),
793            }
794        }
795        // End of file reached.
796        false
797    }
798
    /// Attempt to lex for a guarded string literal.
    ///
    /// Used by `rustc_parse::lexer` to lex for guarded strings
    /// conditionally based on edition.
    ///
    /// Returns `None` when no `"` follows the run of leading `#`s.
    ///
    /// Note: this will not reset the `Cursor` when a
    /// guarded string is not found. It is the caller's
    /// responsibility to do so.
    pub fn guarded_double_quoted_string(&mut self) -> Option<GuardedStr> {
        debug_assert!(self.prev() != '#');

        // Count the leading `#` guard characters.
        let mut n_start_hashes: u32 = 0;
        while self.first() == '#' {
            n_start_hashes += 1;
            self.bump();
        }

        if self.first() != '"' {
            return None;
        }
        self.bump();
        debug_assert!(self.prev() == '"');

        // Lex the string itself as a normal string literal
        // so we can recover that for older editions later.
        let terminated = self.double_quoted_string();
        if !terminated {
            let token_len = self.pos_within_token();
            self.reset_pos_within_token();

            return Some(GuardedStr { n_hashes: n_start_hashes, terminated: false, token_len });
        }

        // Consume closing '#' symbols.
        // Note that this will not consume extra trailing `#` characters:
        // `###"abcde"####` is lexed as a `GuardedStr { n_end_hashes: 3, .. }`
        // followed by a `#` token.
        let mut n_end_hashes = 0;
        while self.first() == '#' && n_end_hashes < n_start_hashes {
            n_end_hashes += 1;
            self.bump();
        }

        // Reserved syntax, always an error, so it doesn't matter if
        // `n_start_hashes != n_end_hashes`.

        self.eat_literal_suffix();

        let token_len = self.pos_within_token();
        self.reset_pos_within_token();

        Some(GuardedStr { n_hashes: n_start_hashes, terminated: true, token_len })
    }
852
853    /// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
854    fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> {
855        // Wrap the actual function to handle the error with too many hashes.
856        // This way, it eats the whole raw string.
857        let n_hashes = self.raw_string_unvalidated(prefix_len)?;
858        // Only up to 255 `#`s are allowed in raw strings
859        match u8::try_from(n_hashes) {
860            Ok(num) => Ok(num),
861            Err(_) => Err(RawStrError::TooManyDelimiters { found: n_hashes }),
862        }
863    }
864
865    fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
866        debug_assert!(self.prev() == 'r');
867        let start_pos = self.pos_within_token();
868        let mut possible_terminator_offset = None;
869        let mut max_hashes = 0;
870
871        // Count opening '#' symbols.
872        let mut eaten = 0;
873        while self.first() == '#' {
874            eaten += 1;
875            self.bump();
876        }
877        let n_start_hashes = eaten;
878
879        // Check that string is started.
880        match self.bump() {
881            Some('"') => (),
882            c => {
883                let c = c.unwrap_or(EOF_CHAR);
884                return Err(RawStrError::InvalidStarter { bad_char: c });
885            }
886        }
887
888        // Skip the string contents and on each '#' character met, check if this is
889        // a raw string termination.
890        loop {
891            self.eat_until(b'"');
892
893            if self.is_eof() {
894                return Err(RawStrError::NoTerminator {
895                    expected: n_start_hashes,
896                    found: max_hashes,
897                    possible_terminator_offset,
898                });
899            }
900
901            // Eat closing double quote.
902            self.bump();
903
904            // Check that amount of closing '#' symbols
905            // is equal to the amount of opening ones.
906            // Note that this will not consume extra trailing `#` characters:
907            // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }`
908            // followed by a `#` token.
909            let mut n_end_hashes = 0;
910            while self.first() == '#' && n_end_hashes < n_start_hashes {
911                n_end_hashes += 1;
912                self.bump();
913            }
914
915            if n_end_hashes == n_start_hashes {
916                return Ok(n_start_hashes);
917            } else if n_end_hashes > max_hashes {
918                // Keep track of possible terminators to give a hint about
919                // where there might be a missing terminator
920                possible_terminator_offset =
921                    Some(self.pos_within_token() - start_pos - n_end_hashes + prefix_len);
922                max_hashes = n_end_hashes;
923            }
924        }
925    }
926
927    fn eat_decimal_digits(&mut self) -> bool {
928        let mut has_digits = false;
929        loop {
930            match self.first() {
931                '_' => {
932                    self.bump();
933                }
934                '0'..='9' => {
935                    has_digits = true;
936                    self.bump();
937                }
938                _ => break,
939            }
940        }
941        has_digits
942    }
943
944    fn eat_hexadecimal_digits(&mut self) -> bool {
945        let mut has_digits = false;
946        loop {
947            match self.first() {
948                '_' => {
949                    self.bump();
950                }
951                '0'..='9' | 'a'..='f' | 'A'..='F' => {
952                    has_digits = true;
953                    self.bump();
954                }
955                _ => break,
956            }
957        }
958        has_digits
959    }
960
961    /// Eats the float exponent. Returns true if at least one digit was met,
962    /// and returns false otherwise.
963    fn eat_float_exponent(&mut self) -> bool {
964        debug_assert!(self.prev() == 'e' || self.prev() == 'E');
965        if self.first() == '-' || self.first() == '+' {
966            self.bump();
967        }
968        self.eat_decimal_digits()
969    }
970
    // Eats the suffix of the literal, e.g. "u8".
    // A suffix has the same lexical shape as an identifier, so this simply
    // delegates; an absent suffix is fine (`eat_identifier` eats nothing when
    // the next char cannot start an identifier).
    fn eat_literal_suffix(&mut self) {
        self.eat_identifier();
    }
975
976    // Eats the identifier. Note: succeeds on `_`, which isn't a valid
977    // identifier.
978    fn eat_identifier(&mut self) {
979        if !is_id_start(self.first()) {
980            return;
981        }
982        self.bump();
983
984        self.eat_while(is_id_continue);
985    }
986}