rustc_lexer/lib.rs
//! Low-level Rust lexer.
//!
//! The idea with `rustc_lexer` is to make a reusable library,
//! by separating out pure lexing from rustc-specific concerns, like spans,
//! error reporting, and interning. So, rustc_lexer operates directly on `&str`,
//! produces simple tokens which are a pair of a type-tag and a bit of original text,
//! and does not report errors, instead storing them as flags on the token.
//!
//! Tokens produced by this lexer are not yet ready for parsing the Rust syntax.
//! For that see [`rustc_parse::lexer`], which converts this basic token stream
//! into wide tokens used by the actual parser.
//!
//! The purpose of this crate is to convert raw sources into a labeled sequence
//! of well-known token types, so building an actual Rust token stream will
//! be easier.
//!
//! The main entity of this crate is the [`TokenKind`] enum, which represents common
//! lexeme types.
//!
//! [`rustc_parse::lexer`]: ../rustc_parse/lexer/index.html
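//!
//! A minimal usage sketch (illustrative only): tokens carry just a kind and a
//! length, so spans are reconstructed by accumulating lengths.
//!
//! ```
//! use rustc_lexer::{tokenize, FrontmatterAllowed, TokenKind};
//!
//! let input = "let x = 7;";
//! let mut offset = 0;
//! for token in tokenize(input, FrontmatterAllowed::No) {
//!     // Recover the token's text from its length.
//!     let text = &input[offset..offset + token.len as usize];
//!     offset += token.len as usize;
//!     if token.kind == TokenKind::Ident {
//!         assert!(text == "let" || text == "x");
//!     }
//! }
//! ```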

// tidy-alphabetical-start
// We want to be able to build this crate with a stable compiler,
// so no `#![feature]` attributes should be added.
#![deny(unstable_features)]
// tidy-alphabetical-end

mod cursor;

#[cfg(test)]
mod tests;

use LiteralKind::*;
use TokenKind::*;
use cursor::EOF_CHAR;
pub use cursor::{Cursor, FrontmatterAllowed};
pub use unicode_ident::UNICODE_VERSION;
use unicode_properties::UnicodeEmoji;

// Make sure that the Unicode version of the dependencies is the same.
const _: () = {
    let properties = unicode_properties::UNICODE_VERSION;
    let ident = unicode_ident::UNICODE_VERSION;

    if properties.0 != ident.0 as u64
        || properties.1 != ident.1 as u64
        || properties.2 != ident.2 as u64
    {
        panic!(
            "unicode-properties and unicode-ident must use the same Unicode version, \
            `unicode_properties::UNICODE_VERSION` and `unicode_ident::UNICODE_VERSION` are \
            different."
        );
    }
};

/// Parsed token.
/// It doesn't contain information about data that has been parsed,
/// only the type of the token and its size.
#[derive(Debug)]
pub struct Token {
    pub kind: TokenKind,
    pub len: u32,
}

impl Token {
    fn new(kind: TokenKind, len: u32) -> Token {
        Token { kind, len }
    }
}

/// Enum representing common lexeme types.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
    /// A line comment, e.g. `// comment`.
    LineComment {
        doc_style: Option<DocStyle>,
    },

    /// A block comment, e.g. `/* block comment */`.
    ///
    /// Block comments can be recursive, so a sequence like `/* /* */`
    /// will not be considered terminated and will result in a parsing error.
    BlockComment {
        doc_style: Option<DocStyle>,
        terminated: bool,
    },

    /// Any whitespace character sequence.
    Whitespace,

    /// A frontmatter block, e.g. a `---cargo ... ---` header, which is only
    /// recognized at the start of a file.
    Frontmatter {
        has_invalid_preceding_whitespace: bool,
        invalid_infostring: bool,
    },

    /// An identifier or keyword, e.g. `ident` or `continue`.
    Ident,

    /// An identifier that is invalid because it contains emoji.
    InvalidIdent,

    /// A raw identifier, e.g. `r#ident`.
    RawIdent,

    /// An unknown literal prefix, like `foo#`, `foo'`, `foo"`. Excludes
    /// literal prefixes that contain emoji, which are considered "invalid".
    ///
    /// Note that only the prefix (`foo`) is included in the token, not the
    /// separator (which is lexed as its own distinct token). In Rust 2021 and
    /// later, reserved prefixes are reported as errors; in earlier editions,
    /// they result in a lint (allowed by default), and are treated as regular
    /// identifier tokens.
    UnknownPrefix,

    /// An unknown prefix in a lifetime, like `'foo#`.
    ///
    /// Like `UnknownPrefix`, only the `'` and the prefix are included in the
    /// token, and not the separator.
    UnknownPrefixLifetime,

    /// A raw lifetime, e.g. `'r#foo`. In editions before 2021 it will be split
    /// into several tokens: `'r`, `#`, and `foo`.
    RawLifetime,

    /// Guarded string literal prefix: `#"` or `##`.
    ///
    /// Used for reserving "guarded strings" (RFC 3598) in edition 2024.
    /// Split into the component tokens on older editions.
    GuardedStrPrefix,

    /// Literals, e.g. `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
    /// suffix, but may be present here on string and float literals. Users of
    /// this type will need to check for and reject that case.
    ///
    /// See [LiteralKind] for more details.
    Literal {
        kind: LiteralKind,
        suffix_start: u32,
    },

    /// A lifetime, e.g. `'a`.
    Lifetime {
        starts_with_number: bool,
    },

    /// `;`
    Semi,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `(`
    OpenParen,
    /// `)`
    CloseParen,
    /// `{`
    OpenBrace,
    /// `}`
    CloseBrace,
    /// `[`
    OpenBracket,
    /// `]`
    CloseBracket,
    /// `@`
    At,
    /// `#`
    Pound,
    /// `~`
    Tilde,
    /// `?`
    Question,
    /// `:`
    Colon,
    /// `$`
    Dollar,
    /// `=`
    Eq,
    /// `!`
    Bang,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `-`
    Minus,
    /// `&`
    And,
    /// `|`
    Or,
    /// `+`
    Plus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `^`
    Caret,
    /// `%`
    Percent,

    /// Unknown token, not expected by the lexer, e.g. "№".
    Unknown,

    /// End of input.
    Eof,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DocStyle {
    Outer,
    Inner,
}

/// Enum representing the literal types supported by the lexer.
///
/// Note that the suffix is *not* considered when deciding the `LiteralKind` in
/// this type. This means that float literals like `1f32` are classified by this
/// type as `Int`. (Compare against `rustc_ast::token::LitKind` and
/// `rustc_ast::ast::LitKind`).
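///
/// A small sketch of that classification (illustrative):
///
/// ```
/// use rustc_lexer::{tokenize, FrontmatterAllowed, LiteralKind, TokenKind};
///
/// // `1f32` lexes as an `Int` literal whose suffix starts at offset 1.
/// let tok = tokenize("1f32", FrontmatterAllowed::No).next().unwrap();
/// assert!(matches!(
///     tok.kind,
///     TokenKind::Literal { kind: LiteralKind::Int { .. }, suffix_start: 1 }
/// ));
/// ```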
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
    /// `12_u8`, `0o100`, `0b120i99`, `1f32`.
    Int { base: Base, empty_int: bool },
    /// `12.34f32`, `1e3`, but not `1f32`.
    Float { base: Base, empty_exponent: bool },
    /// `'a'`, `'\\'`, `'''`, `';`
    Char { terminated: bool },
    /// `b'a'`, `b'\\'`, `b'''`, `b';`
    Byte { terminated: bool },
    /// `"abc"`, `"abc`
    Str { terminated: bool },
    /// `b"abc"`, `b"abc`
    ByteStr { terminated: bool },
    /// `c"abc"`, `c"abc`
    CStr { terminated: bool },
    /// `r"abc"`, `r#"abc"#`, `r####"ab"###"c"####`, `r#"a`. `None` indicates
    /// an invalid literal.
    RawStr { n_hashes: Option<u8> },
    /// `br"abc"`, `br#"abc"#`, `br####"ab"###"c"####`, `br#"a`. `None`
    /// indicates an invalid literal.
    RawByteStr { n_hashes: Option<u8> },
    /// `cr"abc"`, `cr#"abc"#`, `cr#"a`. `None` indicates an invalid literal.
    RawCStr { n_hashes: Option<u8> },
}

/// `#"abc"#`, `##"a"` (fewer closing), or even `#"a` (unterminated).
///
/// Can capture fewer closing hashes than starting hashes,
/// for more efficient lexing and better backwards diagnostics.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct GuardedStr {
    pub n_hashes: u32,
    pub terminated: bool,
    pub token_len: u32,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum RawStrError {
    /// Non-`#` characters exist between `r` and `"`, e.g. `r##~"abcde"##`.
    InvalidStarter { bad_char: char },
    /// The string was not terminated, e.g. `r###"abcde"##`.
    /// `possible_terminator_offset` is the number of characters after `r` or
    /// `br` where they may have intended to terminate it.
    NoTerminator { expected: u32, found: u32, possible_terminator_offset: Option<u32> },
    /// More than 255 `#`s exist.
    TooManyDelimiters { found: u32 },
}

/// Base of numeric literal encoding according to its prefix.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Base {
    /// Literal starts with "0b".
    Binary = 2,
    /// Literal starts with "0o".
    Octal = 8,
    /// Literal doesn't contain a prefix.
    Decimal = 10,
    /// Literal starts with "0x".
    Hexadecimal = 16,
}

/// `rustc` allows files to have a shebang, e.g. `#!/usr/bin/rustrun`,
/// but the shebang isn't part of Rust syntax.
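///
/// A minimal sketch of stripping a shebang before lexing (the path is
/// illustrative):
///
/// ```
/// use rustc_lexer::strip_shebang;
///
/// let src = "#!/usr/bin/env rust\nfn main() {}\n";
/// let rest = strip_shebang(src).map_or(src, |len| &src[len..]);
/// assert_eq!(rest, "\nfn main() {}\n");
/// ```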
pub fn strip_shebang(input: &str) -> Option<usize> {
    // Shebang must start with `#!` literally, without any preceding whitespace.
    // For simplicity we consider any line starting with `#!` a shebang,
    // regardless of restrictions put on shebangs by specific platforms.
    if let Some(input_tail) = input.strip_prefix("#!") {
        // Ok, this is a shebang but if the next non-whitespace token is `[`,
        // then it may be valid Rust code, so consider it Rust code.
        let next_non_whitespace_token =
            tokenize(input_tail, FrontmatterAllowed::No).map(|tok| tok.kind).find(|tok| {
                !matches!(
                    tok,
                    TokenKind::Whitespace
                        | TokenKind::LineComment { doc_style: None }
                        | TokenKind::BlockComment { doc_style: None, .. }
                )
            });
        if next_non_whitespace_token != Some(TokenKind::OpenBracket) {
            // No other choice than to consider this a shebang.
            return Some(2 + input_tail.lines().next().unwrap_or_default().len());
        }
    }
    None
}

/// Validates a raw string literal. Used for getting more information about a
/// problem with a `RawStr`/`RawByteStr` with a `None` field.
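///
/// A small sketch; the input deliberately lacks its closing `"#`:
///
/// ```
/// use rustc_lexer::{validate_raw_str, RawStrError};
///
/// let err = validate_raw_str("r#\"abc\"", 1).unwrap_err();
/// assert!(matches!(err, RawStrError::NoTerminator { expected: 1, .. }));
/// ```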
#[inline]
pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> {
    debug_assert!(!input.is_empty());
    let mut cursor = Cursor::new(input, FrontmatterAllowed::No);
    // Move past the leading `r` or `br`.
    for _ in 0..prefix_len {
        cursor.bump().unwrap();
    }
    cursor.raw_double_quoted_string(prefix_len).map(|_| ())
}

/// Creates an iterator that produces tokens from the input string.
///
/// When parsing a full Rust document, first call [`strip_shebang`] and then
/// allow frontmatter with [`FrontmatterAllowed::Yes`].
///
/// When tokenizing a slice of a document, be sure to disallow frontmatter with
/// [`FrontmatterAllowed::No`].
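///
/// A minimal sketch:
///
/// ```
/// use rustc_lexer::{tokenize, FrontmatterAllowed, TokenKind};
///
/// let kinds: Vec<TokenKind> =
///     tokenize("2 + 2", FrontmatterAllowed::No).map(|tok| tok.kind).collect();
/// assert_eq!(kinds.len(), 5); // `2`, ` `, `+`, ` `, `2`
/// assert!(matches!(kinds[2], TokenKind::Plus));
/// ```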
pub fn tokenize(
    input: &str,
    frontmatter_allowed: FrontmatterAllowed,
) -> impl Iterator<Item = Token> {
    let mut cursor = Cursor::new(input, frontmatter_allowed);
    std::iter::from_fn(move || {
        let token = cursor.advance_token();
        if token.kind != TokenKind::Eof { Some(token) } else { None }
    })
}

/// True if `c` is considered whitespace according to the Rust language definition.
/// See the [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for definitions of these classes.
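///
/// A small sketch:
///
/// ```
/// use rustc_lexer::is_whitespace;
///
/// assert!(is_whitespace('\u{2028}')); // LINE SEPARATOR
/// assert!(!is_whitespace('\u{00A0}')); // NO-BREAK SPACE is not Pattern_White_Space
/// ```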
pub fn is_whitespace(c: char) -> bool {
    // This is Pattern_White_Space.
    //
    // Note that this set is stable (i.e., it doesn't change with different
    // Unicode versions), so it's ok to just hard-code the values.

    matches!(
        c,
        // End-of-line characters
        | '\u{000A}' // line feed (\n)
        | '\u{000B}' // vertical tab
        | '\u{000C}' // form feed
        | '\u{000D}' // carriage return (\r)
        | '\u{0085}' // next line (from latin1)
        | '\u{2028}' // LINE SEPARATOR
        | '\u{2029}' // PARAGRAPH SEPARATOR

        // `Default_Ignorable_Code_Point` characters
        | '\u{200E}' // LEFT-TO-RIGHT MARK
        | '\u{200F}' // RIGHT-TO-LEFT MARK

        // Horizontal space characters
        | '\u{0009}' // tab (\t)
        | '\u{0020}' // space
    )
}

/// True if `c` is considered horizontal whitespace according to the Rust language definition.
pub fn is_horizontal_whitespace(c: char) -> bool {
    // This is the horizontal subset of Pattern_White_Space.
    //
    // Note that this set is stable (i.e., it doesn't change with different
    // Unicode versions), so it's ok to just hard-code the values.

    matches!(
        c,
        // Horizontal space characters
        '\u{0009}' // tab (\t)
        | '\u{0020}' // space
    )
}

/// True if `c` is valid as a first character of an identifier.
/// See the [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
/// a formal definition of a valid identifier name.
pub fn is_id_start(c: char) -> bool {
    // This is XID_Start OR '_' (which formally is not a XID_Start).
    c == '_' || unicode_ident::is_xid_start(c)
}

/// True if `c` is valid as a non-first character of an identifier.
/// See the [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
/// a formal definition of a valid identifier name.
pub fn is_id_continue(c: char) -> bool {
    unicode_ident::is_xid_continue(c)
}

/// True if the passed string is lexically an identifier.
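///
/// A small sketch:
///
/// ```
/// use rustc_lexer::is_ident;
///
/// assert!(is_ident("_x9"));
/// assert!(!is_ident("r#ident")); // `#` is not an identifier character
/// ```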
pub fn is_ident(string: &str) -> bool {
    let mut chars = string.chars();
    if let Some(start) = chars.next() {
        is_id_start(start) && chars.all(is_id_continue)
    } else {
        false
    }
}

impl Cursor<'_> {
    /// Parses a token from the input string.
    pub fn advance_token(&mut self) -> Token {
        let Some(first_char) = self.bump() else {
            return Token::new(TokenKind::Eof, 0);
        };

        let token_kind = match first_char {
            c if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes)
                && is_whitespace(c) =>
            {
                let mut last = first_char;
                while is_whitespace(self.first()) {
                    let Some(c) = self.bump() else {
                        break;
                    };
                    last = c;
                }
                // An invalid frontmatter opening, since the whitespace preceding it
                // doesn't end with a newline. Combine the whitespace and the
                // frontmatter into a single token; we shall error later.
                if last != '\n' && self.as_str().starts_with("---") {
                    self.bump();
                    self.frontmatter(true)
                } else {
                    Whitespace
                }
            }
            '-' if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes)
                && self.as_str().starts_with("--") =>
            {
                // happy path
                self.frontmatter(false)
            }
            // Slash, comment or block comment.
            '/' => match self.first() {
                '/' => self.line_comment(),
                '*' => self.block_comment(),
                _ => Slash,
            },

            // Whitespace sequence.
            c if is_whitespace(c) => self.whitespace(),

            // Raw identifier, raw string literal or identifier.
            'r' => match (self.first(), self.second()) {
                ('#', c1) if is_id_start(c1) => self.raw_ident(),
                ('#', _) | ('"', _) => {
                    let res = self.raw_double_quoted_string(1);
                    let suffix_start = self.pos_within_token();
                    if res.is_ok() {
                        self.eat_literal_suffix();
                    }
                    let kind = RawStr { n_hashes: res.ok() };
                    Literal { kind, suffix_start }
                }
                _ => self.ident_or_unknown_prefix(),
            },

            // Byte literal, byte string literal, raw byte string literal or identifier.
            'b' => self.c_or_byte_string(
                |terminated| ByteStr { terminated },
                |n_hashes| RawByteStr { n_hashes },
                Some(|terminated| Byte { terminated }),
            ),

            // c-string literal, raw c-string literal or identifier.
            'c' => self.c_or_byte_string(
                |terminated| CStr { terminated },
                |n_hashes| RawCStr { n_hashes },
                None,
            ),

            // Identifier (this should be checked after other variants that can
            // start as an identifier).
            c if is_id_start(c) => self.ident_or_unknown_prefix(),

            // Numeric literal.
            c @ '0'..='9' => {
                let literal_kind = self.number(c);
                let suffix_start = self.pos_within_token();
                self.eat_literal_suffix();
                TokenKind::Literal { kind: literal_kind, suffix_start }
            }

            // Guarded string literal prefix: `#"` or `##`.
            '#' if matches!(self.first(), '"' | '#') => {
                self.bump();
                TokenKind::GuardedStrPrefix
            }

            // One-symbol tokens.
            ';' => Semi,
            ',' => Comma,
            '.' => Dot,
            '(' => OpenParen,
            ')' => CloseParen,
            '{' => OpenBrace,
            '}' => CloseBrace,
            '[' => OpenBracket,
            ']' => CloseBracket,
            '@' => At,
            '#' => Pound,
            '~' => Tilde,
            '?' => Question,
            ':' => Colon,
            '$' => Dollar,
            '=' => Eq,
            '!' => Bang,
            '<' => Lt,
            '>' => Gt,
            '-' => Minus,
            '&' => And,
            '|' => Or,
            '+' => Plus,
            '*' => Star,
            '^' => Caret,
            '%' => Percent,

            // Lifetime or character literal.
            '\'' => self.lifetime_or_char(),

            // String literal.
            '"' => {
                let terminated = self.double_quoted_string();
                let suffix_start = self.pos_within_token();
                if terminated {
                    self.eat_literal_suffix();
                }
                let kind = Str { terminated };
                Literal { kind, suffix_start }
            }
            // Identifier starting with an emoji. Only lexed for graceful error recovery.
            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
            _ => Unknown,
        };
        if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes)
            && !matches!(token_kind, Whitespace)
        {
            // Stop allowing frontmatter after the first non-whitespace token.
            self.frontmatter_allowed = FrontmatterAllowed::No;
        }
        let res = Token::new(token_kind, self.pos_within_token());
        self.reset_pos_within_token();
        res
    }

    /// Given that one `-` was eaten, eat the rest of the frontmatter.
    fn frontmatter(&mut self, has_invalid_preceding_whitespace: bool) -> TokenKind {
        debug_assert_eq!('-', self.prev());

        let pos = self.pos_within_token();
        self.eat_while(|c| c == '-');

        // One `-` was eaten by the caller.
        let length_opening = self.pos_within_token() - pos + 1;

        // Must be ensured by the caller.
        debug_assert!(length_opening >= 3);

        // Whitespace between the opening and the infostring.
        self.eat_while(|ch| ch != '\n' && is_horizontal_whitespace(ch));

        // Copied from `eat_identifier`, but allows `-` and `.` in the infostring
        // so that something like `---Cargo.toml` is a valid opener.
        if is_id_start(self.first()) {
            self.bump();
            self.eat_while(|c| is_id_continue(c) || c == '-' || c == '.');
        }

        self.eat_while(|ch| ch != '\n' && is_horizontal_whitespace(ch));
        let invalid_infostring = self.first() != '\n';

        let mut found = false;
        let nl_fence_pattern = format!("\n{:-<1$}", "", length_opening as usize);
        if let Some(closing) = self.as_str().find(&nl_fence_pattern) {
            // Candidate found.
            self.bump_bytes(closing + nl_fence_pattern.len());
            // In cases like
            // ---cargo
            // --- blahblah
            // or
            // ---cargo
            // ----
            // combine the trailing content into this frontmatter token so that it
            // gets detected later.
            self.eat_until(b'\n');
            found = true;
        }

        if !found {
            // Recovery strategy: a closing statement might have preceding
            // whitespace/newline but not enough dashes to properly close. In that
            // case, eat until there and report a mismatch in the parser.
            let mut rest = self.as_str();
            // We can look for a shorter closing (starting with four dashes but
            // closing with three) and other indications that Rust has started and
            // the infostring has ended.
            let mut potential_closing = rest
                .find("\n---")
                // N.B. only in the case where there are dashes do we move the
                // index to the line where the dashes start, since we eat to
                // include that line. In the other cases those lines are Rust code
                // and not included in the frontmatter.
                .map(|x| x + 1)
                .or_else(|| rest.find("\nuse "))
                .or_else(|| rest.find("\n//!"))
                .or_else(|| rest.find("\n#!["));

            if potential_closing.is_none() {
                // A less fortunate recovery if all else fails: find any dashes
                // preceded only by whitespace on a standalone line. Might be wrong.
                let mut base_index = 0;
                while let Some(closing) = rest.find("---") {
                    let preceding_chars_start = rest[..closing].rfind("\n").map_or(0, |i| i + 1);
                    if rest[preceding_chars_start..closing].chars().all(is_horizontal_whitespace) {
                        // Candidate found.
                        potential_closing = Some(closing + base_index);
                        break;
                    } else {
                        rest = &rest[closing + 3..];
                        base_index += closing + 3;
                    }
                }
            }

            if let Some(potential_closing) = potential_closing {
                // Bump to the potential closing, and eat everything on that line.
                self.bump_bytes(potential_closing);
                self.eat_until(b'\n');
            } else {
                // Eat everything. This will get reported as an unclosed frontmatter.
                self.eat_while(|_| true);
            }
        }

        Frontmatter { has_invalid_preceding_whitespace, invalid_infostring }
    }

    fn line_comment(&mut self) -> TokenKind {
        debug_assert!(self.prev() == '/' && self.first() == '/');
        self.bump();

        let doc_style = match self.first() {
            // `//!` is an inner line doc comment.
            '!' => Some(DocStyle::Inner),
            // `////` (more than 3 slashes) is not considered a doc comment.
            '/' if self.second() != '/' => Some(DocStyle::Outer),
            _ => None,
        };

        self.eat_until(b'\n');
        LineComment { doc_style }
    }

    fn block_comment(&mut self) -> TokenKind {
        debug_assert!(self.prev() == '/' && self.first() == '*');
        self.bump();

        let doc_style = match self.first() {
            // `/*!` is an inner block doc comment.
            '!' => Some(DocStyle::Inner),
            // `/***` (more than 2 stars) is not considered a doc comment.
            // `/**/` is not considered a doc comment.
            '*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer),
            _ => None,
        };

        let mut depth = 1usize;
        while let Some(c) = self.bump() {
            match c {
                '/' if self.first() == '*' => {
                    self.bump();
                    depth += 1;
                }
                '*' if self.first() == '/' => {
                    self.bump();
                    depth -= 1;
                    if depth == 0 {
                        // This block comment is closed, so for a construction like "/* */ */"
                        // there will be a successfully parsed block comment "/* */"
                        // and " */" will be processed separately.
                        break;
                    }
                }
                _ => (),
            }
        }

        BlockComment { doc_style, terminated: depth == 0 }
    }

    fn whitespace(&mut self) -> TokenKind {
        debug_assert!(is_whitespace(self.prev()));
        self.eat_while(is_whitespace);
        Whitespace
    }

    fn raw_ident(&mut self) -> TokenKind {
        debug_assert!(self.prev() == 'r' && self.first() == '#' && is_id_start(self.second()));
        // Eat "#" symbol.
        self.bump();
        // Eat the identifier part of RawIdent.
        self.eat_identifier();
        RawIdent
    }

    fn ident_or_unknown_prefix(&mut self) -> TokenKind {
        debug_assert!(is_id_start(self.prev()));
        // Start is already eaten, eat the rest of identifier.
        self.eat_while(is_id_continue);
        // Known prefixes must have been handled earlier. So if
        // we see a prefix here, it is definitely an unknown prefix.
        match self.first() {
            '#' | '"' | '\'' => UnknownPrefix,
            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
            _ => Ident,
        }
    }

    fn invalid_ident(&mut self) -> TokenKind {
        // Start is already eaten, eat the rest of identifier.
        self.eat_while(|c| {
            const ZERO_WIDTH_JOINER: char = '\u{200d}';
            is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER
        });
        // An invalid identifier followed by '#' or '"' or '\'' could be
        // interpreted as an invalid literal prefix. We don't bother doing that
        // because the treatment of invalid identifiers and invalid prefixes
        // would be the same.
        InvalidIdent
    }

    fn c_or_byte_string(
        &mut self,
        mk_kind: fn(bool) -> LiteralKind,
        mk_kind_raw: fn(Option<u8>) -> LiteralKind,
        single_quoted: Option<fn(bool) -> LiteralKind>,
    ) -> TokenKind {
        match (self.first(), self.second(), single_quoted) {
            ('\'', _, Some(single_quoted)) => {
                self.bump();
                let terminated = self.single_quoted_string();
                let suffix_start = self.pos_within_token();
                if terminated {
                    self.eat_literal_suffix();
                }
                let kind = single_quoted(terminated);
                Literal { kind, suffix_start }
            }
            ('"', _, _) => {
                self.bump();
                let terminated = self.double_quoted_string();
                let suffix_start = self.pos_within_token();
                if terminated {
                    self.eat_literal_suffix();
                }
                let kind = mk_kind(terminated);
                Literal { kind, suffix_start }
            }
            ('r', '"', _) | ('r', '#', _) => {
                self.bump();
                let res = self.raw_double_quoted_string(2);
                let suffix_start = self.pos_within_token();
                if res.is_ok() {
                    self.eat_literal_suffix();
                }
                let kind = mk_kind_raw(res.ok());
                Literal { kind, suffix_start }
            }
            _ => self.ident_or_unknown_prefix(),
        }
    }

    fn number(&mut self, first_digit: char) -> LiteralKind {
        debug_assert!('0' <= self.prev() && self.prev() <= '9');
        let mut base = Base::Decimal;
        if first_digit == '0' {
            // Attempt to parse encoding base.
            match self.first() {
                'b' => {
                    base = Base::Binary;
                    self.bump();
                    if !self.eat_decimal_digits() {
                        return Int { base, empty_int: true };
                    }
                }
                'o' => {
                    base = Base::Octal;
                    self.bump();
                    if !self.eat_decimal_digits() {
                        return Int { base, empty_int: true };
                    }
                }
                'x' => {
                    base = Base::Hexadecimal;
                    self.bump();
                    if !self.eat_hexadecimal_digits() {
                        return Int { base, empty_int: true };
                    }
                }
                // Not a base prefix; consume additional digits.
                '0'..='9' | '_' => {
                    self.eat_decimal_digits();
                }

                // Also not a base prefix; nothing more to do here.
                '.' | 'e' | 'E' => {}

                // Just a 0.
                _ => return Int { base, empty_int: false },
            }
        } else {
            // No base prefix, parse number in the usual way.
            self.eat_decimal_digits();
        }

        match self.first() {
            // Don't be greedy if this is actually an
            // integer literal followed by field/method access or a range pattern
            // (`0..2` and `12.foo()`)
            '.' if self.second() != '.' && !is_id_start(self.second()) => {
                // might have stuff after the ., and if it does, it needs to start
                // with a number
                self.bump();
                let mut empty_exponent = false;
                if self.first().is_ascii_digit() {
                    self.eat_decimal_digits();
                    match self.first() {
                        'e' | 'E' => {
                            self.bump();
                            empty_exponent = !self.eat_float_exponent();
                        }
                        _ => (),
                    }
                }
                Float { base, empty_exponent }
            }
            'e' | 'E' => {
                self.bump();
                let empty_exponent = !self.eat_float_exponent();
                Float { base, empty_exponent }
            }
            _ => Int { base, empty_int: false },
        }
    }

    fn lifetime_or_char(&mut self) -> TokenKind {
        debug_assert!(self.prev() == '\'');

        let can_be_a_lifetime = if self.second() == '\'' {
            // It's surely not a lifetime.
            false
        } else {
            // If the first symbol is valid for an identifier, it can be a lifetime.
            // Also check if it's a number, for better error reporting (so '0 will
            // be reported as an invalid lifetime and not as an unterminated char
            // literal).
            is_id_start(self.first()) || self.first().is_ascii_digit()
        };

        if !can_be_a_lifetime {
            let terminated = self.single_quoted_string();
            let suffix_start = self.pos_within_token();
            if terminated {
                self.eat_literal_suffix();
            }
            let kind = Char { terminated };
            return Literal { kind, suffix_start };
        }

        if self.first() == 'r' && self.second() == '#' && is_id_start(self.third()) {
            // Eat "r" and `#`, and identifier start characters.
            self.bump();
            self.bump();
            self.bump();
            self.eat_while(is_id_continue);
            return RawLifetime;
        }

        // Either a lifetime or a character literal with
        // length greater than 1.
        let starts_with_number = self.first().is_ascii_digit();

        // Skip the literal contents.
        // First symbol can be a number (which isn't a valid identifier start),
        // so skip it without any checks.
        self.bump();
        self.eat_while(is_id_continue);

        match self.first() {
            // Check if after skipping literal contents we've met a closing
            // single quote (which means that the user attempted to create a
            // string with single quotes).
            '\'' => {
                self.bump();
                let kind = Char { terminated: true };
                Literal { kind, suffix_start: self.pos_within_token() }
            }
            '#' if !starts_with_number => UnknownPrefixLifetime,
            _ => Lifetime { starts_with_number },
        }
    }

    fn single_quoted_string(&mut self) -> bool {
        debug_assert!(self.prev() == '\'');
        // Check if it's a one-symbol literal.
        if self.second() == '\'' && self.first() != '\\' {
            self.bump();
            self.bump();
            return true;
        }

        // Literal has more than one symbol.

        // Parse until either quotes are terminated or error is detected.
        loop {
            match self.first() {
                // Quotes are terminated, finish parsing.
                '\'' => {
                    self.bump();
                    return true;
                }
                // Probably the beginning of a comment, which we don't want to
                // include in the error report.
                '/' => break,
                // Newline without a following '\'' means unclosed quote, stop parsing.
                '\n' if self.second() != '\'' => break,
                // End of file, stop parsing.
                EOF_CHAR if self.is_eof() => break,
                // An escaped character is considered one symbol, so bump twice.
                '\\' => {
                    self.bump();
                    self.bump();
                }
                // Skip the character.
                _ => {
                    self.bump();
                }
            }
        }
        // String was not terminated.
        false
    }

    /// Eats a double-quoted string and returns true
    /// if the string is terminated.
    fn double_quoted_string(&mut self) -> bool {
        debug_assert!(self.prev() == '"');
        while let Some(c) = self.bump() {
            match c {
                '"' => {
                    return true;
                }
                '\\' if self.first() == '\\' || self.first() == '"' => {
                    // Bump again to skip the escaped character.
                    self.bump();
                }
                _ => (),
            }
        }
        // End of file reached.
        false
    }

    /// Attempt to lex for a guarded string literal.
    ///
    /// Used by `rustc_parse::lexer` to lex for guarded strings
    /// conditionally based on edition.
    ///
    /// Note: this will not reset the `Cursor` when a
    /// guarded string is not found. It is the caller's
    /// responsibility to do so.
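    ///
    /// A sketch of standalone use (assuming the cursor sits at the leading `#`):
    ///
    /// ```
    /// use rustc_lexer::{Cursor, FrontmatterAllowed};
    ///
    /// let mut cursor = Cursor::new("#\"foo\"#", FrontmatterAllowed::No);
    /// let guarded = cursor.guarded_double_quoted_string().unwrap();
    /// assert_eq!((guarded.n_hashes, guarded.terminated), (1, true));
    /// ```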
    pub fn guarded_double_quoted_string(&mut self) -> Option<GuardedStr> {
        debug_assert!(self.prev() != '#');

        let mut n_start_hashes: u32 = 0;
        while self.first() == '#' {
            n_start_hashes += 1;
            self.bump();
        }

        if self.first() != '"' {
            return None;
        }
        self.bump();
        debug_assert!(self.prev() == '"');

        // Lex the string itself as a normal string literal
        // so we can recover that for older editions later.
        let terminated = self.double_quoted_string();
        if !terminated {
            let token_len = self.pos_within_token();
            self.reset_pos_within_token();

            return Some(GuardedStr { n_hashes: n_start_hashes, terminated: false, token_len });
        }

        // Consume closing '#' symbols.
        // Note that this will not consume extra trailing `#` characters:
        // `###"abcde"####` is lexed as a `GuardedStr { n_end_hashes: 3, .. }`
        // followed by a `#` token.
        let mut n_end_hashes = 0;
        while self.first() == '#' && n_end_hashes < n_start_hashes {
            n_end_hashes += 1;
            self.bump();
        }

        // Reserved syntax, always an error, so it doesn't matter if
        // `n_start_hashes != n_end_hashes`.

        self.eat_literal_suffix();

        let token_len = self.pos_within_token();
        self.reset_pos_within_token();

        Some(GuardedStr { n_hashes: n_start_hashes, terminated: true, token_len })
    }

    /// Eats the double-quoted string and returns `n_hashes`, or an error if one
    /// is encountered.
    fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> {
        // Wrap the actual function to handle the error with too many hashes.
        // This way, it eats the whole raw string.
        let n_hashes = self.raw_string_unvalidated(prefix_len)?;
        // Only up to 255 `#`s are allowed in raw strings.
        match u8::try_from(n_hashes) {
            Ok(num) => Ok(num),
            Err(_) => Err(RawStrError::TooManyDelimiters { found: n_hashes }),
        }
    }

    fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
        debug_assert!(self.prev() == 'r');
        let start_pos = self.pos_within_token();
        let mut possible_terminator_offset = None;
        let mut max_hashes = 0;

        // Count opening '#' symbols.
        let mut eaten = 0;
        while self.first() == '#' {
            eaten += 1;
            self.bump();
        }
        let n_start_hashes = eaten;

        // Check that string is started.
        match self.bump() {
            Some('"') => (),
            c => {
                let c = c.unwrap_or(EOF_CHAR);
                return Err(RawStrError::InvalidStarter { bad_char: c });
            }
        }

        // Skip the string contents and on each '#' character met, check if this is
        // a raw string termination.
        loop {
            self.eat_until(b'"');

            if self.is_eof() {
                return Err(RawStrError::NoTerminator {
                    expected: n_start_hashes,
                    found: max_hashes,
                    possible_terminator_offset,
                });
            }

            // Eat closing double quote.
            self.bump();

            // Check that amount of closing '#' symbols
            // is equal to the amount of opening ones.
            // Note that this will not consume extra trailing `#` characters:
            // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }`
            // followed by a `#` token.
            let mut n_end_hashes = 0;
            while self.first() == '#' && n_end_hashes < n_start_hashes {
                n_end_hashes += 1;
                self.bump();
            }

            if n_end_hashes == n_start_hashes {
                return Ok(n_start_hashes);
            } else if n_end_hashes > max_hashes {
                // Keep track of possible terminators to give a hint about
                // where there might be a missing terminator.
                possible_terminator_offset =
                    Some(self.pos_within_token() - start_pos - n_end_hashes + prefix_len);
                max_hashes = n_end_hashes;
            }
        }
    }

    fn eat_decimal_digits(&mut self) -> bool {
        let mut has_digits = false;
        loop {
            match self.first() {
                '_' => {
                    self.bump();
                }
                '0'..='9' => {
                    has_digits = true;
                    self.bump();
                }
                _ => break,
            }
        }
        has_digits
    }

    fn eat_hexadecimal_digits(&mut self) -> bool {
        let mut has_digits = false;
        loop {
            match self.first() {
                '_' => {
                    self.bump();
                }
                '0'..='9' | 'a'..='f' | 'A'..='F' => {
                    has_digits = true;
                    self.bump();
                }
                _ => break,
            }
        }
        has_digits
    }

    /// Eats the float exponent. Returns true if at least one digit was met,
    /// and returns false otherwise.
    fn eat_float_exponent(&mut self) -> bool {
        debug_assert!(self.prev() == 'e' || self.prev() == 'E');
        if self.first() == '-' || self.first() == '+' {
            self.bump();
        }
        self.eat_decimal_digits()
    }

    // Eats the suffix of the literal, e.g. "u8".
    fn eat_literal_suffix(&mut self) {
        self.eat_identifier();
    }

    // Eats the identifier. Note: succeeds on `_`, which isn't a valid
    // identifier.
    fn eat_identifier(&mut self) {
        if !is_id_start(self.first()) {
            return;
        }
        self.bump();

        self.eat_while(is_id_continue);
    }
}