rustc_parse/lexer/
mod.rs

1use diagnostics::make_errors_for_mismatched_closing_delims;
2use rustc_ast::ast::{self, AttrStyle};
3use rustc_ast::token::{self, CommentKind, Delimiter, IdentIsRaw, Token, TokenKind};
4use rustc_ast::tokenstream::TokenStream;
5use rustc_ast::util::unicode::{TEXT_FLOW_CONTROL_CHARS, contains_text_flow_control_chars};
6use rustc_errors::codes::*;
7use rustc_errors::{Applicability, Diag, DiagCtxtHandle, StashKey};
8use rustc_lexer::{
9    Base, Cursor, DocStyle, FrontmatterAllowed, LiteralKind, RawStrError, is_whitespace,
10};
11use rustc_literal_escaper::{EscapeError, Mode, check_for_errors};
12use rustc_session::lint::BuiltinLintDiag;
13use rustc_session::lint::builtin::{
14    RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
15    TEXT_DIRECTION_CODEPOINT_IN_COMMENT, TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
16};
17use rustc_session::parse::ParseSess;
18use rustc_span::{BytePos, Pos, Span, Symbol, sym};
19use tracing::debug;
20
21use crate::errors;
22use crate::lexer::diagnostics::TokenTreeDiagInfo;
23use crate::lexer::unicode_chars::UNICODE_ARRAY;
24
25mod diagnostics;
26mod tokentrees;
27mod unescape_error_reporting;
28mod unicode_chars;
29
30use unescape_error_reporting::{emit_unescape_error, escaped_char};
31
// This type is used a lot. Make sure it doesn't unintentionally get bigger.
// (Only checked on 64-bit targets, per the `cfg` below.)
//
// This assertion is in this crate, rather than in `rustc_lexer`, because that
// crate cannot depend on `rustc_data_structures`.
#[cfg(target_pointer_width = "64")]
rustc_data_structures::static_assert_size!(rustc_lexer::Token, 12);
38
/// Information about a closing delimiter that could not be matched up with
/// its opening counterpart; used to build delimiter-mismatch diagnostics.
#[derive(Clone, Debug)]
pub(crate) struct UnmatchedDelim {
    /// The closing delimiter that was found, if any.
    pub found_delim: Option<Delimiter>,
    /// Where the closing delimiter (or the problem) was found.
    pub found_span: Span,
    /// Span of the opening delimiter left unclosed, if known.
    pub unclosed_span: Option<Span>,
    /// Span of a delimiter that may have been intended instead, if one was
    /// identified.
    pub candidate_span: Option<Span>,
}
46
47pub(crate) fn lex_token_trees<'psess, 'src>(
48    psess: &'psess ParseSess,
49    mut src: &'src str,
50    mut start_pos: BytePos,
51    override_span: Option<Span>,
52) -> Result<TokenStream, Vec<Diag<'psess>>> {
53    // Skip `#!`, if present.
54    if let Some(shebang_len) = rustc_lexer::strip_shebang(src) {
55        src = &src[shebang_len..];
56        start_pos = start_pos + BytePos::from_usize(shebang_len);
57    }
58
59    let cursor = Cursor::new(src, FrontmatterAllowed::Yes);
60    let mut lexer = Lexer {
61        psess,
62        start_pos,
63        pos: start_pos,
64        src,
65        cursor,
66        override_span,
67        nbsp_is_whitespace: false,
68        last_lifetime: None,
69        token: Token::dummy(),
70        diag_info: TokenTreeDiagInfo::default(),
71    };
72    let res = lexer.lex_token_trees(/* is_delimited */ false);
73
74    let mut unmatched_closing_delims: Vec<_> =
75        make_errors_for_mismatched_closing_delims(&lexer.diag_info.unmatched_delims, psess);
76
77    match res {
78        Ok((_open_spacing, stream)) => {
79            if unmatched_closing_delims.is_empty() {
80                Ok(stream)
81            } else {
82                // Return error if there are unmatched delimiters or unclosed delimiters.
83                Err(unmatched_closing_delims)
84            }
85        }
86        Err(errs) => {
87            // We emit delimiter mismatch errors first, then emit the unclosing delimiter mismatch
88            // because the delimiter mismatch is more likely to be the root cause of error
89            unmatched_closing_delims.extend(errs);
90            Err(unmatched_closing_delims)
91        }
92    }
93}
94
/// The lexer: wraps a low-level `rustc_lexer::Cursor` and "cooks" its raw
/// tokens into rich `rustc_ast` tokens, emitting diagnostics along the way.
struct Lexer<'psess, 'src> {
    /// Parse session; provides the diagnostic context, lint buffer, and
    /// various side tables (symbol gallery, raw-identifier spans, ...).
    psess: &'psess ParseSess,
    /// Initial position, read-only.
    start_pos: BytePos,
    /// The absolute offset within the source_map of the current character.
    pos: BytePos,
    /// Source text to tokenize.
    src: &'src str,
    /// Cursor for getting lexer tokens.
    cursor: Cursor<'src>,
    /// If set, `mk_sp` returns this span instead of building one from
    /// positions.
    override_span: Option<Span>,
    /// When an "unknown start of token: \u{a0}" has already been emitted earlier
    /// in this file, it's safe to treat further occurrences of the non-breaking
    /// space character as whitespace.
    nbsp_is_whitespace: bool,

    /// Track the `Span` for the leading `'` of the last lifetime. Used for
    /// diagnostics to detect possible typo where `"` was meant.
    last_lifetime: Option<Span>,

    /// The current token.
    token: Token,

    /// Accumulated delimiter-related information (e.g. unmatched delimiters)
    /// used for token-tree diagnostics.
    diag_info: TokenTreeDiagInfo,
}
120
121impl<'psess, 'src> Lexer<'psess, 'src> {
    /// Convenience accessor for the parse session's diagnostic context.
    fn dcx(&self) -> DiagCtxtHandle<'psess> {
        self.psess.dcx()
    }
125
    /// Makes a span covering `lo..hi` with root context — unless
    /// `override_span` is set, in which case that span is returned instead.
    fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
        self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi))
    }
129
    /// Returns the next token, paired with a bool indicating if the token was
    /// preceded by whitespace.
    fn next_token_from_cursor(&mut self) -> (Token, bool) {
        let mut preceded_by_whitespace = false;
        // Number of further copies of an invalid character to skip without
        // reporting again (set when emitting `UnknownTokenStart` below).
        let mut swallow_next_invalid = 0;
        // Skip trivial (whitespace & comments) tokens
        loop {
            // Snapshot of the unconsumed input; used below to rewind the
            // cursor when a token must be split and re-lexed (c-string
            // prefixes and raw lifetimes before edition 2021).
            let str_before = self.cursor.as_str();
            let token = self.cursor.advance_token();
            let start = self.pos;
            self.pos = self.pos + BytePos(token.len);

            debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));

            if let rustc_lexer::TokenKind::Semi
            | rustc_lexer::TokenKind::LineComment { .. }
            | rustc_lexer::TokenKind::BlockComment { .. }
            | rustc_lexer::TokenKind::CloseParen
            | rustc_lexer::TokenKind::CloseBrace
            | rustc_lexer::TokenKind::CloseBracket = token.kind
            {
                // Heuristic: we assume that it is unlikely we're dealing with an unterminated
                // string surrounded by single quotes.
                self.last_lifetime = None;
            }

            // Now "cook" the token, converting the simple `rustc_lexer::TokenKind` enum into a
            // rich `rustc_ast::TokenKind`. This turns strings into interned symbols and runs
            // additional validation.
            let kind = match token.kind {
                rustc_lexer::TokenKind::LineComment { doc_style } => {
                    // Skip non-doc comments
                    let Some(doc_style) = doc_style else {
                        self.lint_unicode_text_flow(start);
                        preceded_by_whitespace = true;
                        continue;
                    };

                    // Opening delimiter of the length 3 is not included into the symbol.
                    let content_start = start + BytePos(3);
                    let content = self.str_from(content_start);
                    self.lint_doc_comment_unicode_text_flow(start, content);
                    self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
                }
                rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
                    if !terminated {
                        self.report_unterminated_block_comment(start, doc_style);
                    }

                    // Skip non-doc comments
                    let Some(doc_style) = doc_style else {
                        self.lint_unicode_text_flow(start);
                        preceded_by_whitespace = true;
                        continue;
                    };

                    // Opening delimiter of the length 3 and closing delimiter of the length 2
                    // are not included into the symbol.
                    let content_start = start + BytePos(3);
                    let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
                    let content = self.str_from_to(content_start, content_end);
                    self.lint_doc_comment_unicode_text_flow(start, content);
                    self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
                }
                rustc_lexer::TokenKind::Frontmatter { has_invalid_preceding_whitespace, invalid_infostring } => {
                    // Frontmatter produces no token; validate it and move on.
                    self.validate_frontmatter(start, has_invalid_preceding_whitespace, invalid_infostring);
                    preceded_by_whitespace = true;
                    continue;
                }
                rustc_lexer::TokenKind::Whitespace => {
                    preceded_by_whitespace = true;
                    continue;
                }
                rustc_lexer::TokenKind::Ident => self.ident(start),
                rustc_lexer::TokenKind::RawIdent => {
                    // Intern the name without the `r#` prefix (2 bytes).
                    let sym = nfc_normalize(self.str_from(start + BytePos(2)));
                    let span = self.mk_sp(start, self.pos);
                    self.psess.symbol_gallery.insert(sym, span);
                    if !sym.can_be_raw() {
                        self.dcx().emit_err(errors::CannotBeRawIdent { span, ident: sym });
                    }
                    self.psess.raw_identifier_spans.push(span);
                    token::Ident(sym, IdentIsRaw::Yes)
                }
                rustc_lexer::TokenKind::UnknownPrefix => {
                    // Report the bad prefix but still recover as an identifier.
                    self.report_unknown_prefix(start);
                    self.ident(start)
                }
                rustc_lexer::TokenKind::UnknownPrefixLifetime => {
                    self.report_unknown_prefix(start);
                    // Include the leading `'` in the real identifier, for macro
                    // expansion purposes. See #12512 for the gory details of why
                    // this is necessary.
                    let lifetime_name = self.str_from(start);
                    self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
                    let ident = Symbol::intern(lifetime_name);
                    token::Lifetime(ident, IdentIsRaw::No)
                }
                rustc_lexer::TokenKind::InvalidIdent
                    // Do not recover an identifier with emoji if the codepoint is a confusable
                    // with a recoverable substitution token, like `➖`.
                    if !UNICODE_ARRAY.iter().any(|&(c, _, _)| {
                        let sym = self.str_from(start);
                        sym.chars().count() == 1 && c == sym.chars().next().unwrap()
                    }) =>
                {
                    let sym = nfc_normalize(self.str_from(start));
                    let span = self.mk_sp(start, self.pos);
                    // Record the bad identifier so a single error can cover
                    // all of its occurrences later.
                    self.psess
                        .bad_unicode_identifiers
                        .borrow_mut()
                        .entry(sym)
                        .or_default()
                        .push(span);
                    token::Ident(sym, IdentIsRaw::No)
                }
                // split up (raw) c string literals to an ident and a string literal when edition <
                // 2021.
                rustc_lexer::TokenKind::Literal {
                    kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }),
                    suffix_start: _,
                } if !self.mk_sp(start, self.pos).edition().at_least_rust_2021() => {
                    let prefix_len = match kind {
                        LiteralKind::CStr { .. } => 1,
                        LiteralKind::RawCStr { .. } => 2,
                        _ => unreachable!(),
                    };

                    // reset the state so that only the prefix ("c" or "cr")
                    // was consumed.
                    let lit_start = start + BytePos(prefix_len);
                    self.pos = lit_start;
                    self.cursor = Cursor::new(&str_before[prefix_len as usize..], FrontmatterAllowed::No);
                    self.report_unknown_prefix(start);
                    let prefix_span = self.mk_sp(start, lit_start);
                    return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
                }
                rustc_lexer::TokenKind::GuardedStrPrefix => {
                    self.maybe_report_guarded_str(start, str_before)
                }
                rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
                    let suffix_start = start + BytePos(suffix_start);
                    let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
                    // Anything after `suffix_start` is the literal's suffix.
                    let suffix = if suffix_start < self.pos {
                        let string = self.str_from(suffix_start);
                        if string == "_" {
                            // A bare `_` suffix is rejected; drop it.
                            self.dcx().emit_err(errors::UnderscoreLiteralSuffix {
                                span: self.mk_sp(suffix_start, self.pos),
                            });
                            None
                        } else {
                            Some(Symbol::intern(string))
                        }
                    } else {
                        None
                    };
                    self.lint_literal_unicode_text_flow(symbol, kind, self.mk_sp(start, self.pos), "literal");
                    token::Literal(token::Lit { kind, symbol, suffix })
                }
                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
                    // Include the leading `'` in the real identifier, for macro
                    // expansion purposes. See #12512 for the gory details of why
                    // this is necessary.
                    let lifetime_name = self.str_from(start);
                    self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
                    if starts_with_number {
                        let span = self.mk_sp(start, self.pos);
                        // Stashed rather than emitted: a later parse stage may
                        // decide this was really a (bad) char literal.
                        self.dcx()
                            .struct_err("lifetimes cannot start with a number")
                            .with_span(span)
                            .stash(span, StashKey::LifetimeIsChar);
                    }
                    let ident = Symbol::intern(lifetime_name);
                    token::Lifetime(ident, IdentIsRaw::No)
                }
                rustc_lexer::TokenKind::RawLifetime => {
                    self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));

                    // Skip the `'r#` prefix (3 bytes) to reach the name itself.
                    let ident_start = start + BytePos(3);
                    let prefix_span = self.mk_sp(start, ident_start);

                    if prefix_span.at_least_rust_2021() {
                        // If the raw lifetime is followed by \' then treat it a normal
                        // lifetime followed by a \', which is to interpret it as a character
                        // literal. In this case, it's always an invalid character literal
                        // since the literal must necessarily have >3 characters (r#...) inside
                        // of it, which is invalid.
                        if self.cursor.as_str().starts_with('\'') {
                            let lit_span = self.mk_sp(start, self.pos + BytePos(1));
                            let contents = self.str_from_to(start + BytePos(1), self.pos);
                            emit_unescape_error(
                                self.dcx(),
                                contents,
                                lit_span,
                                lit_span,
                                Mode::Char,
                                0..contents.len(),
                                EscapeError::MoreThanOneChar,
                            )
                            .expect("expected error");
                        }

                        let span = self.mk_sp(start, self.pos);

                        let lifetime_name_without_tick =
                            Symbol::intern(&self.str_from(ident_start));
                        if !lifetime_name_without_tick.can_be_raw() {
                            self.dcx().emit_err(
                                errors::CannotBeRawLifetime {
                                    span,
                                    ident: lifetime_name_without_tick
                                }
                            );
                        }

                        // Put the `'` back onto the lifetime name.
                        let mut lifetime_name =
                            String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
                        lifetime_name.push('\'');
                        lifetime_name += lifetime_name_without_tick.as_str();
                        let sym = Symbol::intern(&lifetime_name);

                        // Make sure we mark this as a raw identifier.
                        self.psess.raw_identifier_spans.push(span);

                        token::Lifetime(sym, IdentIsRaw::Yes)
                    } else {
                        // Otherwise, this should be parsed like `'r`. Warn about it though.
                        self.psess.buffer_lint(
                            RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
                            prefix_span,
                            ast::CRATE_NODE_ID,
                            BuiltinLintDiag::RawPrefix(prefix_span),
                        );

                        // Reset the state so we just lex the `'r`.
                        let lt_start = start + BytePos(2);
                        self.pos = lt_start;
                        self.cursor = Cursor::new(&str_before[2 as usize..], FrontmatterAllowed::No);

                        let lifetime_name = self.str_from(start);
                        let ident = Symbol::intern(lifetime_name);
                        token::Lifetime(ident, IdentIsRaw::No)
                    }
                }
                rustc_lexer::TokenKind::Semi => token::Semi,
                rustc_lexer::TokenKind::Comma => token::Comma,
                rustc_lexer::TokenKind::Dot => token::Dot,
                rustc_lexer::TokenKind::OpenParen => token::OpenParen,
                rustc_lexer::TokenKind::CloseParen => token::CloseParen,
                rustc_lexer::TokenKind::OpenBrace => token::OpenBrace,
                rustc_lexer::TokenKind::CloseBrace => token::CloseBrace,
                rustc_lexer::TokenKind::OpenBracket => token::OpenBracket,
                rustc_lexer::TokenKind::CloseBracket => token::CloseBracket,
                rustc_lexer::TokenKind::At => token::At,
                rustc_lexer::TokenKind::Pound => token::Pound,
                rustc_lexer::TokenKind::Tilde => token::Tilde,
                rustc_lexer::TokenKind::Question => token::Question,
                rustc_lexer::TokenKind::Colon => token::Colon,
                rustc_lexer::TokenKind::Dollar => token::Dollar,
                rustc_lexer::TokenKind::Eq => token::Eq,
                rustc_lexer::TokenKind::Bang => token::Bang,
                rustc_lexer::TokenKind::Lt => token::Lt,
                rustc_lexer::TokenKind::Gt => token::Gt,
                rustc_lexer::TokenKind::Minus => token::Minus,
                rustc_lexer::TokenKind::And => token::And,
                rustc_lexer::TokenKind::Or => token::Or,
                rustc_lexer::TokenKind::Plus => token::Plus,
                rustc_lexer::TokenKind::Star => token::Star,
                rustc_lexer::TokenKind::Slash => token::Slash,
                rustc_lexer::TokenKind::Caret => token::Caret,
                rustc_lexer::TokenKind::Percent => token::Percent,

                rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
                    // Don't emit diagnostics for sequences of the same invalid token
                    if swallow_next_invalid > 0 {
                        swallow_next_invalid -= 1;
                        continue;
                    }
                    let mut it = self.str_from_to_end(start).chars();
                    let c = it.next().unwrap();
                    if c == '\u{00a0}' {
                        // If an error has already been reported on non-breaking
                        // space characters earlier in the file, treat all
                        // subsequent occurrences as whitespace.
                        if self.nbsp_is_whitespace {
                            preceded_by_whitespace = true;
                            continue;
                        }
                        self.nbsp_is_whitespace = true;
                    }
                    // Count immediately-following repeats of the same character
                    // so they can be reported (and swallowed) as one error.
                    let repeats = it.take_while(|c1| *c1 == c).count();
                    // FIXME: the lexer could be used to turn the ASCII version of unicode
                    // homoglyphs, instead of keeping a table in `check_for_substitution`into the
                    // token. Ideally, this should be inside `rustc_lexer`. However, we should
                    // first remove compound tokens like `<<` from `rustc_lexer`, and then add
                    // fancier error recovery to it, as there will be less overall work to do this
                    // way.
                    let (token, sugg) =
                        unicode_chars::check_for_substitution(self, start, c, repeats + 1);
                    self.dcx().emit_err(errors::UnknownTokenStart {
                        span: self.mk_sp(start, self.pos + Pos::from_usize(repeats * c.len_utf8())),
                        escaped: escaped_char(c),
                        sugg,
                        null: if c == '\x00' { Some(errors::UnknownTokenNull) } else { None },
                        repeat: if repeats > 0 {
                            swallow_next_invalid = repeats;
                            Some(errors::UnknownTokenRepeat { repeats })
                        } else {
                            None
                        },
                    });

                    if let Some(token) = token {
                        token
                    } else {
                        preceded_by_whitespace = true;
                        continue;
                    }
                }
                rustc_lexer::TokenKind::Eof => token::Eof,
            };
            let span = self.mk_sp(start, self.pos);
            return (Token::new(kind, span), preceded_by_whitespace);
        }
    }
456
457    fn ident(&self, start: BytePos) -> TokenKind {
458        let sym = nfc_normalize(self.str_from(start));
459        let span = self.mk_sp(start, self.pos);
460        self.psess.symbol_gallery.insert(sym, span);
461        token::Ident(sym, IdentIsRaw::No)
462    }
463
464    /// Detect usages of Unicode codepoints changing the direction of the text on screen and loudly
465    /// complain about it.
466    fn lint_unicode_text_flow(&self, start: BytePos) {
467        // Opening delimiter of the length 2 is not included into the comment text.
468        let content_start = start + BytePos(2);
469        let content = self.str_from(content_start);
470        if contains_text_flow_control_chars(content) {
471            let span = self.mk_sp(start, self.pos);
472            self.psess.buffer_lint(
473                TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
474                span,
475                ast::CRATE_NODE_ID,
476                BuiltinLintDiag::UnicodeTextFlow(span, content.to_string()),
477            );
478        }
479    }
480
481    fn lint_doc_comment_unicode_text_flow(&mut self, start: BytePos, content: &str) {
482        if contains_text_flow_control_chars(content) {
483            self.report_text_direction_codepoint(
484                content,
485                self.mk_sp(start, self.pos),
486                0,
487                false,
488                "doc comment",
489            );
490        }
491    }
492
493    fn lint_literal_unicode_text_flow(
494        &mut self,
495        text: Symbol,
496        lit_kind: token::LitKind,
497        span: Span,
498        label: &'static str,
499    ) {
500        if !contains_text_flow_control_chars(text.as_str()) {
501            return;
502        }
503        let (padding, point_at_inner_spans) = match lit_kind {
504            // account for `"` or `'`
505            token::LitKind::Str | token::LitKind::Char => (1, true),
506            // account for `c"`
507            token::LitKind::CStr => (2, true),
508            // account for `r###"`
509            token::LitKind::StrRaw(n) => (n as u32 + 2, true),
510            // account for `cr###"`
511            token::LitKind::CStrRaw(n) => (n as u32 + 3, true),
512            // suppress bad literals.
513            token::LitKind::Err(_) => return,
514            // Be conservative just in case new literals do support these.
515            _ => (0, false),
516        };
517        self.report_text_direction_codepoint(
518            text.as_str(),
519            span,
520            padding,
521            point_at_inner_spans,
522            label,
523        );
524    }
525
526    fn report_text_direction_codepoint(
527        &self,
528        text: &str,
529        span: Span,
530        padding: u32,
531        point_at_inner_spans: bool,
532        label: &str,
533    ) {
534        // Obtain the `Span`s for each of the forbidden chars.
535        let spans: Vec<_> = text
536            .char_indices()
537            .filter_map(|(i, c)| {
538                TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
539                    let lo = span.lo() + BytePos(i as u32 + padding);
540                    (c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
541                })
542            })
543            .collect();
544
545        let count = spans.len();
546        let labels = point_at_inner_spans.then_some(spans.clone());
547
548        self.psess.buffer_lint(
549            TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
550            span,
551            ast::CRATE_NODE_ID,
552            BuiltinLintDiag::HiddenUnicodeCodepoints {
553                label: label.to_string(),
554                count,
555                span_label: span,
556                labels,
557                escape: point_at_inner_spans && !spans.is_empty(),
558                spans,
559            },
560        );
561    }
562
    /// Validates a lexed frontmatter block (`---` fences at the start of the
    /// file), emitting errors for malformed openings, infostrings, and
    /// closings, and feature-gating the construct.
    fn validate_frontmatter(
        &self,
        start: BytePos,
        has_invalid_preceding_whitespace: bool,
        invalid_infostring: bool,
    ) {
        let s = self.str_from(start);
        // The token may include text before the opening dashes; locate the
        // fence itself. The lexer produced this token, so the dashes exist.
        let real_start = s.find("---").unwrap();
        let frontmatter_opening_pos = BytePos(real_start as u32) + start;
        let s_new = &s[real_start..];
        let within = s_new.trim_start_matches('-');
        // Number of `-` characters forming the opening fence.
        let len_opening = s_new.len() - within.len();

        let frontmatter_opening_end_pos = frontmatter_opening_pos + BytePos(len_opening as u32);
        if has_invalid_preceding_whitespace {
            // Span from the start of the offending line through the opening
            // fence; the label covers just the whitespace before the fence.
            let line_start =
                BytePos(s[..real_start].rfind("\n").map_or(0, |i| i as u32 + 1)) + start;
            let span = self.mk_sp(line_start, frontmatter_opening_end_pos);
            let label_span = self.mk_sp(line_start, frontmatter_opening_pos);
            self.dcx().emit_err(errors::FrontmatterInvalidOpeningPrecedingWhitespace {
                span,
                note_span: label_span,
            });
        }

        if invalid_infostring {
            // The infostring is whatever follows the opening fence on the
            // same line.
            let line_end = s[real_start..].find('\n').unwrap_or(s[real_start..].len());
            let span = self.mk_sp(
                frontmatter_opening_end_pos,
                frontmatter_opening_pos + BytePos(line_end as u32),
            );
            self.dcx().emit_err(errors::FrontmatterInvalidInfostring { span });
        }

        // The closing fence, if present, is on the last line of the token.
        let last_line_start = within.rfind('\n').map_or(0, |i| i + 1);
        let last_line = &within[last_line_start..];
        let last_line_trimmed = last_line.trim_start_matches(is_whitespace);
        let last_line_start_pos = frontmatter_opening_end_pos + BytePos(last_line_start as u32);

        // Frontmatter is an unstable feature; gate the whole block.
        let frontmatter_span = self.mk_sp(frontmatter_opening_pos, self.pos);
        self.psess.gated_spans.gate(sym::frontmatter, frontmatter_span);

        if !last_line_trimmed.starts_with("---") {
            // No closing fence at all; nothing further to validate.
            let label_span = self.mk_sp(frontmatter_opening_pos, frontmatter_opening_end_pos);
            self.dcx().emit_err(errors::FrontmatterUnclosed {
                span: frontmatter_span,
                note_span: label_span,
            });
            return;
        }

        if last_line_trimmed.len() != last_line.len() {
            // Whitespace precedes the closing fence on its line.
            let line_end = last_line_start_pos + BytePos(last_line.len() as u32);
            let span = self.mk_sp(last_line_start_pos, line_end);
            let whitespace_end =
                last_line_start_pos + BytePos((last_line.len() - last_line_trimmed.len()) as u32);
            let label_span = self.mk_sp(last_line_start_pos, whitespace_end);
            self.dcx().emit_err(errors::FrontmatterInvalidClosingPrecedingWhitespace {
                span,
                note_span: label_span,
            });
        }

        let rest = last_line_trimmed.trim_start_matches('-');
        let len_close = last_line_trimmed.len() - rest.len();
        if len_close != len_opening {
            // Opening and closing fences must use the same number of dashes.
            let span = self.mk_sp(frontmatter_opening_pos, self.pos);
            let opening = self.mk_sp(frontmatter_opening_pos, frontmatter_opening_end_pos);
            let last_line_close_pos = last_line_start_pos + BytePos(len_close as u32);
            let close = self.mk_sp(last_line_start_pos, last_line_close_pos);
            self.dcx().emit_err(errors::FrontmatterLengthMismatch {
                span,
                opening,
                close,
                len_opening,
                len_close,
            });
        }

        if !rest.trim_matches(is_whitespace).is_empty() {
            // Non-whitespace content after the closing fence is not allowed.
            let span = self.mk_sp(last_line_start_pos, self.pos);
            self.dcx().emit_err(errors::FrontmatterExtraCharactersAfterClose { span });
        }
    }
647
648    fn cook_doc_comment(
649        &self,
650        content_start: BytePos,
651        content: &str,
652        comment_kind: CommentKind,
653        doc_style: DocStyle,
654    ) -> TokenKind {
655        if content.contains('\r') {
656            for (idx, _) in content.char_indices().filter(|&(_, c)| c == '\r') {
657                let span = self.mk_sp(
658                    content_start + BytePos(idx as u32),
659                    content_start + BytePos(idx as u32 + 1),
660                );
661                let block = matches!(comment_kind, CommentKind::Block);
662                self.dcx().emit_err(errors::CrDocComment { span, block });
663            }
664        }
665
666        let attr_style = match doc_style {
667            DocStyle::Outer => AttrStyle::Outer,
668            DocStyle::Inner => AttrStyle::Inner,
669        };
670
671        token::DocComment(comment_kind, attr_style, Symbol::intern(content))
672    }
673
    /// Turns a raw `rustc_lexer` literal token spanning `start..end` into the
    /// parser's `(LitKind, Symbol)` form, emitting diagnostics along the way:
    /// fatal errors for unterminated quoted literals, recoverable errors
    /// (downgrading the kind to `token::Err`) for empty/invalid digits and
    /// unsupported float bases.
    fn cook_lexer_literal(
        &self,
        start: BytePos,
        end: BytePos,
        kind: rustc_lexer::LiteralKind,
    ) -> (token::LitKind, Symbol) {
        match kind {
            rustc_lexer::LiteralKind::Char { terminated } => {
                if !terminated {
                    let mut err = self
                        .dcx()
                        .struct_span_fatal(self.mk_sp(start, end), "unterminated character literal")
                        .with_code(E0762);
                    // If the char literal directly follows a lifetime token, the
                    // user probably meant a string literal (`'a'...` vs `"a..."`);
                    // suggest replacing both quotes with `"`.
                    if let Some(lt_sp) = self.last_lifetime {
                        err.multipart_suggestion(
                            "if you meant to write a string literal, use double quotes",
                            vec![
                                (lt_sp, "\"".to_string()),
                                (self.mk_sp(start, start + BytePos(1)), "\"".to_string()),
                            ],
                            Applicability::MaybeIncorrect,
                        );
                    }
                    err.emit()
                }
                self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' '
            }
            rustc_lexer::LiteralKind::Byte { terminated } => {
                if !terminated {
                    // Span starts at `start + 1` to skip the `b` prefix.
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start + BytePos(1), end),
                            "unterminated byte constant",
                        )
                        .with_code(E0763)
                        .emit()
                }
                self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
            }
            rustc_lexer::LiteralKind::Str { terminated } => {
                if !terminated {
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start, end),
                            "unterminated double quote string",
                        )
                        .with_code(E0765)
                        .emit()
                }
                self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " "
            }
            rustc_lexer::LiteralKind::ByteStr { terminated } => {
                if !terminated {
                    // Span starts at `start + 1` to skip the `b` prefix.
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start + BytePos(1), end),
                            "unterminated double quote byte string",
                        )
                        .with_code(E0766)
                        .emit()
                }
                self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1)
                // b" "
            }
            rustc_lexer::LiteralKind::CStr { terminated } => {
                if !terminated {
                    // Span starts at `start + 1` to skip the `c` prefix.
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start + BytePos(1), end),
                            "unterminated C string",
                        )
                        .with_code(E0767)
                        .emit()
                }
                self.cook_quoted(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
            }
            // For raw strings, `n_hashes == None` means the lexer could not even
            // find a valid opening delimiter; re-validate to produce a precise
            // fatal error. Prefix/postfix lengths account for `r`/`br`/`cr` plus
            // the hash runs.
            rustc_lexer::LiteralKind::RawStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::StrRaw(n_hashes);
                    self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n)
                // r##" "##
                } else {
                    self.report_raw_str_error(start, 1);
                }
            }
            rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::ByteStrRaw(n_hashes);
                    self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n)
                // br##" "##
                } else {
                    self.report_raw_str_error(start, 2);
                }
            }
            rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::CStrRaw(n_hashes);
                    self.cook_quoted(kind, Mode::RawCStr, start, end, 3 + n, 1 + n)
                // cr##" "##
                } else {
                    self.report_raw_str_error(start, 2);
                }
            }
            rustc_lexer::LiteralKind::Int { base, empty_int } => {
                let mut kind = token::Integer;
                if empty_int {
                    // e.g. `0x` with no digits after the base prefix.
                    let span = self.mk_sp(start, end);
                    let guar = self.dcx().emit_err(errors::NoDigitsLiteral { span });
                    kind = token::Err(guar);
                } else if matches!(base, Base::Binary | Base::Octal) {
                    // Binary/octal digits are not validated by the raw lexer,
                    // so check each character after the 2-byte `0b`/`0o` prefix.
                    let base = base as u32;
                    let s = self.str_from_to(start + BytePos(2), end);
                    for (idx, c) in s.char_indices() {
                        let span = self.mk_sp(
                            start + BytePos::from_usize(2 + idx),
                            start + BytePos::from_usize(2 + idx + c.len_utf8()),
                        );
                        // `_` separators are always allowed.
                        if c != '_' && c.to_digit(base).is_none() {
                            let guar =
                                self.dcx().emit_err(errors::InvalidDigitLiteral { span, base });
                            kind = token::Err(guar);
                        }
                    }
                }
                (kind, self.symbol_from_to(start, end))
            }
            rustc_lexer::LiteralKind::Float { base, empty_exponent } => {
                let mut kind = token::Float;
                if empty_exponent {
                    // NOTE(review): this span uses `self.pos` where the other
                    // arms use `end` — presumably equal here; confirm.
                    let span = self.mk_sp(start, self.pos);
                    let guar = self.dcx().emit_err(errors::EmptyExponentFloat { span });
                    kind = token::Err(guar);
                }
                // Only decimal float literals are supported.
                let base = match base {
                    Base::Hexadecimal => Some("hexadecimal"),
                    Base::Octal => Some("octal"),
                    Base::Binary => Some("binary"),
                    _ => None,
                };
                if let Some(base) = base {
                    let span = self.mk_sp(start, end);
                    let guar =
                        self.dcx().emit_err(errors::FloatLiteralUnsupportedBase { span, base });
                    kind = token::Err(guar)
                }
                (kind, self.symbol_from_to(start, end))
            }
        }
    }
826
827    #[inline]
828    fn src_index(&self, pos: BytePos) -> usize {
829        (pos - self.start_pos).to_usize()
830    }
831
832    /// Slice of the source text from `start` up to but excluding `self.pos`,
833    /// meaning the slice does not include the character `self.ch`.
834    fn str_from(&self, start: BytePos) -> &'src str {
835        self.str_from_to(start, self.pos)
836    }
837
    /// Interns the source text between `start` and `end` (exclusive) as a
    /// `Symbol`. Slicing semantics match `str_from_to`.
    fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol {
        debug!("taking an ident from {:?} to {:?}", start, end);
        Symbol::intern(self.str_from_to(start, end))
    }
843
844    /// Slice of the source text spanning from `start` up to but excluding `end`.
845    fn str_from_to(&self, start: BytePos, end: BytePos) -> &'src str {
846        &self.src[self.src_index(start)..self.src_index(end)]
847    }
848
849    /// Slice of the source text spanning from `start` until the end
850    fn str_from_to_end(&self, start: BytePos) -> &'src str {
851        &self.src[self.src_index(start)..]
852    }
853
854    fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
855        match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
856            Err(RawStrError::InvalidStarter { bad_char }) => {
857                self.report_non_started_raw_string(start, bad_char)
858            }
859            Err(RawStrError::NoTerminator { expected, found, possible_terminator_offset }) => self
860                .report_unterminated_raw_string(start, expected, possible_terminator_offset, found),
861            Err(RawStrError::TooManyDelimiters { found }) => {
862                self.report_too_many_hashes(start, found)
863            }
864            Ok(()) => panic!("no error found for supposedly invalid raw string literal"),
865        }
866    }
867
868    fn report_non_started_raw_string(&self, start: BytePos, bad_char: char) -> ! {
869        self.dcx()
870            .struct_span_fatal(
871                self.mk_sp(start, self.pos),
872                format!(
873                    "found invalid character; only `#` is allowed in raw string delimitation: {}",
874                    escaped_char(bad_char)
875                ),
876            )
877            .emit()
878    }
879
880    fn report_unterminated_raw_string(
881        &self,
882        start: BytePos,
883        n_hashes: u32,
884        possible_offset: Option<u32>,
885        found_terminators: u32,
886    ) -> ! {
887        let mut err =
888            self.dcx().struct_span_fatal(self.mk_sp(start, start), "unterminated raw string");
889        err.code(E0748);
890        err.span_label(self.mk_sp(start, start), "unterminated raw string");
891
892        if n_hashes > 0 {
893            err.note(format!(
894                "this raw string should be terminated with `\"{}`",
895                "#".repeat(n_hashes as usize)
896            ));
897        }
898
899        if let Some(possible_offset) = possible_offset {
900            let lo = start + BytePos(possible_offset);
901            let hi = lo + BytePos(found_terminators);
902            let span = self.mk_sp(lo, hi);
903            err.span_suggestion(
904                span,
905                "consider terminating the string here",
906                "#".repeat(n_hashes as usize),
907                Applicability::MaybeIncorrect,
908            );
909        }
910
911        err.emit()
912    }
913
    /// Reports an unterminated `/* ... ` (doc-)comment starting at `start`.
    ///
    /// Block comments nest in Rust, so a missing terminator is often caused by
    /// a nested comment. To help, we scan the comment body for `/*`/`*/` pairs
    /// and label the last fully-closed nested comment as the likely culprit.
    fn report_unterminated_block_comment(&self, start: BytePos, doc_style: Option<DocStyle>) {
        let msg = match doc_style {
            Some(_) => "unterminated block doc-comment",
            None => "unterminated block comment",
        };
        let last_bpos = self.pos;
        let mut err = self.dcx().struct_span_fatal(self.mk_sp(start, last_bpos), msg);
        err.code(E0758);
        // Stack of byte offsets of `/*` openers that are still unclosed.
        let mut nested_block_comment_open_idxs = vec![];
        // `(open_idx, close_idx)` of the last nested comment that was closed.
        let mut last_nested_block_comment_idxs = None;
        let mut content_chars = self.str_from(start).char_indices().peekable();

        // Pair delimiters by looking one character ahead: `/` followed by `*`
        // opens, `*` followed by `/` closes the most recent opener.
        while let Some((idx, current_char)) = content_chars.next() {
            match content_chars.peek() {
                Some((_, '*')) if current_char == '/' => {
                    nested_block_comment_open_idxs.push(idx);
                }
                Some((_, '/')) if current_char == '*' => {
                    last_nested_block_comment_idxs =
                        nested_block_comment_open_idxs.pop().map(|open_idx| (open_idx, idx));
                }
                _ => {}
            };
        }

        if let Some((nested_open_idx, nested_close_idx)) = last_nested_block_comment_idxs {
            // The `+ 2` spans cover the two-character `/*` and `*/` delimiters.
            err.span_label(self.mk_sp(start, start + BytePos(2)), msg)
                .span_label(
                    self.mk_sp(
                        start + BytePos(nested_open_idx as u32),
                        start + BytePos(nested_open_idx as u32 + 2),
                    ),
                    "...as last nested comment starts here, maybe you want to close this instead?",
                )
                .span_label(
                    self.mk_sp(
                        start + BytePos(nested_close_idx as u32),
                        start + BytePos(nested_close_idx as u32 + 2),
                    ),
                    "...and last nested comment terminates here.",
                );
        }

        err.emit();
    }
959
    // RFC 3101 introduced the idea of (reserved) prefixes. As of Rust 2021,
    // using a (unknown) prefix is an error. In earlier editions, however, they
    // only result in a (allowed by default) lint, and are treated as regular
    // identifier tokens.
    fn report_unknown_prefix(&self, start: BytePos) {
        let prefix_span = self.mk_sp(start, self.pos);
        let prefix = self.str_from_to(start, self.pos);
        // The edition is taken from the expansion context, so tokens produced
        // by a macro are judged by the macro's edition, not the call site's.
        let expn_data = prefix_span.ctxt().outer_expn_data();

        if expn_data.edition.at_least_rust_2021() {
            // In Rust 2021, this is a hard error.
            // `rb"..."` / `rc"..."` are common transpositions of `br` / `cr`.
            let sugg = if prefix == "rb" {
                Some(errors::UnknownPrefixSugg::UseBr(prefix_span))
            } else if prefix == "rc" {
                Some(errors::UnknownPrefixSugg::UseCr(prefix_span))
            } else if expn_data.is_root() {
                // Heuristic: a lifetime directly before `'` on the same line
                // suggests the user meant a string literal in single quotes.
                if self.cursor.first() == '\''
                    && let Some(start) = self.last_lifetime
                    && self.cursor.third() != '\''
                    && let end = self.mk_sp(self.pos, self.pos + BytePos(1))
                    && !self.psess.source_map().is_multiline(start.until(end))
                {
                    // FIXME: An "unclosed `char`" error will be emitted already in some cases,
                    // but it's hard to silence this error while not also silencing important cases
                    // too. We should use the error stashing machinery instead.
                    Some(errors::UnknownPrefixSugg::MeantStr { start, end })
                } else {
                    Some(errors::UnknownPrefixSugg::Whitespace(prefix_span.shrink_to_hi()))
                }
            } else {
                None
            };
            self.dcx().emit_err(errors::UnknownPrefix { span: prefix_span, prefix, sugg });
        } else {
            // Before Rust 2021, only emit a lint for migration.
            self.psess.buffer_lint(
                RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
                prefix_span,
                ast::CRATE_NODE_ID,
                BuiltinLintDiag::ReservedPrefix(prefix_span, prefix.to_string()),
            );
        }
    }
1003
    /// Detect guarded string literal syntax
    ///
    /// RFC 3593 reserved this syntax for future use. As of Rust 2024,
    /// using this syntax produces an error. In earlier editions, however, it
    /// only results in an (allowed by default) lint, and is treated as
    /// separate tokens.
    fn maybe_report_guarded_str(&mut self, start: BytePos, str_before: &'src str) -> TokenKind {
        let span = self.mk_sp(start, self.pos);
        let edition2024 = span.edition().at_least_rust_2024();

        // Zero-width span right after the leading `#`; used to suggest
        // inserting whitespace to separate the `#` from the string.
        let space_pos = start + BytePos(1);
        let space_span = self.mk_sp(space_pos, space_pos);

        // Re-lex from the start of the guarded token with a fresh cursor.
        let mut cursor = Cursor::new(str_before, FrontmatterAllowed::No);

        let (is_string, span, unterminated) = match cursor.guarded_double_quoted_string() {
            Some(rustc_lexer::GuardedStr { n_hashes, terminated, token_len }) => {
                let end = start + BytePos(token_len);
                let span = self.mk_sp(start, end);
                let str_start = start + BytePos(n_hashes);

                // Only consume the whole guarded string in Edition 2024+;
                // earlier editions roll back below and re-lex as `#` + tokens.
                if edition2024 {
                    self.cursor = cursor;
                    self.pos = end;
                }

                let unterminated = if terminated { None } else { Some(str_start) };

                (true, span, unterminated)
            }
            None => {
                // We should only get here in the `##+` case.
                debug_assert_eq!(self.str_from_to(start, start + BytePos(2)), "##");

                (false, span, None)
            }
        };
        if edition2024 {
            if let Some(str_start) = unterminated {
                // Only a fatal error if string is unterminated.
                self.dcx()
                    .struct_span_fatal(
                        self.mk_sp(str_start, self.pos),
                        "unterminated double quote string",
                    )
                    .with_code(E0765)
                    .emit()
            }

            // No suggestion inside macro-expanded code: the user can't edit it.
            let sugg = if span.from_expansion() {
                None
            } else {
                Some(errors::GuardedStringSugg(space_span))
            };

            // In Edition 2024 and later, emit a hard error.
            let err = if is_string {
                self.dcx().emit_err(errors::ReservedString { span, sugg })
            } else {
                self.dcx().emit_err(errors::ReservedMultihash { span, sugg })
            };

            token::Literal(token::Lit {
                kind: token::Err(err),
                symbol: self.symbol_from_to(start, self.pos),
                suffix: None,
            })
        } else {
            // Before Rust 2024, only emit a lint for migration.
            self.psess.buffer_lint(
                RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
                span,
                ast::CRATE_NODE_ID,
                BuiltinLintDiag::ReservedString { is_string, suggestion: space_span },
            );

            // For backwards compatibility, roll back to after just the first `#`
            // and return the `Pound` token.
            self.pos = start + BytePos(1);
            self.cursor = Cursor::new(&str_before[1..], FrontmatterAllowed::No);
            token::Pound
        }
    }
1087
1088    fn report_too_many_hashes(&self, start: BytePos, num: u32) -> ! {
1089        self.dcx().emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
1090    }
1091
    /// Cooks a quoted literal: strips `prefix_len`/`postfix_len` delimiter
    /// bytes (quotes, `b`/`c`/`r` prefixes, hashes), runs escape checking on
    /// the remaining content, and returns the literal kind (downgraded to
    /// `token::Err` on fatal escape errors) with its interned symbol.
    fn cook_quoted(
        &self,
        mut kind: token::LitKind,
        mode: Mode,
        start: BytePos,
        end: BytePos,
        prefix_len: u32,
        postfix_len: u32,
    ) -> (token::LitKind, Symbol) {
        let content_start = start + BytePos(prefix_len);
        let content_end = end - BytePos(postfix_len);
        let lit_content = self.str_from_to(content_start, content_end);
        check_for_errors(lit_content, mode, |range, err| {
            let span_with_quotes = self.mk_sp(start, end);
            // `range` is relative to the content; rebase it onto source positions.
            let (start, end) = (range.start as u32, range.end as u32);
            let lo = content_start + BytePos(start);
            let hi = lo + BytePos(end - start);
            let span = self.mk_sp(lo, hi);
            let is_fatal = err.is_fatal();
            // `emit_unescape_error` returns a guarantee only for fatal errors;
            // those turn the token into `Err` so downstream won't re-parse it.
            if let Some(guar) = emit_unescape_error(
                self.dcx(),
                lit_content,
                span_with_quotes,
                span,
                mode,
                range,
                err,
            ) {
                assert!(is_fatal);
                kind = token::Err(guar);
            }
        });

        // We normally exclude the quotes for the symbol, but for errors we
        // include it because it results in clearer error messages.
        let sym = if !matches!(kind, token::Err(_)) {
            Symbol::intern(lit_content)
        } else {
            self.symbol_from_to(start, end)
        };
        (kind, sym)
    }
1134}
1135
1136pub fn nfc_normalize(string: &str) -> Symbol {
1137    use unicode_normalization::{IsNormalized, UnicodeNormalization, is_nfc_quick};
1138    match is_nfc_quick(string.chars()) {
1139        IsNormalized::Yes => Symbol::intern(string),
1140        _ => {
1141            let normalized_str: String = string.chars().nfc().collect();
1142            Symbol::intern(&normalized_str)
1143        }
1144    }
1145}