Skip to main content

rustdoc/passes/lint/
html_tags.rs

1//! Detects invalid HTML (like an unclosed `<span>`) in doc comments.
2
3use std::borrow::Cow;
4use std::iter::Peekable;
5use std::ops::Range;
6use std::str::CharIndices;
7
8use itertools::Itertools as _;
9use rustc_hir::HirId;
10use rustc_resolve::rustdoc::pulldown_cmark::{BrokenLink, Event, LinkType, Parser, Tag, TagEnd};
11use rustc_resolve::rustdoc::source_span_for_markdown_range;
12
13use crate::clean::*;
14use crate::core::DocContext;
15use crate::html::markdown::main_body_opts;
16
17pub(crate) fn visit_item(cx: &DocContext<'_>, item: &Item, hir_id: HirId, dox: &str) {
18    let tcx = cx.tcx;
19    let report_diag = |msg: String, range: &Range<usize>, is_open_tag: bool| {
20        let sp = match source_span_for_markdown_range(tcx, dox, range, &item.attrs.doc_strings) {
21            Some((sp, _)) => sp,
22            None => item.attr_span(tcx),
23        };
24        tcx.emit_node_span_lint(
25            crate::lint::INVALID_HTML_TAGS,
26            hir_id,
27            sp,
28            rustc_errors::DiagDecorator(|lint| {
29                use rustc_lint_defs::Applicability;
30
31                lint.primary_message(msg);
32
33                // If a tag looks like `<this>`, it might actually be a generic.
34                // We don't try to detect stuff `<like, this>` because that's not valid HTML,
35                // and we don't try to detect stuff `<like this>` because that's not valid Rust.
36                let mut generics_end = range.end;
37                if is_open_tag
38                    && dox[..generics_end].ends_with('>')
39                    && let Some(mut generics_start) = extract_path_backwards(dox, range.start)
40                {
41                    while generics_start != 0
42                        && generics_end < dox.len()
43                        && dox.as_bytes()[generics_start - 1] == b'<'
44                        && dox.as_bytes()[generics_end] == b'>'
45                    {
46                        generics_end += 1;
47                        generics_start -= 1;
48                        if let Some(new_start) = extract_path_backwards(dox, generics_start) {
49                            generics_start = new_start;
50                        }
51                        if let Some(new_end) = extract_path_forward(dox, generics_end) {
52                            generics_end = new_end;
53                        }
54                    }
55                    if let Some(new_end) = extract_path_forward(dox, generics_end) {
56                        generics_end = new_end;
57                    }
58                    let generics_sp = match source_span_for_markdown_range(
59                        tcx,
60                        dox,
61                        &(generics_start..generics_end),
62                        &item.attrs.doc_strings,
63                    ) {
64                        Some((sp, _)) => sp,
65                        None => item.attr_span(tcx),
66                    };
67                    // Sometimes, we only extract part of a path. For example, consider this:
68                    //
69                    //     <[u32] as IntoIter<u32>>::Item
70                    //                       ^^^^^ unclosed HTML tag `u32`
71                    //
72                    // We don't have any code for parsing fully-qualified trait paths.
73                    // In theory, we could add it, but doing it correctly would require
74                    // parsing the entire path grammar, which is problematic because of
75                    // overlap between the path grammar and Markdown.
76                    //
77                    // The example above shows that ambiguity. Is `[u32]` intended to be an
78                    // intra-doc link to the u32 primitive, or is it intended to be a slice?
79                    //
80                    // If the below conditional were removed, we would suggest this, which is
81                    // not what the user probably wants.
82                    //
83                    //     <[u32] as `IntoIter<u32>`>::Item
84                    //
85                    // We know that the user actually wants to wrap the whole thing in a code
86                    // block, but the only reason we know that is because `u32` does not, in
87                    // fact, implement IntoIter. If the example looks like this:
88                    //
89                    //     <[Vec<i32>] as IntoIter<i32>::Item
90                    //
91                    // The ideal fix would be significantly different.
92                    if (generics_start > 0 && dox.as_bytes()[generics_start - 1] == b'<')
93                        || (generics_end < dox.len() && dox.as_bytes()[generics_end] == b'>')
94                    {
95                        return;
96                    }
97                    // multipart form is chosen here because ``Vec<i32>`` would be confusing.
98                    lint.multipart_suggestion(
99                        "try marking as source code",
100                        vec![
101                            (generics_sp.shrink_to_lo(), String::from("`")),
102                            (generics_sp.shrink_to_hi(), String::from("`")),
103                        ],
104                        Applicability::MaybeIncorrect,
105                    );
106                }
107            }),
108        );
109    };
110
111    let mut tagp = TagParser::new();
112    let mut is_in_comment = None;
113    let mut in_code_block = false;
114
115    let link_names = item.link_names(&cx.cache);
116
117    let mut replacer = |broken_link: BrokenLink<'_>| {
118        if let Some(link) =
119            link_names.iter().find(|link| *link.original_text == *broken_link.reference)
120        {
121            Some((link.href.as_str().into(), link.new_text.to_string().into()))
122        } else if matches!(&broken_link.link_type, LinkType::Reference | LinkType::ReferenceUnknown)
123        {
124            // If the link is shaped [like][this], suppress any broken HTML in the [this] part.
125            // The `broken_intra_doc_links` will report typos in there anyway.
126            Some((
127                broken_link.reference.to_string().into(),
128                broken_link.reference.to_string().into(),
129            ))
130        } else {
131            None
132        }
133    };
134
135    let p = Parser::new_with_broken_link_callback(dox, main_body_opts(), Some(&mut replacer))
136        .into_offset_iter()
137        .coalesce(|a, b| {
138            // for some reason, pulldown-cmark splits html blocks into separate events for each line.
139            // we undo this, in order to handle multi-line tags.
140            match (a, b) {
141                ((Event::Html(_), ra), (Event::Html(_), rb)) if ra.end == rb.start => {
142                    let merged = ra.start..rb.end;
143                    Ok((Event::Html(Cow::Borrowed(&dox[merged.clone()]).into()), merged))
144                }
145                x => Err(x),
146            }
147        });
148
149    for (event, range) in p {
150        match event {
151            Event::Start(Tag::CodeBlock(_)) => in_code_block = true,
152            Event::Html(text) | Event::InlineHtml(text) if !in_code_block => {
153                tagp.extract_tags(&text, range, &mut is_in_comment, &report_diag)
154            }
155            Event::End(TagEnd::CodeBlock) => in_code_block = false,
156            _ => {}
157        }
158    }
159
160    if let Some(range) = is_in_comment {
161        report_diag("Unclosed HTML comment".to_string(), &range, false);
162    } else if let &Some(quote_pos) = &tagp.quote_pos {
163        let qr = Range { start: quote_pos, end: quote_pos };
164        report_diag(
165            format!("unclosed quoted HTML attribute on tag `{}`", &tagp.tag_name),
166            &qr,
167            false,
168        );
169    } else {
170        if !tagp.tag_name.is_empty() {
171            report_diag(
172                format!("incomplete HTML tag `{}`", &tagp.tag_name),
173                &(tagp.tag_start_pos..dox.len()),
174                false,
175            );
176        }
177        for (tag, range) in tagp.tags.iter().filter(|(t, _)| {
178            let t = t.to_lowercase();
179            !is_implicitly_self_closing(&t)
180        }) {
181            report_diag(format!("unclosed HTML tag `{tag}`"), range, true);
182        }
183    }
184}
185
/// Tag names that are treated as self-closing when no explicit closing tag is present.
///
/// Entries are lowercase; callers are expected to lowercase a name before looking it up.
const ALLOWED_UNCLOSED: &[&str] = &[
    "area", "base", "br", "col", "embed", "hr", "img", "input", "keygen", "link", "meta", "param",
    "source", "track", "wbr",
];

/// Returns `true` if `tag_name` is listed in [`ALLOWED_UNCLOSED`] (exact, case-sensitive match).
///
/// This is what permits `<img>` without a closing tag; an unterminated `<img` is a separate
/// problem that this check does not excuse.
fn is_implicitly_self_closing(tag_name: &str) -> bool {
    ALLOWED_UNCLOSED.iter().any(|&allowed| allowed == tag_name)
}
196
197fn extract_path_backwards(text: &str, end_pos: usize) -> Option<usize> {
198    use rustc_lexer::{is_id_continue, is_id_start};
199    let mut current_pos = end_pos;
200    loop {
201        if current_pos >= 2 && text[..current_pos].ends_with("::") {
202            current_pos -= 2;
203        }
204        let new_pos = text[..current_pos]
205            .char_indices()
206            .rev()
207            .take_while(|(_, c)| is_id_start(*c) || is_id_continue(*c))
208            .reduce(|_accum, item| item)
209            .and_then(|(new_pos, c)| is_id_start(c).then_some(new_pos));
210        if let Some(new_pos) = new_pos
211            && current_pos != new_pos
212        {
213            current_pos = new_pos;
214            continue;
215        }
216        break;
217    }
218    if current_pos == end_pos { None } else { Some(current_pos) }
219}
220
221fn extract_path_forward(text: &str, start_pos: usize) -> Option<usize> {
222    use rustc_lexer::{is_id_continue, is_id_start};
223    let mut current_pos = start_pos;
224    loop {
225        if current_pos < text.len() && text[current_pos..].starts_with("::") {
226            current_pos += 2;
227        } else {
228            break;
229        }
230        let mut chars = text[current_pos..].chars();
231        if let Some(c) = chars.next() {
232            if is_id_start(c) {
233                current_pos += c.len_utf8();
234            } else {
235                break;
236            }
237        }
238        for c in chars {
239            if is_id_continue(c) {
240                current_pos += c.len_utf8();
241            } else {
242                break;
243            }
244        }
245    }
246    if current_pos == start_pos { None } else { Some(current_pos) }
247}
248
/// Returns whether `c` may be appended to an HTML tag name.
///
/// `is_empty` says whether the name collected so far is empty, i.e. whether `c` would be the
/// first character. Per <https://spec.commonmark.org/0.30/#raw-html>:
///
/// > A tag name consists of an ASCII letter followed by zero or more ASCII letters, digits, or
/// > hyphens (-).
fn is_valid_for_html_tag_name(c: char, is_empty: bool) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' => true,
        // Digits and hyphens are fine anywhere except in first position.
        '0'..='9' | '-' => !is_empty,
        _ => false,
    }
}
256
/// Incremental HTML tag parser used to check that tags in doc comments are well-formed.
///
/// The state lives in a struct because a single tag may be split across several text chunks.
#[derive(Debug, Clone)]
struct TagParser {
    /// Opening tags seen so far that have not been closed yet, oldest first, each with the
    /// range recorded for it.
    tags: Vec<(String, Range<usize>)>,
    /// Name of the tag currently being parsed.
    ///
    /// The `<` and the name of a tag must be on the same line with no whitespace in between,
    /// so an empty string here means that no tag is being parsed.
    tag_name: String,
    /// Position of the `<` that started the tag currently being parsed.
    tag_start_pos: usize,
    /// Whether the tag currently being parsed is a closing tag such as `</a>`.
    is_closing: bool,
    /// `true` once parsing is inside a tag but past its name.
    in_attrs: bool,
    /// The quote character of the quoted attribute value being parsed, if any.
    ///
    /// Kept in the struct because HTML5 allows newlines inside quoted attribute values.
    quote: Option<char>,
    /// Position recorded for the opening quote; `Some` exactly when `quote` is `Some`.
    quote_pos: Option<usize>,
    /// Whether an `=` has just been seen, so that a following quote starts an attribute value.
    after_eq: bool,
}
277
278impl TagParser {
279    fn new() -> Self {
280        Self {
281            tags: Vec::new(),
282            tag_name: String::with_capacity(8),
283            tag_start_pos: 0,
284            is_closing: false,
285            in_attrs: false,
286            quote: None,
287            quote_pos: None,
288            after_eq: false,
289        }
290    }
291
292    fn drop_tag(&mut self, range: Range<usize>, f: &impl Fn(String, &Range<usize>, bool)) {
293        let tag_name_low = self.tag_name.to_lowercase();
294        if let Some(pos) = self.tags.iter().rposition(|(t, _)| t.to_lowercase() == tag_name_low) {
295            // If the tag is nested inside a "<script>" or a "<style>" tag, no warning should
296            // be emitted.
297            let should_not_warn = self.tags.iter().take(pos + 1).any(|(at, _)| {
298                let at = at.to_lowercase();
299                at == "script" || at == "style"
300            });
301            for (last_tag_name, last_tag_span) in self.tags.drain(pos + 1..) {
302                if should_not_warn {
303                    continue;
304                }
305                let last_tag_name_low = last_tag_name.to_lowercase();
306                if is_implicitly_self_closing(&last_tag_name_low) {
307                    continue;
308                }
309                // `tags` is used as a queue, meaning that everything after `pos` is included inside it.
310                // So `<h2><h3></h2>` will look like `["h2", "h3"]`. So when closing `h2`, we will still
311                // have `h3`, meaning the tag wasn't closed as it should have.
312                f(format!("unclosed HTML tag `{last_tag_name}`"), &last_tag_span, true);
313            }
314            // Remove the `tag_name` that was originally closed
315            self.tags.pop();
316        } else {
317            // It can happen for example in this case: `<h2></script></h2>` (the `h2` tag isn't required
318            // but it helps for the visualization).
319            f(format!("unopened HTML tag `{}`", &self.tag_name), &range, false);
320        }
321    }
322
323    /// Handle a `<` that appeared while parsing a tag.
324    fn handle_lt_in_tag(
325        &mut self,
326        range: Range<usize>,
327        lt_pos: usize,
328        f: &impl Fn(String, &Range<usize>, bool),
329    ) {
330        let global_pos = range.start + lt_pos;
331        // is this check needed?
332        if global_pos == self.tag_start_pos {
333            // `<` is in the tag because it is the start.
334            return;
335        }
336        // tried to start a new tag while in a tag
337        f(
338            format!("incomplete HTML tag `{}`", &self.tag_name),
339            &(self.tag_start_pos..global_pos),
340            false,
341        );
342        self.tag_parsed();
343    }
344
345    fn extract_html_tag(
346        &mut self,
347        text: &str,
348        range: &Range<usize>,
349        start_pos: usize,
350        iter: &mut Peekable<CharIndices<'_>>,
351        f: &impl Fn(String, &Range<usize>, bool),
352    ) {
353        let mut prev_pos = start_pos;
354
355        'outer_loop: loop {
356            let (pos, c) = match iter.peek() {
357                Some((pos, c)) => (*pos, *c),
358                // In case we reached the of the doc comment, we want to check that it's an
359                // unclosed HTML tag. For example "/// <h3".
360                None if self.tag_name.is_empty() => (prev_pos, '\0'),
361                None => break,
362            };
363            prev_pos = pos;
364            if c == '/' && self.tag_name.is_empty() {
365                // Checking if this is a closing tag (like `</a>` for `<a>`).
366                self.is_closing = true;
367            } else if !self.in_attrs && is_valid_for_html_tag_name(c, self.tag_name.is_empty()) {
368                self.tag_name.push(c);
369            } else {
370                if !self.tag_name.is_empty() {
371                    self.in_attrs = true;
372                    // range of the entire tag within dox
373                    let mut r = Range { start: range.start + start_pos, end: range.start + pos };
374                    if c == '>' {
375                        // In case we have a tag without attribute, we can consider the span to
376                        // refer to it fully.
377                        r.end += 1;
378                    }
379                    if self.is_closing {
380                        // In case we have "</div >" or even "</div         >".
381                        if c != '>' {
382                            if !c.is_whitespace() {
383                                // It seems like it's not a valid HTML tag.
384                                break;
385                            }
386                            let mut found = false;
387                            for (new_pos, c) in text[pos..].char_indices() {
388                                if !c.is_whitespace() {
389                                    if c == '>' {
390                                        r.end = range.start + pos + new_pos + 1;
391                                        found = true;
392                                    } else if c == '<' {
393                                        self.handle_lt_in_tag(range.clone(), pos + new_pos, f);
394                                    }
395                                    break;
396                                }
397                            }
398                            if !found {
399                                break 'outer_loop;
400                            }
401                        }
402                        self.drop_tag(r, f);
403                        self.tag_parsed();
404                    } else {
405                        self.extract_opening_tag(text, range, r, pos, c, iter, f)
406                    }
407                }
408                break;
409            }
410            iter.next();
411        }
412    }
413
414    fn extract_opening_tag(
415        &mut self,
416        text: &str,
417        range: &Range<usize>,
418        r: Range<usize>,
419        pos: usize,
420        c: char,
421        iter: &mut Peekable<CharIndices<'_>>,
422        f: &impl Fn(String, &Range<usize>, bool),
423    ) {
424        // we can store this as a local, since html5 does require the `/` and `>`
425        // to not be separated by whitespace.
426        let mut is_self_closing = false;
427        if c != '>' {
428            'parse_til_gt: {
429                for (i, c) in text[pos..].char_indices() {
430                    if !c.is_whitespace() {
431                        debug_assert_eq!(self.quote_pos.is_some(), self.quote.is_some());
432                        if let Some(q) = self.quote {
433                            if c == q {
434                                self.quote = None;
435                                self.quote_pos = None;
436                                self.after_eq = false;
437                            }
438                        } else if c == '>' {
439                            break 'parse_til_gt;
440                        } else if c == '<' {
441                            self.handle_lt_in_tag(range.clone(), pos + i, f);
442                        } else if c == '/' && !self.after_eq {
443                            is_self_closing = true;
444                        } else {
445                            if is_self_closing {
446                                is_self_closing = false;
447                            }
448                            if (c == '"' || c == '\'') && self.after_eq {
449                                self.quote = Some(c);
450                                self.quote_pos = Some(pos + i);
451                            } else if c == '=' {
452                                self.after_eq = true;
453                            }
454                        }
455                    } else if self.quote.is_none() {
456                        self.after_eq = false;
457                    }
458                    if !is_self_closing && !self.tag_name.is_empty() {
459                        iter.next();
460                    }
461                }
462                // if we've run out of text but still haven't found a `>`,
463                // return early without calling `tag_parsed` or emitting lints.
464                // this allows us to either find the `>` in a later event
465                // or emit a lint about it being missing.
466                return;
467            }
468        }
469        if is_self_closing {
470            // https://html.spec.whatwg.org/#parse-error-non-void-html-element-start-tag-with-trailing-solidus
471            let valid = ALLOWED_UNCLOSED.contains(&&self.tag_name[..])
472                || self.tags.iter().take(pos + 1).any(|(at, _)| {
473                    let at = at.to_lowercase();
474                    at == "svg" || at == "math"
475                });
476            if !valid {
477                f(format!("invalid self-closing HTML tag `{}`", self.tag_name), &r, false);
478            }
479        } else if !self.tag_name.is_empty() {
480            self.tags.push((std::mem::take(&mut self.tag_name), r));
481        }
482        self.tag_parsed();
483    }
484    /// Finished parsing a tag, reset related data.
485    fn tag_parsed(&mut self) {
486        self.tag_name.clear();
487        self.is_closing = false;
488        self.in_attrs = false;
489    }
490
491    fn extract_tags(
492        &mut self,
493        text: &str,
494        range: Range<usize>,
495        is_in_comment: &mut Option<Range<usize>>,
496        f: &impl Fn(String, &Range<usize>, bool),
497    ) {
498        let mut iter = text.char_indices().peekable();
499        let mut prev_pos = 0;
500        loop {
501            if self.quote.is_some() {
502                debug_assert!(self.in_attrs && self.quote_pos.is_some());
503            }
504            if self.in_attrs
505                && let Some(&(start_pos, _)) = iter.peek()
506            {
507                self.extract_html_tag(text, &range, start_pos, &mut iter, f);
508                // if no progress is being made, move forward forcefully.
509                if prev_pos == start_pos {
510                    iter.next();
511                }
512                prev_pos = start_pos;
513                continue;
514            }
515            let Some((start_pos, c)) = iter.next() else { break };
516            if is_in_comment.is_some() {
517                if text[start_pos..].starts_with("-->") {
518                    *is_in_comment = None;
519                }
520            } else if c == '<' {
521                // "<!--" is a valid attribute name under html5, so don't treat it as a comment if we're in a tag.
522                if self.tag_name.is_empty() && text[start_pos..].starts_with("<!--") {
523                    // We skip the "!--" part. (Once `advance_by` is stable, might be nice to use it!)
524                    iter.next();
525                    iter.next();
526                    iter.next();
527                    *is_in_comment = Some(Range {
528                        start: range.start + start_pos,
529                        end: range.start + start_pos + 4,
530                    });
531                } else {
532                    if self.tag_name.is_empty() {
533                        self.tag_start_pos = range.start + start_pos;
534                    }
535                    self.extract_html_tag(text, &range, start_pos, &mut iter, f);
536                }
537            } else if !self.tag_name.is_empty() {
538                // partially inside html tag that spans across events
539                self.extract_html_tag(text, &range, start_pos, &mut iter, f);
540            }
541        }
542    }
543}
544
545#[cfg(test)]
546mod tests;