rustdoc/passes/lint/
html_tags.rs

//! Detects invalid HTML (like an unclosed `<span>`) in doc comments.
2
3use std::iter::Peekable;
4use std::ops::Range;
5use std::str::CharIndices;
6
7use pulldown_cmark::{BrokenLink, Event, LinkType, Parser, Tag, TagEnd};
8use rustc_hir::HirId;
9use rustc_resolve::rustdoc::source_span_for_markdown_range;
10
11use crate::clean::*;
12use crate::core::DocContext;
13use crate::html::markdown::main_body_opts;
14
15pub(crate) fn visit_item(cx: &DocContext<'_>, item: &Item, hir_id: HirId, dox: &str) {
16    let tcx = cx.tcx;
17    let report_diag = |msg: String, range: &Range<usize>, is_open_tag: bool| {
18        let sp = match source_span_for_markdown_range(tcx, dox, range, &item.attrs.doc_strings) {
19            Some(sp) => sp,
20            None => item.attr_span(tcx),
21        };
22        tcx.node_span_lint(crate::lint::INVALID_HTML_TAGS, hir_id, sp, |lint| {
23            use rustc_lint_defs::Applicability;
24
25            lint.primary_message(msg);
26
27            // If a tag looks like `<this>`, it might actually be a generic.
28            // We don't try to detect stuff `<like, this>` because that's not valid HTML,
29            // and we don't try to detect stuff `<like this>` because that's not valid Rust.
30            let mut generics_end = range.end;
31            if let Some(Some(mut generics_start)) = (is_open_tag
32                && dox[..generics_end].ends_with('>'))
33            .then(|| extract_path_backwards(dox, range.start))
34            {
35                while generics_start != 0
36                    && generics_end < dox.len()
37                    && dox.as_bytes()[generics_start - 1] == b'<'
38                    && dox.as_bytes()[generics_end] == b'>'
39                {
40                    generics_end += 1;
41                    generics_start -= 1;
42                    if let Some(new_start) = extract_path_backwards(dox, generics_start) {
43                        generics_start = new_start;
44                    }
45                    if let Some(new_end) = extract_path_forward(dox, generics_end) {
46                        generics_end = new_end;
47                    }
48                }
49                if let Some(new_end) = extract_path_forward(dox, generics_end) {
50                    generics_end = new_end;
51                }
52                let generics_sp = match source_span_for_markdown_range(
53                    tcx,
54                    dox,
55                    &(generics_start..generics_end),
56                    &item.attrs.doc_strings,
57                ) {
58                    Some(sp) => sp,
59                    None => item.attr_span(tcx),
60                };
61                // Sometimes, we only extract part of a path. For example, consider this:
62                //
63                //     <[u32] as IntoIter<u32>>::Item
64                //                       ^^^^^ unclosed HTML tag `u32`
65                //
66                // We don't have any code for parsing fully-qualified trait paths.
67                // In theory, we could add it, but doing it correctly would require
68                // parsing the entire path grammar, which is problematic because of
69                // overlap between the path grammar and Markdown.
70                //
71                // The example above shows that ambiguity. Is `[u32]` intended to be an
72                // intra-doc link to the u32 primitive, or is it intended to be a slice?
73                //
74                // If the below conditional were removed, we would suggest this, which is
75                // not what the user probably wants.
76                //
77                //     <[u32] as `IntoIter<u32>`>::Item
78                //
79                // We know that the user actually wants to wrap the whole thing in a code
80                // block, but the only reason we know that is because `u32` does not, in
81                // fact, implement IntoIter. If the example looks like this:
82                //
83                //     <[Vec<i32>] as IntoIter<i32>::Item
84                //
85                // The ideal fix would be significantly different.
86                if (generics_start > 0 && dox.as_bytes()[generics_start - 1] == b'<')
87                    || (generics_end < dox.len() && dox.as_bytes()[generics_end] == b'>')
88                {
89                    return;
90                }
91                // multipart form is chosen here because ``Vec<i32>`` would be confusing.
92                lint.multipart_suggestion(
93                    "try marking as source code",
94                    vec![
95                        (generics_sp.shrink_to_lo(), String::from("`")),
96                        (generics_sp.shrink_to_hi(), String::from("`")),
97                    ],
98                    Applicability::MaybeIncorrect,
99                );
100            }
101        });
102    };
103
104    let mut tags = Vec::new();
105    let mut is_in_comment = None;
106    let mut in_code_block = false;
107
108    let link_names = item.link_names(&cx.cache);
109
110    let mut replacer = |broken_link: BrokenLink<'_>| {
111        if let Some(link) =
112            link_names.iter().find(|link| *link.original_text == *broken_link.reference)
113        {
114            Some((link.href.as_str().into(), link.new_text.to_string().into()))
115        } else if matches!(&broken_link.link_type, LinkType::Reference | LinkType::ReferenceUnknown)
116        {
117            // If the link is shaped [like][this], suppress any broken HTML in the [this] part.
118            // The `broken_intra_doc_links` will report typos in there anyway.
119            Some((
120                broken_link.reference.to_string().into(),
121                broken_link.reference.to_string().into(),
122            ))
123        } else {
124            None
125        }
126    };
127
128    let p = Parser::new_with_broken_link_callback(dox, main_body_opts(), Some(&mut replacer))
129        .into_offset_iter();
130
131    for (event, range) in p {
132        match event {
133            Event::Start(Tag::CodeBlock(_)) => in_code_block = true,
134            Event::Html(text) | Event::InlineHtml(text) if !in_code_block => {
135                extract_tags(&mut tags, &text, range, &mut is_in_comment, &report_diag)
136            }
137            Event::End(TagEnd::CodeBlock) => in_code_block = false,
138            _ => {}
139        }
140    }
141
142    for (tag, range) in tags.iter().filter(|(t, _)| {
143        let t = t.to_lowercase();
144        !ALLOWED_UNCLOSED.contains(&t.as_str())
145    }) {
146        report_diag(format!("unclosed HTML tag `{tag}`"), range, true);
147    }
148
149    if let Some(range) = is_in_comment {
150        report_diag("Unclosed HTML comment".to_string(), &range, false);
151    }
152}
153
/// Lower-case names of the tags that may be left without a closing tag.
///
/// Tags in this list are never reported as unclosed, and writing one of them in
/// self-closing form (`<br/>`) is accepted.
const ALLOWED_UNCLOSED: &[&str] = &[
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "keygen",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
];
158
159fn drop_tag(
160    tags: &mut Vec<(String, Range<usize>)>,
161    tag_name: String,
162    range: Range<usize>,
163    f: &impl Fn(String, &Range<usize>, bool),
164) {
165    let tag_name_low = tag_name.to_lowercase();
166    if let Some(pos) = tags.iter().rposition(|(t, _)| t.to_lowercase() == tag_name_low) {
167        // If the tag is nested inside a "<script>" or a "<style>" tag, no warning should
168        // be emitted.
169        let should_not_warn = tags.iter().take(pos + 1).any(|(at, _)| {
170            let at = at.to_lowercase();
171            at == "script" || at == "style"
172        });
173        for (last_tag_name, last_tag_span) in tags.drain(pos + 1..) {
174            if should_not_warn {
175                continue;
176            }
177            let last_tag_name_low = last_tag_name.to_lowercase();
178            if ALLOWED_UNCLOSED.contains(&last_tag_name_low.as_str()) {
179                continue;
180            }
181            // `tags` is used as a queue, meaning that everything after `pos` is included inside it.
182            // So `<h2><h3></h2>` will look like `["h2", "h3"]`. So when closing `h2`, we will still
183            // have `h3`, meaning the tag wasn't closed as it should have.
184            f(format!("unclosed HTML tag `{last_tag_name}`"), &last_tag_span, true);
185        }
186        // Remove the `tag_name` that was originally closed
187        tags.pop();
188    } else {
189        // It can happen for example in this case: `<h2></script></h2>` (the `h2` tag isn't required
190        // but it helps for the visualization).
191        f(format!("unopened HTML tag `{tag_name}`"), &range, false);
192    }
193}
194
195fn extract_path_backwards(text: &str, end_pos: usize) -> Option<usize> {
196    use rustc_lexer::{is_id_continue, is_id_start};
197    let mut current_pos = end_pos;
198    loop {
199        if current_pos >= 2 && text[..current_pos].ends_with("::") {
200            current_pos -= 2;
201        }
202        let new_pos = text[..current_pos]
203            .char_indices()
204            .rev()
205            .take_while(|(_, c)| is_id_start(*c) || is_id_continue(*c))
206            .reduce(|_accum, item| item)
207            .and_then(|(new_pos, c)| is_id_start(c).then_some(new_pos));
208        if let Some(new_pos) = new_pos
209            && current_pos != new_pos
210        {
211            current_pos = new_pos;
212            continue;
213        }
214        break;
215    }
216    if current_pos == end_pos { None } else { Some(current_pos) }
217}
218
219fn extract_path_forward(text: &str, start_pos: usize) -> Option<usize> {
220    use rustc_lexer::{is_id_continue, is_id_start};
221    let mut current_pos = start_pos;
222    loop {
223        if current_pos < text.len() && text[current_pos..].starts_with("::") {
224            current_pos += 2;
225        } else {
226            break;
227        }
228        let mut chars = text[current_pos..].chars();
229        if let Some(c) = chars.next() {
230            if is_id_start(c) {
231                current_pos += c.len_utf8();
232            } else {
233                break;
234            }
235        }
236        for c in chars {
237            if is_id_continue(c) {
238                current_pos += c.len_utf8();
239            } else {
240                break;
241            }
242        }
243    }
244    if current_pos == start_pos { None } else { Some(current_pos) }
245}
246
/// Tells whether `c` may be appended to an HTML tag name.
///
/// `is_empty` says whether the name collected so far is empty, i.e. whether `c` would be
/// its first character.
fn is_valid_for_html_tag_name(c: char, is_empty: bool) -> bool {
    // https://spec.commonmark.org/0.30/#raw-html
    //
    // > A tag name consists of an ASCII letter followed by zero or more ASCII letters, digits, or
    // > hyphens (-).
    match c {
        'a'..='z' | 'A'..='Z' => true,
        '-' | '0'..='9' => !is_empty,
        _ => false,
    }
}
254
255fn extract_html_tag(
256    tags: &mut Vec<(String, Range<usize>)>,
257    text: &str,
258    range: &Range<usize>,
259    start_pos: usize,
260    iter: &mut Peekable<CharIndices<'_>>,
261    f: &impl Fn(String, &Range<usize>, bool),
262) {
263    let mut tag_name = String::new();
264    let mut is_closing = false;
265    let mut prev_pos = start_pos;
266
267    loop {
268        let (pos, c) = match iter.peek() {
269            Some((pos, c)) => (*pos, *c),
270            // In case we reached the of the doc comment, we want to check that it's an
271            // unclosed HTML tag. For example "/// <h3".
272            None => (prev_pos, '\0'),
273        };
274        prev_pos = pos;
275        // Checking if this is a closing tag (like `</a>` for `<a>`).
276        if c == '/' && tag_name.is_empty() {
277            is_closing = true;
278        } else if is_valid_for_html_tag_name(c, tag_name.is_empty()) {
279            tag_name.push(c);
280        } else {
281            if !tag_name.is_empty() {
282                let mut r = Range { start: range.start + start_pos, end: range.start + pos };
283                if c == '>' {
284                    // In case we have a tag without attribute, we can consider the span to
285                    // refer to it fully.
286                    r.end += 1;
287                }
288                if is_closing {
289                    // In case we have "</div >" or even "</div         >".
290                    if c != '>' {
291                        if !c.is_whitespace() {
292                            // It seems like it's not a valid HTML tag.
293                            break;
294                        }
295                        let mut found = false;
296                        for (new_pos, c) in text[pos..].char_indices() {
297                            if !c.is_whitespace() {
298                                if c == '>' {
299                                    r.end = range.start + new_pos + 1;
300                                    found = true;
301                                }
302                                break;
303                            }
304                        }
305                        if !found {
306                            break;
307                        }
308                    }
309                    drop_tag(tags, tag_name, r, f);
310                } else {
311                    let mut is_self_closing = false;
312                    let mut quote_pos = None;
313                    if c != '>' {
314                        let mut quote = None;
315                        let mut after_eq = false;
316                        for (i, c) in text[pos..].char_indices() {
317                            if !c.is_whitespace() {
318                                if let Some(q) = quote {
319                                    if c == q {
320                                        quote = None;
321                                        quote_pos = None;
322                                        after_eq = false;
323                                    }
324                                } else if c == '>' {
325                                    break;
326                                } else if c == '/' && !after_eq {
327                                    is_self_closing = true;
328                                } else {
329                                    if is_self_closing {
330                                        is_self_closing = false;
331                                    }
332                                    if (c == '"' || c == '\'') && after_eq {
333                                        quote = Some(c);
334                                        quote_pos = Some(pos + i);
335                                    } else if c == '=' {
336                                        after_eq = true;
337                                    }
338                                }
339                            } else if quote.is_none() {
340                                after_eq = false;
341                            }
342                        }
343                    }
344                    if let Some(quote_pos) = quote_pos {
345                        let qr = Range { start: quote_pos, end: quote_pos };
346                        f(
347                            format!("unclosed quoted HTML attribute on tag `{tag_name}`"),
348                            &qr,
349                            false,
350                        );
351                    }
352                    if is_self_closing {
353                        // https://html.spec.whatwg.org/#parse-error-non-void-html-element-start-tag-with-trailing-solidus
354                        let valid = ALLOWED_UNCLOSED.contains(&&tag_name[..])
355                            || tags.iter().take(pos + 1).any(|(at, _)| {
356                                let at = at.to_lowercase();
357                                at == "svg" || at == "math"
358                            });
359                        if !valid {
360                            f(format!("invalid self-closing HTML tag `{tag_name}`"), &r, false);
361                        }
362                    } else {
363                        tags.push((tag_name, r));
364                    }
365                }
366            }
367            break;
368        }
369        iter.next();
370    }
371}
372
373fn extract_tags(
374    tags: &mut Vec<(String, Range<usize>)>,
375    text: &str,
376    range: Range<usize>,
377    is_in_comment: &mut Option<Range<usize>>,
378    f: &impl Fn(String, &Range<usize>, bool),
379) {
380    let mut iter = text.char_indices().peekable();
381
382    while let Some((start_pos, c)) = iter.next() {
383        if is_in_comment.is_some() {
384            if text[start_pos..].starts_with("-->") {
385                *is_in_comment = None;
386            }
387        } else if c == '<' {
388            if text[start_pos..].starts_with("<!--") {
389                // We skip the "!--" part. (Once `advance_by` is stable, might be nice to use it!)
390                iter.next();
391                iter.next();
392                iter.next();
393                *is_in_comment = Some(Range {
394                    start: range.start + start_pos,
395                    end: range.start + start_pos + 3,
396                });
397            } else {
398                extract_html_tag(tags, text, &range, start_pos, &mut iter, f);
399            }
400        }
401    }
402}