rustc_errors/markdown/
parse.rs

1use std::{iter, mem, str};
2
3use crate::markdown::{MdStream, MdTree};
4
5/// Short aliases that we can use in match patterns. If an end pattern is not
6/// included, this type may be variable
7const ANC_E: &[u8] = b">";
8const ANC_S: &[u8] = b"<";
9const BRK: &[u8] = b"---";
10const CBK: &[u8] = b"```";
11const CIL: &[u8] = b"`";
12const CMT_E: &[u8] = b"-->";
13const CMT_S: &[u8] = b"<!--";
14const EMP_U: &[u8] = b"_";
15const EMP_A: &[u8] = b"*";
16const HDG: &[u8] = b"#";
17const LNK_CHARS: &str = "$-_.+!*'()/&?=:%";
18const LNK_E: &[u8] = b"]";
19const LNK_S: &[u8] = b"[";
20const STG_U: &[u8] = b"__";
21const STG_A: &[u8] = b"**";
22const STK: &[u8] = b"~~";
23
24/// Pattern replacements
25const REPLACEMENTS: &[(&str, &str)] = &[
26    ("(c)", "©"),
27    ("(C)", "©"),
28    ("(r)", "®"),
29    ("(R)", "®"),
30    ("(tm)", "â„¢"),
31    ("(TM)", "â„¢"),
32    (":crab:", "🦀"),
33    ("\n", " "),
34];
35
36/// `(extracted, remaining)`
37type Parsed<'a> = (MdTree<'a>, &'a [u8]);
38/// Output of a parse function
39type ParseResult<'a> = Option<Parsed<'a>>;
40
41/// Parsing context
42#[derive(Clone, Copy, Debug, PartialEq)]
43// The default values are the most common setting for non top-level parsing: not top block, not at
44// line start (yes leading whitespace, not escaped).
45struct Context {
46    /// If true, we are at a the topmost level (not recursing a nested tt)
47    top_block: bool = false,
48    /// Previous character
49    prev: Prev = Prev::Whitespace,
50}
51
52/// Character class preceding this one
53#[derive(Clone, Copy, Debug, PartialEq)]
54enum Prev {
55    Newline,
56    /// Whitespace that is not a newline
57    Whitespace,
58    Escape,
59    Any,
60}
61
62/// Flags to simple parser function
63#[derive(Clone, Copy, Debug, PartialEq)]
64enum ParseOpt {
65    /// Ignore escapes before closing pattern, trim content
66    TrimNoEsc,
67    None,
68}
69
70/// Parse a buffer
71pub(crate) fn entrypoint(txt: &str) -> MdStream<'_> {
72    let ctx = Context { top_block: true, prev: Prev::Newline };
73    normalize(parse_recursive(txt.trim().as_bytes(), ctx), &mut Vec::new())
74}
75
76/// Parse a buffer with specified context
77fn parse_recursive<'a>(buf: &'a [u8], ctx: Context) -> MdStream<'a> {
78    use ParseOpt as Po;
79    use Prev::{Escape, Newline, Whitespace};
80
81    let mut stream: Vec<MdTree<'a>> = Vec::new();
82    let Context { top_block: top_blk, mut prev } = ctx;
83
84    // wip_buf is our entire unprocessed (unpushed) buffer, loop_buf is our to
85    // check buffer that shrinks with each loop
86    let mut wip_buf = buf;
87    let mut loop_buf = wip_buf;
88
89    while !loop_buf.is_empty() {
90        let next_prev = match loop_buf[0] {
91            b'\n' => Newline,
92            b'\\' => Escape,
93            x if x.is_ascii_whitespace() => Whitespace,
94            _ => Prev::Any,
95        };
96
97        let res: ParseResult<'_> = match (top_blk, prev) {
98            _ if loop_buf.starts_with(CMT_S) => {
99                parse_simple_pat(loop_buf, CMT_S, CMT_E, Po::TrimNoEsc, MdTree::Comment)
100            }
101            (true, Newline) if loop_buf.starts_with(CBK) => Some(parse_codeblock(loop_buf)),
102            _ if loop_buf.starts_with(CIL) => parse_codeinline(loop_buf),
103            (true, Newline | Whitespace) if loop_buf.starts_with(HDG) => parse_heading(loop_buf),
104            (true, Newline) if loop_buf.starts_with(BRK) => {
105                Some((MdTree::HorizontalRule, parse_to_newline(loop_buf).1))
106            }
107            (_, Newline) if unordered_list_start(loop_buf) => Some(parse_unordered_li(loop_buf)),
108            (_, Newline | Whitespace) if loop_buf.starts_with(STG_U) => {
109                parse_simple_pat(loop_buf, STG_U, STG_U, Po::None, MdTree::Strong)
110            }
111            _ if loop_buf.starts_with(STG_A) => {
112                parse_simple_pat(loop_buf, STG_A, STG_A, Po::None, MdTree::Strong)
113            }
114            (_, Newline | Whitespace) if loop_buf.starts_with(EMP_U) => {
115                parse_simple_pat(loop_buf, EMP_U, EMP_U, Po::None, MdTree::Emphasis)
116            }
117            _ if loop_buf.starts_with(EMP_A) => {
118                parse_simple_pat(loop_buf, EMP_A, EMP_A, Po::None, MdTree::Emphasis)
119            }
120            _ if loop_buf.starts_with(STK) => {
121                parse_simple_pat(loop_buf, STK, STK, Po::None, MdTree::Strikethrough)
122            }
123            (_, Newline | Whitespace) if loop_buf.starts_with(ANC_S) => {
124                let tt_fn = |link| MdTree::Link { disp: link, link };
125                let ret = parse_simple_pat(loop_buf, ANC_S, ANC_E, Po::None, tt_fn);
126                match ret {
127                    Some((MdTree::Link { disp, .. }, _))
128                        if disp.chars().all(|ch| LNK_CHARS.contains(ch)) =>
129                    {
130                        ret
131                    }
132                    _ => None,
133                }
134            }
135            (_, Newline) if ord_list_start(loop_buf).is_some() => Some(parse_ordered_li(loop_buf)),
136            _ if loop_buf.starts_with(LNK_S) => {
137                parse_any_link(loop_buf, top_blk && prev == Prev::Newline)
138            }
139            (_, Escape | _) => None,
140        };
141
142        if let Some((tree, rest)) = res {
143            // We found something: push our WIP and then push the found tree
144            let prev_buf = &wip_buf[..(wip_buf.len() - loop_buf.len())];
145            if !prev_buf.is_empty() {
146                let prev_str = str::from_utf8(prev_buf).unwrap();
147                stream.push(MdTree::PlainText(prev_str));
148            }
149            stream.push(tree);
150
151            wip_buf = rest;
152            loop_buf = rest;
153        } else {
154            // Just move on to the next character
155            loop_buf = &loop_buf[1..];
156            // If we are at the end and haven't found anything, just push plain text
157            if loop_buf.is_empty() && !wip_buf.is_empty() {
158                let final_str = str::from_utf8(wip_buf).unwrap();
159                stream.push(MdTree::PlainText(final_str));
160            }
161        };
162
163        prev = next_prev;
164    }
165
166    MdStream(stream)
167}
168
169/// The simplest kind of patterns: data within start and end patterns
170fn parse_simple_pat<'a, F>(
171    buf: &'a [u8],
172    start_pat: &[u8],
173    end_pat: &[u8],
174    opts: ParseOpt,
175    create_tt: F,
176) -> ParseResult<'a>
177where
178    F: FnOnce(&'a str) -> MdTree<'a>,
179{
180    let ignore_esc = matches!(opts, ParseOpt::TrimNoEsc);
181    let trim = matches!(opts, ParseOpt::TrimNoEsc);
182    let (txt, rest) = parse_with_end_pat(&buf[start_pat.len()..], end_pat, ignore_esc)?;
183    let mut txt = str::from_utf8(txt).unwrap();
184    if trim {
185        txt = txt.trim();
186    }
187    Some((create_tt(txt), rest))
188}
189
190/// Parse backtick-wrapped inline code. Accounts for >1 backtick sets
191fn parse_codeinline(buf: &[u8]) -> ParseResult<'_> {
192    let seps = buf.iter().take_while(|ch| **ch == b'`').count();
193    let (txt, rest) = parse_with_end_pat(&buf[seps..], &buf[..seps], true)?;
194    Some((MdTree::CodeInline(str::from_utf8(txt).unwrap()), rest))
195}
196
197/// Parse a codeblock. Accounts for >3 backticks and language specification
198fn parse_codeblock(buf: &[u8]) -> Parsed<'_> {
199    // account for ````code```` style
200    let seps = buf.iter().take_while(|ch| **ch == b'`').count();
201    let end_sep = &buf[..seps];
202    let mut working = &buf[seps..];
203
204    // Handle "````rust" style language specifications
205    let next_ws_idx = working.iter().take_while(|ch| !ch.is_ascii_whitespace()).count();
206
207    let lang = if next_ws_idx > 0 {
208        // Munch the lang
209        let tmp = str::from_utf8(&working[..next_ws_idx]).unwrap();
210        working = &working[next_ws_idx..];
211        Some(tmp)
212    } else {
213        None
214    };
215
216    let mut end_pat = vec![b'\n'];
217    end_pat.extend(end_sep);
218
219    // Find first end pattern with nothing else on its line
220    let mut found = None;
221    for idx in (0..working.len()).filter(|idx| working[*idx..].starts_with(&end_pat)) {
222        let (eol_txt, rest) = parse_to_newline(&working[(idx + end_pat.len())..]);
223        if !eol_txt.iter().any(u8::is_ascii_whitespace) {
224            found = Some((&working[..idx], rest));
225            break;
226        }
227    }
228
229    let (txt, rest) = found.unwrap_or((working, &[]));
230    let txt = str::from_utf8(txt).unwrap().trim_matches('\n');
231
232    (MdTree::CodeBlock { txt, lang }, rest)
233}
234
235fn parse_heading(buf: &[u8]) -> ParseResult<'_> {
236    let level = buf.iter().take_while(|ch| **ch == b'#').count();
237    let buf = &buf[level..];
238
239    if level > 6 || (buf.len() > 1 && !buf[0].is_ascii_whitespace()) {
240        // Enforce max 6 levels and whitespace following the `##` pattern
241        return None;
242    }
243
244    let (txt, rest) = parse_to_newline(&buf[1..]);
245    let ctx = Context { .. };
246    let stream = parse_recursive(txt, ctx);
247
248    Some((MdTree::Heading(level.try_into().unwrap(), stream), rest))
249}
250
251/// Bulleted list
252fn parse_unordered_li(buf: &[u8]) -> Parsed<'_> {
253    let (txt, rest) = get_indented_section(&buf[2..]);
254    let ctx = Context { .. };
255    let stream = parse_recursive(trim_ascii_start(txt), ctx);
256    (MdTree::UnorderedListItem(stream), rest)
257}
258
259/// Numbered list
260fn parse_ordered_li(buf: &[u8]) -> Parsed<'_> {
261    let (num, pos) = ord_list_start(buf).unwrap(); // success tested in caller
262    let (txt, rest) = get_indented_section(&buf[pos..]);
263    let ctx = Context { .. };
264    let stream = parse_recursive(trim_ascii_start(txt), ctx);
265    (MdTree::OrderedListItem(num, stream), rest)
266}
267
268fn get_indented_section(buf: &[u8]) -> (&[u8], &[u8]) {
269    let mut lines = buf.split(|&byte| byte == b'\n');
270    let mut end = lines.next().map_or(0, |line| line.len());
271    for line in lines {
272        if let Some(first) = line.first() {
273            if unordered_list_start(line) || !first.is_ascii_whitespace() {
274                break;
275            }
276        }
277        end += line.len() + 1;
278    }
279
280    (&buf[..end], &buf[end..])
281}
282
283fn unordered_list_start(mut buf: &[u8]) -> bool {
284    while let [b' ', rest @ ..] = buf {
285        buf = rest;
286    }
287    matches!(buf, [b'*' | b'-', b' ', ..])
288}
289
290/// Verify a valid ordered list start (e.g. `1.`) and parse it. Returns the
291/// parsed number and offset of character after the dot.
292fn ord_list_start(buf: &[u8]) -> Option<(u16, usize)> {
293    let pos = buf.iter().take(10).position(|ch| *ch == b'.')?;
294    let n = str::from_utf8(&buf[..pos]).ok()?;
295    if !buf.get(pos + 1)?.is_ascii_whitespace() {
296        return None;
297    }
298    n.parse::<u16>().ok().map(|v| (v, pos + 2))
299}
300
301/// Parse links. `can_be_def` indicates that a link definition is possible (top
302/// level, located at the start of a line)
303fn parse_any_link(buf: &[u8], can_be_def: bool) -> ParseResult<'_> {
304    let (bracketed, rest) = parse_with_end_pat(&buf[1..], LNK_E, true)?;
305    if rest.is_empty() {
306        return None;
307    }
308
309    let disp = str::from_utf8(bracketed).unwrap();
310    match (can_be_def, rest[0]) {
311        (true, b':') => {
312            let (link, tmp) = parse_to_newline(&rest[1..]);
313            let link = str::from_utf8(link).unwrap().trim();
314            Some((MdTree::LinkDef { id: disp, link }, tmp))
315        }
316        (_, b'(') => parse_simple_pat(rest, b"(", b")", ParseOpt::TrimNoEsc, |link| MdTree::Link {
317            disp,
318            link,
319        }),
320        (_, b'[') => parse_simple_pat(rest, b"[", b"]", ParseOpt::TrimNoEsc, |id| {
321            MdTree::RefLink { disp, id: Some(id) }
322        }),
323        _ => Some((MdTree::RefLink { disp, id: None }, rest)),
324    }
325}
326
327/// Find and consume an end pattern, return `(match, residual)`
328fn parse_with_end_pat<'a>(
329    buf: &'a [u8],
330    end_sep: &[u8],
331    ignore_esc: bool,
332) -> Option<(&'a [u8], &'a [u8])> {
333    // Find positions that start with the end separator
334    for idx in (0..buf.len()).filter(|idx| buf[*idx..].starts_with(end_sep)) {
335        if !ignore_esc && idx > 0 && buf[idx - 1] == b'\\' {
336            continue;
337        }
338        return Some((&buf[..idx], &buf[idx + end_sep.len()..]));
339    }
340    None
341}
342
343/// Return `(match, residual)` to end of line. The EOL is returned with the
344/// residual.
345fn parse_to_newline(buf: &[u8]) -> (&[u8], &[u8]) {
346    buf.iter().position(|ch| *ch == b'\n').map_or((buf, &[]), |pos| buf.split_at(pos))
347}
348
349/// Take a parsed stream and fix the little things
350fn normalize<'a>(MdStream(stream): MdStream<'a>, linkdefs: &mut Vec<MdTree<'a>>) -> MdStream<'a> {
351    let mut new_stream = Vec::with_capacity(stream.len());
352    let new_defs = stream.iter().filter(|tt| matches!(tt, MdTree::LinkDef { .. }));
353    linkdefs.extend(new_defs.cloned());
354
355    // Run plaintest expansions on types that need it, call this function on nested types
356    for item in stream {
357        match item {
358            MdTree::PlainText(txt) => expand_plaintext(txt, &mut new_stream, MdTree::PlainText),
359            MdTree::Strong(txt) => expand_plaintext(txt, &mut new_stream, MdTree::Strong),
360            MdTree::Emphasis(txt) => expand_plaintext(txt, &mut new_stream, MdTree::Emphasis),
361            MdTree::Strikethrough(txt) => {
362                expand_plaintext(txt, &mut new_stream, MdTree::Strikethrough);
363            }
364            MdTree::RefLink { disp, id } => new_stream.push(match_reflink(linkdefs, disp, id)),
365            MdTree::OrderedListItem(n, st) => {
366                new_stream.push(MdTree::OrderedListItem(n, normalize(st, linkdefs)));
367            }
368            MdTree::UnorderedListItem(st) => {
369                new_stream.push(MdTree::UnorderedListItem(normalize(st, linkdefs)));
370            }
371            MdTree::Heading(n, st) => new_stream.push(MdTree::Heading(n, normalize(st, linkdefs))),
372            _ => new_stream.push(item),
373        }
374    }
375
376    // Remove non printing types, duplicate paragraph breaks, and breaks at start/end
377    new_stream.retain(|x| !matches!(x, MdTree::Comment(_) | MdTree::LinkDef { .. }));
378    new_stream.dedup_by(|r, l| matches!((r, l), (MdTree::ParagraphBreak, MdTree::ParagraphBreak)));
379
380    if new_stream.first().is_some_and(is_break_ty) {
381        new_stream.remove(0);
382    }
383    if new_stream.last().is_some_and(is_break_ty) {
384        new_stream.pop();
385    }
386
387    // Remove paragraph breaks that shouldn't be there. w[1] is what will be
388    // removed in these cases. Note that these are the items to keep, not delete
389    // (for `retain`)
390    let to_keep: Vec<bool> = new_stream
391        .windows(3)
392        .map(|w| {
393            !((matches!(&w[1], MdTree::ParagraphBreak)
394                && matches!(should_break(&w[0], &w[2]), BreakRule::Always(1) | BreakRule::Never))
395                || (matches!(&w[1], MdTree::PlainText(txt) if txt.trim().is_empty())
396                    && matches!(
397                        should_break(&w[0], &w[2]),
398                        BreakRule::Always(_) | BreakRule::Never
399                    )))
400        })
401        .collect();
402    let mut iter = iter::once(true).chain(to_keep).chain(iter::once(true));
403    new_stream.retain(|_| iter.next().unwrap());
404
405    // Insert line or paragraph breaks where there should be some
406    let mut insertions = 0;
407    let to_insert: Vec<(usize, MdTree<'_>)> = new_stream
408        .windows(2)
409        .enumerate()
410        .filter_map(|(idx, w)| match should_break(&w[0], &w[1]) {
411            BreakRule::Always(1) => Some((idx, MdTree::LineBreak)),
412            BreakRule::Always(2) => Some((idx, MdTree::ParagraphBreak)),
413            _ => None,
414        })
415        .map(|(idx, tt)| {
416            insertions += 1;
417            (idx + insertions, tt)
418        })
419        .collect();
420    to_insert.into_iter().for_each(|(idx, tt)| new_stream.insert(idx, tt));
421
422    MdStream(new_stream)
423}
424
425/// Whether two types should or shouldn't have a paragraph break between them
426#[derive(Clone, Copy, Debug, PartialEq)]
427enum BreakRule {
428    Always(u8),
429    Never,
430    Optional,
431}
432
433/// Blocks that automatically handle their own text wrapping
434fn should_break(left: &MdTree<'_>, right: &MdTree<'_>) -> BreakRule {
435    use MdTree::*;
436
437    match (left, right) {
438        // Separate these types with a single line
439        (HorizontalRule, _)
440        | (_, HorizontalRule)
441        | (OrderedListItem(_, _), OrderedListItem(_, _))
442        | (UnorderedListItem(_), UnorderedListItem(_)) => BreakRule::Always(1),
443        // Condensed types shouldn't have an extra break on either side
444        (Comment(_) | ParagraphBreak | Heading(_, _), _) | (_, Comment(_) | ParagraphBreak) => {
445            BreakRule::Never
446        }
447        // Block types should always be separated by full breaks
448        (CodeBlock { .. } | OrderedListItem(_, _) | UnorderedListItem(_), _)
449        | (_, CodeBlock { .. } | Heading(_, _) | OrderedListItem(_, _) | UnorderedListItem(_)) => {
450            BreakRule::Always(2)
451        }
452        // Text types may or may not be separated by a break
453        (
454            CodeInline(_)
455            | Strong(_)
456            | Emphasis(_)
457            | Strikethrough(_)
458            | PlainText(_)
459            | Link { .. }
460            | RefLink { .. }
461            | LinkDef { .. },
462            CodeInline(_)
463            | Strong(_)
464            | Emphasis(_)
465            | Strikethrough(_)
466            | PlainText(_)
467            | Link { .. }
468            | RefLink { .. }
469            | LinkDef { .. },
470        ) => BreakRule::Optional,
471        (LineBreak, _) | (_, LineBreak) => {
472            unreachable!("should have been removed during deduplication")
473        }
474    }
475}
476
477/// Types that indicate some form of break
478fn is_break_ty(val: &MdTree<'_>) -> bool {
479    matches!(val, MdTree::ParagraphBreak | MdTree::LineBreak)
480        // >1 break between paragraphs acts as a break
481        || matches!(val, MdTree::PlainText(txt) if txt.trim().is_empty())
482}
483
484/// Perform transformations to text. This splits paragraphs, replaces patterns,
485/// and corrects newlines.
486///
487/// To avoid allocating strings (and using a different heavier tt type), our
488/// replace method means split into three and append each. For this reason, any
489/// viewer should treat consecutive `PlainText` types as belonging to the same
490/// paragraph.
491fn expand_plaintext<'a>(
492    txt: &'a str,
493    stream: &mut Vec<MdTree<'a>>,
494    mut f: fn(&'a str) -> MdTree<'a>,
495) {
496    if txt.is_empty() {
497        return;
498    } else if txt == "\n" {
499        if let Some(tt) = stream.last() {
500            let tmp = MdTree::PlainText(" ");
501            if should_break(tt, &tmp) == BreakRule::Optional {
502                stream.push(tmp);
503            }
504        }
505        return;
506    }
507    let mut queue1 = Vec::new();
508    let mut queue2 = Vec::new();
509    let stream_start_len = stream.len();
510    for paragraph in txt.split("\n\n") {
511        if paragraph.is_empty() {
512            stream.push(MdTree::ParagraphBreak);
513            continue;
514        }
515        let paragraph = trim_extra_ws(paragraph);
516
517        queue1.clear();
518        queue1.push(paragraph);
519
520        for (from, to) in REPLACEMENTS {
521            queue2.clear();
522            for item in &queue1 {
523                for s in item.split(from) {
524                    queue2.extend(&[s, to]);
525                }
526                if queue2.len() > 1 {
527                    let _ = queue2.pop(); // remove last unnecessary intersperse
528                }
529            }
530            mem::swap(&mut queue1, &mut queue2);
531        }
532
533        // Make sure we don't double whitespace
534        queue1.retain(|s| !s.is_empty());
535        for idx in 0..queue1.len() {
536            queue1[idx] = trim_extra_ws(queue1[idx]);
537            if idx < queue1.len() - 1
538                && queue1[idx].ends_with(char::is_whitespace)
539                && queue1[idx + 1].starts_with(char::is_whitespace)
540            {
541                queue1[idx] = queue1[idx].trim_end();
542            }
543        }
544        stream.extend(queue1.iter().copied().filter(|txt| !txt.is_empty()).map(&mut f));
545        stream.push(MdTree::ParagraphBreak);
546    }
547
548    if stream.len() - stream_start_len > 1 {
549        let _ = stream.pop(); // remove last unnecessary intersperse
550    }
551}
552
553/// Turn reflinks (links with reference IDs) into normal standalone links using
554/// listed link definitions
555fn match_reflink<'a>(linkdefs: &[MdTree<'a>], disp: &'a str, match_id: Option<&str>) -> MdTree<'a> {
556    let to_match = match_id.unwrap_or(disp); // Match with the display name if there isn't an id
557    for def in linkdefs {
558        if let MdTree::LinkDef { id, link } = def {
559            if *id == to_match {
560                return MdTree::Link { disp, link };
561            }
562        }
563    }
564    MdTree::Link { disp, link: "" } // link not found
565}
566
567/// If there is more than one whitespace char at start or end, trim the extras
568fn trim_extra_ws(mut txt: &str) -> &str {
569    let start_ws =
570        txt.bytes().position(|ch| !ch.is_ascii_whitespace()).unwrap_or(txt.len()).saturating_sub(1);
571    txt = &txt[start_ws..];
572    let end_ws = txt
573        .bytes()
574        .rev()
575        .position(|ch| !ch.is_ascii_whitespace())
576        .unwrap_or(txt.len())
577        .saturating_sub(1);
578    &txt[..txt.len() - end_ws]
579}
580
581/// If there is more than one whitespace char at start, trim the extras
582fn trim_ascii_start(buf: &[u8]) -> &[u8] {
583    let count = buf.iter().take_while(|ch| ch.is_ascii_whitespace()).count();
584    &buf[count..]
585}
586
587#[cfg(test)]
588#[path = "tests/parse.rs"]
589mod tests;