rustc_errors/markdown/
parse.rs

1use std::{iter, mem, str};
2
3use crate::markdown::{MdStream, MdTree};
4
5/// Short aliases that we can use in match patterns. If an end pattern is not
6/// included, this type may be variable
7const ANC_E: &[u8] = b">";
8const ANC_S: &[u8] = b"<";
9const BRK: &[u8] = b"---";
10const CBK: &[u8] = b"```";
11const CIL: &[u8] = b"`";
12const CMT_E: &[u8] = b"-->";
13const CMT_S: &[u8] = b"<!--";
14const EMP_U: &[u8] = b"_";
15const EMP_A: &[u8] = b"*";
16const HDG: &[u8] = b"#";
17const LNK_CHARS: &str = "$-_.+!*'()/&?=:%";
18const LNK_E: &[u8] = b"]";
19const LNK_S: &[u8] = b"[";
20const STG_U: &[u8] = b"__";
21const STG_A: &[u8] = b"**";
22const STK: &[u8] = b"~~";
23
24/// Pattern replacements
25const REPLACEMENTS: &[(&str, &str)] = &[
26    ("(c)", "©"),
27    ("(C)", "©"),
28    ("(r)", "®"),
29    ("(R)", "®"),
30    ("(tm)", "™"),
31    ("(TM)", "™"),
32    (":crab:", "🦀"),
33    ("\n", " "),
34];
35
36/// `(extracted, remaining)`
37type Parsed<'a> = (MdTree<'a>, &'a [u8]);
38/// Output of a parse function
39type ParseResult<'a> = Option<Parsed<'a>>;
40
41/// Parsing context
42#[derive(Clone, Copy, Debug, PartialEq)]
43struct Context {
44    /// If true, we are at a the topmost level (not recursing a nested tt)
45    top_block: bool,
46    /// Previous character
47    prev: Prev,
48}
49
50/// Character class preceding this one
51#[derive(Clone, Copy, Debug, PartialEq)]
52enum Prev {
53    Newline,
54    /// Whitespace that is not a newline
55    Whitespace,
56    Escape,
57    Any,
58}
59
60impl Default for Context {
61    /// Most common setting for non top-level parsing: not top block, not at
62    /// line start (yes leading whitespace, not escaped)
63    fn default() -> Self {
64        Self { top_block: false, prev: Prev::Whitespace }
65    }
66}
67
68/// Flags to simple parser function
69#[derive(Clone, Copy, Debug, PartialEq)]
70enum ParseOpt {
71    /// Ignore escapes before closing pattern, trim content
72    TrimNoEsc,
73    None,
74}
75
76/// Parse a buffer
77pub(crate) fn entrypoint(txt: &str) -> MdStream<'_> {
78    let ctx = Context { top_block: true, prev: Prev::Newline };
79    normalize(parse_recursive(txt.trim().as_bytes(), ctx), &mut Vec::new())
80}
81
82/// Parse a buffer with specified context
83fn parse_recursive<'a>(buf: &'a [u8], ctx: Context) -> MdStream<'a> {
84    use ParseOpt as Po;
85    use Prev::{Escape, Newline, Whitespace};
86
87    let mut stream: Vec<MdTree<'a>> = Vec::new();
88    let Context { top_block: top_blk, mut prev } = ctx;
89
90    // wip_buf is our entire unprocessed (unpushed) buffer, loop_buf is our to
91    // check buffer that shrinks with each loop
92    let mut wip_buf = buf;
93    let mut loop_buf = wip_buf;
94
95    while !loop_buf.is_empty() {
96        let next_prev = match loop_buf[0] {
97            b'\n' => Newline,
98            b'\\' => Escape,
99            x if x.is_ascii_whitespace() => Whitespace,
100            _ => Prev::Any,
101        };
102
103        let res: ParseResult<'_> = match (top_blk, prev) {
104            _ if loop_buf.starts_with(CMT_S) => {
105                parse_simple_pat(loop_buf, CMT_S, CMT_E, Po::TrimNoEsc, MdTree::Comment)
106            }
107            (true, Newline) if loop_buf.starts_with(CBK) => Some(parse_codeblock(loop_buf)),
108            _ if loop_buf.starts_with(CIL) => parse_codeinline(loop_buf),
109            (true, Newline | Whitespace) if loop_buf.starts_with(HDG) => parse_heading(loop_buf),
110            (true, Newline) if loop_buf.starts_with(BRK) => {
111                Some((MdTree::HorizontalRule, parse_to_newline(loop_buf).1))
112            }
113            (_, Newline) if unordered_list_start(loop_buf) => Some(parse_unordered_li(loop_buf)),
114            (_, Newline | Whitespace) if loop_buf.starts_with(STG_U) => {
115                parse_simple_pat(loop_buf, STG_U, STG_U, Po::None, MdTree::Strong)
116            }
117            _ if loop_buf.starts_with(STG_A) => {
118                parse_simple_pat(loop_buf, STG_A, STG_A, Po::None, MdTree::Strong)
119            }
120            (_, Newline | Whitespace) if loop_buf.starts_with(EMP_U) => {
121                parse_simple_pat(loop_buf, EMP_U, EMP_U, Po::None, MdTree::Emphasis)
122            }
123            _ if loop_buf.starts_with(EMP_A) => {
124                parse_simple_pat(loop_buf, EMP_A, EMP_A, Po::None, MdTree::Emphasis)
125            }
126            _ if loop_buf.starts_with(STK) => {
127                parse_simple_pat(loop_buf, STK, STK, Po::None, MdTree::Strikethrough)
128            }
129            (_, Newline | Whitespace) if loop_buf.starts_with(ANC_S) => {
130                let tt_fn = |link| MdTree::Link { disp: link, link };
131                let ret = parse_simple_pat(loop_buf, ANC_S, ANC_E, Po::None, tt_fn);
132                match ret {
133                    Some((MdTree::Link { disp, .. }, _))
134                        if disp.chars().all(|ch| LNK_CHARS.contains(ch)) =>
135                    {
136                        ret
137                    }
138                    _ => None,
139                }
140            }
141            (_, Newline) if ord_list_start(loop_buf).is_some() => Some(parse_ordered_li(loop_buf)),
142            _ if loop_buf.starts_with(LNK_S) => {
143                parse_any_link(loop_buf, top_blk && prev == Prev::Newline)
144            }
145            (_, Escape | _) => None,
146        };
147
148        if let Some((tree, rest)) = res {
149            // We found something: push our WIP and then push the found tree
150            let prev_buf = &wip_buf[..(wip_buf.len() - loop_buf.len())];
151            if !prev_buf.is_empty() {
152                let prev_str = str::from_utf8(prev_buf).unwrap();
153                stream.push(MdTree::PlainText(prev_str));
154            }
155            stream.push(tree);
156
157            wip_buf = rest;
158            loop_buf = rest;
159        } else {
160            // Just move on to the next character
161            loop_buf = &loop_buf[1..];
162            // If we are at the end and haven't found anything, just push plain text
163            if loop_buf.is_empty() && !wip_buf.is_empty() {
164                let final_str = str::from_utf8(wip_buf).unwrap();
165                stream.push(MdTree::PlainText(final_str));
166            }
167        };
168
169        prev = next_prev;
170    }
171
172    MdStream(stream)
173}
174
175/// The simplest kind of patterns: data within start and end patterns
176fn parse_simple_pat<'a, F>(
177    buf: &'a [u8],
178    start_pat: &[u8],
179    end_pat: &[u8],
180    opts: ParseOpt,
181    create_tt: F,
182) -> ParseResult<'a>
183where
184    F: FnOnce(&'a str) -> MdTree<'a>,
185{
186    let ignore_esc = matches!(opts, ParseOpt::TrimNoEsc);
187    let trim = matches!(opts, ParseOpt::TrimNoEsc);
188    let (txt, rest) = parse_with_end_pat(&buf[start_pat.len()..], end_pat, ignore_esc)?;
189    let mut txt = str::from_utf8(txt).unwrap();
190    if trim {
191        txt = txt.trim();
192    }
193    Some((create_tt(txt), rest))
194}
195
196/// Parse backtick-wrapped inline code. Accounts for >1 backtick sets
197fn parse_codeinline(buf: &[u8]) -> ParseResult<'_> {
198    let seps = buf.iter().take_while(|ch| **ch == b'`').count();
199    let (txt, rest) = parse_with_end_pat(&buf[seps..], &buf[..seps], true)?;
200    Some((MdTree::CodeInline(str::from_utf8(txt).unwrap()), rest))
201}
202
203/// Parse a codeblock. Accounts for >3 backticks and language specification
204fn parse_codeblock(buf: &[u8]) -> Parsed<'_> {
205    // account for ````code```` style
206    let seps = buf.iter().take_while(|ch| **ch == b'`').count();
207    let end_sep = &buf[..seps];
208    let mut working = &buf[seps..];
209
210    // Handle "````rust" style language specifications
211    let next_ws_idx = working.iter().take_while(|ch| !ch.is_ascii_whitespace()).count();
212
213    let lang = if next_ws_idx > 0 {
214        // Munch the lang
215        let tmp = str::from_utf8(&working[..next_ws_idx]).unwrap();
216        working = &working[next_ws_idx..];
217        Some(tmp)
218    } else {
219        None
220    };
221
222    let mut end_pat = vec![b'\n'];
223    end_pat.extend(end_sep);
224
225    // Find first end pattern with nothing else on its line
226    let mut found = None;
227    for idx in (0..working.len()).filter(|idx| working[*idx..].starts_with(&end_pat)) {
228        let (eol_txt, rest) = parse_to_newline(&working[(idx + end_pat.len())..]);
229        if !eol_txt.iter().any(u8::is_ascii_whitespace) {
230            found = Some((&working[..idx], rest));
231            break;
232        }
233    }
234
235    let (txt, rest) = found.unwrap_or((working, &[]));
236    let txt = str::from_utf8(txt).unwrap().trim_matches('\n');
237
238    (MdTree::CodeBlock { txt, lang }, rest)
239}
240
241fn parse_heading(buf: &[u8]) -> ParseResult<'_> {
242    let level = buf.iter().take_while(|ch| **ch == b'#').count();
243    let buf = &buf[level..];
244
245    if level > 6 || (buf.len() > 1 && !buf[0].is_ascii_whitespace()) {
246        // Enforce max 6 levels and whitespace following the `##` pattern
247        return None;
248    }
249
250    let (txt, rest) = parse_to_newline(&buf[1..]);
251    let ctx = Context { top_block: false, prev: Prev::Whitespace };
252    let stream = parse_recursive(txt, ctx);
253
254    Some((MdTree::Heading(level.try_into().unwrap(), stream), rest))
255}
256
257/// Bulleted list
258fn parse_unordered_li(buf: &[u8]) -> Parsed<'_> {
259    let (txt, rest) = get_indented_section(&buf[2..]);
260    let ctx = Context { top_block: false, prev: Prev::Whitespace };
261    let stream = parse_recursive(trim_ascii_start(txt), ctx);
262    (MdTree::UnorderedListItem(stream), rest)
263}
264
265/// Numbered list
266fn parse_ordered_li(buf: &[u8]) -> Parsed<'_> {
267    let (num, pos) = ord_list_start(buf).unwrap(); // success tested in caller
268    let (txt, rest) = get_indented_section(&buf[pos..]);
269    let ctx = Context { top_block: false, prev: Prev::Whitespace };
270    let stream = parse_recursive(trim_ascii_start(txt), ctx);
271    (MdTree::OrderedListItem(num, stream), rest)
272}
273
274fn get_indented_section(buf: &[u8]) -> (&[u8], &[u8]) {
275    let mut lines = buf.split(|&byte| byte == b'\n');
276    let mut end = lines.next().map_or(0, |line| line.len());
277    for line in lines {
278        if let Some(first) = line.first() {
279            if unordered_list_start(line) || !first.is_ascii_whitespace() {
280                break;
281            }
282        }
283        end += line.len() + 1;
284    }
285
286    (&buf[..end], &buf[end..])
287}
288
289fn unordered_list_start(mut buf: &[u8]) -> bool {
290    while let [b' ', rest @ ..] = buf {
291        buf = rest;
292    }
293    matches!(buf, [b'*' | b'-', b' ', ..])
294}
295
296/// Verify a valid ordered list start (e.g. `1.`) and parse it. Returns the
297/// parsed number and offset of character after the dot.
298fn ord_list_start(buf: &[u8]) -> Option<(u16, usize)> {
299    let pos = buf.iter().take(10).position(|ch| *ch == b'.')?;
300    let n = str::from_utf8(&buf[..pos]).ok()?;
301    if !buf.get(pos + 1)?.is_ascii_whitespace() {
302        return None;
303    }
304    n.parse::<u16>().ok().map(|v| (v, pos + 2))
305}
306
307/// Parse links. `can_be_def` indicates that a link definition is possible (top
308/// level, located at the start of a line)
309fn parse_any_link(buf: &[u8], can_be_def: bool) -> ParseResult<'_> {
310    let (bracketed, rest) = parse_with_end_pat(&buf[1..], LNK_E, true)?;
311    if rest.is_empty() {
312        return None;
313    }
314
315    let disp = str::from_utf8(bracketed).unwrap();
316    match (can_be_def, rest[0]) {
317        (true, b':') => {
318            let (link, tmp) = parse_to_newline(&rest[1..]);
319            let link = str::from_utf8(link).unwrap().trim();
320            Some((MdTree::LinkDef { id: disp, link }, tmp))
321        }
322        (_, b'(') => parse_simple_pat(rest, b"(", b")", ParseOpt::TrimNoEsc, |link| MdTree::Link {
323            disp,
324            link,
325        }),
326        (_, b'[') => parse_simple_pat(rest, b"[", b"]", ParseOpt::TrimNoEsc, |id| {
327            MdTree::RefLink { disp, id: Some(id) }
328        }),
329        _ => Some((MdTree::RefLink { disp, id: None }, rest)),
330    }
331}
332
333/// Find and consume an end pattern, return `(match, residual)`
334fn parse_with_end_pat<'a>(
335    buf: &'a [u8],
336    end_sep: &[u8],
337    ignore_esc: bool,
338) -> Option<(&'a [u8], &'a [u8])> {
339    // Find positions that start with the end separator
340    for idx in (0..buf.len()).filter(|idx| buf[*idx..].starts_with(end_sep)) {
341        if !ignore_esc && idx > 0 && buf[idx - 1] == b'\\' {
342            continue;
343        }
344        return Some((&buf[..idx], &buf[idx + end_sep.len()..]));
345    }
346    None
347}
348
349/// Return `(match, residual)` to end of line. The EOL is returned with the
350/// residual.
351fn parse_to_newline(buf: &[u8]) -> (&[u8], &[u8]) {
352    buf.iter().position(|ch| *ch == b'\n').map_or((buf, &[]), |pos| buf.split_at(pos))
353}
354
355/// Take a parsed stream and fix the little things
356fn normalize<'a>(MdStream(stream): MdStream<'a>, linkdefs: &mut Vec<MdTree<'a>>) -> MdStream<'a> {
357    let mut new_stream = Vec::with_capacity(stream.len());
358    let new_defs = stream.iter().filter(|tt| matches!(tt, MdTree::LinkDef { .. }));
359    linkdefs.extend(new_defs.cloned());
360
361    // Run plaintest expansions on types that need it, call this function on nested types
362    for item in stream {
363        match item {
364            MdTree::PlainText(txt) => expand_plaintext(txt, &mut new_stream, MdTree::PlainText),
365            MdTree::Strong(txt) => expand_plaintext(txt, &mut new_stream, MdTree::Strong),
366            MdTree::Emphasis(txt) => expand_plaintext(txt, &mut new_stream, MdTree::Emphasis),
367            MdTree::Strikethrough(txt) => {
368                expand_plaintext(txt, &mut new_stream, MdTree::Strikethrough);
369            }
370            MdTree::RefLink { disp, id } => new_stream.push(match_reflink(linkdefs, disp, id)),
371            MdTree::OrderedListItem(n, st) => {
372                new_stream.push(MdTree::OrderedListItem(n, normalize(st, linkdefs)));
373            }
374            MdTree::UnorderedListItem(st) => {
375                new_stream.push(MdTree::UnorderedListItem(normalize(st, linkdefs)));
376            }
377            MdTree::Heading(n, st) => new_stream.push(MdTree::Heading(n, normalize(st, linkdefs))),
378            _ => new_stream.push(item),
379        }
380    }
381
382    // Remove non printing types, duplicate paragraph breaks, and breaks at start/end
383    new_stream.retain(|x| !matches!(x, MdTree::Comment(_) | MdTree::LinkDef { .. }));
384    new_stream.dedup_by(|r, l| matches!((r, l), (MdTree::ParagraphBreak, MdTree::ParagraphBreak)));
385
386    if new_stream.first().is_some_and(is_break_ty) {
387        new_stream.remove(0);
388    }
389    if new_stream.last().is_some_and(is_break_ty) {
390        new_stream.pop();
391    }
392
393    // Remove paragraph breaks that shouldn't be there. w[1] is what will be
394    // removed in these cases. Note that these are the items to keep, not delete
395    // (for `retain`)
396    let to_keep: Vec<bool> = new_stream
397        .windows(3)
398        .map(|w| {
399            !((matches!(&w[1], MdTree::ParagraphBreak)
400                && matches!(should_break(&w[0], &w[2]), BreakRule::Always(1) | BreakRule::Never))
401                || (matches!(&w[1], MdTree::PlainText(txt) if txt.trim().is_empty())
402                    && matches!(
403                        should_break(&w[0], &w[2]),
404                        BreakRule::Always(_) | BreakRule::Never
405                    )))
406        })
407        .collect();
408    let mut iter = iter::once(true).chain(to_keep).chain(iter::once(true));
409    new_stream.retain(|_| iter.next().unwrap());
410
411    // Insert line or paragraph breaks where there should be some
412    let mut insertions = 0;
413    let to_insert: Vec<(usize, MdTree<'_>)> = new_stream
414        .windows(2)
415        .enumerate()
416        .filter_map(|(idx, w)| match should_break(&w[0], &w[1]) {
417            BreakRule::Always(1) => Some((idx, MdTree::LineBreak)),
418            BreakRule::Always(2) => Some((idx, MdTree::ParagraphBreak)),
419            _ => None,
420        })
421        .map(|(idx, tt)| {
422            insertions += 1;
423            (idx + insertions, tt)
424        })
425        .collect();
426    to_insert.into_iter().for_each(|(idx, tt)| new_stream.insert(idx, tt));
427
428    MdStream(new_stream)
429}
430
431/// Whether two types should or shouldn't have a paragraph break between them
432#[derive(Clone, Copy, Debug, PartialEq)]
433enum BreakRule {
434    Always(u8),
435    Never,
436    Optional,
437}
438
439/// Blocks that automatically handle their own text wrapping
440fn should_break(left: &MdTree<'_>, right: &MdTree<'_>) -> BreakRule {
441    use MdTree::*;
442
443    match (left, right) {
444        // Separate these types with a single line
445        (HorizontalRule, _)
446        | (_, HorizontalRule)
447        | (OrderedListItem(_, _), OrderedListItem(_, _))
448        | (UnorderedListItem(_), UnorderedListItem(_)) => BreakRule::Always(1),
449        // Condensed types shouldn't have an extra break on either side
450        (Comment(_) | ParagraphBreak | Heading(_, _), _) | (_, Comment(_) | ParagraphBreak) => {
451            BreakRule::Never
452        }
453        // Block types should always be separated by full breaks
454        (CodeBlock { .. } | OrderedListItem(_, _) | UnorderedListItem(_), _)
455        | (_, CodeBlock { .. } | Heading(_, _) | OrderedListItem(_, _) | UnorderedListItem(_)) => {
456            BreakRule::Always(2)
457        }
458        // Text types may or may not be separated by a break
459        (
460            CodeInline(_)
461            | Strong(_)
462            | Emphasis(_)
463            | Strikethrough(_)
464            | PlainText(_)
465            | Link { .. }
466            | RefLink { .. }
467            | LinkDef { .. },
468            CodeInline(_)
469            | Strong(_)
470            | Emphasis(_)
471            | Strikethrough(_)
472            | PlainText(_)
473            | Link { .. }
474            | RefLink { .. }
475            | LinkDef { .. },
476        ) => BreakRule::Optional,
477        (LineBreak, _) | (_, LineBreak) => {
478            unreachable!("should have been removed during deduplication")
479        }
480    }
481}
482
483/// Types that indicate some form of break
484fn is_break_ty(val: &MdTree<'_>) -> bool {
485    matches!(val, MdTree::ParagraphBreak | MdTree::LineBreak)
486        // >1 break between paragraphs acts as a break
487        || matches!(val, MdTree::PlainText(txt) if txt.trim().is_empty())
488}
489
490/// Perform transformations to text. This splits paragraphs, replaces patterns,
491/// and corrects newlines.
492///
493/// To avoid allocating strings (and using a different heavier tt type), our
494/// replace method means split into three and append each. For this reason, any
495/// viewer should treat consecutive `PlainText` types as belonging to the same
496/// paragraph.
497fn expand_plaintext<'a>(
498    txt: &'a str,
499    stream: &mut Vec<MdTree<'a>>,
500    mut f: fn(&'a str) -> MdTree<'a>,
501) {
502    if txt.is_empty() {
503        return;
504    } else if txt == "\n" {
505        if let Some(tt) = stream.last() {
506            let tmp = MdTree::PlainText(" ");
507            if should_break(tt, &tmp) == BreakRule::Optional {
508                stream.push(tmp);
509            }
510        }
511        return;
512    }
513    let mut queue1 = Vec::new();
514    let mut queue2 = Vec::new();
515    let stream_start_len = stream.len();
516    for paragraph in txt.split("\n\n") {
517        if paragraph.is_empty() {
518            stream.push(MdTree::ParagraphBreak);
519            continue;
520        }
521        let paragraph = trim_extra_ws(paragraph);
522
523        queue1.clear();
524        queue1.push(paragraph);
525
526        for (from, to) in REPLACEMENTS {
527            queue2.clear();
528            for item in &queue1 {
529                for s in item.split(from) {
530                    queue2.extend(&[s, to]);
531                }
532                if queue2.len() > 1 {
533                    let _ = queue2.pop(); // remove last unnecessary intersperse
534                }
535            }
536            mem::swap(&mut queue1, &mut queue2);
537        }
538
539        // Make sure we don't double whitespace
540        queue1.retain(|s| !s.is_empty());
541        for idx in 0..queue1.len() {
542            queue1[idx] = trim_extra_ws(queue1[idx]);
543            if idx < queue1.len() - 1
544                && queue1[idx].ends_with(char::is_whitespace)
545                && queue1[idx + 1].starts_with(char::is_whitespace)
546            {
547                queue1[idx] = queue1[idx].trim_end();
548            }
549        }
550        stream.extend(queue1.iter().copied().filter(|txt| !txt.is_empty()).map(&mut f));
551        stream.push(MdTree::ParagraphBreak);
552    }
553
554    if stream.len() - stream_start_len > 1 {
555        let _ = stream.pop(); // remove last unnecessary intersperse
556    }
557}
558
559/// Turn reflinks (links with reference IDs) into normal standalone links using
560/// listed link definitions
561fn match_reflink<'a>(linkdefs: &[MdTree<'a>], disp: &'a str, match_id: Option<&str>) -> MdTree<'a> {
562    let to_match = match_id.unwrap_or(disp); // Match with the display name if there isn't an id
563    for def in linkdefs {
564        if let MdTree::LinkDef { id, link } = def {
565            if *id == to_match {
566                return MdTree::Link { disp, link };
567            }
568        }
569    }
570    MdTree::Link { disp, link: "" } // link not found
571}
572
573/// If there is more than one whitespace char at start or end, trim the extras
574fn trim_extra_ws(mut txt: &str) -> &str {
575    let start_ws =
576        txt.bytes().position(|ch| !ch.is_ascii_whitespace()).unwrap_or(txt.len()).saturating_sub(1);
577    txt = &txt[start_ws..];
578    let end_ws = txt
579        .bytes()
580        .rev()
581        .position(|ch| !ch.is_ascii_whitespace())
582        .unwrap_or(txt.len())
583        .saturating_sub(1);
584    &txt[..txt.len() - end_ws]
585}
586
587/// If there is more than one whitespace char at start, trim the extras
588fn trim_ascii_start(buf: &[u8]) -> &[u8] {
589    let count = buf.iter().take_while(|ch| ch.is_ascii_whitespace()).count();
590    &buf[count..]
591}
592
593#[cfg(test)]
594#[path = "tests/parse.rs"]
595mod tests;