1use std::{iter, mem, str};
2
3use crate::markdown::{MdStream, MdTree};
4
5const ANC_E: &[u8] = b">";
8const ANC_S: &[u8] = b"<";
9const BRK: &[u8] = b"---";
10const CBK: &[u8] = b"```";
11const CIL: &[u8] = b"`";
12const CMT_E: &[u8] = b"-->";
13const CMT_S: &[u8] = b"<!--";
14const EMP_U: &[u8] = b"_";
15const EMP_A: &[u8] = b"*";
16const HDG: &[u8] = b"#";
17const LNK_CHARS: &str = "$-_.+!*'()/&?=:%";
18const LNK_E: &[u8] = b"]";
19const LNK_S: &[u8] = b"[";
20const STG_U: &[u8] = b"__";
21const STG_A: &[u8] = b"**";
22const STK: &[u8] = b"~~";
23
24const REPLACEMENTS: &[(&str, &str)] = &[
26 ("(c)", "©"),
27 ("(C)", "©"),
28 ("(r)", "®"),
29 ("(R)", "®"),
30 ("(tm)", "â„¢"),
31 ("(TM)", "â„¢"),
32 (":crab:", "🦀"),
33 ("\n", " "),
34];
35
36type Parsed<'a> = (MdTree<'a>, &'a [u8]);
38type ParseResult<'a> = Option<Parsed<'a>>;
40
41#[derive(Clone, Copy, Debug, PartialEq)]
43struct Context {
46 top_block: bool = false,
48 prev: Prev = Prev::Whitespace,
50}
51
52#[derive(Clone, Copy, Debug, PartialEq)]
54enum Prev {
55 Newline,
56 Whitespace,
58 Escape,
59 Any,
60}
61
62#[derive(Clone, Copy, Debug, PartialEq)]
64enum ParseOpt {
65 TrimNoEsc,
67 None,
68}
69
70pub(crate) fn entrypoint(txt: &str) -> MdStream<'_> {
72 let ctx = Context { top_block: true, prev: Prev::Newline };
73 normalize(parse_recursive(txt.trim().as_bytes(), ctx), &mut Vec::new())
74}
75
76fn parse_recursive<'a>(buf: &'a [u8], ctx: Context) -> MdStream<'a> {
78 use ParseOpt as Po;
79 use Prev::{Escape, Newline, Whitespace};
80
81 let mut stream: Vec<MdTree<'a>> = Vec::new();
82 let Context { top_block: top_blk, mut prev } = ctx;
83
84 let mut wip_buf = buf;
87 let mut loop_buf = wip_buf;
88
89 while !loop_buf.is_empty() {
90 let next_prev = match loop_buf[0] {
91 b'\n' => Newline,
92 b'\\' => Escape,
93 x if x.is_ascii_whitespace() => Whitespace,
94 _ => Prev::Any,
95 };
96
97 let res: ParseResult<'_> = match (top_blk, prev) {
98 _ if loop_buf.starts_with(CMT_S) => {
99 parse_simple_pat(loop_buf, CMT_S, CMT_E, Po::TrimNoEsc, MdTree::Comment)
100 }
101 (true, Newline) if loop_buf.starts_with(CBK) => Some(parse_codeblock(loop_buf)),
102 _ if loop_buf.starts_with(CIL) => parse_codeinline(loop_buf),
103 (true, Newline | Whitespace) if loop_buf.starts_with(HDG) => parse_heading(loop_buf),
104 (true, Newline) if loop_buf.starts_with(BRK) => {
105 Some((MdTree::HorizontalRule, parse_to_newline(loop_buf).1))
106 }
107 (_, Newline) if unordered_list_start(loop_buf) => Some(parse_unordered_li(loop_buf)),
108 (_, Newline | Whitespace) if loop_buf.starts_with(STG_U) => {
109 parse_simple_pat(loop_buf, STG_U, STG_U, Po::None, MdTree::Strong)
110 }
111 _ if loop_buf.starts_with(STG_A) => {
112 parse_simple_pat(loop_buf, STG_A, STG_A, Po::None, MdTree::Strong)
113 }
114 (_, Newline | Whitespace) if loop_buf.starts_with(EMP_U) => {
115 parse_simple_pat(loop_buf, EMP_U, EMP_U, Po::None, MdTree::Emphasis)
116 }
117 _ if loop_buf.starts_with(EMP_A) => {
118 parse_simple_pat(loop_buf, EMP_A, EMP_A, Po::None, MdTree::Emphasis)
119 }
120 _ if loop_buf.starts_with(STK) => {
121 parse_simple_pat(loop_buf, STK, STK, Po::None, MdTree::Strikethrough)
122 }
123 (_, Newline | Whitespace) if loop_buf.starts_with(ANC_S) => {
124 let tt_fn = |link| MdTree::Link { disp: link, link };
125 let ret = parse_simple_pat(loop_buf, ANC_S, ANC_E, Po::None, tt_fn);
126 match ret {
127 Some((MdTree::Link { disp, .. }, _))
128 if disp.chars().all(|ch| LNK_CHARS.contains(ch)) =>
129 {
130 ret
131 }
132 _ => None,
133 }
134 }
135 (_, Newline) if ord_list_start(loop_buf).is_some() => Some(parse_ordered_li(loop_buf)),
136 _ if loop_buf.starts_with(LNK_S) => {
137 parse_any_link(loop_buf, top_blk && prev == Prev::Newline)
138 }
139 (_, Escape | _) => None,
140 };
141
142 if let Some((tree, rest)) = res {
143 let prev_buf = &wip_buf[..(wip_buf.len() - loop_buf.len())];
145 if !prev_buf.is_empty() {
146 let prev_str = str::from_utf8(prev_buf).unwrap();
147 stream.push(MdTree::PlainText(prev_str));
148 }
149 stream.push(tree);
150
151 wip_buf = rest;
152 loop_buf = rest;
153 } else {
154 loop_buf = &loop_buf[1..];
156 if loop_buf.is_empty() && !wip_buf.is_empty() {
158 let final_str = str::from_utf8(wip_buf).unwrap();
159 stream.push(MdTree::PlainText(final_str));
160 }
161 };
162
163 prev = next_prev;
164 }
165
166 MdStream(stream)
167}
168
169fn parse_simple_pat<'a, F>(
171 buf: &'a [u8],
172 start_pat: &[u8],
173 end_pat: &[u8],
174 opts: ParseOpt,
175 create_tt: F,
176) -> ParseResult<'a>
177where
178 F: FnOnce(&'a str) -> MdTree<'a>,
179{
180 let ignore_esc = matches!(opts, ParseOpt::TrimNoEsc);
181 let trim = matches!(opts, ParseOpt::TrimNoEsc);
182 let (txt, rest) = parse_with_end_pat(&buf[start_pat.len()..], end_pat, ignore_esc)?;
183 let mut txt = str::from_utf8(txt).unwrap();
184 if trim {
185 txt = txt.trim();
186 }
187 Some((create_tt(txt), rest))
188}
189
190fn parse_codeinline(buf: &[u8]) -> ParseResult<'_> {
192 let seps = buf.iter().take_while(|ch| **ch == b'`').count();
193 let (txt, rest) = parse_with_end_pat(&buf[seps..], &buf[..seps], true)?;
194 Some((MdTree::CodeInline(str::from_utf8(txt).unwrap()), rest))
195}
196
197fn parse_codeblock(buf: &[u8]) -> Parsed<'_> {
199 let seps = buf.iter().take_while(|ch| **ch == b'`').count();
201 let end_sep = &buf[..seps];
202 let mut working = &buf[seps..];
203
204 let next_ws_idx = working.iter().take_while(|ch| !ch.is_ascii_whitespace()).count();
206
207 let lang = if next_ws_idx > 0 {
208 let tmp = str::from_utf8(&working[..next_ws_idx]).unwrap();
210 working = &working[next_ws_idx..];
211 Some(tmp)
212 } else {
213 None
214 };
215
216 let mut end_pat = vec![b'\n'];
217 end_pat.extend(end_sep);
218
219 let mut found = None;
221 for idx in (0..working.len()).filter(|idx| working[*idx..].starts_with(&end_pat)) {
222 let (eol_txt, rest) = parse_to_newline(&working[(idx + end_pat.len())..]);
223 if !eol_txt.iter().any(u8::is_ascii_whitespace) {
224 found = Some((&working[..idx], rest));
225 break;
226 }
227 }
228
229 let (txt, rest) = found.unwrap_or((working, &[]));
230 let txt = str::from_utf8(txt).unwrap().trim_matches('\n');
231
232 (MdTree::CodeBlock { txt, lang }, rest)
233}
234
235fn parse_heading(buf: &[u8]) -> ParseResult<'_> {
236 let level = buf.iter().take_while(|ch| **ch == b'#').count();
237 let buf = &buf[level..];
238
239 if level > 6 || (buf.len() > 1 && !buf[0].is_ascii_whitespace()) {
240 return None;
242 }
243
244 let (txt, rest) = parse_to_newline(&buf[1..]);
245 let ctx = Context { .. };
246 let stream = parse_recursive(txt, ctx);
247
248 Some((MdTree::Heading(level.try_into().unwrap(), stream), rest))
249}
250
251fn parse_unordered_li(buf: &[u8]) -> Parsed<'_> {
253 let (txt, rest) = get_indented_section(&buf[2..]);
254 let ctx = Context { .. };
255 let stream = parse_recursive(trim_ascii_start(txt), ctx);
256 (MdTree::UnorderedListItem(stream), rest)
257}
258
259fn parse_ordered_li(buf: &[u8]) -> Parsed<'_> {
261 let (num, pos) = ord_list_start(buf).unwrap(); let (txt, rest) = get_indented_section(&buf[pos..]);
263 let ctx = Context { .. };
264 let stream = parse_recursive(trim_ascii_start(txt), ctx);
265 (MdTree::OrderedListItem(num, stream), rest)
266}
267
268fn get_indented_section(buf: &[u8]) -> (&[u8], &[u8]) {
269 let mut lines = buf.split(|&byte| byte == b'\n');
270 let mut end = lines.next().map_or(0, |line| line.len());
271 for line in lines {
272 if let Some(first) = line.first() {
273 if unordered_list_start(line) || !first.is_ascii_whitespace() {
274 break;
275 }
276 }
277 end += line.len() + 1;
278 }
279
280 (&buf[..end], &buf[end..])
281}
282
283fn unordered_list_start(mut buf: &[u8]) -> bool {
284 while let [b' ', rest @ ..] = buf {
285 buf = rest;
286 }
287 matches!(buf, [b'*' | b'-', b' ', ..])
288}
289
290fn ord_list_start(buf: &[u8]) -> Option<(u16, usize)> {
293 let pos = buf.iter().take(10).position(|ch| *ch == b'.')?;
294 let n = str::from_utf8(&buf[..pos]).ok()?;
295 if !buf.get(pos + 1)?.is_ascii_whitespace() {
296 return None;
297 }
298 n.parse::<u16>().ok().map(|v| (v, pos + 2))
299}
300
301fn parse_any_link(buf: &[u8], can_be_def: bool) -> ParseResult<'_> {
304 let (bracketed, rest) = parse_with_end_pat(&buf[1..], LNK_E, true)?;
305 if rest.is_empty() {
306 return None;
307 }
308
309 let disp = str::from_utf8(bracketed).unwrap();
310 match (can_be_def, rest[0]) {
311 (true, b':') => {
312 let (link, tmp) = parse_to_newline(&rest[1..]);
313 let link = str::from_utf8(link).unwrap().trim();
314 Some((MdTree::LinkDef { id: disp, link }, tmp))
315 }
316 (_, b'(') => parse_simple_pat(rest, b"(", b")", ParseOpt::TrimNoEsc, |link| MdTree::Link {
317 disp,
318 link,
319 }),
320 (_, b'[') => parse_simple_pat(rest, b"[", b"]", ParseOpt::TrimNoEsc, |id| {
321 MdTree::RefLink { disp, id: Some(id) }
322 }),
323 _ => Some((MdTree::RefLink { disp, id: None }, rest)),
324 }
325}
326
327fn parse_with_end_pat<'a>(
329 buf: &'a [u8],
330 end_sep: &[u8],
331 ignore_esc: bool,
332) -> Option<(&'a [u8], &'a [u8])> {
333 for idx in (0..buf.len()).filter(|idx| buf[*idx..].starts_with(end_sep)) {
335 if !ignore_esc && idx > 0 && buf[idx - 1] == b'\\' {
336 continue;
337 }
338 return Some((&buf[..idx], &buf[idx + end_sep.len()..]));
339 }
340 None
341}
342
343fn parse_to_newline(buf: &[u8]) -> (&[u8], &[u8]) {
346 buf.iter().position(|ch| *ch == b'\n').map_or((buf, &[]), |pos| buf.split_at(pos))
347}
348
349fn normalize<'a>(MdStream(stream): MdStream<'a>, linkdefs: &mut Vec<MdTree<'a>>) -> MdStream<'a> {
351 let mut new_stream = Vec::with_capacity(stream.len());
352 let new_defs = stream.iter().filter(|tt| matches!(tt, MdTree::LinkDef { .. }));
353 linkdefs.extend(new_defs.cloned());
354
355 for item in stream {
357 match item {
358 MdTree::PlainText(txt) => expand_plaintext(txt, &mut new_stream, MdTree::PlainText),
359 MdTree::Strong(txt) => expand_plaintext(txt, &mut new_stream, MdTree::Strong),
360 MdTree::Emphasis(txt) => expand_plaintext(txt, &mut new_stream, MdTree::Emphasis),
361 MdTree::Strikethrough(txt) => {
362 expand_plaintext(txt, &mut new_stream, MdTree::Strikethrough);
363 }
364 MdTree::RefLink { disp, id } => new_stream.push(match_reflink(linkdefs, disp, id)),
365 MdTree::OrderedListItem(n, st) => {
366 new_stream.push(MdTree::OrderedListItem(n, normalize(st, linkdefs)));
367 }
368 MdTree::UnorderedListItem(st) => {
369 new_stream.push(MdTree::UnorderedListItem(normalize(st, linkdefs)));
370 }
371 MdTree::Heading(n, st) => new_stream.push(MdTree::Heading(n, normalize(st, linkdefs))),
372 _ => new_stream.push(item),
373 }
374 }
375
376 new_stream.retain(|x| !matches!(x, MdTree::Comment(_) | MdTree::LinkDef { .. }));
378 new_stream.dedup_by(|r, l| matches!((r, l), (MdTree::ParagraphBreak, MdTree::ParagraphBreak)));
379
380 if new_stream.first().is_some_and(is_break_ty) {
381 new_stream.remove(0);
382 }
383 if new_stream.last().is_some_and(is_break_ty) {
384 new_stream.pop();
385 }
386
387 let to_keep: Vec<bool> = new_stream
391 .windows(3)
392 .map(|w| {
393 !((matches!(&w[1], MdTree::ParagraphBreak)
394 && matches!(should_break(&w[0], &w[2]), BreakRule::Always(1) | BreakRule::Never))
395 || (matches!(&w[1], MdTree::PlainText(txt) if txt.trim().is_empty())
396 && matches!(
397 should_break(&w[0], &w[2]),
398 BreakRule::Always(_) | BreakRule::Never
399 )))
400 })
401 .collect();
402 let mut iter = iter::once(true).chain(to_keep).chain(iter::once(true));
403 new_stream.retain(|_| iter.next().unwrap());
404
405 let mut insertions = 0;
407 let to_insert: Vec<(usize, MdTree<'_>)> = new_stream
408 .windows(2)
409 .enumerate()
410 .filter_map(|(idx, w)| match should_break(&w[0], &w[1]) {
411 BreakRule::Always(1) => Some((idx, MdTree::LineBreak)),
412 BreakRule::Always(2) => Some((idx, MdTree::ParagraphBreak)),
413 _ => None,
414 })
415 .map(|(idx, tt)| {
416 insertions += 1;
417 (idx + insertions, tt)
418 })
419 .collect();
420 to_insert.into_iter().for_each(|(idx, tt)| new_stream.insert(idx, tt));
421
422 MdStream(new_stream)
423}
424
425#[derive(Clone, Copy, Debug, PartialEq)]
427enum BreakRule {
428 Always(u8),
429 Never,
430 Optional,
431}
432
433fn should_break(left: &MdTree<'_>, right: &MdTree<'_>) -> BreakRule {
435 use MdTree::*;
436
437 match (left, right) {
438 (HorizontalRule, _)
440 | (_, HorizontalRule)
441 | (OrderedListItem(_, _), OrderedListItem(_, _))
442 | (UnorderedListItem(_), UnorderedListItem(_)) => BreakRule::Always(1),
443 (Comment(_) | ParagraphBreak | Heading(_, _), _) | (_, Comment(_) | ParagraphBreak) => {
445 BreakRule::Never
446 }
447 (CodeBlock { .. } | OrderedListItem(_, _) | UnorderedListItem(_), _)
449 | (_, CodeBlock { .. } | Heading(_, _) | OrderedListItem(_, _) | UnorderedListItem(_)) => {
450 BreakRule::Always(2)
451 }
452 (
454 CodeInline(_)
455 | Strong(_)
456 | Emphasis(_)
457 | Strikethrough(_)
458 | PlainText(_)
459 | Link { .. }
460 | RefLink { .. }
461 | LinkDef { .. },
462 CodeInline(_)
463 | Strong(_)
464 | Emphasis(_)
465 | Strikethrough(_)
466 | PlainText(_)
467 | Link { .. }
468 | RefLink { .. }
469 | LinkDef { .. },
470 ) => BreakRule::Optional,
471 (LineBreak, _) | (_, LineBreak) => {
472 unreachable!("should have been removed during deduplication")
473 }
474 }
475}
476
477fn is_break_ty(val: &MdTree<'_>) -> bool {
479 matches!(val, MdTree::ParagraphBreak | MdTree::LineBreak)
480 || matches!(val, MdTree::PlainText(txt) if txt.trim().is_empty())
482}
483
484fn expand_plaintext<'a>(
492 txt: &'a str,
493 stream: &mut Vec<MdTree<'a>>,
494 mut f: fn(&'a str) -> MdTree<'a>,
495) {
496 if txt.is_empty() {
497 return;
498 } else if txt == "\n" {
499 if let Some(tt) = stream.last() {
500 let tmp = MdTree::PlainText(" ");
501 if should_break(tt, &tmp) == BreakRule::Optional {
502 stream.push(tmp);
503 }
504 }
505 return;
506 }
507 let mut queue1 = Vec::new();
508 let mut queue2 = Vec::new();
509 let stream_start_len = stream.len();
510 for paragraph in txt.split("\n\n") {
511 if paragraph.is_empty() {
512 stream.push(MdTree::ParagraphBreak);
513 continue;
514 }
515 let paragraph = trim_extra_ws(paragraph);
516
517 queue1.clear();
518 queue1.push(paragraph);
519
520 for (from, to) in REPLACEMENTS {
521 queue2.clear();
522 for item in &queue1 {
523 for s in item.split(from) {
524 queue2.extend(&[s, to]);
525 }
526 if queue2.len() > 1 {
527 let _ = queue2.pop(); }
529 }
530 mem::swap(&mut queue1, &mut queue2);
531 }
532
533 queue1.retain(|s| !s.is_empty());
535 for idx in 0..queue1.len() {
536 queue1[idx] = trim_extra_ws(queue1[idx]);
537 if idx < queue1.len() - 1
538 && queue1[idx].ends_with(char::is_whitespace)
539 && queue1[idx + 1].starts_with(char::is_whitespace)
540 {
541 queue1[idx] = queue1[idx].trim_end();
542 }
543 }
544 stream.extend(queue1.iter().copied().filter(|txt| !txt.is_empty()).map(&mut f));
545 stream.push(MdTree::ParagraphBreak);
546 }
547
548 if stream.len() - stream_start_len > 1 {
549 let _ = stream.pop(); }
551}
552
553fn match_reflink<'a>(linkdefs: &[MdTree<'a>], disp: &'a str, match_id: Option<&str>) -> MdTree<'a> {
556 let to_match = match_id.unwrap_or(disp); for def in linkdefs {
558 if let MdTree::LinkDef { id, link } = def {
559 if *id == to_match {
560 return MdTree::Link { disp, link };
561 }
562 }
563 }
564 MdTree::Link { disp, link: "" } }
566
567fn trim_extra_ws(mut txt: &str) -> &str {
569 let start_ws =
570 txt.bytes().position(|ch| !ch.is_ascii_whitespace()).unwrap_or(txt.len()).saturating_sub(1);
571 txt = &txt[start_ws..];
572 let end_ws = txt
573 .bytes()
574 .rev()
575 .position(|ch| !ch.is_ascii_whitespace())
576 .unwrap_or(txt.len())
577 .saturating_sub(1);
578 &txt[..txt.len() - end_ws]
579}
580
581fn trim_ascii_start(buf: &[u8]) -> &[u8] {
583 let count = buf.iter().take_while(|ch| ch.is_ascii_whitespace()).count();
584 &buf[count..]
585}
586
587#[cfg(test)]
588#[path = "tests/parse.rs"]
589mod tests;