1use std::{iter, mem, str};
2
3use crate::markdown::{MdStream, MdTree};
4
5const ANC_E: &[u8] = b">";
8const ANC_S: &[u8] = b"<";
9const BRK: &[u8] = b"---";
10const CBK: &[u8] = b"```";
11const CIL: &[u8] = b"`";
12const CMT_E: &[u8] = b"-->";
13const CMT_S: &[u8] = b"<!--";
14const EMP_U: &[u8] = b"_";
15const EMP_A: &[u8] = b"*";
16const HDG: &[u8] = b"#";
17const LNK_CHARS: &str = "$-_.+!*'()/&?=:%";
18const LNK_E: &[u8] = b"]";
19const LNK_S: &[u8] = b"[";
20const STG_U: &[u8] = b"__";
21const STG_A: &[u8] = b"**";
22const STK: &[u8] = b"~~";
23
24const REPLACEMENTS: &[(&str, &str)] = &[
26 ("(c)", "©"),
27 ("(C)", "©"),
28 ("(r)", "®"),
29 ("(R)", "®"),
30 ("(tm)", "™"),
31 ("(TM)", "™"),
32 (":crab:", "🦀"),
33 ("\n", " "),
34];
35
36type Parsed<'a> = (MdTree<'a>, &'a [u8]);
38type ParseResult<'a> = Option<Parsed<'a>>;
40
41#[derive(Clone, Copy, Debug, PartialEq)]
43struct Context {
44 top_block: bool,
46 prev: Prev,
48}
49
50#[derive(Clone, Copy, Debug, PartialEq)]
52enum Prev {
53 Newline,
54 Whitespace,
56 Escape,
57 Any,
58}
59
60impl Default for Context {
61 fn default() -> Self {
64 Self { top_block: false, prev: Prev::Whitespace }
65 }
66}
67
68#[derive(Clone, Copy, Debug, PartialEq)]
70enum ParseOpt {
71 TrimNoEsc,
73 None,
74}
75
76pub(crate) fn entrypoint(txt: &str) -> MdStream<'_> {
78 let ctx = Context { top_block: true, prev: Prev::Newline };
79 normalize(parse_recursive(txt.trim().as_bytes(), ctx), &mut Vec::new())
80}
81
82fn parse_recursive<'a>(buf: &'a [u8], ctx: Context) -> MdStream<'a> {
84 use ParseOpt as Po;
85 use Prev::{Escape, Newline, Whitespace};
86
87 let mut stream: Vec<MdTree<'a>> = Vec::new();
88 let Context { top_block: top_blk, mut prev } = ctx;
89
90 let mut wip_buf = buf;
93 let mut loop_buf = wip_buf;
94
95 while !loop_buf.is_empty() {
96 let next_prev = match loop_buf[0] {
97 b'\n' => Newline,
98 b'\\' => Escape,
99 x if x.is_ascii_whitespace() => Whitespace,
100 _ => Prev::Any,
101 };
102
103 let res: ParseResult<'_> = match (top_blk, prev) {
104 _ if loop_buf.starts_with(CMT_S) => {
105 parse_simple_pat(loop_buf, CMT_S, CMT_E, Po::TrimNoEsc, MdTree::Comment)
106 }
107 (true, Newline) if loop_buf.starts_with(CBK) => Some(parse_codeblock(loop_buf)),
108 _ if loop_buf.starts_with(CIL) => parse_codeinline(loop_buf),
109 (true, Newline | Whitespace) if loop_buf.starts_with(HDG) => parse_heading(loop_buf),
110 (true, Newline) if loop_buf.starts_with(BRK) => {
111 Some((MdTree::HorizontalRule, parse_to_newline(loop_buf).1))
112 }
113 (_, Newline) if unordered_list_start(loop_buf) => Some(parse_unordered_li(loop_buf)),
114 (_, Newline | Whitespace) if loop_buf.starts_with(STG_U) => {
115 parse_simple_pat(loop_buf, STG_U, STG_U, Po::None, MdTree::Strong)
116 }
117 _ if loop_buf.starts_with(STG_A) => {
118 parse_simple_pat(loop_buf, STG_A, STG_A, Po::None, MdTree::Strong)
119 }
120 (_, Newline | Whitespace) if loop_buf.starts_with(EMP_U) => {
121 parse_simple_pat(loop_buf, EMP_U, EMP_U, Po::None, MdTree::Emphasis)
122 }
123 _ if loop_buf.starts_with(EMP_A) => {
124 parse_simple_pat(loop_buf, EMP_A, EMP_A, Po::None, MdTree::Emphasis)
125 }
126 _ if loop_buf.starts_with(STK) => {
127 parse_simple_pat(loop_buf, STK, STK, Po::None, MdTree::Strikethrough)
128 }
129 (_, Newline | Whitespace) if loop_buf.starts_with(ANC_S) => {
130 let tt_fn = |link| MdTree::Link { disp: link, link };
131 let ret = parse_simple_pat(loop_buf, ANC_S, ANC_E, Po::None, tt_fn);
132 match ret {
133 Some((MdTree::Link { disp, .. }, _))
134 if disp.chars().all(|ch| LNK_CHARS.contains(ch)) =>
135 {
136 ret
137 }
138 _ => None,
139 }
140 }
141 (_, Newline) if ord_list_start(loop_buf).is_some() => Some(parse_ordered_li(loop_buf)),
142 _ if loop_buf.starts_with(LNK_S) => {
143 parse_any_link(loop_buf, top_blk && prev == Prev::Newline)
144 }
145 (_, Escape | _) => None,
146 };
147
148 if let Some((tree, rest)) = res {
149 let prev_buf = &wip_buf[..(wip_buf.len() - loop_buf.len())];
151 if !prev_buf.is_empty() {
152 let prev_str = str::from_utf8(prev_buf).unwrap();
153 stream.push(MdTree::PlainText(prev_str));
154 }
155 stream.push(tree);
156
157 wip_buf = rest;
158 loop_buf = rest;
159 } else {
160 loop_buf = &loop_buf[1..];
162 if loop_buf.is_empty() && !wip_buf.is_empty() {
164 let final_str = str::from_utf8(wip_buf).unwrap();
165 stream.push(MdTree::PlainText(final_str));
166 }
167 };
168
169 prev = next_prev;
170 }
171
172 MdStream(stream)
173}
174
175fn parse_simple_pat<'a, F>(
177 buf: &'a [u8],
178 start_pat: &[u8],
179 end_pat: &[u8],
180 opts: ParseOpt,
181 create_tt: F,
182) -> ParseResult<'a>
183where
184 F: FnOnce(&'a str) -> MdTree<'a>,
185{
186 let ignore_esc = matches!(opts, ParseOpt::TrimNoEsc);
187 let trim = matches!(opts, ParseOpt::TrimNoEsc);
188 let (txt, rest) = parse_with_end_pat(&buf[start_pat.len()..], end_pat, ignore_esc)?;
189 let mut txt = str::from_utf8(txt).unwrap();
190 if trim {
191 txt = txt.trim();
192 }
193 Some((create_tt(txt), rest))
194}
195
196fn parse_codeinline(buf: &[u8]) -> ParseResult<'_> {
198 let seps = buf.iter().take_while(|ch| **ch == b'`').count();
199 let (txt, rest) = parse_with_end_pat(&buf[seps..], &buf[..seps], true)?;
200 Some((MdTree::CodeInline(str::from_utf8(txt).unwrap()), rest))
201}
202
203fn parse_codeblock(buf: &[u8]) -> Parsed<'_> {
205 let seps = buf.iter().take_while(|ch| **ch == b'`').count();
207 let end_sep = &buf[..seps];
208 let mut working = &buf[seps..];
209
210 let next_ws_idx = working.iter().take_while(|ch| !ch.is_ascii_whitespace()).count();
212
213 let lang = if next_ws_idx > 0 {
214 let tmp = str::from_utf8(&working[..next_ws_idx]).unwrap();
216 working = &working[next_ws_idx..];
217 Some(tmp)
218 } else {
219 None
220 };
221
222 let mut end_pat = vec![b'\n'];
223 end_pat.extend(end_sep);
224
225 let mut found = None;
227 for idx in (0..working.len()).filter(|idx| working[*idx..].starts_with(&end_pat)) {
228 let (eol_txt, rest) = parse_to_newline(&working[(idx + end_pat.len())..]);
229 if !eol_txt.iter().any(u8::is_ascii_whitespace) {
230 found = Some((&working[..idx], rest));
231 break;
232 }
233 }
234
235 let (txt, rest) = found.unwrap_or((working, &[]));
236 let txt = str::from_utf8(txt).unwrap().trim_matches('\n');
237
238 (MdTree::CodeBlock { txt, lang }, rest)
239}
240
241fn parse_heading(buf: &[u8]) -> ParseResult<'_> {
242 let level = buf.iter().take_while(|ch| **ch == b'#').count();
243 let buf = &buf[level..];
244
245 if level > 6 || (buf.len() > 1 && !buf[0].is_ascii_whitespace()) {
246 return None;
248 }
249
250 let (txt, rest) = parse_to_newline(&buf[1..]);
251 let ctx = Context { top_block: false, prev: Prev::Whitespace };
252 let stream = parse_recursive(txt, ctx);
253
254 Some((MdTree::Heading(level.try_into().unwrap(), stream), rest))
255}
256
257fn parse_unordered_li(buf: &[u8]) -> Parsed<'_> {
259 let (txt, rest) = get_indented_section(&buf[2..]);
260 let ctx = Context { top_block: false, prev: Prev::Whitespace };
261 let stream = parse_recursive(trim_ascii_start(txt), ctx);
262 (MdTree::UnorderedListItem(stream), rest)
263}
264
265fn parse_ordered_li(buf: &[u8]) -> Parsed<'_> {
267 let (num, pos) = ord_list_start(buf).unwrap(); let (txt, rest) = get_indented_section(&buf[pos..]);
269 let ctx = Context { top_block: false, prev: Prev::Whitespace };
270 let stream = parse_recursive(trim_ascii_start(txt), ctx);
271 (MdTree::OrderedListItem(num, stream), rest)
272}
273
274fn get_indented_section(buf: &[u8]) -> (&[u8], &[u8]) {
275 let mut lines = buf.split(|&byte| byte == b'\n');
276 let mut end = lines.next().map_or(0, |line| line.len());
277 for line in lines {
278 if let Some(first) = line.first() {
279 if unordered_list_start(line) || !first.is_ascii_whitespace() {
280 break;
281 }
282 }
283 end += line.len() + 1;
284 }
285
286 (&buf[..end], &buf[end..])
287}
288
289fn unordered_list_start(mut buf: &[u8]) -> bool {
290 while let [b' ', rest @ ..] = buf {
291 buf = rest;
292 }
293 matches!(buf, [b'*' | b'-', b' ', ..])
294}
295
296fn ord_list_start(buf: &[u8]) -> Option<(u16, usize)> {
299 let pos = buf.iter().take(10).position(|ch| *ch == b'.')?;
300 let n = str::from_utf8(&buf[..pos]).ok()?;
301 if !buf.get(pos + 1)?.is_ascii_whitespace() {
302 return None;
303 }
304 n.parse::<u16>().ok().map(|v| (v, pos + 2))
305}
306
307fn parse_any_link(buf: &[u8], can_be_def: bool) -> ParseResult<'_> {
310 let (bracketed, rest) = parse_with_end_pat(&buf[1..], LNK_E, true)?;
311 if rest.is_empty() {
312 return None;
313 }
314
315 let disp = str::from_utf8(bracketed).unwrap();
316 match (can_be_def, rest[0]) {
317 (true, b':') => {
318 let (link, tmp) = parse_to_newline(&rest[1..]);
319 let link = str::from_utf8(link).unwrap().trim();
320 Some((MdTree::LinkDef { id: disp, link }, tmp))
321 }
322 (_, b'(') => parse_simple_pat(rest, b"(", b")", ParseOpt::TrimNoEsc, |link| MdTree::Link {
323 disp,
324 link,
325 }),
326 (_, b'[') => parse_simple_pat(rest, b"[", b"]", ParseOpt::TrimNoEsc, |id| {
327 MdTree::RefLink { disp, id: Some(id) }
328 }),
329 _ => Some((MdTree::RefLink { disp, id: None }, rest)),
330 }
331}
332
333fn parse_with_end_pat<'a>(
335 buf: &'a [u8],
336 end_sep: &[u8],
337 ignore_esc: bool,
338) -> Option<(&'a [u8], &'a [u8])> {
339 for idx in (0..buf.len()).filter(|idx| buf[*idx..].starts_with(end_sep)) {
341 if !ignore_esc && idx > 0 && buf[idx - 1] == b'\\' {
342 continue;
343 }
344 return Some((&buf[..idx], &buf[idx + end_sep.len()..]));
345 }
346 None
347}
348
349fn parse_to_newline(buf: &[u8]) -> (&[u8], &[u8]) {
352 buf.iter().position(|ch| *ch == b'\n').map_or((buf, &[]), |pos| buf.split_at(pos))
353}
354
355fn normalize<'a>(MdStream(stream): MdStream<'a>, linkdefs: &mut Vec<MdTree<'a>>) -> MdStream<'a> {
357 let mut new_stream = Vec::with_capacity(stream.len());
358 let new_defs = stream.iter().filter(|tt| matches!(tt, MdTree::LinkDef { .. }));
359 linkdefs.extend(new_defs.cloned());
360
361 for item in stream {
363 match item {
364 MdTree::PlainText(txt) => expand_plaintext(txt, &mut new_stream, MdTree::PlainText),
365 MdTree::Strong(txt) => expand_plaintext(txt, &mut new_stream, MdTree::Strong),
366 MdTree::Emphasis(txt) => expand_plaintext(txt, &mut new_stream, MdTree::Emphasis),
367 MdTree::Strikethrough(txt) => {
368 expand_plaintext(txt, &mut new_stream, MdTree::Strikethrough);
369 }
370 MdTree::RefLink { disp, id } => new_stream.push(match_reflink(linkdefs, disp, id)),
371 MdTree::OrderedListItem(n, st) => {
372 new_stream.push(MdTree::OrderedListItem(n, normalize(st, linkdefs)));
373 }
374 MdTree::UnorderedListItem(st) => {
375 new_stream.push(MdTree::UnorderedListItem(normalize(st, linkdefs)));
376 }
377 MdTree::Heading(n, st) => new_stream.push(MdTree::Heading(n, normalize(st, linkdefs))),
378 _ => new_stream.push(item),
379 }
380 }
381
382 new_stream.retain(|x| !matches!(x, MdTree::Comment(_) | MdTree::LinkDef { .. }));
384 new_stream.dedup_by(|r, l| matches!((r, l), (MdTree::ParagraphBreak, MdTree::ParagraphBreak)));
385
386 if new_stream.first().is_some_and(is_break_ty) {
387 new_stream.remove(0);
388 }
389 if new_stream.last().is_some_and(is_break_ty) {
390 new_stream.pop();
391 }
392
393 let to_keep: Vec<bool> = new_stream
397 .windows(3)
398 .map(|w| {
399 !((matches!(&w[1], MdTree::ParagraphBreak)
400 && matches!(should_break(&w[0], &w[2]), BreakRule::Always(1) | BreakRule::Never))
401 || (matches!(&w[1], MdTree::PlainText(txt) if txt.trim().is_empty())
402 && matches!(
403 should_break(&w[0], &w[2]),
404 BreakRule::Always(_) | BreakRule::Never
405 )))
406 })
407 .collect();
408 let mut iter = iter::once(true).chain(to_keep).chain(iter::once(true));
409 new_stream.retain(|_| iter.next().unwrap());
410
411 let mut insertions = 0;
413 let to_insert: Vec<(usize, MdTree<'_>)> = new_stream
414 .windows(2)
415 .enumerate()
416 .filter_map(|(idx, w)| match should_break(&w[0], &w[1]) {
417 BreakRule::Always(1) => Some((idx, MdTree::LineBreak)),
418 BreakRule::Always(2) => Some((idx, MdTree::ParagraphBreak)),
419 _ => None,
420 })
421 .map(|(idx, tt)| {
422 insertions += 1;
423 (idx + insertions, tt)
424 })
425 .collect();
426 to_insert.into_iter().for_each(|(idx, tt)| new_stream.insert(idx, tt));
427
428 MdStream(new_stream)
429}
430
431#[derive(Clone, Copy, Debug, PartialEq)]
433enum BreakRule {
434 Always(u8),
435 Never,
436 Optional,
437}
438
439fn should_break(left: &MdTree<'_>, right: &MdTree<'_>) -> BreakRule {
441 use MdTree::*;
442
443 match (left, right) {
444 (HorizontalRule, _)
446 | (_, HorizontalRule)
447 | (OrderedListItem(_, _), OrderedListItem(_, _))
448 | (UnorderedListItem(_), UnorderedListItem(_)) => BreakRule::Always(1),
449 (Comment(_) | ParagraphBreak | Heading(_, _), _) | (_, Comment(_) | ParagraphBreak) => {
451 BreakRule::Never
452 }
453 (CodeBlock { .. } | OrderedListItem(_, _) | UnorderedListItem(_), _)
455 | (_, CodeBlock { .. } | Heading(_, _) | OrderedListItem(_, _) | UnorderedListItem(_)) => {
456 BreakRule::Always(2)
457 }
458 (
460 CodeInline(_)
461 | Strong(_)
462 | Emphasis(_)
463 | Strikethrough(_)
464 | PlainText(_)
465 | Link { .. }
466 | RefLink { .. }
467 | LinkDef { .. },
468 CodeInline(_)
469 | Strong(_)
470 | Emphasis(_)
471 | Strikethrough(_)
472 | PlainText(_)
473 | Link { .. }
474 | RefLink { .. }
475 | LinkDef { .. },
476 ) => BreakRule::Optional,
477 (LineBreak, _) | (_, LineBreak) => {
478 unreachable!("should have been removed during deduplication")
479 }
480 }
481}
482
483fn is_break_ty(val: &MdTree<'_>) -> bool {
485 matches!(val, MdTree::ParagraphBreak | MdTree::LineBreak)
486 || matches!(val, MdTree::PlainText(txt) if txt.trim().is_empty())
488}
489
490fn expand_plaintext<'a>(
498 txt: &'a str,
499 stream: &mut Vec<MdTree<'a>>,
500 mut f: fn(&'a str) -> MdTree<'a>,
501) {
502 if txt.is_empty() {
503 return;
504 } else if txt == "\n" {
505 if let Some(tt) = stream.last() {
506 let tmp = MdTree::PlainText(" ");
507 if should_break(tt, &tmp) == BreakRule::Optional {
508 stream.push(tmp);
509 }
510 }
511 return;
512 }
513 let mut queue1 = Vec::new();
514 let mut queue2 = Vec::new();
515 let stream_start_len = stream.len();
516 for paragraph in txt.split("\n\n") {
517 if paragraph.is_empty() {
518 stream.push(MdTree::ParagraphBreak);
519 continue;
520 }
521 let paragraph = trim_extra_ws(paragraph);
522
523 queue1.clear();
524 queue1.push(paragraph);
525
526 for (from, to) in REPLACEMENTS {
527 queue2.clear();
528 for item in &queue1 {
529 for s in item.split(from) {
530 queue2.extend(&[s, to]);
531 }
532 if queue2.len() > 1 {
533 let _ = queue2.pop(); }
535 }
536 mem::swap(&mut queue1, &mut queue2);
537 }
538
539 queue1.retain(|s| !s.is_empty());
541 for idx in 0..queue1.len() {
542 queue1[idx] = trim_extra_ws(queue1[idx]);
543 if idx < queue1.len() - 1
544 && queue1[idx].ends_with(char::is_whitespace)
545 && queue1[idx + 1].starts_with(char::is_whitespace)
546 {
547 queue1[idx] = queue1[idx].trim_end();
548 }
549 }
550 stream.extend(queue1.iter().copied().filter(|txt| !txt.is_empty()).map(&mut f));
551 stream.push(MdTree::ParagraphBreak);
552 }
553
554 if stream.len() - stream_start_len > 1 {
555 let _ = stream.pop(); }
557}
558
559fn match_reflink<'a>(linkdefs: &[MdTree<'a>], disp: &'a str, match_id: Option<&str>) -> MdTree<'a> {
562 let to_match = match_id.unwrap_or(disp); for def in linkdefs {
564 if let MdTree::LinkDef { id, link } = def {
565 if *id == to_match {
566 return MdTree::Link { disp, link };
567 }
568 }
569 }
570 MdTree::Link { disp, link: "" } }
572
573fn trim_extra_ws(mut txt: &str) -> &str {
575 let start_ws =
576 txt.bytes().position(|ch| !ch.is_ascii_whitespace()).unwrap_or(txt.len()).saturating_sub(1);
577 txt = &txt[start_ws..];
578 let end_ws = txt
579 .bytes()
580 .rev()
581 .position(|ch| !ch.is_ascii_whitespace())
582 .unwrap_or(txt.len())
583 .saturating_sub(1);
584 &txt[..txt.len() - end_ws]
585}
586
587fn trim_ascii_start(buf: &[u8]) -> &[u8] {
589 let count = buf.iter().take_while(|ch| ch.is_ascii_whitespace()).count();
590 &buf[count..]
591}
592
593#[cfg(test)]
594#[path = "tests/parse.rs"]
595mod tests;