From 1af8f4e213207d010fdceb43a4732bb76bb2ef66 Mon Sep 17 00:00:00 2001
From: Natty
Date: Sun, 1 Oct 2023 23:04:32 +0200
Subject: [PATCH] Basic inline tag parsing

---
 Cargo.lock                     |  19 +-
 Cargo.toml                     |   1 +
 magnetar_mmm_parser/Cargo.toml |   4 +-
 magnetar_mmm_parser/src/lib.rs | 405 +++++++++++++++++++++++++++++++++
 4 files changed, 426 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 83cf326..5b8dcd4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -411,6 +411,12 @@ dependencies = [
  "syn 1.0.109",
 ]
 
+[[package]]
+name = "bytecount"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
+
 [[package]]
 name = "byteorder"
 version = "1.4.3"
@@ -1608,7 +1614,7 @@ name = "mmm_parser"
 version = "0.2.1-alpha"
 dependencies = [
  "nom",
- "thiserror",
+ "nom_locate",
 ]
 
 [[package]]
@@ -1621,6 +1627,17 @@ dependencies = [
  "minimal-lexical",
 ]
 
+[[package]]
+name = "nom_locate"
+version = "4.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e3c83c053b0713da60c5b8de47fe8e494fe3ece5267b2f23090a07a053ba8f3"
+dependencies = [
+ "bytecount",
+ "memchr",
+ "nom",
+]
+
 [[package]]
 name = "nu-ansi-term"
 version = "0.46.0"
diff --git a/Cargo.toml b/Cargo.toml
index 76333e2..a7a960f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -38,6 +38,7 @@ js-sys = "0.3"
 log = "0.4"
 miette = "5.9"
 nom = "7"
+nom_locate = "4"
 percent-encoding = "2.2"
 redis = "0.23"
 reqwest = "0.11"
diff --git a/magnetar_mmm_parser/Cargo.toml b/magnetar_mmm_parser/Cargo.toml
index 3620931..8a07618 100644
--- a/magnetar_mmm_parser/Cargo.toml
+++ b/magnetar_mmm_parser/Cargo.toml
@@ -5,5 +5,5 @@ edition.workspace = true
 license = "MIT OR Apache-2.0"
 
 [dependencies]
-thiserror = { workspace = true }
-nom = { workspace = true }
\ No newline at end of file
+nom = { workspace = true }
+nom_locate = { workspace = true }
\ No newline at end of file
diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs
index e69de29..9ed647e 100644
--- a/magnetar_mmm_parser/src/lib.rs
+++ b/magnetar_mmm_parser/src/lib.rs
@@ -0,0 +1,405 @@
+use nom::branch::alt;
+use nom::bytes::complete::tag;
+use nom::character::complete;
+use nom::character::complete::{anychar, line_ending, not_line_ending, tab};
+use nom::combinator::{fail, not, opt};
+use nom::error::ErrorKind;
+use nom::multi::{many1, separated_list1};
+use nom::sequence::tuple;
+use nom::{IResult, Offset, Slice};
+use nom_locate::LocatedSpan;
+use std::borrow::Cow;
+
+enum Token<'a> {
+    PlainText(Cow<'a, str>),
+    Sequence(Vec<Token<'a>>),
+    Quote(Box<Token<'a>>),
+    Small(Box<Token<'a>>),
+    Big(Box<Token<'a>>),
+    BoldItalic(Box<Token<'a>>),
+    Bold(Box<Token<'a>>),
+    Italic(Box<Token<'a>>),
+    Center(Box<Token<'a>>),
+    Strikethrough(Box<Token<'a>>),
+    PlainTag(Cow<'a, str>),
+    InlineCode(Cow<'a, str>),
+    InlineMath(Cow<'a, str>),
+}
+
+impl Token<'_> {
+    // Deep-copies the token tree into an owned ('static) form.
+    fn owned(&self) -> Token<'static> {
+        match self {
+            Token::PlainText(text) => Token::PlainText(Cow::Owned(text.clone().into_owned())),
+            Token::Sequence(tokens) => Token::Sequence(tokens.iter().map(Token::owned).collect()),
+            Token::Quote(inner) => Token::Quote(Box::new(inner.owned())),
+            Token::Small(inner) => Token::Small(Box::new(inner.owned())),
+            Token::Big(inner) => Token::Big(Box::new(inner.owned())),
+            Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.owned())),
+            Token::Bold(inner) => Token::Bold(Box::new(inner.owned())),
+            Token::Italic(inner) => Token::Italic(Box::new(inner.owned())),
+            Token::Center(inner) => Token::Center(Box::new(inner.owned())),
+            Token::Strikethrough(inner) => Token::Strikethrough(Box::new(inner.owned())),
+            Token::PlainTag(tag) => Token::PlainTag(Cow::Owned(tag.clone().into_owned())),
+            Token::InlineCode(code) => Token::InlineCode(Cow::Owned(code.clone().into_owned())),
+            Token::InlineMath(math) => Token::InlineMath(Cow::Owned(math.clone().into_owned())),
+        }
+    }
+}
+
+type Span<'a> = LocatedSpan<&'a str>;
+
+trait SliceOffset {
+    fn up_to(&self, other: &Self) -> Self;
+
+    fn fragment_between<'a>(&self, other: &Self) -> &'a str
+    where
+        Self: 'a;
+}
+
+impl SliceOffset for Span<'_> {
+    fn up_to(&self, other: &Self) -> Self {
+        self.slice(..self.offset(other))
+    }
+
+    fn fragment_between<'a>(&self, other: &Self) -> &'a str
+    where
+        Self: 'a,
+    {
+        self.up_to(other).into_fragment()
+    }
+}
+
+const fn boxing_sequence<'a>(
+    func: impl Fn(Box<Token<'a>>) -> Token<'a>,
+) -> impl Fn(Vec<Token<'a>>) -> Token<'a> {
+    move |tokens| func(Box::new(Token::Sequence(tokens)))
+}
+
+const fn collect_char_sequence<'a>(
+    func: impl Fn(Cow<'a, str>) -> Token<'a>,
+) -> impl Fn(Vec<char>) -> Token<'a> {
+    move |chars| func(Cow::Owned(chars.into_iter().collect()))
+}
+
+// Re-parses a list of sliced-out segments as one contiguous string and maps the
+// resulting position (or error position) back onto the parent span.
+fn spliced<'a>(
+    segments: &[Span<'a>],
+    func: impl Fn(Span) -> IResult<Span, Token>,
+    parent: Span<'a>,
+) -> IResult<Span<'a>, Token<'static>, nom::error::Error<Span<'a>>> {
+    let combined = segments
+        .iter()
+        .copied()
+        .map(Span::into_fragment)
+        .collect::<String>();
+    let cum_offset_combined = segments
+        .iter()
+        .scan(0, |acc, &x| {
+            *acc += x.len();
+            Some(*acc)
+        })
+        .collect::<Vec<_>>();
+    let current_seg = |input: Span| {
+        cum_offset_combined
+            .iter()
+            .enumerate()
+            .filter(|(_, &o)| o >= input.location_offset())
+            .map(|(i, o)| (segments[i], o))
+            .last()
+    };
+
+    type NE<E> = nom::Err<E>;
+    type NomError<'x> = nom::error::Error<Span<'x>>;
+
+    let quote_span = Span::new(&combined);
+    let (input, inner) = match func(quote_span) {
+        Ok((input, token)) => (input, token.owned()),
+        Err(e) => {
+            return match e {
+                NE::Error(e) => {
+                    let offset_new = e.input.location_offset();
+                    if let Some((seg_parent, offset_seg_new)) = current_seg(e.input) {
+                        let offset = offset_new - offset_seg_new;
+                        let offset_orig = offset + seg_parent.location_offset();
+                        Err(NE::Error(NomError::new(
+                            Span::new(&parent.into_fragment()[offset_orig..]),
+                            e.code,
+                        )))
+                    } else {
+                        // ???
+                        Err(NE::Failure(NomError::new(parent, ErrorKind::Fail)))
+                    }
+                }
+                NE::Failure(e) => Err(NE::Error(NomError::new(parent, e.code))),
+                NE::Incomplete(i) => Err(NE::Incomplete(i)),
+            };
+        }
+    };
+
+    let out = if let Some((seg_parent, offset_seg_new)) = current_seg(input) {
+        let offset = input.location_offset() - offset_seg_new;
+        let offset_orig = offset + seg_parent.location_offset();
+        parent.slice(offset_orig..)
+    } else {
+        parent
+    };
+
+    Ok((out, Token::Quote(Box::new(inner.owned()))))
+}
+
+fn space(input: Span) -> IResult<Span, Token> {
+    let start = input;
+    let (input, _) = alt((complete::char('\u{0020}'), complete::char('\u{3000}'), tab))(input)?;
+    Ok((
+        input,
+        Token::PlainText(start.fragment_between(&input).into()),
+    ))
+}
+
+struct Context;
+
+impl Context {
+    // Adapts an `&self` method into a closure usable directly as a nom parser.
+    const fn partial<'a>(
+        &self,
+        func: impl Fn(&Self, Span<'a>) -> IResult<Span<'a>, Token<'a>> + 'static,
+    ) -> impl Fn(Span<'a>) -> IResult<Span<'a>, Token<'a>> + '_ {
+        move |input| func(self, input)
+    }
+
+    fn root<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, token) = alt((self.partial(Self::tag_quote),))(input)?;
+        Ok((input, token))
+    }
+
+    fn inline<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, token) = alt((self.partial(Self::tag_small), self.partial(Self::text)))(input)?;
+        Ok((input, token))
+    }
+
+    fn tag_quote<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?;
+
+        if let (None, None) = leading_spaces {
+            if input.get_column() != 0 {
+                return fail(input);
+            }
+        }
+
+        let quote_line = |input| tuple((tag(">"), opt(space), not_line_ending))(input);
+
+        let orig_input = input;
+        let (input, lines) = separated_list1(line_ending, quote_line)(input)?;
+
+        let quote_lines = lines
+            .into_iter()
+            .map(|(_, _, text)| text)
+            .collect::<Vec<_>>();
+
+        if quote_lines.len() == 1
+            && quote_lines
+                .iter()
+                .map(Span::fragment)
+                .copied()
+                .any(&str::is_empty)
+        {
+            return fail(input);
+        }
+
+        let (_, inner) = spliced(&quote_lines, space, orig_input)?;
+
+        let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?;
+
+        Ok((input, Token::Quote(Box::new(inner))))
+    }
+
+    // Generic helper for start/end-delimited inline tags; if the closing tag never
+    // appears, the opening tag is handed back as plain text.
+    const fn tag_delimited<'a, 'b: 'a, T>(
+        &'a self,
+        start: &'b str,
+        end: &'b str,
+        escape: bool,
+        matcher_inner: impl Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a,
+        mapper: impl Fn(Vec<T>) -> Token<'b> + 'a,
+    ) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token<'b>> + '_ {
+        move |input| {
+            let opening_tag = &tag(start);
+            let closing_tag = &tag(end);
+
+            if escape {
+                if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), opening_tag))(input) {
+                    return Ok((input_escaped, Token::PlainText(Cow::Borrowed(&mark))));
+                }
+            }
+
+            let begin = input;
+            let (post_open, _) = opening_tag(input)?;
+
+            let res = tuple((
+                many1(tuple((not(closing_tag), &matcher_inner))),
+                closing_tag,
+            ))(post_open);
+
+            if let Err(nom::Err::Error(nom::error::Error { .. })) = res {
+                return Ok((
+                    post_open,
+                    Token::PlainText(begin.fragment_between(&post_open).into()),
+                ));
+            }
+
+            let (input, (inner, _)) = res?;
+
+            let inner = inner.into_iter().map(|(_, t)| t).collect::<Vec<_>>();
+
+            Ok((input, mapper(inner)))
+        }
+    }
+
+    fn tag_small<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        self.tag_delimited(
+            "<small>",
+            "</small>",
+            false,
+            self.partial(Self::inline),
+            boxing_sequence(Token::Small),
+        )(input)
+    }
+
+    // TODO: CommonMark flanking rules
+    fn tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        self.tag_delimited(
+            "***",
+            "***",
+            true,
+            self.partial(Self::inline),
+            boxing_sequence(Token::BoldItalic),
+        )(input)
+    }
+
+    // TODO: CommonMark flanking rules
+    fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        self.tag_delimited(
+            "___",
+            "___",
+            true,
+            self.partial(Self::inline),
+            boxing_sequence(Token::BoldItalic),
+        )(input)
+    }
+
+    fn tag_bold<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        self.tag_delimited(
+            "<b>",
+            "</b>",
+            false,
+            self.partial(Self::inline),
+            boxing_sequence(Token::Bold),
+        )(input)
+    }
+
+    // TODO: CommonMark flanking rules
+    fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        self.tag_delimited(
+            "**",
+            "**",
+            true,
+            self.partial(Self::inline),
+            boxing_sequence(Token::Bold),
+        )(input)
+    }
+
+    // TODO: CommonMark flanking rules
+    fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        self.tag_delimited(
+            "__",
+            "__",
+            true,
+            self.partial(Self::inline),
+            boxing_sequence(Token::Bold),
+        )(input)
+    }
+
+    fn tag_italic<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        self.tag_delimited(
+            "<i>",
+            "</i>",
+            false,
+            self.partial(Self::inline),
+            boxing_sequence(Token::Italic),
+        )(input)
+    }
+
+    // TODO: CommonMark flanking rules
+    fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        self.tag_delimited(
+            "*",
+            "*",
+            true,
+            self.partial(Self::inline),
+            boxing_sequence(Token::Italic),
+        )(input)
+    }
+
+    // TODO: CommonMark flanking rules
+    fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        self.tag_delimited(
+            "_",
+            "_",
+            true,
+            self.partial(Self::inline),
+            boxing_sequence(Token::Italic),
+        )(input)
+    }
+
+    fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        self.tag_delimited(
+            "<s>",
+            "</s>",
+            false,
+            self.partial(Self::inline),
+            boxing_sequence(Token::Strikethrough),
+        )(input)
+    }
+
+    // TODO: CommonMark flanking rules
+    fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        self.tag_delimited(
+            "~~",
+            "~~",
+            true,
+            move |input| {
+                tuple((not(line_ending), self.partial(Self::inline)))(input).map(|(i, t)| (i, t.1))
+            },
+            boxing_sequence(Token::Strikethrough),
+        )(input)
+    }
+
+    fn tag_inline_code<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        self.tag_delimited(
+            "`",
+            "`",
+            true,
+            move |input| {
+                tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar))(input)
+                    .map(|(i, (_skip, c))| (i, c))
+            },
+            collect_char_sequence(Token::InlineCode),
+        )(input)
+    }
+
+    fn tag_inline_math<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        self.tag_delimited(
+            "\\(",
+            "\\)",
+            false,
+            move |input| tuple((not(line_ending), anychar))(input).map(|(i, (_skip, c))| (i, c)),
+            collect_char_sequence(Token::InlineMath),
+        )(input)
+    }
+
+    fn text<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let before = input;
+        let (input, _) = anychar(input)?;
+        Ok((
+            input,
+            Token::PlainText(before.fragment_between(&input).into()),
+        ))
+    }
+}
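
A rough usage sketch for review purposes, not part of the patch: the parser is driven by constructing a `Context` and handing its `root` method a `Span` over the source text. This assumes the items above stay reachable from calling code in the same crate (they are currently crate-private); the function name and input string are purely illustrative, and at this stage `root` only wires up `tag_quote`, so the other `tag_*` combinators are not yet reachable from it.

    // Illustrative only: wrap the source text in a nom_locate span and run the
    // root parser over it, then report what happened.
    fn parse_example(source: &str) {
        let ctx = Context;
        let input = Span::new(source);

        match ctx.root(input) {
            Ok((rest, _token)) => {
                // `_token` is the parsed token tree; `rest` is whatever the
                // parser did not consume.
                println!("parsed, {} bytes left over", rest.fragment().len());
            }
            Err(err) => println!("no parse: {err:?}"),
        }
    }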