From 703e1191c2820ca39e7a35a8f320bf4924671421 Mon Sep 17 00:00:00 2001 From: Natty Date: Sat, 7 Oct 2023 01:46:20 +0200 Subject: [PATCH] Janky sequence unnesting and attempting to salvage nested parsing in incorrect formatting tags --- Cargo.lock | 1 + Cargo.toml | 1 + magnetar_mmm_parser/Cargo.toml | 1 + magnetar_mmm_parser/src/lib.rs | 245 ++++++++++++++++++++++++++------- 4 files changed, 200 insertions(+), 48 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aa58d5f..e2d79c5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1622,6 +1622,7 @@ dependencies = [ name = "mmm_parser" version = "0.2.1-alpha" dependencies = [ + "either", "emojis", "nom", "nom_locate", diff --git a/Cargo.toml b/Cargo.toml index f504d67..c326183 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ cached = "0.46" cfg-if = "1" chrono = "0.4" dotenvy = "0.15" +either = "1.9" emojis = "0.6" futures-core = "0.3" futures-util = "0.3" diff --git a/magnetar_mmm_parser/Cargo.toml b/magnetar_mmm_parser/Cargo.toml index 30c2bad..25faa6b 100644 --- a/magnetar_mmm_parser/Cargo.toml +++ b/magnetar_mmm_parser/Cargo.toml @@ -5,6 +5,7 @@ edition.workspace = true license = "MIT OR Apache-2.0" [dependencies] +either = { workspace = true } emojis = { workspace = true } nom = { workspace = true } nom_locate = { workspace = true } diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index ed90585..74d98ea 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -1,3 +1,4 @@ +use either::Either; use nom::branch::alt; use nom::bytes::complete::tag; use nom::character::complete::{ @@ -12,6 +13,7 @@ use nom::{IResult, Offset, Slice}; use nom_locate::LocatedSpan; use std::borrow::Cow; use std::collections::HashMap; +use std::convert::identity; use unicode_segmentation::UnicodeSegmentation; #[derive(Copy, Clone, Debug, Eq, PartialEq)] @@ -145,6 +147,27 @@ impl Token<'_> { } } + if let Token::Sequence(seq) = tok { + let items = 
seq.iter().map(Token::merged).flat_map(|t| match t { + Token::Sequence(seq) => Either::Left(seq.into_iter()), + other => Either::Right(std::iter::once(other)), + }); + + for item in items { + if let Some(Token::PlainText(last)) = acc.last_mut() { + if let Token::PlainText(tok_text) = item { + *last = Cow::from(last.to_string() + tok_text.as_ref()); + + continue; + } + } + + acc.push(item); + } + + return acc; + } + acc.push(tok.merged()); acc }); @@ -200,13 +223,13 @@ impl SliceOffset for Span<'_> { } } -const fn boxing_sequence<'a>( - func: impl Fn(Box>) -> Token<'a>, -) -> impl Fn(Vec>) -> Token<'a> { - move |tokens| func(Box::new(Token::Sequence(tokens))) +#[inline] +fn boxing_token<'a>(func: impl Fn(Box>) -> Token<'a>) -> impl Fn(Token<'a>) -> Token<'a> { + move |tokens| func(Box::new(tokens)) } -const fn collect_char_sequence<'a>( +#[inline] +fn collect_char_sequence<'a>( func: impl Fn(Cow<'a, str>) -> Token<'a>, ) -> impl Fn(Vec) -> Token<'a> { move |chars| func(Cow::Owned(chars.into_iter().collect())) @@ -215,14 +238,14 @@ const fn collect_char_sequence<'a>( fn spliced<'a>( segments: &[Span<'a>], func: impl Fn(Span) -> IResult, - output_mapper: impl Fn(Box>) -> Token<'static>, parent: Span<'a>, ) -> IResult, Token<'static>, nom::error::Error>> { let combined = segments .iter() .copied() .map(Span::into_fragment) - .collect::(); + .collect::>() + .join("\n"); let cum_offset_combined = segments .iter() .scan(0, |acc, &x| { @@ -234,7 +257,7 @@ fn spliced<'a>( cum_offset_combined .iter() .enumerate() - .filter(|(_, &o)| o >= input.location_offset()) + .take_while(|(_, &o)| o > input.location_offset()) .map(|(i, o)| (segments[i], o)) .last() }; @@ -275,7 +298,7 @@ fn spliced<'a>( parent }; - Ok((out, output_mapper(Box::new(inner.owned())))) + Ok((out, inner.owned())) } fn space(input: Span) -> IResult { @@ -370,6 +393,22 @@ impl Context { Ok((input, token)) } + fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let (input, 
token) = alt(( + self.partial(Self::unicode_emoji), + self.partial(Self::url_no_embed), + self.partial(Self::tag_inline_code), + self.partial(Self::tag_inline_math), + self.partial(Self::tag_func), + self.partial(Self::tag_mention), + self.partial(Self::tag_hashtag), + self.partial(Self::shortcode_emoji), + self.partial(Self::raw_url), + self.partial(Self::text), + ))(input)?; + Ok((input, token)) + } + fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> { let (input, token) = alt(( self.partial(Self::unicode_emoji), @@ -416,12 +455,7 @@ impl Context { return fail(input); } - let (_, inner) = spliced( - &quote_lines, - self.partial(Self::full), - Token::Quote, - orig_input, - )?; + let (_, inner) = spliced(&quote_lines, self.partial(Self::full), orig_input)?; let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?; @@ -450,7 +484,10 @@ impl Context { let (input, _) = not(not(line_ending))(input)?; let (input, _) = opt(line_ending)(input)?; - Ok((input, boxing_sequence(Token::Center)(center_seq))) + Ok(( + input, + boxing_token(Token::Center)(Token::Sequence(center_seq)), + )) } fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> { @@ -522,13 +559,16 @@ impl Context { )) } - const fn tag_delimited<'a, 'b: 'a, T>( + #[inline] + fn tag_delimited<'a, 'b: 'a, T>( &'a self, start: &'b str, end: &'b str, escape: bool, matcher_inner: impl Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a, - mapper: impl Fn(Vec<T>) -> Token<'b> + 'a, + matcher_inner_fallback: impl Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a, + collector: impl Fn(Vec<T>) -> Token<'b> + 'a, + mapper: impl Fn(Token<'b>) -> Token<'b> + 'a, ) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token<'b>> + '_ { move |input| { let opening_tag = &tag(start); @@ -548,18 +588,40 @@ impl Context { closing_tag, ))(post_open); - if let Err(nom::Err::Error(nom::error::Error { .. })) = res { + if let Err(nom::Err::Error(nom::error::Error { + input: input_past_err, + .. 
+ })) = res + { + let res_fallback = tuple(( + many1(tuple((not(closing_tag), &matcher_inner_fallback))), + closing_tag, + ))(post_open); + + if res_fallback.is_err() { + return Ok(( + input_past_err, + Token::PlainText(begin.fragment_between(&input_past_err).into()), + )); + } + + let (input, (inner, closing)) = res_fallback.unwrap(); + let inner = inner.into_iter().map(|(_, t)| t).collect::>(); + return Ok(( - post_open, - Token::PlainText(begin.fragment_between(&post_open).into()), + input, + Token::Sequence(vec![ + Token::PlainText(begin.fragment_between(&post_open).into()), + collector(inner), + Token::PlainText(closing.into_fragment().into()), + ]), )); } let (input, (inner, _)) = res?; - let inner = inner.into_iter().map(|(_, t)| t).collect::>(); - Ok((input, mapper(inner))) + Ok((input, mapper(collector(inner)))) } } @@ -632,8 +694,10 @@ impl Context { "", "", false, - self.partial(Self::inline), - boxing_sequence(Token::Small), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::Small), )(input) } @@ -643,8 +707,10 @@ impl Context { "***", "***", true, - self.partial(Self::inline), - boxing_sequence(Token::BoldItalic), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::BoldItalic), )(input) } @@ -654,8 +720,10 @@ impl Context { "___", "___", true, - self.partial(Self::inline), - boxing_sequence(Token::BoldItalic), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::BoldItalic), )(input) } @@ -664,8 +732,10 @@ impl Context { "", "", false, - self.partial(Self::inline), - boxing_sequence(Token::Bold), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::Bold), )(input) } @@ -675,8 +745,10 @@ impl Context { "**", "**", true, - 
self.partial(Self::inline), - boxing_sequence(Token::Bold), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::Bold), )(input) } @@ -686,8 +758,10 @@ impl Context { "__", "__", true, - self.partial(Self::inline), - boxing_sequence(Token::Bold), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::Bold), )(input) } @@ -696,8 +770,10 @@ impl Context { "", "", false, - self.partial(Self::inline), - boxing_sequence(Token::Italic), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::Italic), )(input) } @@ -707,8 +783,10 @@ impl Context { "*", "*", true, - self.partial(Self::inline), - boxing_sequence(Token::Italic), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::Italic), )(input) } @@ -718,8 +796,10 @@ impl Context { "_", "_", true, - self.partial(Self::inline), - boxing_sequence(Token::Italic), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::Italic), )(input) } @@ -728,8 +808,10 @@ impl Context { "", "", false, - self.partial(Self::inline), - boxing_sequence(Token::Strikethrough), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::Strikethrough), )(input) } @@ -740,9 +822,18 @@ impl Context { "~~", true, move |input| { - tuple((not_line_ending, self.partial(Self::inline)))(input).map(|(i, t)| (i, t.1)) + tuple((not_line_ending, self.partial(Self::inline_single)))(input) + .map(|(i, t)| (i, t.1)) }, - boxing_sequence(Token::Strikethrough), + move |input| { + tuple(( + not_line_ending, + self.partial(Self::inline_non_formatting_single), + ))(input) + .map(|(i, t)| (i, t.1)) + }, + Token::Sequence, + 
boxing_token(Token::Strikethrough), )(input) } @@ -755,7 +846,9 @@ impl Context { tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar))(input) .map(|(i, (_skip, c))| (i, c)) }, + fail, collect_char_sequence(Token::InlineCode), + identity, )(input) } @@ -764,8 +857,10 @@ impl Context { "\\(", "\\)", false, - move |input| tuple((not_line_ending, anychar))(input).map(|(i, (_skip, c))| (i, c)), - collect_char_sequence(Token::InlineMath), + move |input| tuple((not(line_ending), anychar))(input).map(|(i, (_skip, c))| (i, c)), + fail, + collect_char_sequence(Token::InlineMath), + identity, )(input) } @@ -1044,7 +1139,61 @@ mod test { ])) }, Context.full(Span::new(emoji)).unwrap().1.merged() - ) + ); + + let bold_italic = r#"***bold italic***"#; + assert_eq!( + Token::BoldItalic(Box::new(Token::PlainText("bold italic".into()))), + Context.full(Span::new(bold_italic)).unwrap().1.merged() + ); + + let bold_italic_tag = r#"bold italic"#; + assert_eq!( + Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText( + "bold italic".into() + ))))), + Context.full(Span::new(bold_italic_tag)).unwrap().1.merged() + ); + + assert_eq!( + Token::Sequence(vec![ + Token::PlainText("bold ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag1".into(), + host: None + }, + Token::PlainText(" ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag2".into(), + host: None + }, + Token::PlainText(" italic".into()) + ]), + Context + .full(Span::new(r#"bold @tag1 @tag2 italic"#)) + .unwrap() + .1 + .merged() + ); + + let quote = r#" +> test +> +> italic +> +>> Nested quote +"#; + + assert_eq!( + Token::Quote(Box::new(Token::Sequence(vec![ + Token::PlainText("test\n".into()), + Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))), + Token::Quote(Box::new(Token::PlainText("Nested quote".into()))) + ]))), + Context.full(Span::new(quote)).unwrap().1.merged() + ); } #[test]