From 154cc27c07e6c72f359ca5c045eb173062588602 Mon Sep 17 00:00:00 2001
From: Natty
Date: Sat, 7 Oct 2023 19:44:39 +0200
Subject: [PATCH] More precise emoji extraction and fixed center tag parsing

---
 Cargo.toml                     |   1 +
 magnetar_mmm_parser/src/lib.rs | 505 +++++++++++++++++++++++----------
 2 files changed, 350 insertions(+), 156 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index c326183..c5d0c4e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -43,6 +43,7 @@ nom = "7"
 nom_locate = "4"
 percent-encoding = "2.2"
 redis = "0.23"
+regex = "1.9"
 reqwest = "0.11"
 sea-orm = "0.12"
 sea-orm-migration = "0.12"
diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs
index fbcfeb6..8dccf96 100644
--- a/magnetar_mmm_parser/src/lib.rs
+++ b/magnetar_mmm_parser/src/lib.rs
@@ -13,7 +13,7 @@ use nom::{IResult, Offset, Slice};
 use nom_locate::LocatedSpan;
 use std::borrow::Cow;
 use std::collections::HashMap;
-use std::convert::identity;
+use std::convert::{identity, Infallible};
 use unicode_segmentation::UnicodeSegmentation;
 
 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
 pub enum Token<'a> {
     Sequence(Vec<Token<'a>>),
     Quote(Box<Token<'a>>),
     Small(Box<Token<'a>>),
-    Big(Box<Token<'a>>),
     BoldItalic(Box<Token<'a>>),
     Bold(Box<Token<'a>>),
     Italic(Box<Token<'a>>),
@@ -80,7 +79,6 @@ impl Token<'_> {
             Token::Sequence(tokens) => Token::Sequence(tokens.iter().map(Token::owned).collect()),
             Token::Quote(inner) => Token::Quote(Box::new(inner.owned())),
             Token::Small(inner) => Token::Small(Box::new(inner.owned())),
-            Token::Big(inner) => Token::Big(Box::new(inner.owned())),
             Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.owned())),
             Token::Bold(inner) => Token::Bold(Box::new(inner.owned())),
             Token::Italic(inner) => Token::Italic(Box::new(inner.owned())),
@@ -180,7 +178,6 @@ impl Token<'_> {
             }
             Token::Quote(inner) => Token::Quote(Box::new(inner.merged())),
             Token::Small(inner) => Token::Small(Box::new(inner.merged())),
-            Token::Big(inner) => Token::Big(Box::new(inner.merged())),
             Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.merged())),
             Token::Bold(inner) => Token::Bold(Box::new(inner.merged())),
             Token::Italic(inner) => Token::Italic(Box::new(inner.merged())),
@@ -228,11 +225,19 @@ fn boxing_token<'a>(func: impl Fn(Box<Token<'a>>) -> Token<'a>) -> impl Fn(Token
     move |tokens| func(Box::new(tokens))
 }
 
+#[inline]
+fn collect_sequence<'a, T>(
+    func: impl Fn(Vec<T>) -> Token<'a>,
+    transform: impl Fn(Token<'a>) -> Token<'a>,
+) -> impl Fn(&mut dyn Iterator<Item = T>) -> Token<'a> {
+    move |tokens| transform(func(tokens.collect()))
+}
+
 #[inline]
 fn collect_char_sequence<'a>(
     func: impl Fn(Cow<'a, str>) -> Token<'a>,
-) -> impl Fn(Vec<char>) -> Token<'a> {
-    move |chars| func(Cow::Owned(chars.into_iter().collect()))
+) -> impl Fn(&mut dyn Iterator<Item = char>) -> Token<'a> {
+    move |chars| func(Cow::Owned(chars.collect()))
 }
 
 fn spliced<'a>(
@@ -306,6 +311,42 @@ fn space(input: Span) -> IResult<Span, Token> {
     Ok((input, Token::PlainText(frag.into_fragment().into())))
 }
 
+struct Matcher<'a, 'b, T> {
+    matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a),
+    collector: &'a (dyn Fn(&mut dyn Iterator<Item = T>) -> Token<'b> + 'a),
+    _phantom_closure: std::marker::PhantomData<&'a ()>,
+    _phantom_data: std::marker::PhantomData<&'b ()>,
+    _phantom_output: std::marker::PhantomData<fn() -> T>,
+}
+
+impl<'a, 'b, T> Matcher<'a, 'b, T> {
+    fn new(
+        matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a),
+        collector: &'a (dyn Fn(&mut dyn Iterator<Item = T>) -> Token<'b> + 'a),
+    ) -> Self {
+        Self {
+            matcher_inner,
+            collector,
+            _phantom_closure: std::marker::PhantomData,
+            _phantom_data: std::marker::PhantomData,
+            _phantom_output: std::marker::PhantomData,
+        }
+    }
+}
+
+impl<'a, 'b> Matcher<'a, 'b, Infallible> {
+    // Don't break this invariant, else a monster will come at night and eat all your socks
+    fn reject() -> Self {
+        Self {
+            matcher_inner: &fail::<_, Infallible, _>,
+            collector: &|_| unreachable!(),
+            _phantom_closure: std::marker::PhantomData,
+            _phantom_data: std::marker::PhantomData,
+            _phantom_output: std::marker::PhantomData,
+        }
+    }
+}
+
 struct Context;
 
 impl Context {
@@ -477,13 +518,9 @@ impl Context {
 
         let (input, (center_seq, _)) = many_till(
             self.partial(Self::inline_single),
-            tuple((opt(line_ending), tag_end)),
+            tuple((opt(space1), opt(line_ending), tag_end)),
         )(input)?;
 
-        let (input, _) = many0(space)(input)?;
-        let (input, _) = not(not(line_ending))(input)?;
-        let (input, _) = opt(line_ending)(input)?;
-
         Ok((
             input,
             boxing_token(Token::Center)(Token::Sequence(center_seq)),
@@ -560,23 +597,21 @@ impl Context {
     }
 
     #[inline]
-    fn tag_delimited<'a, 'b: 'a, T>(
+    fn tag_delimited<'a, 'b: 'a, T, S>(
         &'a self,
-        start: &'b str,
-        end: &'b str,
+        opening_tag: impl Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
+        closing_tag: impl Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
         escape: bool,
-        matcher_inner: impl Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a,
-        matcher_inner_fallback: impl Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a,
-        collector: impl Fn(Vec<T>) -> Token<'b> + 'a,
-        mapper: impl Fn(Token<'b>) -> Token<'b> + 'a,
+        matcher: Matcher<'a, 'b, T>,
+        fallback: Matcher<'a, 'b, S>,
     ) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token<'b>> + '_ {
         move |input| {
-            let opening_tag = &tag(start);
-            let closing_tag = &tag(end);
-
             if escape {
-                if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), opening_tag))(input) {
-                    return Ok((input_escaped, Token::PlainText(Cow::Borrowed(&mark))));
+                if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), &opening_tag))(input) {
+                    return Ok((
+                        input_escaped,
+                        Token::PlainText(Cow::Borrowed(mark.fragment())),
+                    ));
                 }
             }
 
@@ -584,8 +619,8 @@ impl Context {
             let (post_open, _) = opening_tag(input)?;
 
             let res = tuple((
-                many1(tuple((not(closing_tag), &matcher_inner))),
-                closing_tag,
+                many1(tuple((not(&closing_tag), &matcher.matcher_inner))),
+                &closing_tag,
             ))(post_open);
 
             if let Err(nom::Err::Error(nom::error::Error {
                 input: input_past_err,
                 ..
             })) = res
             {
                 let res_fallback = tuple((
-                    many1(tuple((not(closing_tag), &matcher_inner_fallback))),
-                    closing_tag,
+                    many1(tuple((not(&closing_tag), &fallback.matcher_inner))),
+                    &closing_tag,
                 ))(post_open);
 
                 if res_fallback.is_err() {
                     return Ok((
                         input_past_err,
                         Token::PlainText(begin.fragment_between(&input_past_err).into()),
                     ));
                 }
 
                 let (input, (inner, closing)) = res_fallback.unwrap();
-                let inner = inner.into_iter().map(|(_, t)| t).collect::<Vec<_>>();
+                let mut inner = inner.into_iter().map(|(_, t)| t);
 
                 return Ok((
                     input,
                     Token::Sequence(vec![
                         Token::PlainText(begin.fragment_between(&post_open).into()),
-                        collector(inner),
+                        ((fallback.collector)(&mut inner)),
                         Token::PlainText(closing.into_fragment().into()),
                     ]),
                 ));
             }
 
             let (input, (inner, _)) = res?;
-            let inner = inner.into_iter().map(|(_, t)| t).collect::<Vec<_>>();
+            let mut inner = inner.into_iter().map(|(_, t)| t);
 
-            Ok((input, mapper(collector(inner))))
+            Ok((input, (matcher.collector)(&mut inner)))
         }
     }
 
@@ -691,176 +726,230 @@ impl Context {
     fn tag_small<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            "<small>",
-            "</small>",
+            tag("<small>"),
+            tag("</small>"),
             false,
-            self.partial(Self::inline_single),
-            self.partial(Self::inline_non_formatting_single),
-            Token::Sequence,
-            boxing_token(Token::Small),
+            Matcher::new(
+                &self.partial(Self::inline_single),
+                &collect_sequence(Token::Sequence, boxing_token(Token::Small)),
+            ),
+            Matcher::new(
+                &self.partial(Self::inline_non_formatting_single),
+                &collect_sequence(Token::Sequence, identity),
+            ),
         )(input)
     }
 
     // TODO: CommonMark flanking rules
     fn tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            "***",
-            "***",
+            tag("***"),
+            tag("***"),
             true,
-            self.partial(Self::inline_single),
-            self.partial(Self::inline_non_formatting_single),
-            Token::Sequence,
-            boxing_token(Token::BoldItalic),
+            Matcher::new(
+                &self.partial(Self::inline_single),
+                &collect_sequence(Token::Sequence, boxing_token(Token::BoldItalic)),
+            ),
+            Matcher::new(
+                &self.partial(Self::inline_non_formatting_single),
+                &collect_sequence(Token::Sequence, identity),
+            ),
         )(input)
     }
 
     // TODO: CommonMark flanking rules
     fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            "___",
-            "___",
+            tag("___"),
+            tag("___"),
             true,
-            self.partial(Self::inline_single),
-            self.partial(Self::inline_non_formatting_single),
-            Token::Sequence,
-            boxing_token(Token::BoldItalic),
+            Matcher::new(
+                &self.partial(Self::inline_single),
+                &collect_sequence(Token::Sequence, boxing_token(Token::BoldItalic)),
+            ),
+            Matcher::new(
+                &self.partial(Self::inline_non_formatting_single),
+                &collect_sequence(Token::Sequence, identity),
+            ),
         )(input)
     }
 
     fn tag_bold<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            "<b>",
-            "</b>",
+            tag("<b>"),
+            tag("</b>"),
             false,
-            self.partial(Self::inline_single),
-            self.partial(Self::inline_non_formatting_single),
-            Token::Sequence,
-            boxing_token(Token::Bold),
+            Matcher::new(
+                &self.partial(Self::inline_single),
+                &collect_sequence(Token::Sequence, boxing_token(Token::Bold)),
+            ),
+            Matcher::new(
+                &self.partial(Self::inline_non_formatting_single),
+                &collect_sequence(Token::Sequence, identity),
+            ),
         )(input)
     }
 
     // TODO: CommonMark flanking rules
     fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            "**",
-            "**",
+            tag("**"),
+            tag("**"),
             true,
-            self.partial(Self::inline_single),
-            self.partial(Self::inline_non_formatting_single),
-            Token::Sequence,
-            boxing_token(Token::Bold),
+            Matcher::new(
+                &self.partial(Self::inline_single),
+                &collect_sequence(Token::Sequence, boxing_token(Token::Bold)),
+            ),
+            Matcher::new(
+                &self.partial(Self::inline_non_formatting_single),
+                &collect_sequence(Token::Sequence, identity),
+            ),
         )(input)
     }
 
     // TODO: CommonMark flanking rules
     fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            "__",
-            "__",
+            tag("__"),
+            tag("__"),
             true,
-            self.partial(Self::inline_single),
-            self.partial(Self::inline_non_formatting_single),
-            Token::Sequence,
-            boxing_token(Token::Bold),
+            Matcher::new(
+                &self.partial(Self::inline_single),
+                &collect_sequence(Token::Sequence, boxing_token(Token::Bold)),
+            ),
+            Matcher::new(
+                &self.partial(Self::inline_non_formatting_single),
+                &collect_sequence(Token::Sequence, identity),
+            ),
         )(input)
     }
 
     fn tag_italic<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            "<i>",
-            "</i>",
+            tag("<i>"),
+            tag("</i>"),
             false,
-            self.partial(Self::inline_single),
-            self.partial(Self::inline_non_formatting_single),
-            Token::Sequence,
-            boxing_token(Token::Italic),
+            Matcher::new(
+                &self.partial(Self::inline_single),
+                &collect_sequence(Token::Sequence, boxing_token(Token::Italic)),
+            ),
+            Matcher::new(
+                &self.partial(Self::inline_non_formatting_single),
+                &collect_sequence(Token::Sequence, identity),
+            ),
         )(input)
     }
 
     // TODO: CommonMark flanking rules
     fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            "*",
-            "*",
+            tag("*"),
+            tag("*"),
             true,
-            self.partial(Self::inline_single),
-            self.partial(Self::inline_non_formatting_single),
-            Token::Sequence,
-            boxing_token(Token::Italic),
+            Matcher::new(
+                &self.partial(Self::inline_single),
+                &collect_sequence(Token::Sequence, boxing_token(Token::Italic)),
+            ),
+            Matcher::new(
+                &self.partial(Self::inline_non_formatting_single),
+                &collect_sequence(Token::Sequence, identity),
+            ),
         )(input)
     }
 
     // TODO: CommonMark flanking rules
     fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            "_",
-            "_",
+            tag("_"),
+            tag("_"),
             true,
-            self.partial(Self::inline_single),
-            self.partial(Self::inline_non_formatting_single),
-            Token::Sequence,
-            boxing_token(Token::Italic),
+            Matcher::new(
+                &self.partial(Self::inline_single),
+                &collect_sequence(Token::Sequence, boxing_token(Token::Italic)),
+            ),
+            Matcher::new(
+                &self.partial(Self::inline_non_formatting_single),
+                &collect_sequence(Token::Sequence, identity),
+            ),
         )(input)
     }
 
     fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            "<s>",
-            "</s>",
+            tag("<s>"),
+            tag("</s>"),
             false,
-            self.partial(Self::inline_single),
-            self.partial(Self::inline_non_formatting_single),
-            Token::Sequence,
-            boxing_token(Token::Strikethrough),
+            Matcher::new(
+                &self.partial(Self::inline_single),
+                &collect_sequence(Token::Sequence, boxing_token(Token::Strikethrough)),
+            ),
+            Matcher::new(
+                &self.partial(Self::inline_non_formatting_single),
+                &collect_sequence(Token::Sequence, identity),
+            ),
         )(input)
     }
 
     // TODO: CommonMark flanking rules
     fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            "~~",
-            "~~",
+            tag("~~"),
+            tag("~~"),
             true,
-            move |input| {
-                tuple((not_line_ending, self.partial(Self::inline_single)))(input)
-                    .map(|(i, t)| (i, t.1))
-            },
-            move |input| {
-                tuple((
-                    not_line_ending,
-                    self.partial(Self::inline_non_formatting_single),
-                ))(input)
-                .map(|(i, t)| (i, t.1))
-            },
-            Token::Sequence,
-            boxing_token(Token::Strikethrough),
+            Matcher::new(
+                &move |input| {
+                    map(
+                        tuple(((not(line_ending)), self.partial(Self::inline_single))),
+                        |(_, captured)| captured,
+                    )(input)
+                },
+                &collect_sequence(Token::Sequence, boxing_token(Token::Strikethrough)),
+            ),
+            Matcher::new(
+                &move |input| {
+                    map(
+                        tuple((
+                            (not(line_ending)),
+                            self.partial(Self::inline_non_formatting_single),
+                        )),
+                        |(_, captured)| captured,
+                    )(input)
+                },
+                &collect_sequence(Token::Sequence, identity),
+            ),
         )(input)
     }
 
     fn tag_inline_code<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            "`",
-            "",
+            tag("`"),
+            |input| alt((tag("`"), tag("Β΄")))(input),
             true,
-            move |input| {
-                tuple((not(alt((tag("`"), tag("Β΄"), line_ending))), anychar))(input)
-                    .map(|(i, (_skip, c))| (i, c))
-            },
-            fail,
-            collect_char_sequence(Token::InlineCode),
-            identity,
+            Matcher::new(
+                &move |input| {
+                    map(
+                        tuple((not(alt((tag("`"), tag("Β΄"), line_ending))), anychar)),
+                        |(_, captured)| captured,
+                    )(input)
+                },
+                &collect_char_sequence(Token::InlineCode),
+            ),
+            Matcher::reject(),
         )(input)
     }
 
     fn tag_inline_math<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            "\\(",
-            "\\)",
+            tag("\\("),
+            tag("\\)"),
             false,
-            move |input| tuple((not(line_ending), anychar))(input).map(|(i, (_skip, c))| (i, c)),
-            fail,
-            collect_char_sequence(Token::InlineMath),
-            identity,
+            Matcher::new(
+                &move |input| {
+                    map(tuple((not(line_ending), anychar)), |(_, captured)| captured)(input)
+                },
+                &collect_char_sequence(Token::InlineMath),
+            ),
+            Matcher::reject(),
         )(input)
     }
 
@@ -925,6 +1014,8 @@ impl Context {
             return fail(input);
         };
 
+        let grapheme = grapheme.trim_end_matches(|c| c == '\u{200c}' || c == '\u{200d}');
+
         let emoji = emojis::get(grapheme);
 
         if emoji.is_none() {
@@ -1059,10 +1150,13 @@ fn url_chars<'a, T: 'a>(
 mod test {
     use crate::{url_chars, Context, Span, Token};
     use nom::bytes::complete::tag;
-    use nom::multi::many1;
     use std::borrow::Cow;
     use std::collections::HashMap;
 
+    fn parse_full(string: &str) -> Token {
+        Context.full(Span::new(string)).unwrap().1.merged().owned()
+    }
+
     #[test]
     fn parse_url_chars() {
         let test1 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security))";
@@ -1111,9 +1205,92 @@ mod test {
         );
     }
 
+    #[test]
+    fn parse_formatting() {
+        assert_eq!(
+            Token::Strikethrough(Box::new(Token::PlainText("strikethrough".into()))),
+            parse_full(r#"~~strikethrough~~"#)
+        );
+
+        assert_eq!(
+            Token::Bold(Box::new(Token::PlainText("bold".into()))),
+            parse_full(r#"**bold**"#)
+        );
+
+        assert_eq!(
+            Token::Italic(Box::new(Token::PlainText("italic".into()))),
+            parse_full(r#"*italic*"#)
+        );
+
+        assert_eq!(
+            Token::Sequence(vec![
+                Token::PlainText("not code ".into()),
+                Token::InlineCode("code".into()),
+                Token::PlainText(" also not code".into())
+            ]),
+            parse_full(r#"not code `code` also not code"#)
+        );
+
+        assert_eq!(
+            Token::Sequence(vec![
+                Token::PlainText("not code ".into()),
+                Token::InlineCode("code".into()),
+                Token::PlainText(" also `not code".into())
+            ]),
+            parse_full(r#"not code `code` also `not code"#)
+        );
+
+        assert_eq!(
+            Token::Sequence(vec![
+                Token::PlainText("not code ".into()),
+                Token::InlineCode("*not bold*".into()),
+                Token::PlainText(" also not code".into())
+            ]),
+            parse_full(r#"not code `*not bold*` also not code"#)
+        );
+
+        assert_eq!(
+            Token::BoldItalic(Box::new(Token::PlainText("bold italic".into()))),
+            parse_full(r#"***bold italic***"#)
+        );
+
+        assert_eq!(
+            Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText(
+                "bold italic".into()
+            ))))),
+            parse_full(r#"<b><i>bold italic</i></b>"#)
+        );
+    }
+
     #[test]
     fn parse_complex() {
-        let emoji = r#"$[x2 $[sparkle πŸ₯Ί]πŸ’œ$[spin.y,speed=5s ❀️]🦊]"#;
+        assert_eq!(
+            Token::Center(Box::new(Token::Sequence(vec![
+                Token::PlainText("centered\n".into()),
+                Token::UnicodeEmoji("πŸ¦‹".into()),
+                Token::UnicodeEmoji("πŸ³οΈβ€βš§οΈ".into()),
+                Token::PlainText("\ntext".into())
+            ]))),
+            parse_full(
+                r#"<center>centered
+πŸ¦‹πŸ³οΈβ€βš§οΈ
+text</center>"#
+            )
+        );
+
+        assert_eq!(
+            Token::Quote(Box::new(Token::Center(Box::new(Token::Sequence(vec![
+                Token::PlainText("centered\n".into()),
+                Token::UnicodeEmoji("πŸ‘©πŸ½β€πŸ€β€πŸ‘©πŸΌ".into()),
+                Token::PlainText("\ntext".into())
+            ]))))),
+            parse_full(
+                r#"> <center>centered
+> πŸ‘©πŸ½β€πŸ€β€πŸ‘©πŸΌ
+> text</center>"#
+            )
+        );
+
         assert_eq!(
             Token::Function {
                 name: "x2".into(),
                 params: HashMap::default(),
                 inner: Box::new(Token::Sequence(vec![
                     Token::Function {
                         name: "sparkle".into(),
                         params: HashMap::default(),
                         inner: Box::new(Token::UnicodeEmoji("πŸ₯Ί".into())),
                     },
                     Token::UnicodeEmoji("πŸ’œ".into()),
                     Token::Function {
                         name: "spin".into(),
                         params: {
                             let mut params = HashMap::new();
                             params.insert("y".into(), None);
                             params.insert("speed".into(), Some("5s".into()));
                             params
                         },
                         inner: Box::new(Token::UnicodeEmoji("❀️".into())),
                     },
                     Token::UnicodeEmoji("🦊".into()),
                 ]))
             },
-            Context.full(Span::new(emoji)).unwrap().1.merged()
-        );
-
-        let bold_italic = r#"***bold italic***"#;
-        assert_eq!(
-            Token::BoldItalic(Box::new(Token::PlainText("bold italic".into()))),
-            Context.full(Span::new(bold_italic)).unwrap().1.merged()
-        );
-
-        let bold_italic_tag = r#"<b><i>bold italic</i></b>"#;
-        assert_eq!(
-            Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText(
-                "bold italic".into()
-            ))))),
-            Context.full(Span::new(bold_italic_tag)).unwrap().1.merged()
+            parse_full(r#"$[x2 $[sparkle πŸ₯Ί]πŸ’œ$[spin.y,speed=5s ❀️]🦊]"#)
         );
 
         assert_eq!(
@@ -1178,37 +1341,67 @@ mod test {
                 .merged()
         );
 
-        let quote = r#"
-> test
->
-> italic
->
->> Nested quote
-"#;
-
         assert_eq!(
             Token::Quote(Box::new(Token::Sequence(vec![
                 Token::PlainText("test\n".into()),
                 Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))),
                 Token::Quote(Box::new(Token::PlainText("Nested quote".into())))
             ]))),
-            Context.full(Span::new(quote)).unwrap().1.merged()
+            parse_full(
+                r#"
+> test
+>
+> italic
+>
+>> Nested quote
+"#
+            )
         );
     }
 
     #[test]
     fn parse_emoji() {
-        let test = "πŸ₯ΊπŸ’œβ€οΈπŸ¦Š";
-        let ctx = Context;
-        let tokens = many1(ctx.partial(Context::unicode_emoji))(Span::from(test)).unwrap();
+        assert_eq!(
+            Token::Sequence(
+                vec!["πŸ₯Ί", "πŸ’œ", "❀️", "🦊"]
+                    .into_iter()
+                    .map(<&str as Into<Cow<'_, str>>>::into)
+                    .map(Token::UnicodeEmoji)
+                    .collect::<Vec<Token>>()
+            ),
+            parse_full("πŸ₯ΊπŸ’œβ€οΈπŸ¦Š")
+        );
+
+        // Trans flag, ZWJ
+        assert_eq!(
+            Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}".into()),
+            parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}")
+        );
 
         assert_eq!(
-            vec!["πŸ₯Ί", "πŸ’œ", "❀️", "🦊"]
-                .into_iter()
-                .map(<&str as Into<Cow<'_, str>>>::into)
-                .map(Token::UnicodeEmoji)
-                .collect::<Vec<Token>>(),
-            tokens.1
+            Token::Sequence(vec![
+                Token::PlainText("\u{0200d}".into()), // ZWJ
+                Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag
+            ]),
+            parse_full("\u{0200d}\u{1f3f3}\u{0fe0f}")
+        );
+
+        // Trans flag, ZWNJ
+        assert_eq!(
+            Token::Sequence(vec![
+                Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag
+                Token::PlainText("\u{0200c}".into()), // ZWNJ
+                Token::UnicodeEmoji("\u{026a7}\u{0fe0f}".into()) // Trans symbol
+            ]),
+            parse_full("\u{1f3f3}\u{0fe0f}\u{0200c}\u{026a7}\u{0fe0f}")
+        );
+
+        assert_eq!(
+            Token::Sequence(vec![
+                Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag
+                Token::PlainText("\u{0200d}\u{0200d}\u{0200d}".into()), // ZWJ
+            ]),
+            parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{0200d}\u{0200d}")
        );
    }
}