From 453891ddf418744c1b649097227dad9b5af9924a Mon Sep 17 00:00:00 2001
From: Natty
Date: Fri, 6 Oct 2023 00:17:52 +0200
Subject: [PATCH] Connected it all

---
 magnetar_mmm_parser/src/lib.rs | 249 +++++++++++++++++++++++++++------
 1 file changed, 204 insertions(+), 45 deletions(-)

diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs
index 4fbb9ef..ed90585 100644
--- a/magnetar_mmm_parser/src/lib.rs
+++ b/magnetar_mmm_parser/src/lib.rs
@@ -6,7 +6,7 @@ use nom::character::complete::{
 };
 use nom::combinator::{eof, fail, map, not, opt, recognize};
 use nom::error::ErrorKind;
-use nom::multi::{many0, many0_count, many1, many1_count, separated_list1};
+use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1};
 use nom::sequence::tuple;
 use nom::{IResult, Offset, Slice};
 use nom_locate::LocatedSpan;
@@ -14,7 +14,7 @@ use std::borrow::Cow;
 use std::collections::HashMap;
 use unicode_segmentation::UnicodeSegmentation;
 
-#[derive(Copy, Clone, Debug)]
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
 pub enum MentionType {
     Community,
     User,
@@ -29,7 +29,7 @@ impl MentionType {
     }
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Eq, PartialEq)]
 pub enum Token<'a> {
     PlainText(Cow<'a, str>),
     Sequence(Vec<Token<'a>>),
@@ -132,6 +132,49 @@ impl Token<'_> {
             Token::Hashtag(url) => Token::Hashtag(Cow::Owned(url.clone().into_owned())),
         }
     }
+
+    fn merged(&self) -> Token {
+        match self {
+            Token::Sequence(tokens) => {
+                let tokens_multi = tokens.iter().fold(Vec::new(), |mut acc, tok| {
+                    if let Some(Token::PlainText(last)) = acc.last_mut() {
+                        if let Token::PlainText(tok_text) = tok {
+                            *last = Cow::from(last.to_string() + tok_text.as_ref());
+
+                            return acc;
+                        }
+                    }
+
+                    acc.push(tok.merged());
+                    acc
+                });
+
+                if tokens_multi.len() == 1 {
+                    return tokens_multi.into_iter().next().unwrap();
+                }
+
+                Token::Sequence(tokens_multi)
+            }
+            Token::Quote(inner) => Token::Quote(Box::new(inner.merged())),
+            Token::Small(inner) => Token::Small(Box::new(inner.merged())),
+            Token::Big(inner) => Token::Big(Box::new(inner.merged())),
+            Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.merged())),
+            Token::Bold(inner) => Token::Bold(Box::new(inner.merged())),
+            Token::Italic(inner) => Token::Italic(Box::new(inner.merged())),
+            Token::Center(inner) => Token::Center(Box::new(inner.merged())),
+            Token::Strikethrough(inner) => Token::Strikethrough(Box::new(inner.merged())),
+            Token::Function {
+                name,
+                params,
+                inner,
+            } => Token::Function {
+                name: name.clone(),
+                params: params.clone(),
+                inner: Box::new(inner.merged()),
+            },
+            other => other.clone(),
+        }
+    }
 }
 
 type Span<'a> = LocatedSpan<&'a str>;
@@ -244,25 +287,103 @@ struct Context;
 
 impl Context {
     #[inline]
-    const fn partial<'a>(
+    const fn partial(
         &self,
-        func: impl Fn(&Self, Span<'a>) -> IResult<Span<'a>, Token<'a>> + 'static,
-    ) -> impl Fn(Span<'a>) -> IResult<Span<'a>, Token<'a>> + '_ {
+        func: impl for<'a> Fn(&Self, Span<'a>) -> IResult<Span<'a>, Token<'a>> + 'static,
+    ) -> impl for<'a> Fn(Span<'a>) -> IResult<Span<'a>, Token<'a>> + '_ {
         move |input| func(self, input)
     }
 
-    fn root<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
-        let (input, token) = alt((self.partial(Self::tag_quote),))(input)?;
-        Ok((input, token))
+    fn full<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        map(many1(self.partial(Self::full_single)), Token::Sequence)(input)
     }
 
     fn inline<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
-        let (input, token) = alt((self.partial(Self::tag_small), self.partial(Self::text)))(input)?;
+        map(many1(self.partial(Self::inline_single)), Token::Sequence)(input)
+    }
+
+    fn inline_label_safe<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        map(
+            many1(self.partial(Self::inline_label_safe_single)),
+            Token::Sequence,
+        )(input)
+    }
+
+    fn base_bold_italic<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        alt((
+            self.partial(Self::tag_bold_italic_asterisk),
+            self.partial(Self::tag_bold_italic_underscore),
+            self.partial(Self::tag_bold_asterisk),
+            self.partial(Self::tag_italic_asterisk),
+            self.partial(Self::tag_bold_underscore),
+            self.partial(Self::tag_italic_underscore),
+        ))(input)
+    }
+
+    fn full_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, token) = alt((
+            self.partial(Self::unicode_emoji),
+            self.partial(Self::tag_block_center),
+            self.partial(Self::tag_small),
+            self.partial(Self::tag_plain),
+            self.partial(Self::tag_bold),
+            self.partial(Self::tag_italic),
+            self.partial(Self::tag_strikethrough),
+            self.partial(Self::url_no_embed),
+            self.partial(Self::base_bold_italic),
+            self.partial(Self::tag_block_code),
+            self.partial(Self::tag_inline_code),
+            self.partial(Self::tag_quote),
+            self.partial(Self::tag_block_math),
+            self.partial(Self::tag_inline_math),
+            self.partial(Self::tag_strikethrough_tilde),
+            self.partial(Self::tag_func),
+            self.partial(Self::tag_mention),
+            self.partial(Self::tag_hashtag),
+            self.partial(Self::shortcode_emoji),
+            self.partial(Self::raw_url),
+            self.partial(Self::text),
+        ))(input)?;
         Ok((input, token))
     }
 
-    fn inline_no_link<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
-        let (input, token) = alt((self.partial(Self::tag_small), self.partial(Self::text)))(input)?;
+    fn inline_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, token) = alt((
+            self.partial(Self::unicode_emoji),
+            self.partial(Self::tag_small),
+            self.partial(Self::tag_plain),
+            self.partial(Self::tag_bold),
+            self.partial(Self::tag_italic),
+            self.partial(Self::tag_strikethrough),
+            self.partial(Self::url_no_embed),
+            self.partial(Self::base_bold_italic),
+            self.partial(Self::tag_inline_code),
+            self.partial(Self::tag_inline_math),
+            self.partial(Self::tag_strikethrough_tilde),
+            self.partial(Self::tag_func),
+            self.partial(Self::tag_mention),
+            self.partial(Self::tag_hashtag),
+            self.partial(Self::shortcode_emoji),
+            self.partial(Self::raw_url),
+            self.partial(Self::text),
+        ))(input)?;
+        Ok((input, token))
+    }
+
+    fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, token) = alt((
+            self.partial(Self::unicode_emoji),
+            self.partial(Self::tag_small),
+            self.partial(Self::tag_plain),
+            self.partial(Self::tag_bold),
+            self.partial(Self::tag_italic),
+            self.partial(Self::tag_strikethrough),
+            self.partial(Self::base_bold_italic),
+            self.partial(Self::tag_strikethrough_tilde),
+            self.partial(Self::tag_func),
+            self.partial(Self::shortcode_emoji),
+            self.partial(Self::text),
+        ))(input)?;
         Ok((input, token))
     }
 
@@ -270,7 +391,7 @@ impl Context {
         let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?;
 
         if let (None, None) = leading_spaces {
-            if input.get_column() != 0 {
+            if input.get_column() != 1 {
                 return fail(input);
             }
         }
@@ -295,7 +416,12 @@ impl Context {
             return fail(input);
         }
 
-        let (_, inner) = spliced(&quote_lines, space, Token::Quote, orig_input)?;
+        let (_, inner) = spliced(
+            &quote_lines,
+            self.partial(Self::full),
+            Token::Quote,
+            orig_input,
+        )?;
 
         let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?;
 
@@ -308,27 +434,23 @@ impl Context {
 
         let (input, _) = opt(line_ending)(input)?;
 
-        if input.get_column() != 0 {
+        if input.get_column() != 1 {
             return fail(input);
         }
 
         let (input, _) = tag_start(input)?;
         let (input, _) = opt(line_ending)(input)?;
 
-        let (input, center_seq) = many0(tuple((
-            not(tuple((opt(line_ending), tag_end))),
-            self.partial(Self::inline),
-        )))(input)?;
+        let (input, (center_seq, _)) = many_till(
+            self.partial(Self::inline_single),
+            tuple((opt(line_ending), tag_end)),
+        )(input)?;
 
-        let (input, _) = opt(line_ending)(input)?;
-        let (input, _) = tag_end(input)?;
         let (input, _) = many0(space)(input)?;
-        let (input, _) = not(not_line_ending)(input)?;
+        let (input, _) = not(not(line_ending))(input)?;
         let (input, _) = opt(line_ending)(input)?;
 
-        let tokens = center_seq.into_iter().map(|(_, v)| v).collect::<Vec<_>>();
-
-        Ok((input, boxing_sequence(Token::Center)(tokens)))
+        Ok((input, boxing_sequence(Token::Center)(center_seq)))
     }
 
     fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
@@ -336,7 +458,7 @@ impl Context {
 
         let (input, _) = opt(line_ending)(input)?;
 
-        if input.get_column() != 0 {
+        if input.get_column() != 1 {
             return fail(input);
         }
 
@@ -358,7 +480,7 @@ impl Context {
         let (input, _) = line_ending(input)?;
         let (input, _) = delim(input)?;
         let (input, _) = many0(space)(input)?;
-        let (input, _) = not(not_line_ending)(input)?;
+        let (input, _) = not(not(line_ending))(input)?;
         let (input, _) = opt(line_ending)(input)?;
 
         Ok((
@@ -376,7 +498,7 @@ impl Context {
 
         let (input, _) = opt(line_ending)(input)?;
 
-        if input.get_column() != 0 {
+        if input.get_column() != 1 {
             return fail(input);
         }
 
@@ -458,8 +580,7 @@ impl Context {
             tag("_"),
         ))));
 
-        let (input, func_name_span) = func_ident(input)?;
-        let func_name = func_name_span.into_fragment();
+        let (input, func_name) = map(func_ident, Span::into_fragment)(input)?;
 
         let arg = tuple((func_ident, opt(tuple((tag("="), param_value)))));
 
@@ -478,16 +599,16 @@ impl Context {
                 .collect::<HashMap<_, _>>()
         });
 
-        let (input, inner) = self.partial(Self::inline)(input)?;
+        let (input, _) = opt(space)(input)?;
 
-        let (input, _) = tag("]")(input)?;
+        let (input, (inner, _)) = many_till(self.partial(Self::inline_single), tag("]"))(input)?;
 
         Ok((
             input,
             Token::Function {
                 name: Cow::from(func_name),
                 params: args_out,
-                inner: Box::new(inner),
+                inner: Box::new(Token::Sequence(inner)),
             },
         ))
     }
@@ -649,15 +770,11 @@ impl Context {
     }
 
     fn text<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
-        let before = input;
-        let (input, _) = anychar(input)?;
-        Ok((
-            input,
-            Token::PlainText(before.fragment_between(&input).into()),
-        ))
+        let (input, text) = map(recognize(anychar), Span::into_fragment)(input)?;
+        Ok((input, Token::PlainText(text.into())))
     }
 
-    fn url<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+    fn raw_url<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         let (input, url_span) = recognize(tuple((
             protocol,
             url_chars(|input| not(url_chars_base)(input), false),
@@ -688,8 +805,10 @@ impl Context {
         let (input, no_embed) = opt(tag("?"))(input)?;
         let (input, _) = tag("[")(input)?;
         let (input, _) = not(tag("["))(input)?;
-        let (input, label_span) =
-            recognize(many1(tuple((not(tag("](")), not_line_ending))))(input)?;
+        let (input, label_span) = recognize(many1(tuple((
+            not(tag("](")),
+            self.partial(Self::inline_label_safe_single),
+        ))))(input)?;
         let (input, _) = tag("]")(input)?;
         let (input, _) = tag("(")(input)?;
         let (input, url_span) = recognize(tuple((protocol, url_chars(tag("]"), true))))(input)?;
@@ -772,7 +891,7 @@ impl Context {
         ))
     }
 
-    fn hashtag<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+    fn tag_hashtag<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         // TODO: Skip when preceded by alphanumerics
         let (input, _) = tag("#")(input)?;
 
@@ -843,9 +962,11 @@ fn url_chars<'a, T: 'a>(
 
 #[cfg(test)]
 mod test {
-    use crate::{url_chars, Context, Span};
+    use crate::{url_chars, Context, Span, Token};
    use nom::bytes::complete::tag;
    use nom::multi::many1;
+    use std::borrow::Cow;
+    use std::collections::HashMap;
 
     #[test]
     fn parse_url_chars() {
@@ -895,12 +1016,50 @@ mod test {
         );
     }
 
+    #[test]
+    fn parse_complex() {
+        let emoji = r#"$[x2 $[sparkle πŸ₯Ί]πŸ’œ$[spin.y,speed=5s ❀️]🦊]"#;
+        assert_eq!(
+            Token::Function {
+                name: "x2".into(),
+                params: HashMap::new(),
+                inner: Box::new(Token::Sequence(vec![
+                    Token::Function {
+                        name: "sparkle".into(),
+                        params: HashMap::new(),
+                        inner: Box::new(Token::UnicodeEmoji("πŸ₯Ί".into())),
+                    },
+                    Token::UnicodeEmoji("πŸ’œ".into()),
+                    Token::Function {
+                        name: "spin".into(),
+                        params: {
+                            let mut params = HashMap::new();
+                            params.insert("y".into(), None);
+                            params.insert("speed".into(), Some("5s".into()));
+                            params
+                        },
+                        inner: Box::new(Token::UnicodeEmoji("❀️".into())),
+                    },
+                    Token::UnicodeEmoji("🦊".into()),
+                ]))
+            },
+            Context.full(Span::new(emoji)).unwrap().1.merged()
+        )
+    }
+
     #[test]
     fn parse_emoji() {
         let test = "πŸ₯ΊπŸ’œβ€οΈπŸ¦Š";
 
         let ctx = Context;
         let tokens = many1(ctx.partial(Context::unicode_emoji))(Span::from(test)).unwrap();
-        println!("{:#?}", tokens.1)
+        assert_eq!(
+            vec!["πŸ₯Ί", "πŸ’œ", "❀️", "🦊"]
+                .into_iter()
+                .map(<&str as Into<Cow<_>>>::into)
+                .map(Token::UnicodeEmoji)
+                .collect::<Vec<_>>(),
+            tokens.1
+        );
     }
 }
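Usage sketch (reviewer note, not part of the diff): roughly how the new full() entry point and the merged() normalization added in this patch are driven, mirroring the parse_complex test above. The input string and the demo_full_parse name are made up for illustration.

    // Hypothetical in-crate snippet; Context, Span and Token are crate-internal,
    // so this would live alongside the tests in lib.rs.
    fn demo_full_parse() {
        let input = Span::new("$[x2 some πŸ₯Ί text]");
        // full() tries every block- and inline-level parser in order and wraps
        // the results in a Token::Sequence.
        let (_rest, token) = Context.full(input).expect("parse should succeed");
        // merged() collapses runs of adjacent PlainText tokens and unwraps
        // single-element sequences, which keeps assertions compact.
        let normalized = token.merged();
        println!("{normalized:#?}");
    }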