From d0d977e6ebbf7676b26c1a76ab5890baa2389910 Mon Sep 17 00:00:00 2001
From: Natty
Date: Sun, 8 Oct 2023 22:15:55 +0200
Subject: [PATCH] Fixed URL parsing and initial flanking rules implementation

---
 magnetar_mmm_parser/src/lib.rs | 388 +++++++++++++++++++++++++++------
 1 file changed, 322 insertions(+), 66 deletions(-)

diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs
index 7385d65..d270760 100644
--- a/magnetar_mmm_parser/src/lib.rs
+++ b/magnetar_mmm_parser/src/lib.rs
@@ -1,19 +1,20 @@
 use either::Either;
 use nom::branch::alt;
-use nom::bytes::complete::tag;
+use nom::bytes::complete::{tag, tag_no_case};
 use nom::character::complete::{
-    alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of, space1,
-    tab,
+    alpha1, alphanumeric1, anychar, char as one_char, char, line_ending, not_line_ending, one_of,
+    satisfy, space1, tab,
 };
 use nom::combinator::{eof, fail, map, not, opt, recognize};
 use nom::error::ErrorKind;
 use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1};
 use nom::sequence::tuple;
-use nom::{IResult, Offset, Slice};
+use nom::{Compare, IResult, Offset, Slice};
 use nom_locate::LocatedSpan;
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::convert::{identity, Infallible};
+use std::marker::PhantomData;
 use unicode_segmentation::UnicodeSegmentation;
 
 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
@@ -73,6 +74,80 @@ pub enum Token<'a> {
 }
 
 impl Token<'_> {
+    fn str_content_left(&self) -> Option<&str> {
+        match self {
+            Token::PlainText(text) => Some(text.as_ref()),
+            Token::Sequence(tokens) => tokens.first().and_then(Token::str_content_left),
+            Token::Quote(inner) => inner.str_content_left(),
+            Token::Small(inner) => inner.str_content_left(),
+            Token::BoldItalic(inner) => inner.str_content_left(),
+            Token::Bold(inner) => inner.str_content_left(),
+            Token::Italic(inner) => inner.str_content_left(),
+            Token::Center(inner) => inner.str_content_left(),
+            Token::Strikethrough(inner) => inner.str_content_left(),
+            Token::PlainTag(tag) => Some(tag.as_ref()),
+            Token::UrlRaw(url) => Some(url.as_ref()),
+            Token::UrlNoEmbed(url) => Some(url.as_ref()),
+            Token::Link { label, .. } => label.str_content_left(),
+            Token::Function { inner, .. } => inner.str_content_left(),
+            Token::Mention { name, .. } => Some(name.as_ref()),
+            Token::UnicodeEmoji(code) => Some(code.as_ref()),
+            Token::ShortcodeEmoji(_) => None,
+            Token::Hashtag(tag) => Some(tag.as_ref()),
+            _ => None,
+        }
+    }
+
+    fn str_content_right(&self) -> Option<&str> {
+        match self {
+            Token::PlainText(text) => Some(text.as_ref()),
+            Token::Sequence(tokens) => tokens.last().and_then(Token::str_content_right),
+            Token::Quote(inner) => inner.str_content_right(),
+            Token::Small(inner) => inner.str_content_right(),
+            Token::BoldItalic(inner) => inner.str_content_right(),
+            Token::Bold(inner) => inner.str_content_right(),
+            Token::Italic(inner) => inner.str_content_right(),
+            Token::Center(inner) => inner.str_content_right(),
+            Token::Strikethrough(inner) => inner.str_content_right(),
+            Token::PlainTag(tag) => Some(tag.as_ref()),
+            Token::UrlRaw(url) => Some(url.as_ref()),
+            Token::UrlNoEmbed(url) => Some(url.as_ref()),
+            Token::Link { label, .. } => label.str_content_right(),
+            Token::Function { inner, .. } => inner.str_content_right(),
+            Token::Mention { name, .. } => Some(name.as_ref()),
+            Token::UnicodeEmoji(code) => Some(code.as_ref()),
+            Token::Hashtag(tag) => Some(tag.as_ref()),
+            _ => None,
+        }
+    }
+
+    fn inner(&self) -> Token {
+        match self {
+            plain @ Token::PlainText(_) => plain.clone(),
+            sequence @ Token::Sequence(_) => sequence.clone(),
+            Token::Quote(inner) => inner.inner(),
+            Token::Small(inner) => inner.inner(),
+            Token::BoldItalic(inner) => inner.inner(),
+            Token::Bold(inner) => inner.inner(),
+            Token::Italic(inner) => inner.inner(),
+            Token::Center(inner) => inner.inner(),
+            Token::Strikethrough(inner) => inner.inner(),
+            Token::PlainTag(text) => Token::PlainText(text.clone()),
+            Token::InlineCode(code) => Token::PlainText(code.clone()),
+            Token::InlineMath(math) => Token::PlainText(math.clone()),
+            Token::UrlRaw(url) => Token::PlainText(url.clone()),
+            Token::UrlNoEmbed(url) => Token::PlainText(url.clone()),
+            Token::Link { label, .. } => label.inner(),
+            Token::BlockCode { inner, .. } => Token::PlainText(inner.clone()),
+            Token::BlockMath(math) => Token::PlainText(math.clone()),
+            Token::Function { inner, .. } => inner.inner(),
+            Token::Mention { name, .. } => Token::PlainText(name.clone()),
+            Token::UnicodeEmoji(code) => Token::PlainText(code.clone()),
+            Token::ShortcodeEmoji(shortcode) => Token::PlainText(shortcode.clone()),
+            Token::Hashtag(tag) => Token::PlainText(tag.clone()),
+        }
+    }
+
     fn owned(&self) -> Token<'static> {
         match self {
             Token::PlainText(text) => Token::PlainText(Cow::Owned(text.clone().into_owned())),
@@ -129,7 +204,7 @@ impl Token<'_> {
             Token::ShortcodeEmoji(shortcode) => {
                 Token::ShortcodeEmoji(Cow::Owned(shortcode.clone().into_owned()))
             }
-            Token::Hashtag(url) => Token::Hashtag(Cow::Owned(url.clone().into_owned())),
+            Token::Hashtag(tag) => Token::Hashtag(Cow::Owned(tag.clone().into_owned())),
         }
     }
 
@@ -245,6 +320,16 @@ fn collect_char_sequence<'a>(
     move |chars| func(Cow::Owned(chars.collect()))
 }
 
+#[inline]
+fn alpha1_unicode(input: Span) -> IResult<Span, Span> {
+    recognize(many1_count(satisfy(char::is_alphabetic)))(input)
+}
+
+#[inline]
+fn alphanumeric1_unicode(input: Span) -> IResult<Span, Span> {
+    recognize(many1_count(satisfy(char::is_alphanumeric)))(input)
+}
+
 fn spliced<'a>(
     segments: &[Span<'a>],
     func: impl Fn(Span) -> IResult<Span, Token>,
@@ -316,15 +401,16 @@ fn space(input: Span) -> IResult<Span, Token> {
     Ok((input, Token::PlainText(frag.into_fragment().into())))
 }
 
-struct Matcher<'a, 'b, T> {
+#[derive(Copy, Clone)]
+struct Matcher<'a, 'b, T: Clone> {
     matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a),
     collector: &'a (dyn Fn(&mut dyn Iterator<Item = Token<'b>>) -> Token<'b> + 'a),
-    _phantom_closure: std::marker::PhantomData<&'a ()>,
-    _phantom_data: std::marker::PhantomData<&'b ()>,
-    _phantom_output: std::marker::PhantomData<fn() -> T>,
+    _phantom_closure: PhantomData<&'a ()>,
+    _phantom_data: PhantomData<&'b ()>,
+    _phantom_output: PhantomData<fn() -> T>,
 }
 
-impl<'a, 'b, T> Matcher<'a, 'b, T> {
+impl<'a, 'b, T: Clone> Matcher<'a, 'b, T> {
     fn new(
         matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a),
         collector: &'a (dyn Fn(&mut dyn Iterator<Item = Token<'b>>) -> Token<'b> + 'a),
     ) -> Self {
         Self {
             matcher_inner,
             collector,
-            _phantom_closure: std::marker::PhantomData,
-            _phantom_data: std::marker::PhantomData,
-            _phantom_output: std::marker::PhantomData,
+            _phantom_closure: PhantomData,
+            _phantom_data: PhantomData,
+            _phantom_output: PhantomData,
         }
     }
 }
 
@@ -345,33 +431,60 @@ impl<'a, 'b> Matcher<'a, 'b, Infallible> {
         Self {
             matcher_inner: &fail::<_, Infallible, _>,
             collector: &|_| unreachable!(),
-            _phantom_closure: std::marker::PhantomData,
-            _phantom_data: std::marker::PhantomData,
-            _phantom_output: std::marker::PhantomData,
+            _phantom_closure: PhantomData,
+            _phantom_data: PhantomData,
+            _phantom_output: PhantomData,
         }
     }
 }
 
-struct Context;
+#[derive(Copy, Clone, Debug)]
+enum FlankingRule {
+    Lenient,
+    Strict,
+    DontCare,
+}
+
+struct FlankingDelim<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>>(
+    T,
+    FlankingRule,
+    PhantomData<&'a ()>,
+);
+
+impl<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>> From<(T, FlankingRule)>
+    for FlankingDelim<'a, T>
+{
+    fn from((func, rule): (T, FlankingRule)) -> Self {
+        FlankingDelim(func, rule, PhantomData)
+    }
+}
+
+impl<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>> From<T> for FlankingDelim<'a, T> {
+    fn from(func: T) -> Self {
+        FlankingDelim(func, FlankingRule::DontCare, PhantomData)
+    }
+}
+
+pub struct Context;
 
 impl Context {
     #[inline]
-    const fn partial(
+    fn partial(
         &self,
         func: impl for<'a> Fn(&Self, Span<'a>) -> IResult<Span<'a>, Token<'a>> + 'static,
     ) -> impl for<'a> Fn(Span<'a>) -> IResult<Span<'a>, Token<'a>> + '_ {
         move |input| func(self, input)
     }
 
-    fn full<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+    pub fn full<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         map(many1(self.partial(Self::full_single)), Token::Sequence)(input)
     }
 
-    fn inline<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+    pub fn inline<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         map(many1(self.partial(Self::inline_single)), Token::Sequence)(input)
     }
 
-    fn inline_label_safe<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+    pub fn inline_label_safe<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         map(
             many1(self.partial(Self::inline_label_safe_single)),
             Token::Sequence,
@@ -606,14 +719,21 @@ impl Context {
     }
 
     #[inline]
-    fn tag_delimited<'a, 'b: 'a, T, S>(
+    fn tag_delimited<'a, 'b: 'a, T: Clone, S: Clone, FOpen, FClose>(
         &'a self,
-        opening_tag: impl Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
-        closing_tag: impl Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
+        opening_tag: impl Into<FlankingDelim<'b, FOpen>> + 'a,
+        closing_tag: impl Into<FlankingDelim<'b, FClose>> + 'a,
         escape: bool,
         matcher: Matcher<'a, 'b, T>,
         fallback: Matcher<'a, 'b, S>,
-    ) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token<'b>> + '_ {
+    ) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token<'b>> + '_
+    where
+        FOpen: Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
+        FClose: Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
+    {
+        let FlankingDelim(opening_tag, opening_rule, ..) = opening_tag.into();
+        let FlankingDelim(closing_tag, closing_rule, ..) = closing_tag.into();
+
         move |input| {
             if escape {
                 if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), &opening_tag))(input) {
@@ -662,10 +782,44 @@ impl Context {
                 ));
             }
 
-            let (input, (inner, _)) = res?;
+            let (input, (inner, closing)) = res?;
 
             let mut inner = inner.into_iter().map(|(_, t)| t);
 
-            Ok((input, (matcher.collector)(&mut inner)))
+            let inner_tok = (matcher.collector)(&mut inner);
+
+            let correct_left_flanking =
+                if let FlankingRule::Lenient | FlankingRule::Strict = opening_rule {
+                    let text_left = inner_tok.str_content_left();
+
+                    !(text_left.is_some_and(|s| s.starts_with(char::is_whitespace))
+                        || text_left.is_none())
+                } else {
+                    true
+                };
+
+            let correct_right_flanking =
+                if let FlankingRule::Lenient | FlankingRule::Strict = closing_rule {
+                    let text_right = inner_tok.str_content_right();
+
+                    !(text_right.is_some_and(|s| s.ends_with(char::is_whitespace))
+                        || text_right.is_none())
+                } else {
+                    true
+                };
+
+            // TODO: Unfinished flanking rules
+            let correct_flanking = correct_left_flanking && correct_right_flanking;
+
+            if !correct_flanking {
+                return Ok((
+                    input,
+                    Token::Sequence(vec![
+                        Token::PlainText(begin.fragment_between(&post_open).into()),
+                        inner_tok.inner().owned(),
+                        Token::PlainText(closing.into_fragment().into()),
+                    ]),
+                ));
+            }
+
+            Ok((input, Token::Sequence(vec![inner_tok])))
         }
     }
 
@@ -720,12 +874,12 @@ impl Context {
     }
 
     fn tag_plain<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
-        let opening_tag = &tag("<plain>");
-        let closing_tag = &tag("</plain>");
+        let opening_tag = &tag("<plain>");
+        let closing_tag = &tag("</plain>");
 
         let (input, _) = opening_tag(input)?;
         let (input, text) = map(
-            recognize(many1(tuple((not_line_ending, not(closing_tag))))),
+            recognize(many1(tuple((not(line_ending), not(closing_tag), anychar)))),
             Span::into_fragment,
         )(input)?;
         let (input, _) = closing_tag(input)?;
@@ -735,8 +889,8 @@ impl Context {
 
         Ok((input, Token::PlainTag(text.into())))
     }
 
     fn tag_small<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            tag("<small>"),
-            tag("</small>"),
+            tag_no_case("<small>"),
+            tag_no_case("</small>"),
             false,
             Matcher::new(
                 &self.partial(Self::inline_single),
                 &collect_sequence(Token::Small),
             ),
             Matcher::reject(),
         )(input)
     }
 
-    // TODO: CommonMark flanking rules
     fn tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            tag("***"),
-            tag("***"),
+            (tag("***"), FlankingRule::Lenient),
+            (tag("***"), FlankingRule::Lenient),
             true,
             Matcher::new(
                 &self.partial(Self::inline_single),
                 &collect_sequence(Token::BoldItalic),
             ),
             Matcher::reject(),
         )(input)
     }
 
-    // TODO: CommonMark flanking rules
     fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            tag("___"),
-            tag("___"),
+            (tag("___"), FlankingRule::Strict),
+            (tag("___"), FlankingRule::Strict),
             true,
             Matcher::new(
                 &self.partial(Self::inline_single),
                 &collect_sequence(Token::BoldItalic),
             ),
             Matcher::reject(),
         )(input)
     }
 
     fn tag_bold<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            tag("<b>"),
-            tag("</b>"),
+            tag_no_case("<b>"),
+            tag_no_case("</b>"),
             false,
             Matcher::new(
                 &self.partial(Self::inline_single),
                 &collect_sequence(Token::Bold),
             ),
             Matcher::reject(),
         )(input)
     }
 
-    // TODO: CommonMark flanking rules
     fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            tag("**"),
-            tag("**"),
+            (tag("**"), FlankingRule::Lenient),
+            (tag("**"), FlankingRule::Lenient),
             true,
             Matcher::new(
                 &self.partial(Self::inline_single),
                 &collect_sequence(Token::Bold),
             ),
             Matcher::reject(),
         )(input)
     }
 
-    // TODO: CommonMark flanking rules
     fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            tag("__"),
-            tag("__"),
(tag("__"), FlankingRule::Strict), + (tag("__"), FlankingRule::Strict), true, Matcher::new( &self.partial(Self::inline_single), @@ -835,8 +985,8 @@ impl Context { fn tag_italic<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag(""), - tag(""), + tag_no_case(""), + tag_no_case(""), false, Matcher::new( &self.partial(Self::inline_single), @@ -849,11 +999,10 @@ impl Context { )(input) } - // TODO: CommonMark flanking rules fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag("*"), - tag("*"), + (tag("*"), FlankingRule::Lenient), + (tag("*"), FlankingRule::Lenient), true, Matcher::new( &self.partial(Self::inline_single), @@ -866,11 +1015,10 @@ impl Context { )(input) } - // TODO: CommonMark flanking rules fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag("_"), - tag("_"), + (tag("_"), FlankingRule::Strict), + (tag("_"), FlankingRule::Strict), true, Matcher::new( &self.partial(Self::inline_single), @@ -885,8 +1033,8 @@ impl Context { fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag(""), - tag(""), + tag_no_case(""), + tag_no_case(""), false, Matcher::new( &self.partial(Self::inline_single), @@ -899,11 +1047,10 @@ impl Context { )(input) } - // TODO: CommonMark flanking rules fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag("~~"), - tag("~~"), + (tag("~~"), FlankingRule::Lenient), + (tag("~~"), FlankingRule::Lenient), true, Matcher::new( &move |input| { @@ -1037,20 +1184,42 @@ impl Context { } fn shortcode_emoji<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { - // TODO: Fail when preceded by alphanumerics + if let (plain_out, Some(plain)) = map( + opt(recognize(tuple(( + alphanumeric1_unicode, + self.partial(Self::shortcode_emoji), + )))), + |o| o.map(Span::into_fragment), + )(input)? + { + return Ok((plain_out, Token::PlainText(plain.into()))); + } + let (input, _) = tag(":")(input)?; let (input, shortcode) = map( - recognize(many1(alt((alphanumeric1, recognize(one_of("_+-")))))), + recognize(many1(alt(( + alphanumeric1_unicode, + recognize(one_of("_+-")), + )))), Span::into_fragment, )(input)?; let (input, _) = tag(":")(input)?; - let (input, _) = not(alphanumeric1)(input)?; + let (input, _) = not(alphanumeric1_unicode)(input)?; Ok((input, Token::ShortcodeEmoji(shortcode.into()))) } fn tag_mention<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { - // TODO: Escaping and skip when preceded by alphanumerics + if let (plain_out, Some(plain)) = map( + opt(recognize(tuple(( + alt((tag("\\"), alphanumeric1_unicode)), + self.partial(Self::tag_mention), + )))), + |o| o.map(Span::into_fragment), + )(input)? 
+        {
+            return Ok((plain_out, Token::PlainText(plain.into())));
+        }
 
         let tags = one_of("@!");
         let (input, mention_type) = map(tags, |c| match c {
@@ -1123,12 +1292,12 @@ fn protocol(input: Span) -> IResult<Span, Span> {
 
 #[inline]
 fn url_chars_base(input: Span) -> IResult<Span, Span> {
-    recognize(alt((
-        alpha1,
+    alt((
+        alphanumeric1_unicode,
         recognize(tuple((tag("["), many_till(url_chars_base, tag("]"))))),
         recognize(tuple((tag("("), many_till(url_chars_base, tag(")"))))),
         recognize(one_of(".,_/:%#$&?!~=+-@")),
-    )))(input)
+    ))(input)
 }
 
 #[inline]
@@ -1221,6 +1390,21 @@ mod test {
             Token::Italic(Box::new(Token::PlainText("italic".into()))),
         );
 
+        assert_eq!(
+            parse_full(r#"* italic *"#),
+            Token::PlainText("* italic *".into())
+        );
+
+        assert_eq!(
+            parse_full(r#"_ italic *"#),
+            Token::PlainText("_ italic *".into())
+        );
+
+        assert_eq!(
+            parse_full(r#"*"italic"*"#),
+            Token::Italic(Box::new(Token::PlainText("\"italic\"".into())))
+        );
+
         assert_eq!(
             parse_full(r#"not code `code` also not code"#),
             Token::Sequence(vec![
@@ -1356,6 +1540,47 @@ text"#
 
     #[test]
     fn parse_link() {
+        assert_eq!(
+            parse_full("IPv4 test: <https://0>"),
+            Token::Sequence(vec![
+                Token::PlainText("IPv4 test: ".into()),
+                Token::UrlNoEmbed("https://0".into())
+            ])
+        );
+
+        assert_eq!(
+            parse_full("IPv4 test: <https://127.0.0.1>"),
+            Token::Sequence(vec![
+                Token::PlainText("IPv4 test: ".into()),
+                Token::UrlNoEmbed("https://127.0.0.1".into())
+            ])
+        );
+
+        assert_eq!(
+            parse_full("IPv6 test: <https://[::2f:1]/nya>"),
+            Token::Sequence(vec![
+                Token::PlainText("IPv6 test: ".into()),
+                Token::UrlNoEmbed("https://[::2f:1]/nya".into())
+            ])
+        );
+
+        assert_eq!(
+            parse_full("IPv6 test: https://[::2f:1]/nya"),
+            Token::Sequence(vec![
+                Token::PlainText("IPv6 test: ".into()),
+                Token::UrlRaw("https://[::2f:1]/nya".into())
+            ])
+        );
+
+        // IDNs
+        assert_eq!(
+            parse_full("IDN test: https://www.háčkyčárky.cz/"),
+            Token::Sequence(vec![
+                Token::PlainText("IDN test: ".into()),
+                Token::UrlRaw("https://www.háčkyčárky.cz/".into())
+            ])
+        );
+
         assert_eq!(
             parse_full("Link test: [label](https://example.com)"),
             Token::Sequence(vec![
@@ -1440,6 +1665,11 @@ text"#
             }
         );
 
+        assert_eq!(
+            parse_full("email@notactuallyamenmtion.org"),
+            Token::PlainText("email@notactuallyamenmtion.org".into())
+        );
+
         assert_eq!(
             parse_full("hgsjlkdsa @tag fgahjsdkd"),
             Token::Sequence(vec![
@@ -1532,6 +1762,32 @@ text"#
         );
     }
 
+    #[test]
+    fn parse_shortcodes() {
+        assert_eq!(
+            parse_full(":bottom:"),
+            Token::ShortcodeEmoji("bottom".into())
+        );
+
+        assert_eq!(
+            parse_full(":bottom::blobfox:"),
+            Token::Sequence(vec![
+                Token::ShortcodeEmoji("bottom".into()),
+                Token::ShortcodeEmoji("blobfox".into())
+            ])
+        );
+
+        assert_eq!(
+            parse_full(":bottom:blobfox"),
+            Token::PlainText(":bottom:blobfox".into())
+        );
+
+        assert_eq!(
+            parse_full("bottom:blobfox:"),
+            Token::PlainText("bottom:blobfox:".into())
+        );
+    }
+
     #[test]
     fn parse_emoji() {
         assert_eq!(