From 95141388faacc3e165d71f6235b44a9a292b7499 Mon Sep 17 00:00:00 2001 From: Natty Date: Fri, 29 Sep 2023 16:56:59 +0200 Subject: [PATCH 01/23] Created a project for the MMM parser --- Cargo.lock | 8 ++++++++ Cargo.toml | 2 ++ magnetar_mmm_parser/Cargo.toml | 9 +++++++++ magnetar_mmm_parser/README.md | 5 +++++ magnetar_mmm_parser/src/lib.rs | 0 5 files changed, 24 insertions(+) create mode 100644 magnetar_mmm_parser/Cargo.toml create mode 100644 magnetar_mmm_parser/README.md create mode 100644 magnetar_mmm_parser/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 95e3419..83cf326 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1603,6 +1603,14 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "mmm_parser" +version = "0.2.1-alpha" +dependencies = [ + "nom", + "thiserror", +] + [[package]] name = "nom" version = "7.1.3" diff --git a/Cargo.toml b/Cargo.toml index 95a4fe6..76333e2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ members = [ "fe_calckey", "magnetar_common", "magnetar_sdk", + "magnetar_mmm_parser", "core" ] @@ -36,6 +37,7 @@ hyper = "0.14" js-sys = "0.3" log = "0.4" miette = "5.9" +nom = "7" percent-encoding = "2.2" redis = "0.23" reqwest = "0.11" diff --git a/magnetar_mmm_parser/Cargo.toml b/magnetar_mmm_parser/Cargo.toml new file mode 100644 index 0000000..3620931 --- /dev/null +++ b/magnetar_mmm_parser/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "mmm_parser" +version.workspace = true +edition.workspace = true +license = "MIT OR Apache-2.0" + +[dependencies] +thiserror = { workspace = true } +nom = { workspace = true } \ No newline at end of file diff --git a/magnetar_mmm_parser/README.md b/magnetar_mmm_parser/README.md new file mode 100644 index 0000000..92dff9b --- /dev/null +++ b/magnetar_mmm_parser/README.md @@ -0,0 +1,5 @@ +# MMM + +Magnetar {marinated, modified} Markdown? 
+ +#TODO: Finish docs \ No newline at end of file diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs new file mode 100644 index 0000000..e69de29 From 1af8f4e213207d010fdceb43a4732bb76bb2ef66 Mon Sep 17 00:00:00 2001 From: Natty Date: Sun, 1 Oct 2023 23:04:32 +0200 Subject: [PATCH 02/23] Basic inline tag parsing --- Cargo.lock | 19 +- Cargo.toml | 1 + magnetar_mmm_parser/Cargo.toml | 4 +- magnetar_mmm_parser/src/lib.rs | 405 +++++++++++++++++++++++++++++++++ 4 files changed, 426 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 83cf326..5b8dcd4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -411,6 +411,12 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "bytecount" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" + [[package]] name = "byteorder" version = "1.4.3" @@ -1608,7 +1614,7 @@ name = "mmm_parser" version = "0.2.1-alpha" dependencies = [ "nom", - "thiserror", + "nom_locate", ] [[package]] @@ -1621,6 +1627,17 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nom_locate" +version = "4.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e3c83c053b0713da60c5b8de47fe8e494fe3ece5267b2f23090a07a053ba8f3" +dependencies = [ + "bytecount", + "memchr", + "nom", +] + [[package]] name = "nu-ansi-term" version = "0.46.0" diff --git a/Cargo.toml b/Cargo.toml index 76333e2..a7a960f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,6 +38,7 @@ js-sys = "0.3" log = "0.4" miette = "5.9" nom = "7" +nom_locate = "4" percent-encoding = "2.2" redis = "0.23" reqwest = "0.11" diff --git a/magnetar_mmm_parser/Cargo.toml b/magnetar_mmm_parser/Cargo.toml index 3620931..8a07618 100644 --- a/magnetar_mmm_parser/Cargo.toml +++ b/magnetar_mmm_parser/Cargo.toml @@ -5,5 +5,5 @@ edition.workspace = true license = "MIT OR Apache-2.0" [dependencies] -thiserror = { workspace = true } 
-nom = { workspace = true } \ No newline at end of file +nom = { workspace = true } +nom_locate = { workspace = true } \ No newline at end of file diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index e69de29..9ed647e 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -0,0 +1,405 @@ +use nom::branch::alt; +use nom::bytes::complete::tag; +use nom::character::complete; +use nom::character::complete::{anychar, line_ending, not_line_ending, tab}; +use nom::combinator::{fail, not, opt}; +use nom::error::ErrorKind; +use nom::multi::{many1, separated_list1}; +use nom::sequence::tuple; +use nom::{IResult, Offset, Slice}; +use nom_locate::LocatedSpan; +use std::borrow::Cow; + +enum Token<'a> { + PlainText(Cow<'a, str>), + Sequence(Vec>), + Quote(Box>), + Small(Box>), + Big(Box>), + BoldItalic(Box>), + Bold(Box>), + Italic(Box>), + Center(Box>), + Strikethrough(Box>), + PlainTag(Cow<'a, str>), + InlineCode(Cow<'a, str>), + InlineMath(Cow<'a, str>), +} + +impl Token<'_> { + fn owned(&self) -> Token<'static> { + match self { + Token::PlainText(text) => Token::PlainText(Cow::Owned(text.clone().into_owned())), + Token::Sequence(tokens) => Token::Sequence(tokens.iter().map(Token::owned).collect()), + Token::Quote(inner) => Token::Quote(Box::new(inner.owned())), + Token::Small(inner) => Token::Small(Box::new(inner.owned())), + Token::Big(inner) => Token::Big(Box::new(inner.owned())), + Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.owned())), + Token::Bold(inner) => Token::Bold(Box::new(inner.owned())), + Token::Italic(inner) => Token::Italic(Box::new(inner.owned())), + Token::Center(inner) => Token::Center(Box::new(inner.owned())), + Token::Strikethrough(inner) => Token::Strikethrough(Box::new(inner.owned())), + Token::PlainTag(tag) => Token::PlainTag(Cow::Owned(tag.clone().into_owned())), + Token::InlineCode(code) => Token::InlineCode(Cow::Owned(code.clone().into_owned())), + Token::InlineMath(math) => 
Token::InlineMath(Cow::Owned(math.clone().into_owned())), + } + } +} + +type Span<'a> = LocatedSpan<&'a str>; + +trait SliceOffset { + fn up_to(&self, other: &Self) -> Self; + + fn fragment_between<'a>(&self, other: &Self) -> &'a str + where + Self: 'a; +} + +impl SliceOffset for Span<'_> { + fn up_to(&self, other: &Self) -> Self { + self.slice(..self.offset(other)) + } + + fn fragment_between<'a>(&self, other: &Self) -> &'a str + where + Self: 'a, + { + self.up_to(other).into_fragment() + } +} + +const fn boxing_sequence<'a>( + func: impl Fn(Box>) -> Token<'a>, +) -> impl Fn(Vec>) -> Token<'a> { + move |tokens| func(Box::new(Token::Sequence(tokens))) +} + +const fn collect_char_sequence<'a>( + func: impl Fn(Cow<'a, str>) -> Token<'a>, +) -> impl Fn(Vec) -> Token<'a> { + move |chars| func(Cow::Owned(chars.into_iter().collect())) +} + +fn spliced<'a>( + segments: &[Span<'a>], + func: impl Fn(Span) -> IResult, + parent: Span<'a>, +) -> IResult, Token<'static>, nom::error::Error>> { + let combined = segments + .iter() + .copied() + .map(Span::into_fragment) + .collect::(); + let cum_offset_combined = segments + .iter() + .scan(0, |acc, &x| { + *acc += x.len(); + Some(*acc) + }) + .collect::>(); + let current_seg = |input: Span| { + cum_offset_combined + .iter() + .enumerate() + .filter(|(_, &o)| o >= input.location_offset()) + .map(|(i, o)| (segments[i], o)) + .last() + }; + + type NE = nom::Err; + type NomError<'x> = nom::error::Error>; + + let quote_span = Span::new(&combined); + let (input, inner) = match func(quote_span) { + Ok((input, token)) => (input, token.owned()), + Err(e) => { + return match e { + NE::Error(e) => { + let offset_new = e.input.location_offset(); + if let Some((seg_parent, offset_seg_new)) = current_seg(e.input) { + let offset = offset_new - offset_seg_new; + let offset_orig = offset + seg_parent.location_offset(); + Err(NE::Error(NomError::new( + Span::new(&parent.into_fragment()[offset_orig..]), + e.code, + ))) + } else { + // ??? 
+ Err(NE::Failure(NomError::new(parent, ErrorKind::Fail))) + } + } + NE::Failure(e) => Err(NE::Error(NomError::new(parent, e.code))), + NE::Incomplete(i) => Err(NE::Incomplete(i)), + }; + } + }; + + let out = if let Some((seg_parent, offset_seg_new)) = current_seg(input) { + let offset = input.location_offset() - offset_seg_new; + let offset_orig = offset + seg_parent.location_offset(); + parent.slice(offset_orig..) + } else { + parent + }; + + Ok((out, Token::Quote(Box::new(inner.owned())))) +} + +fn space(input: Span) -> IResult { + let start = input; + let (input, _) = alt((complete::char('\u{0020}'), complete::char('\u{3000}'), tab))(input)?; + Ok(( + input, + Token::PlainText(start.fragment_between(&input).into()), + )) +} + +struct Context; + +impl Context { + const fn partial<'a>( + &self, + func: impl Fn(&Self, Span<'a>) -> IResult, Token<'a>> + 'static, + ) -> impl Fn(Span<'a>) -> IResult, Token<'a>> + '_ { + move |input| func(self, input) + } + + fn root<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let (input, token) = alt((self.partial(Self::tag_quote),))(input)?; + Ok((input, token)) + } + + fn inline<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let (input, token) = alt((self.partial(Self::tag_small), self.partial(Self::text)))(input)?; + Ok((input, token)) + } + + fn tag_quote<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?; + + if let (None, None) = leading_spaces { + if input.get_column() != 0 { + return fail(input); + } + } + + let quote_line = |input| tuple((tag(">"), opt(space), not_line_ending))(input); + + let orig_input = input; + let (input, lines) = separated_list1(line_ending, quote_line)(input)?; + + let quote_lines = lines + .into_iter() + .map(|(_, _, text)| text) + .collect::>(); + + if quote_lines.len() == 1 + && quote_lines + .iter() + .map(Span::fragment) + .copied() + .any(&str::is_empty) + { + return fail(input); + } + + let 
(_, inner) = spliced("e_lines, space, orig_input)?; + + let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?; + + Ok((input, Token::Quote(Box::new(inner)))) + } + + const fn tag_delimited<'a, 'b: 'a, T>( + &'a self, + start: &'b str, + end: &'b str, + escape: bool, + matcher_inner: impl Fn(Span<'b>) -> IResult, T> + 'a, + mapper: impl Fn(Vec) -> Token<'b> + 'a, + ) -> impl Fn(Span<'b>) -> IResult, Token<'b>> + '_ { + move |input| { + let opening_tag = &tag(start); + let closing_tag = &tag(end); + + if escape { + if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), opening_tag))(input) { + return Ok((input_escaped, Token::PlainText(Cow::Borrowed(&mark)))); + } + } + + let begin = input; + let (post_open, _) = opening_tag(input)?; + + let res = tuple(( + many1(tuple((not(closing_tag), &matcher_inner))), + closing_tag, + ))(post_open); + + if let Err(nom::Err::Error(nom::error::Error { .. })) = res { + return Ok(( + post_open, + Token::PlainText(begin.fragment_between(&post_open).into()), + )); + } + + let (input, (inner, _)) = res?; + + let inner = inner.into_iter().map(|(_, t)| t).collect::>(); + + Ok((input, mapper(inner))) + } + } + + fn tag_small<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + self.tag_delimited( + "", + "", + false, + self.partial(Self::inline), + boxing_sequence(Token::Small), + )(input) + } + + // TODO: CommonMark flanking rules + fn tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + self.tag_delimited( + "***", + "***", + true, + self.partial(Self::inline), + boxing_sequence(Token::BoldItalic), + )(input) + } + + // TODO: CommonMark flanking rules + fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + self.tag_delimited( + "___", + "___", + true, + self.partial(Self::inline), + boxing_sequence(Token::BoldItalic), + )(input) + } + + fn tag_bold<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + self.tag_delimited( + "", + "", + false, + 
self.partial(Self::inline), + boxing_sequence(Token::Bold), + )(input) + } + + // TODO: CommonMark flanking rules + fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + self.tag_delimited( + "**", + "**", + true, + self.partial(Self::inline), + boxing_sequence(Token::Bold), + )(input) + } + + // TODO: CommonMark flanking rules + fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + self.tag_delimited( + "__", + "__", + true, + self.partial(Self::inline), + boxing_sequence(Token::Bold), + )(input) + } + + fn tag_italic<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + self.tag_delimited( + "", + "", + false, + self.partial(Self::inline), + boxing_sequence(Token::Italic), + )(input) + } + + // TODO: CommonMark flanking rules + fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + self.tag_delimited( + "*", + "*", + true, + self.partial(Self::inline), + boxing_sequence(Token::Italic), + )(input) + } + + // TODO: CommonMark flanking rules + fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + self.tag_delimited( + "_", + "_", + true, + self.partial(Self::inline), + boxing_sequence(Token::Italic), + )(input) + } + + fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + self.tag_delimited( + "", + "", + false, + self.partial(Self::inline), + boxing_sequence(Token::Strikethrough), + )(input) + } + + // TODO: CommonMark flanking rules + fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + self.tag_delimited( + "~~", + "~~", + true, + move |input| { + tuple((not_line_ending, self.partial(Self::inline)))(input).map(|(i, t)| (i, t.1)) + }, + boxing_sequence(Token::Strikethrough), + )(input) + } + + fn tag_inline_code<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + self.tag_delimited( + "`", + "", + true, + move |input| { + tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar))(input) + .map(|(i, (_skip, c))| 
(i, c)) + }, + collect_char_sequence(Token::InlineCode), + )(input) + } + + fn tag_inline_math<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + self.tag_delimited( + "\\(", + "\\)", + false, + move |input| tuple((not_line_ending, anychar))(input).map(|(i, (_skip, c))| (i, c)), + collect_char_sequence(Token::InlineMath), + )(input) + } + + fn text<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let before = input; + let (input, _) = anychar(input)?; + Ok(( + input, + Token::PlainText(before.fragment_between(&input).into()), + )) + } +} From 46e0766a36d0c84ff6f3a56785014e3695e32879 Mon Sep 17 00:00:00 2001 From: Natty Date: Wed, 4 Oct 2023 19:31:03 +0200 Subject: [PATCH 03/23] Implemented MFM functions and math and center blocks --- Cargo.lock | 8 +- magnetar_mmm_parser/src/lib.rs | 156 +++++++++++++++++++++++++++++++-- 2 files changed, 154 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5b8dcd4..9abfe30 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2499,18 +2499,18 @@ checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" [[package]] name = "serde" -version = "1.0.180" +version = "1.0.188" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea67f183f058fe88a4e3ec6e2788e003840893b91bac4559cabedd00863b3ed" +checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.180" +version = "1.0.188" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24e744d7782b686ab3b73267ef05697159cc0e5abbed3f47f9933165e5219036" +checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" dependencies = [ "proc-macro2", "quote", diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 9ed647e..ee70239 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -1,16 +1,20 @@ use nom::branch::alt; use 
nom::bytes::complete::tag; use nom::character::complete; -use nom::character::complete::{anychar, line_ending, not_line_ending, tab}; -use nom::combinator::{fail, not, opt}; +use nom::character::complete::{ + alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, tab, +}; +use nom::combinator::{fail, not, opt, recognize}; use nom::error::ErrorKind; -use nom::multi::{many1, separated_list1}; +use nom::multi::{many0, many0_count, many1, many1_count, separated_list1}; use nom::sequence::tuple; use nom::{IResult, Offset, Slice}; use nom_locate::LocatedSpan; use std::borrow::Cow; +use std::collections::HashMap; -enum Token<'a> { +#[derive(Clone, Debug)] +pub enum Token<'a> { PlainText(Cow<'a, str>), Sequence(Vec>), Quote(Box>), @@ -24,6 +28,16 @@ enum Token<'a> { PlainTag(Cow<'a, str>), InlineCode(Cow<'a, str>), InlineMath(Cow<'a, str>), + BlockCode { + lang: Option>, + inner: Cow<'a, str>, + }, + BlockMath(Cow<'a, str>), + Function { + name: Cow<'a, str>, + params: HashMap, Option>>, + inner: Box>, + }, } impl Token<'_> { @@ -42,6 +56,28 @@ impl Token<'_> { Token::PlainTag(tag) => Token::PlainTag(Cow::Owned(tag.clone().into_owned())), Token::InlineCode(code) => Token::InlineCode(Cow::Owned(code.clone().into_owned())), Token::InlineMath(math) => Token::InlineMath(Cow::Owned(math.clone().into_owned())), + Token::BlockCode { inner, lang } => Token::BlockCode { + lang: lang.as_ref().map(|l| Cow::Owned(l.clone().into_owned())), + inner: Cow::Owned(inner.clone().into_owned()), + }, + Token::BlockMath(math) => Token::BlockMath(Cow::Owned(math.clone().into_owned())), + Token::Function { + name, + params, + inner, + } => Token::Function { + name: Cow::Owned(name.clone().into_owned()), + params: params + .iter() + .map(|(k, v)| { + ( + Cow::Owned(k.clone().into_owned()), + v.as_ref().map(|val| Cow::Owned(val.clone().into_owned())), + ) + }) + .collect(), + inner: Box::new(inner.owned()), + }, } } } @@ -84,6 +120,7 @@ const fn collect_char_sequence<'a>( 
fn spliced<'a>( segments: &[Span<'a>], func: impl Fn(Span) -> IResult, + output_mapper: impl Fn(Box>) -> Token<'static>, parent: Span<'a>, ) -> IResult, Token<'static>, nom::error::Error>> { let combined = segments @@ -143,7 +180,7 @@ fn spliced<'a>( parent }; - Ok((out, Token::Quote(Box::new(inner.owned())))) + Ok((out, output_mapper(Box::new(inner.owned())))) } fn space(input: Span) -> IResult { @@ -204,13 +241,69 @@ impl Context { return fail(input); } - let (_, inner) = spliced("e_lines, space, orig_input)?; + let (_, inner) = spliced("e_lines, space, Token::Quote, orig_input)?; let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?; Ok((input, Token::Quote(Box::new(inner)))) } + fn tag_block_center<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let tag_start = &tag("
"); + let tag_end = &tag("
"); + + let (input, _) = opt(line_ending)(input)?; + + if input.get_column() != 0 { + return fail(input); + } + + let (input, _) = tag_start(input)?; + let (input, _) = opt(line_ending)(input)?; + + let (input, center_seq) = many0(tuple(( + not(tuple((opt(line_ending), tag_end))), + self.partial(Self::inline), + )))(input)?; + + let (input, _) = opt(line_ending)(input)?; + let (input, _) = tag_end(input)?; + let (input, _) = many0(space)(input)?; + let (input, _) = not(not_line_ending)(input)?; + let (input, _) = opt(line_ending)(input)?; + + let tokens = center_seq.into_iter().map(|(_, v)| v).collect::>(); + + Ok((input, boxing_sequence(Token::Center)(tokens))) + } + + fn tag_block_math<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let (input, _) = opt(line_ending)(input)?; + + if input.get_column() != 0 { + return fail(input); + } + + let (input, _) = tag("\\[")(input)?; + let (input, _) = opt(line_ending)(input)?; + + let (input, math_span) = recognize(many1_count(tuple(( + not(tuple((opt(line_ending), tag("\\]")))), + not_line_ending, + ))))(input)?; + + let (input, _) = opt(line_ending)(input)?; + let (input, _) = tag("\\]")(input)?; + let (input, _) = many0(space)(input)?; + let (input, _) = not(not_line_ending)(input)?; + let (input, _) = opt(line_ending)(input)?; + + Ok(( + input, + Token::BlockMath(Cow::Borrowed(math_span.into_fragment())), + )) + } + const fn tag_delimited<'a, 'b: 'a, T>( &'a self, start: &'b str, @@ -252,6 +345,57 @@ impl Context { } } + fn tag_func<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let (input, _) = tag("$[")(input)?; + + let func_ident = |input| { + recognize(tuple(( + many1_count(alt((alpha1, tag("_")))), + many0_count(alt((alphanumeric1, tag("_")))), + )))(input) + }; + + let param_value = recognize(many1_count(alt(( + alphanumeric1, + tag("."), + tag("-"), + tag("_"), + )))); + + let (input, func_name_span) = func_ident(input)?; + let func_name = func_name_span.into_fragment(); + + let arg = 
tuple((func_ident, opt(tuple((tag("="), param_value))))); + + let (input, args) = + opt(tuple((one_char('.'), separated_list1(one_char(','), arg))))(input)?; + + let args_out = args.map_or_else(HashMap::new, |(_, items)| { + items + .into_iter() + .map(|(k, v)| { + ( + Cow::from(k.into_fragment()), + v.map(|(_, val)| Cow::from(val.into_fragment())), + ) + }) + .collect::>() + }); + + let (input, inner) = self.partial(Self::inline)(input)?; + + let (input, _) = tag("]")(input)?; + + Ok(( + input, + Token::Function { + name: Cow::from(func_name), + params: args_out, + inner: Box::new(inner), + }, + )) + } + fn tag_small<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( "", From 24d44632e04a613e2595276a9046d46f507a6094 Mon Sep 17 00:00:00 2001 From: Natty Date: Wed, 4 Oct 2023 19:44:27 +0200 Subject: [PATCH 04/23] Minor cleanup --- magnetar_mmm_parser/src/lib.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index ee70239..48a9e17 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -195,6 +195,7 @@ fn space(input: Span) -> IResult { struct Context; impl Context { + #[inline] const fn partial<'a>( &self, func: impl Fn(&Self, Span<'a>) -> IResult, Token<'a>> + 'static, @@ -278,22 +279,25 @@ impl Context { } fn tag_block_math<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let start = &tag("\\["); + let end = &tag("\\]"); + let (input, _) = opt(line_ending)(input)?; if input.get_column() != 0 { return fail(input); } - let (input, _) = tag("\\[")(input)?; + let (input, _) = start(input)?; let (input, _) = opt(line_ending)(input)?; let (input, math_span) = recognize(many1_count(tuple(( - not(tuple((opt(line_ending), tag("\\]")))), + not(tuple((opt(line_ending), end))), not_line_ending, ))))(input)?; let (input, _) = opt(line_ending)(input)?; - let (input, _) = tag("\\]")(input)?; + let (input, _) = end(input)?; let 
(input, _) = many0(space)(input)?; let (input, _) = not(not_line_ending)(input)?; let (input, _) = opt(line_ending)(input)?; From 9b26691ff41e3418041691b2025bcca847390159 Mon Sep 17 00:00:00 2001 From: Natty Date: Thu, 5 Oct 2023 19:09:26 +0200 Subject: [PATCH 05/23] Implemented URL parsing --- magnetar_mmm_parser/src/lib.rs | 176 +++++++++++++++++++++++++++++++-- 1 file changed, 167 insertions(+), 9 deletions(-) diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 48a9e17..62d7116 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -1,10 +1,10 @@ use nom::branch::alt; use nom::bytes::complete::tag; -use nom::character::complete; use nom::character::complete::{ - alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, tab, + alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of, space1, + tab, }; -use nom::combinator::{fail, not, opt, recognize}; +use nom::combinator::{eof, fail, not, opt, recognize}; use nom::error::ErrorKind; use nom::multi::{many0, many0_count, many1, many1_count, separated_list1}; use nom::sequence::tuple; @@ -28,6 +28,13 @@ pub enum Token<'a> { PlainTag(Cow<'a, str>), InlineCode(Cow<'a, str>), InlineMath(Cow<'a, str>), + UrlRaw(Cow<'a, str>), + UrlNoEmbed(Cow<'a, str>), + Link { + label: Cow<'a, str>, + href: Cow<'a, str>, + embed: bool, + }, BlockCode { lang: Option>, inner: Cow<'a, str>, @@ -56,6 +63,13 @@ impl Token<'_> { Token::PlainTag(tag) => Token::PlainTag(Cow::Owned(tag.clone().into_owned())), Token::InlineCode(code) => Token::InlineCode(Cow::Owned(code.clone().into_owned())), Token::InlineMath(math) => Token::InlineMath(Cow::Owned(math.clone().into_owned())), + Token::UrlRaw(url) => Token::UrlRaw(Cow::Owned(url.clone().into_owned())), + Token::UrlNoEmbed(url) => Token::UrlNoEmbed(Cow::Owned(url.clone().into_owned())), + Token::Link { embed, label, href } => Token::Link { + embed: *embed, + label: 
Cow::Owned(label.clone().into_owned()), + href: Cow::Owned(href.clone().into_owned()), + }, Token::BlockCode { inner, lang } => Token::BlockCode { lang: lang.as_ref().map(|l| Cow::Owned(l.clone().into_owned())), inner: Cow::Owned(inner.clone().into_owned()), @@ -184,12 +198,8 @@ fn spliced<'a>( } fn space(input: Span) -> IResult { - let start = input; - let (input, _) = alt((complete::char('\u{0020}'), complete::char('\u{3000}'), tab))(input)?; - Ok(( - input, - Token::PlainText(start.fragment_between(&input).into()), - )) + let (input, frag) = recognize(alt((one_char('\u{0020}'), one_char('\u{3000}'), tab)))(input)?; + Ok((input, Token::PlainText(frag.into_fragment().into()))) } struct Context; @@ -213,6 +223,11 @@ impl Context { Ok((input, token)) } + fn inline_no_link<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let (input, token) = alt((self.partial(Self::tag_small), self.partial(Self::text)))(input)?; + Ok((input, token)) + } + fn tag_quote<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?; @@ -550,4 +565,147 @@ impl Context { Token::PlainText(before.fragment_between(&input).into()), )) } + + fn url<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let (input, url_span) = recognize(tuple(( + protocol, + url_chars(|input| not(url_chars_base)(input), false), + )))(input)?; + + let url = url_span.into_fragment(); + let url_bytes = url.as_bytes(); + + // Strip punctuation at the end of sentences that might have been consumed as a part of the URL + let final_url = if matches!(url_bytes.last(), Some(b'.' 
| b',' | b'?')) { + url.slice(..url.len() - 1) + } else { + url + }; + + Ok((input, Token::UrlRaw(Cow::from(final_url)))) + } + + fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let (input, _) = tag("<")(input)?; + let (input, url_span) = recognize(tuple((protocol, url_chars(tag(">"), true))))(input)?; + let (input, _) = tag(">")(input)?; + + Ok((input, Token::UrlRaw(Cow::from(url_span.into_fragment())))) + } + + fn link<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let (input, no_embed) = opt(tag("?"))(input)?; + let (input, _) = tag("[")(input)?; + let (input, _) = not(tag("["))(input)?; + let (input, label_span) = + recognize(many1(tuple((not(tag("](")), not_line_ending))))(input)?; + let (input, _) = tag("]")(input)?; + let (input, _) = tag("(")(input)?; + let (input, url_span) = recognize(tuple((protocol, url_chars(tag("]"), true))))(input)?; + let (input, _) = tag(")")(input)?; + + Ok(( + input, + Token::Link { + label: label_span.into_fragment().into(), + href: url_span.into_fragment().into(), + embed: no_embed.is_none(), + }, + )) + } +} + +#[inline] +fn protocol(input: Span) -> IResult { + alt((tag("https://"), tag("http://")))(input) +} + +#[inline] +fn url_chars_base(input: Span) -> IResult { + recognize(alt((alpha1, recognize(one_of(".,_/:%#$&?!~=+-()[]@")))))(input) +} + +#[inline] +fn url_chars<'a, T: 'a>( + terminator: impl Fn(Span<'a>) -> IResult, T> + 'a, + spaces: bool, +) -> impl FnMut(Span<'a>) -> IResult, Span<'a>> + 'a { + let terminating = move |input| { + tuple(( + &terminator, + alt(( + space1, + line_ending, + eof, + recognize(one_of("([<'\"")), + recognize(tuple(( + alt((alpha1, recognize(one_of("*")))), + alt((space1, line_ending, eof)), + ))), + )), + ))(input) + }; + + let chars = tuple(( + not(tuple((space1, eof))), + not(tuple((space1, tag("\"")))), + not(tuple((opt(space1), terminating))), + alt((url_chars_base, if spaces { space1 } else { fail })), + )); + + recognize(many1_count(chars)) +} + 
+#[cfg(test)] +mod test { + use crate::{url_chars, Span}; + use nom::bytes::complete::tag; + + #[test] + fn parse_url_chars() { + let test1 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security))"; + assert_eq!( + "https://en.wikipedia.org/wiki/Sandbox_(computer_security)", + url_chars(tag(")"), true)(Span::new(test1)) + .unwrap() + .1 + .into_fragment() + ); + + let test2 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))"; + assert_eq!( + "https://en.wikipedia.org/wiki/Sandbox_(computer_security))", + url_chars(tag(")"), true)(Span::new(test2)) + .unwrap() + .1 + .into_fragment() + ); + + let test3 = "https://en.wikipedia.org/wiki/("; + assert_eq!( + test3, + url_chars(tag(")"), true)(Span::new(test3)) + .unwrap() + .1 + .into_fragment() + ); + + let test4 = "https://cs.wikipedia.org/wiki/Among_Us "; + assert_eq!( + "https://cs.wikipedia.org/wiki/Among_Us", + url_chars(tag(")"), true)(Span::new(test4)) + .unwrap() + .1 + .into_fragment() + ); + + let test5 = "https://cs.wikipedia.org/wiki/Among Us )"; + assert_eq!( + "https://cs.wikipedia.org/wiki/Among Us", + url_chars(tag(")"), true)(Span::new(test5)) + .unwrap() + .1 + .into_fragment() + ); + } } From 52dc491a47cd8c5a62a24b4ad03dda2e928df513 Mon Sep 17 00:00:00 2001 From: Natty Date: Thu, 5 Oct 2023 20:05:03 +0200 Subject: [PATCH 06/23] Mention parsing --- magnetar_mmm_parser/src/lib.rs | 67 +++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 62d7116..d4e9d6e 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -4,7 +4,7 @@ use nom::character::complete::{ alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of, space1, tab, }; -use nom::combinator::{eof, fail, not, opt, recognize}; +use nom::combinator::{eof, fail, map, not, opt, recognize}; use nom::error::ErrorKind; use nom::multi::{many0, many0_count, many1, 
many1_count, separated_list1}; use nom::sequence::tuple; @@ -13,6 +13,21 @@ use nom_locate::LocatedSpan; use std::borrow::Cow; use std::collections::HashMap; +#[derive(Copy, Clone, Debug)] +pub enum MentionType { + Community, + User, +} + +impl MentionType { + pub fn to_char(&self) -> char { + match self { + MentionType::Community => '!', + MentionType::User => '@', + } + } +} + #[derive(Clone, Debug)] pub enum Token<'a> { PlainText(Cow<'a, str>), @@ -45,6 +60,11 @@ pub enum Token<'a> { params: HashMap, Option>>, inner: Box>, }, + Mention { + name: Cow<'a, str>, + host: Option>, + mention_type: MentionType, + }, } impl Token<'_> { @@ -92,6 +112,15 @@ impl Token<'_> { .collect(), inner: Box::new(inner.owned()), }, + Token::Mention { + name, + host, + mention_type, + } => Token::Mention { + name: Cow::Owned(name.clone().into_owned()), + host: host.as_ref().map(|v| Cow::Owned(v.clone().into_owned())), + mention_type: *mention_type, + }, } } } @@ -613,6 +642,42 @@ impl Context { }, )) } + + fn mention<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + // TODO: Escaping and skip when preceded by alphanumerics + + let tags = one_of("@!"); + let (input, mention_type) = map(tags, |c| match c { + '@' => MentionType::User, + '!' 
=> MentionType::Community, + _ => unreachable!(), + })(input)?; + + let (input, name) = map( + recognize(many1(alt((alphanumeric1, recognize(one_of("-_")))))), + Span::into_fragment, + )(input)?; + + let (input, host) = map( + opt(tuple(( + tag("@"), + map( + recognize(many1(alt((alphanumeric1, recognize(one_of("-_")))))), + Span::into_fragment, + ), + ))), + |maybe_tag_host| maybe_tag_host.map(|(_, host)| host), + )(input)?; + + Ok(( + input, + Token::Mention { + mention_type, + name: name.into(), + host: host.map(|h| h.into()), + }, + )) + } } #[inline] From 8009546bfe31e7442a4374a28ff013113ee5a10b Mon Sep 17 00:00:00 2001 From: Natty Date: Thu, 5 Oct 2023 21:21:23 +0200 Subject: [PATCH 07/23] Emoji parsing --- Cargo.lock | 11 ++++++++++ Cargo.toml | 3 ++- magnetar_mmm_parser/Cargo.toml | 4 +++- magnetar_mmm_parser/src/lib.rs | 37 +++++++++++++++++++++++++++++++++- 4 files changed, 52 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9abfe30..aa58d5f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -748,6 +748,15 @@ dependencies = [ "serde", ] +[[package]] +name = "emojis" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee61eb945bff65ee7d19d157d39c67c33290ff0742907413fd5eefd29edc979" +dependencies = [ + "phf", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -1613,8 +1622,10 @@ dependencies = [ name = "mmm_parser" version = "0.2.1-alpha" dependencies = [ + "emojis", "nom", "nom_locate", + "unicode-segmentation", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index a7a960f..f504d67 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ cached = "0.46" cfg-if = "1" chrono = "0.4" dotenvy = "0.15" +emojis = "0.6" futures-core = "0.3" futures-util = "0.3" headers = "0.3" @@ -101,4 +102,4 @@ toml = { workspace = true } unicode-segmentation = { workspace = true } [profile.release] -lto = true \ No newline at end of file +lto = true diff --git a/magnetar_mmm_parser/Cargo.toml 
b/magnetar_mmm_parser/Cargo.toml index 8a07618..30c2bad 100644 --- a/magnetar_mmm_parser/Cargo.toml +++ b/magnetar_mmm_parser/Cargo.toml @@ -5,5 +5,7 @@ edition.workspace = true license = "MIT OR Apache-2.0" [dependencies] +emojis = { workspace = true } nom = { workspace = true } -nom_locate = { workspace = true } \ No newline at end of file +nom_locate = { workspace = true } +unicode-segmentation = { workspace = true } diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index d4e9d6e..a3ddcd1 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -12,6 +12,7 @@ use nom::{IResult, Offset, Slice}; use nom_locate::LocatedSpan; use std::borrow::Cow; use std::collections::HashMap; +use unicode_segmentation::UnicodeSegmentation; #[derive(Copy, Clone, Debug)] pub enum MentionType { @@ -65,6 +66,8 @@ pub enum Token<'a> { host: Option>, mention_type: MentionType, }, + UnicodeEmoji(Cow<'a, str>), + ShortcodeEmoji(Cow<'a, str>), } impl Token<'_> { @@ -121,6 +124,10 @@ impl Token<'_> { host: host.as_ref().map(|v| Cow::Owned(v.clone().into_owned())), mention_type: *mention_type, }, + Token::UnicodeEmoji(code) => Token::UnicodeEmoji(Cow::Owned(code.clone().into_owned())), + Token::ShortcodeEmoji(shortcode) => { + Token::ShortcodeEmoji(Cow::Owned(shortcode.clone().into_owned())) + } } } } @@ -643,6 +650,24 @@ impl Context { )) } + fn unicode_emoji<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let frag = input.fragment(); + let Some(grapheme) = frag.graphemes(true).next() else { + return fail(input); + }; + + let emoji = emojis::get(grapheme); + + if emoji.is_none() { + return fail(input); + } + + Ok(( + input.slice(grapheme.len()..), + Token::UnicodeEmoji(grapheme.into()), + )) + } + fn mention<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { // TODO: Escaping and skip when preceded by alphanumerics @@ -723,8 +748,9 @@ fn url_chars<'a, T: 'a>( #[cfg(test)] mod test { - use crate::{url_chars, Span}; + use 
crate::{url_chars, Context, Span}; use nom::bytes::complete::tag; + use nom::multi::many1; #[test] fn parse_url_chars() { @@ -773,4 +799,13 @@ mod test { .into_fragment() ); } + + #[test] + fn parse_emoji() { + let test = "🥺💜❤️🦊"; + let ctx = Context; + let tokens = many1(ctx.partial(Context::unicode_emoji))(Span::from(test)).unwrap(); + + println!("{:#?}", tokens.1) + } } From 7c8e65f5562d22455c7c45b0029af13d61db335a Mon Sep 17 00:00:00 2001 From: Natty Date: Thu, 5 Oct 2023 22:02:46 +0200 Subject: [PATCH 08/23] Hashtag parsing --- magnetar_mmm_parser/src/lib.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index a3ddcd1..93f4da1 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -68,6 +68,7 @@ pub enum Token<'a> { }, UnicodeEmoji(Cow<'a, str>), ShortcodeEmoji(Cow<'a, str>), + Hashtag(Cow<'a, str>), } impl Token<'_> { @@ -128,6 +129,7 @@ impl Token<'_> { Token::ShortcodeEmoji(shortcode) => { Token::ShortcodeEmoji(Cow::Owned(shortcode.clone().into_owned())) } + Token::Hashtag(url) => Token::Hashtag(Cow::Owned(url.clone().into_owned())), } } } @@ -703,6 +705,33 @@ impl Context { }, )) } + + fn hashtag<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + // TODO: Skip when preceded by alphanumerics + + let (input, _) = tag("#")(input)?; + + let (input, hashtag_text) = + map(recognize(many1(hashtag_chars)), Span::into_fragment)(input)?; + + Ok((input, Token::Hashtag(hashtag_text.into()))) + } +} + +#[inline] +fn hashtag_chars(input: Span) -> IResult { + recognize(alt(( + recognize(tuple((tag("("), hashtag_chars, tag(")")))), + recognize(tuple((tag("["), hashtag_chars, tag("]")))), + recognize(tuple((tag("「"), hashtag_chars, tag("」")))), + recognize(tuple((tag("("), hashtag_chars, tag(")")))), + recognize(tuple(( + not(space1), + not_line_ending, + not(one_of(".,:;!?#?/[]【】()「」()<>")), + anychar, + ))), + )))(input) } #[inline] From 
a6ee6bfbde032f2b46c8a603e68ed119f5b8fdd1 Mon Sep 17 00:00:00 2001 From: Natty Date: Thu, 5 Oct 2023 22:12:51 +0200 Subject: [PATCH 09/23] Plain tag parsing --- magnetar_mmm_parser/src/lib.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 93f4da1..b08ae5b 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -453,6 +453,20 @@ impl Context { )) } + fn tag_plain<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let opening_tag = &tag(""); + let closing_tag = &tag(""); + + let (input, _) = opening_tag(input)?; + let (input, text) = map( + recognize(many1(tuple((not_line_ending, not(closing_tag))))), + Span::into_fragment, + )(input)?; + let (input, _) = closing_tag(input)?; + + Ok((input, Token::PlainTag(text.into()))) + } + fn tag_small<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( "", From 4431a3ad6274e8f608a2bafbb8f9962599ea87d7 Mon Sep 17 00:00:00 2001 From: Natty Date: Thu, 5 Oct 2023 22:25:29 +0200 Subject: [PATCH 10/23] Code block parsing --- magnetar_mmm_parser/src/lib.rs | 39 ++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index b08ae5b..6cc9cb7 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -331,6 +331,45 @@ impl Context { Ok((input, boxing_sequence(Token::Center)(tokens))) } + fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let delim = &tag("```"); + + let (input, _) = opt(line_ending)(input)?; + + if input.get_column() != 0 { + return fail(input); + } + + let (input, _) = delim(input)?; + let (input, lang) = opt(map( + recognize(many1(tuple((not(delim), not_line_ending)))), + Span::into_fragment, + ))(input)?; + let (input, _) = line_ending(input)?; + + let (input, code) = map( + recognize(many1_count(tuple(( + not(tuple((line_ending, delim))), + 
anychar, + )))), + Span::into_fragment, + )(input)?; + + let (input, _) = line_ending(input)?; + let (input, _) = delim(input)?; + let (input, _) = many0(space)(input)?; + let (input, _) = not(not_line_ending)(input)?; + let (input, _) = opt(line_ending)(input)?; + + Ok(( + input, + Token::BlockCode { + lang: lang.map(<&str>::into), + inner: code.into(), + }, + )) + } + fn tag_block_math<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let start = &tag("\\["); let end = &tag("\\]"); From c45ec852dd57d7b9299b3130f995ef3cd056abdd Mon Sep 17 00:00:00 2001 From: Natty Date: Thu, 5 Oct 2023 22:32:53 +0200 Subject: [PATCH 11/23] Shortcode emoji parsing --- magnetar_mmm_parser/src/lib.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 6cc9cb7..4fbb9ef 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -723,7 +723,20 @@ impl Context { )) } - fn mention<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn shortcode_emoji<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + // TODO: Fail when preceded by alphanumerics + let (input, _) = tag(":")(input)?; + let (input, shortcode) = map( + recognize(many1(alt((alphanumeric1, recognize(one_of("_+-")))))), + Span::into_fragment, + )(input)?; + let (input, _) = tag(":")(input)?; + let (input, _) = not(alphanumeric1)(input)?; + + Ok((input, Token::ShortcodeEmoji(shortcode.into()))) + } + + fn tag_mention<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { // TODO: Escaping and skip when preceded by alphanumerics let tags = one_of("@!"); From 453891ddf418744c1b649097227dad9b5af9924a Mon Sep 17 00:00:00 2001 From: Natty Date: Fri, 6 Oct 2023 00:17:52 +0200 Subject: [PATCH 12/23] Connected it all --- magnetar_mmm_parser/src/lib.rs | 249 +++++++++++++++++++++++++++------ 1 file changed, 204 insertions(+), 45 deletions(-) diff --git a/magnetar_mmm_parser/src/lib.rs 
b/magnetar_mmm_parser/src/lib.rs index 4fbb9ef..ed90585 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -6,7 +6,7 @@ use nom::character::complete::{ }; use nom::combinator::{eof, fail, map, not, opt, recognize}; use nom::error::ErrorKind; -use nom::multi::{many0, many0_count, many1, many1_count, separated_list1}; +use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1}; use nom::sequence::tuple; use nom::{IResult, Offset, Slice}; use nom_locate::LocatedSpan; @@ -14,7 +14,7 @@ use std::borrow::Cow; use std::collections::HashMap; use unicode_segmentation::UnicodeSegmentation; -#[derive(Copy, Clone, Debug)] +#[derive(Copy, Clone, Debug, Eq, PartialEq)] pub enum MentionType { Community, User, @@ -29,7 +29,7 @@ impl MentionType { } } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Eq, PartialEq)] pub enum Token<'a> { PlainText(Cow<'a, str>), Sequence(Vec>), @@ -132,6 +132,49 @@ impl Token<'_> { Token::Hashtag(url) => Token::Hashtag(Cow::Owned(url.clone().into_owned())), } } + + fn merged(&self) -> Token { + match self { + Token::Sequence(tokens) => { + let tokens_multi = tokens.iter().fold(Vec::new(), |mut acc, tok| { + if let Some(Token::PlainText(last)) = acc.last_mut() { + if let Token::PlainText(tok_text) = tok { + *last = Cow::from(last.to_string() + tok_text.as_ref()); + + return acc; + } + } + + acc.push(tok.merged()); + acc + }); + + if tokens_multi.len() == 1 { + return tokens_multi.into_iter().next().unwrap(); + } + + Token::Sequence(tokens_multi) + } + Token::Quote(inner) => Token::Quote(Box::new(inner.merged())), + Token::Small(inner) => Token::Small(Box::new(inner.merged())), + Token::Big(inner) => Token::Big(Box::new(inner.merged())), + Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.merged())), + Token::Bold(inner) => Token::Bold(Box::new(inner.merged())), + Token::Italic(inner) => Token::Italic(Box::new(inner.merged())), + Token::Center(inner) => 
Token::Center(Box::new(inner.merged())), + Token::Strikethrough(inner) => Token::Strikethrough(Box::new(inner.merged())), + Token::Function { + name, + params, + inner, + } => Token::Function { + name: name.clone(), + params: params.clone(), + inner: Box::new(inner.merged()), + }, + other => other.clone(), + } + } } type Span<'a> = LocatedSpan<&'a str>; @@ -244,25 +287,103 @@ struct Context; impl Context { #[inline] - const fn partial<'a>( + const fn partial( &self, - func: impl Fn(&Self, Span<'a>) -> IResult, Token<'a>> + 'static, - ) -> impl Fn(Span<'a>) -> IResult, Token<'a>> + '_ { + func: impl for<'a> Fn(&Self, Span<'a>) -> IResult, Token<'a>> + 'static, + ) -> impl for<'a> Fn(Span<'a>) -> IResult, Token<'a>> + '_ { move |input| func(self, input) } - fn root<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { - let (input, token) = alt((self.partial(Self::tag_quote),))(input)?; - Ok((input, token)) + fn full<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + map(many1(self.partial(Self::full_single)), Token::Sequence)(input) } fn inline<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { - let (input, token) = alt((self.partial(Self::tag_small), self.partial(Self::text)))(input)?; + map(many1(self.partial(Self::inline_single)), Token::Sequence)(input) + } + + fn inline_label_safe<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + map( + many1(self.partial(Self::inline_label_safe_single)), + Token::Sequence, + )(input) + } + + fn base_bold_italic<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + alt(( + self.partial(Self::tag_bold_italic_asterisk), + self.partial(Self::tag_bold_italic_underscore), + self.partial(Self::tag_bold_asterisk), + self.partial(Self::tag_italic_asterisk), + self.partial(Self::tag_bold_underscore), + self.partial(Self::tag_italic_underscore), + ))(input) + } + + fn full_single<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let (input, token) = alt(( + self.partial(Self::unicode_emoji), + 
self.partial(Self::tag_block_center), + self.partial(Self::tag_small), + self.partial(Self::tag_plain), + self.partial(Self::tag_bold), + self.partial(Self::tag_italic), + self.partial(Self::tag_strikethrough), + self.partial(Self::url_no_embed), + self.partial(Self::base_bold_italic), + self.partial(Self::tag_block_code), + self.partial(Self::tag_inline_code), + self.partial(Self::tag_quote), + self.partial(Self::tag_block_math), + self.partial(Self::tag_inline_math), + self.partial(Self::tag_strikethrough_tilde), + self.partial(Self::tag_func), + self.partial(Self::tag_mention), + self.partial(Self::tag_hashtag), + self.partial(Self::shortcode_emoji), + self.partial(Self::raw_url), + self.partial(Self::text), + ))(input)?; Ok((input, token)) } - fn inline_no_link<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { - let (input, token) = alt((self.partial(Self::tag_small), self.partial(Self::text)))(input)?; + fn inline_single<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let (input, token) = alt(( + self.partial(Self::unicode_emoji), + self.partial(Self::tag_small), + self.partial(Self::tag_plain), + self.partial(Self::tag_bold), + self.partial(Self::tag_italic), + self.partial(Self::tag_strikethrough), + self.partial(Self::url_no_embed), + self.partial(Self::base_bold_italic), + self.partial(Self::tag_inline_code), + self.partial(Self::tag_inline_math), + self.partial(Self::tag_strikethrough_tilde), + self.partial(Self::tag_func), + self.partial(Self::tag_mention), + self.partial(Self::tag_hashtag), + self.partial(Self::shortcode_emoji), + self.partial(Self::raw_url), + self.partial(Self::text), + ))(input)?; + Ok((input, token)) + } + + fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let (input, token) = alt(( + self.partial(Self::unicode_emoji), + self.partial(Self::tag_small), + self.partial(Self::tag_plain), + self.partial(Self::tag_bold), + self.partial(Self::tag_italic), + self.partial(Self::tag_strikethrough), 
+ self.partial(Self::base_bold_italic), + self.partial(Self::tag_strikethrough_tilde), + self.partial(Self::tag_func), + self.partial(Self::shortcode_emoji), + self.partial(Self::text), + ))(input)?; Ok((input, token)) } @@ -270,7 +391,7 @@ impl Context { let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?; if let (None, None) = leading_spaces { - if input.get_column() != 0 { + if input.get_column() != 1 { return fail(input); } } @@ -295,7 +416,12 @@ impl Context { return fail(input); } - let (_, inner) = spliced("e_lines, space, Token::Quote, orig_input)?; + let (_, inner) = spliced( + "e_lines, + self.partial(Self::full), + Token::Quote, + orig_input, + )?; let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?; @@ -308,27 +434,23 @@ impl Context { let (input, _) = opt(line_ending)(input)?; - if input.get_column() != 0 { + if input.get_column() != 1 { return fail(input); } let (input, _) = tag_start(input)?; let (input, _) = opt(line_ending)(input)?; - let (input, center_seq) = many0(tuple(( - not(tuple((opt(line_ending), tag_end))), - self.partial(Self::inline), - )))(input)?; + let (input, (center_seq, _)) = many_till( + self.partial(Self::inline_single), + tuple((opt(line_ending), tag_end)), + )(input)?; - let (input, _) = opt(line_ending)(input)?; - let (input, _) = tag_end(input)?; let (input, _) = many0(space)(input)?; - let (input, _) = not(not_line_ending)(input)?; + let (input, _) = not(not(line_ending))(input)?; let (input, _) = opt(line_ending)(input)?; - let tokens = center_seq.into_iter().map(|(_, v)| v).collect::>(); - - Ok((input, boxing_sequence(Token::Center)(tokens))) + Ok((input, boxing_sequence(Token::Center)(center_seq))) } fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { @@ -336,7 +458,7 @@ impl Context { let (input, _) = opt(line_ending)(input)?; - if input.get_column() != 0 { + if input.get_column() != 1 { return fail(input); } @@ -358,7 +480,7 @@ impl Context { let (input, 
_) = line_ending(input)?; let (input, _) = delim(input)?; let (input, _) = many0(space)(input)?; - let (input, _) = not(not_line_ending)(input)?; + let (input, _) = not(not(line_ending))(input)?; let (input, _) = opt(line_ending)(input)?; Ok(( @@ -376,7 +498,7 @@ impl Context { let (input, _) = opt(line_ending)(input)?; - if input.get_column() != 0 { + if input.get_column() != 1 { return fail(input); } @@ -458,8 +580,7 @@ impl Context { tag("_"), )))); - let (input, func_name_span) = func_ident(input)?; - let func_name = func_name_span.into_fragment(); + let (input, func_name) = map(func_ident, Span::into_fragment)(input)?; let arg = tuple((func_ident, opt(tuple((tag("="), param_value))))); @@ -478,16 +599,16 @@ impl Context { .collect::>() }); - let (input, inner) = self.partial(Self::inline)(input)?; + let (input, _) = opt(space)(input)?; - let (input, _) = tag("]")(input)?; + let (input, (inner, _)) = many_till(self.partial(Self::inline_single), tag("]"))(input)?; Ok(( input, Token::Function { name: Cow::from(func_name), params: args_out, - inner: Box::new(inner), + inner: Box::new(Token::Sequence(inner)), }, )) } @@ -649,15 +770,11 @@ impl Context { } fn text<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { - let before = input; - let (input, _) = anychar(input)?; - Ok(( - input, - Token::PlainText(before.fragment_between(&input).into()), - )) + let (input, text) = map(recognize(anychar), Span::into_fragment)(input)?; + Ok((input, Token::PlainText(text.into()))) } - fn url<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn raw_url<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let (input, url_span) = recognize(tuple(( protocol, url_chars(|input| not(url_chars_base)(input), false), @@ -688,8 +805,10 @@ impl Context { let (input, no_embed) = opt(tag("?"))(input)?; let (input, _) = tag("[")(input)?; let (input, _) = not(tag("["))(input)?; - let (input, label_span) = - recognize(many1(tuple((not(tag("](")), not_line_ending))))(input)?; + let 
(input, label_span) = recognize(many1(tuple(( + not(tag("](")), + self.partial(Self::inline_label_safe_single), + ))))(input)?; let (input, _) = tag("]")(input)?; let (input, _) = tag("(")(input)?; let (input, url_span) = recognize(tuple((protocol, url_chars(tag("]"), true))))(input)?; @@ -772,7 +891,7 @@ impl Context { )) } - fn hashtag<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_hashtag<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { // TODO: Skip when preceded by alphanumerics let (input, _) = tag("#")(input)?; @@ -843,9 +962,11 @@ fn url_chars<'a, T: 'a>( #[cfg(test)] mod test { - use crate::{url_chars, Context, Span}; + use crate::{url_chars, Context, Span, Token}; use nom::bytes::complete::tag; use nom::multi::many1; + use std::borrow::Cow; + use std::collections::HashMap; #[test] fn parse_url_chars() { @@ -895,12 +1016,50 @@ mod test { ); } + #[test] + fn parse_complex() { + let emoji = r#"$[x2 $[sparkle 🥺]💜$[spin.y,speed=5s ❤️]🦊]"#; + assert_eq!( + Token::Function { + name: "x2".into(), + params: HashMap::new(), + inner: Box::new(Token::Sequence(vec![ + Token::Function { + name: "sparkle".into(), + params: HashMap::new(), + inner: Box::new(Token::UnicodeEmoji("🥺".into())), + }, + Token::UnicodeEmoji("💜".into()), + Token::Function { + name: "spin".into(), + params: { + let mut params = HashMap::new(); + params.insert("y".into(), None); + params.insert("speed".into(), Some("5s".into())); + params + }, + inner: Box::new(Token::UnicodeEmoji("❤️".into())), + }, + Token::UnicodeEmoji("🦊".into()), + ])) + }, + Context.full(Span::new(emoji)).unwrap().1.merged() + ) + } + #[test] fn parse_emoji() { let test = "🥺💜❤️🦊"; let ctx = Context; let tokens = many1(ctx.partial(Context::unicode_emoji))(Span::from(test)).unwrap(); - println!("{:#?}", tokens.1) + assert_eq!( + vec!["🥺", "💜", "❤️", "🦊"] + .into_iter() + .map(<&str as Into>>::into) + .map(Token::UnicodeEmoji) + .collect::>(), + tokens.1 + ); } } From 703e1191c2820ca39e7a35a8f320bf4924671421 
Mon Sep 17 00:00:00 2001 From: Natty Date: Sat, 7 Oct 2023 01:46:20 +0200 Subject: [PATCH 13/23] Janky sequence unnesting and attempting to salvage nested parsing in incorrect formatting tags --- Cargo.lock | 1 + Cargo.toml | 1 + magnetar_mmm_parser/Cargo.toml | 1 + magnetar_mmm_parser/src/lib.rs | 245 ++++++++++++++++++++++++++------- 4 files changed, 200 insertions(+), 48 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aa58d5f..e2d79c5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1622,6 +1622,7 @@ dependencies = [ name = "mmm_parser" version = "0.2.1-alpha" dependencies = [ + "either", "emojis", "nom", "nom_locate", diff --git a/Cargo.toml b/Cargo.toml index f504d67..c326183 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ cached = "0.46" cfg-if = "1" chrono = "0.4" dotenvy = "0.15" +either = "1.9" emojis = "0.6" futures-core = "0.3" futures-util = "0.3" diff --git a/magnetar_mmm_parser/Cargo.toml b/magnetar_mmm_parser/Cargo.toml index 30c2bad..25faa6b 100644 --- a/magnetar_mmm_parser/Cargo.toml +++ b/magnetar_mmm_parser/Cargo.toml @@ -5,6 +5,7 @@ edition.workspace = true license = "MIT OR Apache-2.0" [dependencies] +either = { workspace = true } emojis = { workspace = true } nom = { workspace = true } nom_locate = { workspace = true } diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index ed90585..74d98ea 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -1,3 +1,4 @@ +use either::Either; use nom::branch::alt; use nom::bytes::complete::tag; use nom::character::complete::{ @@ -12,6 +13,7 @@ use nom::{IResult, Offset, Slice}; use nom_locate::LocatedSpan; use std::borrow::Cow; use std::collections::HashMap; +use std::convert::identity; use unicode_segmentation::UnicodeSegmentation; #[derive(Copy, Clone, Debug, Eq, PartialEq)] @@ -145,6 +147,27 @@ impl Token<'_> { } } + if let Token::Sequence(seq) = tok { + let items = seq.iter().map(Token::merged).flat_map(|t| match t { + 
Token::Sequence(seq) => Either::Left(seq.into_iter()), + other => Either::Right(std::iter::once(other)), + }); + + for item in items { + if let Some(Token::PlainText(last)) = acc.last_mut() { + if let Token::PlainText(tok_text) = item { + *last = Cow::from(last.to_string() + tok_text.as_ref()); + + continue; + } + } + + acc.push(item); + } + + return acc; + } + acc.push(tok.merged()); acc }); @@ -200,13 +223,13 @@ impl SliceOffset for Span<'_> { } } -const fn boxing_sequence<'a>( - func: impl Fn(Box>) -> Token<'a>, -) -> impl Fn(Vec>) -> Token<'a> { - move |tokens| func(Box::new(Token::Sequence(tokens))) +#[inline] +fn boxing_token<'a>(func: impl Fn(Box>) -> Token<'a>) -> impl Fn(Token<'a>) -> Token<'a> { + move |tokens| func(Box::new(tokens)) } -const fn collect_char_sequence<'a>( +#[inline] +fn collect_char_sequence<'a>( func: impl Fn(Cow<'a, str>) -> Token<'a>, ) -> impl Fn(Vec) -> Token<'a> { move |chars| func(Cow::Owned(chars.into_iter().collect())) @@ -215,14 +238,14 @@ const fn collect_char_sequence<'a>( fn spliced<'a>( segments: &[Span<'a>], func: impl Fn(Span) -> IResult, - output_mapper: impl Fn(Box>) -> Token<'static>, parent: Span<'a>, ) -> IResult, Token<'static>, nom::error::Error>> { let combined = segments .iter() .copied() .map(Span::into_fragment) - .collect::(); + .collect::>() + .join("\n"); let cum_offset_combined = segments .iter() .scan(0, |acc, &x| { @@ -234,7 +257,7 @@ fn spliced<'a>( cum_offset_combined .iter() .enumerate() - .filter(|(_, &o)| o >= input.location_offset()) + .take_while(|(_, &o)| o > input.location_offset()) .map(|(i, o)| (segments[i], o)) .last() }; @@ -275,7 +298,7 @@ fn spliced<'a>( parent }; - Ok((out, output_mapper(Box::new(inner.owned())))) + Ok((out, inner.owned())) } fn space(input: Span) -> IResult { @@ -370,6 +393,22 @@ impl Context { Ok((input, token)) } + fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + let (input, token) = alt(( + self.partial(Self::unicode_emoji), + 
self.partial(Self::url_no_embed), + self.partial(Self::tag_inline_code), + self.partial(Self::tag_inline_math), + self.partial(Self::tag_func), + self.partial(Self::tag_mention), + self.partial(Self::tag_hashtag), + self.partial(Self::shortcode_emoji), + self.partial(Self::raw_url), + self.partial(Self::text), + ))(input)?; + Ok((input, token)) + } + fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let (input, token) = alt(( self.partial(Self::unicode_emoji), @@ -416,12 +455,7 @@ impl Context { return fail(input); } - let (_, inner) = spliced( - "e_lines, - self.partial(Self::full), - Token::Quote, - orig_input, - )?; + let (_, inner) = spliced("e_lines, self.partial(Self::full), orig_input)?; let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?; @@ -450,7 +484,10 @@ impl Context { let (input, _) = not(not(line_ending))(input)?; let (input, _) = opt(line_ending)(input)?; - Ok((input, boxing_sequence(Token::Center)(center_seq))) + Ok(( + input, + boxing_token(Token::Center)(Token::Sequence(center_seq)), + )) } fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { @@ -522,13 +559,16 @@ impl Context { )) } - const fn tag_delimited<'a, 'b: 'a, T>( + #[inline] + fn tag_delimited<'a, 'b: 'a, T>( &'a self, start: &'b str, end: &'b str, escape: bool, matcher_inner: impl Fn(Span<'b>) -> IResult, T> + 'a, - mapper: impl Fn(Vec) -> Token<'b> + 'a, + matcher_inner_fallback: impl Fn(Span<'b>) -> IResult, T> + 'a, + collector: impl Fn(Vec) -> Token<'b> + 'a, + mapper: impl Fn(Token<'b>) -> Token<'b> + 'a, ) -> impl Fn(Span<'b>) -> IResult, Token<'b>> + '_ { move |input| { let opening_tag = &tag(start); @@ -548,18 +588,40 @@ impl Context { closing_tag, ))(post_open); - if let Err(nom::Err::Error(nom::error::Error { .. })) = res { + if let Err(nom::Err::Error(nom::error::Error { + input: input_past_err, + .. 
+ })) = res + { + let res_fallback = tuple(( + many1(tuple((not(closing_tag), &matcher_inner_fallback))), + closing_tag, + ))(post_open); + + if res_fallback.is_err() { + return Ok(( + input_past_err, + Token::PlainText(begin.fragment_between(&input_past_err).into()), + )); + } + + let (input, (inner, closing)) = res_fallback.unwrap(); + let inner = inner.into_iter().map(|(_, t)| t).collect::>(); + return Ok(( - post_open, - Token::PlainText(begin.fragment_between(&post_open).into()), + input, + Token::Sequence(vec![ + Token::PlainText(begin.fragment_between(&post_open).into()), + collector(inner), + Token::PlainText(closing.into_fragment().into()), + ]), )); } let (input, (inner, _)) = res?; - let inner = inner.into_iter().map(|(_, t)| t).collect::>(); - Ok((input, mapper(inner))) + Ok((input, mapper(collector(inner)))) } } @@ -632,8 +694,10 @@ impl Context { "", "", false, - self.partial(Self::inline), - boxing_sequence(Token::Small), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::Small), )(input) } @@ -643,8 +707,10 @@ impl Context { "***", "***", true, - self.partial(Self::inline), - boxing_sequence(Token::BoldItalic), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::BoldItalic), )(input) } @@ -654,8 +720,10 @@ impl Context { "___", "___", true, - self.partial(Self::inline), - boxing_sequence(Token::BoldItalic), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::BoldItalic), )(input) } @@ -664,8 +732,10 @@ impl Context { "", "", false, - self.partial(Self::inline), - boxing_sequence(Token::Bold), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::Bold), )(input) } @@ -675,8 +745,10 @@ impl Context { "**", "**", true, - 
self.partial(Self::inline), - boxing_sequence(Token::Bold), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::Bold), )(input) } @@ -686,8 +758,10 @@ impl Context { "__", "__", true, - self.partial(Self::inline), - boxing_sequence(Token::Bold), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::Bold), )(input) } @@ -696,8 +770,10 @@ impl Context { "", "", false, - self.partial(Self::inline), - boxing_sequence(Token::Italic), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::Italic), )(input) } @@ -707,8 +783,10 @@ impl Context { "*", "*", true, - self.partial(Self::inline), - boxing_sequence(Token::Italic), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::Italic), )(input) } @@ -718,8 +796,10 @@ impl Context { "_", "_", true, - self.partial(Self::inline), - boxing_sequence(Token::Italic), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::Italic), )(input) } @@ -728,8 +808,10 @@ impl Context { "", "", false, - self.partial(Self::inline), - boxing_sequence(Token::Strikethrough), + self.partial(Self::inline_single), + self.partial(Self::inline_non_formatting_single), + Token::Sequence, + boxing_token(Token::Strikethrough), )(input) } @@ -740,9 +822,18 @@ impl Context { "~~", true, move |input| { - tuple((not_line_ending, self.partial(Self::inline)))(input).map(|(i, t)| (i, t.1)) + tuple((not_line_ending, self.partial(Self::inline_single)))(input) + .map(|(i, t)| (i, t.1)) }, - boxing_sequence(Token::Strikethrough), + move |input| { + tuple(( + not_line_ending, + self.partial(Self::inline_non_formatting_single), + ))(input) + .map(|(i, t)| (i, t.1)) + }, + Token::Sequence, + 
boxing_token(Token::Strikethrough), )(input) } @@ -755,7 +846,9 @@ impl Context { tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar))(input) .map(|(i, (_skip, c))| (i, c)) }, + fail, collect_char_sequence(Token::InlineCode), + identity, )(input) } @@ -764,8 +857,10 @@ impl Context { "\\(", "\\)", false, - move |input| tuple((not_line_ending, anychar))(input).map(|(i, (_skip, c))| (i, c)), - collect_char_sequence(Token::InlineMath), + move |input| tuple((not(line_ending), anychar))(input).map(|(i, (_skip, c))| (i, c)), + fail, + collect_char_sequence(Token::InlineCode), + identity, )(input) } @@ -1044,7 +1139,61 @@ mod test { ])) }, Context.full(Span::new(emoji)).unwrap().1.merged() - ) + ); + + let bold_italic = r#"***bold italic***"#; + assert_eq!( + Token::BoldItalic(Box::new(Token::PlainText("bold italic".into()))), + Context.full(Span::new(bold_italic)).unwrap().1.merged() + ); + + let bold_italic_tag = r#"bold italic"#; + assert_eq!( + Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText( + "bold italic".into() + ))))), + Context.full(Span::new(bold_italic_tag)).unwrap().1.merged() + ); + + assert_eq!( + Token::Sequence(vec![ + Token::PlainText("bold ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag1".into(), + host: None + }, + Token::PlainText(" ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag2".into(), + host: None + }, + Token::PlainText(" italic".into()) + ]), + Context + .full(Span::new(r#"bold @tag1 @tag2 italic"#)) + .unwrap() + .1 + .merged() + ); + + let quote = r#" +> test +> +> italic +> +>> Nested quote +"#; + + assert_eq!( + Token::Quote(Box::new(Token::Sequence(vec![ + Token::PlainText("test\n".into()), + Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))), + Token::Quote(Box::new(Token::PlainText("Nested quote".into()))) + ]))), + Context.full(Span::new(quote)).unwrap().1.merged() + ); } #[test] From 95bce443be543c298f5676a84c31fb08839fd992 Mon 
Sep 17 00:00:00 2001 From: Natty Date: Sat, 7 Oct 2023 01:53:03 +0200 Subject: [PATCH 14/23] Fixed a code typo --- magnetar_mmm_parser/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 74d98ea..fbcfeb6 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -859,7 +859,7 @@ impl Context { false, move |input| tuple((not(line_ending), anychar))(input).map(|(i, (_skip, c))| (i, c)), fail, - collect_char_sequence(Token::InlineCode), + collect_char_sequence(Token::InlineMath), identity, )(input) } From 154cc27c07e6c72f359ca5c045eb173062588602 Mon Sep 17 00:00:00 2001 From: Natty Date: Sat, 7 Oct 2023 19:44:39 +0200 Subject: [PATCH 15/23] More precise emoji extraction and fixed center tag parsing --- Cargo.toml | 1 + magnetar_mmm_parser/src/lib.rs | 505 +++++++++++++++++++++++---------- 2 files changed, 350 insertions(+), 156 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c326183..c5d0c4e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,7 @@ nom = "7" nom_locate = "4" percent-encoding = "2.2" redis = "0.23" +regex = "1.9" reqwest = "0.11" sea-orm = "0.12" sea-orm-migration = "0.12" diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index fbcfeb6..8dccf96 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -13,7 +13,7 @@ use nom::{IResult, Offset, Slice}; use nom_locate::LocatedSpan; use std::borrow::Cow; use std::collections::HashMap; -use std::convert::identity; +use std::convert::{identity, Infallible}; use unicode_segmentation::UnicodeSegmentation; #[derive(Copy, Clone, Debug, Eq, PartialEq)] @@ -37,7 +37,6 @@ pub enum Token<'a> { Sequence(Vec>), Quote(Box>), Small(Box>), - Big(Box>), BoldItalic(Box>), Bold(Box>), Italic(Box>), @@ -80,7 +79,6 @@ impl Token<'_> { Token::Sequence(tokens) => Token::Sequence(tokens.iter().map(Token::owned).collect()), Token::Quote(inner) => 
Token::Quote(Box::new(inner.owned())), Token::Small(inner) => Token::Small(Box::new(inner.owned())), - Token::Big(inner) => Token::Big(Box::new(inner.owned())), Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.owned())), Token::Bold(inner) => Token::Bold(Box::new(inner.owned())), Token::Italic(inner) => Token::Italic(Box::new(inner.owned())), @@ -180,7 +178,6 @@ impl Token<'_> { } Token::Quote(inner) => Token::Quote(Box::new(inner.merged())), Token::Small(inner) => Token::Small(Box::new(inner.merged())), - Token::Big(inner) => Token::Big(Box::new(inner.merged())), Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.merged())), Token::Bold(inner) => Token::Bold(Box::new(inner.merged())), Token::Italic(inner) => Token::Italic(Box::new(inner.merged())), @@ -228,11 +225,19 @@ fn boxing_token<'a>(func: impl Fn(Box>) -> Token<'a>) -> impl Fn(Token move |tokens| func(Box::new(tokens)) } +#[inline] +fn collect_sequence<'a, T>( + func: impl Fn(Vec) -> Token<'a>, + transform: impl Fn(Token<'a>) -> Token<'a>, +) -> impl Fn(&mut dyn Iterator) -> Token<'a> { + move |tokens| transform(func(tokens.collect())) +} + #[inline] fn collect_char_sequence<'a>( func: impl Fn(Cow<'a, str>) -> Token<'a>, -) -> impl Fn(Vec) -> Token<'a> { - move |chars| func(Cow::Owned(chars.into_iter().collect())) +) -> impl Fn(&mut dyn Iterator) -> Token<'a> { + move |chars| func(Cow::Owned(chars.collect())) } fn spliced<'a>( @@ -306,6 +311,42 @@ fn space(input: Span) -> IResult { Ok((input, Token::PlainText(frag.into_fragment().into()))) } +struct Matcher<'a, 'b, T> { + matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult, T> + 'a), + collector: &'a (dyn Fn(&mut dyn Iterator) -> Token<'b> + 'a), + _phantom_closure: std::marker::PhantomData<&'a ()>, + _phantom_data: std::marker::PhantomData<&'b ()>, + _phantom_output: std::marker::PhantomData T>, +} + +impl<'a, 'b, T> Matcher<'a, 'b, T> { + fn new( + matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult, T> + 'a), + collector: &'a (dyn Fn(&mut 
dyn Iterator) -> Token<'b> + 'a), + ) -> Self { + Self { + matcher_inner, + collector, + _phantom_closure: std::marker::PhantomData, + _phantom_data: std::marker::PhantomData, + _phantom_output: std::marker::PhantomData, + } + } +} + +impl<'a, 'b> Matcher<'a, 'b, Infallible> { + // Don't break this invariant, else a monster will come at night and eat all your socks + fn reject() -> Self { + Self { + matcher_inner: &fail::<_, Infallible, _>, + collector: &|_| unreachable!(), + _phantom_closure: std::marker::PhantomData, + _phantom_data: std::marker::PhantomData, + _phantom_output: std::marker::PhantomData, + } + } +} + struct Context; impl Context { @@ -477,13 +518,9 @@ impl Context { let (input, (center_seq, _)) = many_till( self.partial(Self::inline_single), - tuple((opt(line_ending), tag_end)), + tuple((opt(space1), opt(line_ending), tag_end)), )(input)?; - let (input, _) = many0(space)(input)?; - let (input, _) = not(not(line_ending))(input)?; - let (input, _) = opt(line_ending)(input)?; - Ok(( input, boxing_token(Token::Center)(Token::Sequence(center_seq)), @@ -560,23 +597,21 @@ impl Context { } #[inline] - fn tag_delimited<'a, 'b: 'a, T>( + fn tag_delimited<'a, 'b: 'a, T, S>( &'a self, - start: &'b str, - end: &'b str, + opening_tag: impl Fn(Span<'b>) -> IResult, Span<'b>> + 'a, + closing_tag: impl Fn(Span<'b>) -> IResult, Span<'b>> + 'a, escape: bool, - matcher_inner: impl Fn(Span<'b>) -> IResult, T> + 'a, - matcher_inner_fallback: impl Fn(Span<'b>) -> IResult, T> + 'a, - collector: impl Fn(Vec) -> Token<'b> + 'a, - mapper: impl Fn(Token<'b>) -> Token<'b> + 'a, + matcher: Matcher<'a, 'b, T>, + fallback: Matcher<'a, 'b, S>, ) -> impl Fn(Span<'b>) -> IResult, Token<'b>> + '_ { move |input| { - let opening_tag = &tag(start); - let closing_tag = &tag(end); - if escape { - if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), opening_tag))(input) { - return Ok((input_escaped, Token::PlainText(Cow::Borrowed(&mark)))); + if let Ok((input_escaped, (_, mark))) = 
tuple((tag("\\"), &opening_tag))(input) { + return Ok(( + input_escaped, + Token::PlainText(Cow::Borrowed(mark.fragment())), + )); } } @@ -584,8 +619,8 @@ impl Context { let (post_open, _) = opening_tag(input)?; let res = tuple(( - many1(tuple((not(closing_tag), &matcher_inner))), - closing_tag, + many1(tuple((not(&closing_tag), &matcher.matcher_inner))), + &closing_tag, ))(post_open); if let Err(nom::Err::Error(nom::error::Error { @@ -594,8 +629,8 @@ impl Context { })) = res { let res_fallback = tuple(( - many1(tuple((not(closing_tag), &matcher_inner_fallback))), - closing_tag, + many1(tuple((not(&closing_tag), &fallback.matcher_inner))), + &closing_tag, ))(post_open); if res_fallback.is_err() { @@ -606,22 +641,22 @@ impl Context { } let (input, (inner, closing)) = res_fallback.unwrap(); - let inner = inner.into_iter().map(|(_, t)| t).collect::>(); + let mut inner = inner.into_iter().map(|(_, t)| t); return Ok(( input, Token::Sequence(vec![ Token::PlainText(begin.fragment_between(&post_open).into()), - collector(inner), + ((fallback.collector)(&mut inner)), Token::PlainText(closing.into_fragment().into()), ]), )); } let (input, (inner, _)) = res?; - let inner = inner.into_iter().map(|(_, t)| t).collect::>(); + let mut inner = inner.into_iter().map(|(_, t)| t); - Ok((input, mapper(collector(inner)))) + Ok((input, (matcher.collector)(&mut inner))) } } @@ -691,176 +726,230 @@ impl Context { fn tag_small<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - "", - "", + tag(""), + tag(""), false, - self.partial(Self::inline_single), - self.partial(Self::inline_non_formatting_single), - Token::Sequence, - boxing_token(Token::Small), + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::Small)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), )(input) } // TODO: CommonMark flanking rules fn 
tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - "***", - "***", + tag("***"), + tag("***"), true, - self.partial(Self::inline_single), - self.partial(Self::inline_non_formatting_single), - Token::Sequence, - boxing_token(Token::BoldItalic), + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::BoldItalic)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), )(input) } // TODO: CommonMark flanking rules fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - "___", - "___", + tag("___"), + tag("___"), true, - self.partial(Self::inline_single), - self.partial(Self::inline_non_formatting_single), - Token::Sequence, - boxing_token(Token::BoldItalic), + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::BoldItalic)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), )(input) } fn tag_bold<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - "", - "", + tag(""), + tag(""), false, - self.partial(Self::inline_single), - self.partial(Self::inline_non_formatting_single), - Token::Sequence, - boxing_token(Token::Bold), + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::Bold)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), )(input) } // TODO: CommonMark flanking rules fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - "**", - "**", + tag("**"), + tag("**"), true, - self.partial(Self::inline_single), - self.partial(Self::inline_non_formatting_single), - Token::Sequence, - boxing_token(Token::Bold), + 
Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::Bold)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), )(input) } // TODO: CommonMark flanking rules fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - "__", - "__", + tag("__"), + tag("__"), true, - self.partial(Self::inline_single), - self.partial(Self::inline_non_formatting_single), - Token::Sequence, - boxing_token(Token::Bold), + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::Bold)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), )(input) } fn tag_italic<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - "", - "", + tag(""), + tag(""), false, - self.partial(Self::inline_single), - self.partial(Self::inline_non_formatting_single), - Token::Sequence, - boxing_token(Token::Italic), + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::Italic)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), )(input) } // TODO: CommonMark flanking rules fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - "*", - "*", + tag("*"), + tag("*"), true, - self.partial(Self::inline_single), - self.partial(Self::inline_non_formatting_single), - Token::Sequence, - boxing_token(Token::Italic), + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::Italic)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), )(input) } // TODO: CommonMark flanking rules fn tag_italic_underscore<'a>(&self, 
input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - "_", - "_", + tag("_"), + tag("_"), true, - self.partial(Self::inline_single), - self.partial(Self::inline_non_formatting_single), - Token::Sequence, - boxing_token(Token::Italic), + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::Italic)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), )(input) } fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - "", - "", + tag(""), + tag(""), false, - self.partial(Self::inline_single), - self.partial(Self::inline_non_formatting_single), - Token::Sequence, - boxing_token(Token::Strikethrough), + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::Strikethrough)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), )(input) } // TODO: CommonMark flanking rules fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - "~~", - "~~", + tag("~~"), + tag("~~"), true, - move |input| { - tuple((not_line_ending, self.partial(Self::inline_single)))(input) - .map(|(i, t)| (i, t.1)) - }, - move |input| { - tuple(( - not_line_ending, - self.partial(Self::inline_non_formatting_single), - ))(input) - .map(|(i, t)| (i, t.1)) - }, - Token::Sequence, - boxing_token(Token::Strikethrough), + Matcher::new( + &move |input| { + map( + tuple(((not(line_ending)), self.partial(Self::inline_single))), + |(_, captured)| captured, + )(input) + }, + &collect_sequence(Token::Sequence, boxing_token(Token::Strikethrough)), + ), + Matcher::new( + &move |input| { + map( + tuple(( + (not(line_ending)), + self.partial(Self::inline_non_formatting_single), + )), + |(_, captured)| captured, + )(input) + }, + &collect_sequence(Token::Sequence, 
identity), + ), )(input) } fn tag_inline_code<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - "`", - "", + tag("`"), + |input| alt((tag("`"), tag("´")))(input), true, - move |input| { - tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar))(input) - .map(|(i, (_skip, c))| (i, c)) - }, - fail, - collect_char_sequence(Token::InlineCode), - identity, + Matcher::new( + &move |input| { + map( + tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar)), + |(_, captured)| captured, + )(input) + }, + &collect_char_sequence(Token::InlineCode), + ), + Matcher::reject(), )(input) } fn tag_inline_math<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - "\\(", - "\\)", + tag("\\("), + tag("\\)"), false, - move |input| tuple((not(line_ending), anychar))(input).map(|(i, (_skip, c))| (i, c)), - fail, - collect_char_sequence(Token::InlineMath), - identity, + Matcher::new( + &move |input| { + map(tuple((not(line_ending), anychar)), |(_, captured)| captured)(input) + }, + &collect_char_sequence(Token::InlineMath), + ), + Matcher::reject(), )(input) } @@ -925,6 +1014,8 @@ impl Context { return fail(input); }; + let grapheme = grapheme.trim_end_matches(|c| c == '\u{200c}' || c == '\u{200d}'); + let emoji = emojis::get(grapheme); if emoji.is_none() { @@ -1059,10 +1150,13 @@ fn url_chars<'a, T: 'a>( mod test { use crate::{url_chars, Context, Span, Token}; use nom::bytes::complete::tag; - use nom::multi::many1; use std::borrow::Cow; use std::collections::HashMap; + fn parse_full(string: &str) -> Token { + Context.full(Span::new(string)).unwrap().1.merged().owned() + } + #[test] fn parse_url_chars() { let test1 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security))"; @@ -1111,9 +1205,92 @@ mod test { ); } + #[test] + fn parse_formatting() { + assert_eq!( + Token::Strikethrough(Box::new(Token::PlainText("stikethrough".into()))), + parse_full(r#"~~stikethrough~~"#) + ); + + assert_eq!( + 
Token::Bold(Box::new(Token::PlainText("bold".into()))), + parse_full(r#"**bold**"#) + ); + + assert_eq!( + Token::Italic(Box::new(Token::PlainText("italic".into()))), + parse_full(r#"*italic*"#) + ); + + assert_eq!( + Token::Sequence(vec![ + Token::PlainText("not code ".into()), + Token::InlineCode("code".into()), + Token::PlainText(" also not code".into()) + ]), + parse_full(r#"not code `code` also not code"#) + ); + + assert_eq!( + Token::Sequence(vec![ + Token::PlainText("not code ".into()), + Token::InlineCode("code".into()), + Token::PlainText(" also `not code".into()) + ]), + parse_full(r#"not code `code` also `not code"#) + ); + + assert_eq!( + Token::Sequence(vec![ + Token::PlainText("not code ".into()), + Token::InlineCode("*not bold*".into()), + Token::PlainText(" also not code".into()) + ]), + parse_full(r#"not code `*not bold*` also not code"#) + ); + + assert_eq!( + Token::BoldItalic(Box::new(Token::PlainText("bold italic".into()))), + parse_full(r#"***bold italic***"#) + ); + + assert_eq!( + Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText( + "bold italic".into() + ))))), + parse_full(r#"bold italic"#) + ); + } + #[test] fn parse_complex() { - let emoji = r#"$[x2 $[sparkle 🥺]💜$[spin.y,speed=5s ❤️]🦊]"#; + assert_eq!( + Token::Center(Box::new(Token::Sequence(vec![ + Token::PlainText("centered\n".into()), + Token::UnicodeEmoji("🦋".into()), + Token::UnicodeEmoji("🏳️‍⚧️".into()), + Token::PlainText("\ntext".into()) + ]))), + parse_full( + r#"
<center>centered +🦋🏳️‍⚧️ +text</center>
"# + ) + ); + + assert_eq!( + Token::Quote(Box::new(Token::Center(Box::new(Token::Sequence(vec![ + Token::PlainText("centered\n".into()), + Token::UnicodeEmoji("👩🏽‍🤝‍👩🏼".into()), + Token::PlainText("\ntext".into()) + ]))))), + parse_full( + r#">
<center>centered +> 👩🏽‍🤝‍👩🏼 +> text</center>
"# + ) + ); + assert_eq!( Token::Function { name: "x2".into(), @@ -1138,21 +1315,7 @@ mod test { Token::UnicodeEmoji("🦊".into()), ])) }, - Context.full(Span::new(emoji)).unwrap().1.merged() - ); - - let bold_italic = r#"***bold italic***"#; - assert_eq!( - Token::BoldItalic(Box::new(Token::PlainText("bold italic".into()))), - Context.full(Span::new(bold_italic)).unwrap().1.merged() - ); - - let bold_italic_tag = r#"bold italic"#; - assert_eq!( - Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText( - "bold italic".into() - ))))), - Context.full(Span::new(bold_italic_tag)).unwrap().1.merged() + parse_full(r#"$[x2 $[sparkle 🥺]💜$[spin.y,speed=5s ❤️]🦊]"#) ); assert_eq!( @@ -1178,37 +1341,67 @@ mod test { .merged() ); - let quote = r#" -> test -> -> italic -> ->> Nested quote -"#; - assert_eq!( Token::Quote(Box::new(Token::Sequence(vec![ Token::PlainText("test\n".into()), Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))), Token::Quote(Box::new(Token::PlainText("Nested quote".into()))) ]))), - Context.full(Span::new(quote)).unwrap().1.merged() + parse_full( + r#" +> test +> +> italic +> +>> Nested quote +"# + ) ); } #[test] fn parse_emoji() { - let test = "🥺💜❤️🦊"; - let ctx = Context; - let tokens = many1(ctx.partial(Context::unicode_emoji))(Span::from(test)).unwrap(); + assert_eq!( + Token::Sequence( + vec!["🥺", "💜", "❤️", "🦊"] + .into_iter() + .map(<&str as Into>>::into) + .map(Token::UnicodeEmoji) + .collect::>() + ), + parse_full("🥺💜❤️🦊") + ); + + // Trans flag, ZWJ + assert_eq!( + Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}".into()), + parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}") + ); assert_eq!( - vec!["🥺", "💜", "❤️", "🦊"] - .into_iter() - .map(<&str as Into>>::into) - .map(Token::UnicodeEmoji) - .collect::>(), - tokens.1 + Token::Sequence(vec![ + Token::PlainText("\u{0200d}".into()), // ZWJ + Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag + ]), + parse_full("\u{0200d}\u{1f3f3}\u{0fe0f}") + 
); + + // Trans flag, ZWNJ + assert_eq!( + Token::Sequence(vec![ + Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag + Token::PlainText("\u{0200c}".into()), // ZWNJ + Token::UnicodeEmoji("\u{026a7}\u{0fe0f}".into()) // Trans symbol + ]), + parse_full("\u{1f3f3}\u{0fe0f}\u{0200c}\u{026a7}\u{0fe0f}") + ); + + assert_eq!( + Token::Sequence(vec![ + Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag + Token::PlainText("\u{0200d}\u{0200d}\u{0200d}".into()), // ZWJ + ]), + parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{0200d}\u{0200d}") ); } } From d2bc67974055b9f10d0dae7a1dad2d292a9ca783 Mon Sep 17 00:00:00 2001 From: Natty Date: Sat, 7 Oct 2023 20:40:01 +0200 Subject: [PATCH 16/23] Fixed link parsing --- magnetar_mmm_parser/src/lib.rs | 184 +++++++++++++++++++++++++++++---- 1 file changed, 165 insertions(+), 19 deletions(-) diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 8dccf96..63e55c5 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -48,7 +48,7 @@ pub enum Token<'a> { UrlRaw(Cow<'a, str>), UrlNoEmbed(Cow<'a, str>), Link { - label: Cow<'a, str>, + label: Box>, href: Cow<'a, str>, embed: bool, }, @@ -91,7 +91,7 @@ impl Token<'_> { Token::UrlNoEmbed(url) => Token::UrlNoEmbed(Cow::Owned(url.clone().into_owned())), Token::Link { embed, label, href } => Token::Link { embed: *embed, - label: Cow::Owned(label.clone().into_owned()), + label: Box::new(label.owned()), href: Cow::Owned(href.clone().into_owned()), }, Token::BlockCode { inner, lang } => Token::BlockCode { @@ -183,6 +183,11 @@ impl Token<'_> { Token::Italic(inner) => Token::Italic(Box::new(inner.merged())), Token::Center(inner) => Token::Center(Box::new(inner.merged())), Token::Strikethrough(inner) => Token::Strikethrough(Box::new(inner.merged())), + Token::Link { embed, label, href } => Token::Link { + label: Box::new(label.merged()), + href: href.clone(), + embed: *embed, + }, Token::Function { name, params, @@ -387,12 
+392,14 @@ impl Context { fn full_single<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let (input, token) = alt(( self.partial(Self::unicode_emoji), - self.partial(Self::tag_block_center), - self.partial(Self::tag_small), - self.partial(Self::tag_plain), - self.partial(Self::tag_bold), - self.partial(Self::tag_italic), - self.partial(Self::tag_strikethrough), + alt(( + self.partial(Self::tag_block_center), + self.partial(Self::tag_small), + self.partial(Self::tag_plain), + self.partial(Self::tag_bold), + self.partial(Self::tag_italic), + self.partial(Self::tag_strikethrough), + )), self.partial(Self::url_no_embed), self.partial(Self::base_bold_italic), self.partial(Self::tag_block_code), @@ -405,6 +412,7 @@ impl Context { self.partial(Self::tag_mention), self.partial(Self::tag_hashtag), self.partial(Self::shortcode_emoji), + self.partial(Self::link), self.partial(Self::raw_url), self.partial(Self::text), ))(input)?; @@ -428,6 +436,7 @@ impl Context { self.partial(Self::tag_mention), self.partial(Self::tag_hashtag), self.partial(Self::shortcode_emoji), + self.partial(Self::link), self.partial(Self::raw_url), self.partial(Self::text), ))(input)?; @@ -989,19 +998,15 @@ impl Context { let (input, no_embed) = opt(tag("?"))(input)?; let (input, _) = tag("[")(input)?; let (input, _) = not(tag("["))(input)?; - let (input, label_span) = recognize(many1(tuple(( - not(tag("](")), - self.partial(Self::inline_label_safe_single), - ))))(input)?; - let (input, _) = tag("]")(input)?; - let (input, _) = tag("(")(input)?; - let (input, url_span) = recognize(tuple((protocol, url_chars(tag("]"), true))))(input)?; + let (input, (label_tok, _)) = + many_till(self.partial(Self::inline_label_safe_single), tag("]("))(input)?; + let (input, url_span) = recognize(tuple((protocol, url_chars(tag(")"), true))))(input)?; let (input, _) = tag(")")(input)?; Ok(( input, Token::Link { - label: label_span.into_fragment().into(), + label: Box::new(Token::Sequence(label_tok)), href: 
url_span.into_fragment().into(), embed: no_embed.is_none(), }, @@ -1056,19 +1061,22 @@ impl Context { Span::into_fragment, )(input)?; - let (input, host) = map( + let before = input; + let (_, host) = map( opt(tuple(( tag("@"), map( - recognize(many1(alt((alphanumeric1, recognize(one_of("-_")))))), + recognize(many1(alt((alphanumeric1, recognize(one_of("-_.")))))), Span::into_fragment, ), ))), |maybe_tag_host| maybe_tag_host.map(|(_, host)| host), )(input)?; + let host = host.map(|h| h.trim_end_matches(|c| matches!(c, '.' | '-' | '_'))); + Ok(( - input, + host.map(|c| before.slice(c.len() + 1..)).unwrap_or(before), Token::Mention { mention_type, name: name.into(), @@ -1359,6 +1367,144 @@ text"# ); } + #[test] + fn parse_link() { + assert_eq!( + parse_full("Link test: [label](https://example.com)"), + Token::Sequence(vec![ + Token::PlainText("Link test: ".into()), + Token::Link { + label: Box::new(Token::PlainText("label".into())), + href: "https://example.com".into(), + embed: true + } + ]) + ); + + assert_eq!( + parse_full("Link test: ?[label](https://awawa.gay)"), + Token::Sequence(vec![ + Token::PlainText("Link test: ".into()), + Token::Link { + label: Box::new(Token::PlainText("label".into())), + href: "https://awawa.gay".into(), + embed: false + } + ]) + ); + + assert_eq!( + parse_full("Link test: ?[label](https://awawa.gay"), // Missing closing bracket + Token::Sequence(vec![ + Token::PlainText("Link test: ?[label](".into()), + Token::UrlRaw("https://awawa.gay".into()), + ]) + ); + } + + #[test] + fn parse_mention() { + assert_eq!( + parse_full("@tag"), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: None + } + ); + + assert_eq!( + parse_full("hgsjlkdsa @tag fgahjsdkd"), + Token::Sequence(vec![ + Token::PlainText("hgsjlkdsa ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: None + }, + Token::PlainText(" fgahjsdkd".into()) + ]) + ); + + assert_eq!( + 
parse_full("hgsjlkdsa @tag@ fgahjsdkd"), + Token::Sequence(vec![ + Token::PlainText("hgsjlkdsa ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: None + }, + Token::PlainText("@ fgahjsdkd".into()) + ]) + ); + + assert_eq!( + parse_full("aaaa @tag@domain bbbbb"), + Token::Sequence(vec![ + Token::PlainText("aaaa ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: Some("domain".into()) + }, + Token::PlainText(" bbbbb".into()) + ]) + ); + + assert_eq!( + parse_full("test @tag@domain, test"), + Token::Sequence(vec![ + Token::PlainText("test ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: Some("domain".into()) + }, + Token::PlainText(", test".into()) + ]) + ); + + assert_eq!( + parse_full("test @tag@domain.gay. test"), + Token::Sequence(vec![ + Token::PlainText("test ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: Some("domain.gay".into()) + }, + Token::PlainText(". test".into()) + ]) + ); + + assert_eq!( + parse_full("test @tag@domain? test"), + Token::Sequence(vec![ + Token::PlainText("test ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: Some("domain".into()) + }, + Token::PlainText("? 
test".into()) + ]) + ); + + assert_eq!( + parse_full("test !tag@domain.com test"), + Token::Sequence(vec![ + Token::PlainText("test ".into()), + Token::Mention { + mention_type: crate::MentionType::Community, + name: "tag".into(), + host: Some("domain.com".into()) + }, + Token::PlainText(" test".into()) + ]) + ); + } + #[test] fn parse_emoji() { assert_eq!( From c4fd99fa45990cd68dfa4465c4635e4744a7efd1 Mon Sep 17 00:00:00 2001 From: Natty Date: Sat, 7 Oct 2023 21:22:21 +0200 Subject: [PATCH 17/23] Stricter URL parsing --- magnetar_mmm_parser/src/lib.rs | 129 ++++++++++++++++++++------------- 1 file changed, 80 insertions(+), 49 deletions(-) diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 63e55c5..6f1bf94 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -991,7 +991,10 @@ impl Context { let (input, url_span) = recognize(tuple((protocol, url_chars(tag(">"), true))))(input)?; let (input, _) = tag(">")(input)?; - Ok((input, Token::UrlRaw(Cow::from(url_span.into_fragment())))) + Ok(( + input, + Token::UrlNoEmbed(Cow::from(url_span.into_fragment())), + )) } fn link<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { @@ -1120,7 +1123,12 @@ fn protocol(input: Span) -> IResult { #[inline] fn url_chars_base(input: Span) -> IResult { - recognize(alt((alpha1, recognize(one_of(".,_/:%#$&?!~=+-()[]@")))))(input) + recognize(alt(( + alpha1, + recognize(tuple((tag("["), many_till(url_chars_base, tag("]"))))), + recognize(tuple((tag("("), many_till(url_chars_base, tag(")"))))), + recognize(one_of(".,_/:%#$&?!~=+-@")), + )))(input) } #[inline] @@ -1128,26 +1136,10 @@ fn url_chars<'a, T: 'a>( terminator: impl Fn(Span<'a>) -> IResult, T> + 'a, spaces: bool, ) -> impl FnMut(Span<'a>) -> IResult, Span<'a>> + 'a { - let terminating = move |input| { - tuple(( - &terminator, - alt(( - space1, - line_ending, - eof, - recognize(one_of("([<'\"")), - recognize(tuple(( - alt((alpha1, recognize(one_of("*")))), - alt((space1, 
line_ending, eof)), - ))), - )), - ))(input) - }; - let chars = tuple(( not(tuple((space1, eof))), not(tuple((space1, tag("\"")))), - not(tuple((opt(space1), terminating))), + not(tuple((opt(space1), terminator))), alt((url_chars_base, if spaces { space1 } else { fail })), )); @@ -1167,49 +1159,48 @@ mod test { #[test] fn parse_url_chars() { - let test1 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security))"; assert_eq!( + url_chars(tag(")"), true)(Span::new( + "https://en.wikipedia.org/wiki/Sandbox_(computer_security))" + )) + .unwrap() + .1 + .into_fragment(), + "https://en.wikipedia.org/wiki/Sandbox_(computer_security)" + ); + + assert_eq!( + url_chars(tag(")"), true)(Span::new( + "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))" + )) + .unwrap() + .1 + .into_fragment(), "https://en.wikipedia.org/wiki/Sandbox_(computer_security)", - url_chars(tag(")"), true)(Span::new(test1)) - .unwrap() - .1 - .into_fragment() ); - let test2 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))"; assert_eq!( - "https://en.wikipedia.org/wiki/Sandbox_(computer_security))", - url_chars(tag(")"), true)(Span::new(test2)) + url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among_Us ")) .unwrap() .1 - .into_fragment() - ); - - let test3 = "https://en.wikipedia.org/wiki/("; - assert_eq!( - test3, - url_chars(tag(")"), true)(Span::new(test3)) - .unwrap() - .1 - .into_fragment() - ); - - let test4 = "https://cs.wikipedia.org/wiki/Among_Us "; - assert_eq!( + .into_fragment(), "https://cs.wikipedia.org/wiki/Among_Us", - url_chars(tag(")"), true)(Span::new(test4)) - .unwrap() - .1 - .into_fragment() ); - let test5 = "https://cs.wikipedia.org/wiki/Among Us )"; assert_eq!( - "https://cs.wikipedia.org/wiki/Among Us", - url_chars(tag(")"), true)(Span::new(test5)) + url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among Us )")) .unwrap() .1 - .into_fragment() + .into_fragment(), + "https://cs.wikipedia.org/wiki/Among Us" + ); + + 
assert_eq!( + url_chars(tag(")"), false)(Span::new("https://en.wikipedia.org/wiki/Among Us )")) + .unwrap() + .1 + .into_fragment(), + "https://en.wikipedia.org/wiki/Among" ); } @@ -1381,6 +1372,20 @@ text"# ]) ); + assert_eq!( + parse_full(""), + Token::UrlNoEmbed("https://example.com".into()) + ); + + // Adjacent links okay + assert_eq!( + parse_full(""), + Token::Sequence(vec![ + Token::UrlNoEmbed("https://example.com/".into()), + Token::UrlNoEmbed("https://awawa.gay/".into()) + ]) + ); + assert_eq!( parse_full("Link test: ?[label](https://awawa.gay)"), Token::Sequence(vec![ @@ -1393,6 +1398,32 @@ text"# ]) ); + assert_eq!( + parse_full("Link test: ?[label](https://awawa.gay)test"), + Token::Sequence(vec![ + Token::PlainText("Link test: ".into()), + Token::Link { + label: Box::new(Token::PlainText("label".into())), + href: "https://awawa.gay".into(), + embed: false + }, + Token::PlainText("test".into()) + ]) + ); + + assert_eq!( + parse_full("Link test: (?[label](https://awawa.gay))"), + Token::Sequence(vec![ + Token::PlainText("Link test: (".into()), + Token::Link { + label: Box::new(Token::PlainText("label".into())), + href: "https://awawa.gay".into(), + embed: false + }, + Token::PlainText(")".into()) + ]) + ); + assert_eq!( parse_full("Link test: ?[label](https://awawa.gay"), // Missing closing bracket Token::Sequence(vec![ From 26bd6fe4b2ee893d5325c19d7208b40dbf320cb8 Mon Sep 17 00:00:00 2001 From: Natty Date: Sat, 7 Oct 2023 21:26:25 +0200 Subject: [PATCH 18/23] Normalized tests --- magnetar_mmm_parser/src/lib.rs | 82 ++++++++++++++++------------------ 1 file changed, 39 insertions(+), 43 deletions(-) diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 6f1bf94..7385d65 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -1207,90 +1207,91 @@ mod test { #[test] fn parse_formatting() { assert_eq!( + parse_full(r#"~~stikethrough~~"#), 
Token::Strikethrough(Box::new(Token::PlainText("stikethrough".into()))), - parse_full(r#"~~stikethrough~~"#) ); assert_eq!( + parse_full(r#"**bold**"#), Token::Bold(Box::new(Token::PlainText("bold".into()))), - parse_full(r#"**bold**"#) ); assert_eq!( + parse_full(r#"*italic*"#), Token::Italic(Box::new(Token::PlainText("italic".into()))), - parse_full(r#"*italic*"#) ); assert_eq!( + parse_full(r#"not code `code` also not code"#), Token::Sequence(vec![ Token::PlainText("not code ".into()), Token::InlineCode("code".into()), Token::PlainText(" also not code".into()) ]), - parse_full(r#"not code `code` also not code"#) ); assert_eq!( + parse_full(r#"not code `code` also `not code"#), Token::Sequence(vec![ Token::PlainText("not code ".into()), Token::InlineCode("code".into()), Token::PlainText(" also `not code".into()) ]), - parse_full(r#"not code `code` also `not code"#) ); assert_eq!( + parse_full(r#"not code `*not bold*` also not code"#), Token::Sequence(vec![ Token::PlainText("not code ".into()), Token::InlineCode("*not bold*".into()), Token::PlainText(" also not code".into()) ]), - parse_full(r#"not code `*not bold*` also not code"#) ); assert_eq!( - Token::BoldItalic(Box::new(Token::PlainText("bold italic".into()))), - parse_full(r#"***bold italic***"#) + parse_full(r#"***bold italic***"#), + Token::BoldItalic(Box::new(Token::PlainText("bold italic".into()))) ); assert_eq!( + parse_full(r#"bold italic"#), Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText( "bold italic".into() - ))))), - parse_full(r#"bold italic"#) + ))))) ); } #[test] fn parse_complex() { assert_eq!( + parse_full( + r#"
centered +🦋🏳️‍⚧️ +text
"# + ), Token::Center(Box::new(Token::Sequence(vec![ Token::PlainText("centered\n".into()), Token::UnicodeEmoji("🦋".into()), Token::UnicodeEmoji("🏳️‍⚧️".into()), Token::PlainText("\ntext".into()) - ]))), - parse_full( - r#"
centered -🦋🏳️‍⚧️ -text
"# - ) + ]))) ); assert_eq!( + parse_full( + r#">
centered +> 👩🏽‍🤝‍👩🏼 +> text
"# + ), Token::Quote(Box::new(Token::Center(Box::new(Token::Sequence(vec![ Token::PlainText("centered\n".into()), Token::UnicodeEmoji("👩🏽‍🤝‍👩🏼".into()), Token::PlainText("\ntext".into()) ]))))), - parse_full( - r#">
centered -> 👩🏽‍🤝‍👩🏼 -> text
"# - ) ); assert_eq!( + parse_full(r#"$[x2 $[sparkle 🥺]💜$[spin.y,speed=5s ❤️]🦊]"#), Token::Function { name: "x2".into(), params: HashMap::new(), @@ -1314,10 +1315,10 @@ text"# Token::UnicodeEmoji("🦊".into()), ])) }, - parse_full(r#"$[x2 $[sparkle 🥺]💜$[spin.y,speed=5s ❤️]🦊]"#) ); assert_eq!( + parse_full(r#"bold @tag1 @tag2 italic"#), Token::Sequence(vec![ Token::PlainText("bold ".into()), Token::Mention { @@ -1333,19 +1334,9 @@ text"# }, Token::PlainText(" italic".into()) ]), - Context - .full(Span::new(r#"bold @tag1 @tag2 italic"#)) - .unwrap() - .1 - .merged() ); assert_eq!( - Token::Quote(Box::new(Token::Sequence(vec![ - Token::PlainText("test\n".into()), - Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))), - Token::Quote(Box::new(Token::PlainText("Nested quote".into()))) - ]))), parse_full( r#" > test @@ -1354,7 +1345,12 @@ text"# > >> Nested quote "# - ) + ), + Token::Quote(Box::new(Token::Sequence(vec![ + Token::PlainText("test\n".into()), + Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))), + Token::Quote(Box::new(Token::PlainText("Nested quote".into()))) + ]))), ); } @@ -1539,46 +1535,46 @@ text"# #[test] fn parse_emoji() { assert_eq!( + parse_full("🥺💜❤️🦊"), Token::Sequence( vec!["🥺", "💜", "❤️", "🦊"] .into_iter() .map(<&str as Into>>::into) .map(Token::UnicodeEmoji) .collect::>() - ), - parse_full("🥺💜❤️🦊") + ) ); // Trans flag, ZWJ assert_eq!( - Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}".into()), - parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}") + parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}"), + Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}".into()) ); assert_eq!( + parse_full("\u{0200d}\u{1f3f3}\u{0fe0f}"), Token::Sequence(vec![ Token::PlainText("\u{0200d}".into()), // ZWJ Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag - ]), - parse_full("\u{0200d}\u{1f3f3}\u{0fe0f}") + ]) ); // Trans flag, ZWNJ assert_eq!( + 
parse_full("\u{1f3f3}\u{0fe0f}\u{0200c}\u{026a7}\u{0fe0f}"), Token::Sequence(vec![ Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag Token::PlainText("\u{0200c}".into()), // ZWNJ Token::UnicodeEmoji("\u{026a7}\u{0fe0f}".into()) // Trans symbol - ]), - parse_full("\u{1f3f3}\u{0fe0f}\u{0200c}\u{026a7}\u{0fe0f}") + ]) ); assert_eq!( + parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{0200d}\u{0200d}"), Token::Sequence(vec![ Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag Token::PlainText("\u{0200d}\u{0200d}\u{0200d}".into()), // ZWJ - ]), - parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{0200d}\u{0200d}") + ]) ); } } From d0d977e6ebbf7676b26c1a76ab5890baa2389910 Mon Sep 17 00:00:00 2001 From: Natty Date: Sun, 8 Oct 2023 22:15:55 +0200 Subject: [PATCH 19/23] Fixed URL parsing and initial flanking rules implementation --- magnetar_mmm_parser/src/lib.rs | 388 +++++++++++++++++++++++++++------ 1 file changed, 322 insertions(+), 66 deletions(-) diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 7385d65..d270760 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -1,19 +1,20 @@ use either::Either; use nom::branch::alt; -use nom::bytes::complete::tag; +use nom::bytes::complete::{tag, tag_no_case}; use nom::character::complete::{ - alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of, space1, - tab, + alpha1, alphanumeric1, anychar, char as one_char, char, line_ending, not_line_ending, one_of, + satisfy, space1, tab, }; use nom::combinator::{eof, fail, map, not, opt, recognize}; use nom::error::ErrorKind; use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1}; use nom::sequence::tuple; -use nom::{IResult, Offset, Slice}; +use nom::{Compare, IResult, Offset, Slice}; use nom_locate::LocatedSpan; use std::borrow::Cow; use std::collections::HashMap; use std::convert::{identity, Infallible}; +use std::marker::PhantomData; use 
unicode_segmentation::UnicodeSegmentation; #[derive(Copy, Clone, Debug, Eq, PartialEq)] @@ -73,6 +74,80 @@ pub enum Token<'a> { } impl Token<'_> { + fn str_content_left(&self) -> Option<&str> { + match self { + Token::PlainText(text) => Some(text.as_ref()), + Token::Sequence(tokens) => tokens.first().and_then(Token::str_content_left), + Token::Quote(inner) => inner.str_content_left(), + Token::Small(inner) => inner.str_content_left(), + Token::BoldItalic(inner) => inner.str_content_left(), + Token::Bold(inner) => inner.str_content_left(), + Token::Italic(inner) => inner.str_content_left(), + Token::Center(inner) => inner.str_content_left(), + Token::Strikethrough(inner) => inner.str_content_left(), + Token::PlainTag(tag) => Some(tag.as_ref()), + Token::UrlRaw(url) => Some(url.as_ref()), + Token::UrlNoEmbed(url) => Some(url.as_ref()), + Token::Link { label, .. } => label.str_content_left(), + Token::Function { inner, .. } => inner.str_content_left(), + Token::Mention { name, .. } => Some(name.as_ref()), + Token::UnicodeEmoji(code) => Some(code.as_ref()), + Token::ShortcodeEmoji(_) => None, + Token::Hashtag(tag) => Some(tag.as_ref()), + _ => None, + } + } + + fn str_content_right(&self) -> Option<&str> { + match self { + Token::PlainText(text) => Some(text.as_ref()), + Token::Sequence(tokens) => tokens.last().and_then(Token::str_content_right), + Token::Quote(inner) => inner.str_content_right(), + Token::Small(inner) => inner.str_content_right(), + Token::BoldItalic(inner) => inner.str_content_right(), + Token::Bold(inner) => inner.str_content_right(), + Token::Italic(inner) => inner.str_content_right(), + Token::Center(inner) => inner.str_content_right(), + Token::Strikethrough(inner) => inner.str_content_right(), + Token::PlainTag(tag) => Some(tag.as_ref()), + Token::UrlRaw(url) => Some(url.as_ref()), + Token::UrlNoEmbed(url) => Some(url.as_ref()), + Token::Link { label, .. } => label.str_content_right(), + Token::Function { inner, .. 
} => inner.str_content_right(), + Token::Mention { name, .. } => Some(name.as_ref()), + Token::UnicodeEmoji(code) => Some(code.as_ref()), + Token::Hashtag(tag) => Some(tag.as_ref()), + _ => None, + } + } + + fn inner(&self) -> Token { + match self { + plain @ Token::PlainText(_) => plain.clone(), + sequence @ Token::Sequence(_) => sequence.clone(), + Token::Quote(inner) => inner.inner(), + Token::Small(inner) => inner.inner(), + Token::BoldItalic(inner) => inner.inner(), + Token::Bold(inner) => inner.inner(), + Token::Italic(inner) => inner.inner(), + Token::Center(inner) => inner.inner(), + Token::Strikethrough(inner) => inner.inner(), + Token::PlainTag(text) => Token::PlainText(text.clone()), + Token::InlineCode(code) => Token::PlainText(code.clone()), + Token::InlineMath(math) => Token::PlainText(math.clone()), + Token::UrlRaw(url) => Token::PlainText(url.clone()), + Token::UrlNoEmbed(url) => Token::PlainText(url.clone()), + Token::Link { label, .. } => label.inner(), + Token::BlockCode { inner, .. } => Token::PlainText(inner.clone()), + Token::BlockMath(math) => Token::PlainText(math.clone()), + Token::Function { inner, .. } => inner.inner(), + Token::Mention { name, .. 
} => Token::PlainText(name.clone()), + Token::UnicodeEmoji(code) => Token::PlainText(code.clone()), + Token::ShortcodeEmoji(shortcode) => Token::PlainText(shortcode.clone()), + Token::Hashtag(tag) => Token::PlainText(tag.clone()), + } + } + fn owned(&self) -> Token<'static> { match self { Token::PlainText(text) => Token::PlainText(Cow::Owned(text.clone().into_owned())), @@ -129,7 +204,7 @@ impl Token<'_> { Token::ShortcodeEmoji(shortcode) => { Token::ShortcodeEmoji(Cow::Owned(shortcode.clone().into_owned())) } - Token::Hashtag(url) => Token::Hashtag(Cow::Owned(url.clone().into_owned())), + Token::Hashtag(tag) => Token::Hashtag(Cow::Owned(tag.clone().into_owned())), } } @@ -245,6 +320,16 @@ fn collect_char_sequence<'a>( move |chars| func(Cow::Owned(chars.collect())) } +#[inline] +fn alpha1_unicode(input: Span) -> IResult { + recognize(many1_count(satisfy(char::is_alphanumeric)))(input) +} + +#[inline] +fn alphanumeric1_unicode(input: Span) -> IResult { + recognize(many1_count(satisfy(char::is_alphanumeric)))(input) +} + fn spliced<'a>( segments: &[Span<'a>], func: impl Fn(Span) -> IResult, @@ -316,15 +401,16 @@ fn space(input: Span) -> IResult { Ok((input, Token::PlainText(frag.into_fragment().into()))) } -struct Matcher<'a, 'b, T> { +#[derive(Copy, Clone)] +struct Matcher<'a, 'b, T: Clone> { matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult, T> + 'a), collector: &'a (dyn Fn(&mut dyn Iterator) -> Token<'b> + 'a), - _phantom_closure: std::marker::PhantomData<&'a ()>, - _phantom_data: std::marker::PhantomData<&'b ()>, - _phantom_output: std::marker::PhantomData T>, + _phantom_closure: PhantomData<&'a ()>, + _phantom_data: PhantomData<&'b ()>, + _phantom_output: PhantomData T>, } -impl<'a, 'b, T> Matcher<'a, 'b, T> { +impl<'a, 'b, T: Clone> Matcher<'a, 'b, T> { fn new( matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult, T> + 'a), collector: &'a (dyn Fn(&mut dyn Iterator) -> Token<'b> + 'a), @@ -332,9 +418,9 @@ impl<'a, 'b, T> Matcher<'a, 'b, T> { Self { matcher_inner, 
collector, - _phantom_closure: std::marker::PhantomData, - _phantom_data: std::marker::PhantomData, - _phantom_output: std::marker::PhantomData, + _phantom_closure: PhantomData, + _phantom_data: PhantomData, + _phantom_output: PhantomData, } } } @@ -345,33 +431,60 @@ impl<'a, 'b> Matcher<'a, 'b, Infallible> { Self { matcher_inner: &fail::<_, Infallible, _>, collector: &|_| unreachable!(), - _phantom_closure: std::marker::PhantomData, - _phantom_data: std::marker::PhantomData, - _phantom_output: std::marker::PhantomData, + _phantom_closure: PhantomData, + _phantom_data: PhantomData, + _phantom_output: PhantomData, } } } -struct Context; +#[derive(Copy, Clone, Debug)] +enum FlankingRule { + Lenient, + Strict, + DontCare, +} + +struct FlankingDelim<'a, T: Fn(Span<'a>) -> IResult, Span<'a>>>( + T, + FlankingRule, + PhantomData<&'a ()>, +); + +impl<'a, T: Fn(Span<'a>) -> IResult, Span<'a>>> From<(T, FlankingRule)> + for FlankingDelim<'a, T> +{ + fn from((func, rule): (T, FlankingRule)) -> Self { + FlankingDelim(func, rule, PhantomData) + } +} + +impl<'a, T: Fn(Span<'a>) -> IResult, Span<'a>>> From for FlankingDelim<'a, T> { + fn from(func: T) -> Self { + FlankingDelim(func, FlankingRule::DontCare, PhantomData) + } +} + +pub struct Context; impl Context { #[inline] - const fn partial( + fn partial( &self, func: impl for<'a> Fn(&Self, Span<'a>) -> IResult, Token<'a>> + 'static, ) -> impl for<'a> Fn(Span<'a>) -> IResult, Token<'a>> + '_ { move |input| func(self, input) } - fn full<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + pub fn full<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { map(many1(self.partial(Self::full_single)), Token::Sequence)(input) } - fn inline<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + pub fn inline<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { map(many1(self.partial(Self::inline_single)), Token::Sequence)(input) } - fn inline_label_safe<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + pub fn 
inline_label_safe<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { map( many1(self.partial(Self::inline_label_safe_single)), Token::Sequence, @@ -606,14 +719,21 @@ impl Context { } #[inline] - fn tag_delimited<'a, 'b: 'a, T, S>( + fn tag_delimited<'a, 'b: 'a, T: Clone, S: Clone, FOpen, FClose>( &'a self, - opening_tag: impl Fn(Span<'b>) -> IResult, Span<'b>> + 'a, - closing_tag: impl Fn(Span<'b>) -> IResult, Span<'b>> + 'a, + opening_tag: impl Into> + 'a, + closing_tag: impl Into> + 'a, escape: bool, matcher: Matcher<'a, 'b, T>, fallback: Matcher<'a, 'b, S>, - ) -> impl Fn(Span<'b>) -> IResult, Token<'b>> + '_ { + ) -> impl Fn(Span<'b>) -> IResult, Token<'b>> + '_ + where + FOpen: Fn(Span<'b>) -> IResult, Span<'b>> + 'a, + FClose: Fn(Span<'b>) -> IResult, Span<'b>> + 'a, + { + let FlankingDelim(opening_tag, opening_rule, ..) = opening_tag.into(); + let FlankingDelim(closing_tag, closing_rule, ..) = closing_tag.into(); + move |input| { if escape { if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), &opening_tag))(input) { @@ -662,10 +782,44 @@ impl Context { )); } - let (input, (inner, _)) = res?; + let (input, (inner, closing)) = res?; let mut inner = inner.into_iter().map(|(_, t)| t); - Ok((input, (matcher.collector)(&mut inner))) + let inner_tok = (matcher.collector)(&mut inner); + + let correct_left_flanking = + if let FlankingRule::Lenient | FlankingRule::Strict = opening_rule { + let text_left = inner_tok.str_content_left(); + + !(text_left.is_some_and(|s| s.starts_with(char::is_whitespace)) + || text_left.is_none()) + } else { + true + }; + + let correct_right_flanking = + if let FlankingRule::Lenient | FlankingRule::Strict = closing_rule { + let text_right = inner_tok.str_content_right(); + !(text_right.is_some_and(|s| s.ends_with(char::is_whitespace)) + || text_right.is_none()) + } else { + true + }; + + // TODO: Unfinished flanking rules + let correct_flanking = correct_left_flanking && correct_right_flanking; + + if !correct_flanking { + return 
Ok(( + input, + Token::Sequence(vec![ + Token::PlainText(begin.fragment_between(&post_open).into()), + inner_tok.inner().owned(), + Token::PlainText(closing.into_fragment().into()), + ]), + )); + } + Ok((input, Token::Sequence(vec![inner_tok]))) } } @@ -720,12 +874,12 @@ impl Context { } fn tag_plain<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { - let opening_tag = &tag(""); - let closing_tag = &tag(""); + let opening_tag = &tag(""); + let closing_tag = &tag(""); let (input, _) = opening_tag(input)?; let (input, text) = map( - recognize(many1(tuple((not_line_ending, not(closing_tag))))), + recognize(many1(tuple((not(line_ending), not(closing_tag), anychar)))), Span::into_fragment, )(input)?; let (input, _) = closing_tag(input)?; @@ -735,8 +889,8 @@ impl Context { fn tag_small<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag(""), - tag(""), + tag_no_case(""), + tag_no_case(""), false, Matcher::new( &self.partial(Self::inline_single), @@ -749,11 +903,10 @@ impl Context { )(input) } - // TODO: CommonMark flanking rules fn tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag("***"), - tag("***"), + (tag("***"), FlankingRule::Lenient), + (tag("***"), FlankingRule::Lenient), true, Matcher::new( &self.partial(Self::inline_single), @@ -766,11 +919,10 @@ impl Context { )(input) } - // TODO: CommonMark flanking rules fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag("___"), - tag("___"), + (tag("___"), FlankingRule::Strict), + (tag("___"), FlankingRule::Strict), true, Matcher::new( &self.partial(Self::inline_single), @@ -785,8 +937,8 @@ impl Context { fn tag_bold<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag(""), - tag(""), + tag_no_case(""), + tag_no_case(""), false, Matcher::new( &self.partial(Self::inline_single), @@ -799,11 +951,10 @@ impl Context { )(input) } - // TODO: CommonMark flanking 
rules fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag("**"), - tag("**"), + (tag("**"), FlankingRule::Lenient), + (tag("**"), FlankingRule::Lenient), true, Matcher::new( &self.partial(Self::inline_single), @@ -816,11 +967,10 @@ impl Context { )(input) } - // TODO: CommonMark flanking rules fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag("__"), - tag("__"), + (tag("__"), FlankingRule::Strict), + (tag("__"), FlankingRule::Strict), true, Matcher::new( &self.partial(Self::inline_single), @@ -835,8 +985,8 @@ impl Context { fn tag_italic<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag(""), - tag(""), + tag_no_case(""), + tag_no_case(""), false, Matcher::new( &self.partial(Self::inline_single), @@ -849,11 +999,10 @@ impl Context { )(input) } - // TODO: CommonMark flanking rules fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag("*"), - tag("*"), + (tag("*"), FlankingRule::Lenient), + (tag("*"), FlankingRule::Lenient), true, Matcher::new( &self.partial(Self::inline_single), @@ -866,11 +1015,10 @@ impl Context { )(input) } - // TODO: CommonMark flanking rules fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag("_"), - tag("_"), + (tag("_"), FlankingRule::Strict), + (tag("_"), FlankingRule::Strict), true, Matcher::new( &self.partial(Self::inline_single), @@ -885,8 +1033,8 @@ impl Context { fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag(""), - tag(""), + tag_no_case(""), + tag_no_case(""), false, Matcher::new( &self.partial(Self::inline_single), @@ -899,11 +1047,10 @@ impl Context { )(input) } - // TODO: CommonMark flanking rules fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag("~~"), - tag("~~"), + (tag("~~"), 
FlankingRule::Lenient), + (tag("~~"), FlankingRule::Lenient), true, Matcher::new( &move |input| { @@ -1037,20 +1184,42 @@ impl Context { } fn shortcode_emoji<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { - // TODO: Fail when preceded by alphanumerics + if let (plain_out, Some(plain)) = map( + opt(recognize(tuple(( + alphanumeric1_unicode, + self.partial(Self::shortcode_emoji), + )))), + |o| o.map(Span::into_fragment), + )(input)? + { + return Ok((plain_out, Token::PlainText(plain.into()))); + } + let (input, _) = tag(":")(input)?; let (input, shortcode) = map( - recognize(many1(alt((alphanumeric1, recognize(one_of("_+-")))))), + recognize(many1(alt(( + alphanumeric1_unicode, + recognize(one_of("_+-")), + )))), Span::into_fragment, )(input)?; let (input, _) = tag(":")(input)?; - let (input, _) = not(alphanumeric1)(input)?; + let (input, _) = not(alphanumeric1_unicode)(input)?; Ok((input, Token::ShortcodeEmoji(shortcode.into()))) } fn tag_mention<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { - // TODO: Escaping and skip when preceded by alphanumerics + if let (plain_out, Some(plain)) = map( + opt(recognize(tuple(( + alt((tag("\\"), alphanumeric1_unicode)), + self.partial(Self::tag_mention), + )))), + |o| o.map(Span::into_fragment), + )(input)? 
+ { + return Ok((plain_out, Token::PlainText(plain.into()))); + } let tags = one_of("@!"); let (input, mention_type) = map(tags, |c| match c { @@ -1123,12 +1292,12 @@ fn protocol(input: Span) -> IResult { #[inline] fn url_chars_base(input: Span) -> IResult { - recognize(alt(( - alpha1, + alt(( + alphanumeric1_unicode, recognize(tuple((tag("["), many_till(url_chars_base, tag("]"))))), recognize(tuple((tag("("), many_till(url_chars_base, tag(")"))))), recognize(one_of(".,_/:%#$&?!~=+-@")), - )))(input) + ))(input) } #[inline] @@ -1221,6 +1390,21 @@ mod test { Token::Italic(Box::new(Token::PlainText("italic".into()))), ); + assert_eq!( + parse_full(r#"* italic *"#), + Token::PlainText("* italic *".into()) + ); + + assert_eq!( + parse_full(r#"_ italic *"#), + Token::PlainText("_ italic *".into()) + ); + + assert_eq!( + parse_full(r#"*"italic"*"#), + Token::Italic(Box::new(Token::PlainText("\"italic\"".into()))) + ); + assert_eq!( parse_full(r#"not code `code` also not code"#), Token::Sequence(vec![ @@ -1356,6 +1540,47 @@ text"# #[test] fn parse_link() { + assert_eq!( + parse_full("IPv4 test: "), + Token::Sequence(vec![ + Token::PlainText("IPv4 test: ".into()), + Token::UrlNoEmbed("https://0".into()) + ]) + ); + + assert_eq!( + parse_full("IPv4 test: "), + Token::Sequence(vec![ + Token::PlainText("IPv4 test: ".into()), + Token::UrlNoEmbed("https://127.0.0.1".into()) + ]) + ); + + assert_eq!( + parse_full("IPv6 test: "), + Token::Sequence(vec![ + Token::PlainText("IPv6 test: ".into()), + Token::UrlNoEmbed("https://[::2f:1]/nya".into()) + ]) + ); + + assert_eq!( + parse_full("IPv6 test: https://[::2f:1]/nya"), + Token::Sequence(vec![ + Token::PlainText("IPv6 test: ".into()), + Token::UrlRaw("https://[::2f:1]/nya".into()) + ]) + ); + + // IDNs + assert_eq!( + parse_full("IDN test: https://www.háčkyčárky.cz/"), + Token::Sequence(vec![ + Token::PlainText("IDN test: ".into()), + Token::UrlRaw("https://www.háčkyčárky.cz/".into()) + ]) + ); + assert_eq!( parse_full("Link test: 
[label](https://example.com)"), Token::Sequence(vec![ @@ -1440,6 +1665,11 @@ text"# } ); + assert_eq!( + parse_full("email@notactuallyamenmtion.org"), + Token::PlainText("email@notactuallyamenmtion.org".into()) + ); + assert_eq!( parse_full("hgsjlkdsa @tag fgahjsdkd"), Token::Sequence(vec![ @@ -1532,6 +1762,32 @@ text"# ); } + #[test] + fn parse_shortcodes() { + assert_eq!( + parse_full(":bottom:"), + Token::ShortcodeEmoji("bottom".into()) + ); + + assert_eq!( + parse_full(":bottom::blobfox:"), + Token::Sequence(vec![ + Token::ShortcodeEmoji("bottom".into()), + Token::ShortcodeEmoji("blobfox".into()) + ]) + ); + + assert_eq!( + parse_full(":bottom:blobfox"), + Token::PlainText(":bottom:blobfox".into()) + ); + + assert_eq!( + parse_full("bottom:blobfox:"), + Token::PlainText("bottom:blobfox:".into()) + ); + } + #[test] fn parse_emoji() { assert_eq!( From 23a63f2fe926e6297dd55e2a097015a1c7130f77 Mon Sep 17 00:00:00 2001 From: Natty Date: Sat, 14 Oct 2023 21:41:36 +0200 Subject: [PATCH 20/23] MMM: Made the parser always output owned tokens --- Cargo.lock | 23 +++ Cargo.toml | 1 + magnetar_mmm_parser/Cargo.toml | 1 + magnetar_mmm_parser/src/lib.rs | 287 +++++++++++++-------------------- 4 files changed, 138 insertions(+), 174 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e2d79c5..cb3905d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -462,6 +462,15 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a4f925191b4367301851c6d99b09890311d74b0d43f274c0b34c86d308a3663" +[[package]] +name = "castaway" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a17ed5635fc8536268e5d4de1e22e81ac34419e5f052d4d51f4e01dcc263fcc" +dependencies = [ + "rustversion", +] + [[package]] name = "cc" version = "1.0.81" @@ -584,6 +593,19 @@ dependencies = [ "tokio-util", ] +[[package]] +name = "compact_str" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "f86b9c4c00838774a6d902ef931eff7470720c51d90c2e32cfe15dc304737b3f" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "ryu", + "static_assertions", +] + [[package]] name = "const-oid" version = "0.9.4" @@ -1622,6 +1644,7 @@ dependencies = [ name = "mmm_parser" version = "0.2.1-alpha" dependencies = [ + "compact_str", "either", "emojis", "nom", diff --git a/Cargo.toml b/Cargo.toml index c5d0c4e..9828764 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ axum = "0.6" cached = "0.46" cfg-if = "1" chrono = "0.4" +compact_str = "0.7" dotenvy = "0.15" either = "1.9" emojis = "0.6" diff --git a/magnetar_mmm_parser/Cargo.toml b/magnetar_mmm_parser/Cargo.toml index 25faa6b..14e36f7 100644 --- a/magnetar_mmm_parser/Cargo.toml +++ b/magnetar_mmm_parser/Cargo.toml @@ -9,4 +9,5 @@ either = { workspace = true } emojis = { workspace = true } nom = { workspace = true } nom_locate = { workspace = true } +compact_str = { workspace = true } unicode-segmentation = { workspace = true } diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index d270760..b940145 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -1,17 +1,17 @@ +use compact_str::{CompactString, ToCompactString}; use either::Either; use nom::branch::alt; use nom::bytes::complete::{tag, tag_no_case}; use nom::character::complete::{ - alpha1, alphanumeric1, anychar, char as one_char, char, line_ending, not_line_ending, one_of, + alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of, satisfy, space1, tab, }; use nom::combinator::{eof, fail, map, not, opt, recognize}; use nom::error::ErrorKind; use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1}; use nom::sequence::tuple; -use nom::{Compare, IResult, Offset, Slice}; +use nom::{IResult, Offset, Slice}; use nom_locate::LocatedSpan; -use std::borrow::Cow; use std::collections::HashMap; use std::convert::{identity, Infallible}; use 
std::marker::PhantomData; @@ -33,47 +33,47 @@ impl MentionType { } #[derive(Clone, Debug, Eq, PartialEq)] -pub enum Token<'a> { - PlainText(Cow<'a, str>), - Sequence(Vec>), - Quote(Box>), - Small(Box>), - BoldItalic(Box>), - Bold(Box>), - Italic(Box>), - Center(Box>), - Strikethrough(Box>), - PlainTag(Cow<'a, str>), - InlineCode(Cow<'a, str>), - InlineMath(Cow<'a, str>), - UrlRaw(Cow<'a, str>), - UrlNoEmbed(Cow<'a, str>), +pub enum Token { + PlainText(CompactString), + Sequence(Vec), + Quote(Box), + Small(Box), + BoldItalic(Box), + Bold(Box), + Italic(Box), + Center(Box), + Strikethrough(Box), + PlainTag(String), + InlineCode(String), + InlineMath(String), + UrlRaw(String), + UrlNoEmbed(String), Link { - label: Box>, - href: Cow<'a, str>, + label: Box, + href: String, embed: bool, }, BlockCode { - lang: Option>, - inner: Cow<'a, str>, + lang: Option, + inner: String, }, - BlockMath(Cow<'a, str>), + BlockMath(String), Function { - name: Cow<'a, str>, - params: HashMap, Option>>, - inner: Box>, + name: String, + params: HashMap>, + inner: Box, }, Mention { - name: Cow<'a, str>, - host: Option>, + name: String, + host: Option, mention_type: MentionType, }, - UnicodeEmoji(Cow<'a, str>), - ShortcodeEmoji(Cow<'a, str>), - Hashtag(Cow<'a, str>), + UnicodeEmoji(String), + ShortcodeEmoji(String), + Hashtag(String), } -impl Token<'_> { +impl Token { fn str_content_left(&self) -> Option<&str> { match self { Token::PlainText(text) => Some(text.as_ref()), @@ -132,79 +132,19 @@ impl Token<'_> { Token::Italic(inner) => inner.inner(), Token::Center(inner) => inner.inner(), Token::Strikethrough(inner) => inner.inner(), - Token::PlainTag(text) => Token::PlainText(text.clone()), - Token::InlineCode(code) => Token::PlainText(code.clone()), - Token::InlineMath(math) => Token::PlainText(math.clone()), - Token::UrlRaw(url) => Token::PlainText(url.clone()), - Token::UrlNoEmbed(url) => Token::PlainText(url.clone()), + Token::PlainTag(text) => Token::PlainText(text.clone().into()), + 
Token::InlineCode(code) => Token::PlainText(code.clone().into()), + Token::InlineMath(math) => Token::PlainText(math.clone().into()), + Token::UrlRaw(url) => Token::PlainText(url.clone().into()), + Token::UrlNoEmbed(url) => Token::PlainText(url.clone().into()), Token::Link { label, .. } => label.inner(), - Token::BlockCode { inner, .. } => Token::PlainText(inner.clone()), - Token::BlockMath(math) => Token::PlainText(math.clone()), + Token::BlockCode { inner, .. } => Token::PlainText(inner.clone().into()), + Token::BlockMath(math) => Token::PlainText(math.clone().into()), Token::Function { inner, .. } => inner.inner(), - Token::Mention { name, .. } => Token::PlainText(name.clone()), - Token::UnicodeEmoji(code) => Token::PlainText(code.clone()), - Token::ShortcodeEmoji(shortcode) => Token::PlainText(shortcode.clone()), - Token::Hashtag(tag) => Token::PlainText(tag.clone()), - } - } - - fn owned(&self) -> Token<'static> { - match self { - Token::PlainText(text) => Token::PlainText(Cow::Owned(text.clone().into_owned())), - Token::Sequence(tokens) => Token::Sequence(tokens.iter().map(Token::owned).collect()), - Token::Quote(inner) => Token::Quote(Box::new(inner.owned())), - Token::Small(inner) => Token::Small(Box::new(inner.owned())), - Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.owned())), - Token::Bold(inner) => Token::Bold(Box::new(inner.owned())), - Token::Italic(inner) => Token::Italic(Box::new(inner.owned())), - Token::Center(inner) => Token::Center(Box::new(inner.owned())), - Token::Strikethrough(inner) => Token::Strikethrough(Box::new(inner.owned())), - Token::PlainTag(tag) => Token::PlainTag(Cow::Owned(tag.clone().into_owned())), - Token::InlineCode(code) => Token::InlineCode(Cow::Owned(code.clone().into_owned())), - Token::InlineMath(math) => Token::InlineMath(Cow::Owned(math.clone().into_owned())), - Token::UrlRaw(url) => Token::UrlRaw(Cow::Owned(url.clone().into_owned())), - Token::UrlNoEmbed(url) => 
Token::UrlNoEmbed(Cow::Owned(url.clone().into_owned())), - Token::Link { embed, label, href } => Token::Link { - embed: *embed, - label: Box::new(label.owned()), - href: Cow::Owned(href.clone().into_owned()), - }, - Token::BlockCode { inner, lang } => Token::BlockCode { - lang: lang.as_ref().map(|l| Cow::Owned(l.clone().into_owned())), - inner: Cow::Owned(inner.clone().into_owned()), - }, - Token::BlockMath(math) => Token::BlockMath(Cow::Owned(math.clone().into_owned())), - Token::Function { - name, - params, - inner, - } => Token::Function { - name: Cow::Owned(name.clone().into_owned()), - params: params - .iter() - .map(|(k, v)| { - ( - Cow::Owned(k.clone().into_owned()), - v.as_ref().map(|val| Cow::Owned(val.clone().into_owned())), - ) - }) - .collect(), - inner: Box::new(inner.owned()), - }, - Token::Mention { - name, - host, - mention_type, - } => Token::Mention { - name: Cow::Owned(name.clone().into_owned()), - host: host.as_ref().map(|v| Cow::Owned(v.clone().into_owned())), - mention_type: *mention_type, - }, - Token::UnicodeEmoji(code) => Token::UnicodeEmoji(Cow::Owned(code.clone().into_owned())), - Token::ShortcodeEmoji(shortcode) => { - Token::ShortcodeEmoji(Cow::Owned(shortcode.clone().into_owned())) - } - Token::Hashtag(tag) => Token::Hashtag(Cow::Owned(tag.clone().into_owned())), + Token::Mention { name, .. 
} => Token::PlainText(name.clone().into()), + Token::UnicodeEmoji(code) => Token::PlainText(code.clone().into()), + Token::ShortcodeEmoji(shortcode) => Token::PlainText(shortcode.clone().into()), + Token::Hashtag(tag) => Token::PlainText(tag.clone().into()), } } @@ -214,7 +154,7 @@ impl Token<'_> { let tokens_multi = tokens.iter().fold(Vec::new(), |mut acc, tok| { if let Some(Token::PlainText(last)) = acc.last_mut() { if let Token::PlainText(tok_text) = tok { - *last = Cow::from(last.to_string() + tok_text.as_ref()); + *last += tok_text.as_ref(); return acc; } @@ -229,7 +169,7 @@ impl Token<'_> { for item in items { if let Some(Token::PlainText(last)) = acc.last_mut() { if let Token::PlainText(tok_text) = item { - *last = Cow::from(last.to_string() + tok_text.as_ref()); + *last += tok_text.as_ref(); continue; } @@ -301,23 +241,23 @@ impl SliceOffset for Span<'_> { } #[inline] -fn boxing_token<'a>(func: impl Fn(Box>) -> Token<'a>) -> impl Fn(Token<'a>) -> Token<'a> { +fn boxing_token(func: impl Fn(Box) -> Token) -> impl Fn(Token) -> Token { move |tokens| func(Box::new(tokens)) } #[inline] -fn collect_sequence<'a, T>( - func: impl Fn(Vec) -> Token<'a>, - transform: impl Fn(Token<'a>) -> Token<'a>, -) -> impl Fn(&mut dyn Iterator) -> Token<'a> { +fn collect_sequence( + func: impl Fn(Vec) -> Token, + transform: impl Fn(Token) -> Token, +) -> impl Fn(&mut dyn Iterator) -> Token { move |tokens| transform(func(tokens.collect())) } #[inline] -fn collect_char_sequence<'a>( - func: impl Fn(Cow<'a, str>) -> Token<'a>, -) -> impl Fn(&mut dyn Iterator) -> Token<'a> { - move |chars| func(Cow::Owned(chars.collect())) +fn collect_char_sequence( + func: impl Fn(String) -> Token, +) -> impl Fn(&mut dyn Iterator) -> Token { + move |chars| func(chars.collect()) } #[inline] @@ -334,7 +274,7 @@ fn spliced<'a>( segments: &[Span<'a>], func: impl Fn(Span) -> IResult, parent: Span<'a>, -) -> IResult, Token<'static>, nom::error::Error>> { +) -> IResult, Token, nom::error::Error>> { let 
combined = segments .iter() .copied() @@ -362,7 +302,7 @@ fn spliced<'a>( let quote_span = Span::new(&combined); let (input, inner) = match func(quote_span) { - Ok((input, token)) => (input, token.owned()), + Ok(s) => s, Err(e) => { return match e { NE::Error(e) => { @@ -393,7 +333,7 @@ fn spliced<'a>( parent }; - Ok((out, inner.owned())) + Ok((out, inner)) } fn space(input: Span) -> IResult { @@ -404,7 +344,7 @@ fn space(input: Span) -> IResult { #[derive(Copy, Clone)] struct Matcher<'a, 'b, T: Clone> { matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult, T> + 'a), - collector: &'a (dyn Fn(&mut dyn Iterator) -> Token<'b> + 'a), + collector: &'a (dyn Fn(&mut dyn Iterator) -> Token + 'a), _phantom_closure: PhantomData<&'a ()>, _phantom_data: PhantomData<&'b ()>, _phantom_output: PhantomData T>, @@ -413,7 +353,7 @@ struct Matcher<'a, 'b, T: Clone> { impl<'a, 'b, T: Clone> Matcher<'a, 'b, T> { fn new( matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult, T> + 'a), - collector: &'a (dyn Fn(&mut dyn Iterator) -> Token<'b> + 'a), + collector: &'a (dyn Fn(&mut dyn Iterator) -> Token + 'a), ) -> Self { Self { matcher_inner, @@ -471,27 +411,27 @@ impl Context { #[inline] fn partial( &self, - func: impl for<'a> Fn(&Self, Span<'a>) -> IResult, Token<'a>> + 'static, - ) -> impl for<'a> Fn(Span<'a>) -> IResult, Token<'a>> + '_ { + func: impl for<'a> Fn(&Self, Span<'a>) -> IResult, Token> + 'static, + ) -> impl for<'a> Fn(Span<'a>) -> IResult, Token> + '_ { move |input| func(self, input) } - pub fn full<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + pub fn full<'a>(&self, input: Span<'a>) -> IResult, Token> { map(many1(self.partial(Self::full_single)), Token::Sequence)(input) } - pub fn inline<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + pub fn inline<'a>(&self, input: Span<'a>) -> IResult, Token> { map(many1(self.partial(Self::inline_single)), Token::Sequence)(input) } - pub fn inline_label_safe<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + pub fn 
inline_label_safe<'a>(&self, input: Span<'a>) -> IResult, Token> { map( many1(self.partial(Self::inline_label_safe_single)), Token::Sequence, )(input) } - fn base_bold_italic<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn base_bold_italic<'a>(&self, input: Span<'a>) -> IResult, Token> { alt(( self.partial(Self::tag_bold_italic_asterisk), self.partial(Self::tag_bold_italic_underscore), @@ -502,7 +442,7 @@ impl Context { ))(input) } - fn full_single<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn full_single<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, token) = alt(( self.partial(Self::unicode_emoji), alt(( @@ -527,12 +467,12 @@ impl Context { self.partial(Self::shortcode_emoji), self.partial(Self::link), self.partial(Self::raw_url), - self.partial(Self::text), + self.partial(Self::tag_raw_text), ))(input)?; Ok((input, token)) } - fn inline_single<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn inline_single<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, token) = alt(( self.partial(Self::unicode_emoji), self.partial(Self::tag_small), @@ -551,12 +491,12 @@ impl Context { self.partial(Self::shortcode_emoji), self.partial(Self::link), self.partial(Self::raw_url), - self.partial(Self::text), + self.partial(Self::tag_raw_text), ))(input)?; Ok((input, token)) } - fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, token) = alt(( self.partial(Self::unicode_emoji), self.partial(Self::url_no_embed), @@ -567,12 +507,12 @@ impl Context { self.partial(Self::tag_hashtag), self.partial(Self::shortcode_emoji), self.partial(Self::raw_url), - self.partial(Self::text), + self.partial(Self::tag_raw_text), ))(input)?; Ok((input, token)) } - fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult, Token> { let 
(input, token) = alt(( self.partial(Self::unicode_emoji), self.partial(Self::tag_small), @@ -584,12 +524,12 @@ impl Context { self.partial(Self::tag_strikethrough_tilde), self.partial(Self::tag_func), self.partial(Self::shortcode_emoji), - self.partial(Self::text), + self.partial(Self::tag_raw_text), ))(input)?; Ok((input, token)) } - fn tag_quote<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_quote<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?; if let (None, None) = leading_spaces { @@ -625,7 +565,7 @@ impl Context { Ok((input, Token::Quote(Box::new(inner)))) } - fn tag_block_center<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_block_center<'a>(&self, input: Span<'a>) -> IResult, Token> { let tag_start = &tag("
"); let tag_end = &tag("
"); @@ -649,7 +589,7 @@ impl Context { )) } - fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult, Token> { let delim = &tag("```"); let (input, _) = opt(line_ending)(input)?; @@ -688,7 +628,7 @@ impl Context { )) } - fn tag_block_math<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_block_math<'a>(&self, input: Span<'a>) -> IResult, Token> { let start = &tag("\\["); let end = &tag("\\]"); @@ -714,7 +654,7 @@ impl Context { Ok(( input, - Token::BlockMath(Cow::Borrowed(math_span.into_fragment())), + Token::BlockMath(math_span.into_fragment().to_string()), )) } @@ -726,7 +666,7 @@ impl Context { escape: bool, matcher: Matcher<'a, 'b, T>, fallback: Matcher<'a, 'b, S>, - ) -> impl Fn(Span<'b>) -> IResult, Token<'b>> + '_ + ) -> impl Fn(Span<'b>) -> IResult, Token> + '_ where FOpen: Fn(Span<'b>) -> IResult, Span<'b>> + 'a, FClose: Fn(Span<'b>) -> IResult, Span<'b>> + 'a, @@ -739,7 +679,7 @@ impl Context { if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), &opening_tag))(input) { return Ok(( input_escaped, - Token::PlainText(Cow::Borrowed(mark.fragment())), + Token::PlainText(mark.fragment().to_string().into()), )); } } @@ -814,7 +754,7 @@ impl Context { input, Token::Sequence(vec![ Token::PlainText(begin.fragment_between(&post_open).into()), - inner_tok.inner().owned(), + inner_tok.inner(), Token::PlainText(closing.into_fragment().into()), ]), )); @@ -823,7 +763,7 @@ impl Context { } } - fn tag_func<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_func<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, _) = tag("$[")(input)?; let func_ident = |input| { @@ -852,8 +792,8 @@ impl Context { .into_iter() .map(|(k, v)| { ( - Cow::from(k.into_fragment()), - v.map(|(_, val)| Cow::from(val.into_fragment())), + k.into_fragment().to_string(), + v.map(|(_, val)| val.into_fragment().to_string()), ) }) .collect::>() @@ -866,14 +806,14 @@ impl Context { Ok(( input, 
Token::Function { - name: Cow::from(func_name), + name: func_name.to_string(), params: args_out, inner: Box::new(Token::Sequence(inner)), }, )) } - fn tag_plain<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_plain<'a>(&self, input: Span<'a>) -> IResult, Token> { let opening_tag = &tag(""); let closing_tag = &tag(""); @@ -887,7 +827,7 @@ impl Context { Ok((input, Token::PlainTag(text.into()))) } - fn tag_small<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_small<'a>(&self, input: Span<'a>) -> IResult, Token> { self.tag_delimited( tag_no_case(""), tag_no_case(""), @@ -903,7 +843,7 @@ impl Context { )(input) } - fn tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token> { self.tag_delimited( (tag("***"), FlankingRule::Lenient), (tag("***"), FlankingRule::Lenient), @@ -919,7 +859,7 @@ impl Context { )(input) } - fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult, Token> { self.tag_delimited( (tag("___"), FlankingRule::Strict), (tag("___"), FlankingRule::Strict), @@ -935,7 +875,7 @@ impl Context { )(input) } - fn tag_bold<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_bold<'a>(&self, input: Span<'a>) -> IResult, Token> { self.tag_delimited( tag_no_case(""), tag_no_case(""), @@ -951,7 +891,7 @@ impl Context { )(input) } - fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token> { self.tag_delimited( (tag("**"), FlankingRule::Lenient), (tag("**"), FlankingRule::Lenient), @@ -967,7 +907,7 @@ impl Context { )(input) } - fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult, Token> { self.tag_delimited( (tag("__"), FlankingRule::Strict), (tag("__"), 
FlankingRule::Strict), @@ -983,7 +923,7 @@ impl Context { )(input) } - fn tag_italic<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_italic<'a>(&self, input: Span<'a>) -> IResult, Token> { self.tag_delimited( tag_no_case(""), tag_no_case(""), @@ -999,7 +939,7 @@ impl Context { )(input) } - fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token> { self.tag_delimited( (tag("*"), FlankingRule::Lenient), (tag("*"), FlankingRule::Lenient), @@ -1015,7 +955,7 @@ impl Context { )(input) } - fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult, Token> { self.tag_delimited( (tag("_"), FlankingRule::Strict), (tag("_"), FlankingRule::Strict), @@ -1031,7 +971,7 @@ impl Context { )(input) } - fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult, Token> { self.tag_delimited( tag_no_case(""), tag_no_case(""), @@ -1047,7 +987,7 @@ impl Context { )(input) } - fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult, Token> { self.tag_delimited( (tag("~~"), FlankingRule::Lenient), (tag("~~"), FlankingRule::Lenient), @@ -1076,7 +1016,7 @@ impl Context { )(input) } - fn tag_inline_code<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_inline_code<'a>(&self, input: Span<'a>) -> IResult, Token> { self.tag_delimited( tag("`"), |input| alt((tag("`"), tag("´")))(input), @@ -1094,7 +1034,7 @@ impl Context { )(input) } - fn tag_inline_math<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_inline_math<'a>(&self, input: Span<'a>) -> IResult, Token> { self.tag_delimited( tag("\\("), tag("\\)"), @@ -1109,12 +1049,12 @@ impl Context { )(input) } - fn text<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { - 
let (input, text) = map(recognize(anychar), Span::into_fragment)(input)?; - Ok((input, Token::PlainText(text.into()))) + fn tag_raw_text<'a>(&self, input: Span<'a>) -> IResult, Token> { + let (input, text) = anychar(input)?; + Ok((input, Token::PlainText(text.to_compact_string()))) } - fn raw_url<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn raw_url<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, url_span) = recognize(tuple(( protocol, url_chars(|input| not(url_chars_base)(input), false), @@ -1130,21 +1070,21 @@ impl Context { url }; - Ok((input, Token::UrlRaw(Cow::from(final_url)))) + Ok((input, Token::UrlRaw(final_url.to_string()))) } - fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, _) = tag("<")(input)?; let (input, url_span) = recognize(tuple((protocol, url_chars(tag(">"), true))))(input)?; let (input, _) = tag(">")(input)?; Ok(( input, - Token::UrlNoEmbed(Cow::from(url_span.into_fragment())), + Token::UrlNoEmbed(url_span.into_fragment().to_string()), )) } - fn link<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn link<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, no_embed) = opt(tag("?"))(input)?; let (input, _) = tag("[")(input)?; let (input, _) = not(tag("["))(input)?; @@ -1163,7 +1103,7 @@ impl Context { )) } - fn unicode_emoji<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn unicode_emoji<'a>(&self, input: Span<'a>) -> IResult, Token> { let frag = input.fragment(); let Some(grapheme) = frag.graphemes(true).next() else { return fail(input); @@ -1183,7 +1123,7 @@ impl Context { )) } - fn shortcode_emoji<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn shortcode_emoji<'a>(&self, input: Span<'a>) -> IResult, Token> { if let (plain_out, Some(plain)) = map( opt(recognize(tuple(( alphanumeric1_unicode, @@ -1209,7 +1149,7 @@ impl Context { Ok((input, Token::ShortcodeEmoji(shortcode.into()))) } - fn 
tag_mention<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_mention<'a>(&self, input: Span<'a>) -> IResult, Token> { if let (plain_out, Some(plain)) = map( opt(recognize(tuple(( alt((tag("\\"), alphanumeric1_unicode)), @@ -1257,7 +1197,7 @@ impl Context { )) } - fn tag_hashtag<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { + fn tag_hashtag<'a>(&self, input: Span<'a>) -> IResult, Token> { // TODO: Skip when preceded by alphanumerics let (input, _) = tag("#")(input)?; @@ -1319,11 +1259,10 @@ fn url_chars<'a, T: 'a>( mod test { use crate::{url_chars, Context, Span, Token}; use nom::bytes::complete::tag; - use std::borrow::Cow; use std::collections::HashMap; fn parse_full(string: &str) -> Token { - Context.full(Span::new(string)).unwrap().1.merged().owned() + Context.full(Span::new(string)).unwrap().1.merged() } #[test] @@ -1795,7 +1734,7 @@ text"# Token::Sequence( vec!["🥺", "💜", "❤️", "🦊"] .into_iter() - .map(<&str as Into>>::into) + .map(str::to_string) .map(Token::UnicodeEmoji) .collect::>() ) From 86d5c87e9a20a05583d831bfef65fe6bcf6413bc Mon Sep 17 00:00:00 2001 From: Natty Date: Mon, 16 Oct 2023 23:45:45 +0200 Subject: [PATCH 21/23] MMM: Nesting-limited parsing --- Cargo.lock | 1 + magnetar_mmm_parser/Cargo.toml | 1 + magnetar_mmm_parser/src/lib.rs | 438 +++++++++++++++++++++++---------- 3 files changed, 313 insertions(+), 127 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cb3905d..35e50cd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1649,6 +1649,7 @@ dependencies = [ "emojis", "nom", "nom_locate", + "tracing", "unicode-segmentation", ] diff --git a/magnetar_mmm_parser/Cargo.toml b/magnetar_mmm_parser/Cargo.toml index 14e36f7..d7b9b2d 100644 --- a/magnetar_mmm_parser/Cargo.toml +++ b/magnetar_mmm_parser/Cargo.toml @@ -10,4 +10,5 @@ emojis = { workspace = true } nom = { workspace = true } nom_locate = { workspace = true } compact_str = { workspace = true } +tracing = { workspace = true } unicode-segmentation = { workspace = true } diff 
--git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index b940145..2f76532 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -7,14 +7,15 @@ use nom::character::complete::{ satisfy, space1, tab, }; use nom::combinator::{eof, fail, map, not, opt, recognize}; -use nom::error::ErrorKind; +use nom::error::{ErrorKind, ParseError}; use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1}; use nom::sequence::tuple; -use nom::{IResult, Offset, Slice}; +use nom::{IResult, Offset, Parser, Slice}; use nom_locate::LocatedSpan; use std::collections::HashMap; use std::convert::{identity, Infallible}; use std::marker::PhantomData; +use tracing::trace; use unicode_segmentation::UnicodeSegmentation; #[derive(Copy, Clone, Debug, Eq, PartialEq)] @@ -217,7 +218,18 @@ impl Token { } } -type Span<'a> = LocatedSpan<&'a str>; +#[derive(Debug, Default, Copy, Clone)] +pub struct SpanMeta { + depth: usize, +} + +impl SpanMeta { + fn new(depth: usize) -> Self { + Self { depth } + } +} + +type Span<'a> = LocatedSpan<&'a str, SpanMeta>; trait SliceOffset { fn up_to(&self, other: &Self) -> Self; @@ -300,7 +312,10 @@ fn spliced<'a>( type NE = nom::Err; type NomError<'x> = nom::error::Error>; - let quote_span = Span::new(&combined); + let quote_span = Span::new_extra( + &combined, + segments.first().map_or(SpanMeta::new(0), |s| s.extra), + ); let (input, inner) = match func(quote_span) { Ok(s) => s, Err(e) => { @@ -311,7 +326,10 @@ fn spliced<'a>( let offset = offset_new - offset_seg_new; let offset_orig = offset + seg_parent.location_offset(); Err(NE::Error(NomError::new( - Span::new(&parent.into_fragment()[offset_orig..]), + Span::new_extra( + &parent.into_fragment()[offset_orig..], + seg_parent.extra, + ), e.code, ))) } else { @@ -405,9 +423,53 @@ impl<'a, T: Fn(Span<'a>) -> IResult, Span<'a>>> From for FlankingDel } } -pub struct Context; +pub struct Context { + depth_limit: usize, +} + +const DEFAULT_DEPTH_LIMIT: 
usize = 24; + +impl Default for Context { + fn default() -> Self { + Context::new(DEFAULT_DEPTH_LIMIT) + } +} impl Context { + pub fn new(depth_limit: usize) -> Self { + Self { depth_limit } + } + + pub fn parse_full(&self, input: &str) -> Token { + match self.full(Span::new_extra(input, SpanMeta::default())) { + Ok((_, t)) => t.merged(), + Err(e) => { + trace!(input = input, "Full parser fail: {:?}", e); + Token::PlainText(e.to_compact_string()) + } + } + } + + pub fn parse_inline(&self, input: &str) -> Token { + match self.full(Span::new_extra(input, SpanMeta::default())) { + Ok((_, t)) => t.merged(), + Err(e) => { + trace!(input = input, "Inline parser fail: {:?}", e); + Token::PlainText(e.to_compact_string()) + } + } + } + + pub fn parse_ui(&self, input: &str) -> Token { + match self.inline_ui(Span::new_extra(input, SpanMeta::default())) { + Ok((_, t)) => t.merged(), + Err(e) => { + trace!(input = input, "Inline parser fail: {:?}", e); + Token::PlainText(e.to_compact_string()) + } + } + } + #[inline] fn partial( &self, @@ -416,6 +478,14 @@ impl Context { move |input| func(self, input) } + #[inline] + fn partial_span( + &self, + func: impl for<'a> Fn(&Self, Span<'a>) -> IResult, Span<'a>> + 'static, + ) -> impl for<'a> Fn(Span<'a>) -> IResult, Span<'a>> + '_ { + move |input| func(self, input) + } + pub fn full<'a>(&self, input: Span<'a>) -> IResult, Token> { map(many1(self.partial(Self::full_single)), Token::Sequence)(input) } @@ -431,6 +501,17 @@ impl Context { )(input) } + fn inline_ui<'a>(&self, input: Span<'a>) -> IResult, Token> { + map( + many1(alt(( + self.partial(Self::unicode_emoji), + self.partial(Self::shortcode_emoji), + self.partial(Self::tag_raw_text), + ))), + Token::Sequence, + )(input) + } + fn base_bold_italic<'a>(&self, input: Span<'a>) -> IResult, Token> { alt(( self.partial(Self::tag_bold_italic_asterisk), @@ -444,69 +525,72 @@ impl Context { fn full_single<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, token) = alt(( - 
self.partial(Self::unicode_emoji), - alt(( + self.increase_nesting(alt(( + self.partial(Self::unicode_emoji), self.partial(Self::tag_block_center), self.partial(Self::tag_small), self.partial(Self::tag_plain), self.partial(Self::tag_bold), self.partial(Self::tag_italic), self.partial(Self::tag_strikethrough), - )), - self.partial(Self::url_no_embed), - self.partial(Self::base_bold_italic), - self.partial(Self::tag_block_code), - self.partial(Self::tag_inline_code), - self.partial(Self::tag_quote), - self.partial(Self::tag_block_math), - self.partial(Self::tag_inline_math), - self.partial(Self::tag_strikethrough_tilde), - self.partial(Self::tag_func), - self.partial(Self::tag_mention), - self.partial(Self::tag_hashtag), - self.partial(Self::shortcode_emoji), - self.partial(Self::link), - self.partial(Self::raw_url), + self.partial(Self::url_no_embed), + self.partial(Self::base_bold_italic), + self.partial(Self::tag_block_code), + self.partial(Self::tag_inline_code), + self.partial(Self::tag_quote), + self.partial(Self::tag_block_math), + self.partial(Self::tag_inline_math), + self.partial(Self::tag_strikethrough_tilde), + self.partial(Self::tag_func), + self.partial(Self::tag_mention), + self.partial(Self::tag_hashtag), + self.partial(Self::shortcode_emoji), + self.partial(Self::link), + self.partial(Self::raw_url), + ))), self.partial(Self::tag_raw_text), ))(input)?; Ok((input, token)) } fn inline_single<'a>(&self, input: Span<'a>) -> IResult, Token> { - let (input, token) = alt(( - self.partial(Self::unicode_emoji), - self.partial(Self::tag_small), - self.partial(Self::tag_plain), - self.partial(Self::tag_bold), - self.partial(Self::tag_italic), - self.partial(Self::tag_strikethrough), - self.partial(Self::url_no_embed), - self.partial(Self::base_bold_italic), - self.partial(Self::tag_inline_code), - self.partial(Self::tag_inline_math), - self.partial(Self::tag_strikethrough_tilde), - self.partial(Self::tag_func), - self.partial(Self::tag_mention), - 
self.partial(Self::tag_hashtag), - self.partial(Self::shortcode_emoji), - self.partial(Self::link), - self.partial(Self::raw_url), + alt(( + self.increase_nesting(alt(( + self.partial(Self::unicode_emoji), + self.partial(Self::tag_small), + self.partial(Self::tag_plain), + self.partial(Self::tag_bold), + self.partial(Self::tag_italic), + self.partial(Self::tag_strikethrough), + self.partial(Self::url_no_embed), + self.partial(Self::base_bold_italic), + self.partial(Self::tag_inline_code), + self.partial(Self::tag_inline_math), + self.partial(Self::tag_strikethrough_tilde), + self.partial(Self::tag_func), + self.partial(Self::tag_mention), + self.partial(Self::tag_hashtag), + self.partial(Self::shortcode_emoji), + self.partial(Self::link), + self.partial(Self::raw_url), + ))), self.partial(Self::tag_raw_text), - ))(input)?; - Ok((input, token)) + ))(input) } fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, token) = alt(( - self.partial(Self::unicode_emoji), - self.partial(Self::url_no_embed), - self.partial(Self::tag_inline_code), - self.partial(Self::tag_inline_math), - self.partial(Self::tag_func), - self.partial(Self::tag_mention), - self.partial(Self::tag_hashtag), - self.partial(Self::shortcode_emoji), - self.partial(Self::raw_url), + self.increase_nesting(alt(( + self.partial(Self::unicode_emoji), + self.partial(Self::url_no_embed), + self.partial(Self::tag_inline_code), + self.partial(Self::tag_inline_math), + self.partial(Self::tag_func), + self.partial(Self::tag_mention), + self.partial(Self::tag_hashtag), + self.partial(Self::shortcode_emoji), + self.partial(Self::raw_url), + ))), self.partial(Self::tag_raw_text), ))(input)?; Ok((input, token)) @@ -514,16 +598,18 @@ impl Context { fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, token) = alt(( - self.partial(Self::unicode_emoji), - self.partial(Self::tag_small), - self.partial(Self::tag_plain), - 
self.partial(Self::tag_bold), - self.partial(Self::tag_italic), - self.partial(Self::tag_strikethrough), - self.partial(Self::base_bold_italic), - self.partial(Self::tag_strikethrough_tilde), - self.partial(Self::tag_func), - self.partial(Self::shortcode_emoji), + self.increase_nesting(alt(( + self.partial(Self::unicode_emoji), + self.partial(Self::tag_small), + self.partial(Self::tag_plain), + self.partial(Self::tag_bold), + self.partial(Self::tag_italic), + self.partial(Self::tag_strikethrough), + self.partial(Self::base_bold_italic), + self.partial(Self::tag_strikethrough_tilde), + self.partial(Self::tag_func), + self.partial(Self::shortcode_emoji), + ))), self.partial(Self::tag_raw_text), ))(input)?; Ok((input, token)) @@ -1056,8 +1142,11 @@ impl Context { fn raw_url<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, url_span) = recognize(tuple(( - protocol, - url_chars(|input| not(url_chars_base)(input), false), + self.partial_span(Self::protocol), + self.url_chars( + |input| recognize(not(self.partial_span(Self::url_chars_base)))(input), + false, + ), )))(input)?; let url = url_span.into_fragment(); @@ -1075,7 +1164,10 @@ impl Context { fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, _) = tag("<")(input)?; - let (input, url_span) = recognize(tuple((protocol, url_chars(tag(">"), true))))(input)?; + let (input, url_span) = recognize(tuple(( + self.partial_span(Self::protocol), + self.url_chars(tag(">"), true), + )))(input)?; let (input, _) = tag(">")(input)?; Ok(( @@ -1090,7 +1182,10 @@ impl Context { let (input, _) = not(tag("["))(input)?; let (input, (label_tok, _)) = many_till(self.partial(Self::inline_label_safe_single), tag("]("))(input)?; - let (input, url_span) = recognize(tuple((protocol, url_chars(tag(")"), true))))(input)?; + let (input, url_span) = recognize(tuple(( + self.partial_span(Self::protocol), + self.url_chars(tag(")"), true), + )))(input)?; let (input, _) = tag(")")(input)?; Ok(( @@ -1202,74 +1297,136 
@@ impl Context { let (input, _) = tag("#")(input)?; - let (input, hashtag_text) = - map(recognize(many1(hashtag_chars)), Span::into_fragment)(input)?; + let (input, hashtag_text) = map( + recognize(many1(self.partial_span(Self::hashtag_chars))), + Span::into_fragment, + )(input)?; Ok((input, Token::Hashtag(hashtag_text.into()))) } -} -#[inline] -fn hashtag_chars(input: Span) -> IResult { - recognize(alt(( - recognize(tuple((tag("("), hashtag_chars, tag(")")))), - recognize(tuple((tag("["), hashtag_chars, tag("]")))), - recognize(tuple((tag("「"), hashtag_chars, tag("」")))), - recognize(tuple((tag("("), hashtag_chars, tag(")")))), - recognize(tuple(( - not(space1), - not_line_ending, - not(one_of(".,:;!?#?/[]【】()「」()<>")), - anychar, - ))), - )))(input) -} + #[inline] + fn increase_nesting<'a, 'b, O, F>( + &'b self, + mut func: F, + ) -> impl FnMut(Span<'a>) -> IResult, O> + 'b + where + F: Parser, O, nom::error::Error>> + 'b, + { + move |mut input| { + if input.extra.depth >= self.depth_limit { + return fail(input); + } -#[inline] -fn protocol(input: Span) -> IResult { - alt((tag("https://"), tag("http://")))(input) -} + input.extra.depth += 1; + func.parse(input) + } + } -#[inline] -fn url_chars_base(input: Span) -> IResult { - alt(( - alphanumeric1_unicode, - recognize(tuple((tag("["), many_till(url_chars_base, tag("]"))))), - recognize(tuple((tag("("), many_till(url_chars_base, tag(")"))))), - recognize(one_of(".,_/:%#$&?!~=+-@")), - ))(input) -} + #[inline] + fn hashtag_chars<'a>(&self, input: Span<'a>) -> IResult, Span<'a>> { + recognize(alt(( + recognize(tuple(( + tag("("), + self.increase_nesting(self.partial_span(Self::hashtag_chars)), + tag(")"), + ))), + recognize(tuple(( + tag("["), + self.increase_nesting(self.partial_span(Self::hashtag_chars)), + tag("]"), + ))), + recognize(tuple(( + tag("「"), + self.increase_nesting(self.partial_span(Self::hashtag_chars)), + tag("」"), + ))), + recognize(tuple(( + tag("("), + 
self.increase_nesting(self.partial_span(Self::hashtag_chars)), + tag(")"), + ))), + recognize(tuple(( + not(space1), + not_line_ending, + not(one_of(".,:;!?#?/[]【】()「」()<>")), + anychar, + ))), + )))(input) + } -#[inline] -fn url_chars<'a, T: 'a>( - terminator: impl Fn(Span<'a>) -> IResult, T> + 'a, - spaces: bool, -) -> impl FnMut(Span<'a>) -> IResult, Span<'a>> + 'a { - let chars = tuple(( - not(tuple((space1, eof))), - not(tuple((space1, tag("\"")))), - not(tuple((opt(space1), terminator))), - alt((url_chars_base, if spaces { space1 } else { fail })), - )); + #[inline] + fn protocol<'a>(&self, input: Span<'a>) -> IResult, Span<'a>> { + alt((tag("https://"), tag("http://")))(input) + } - recognize(many1_count(chars)) + #[inline] + fn url_chars_base<'a>(&self, input: Span<'a>) -> IResult, Span<'a>> { + alt(( + alphanumeric1_unicode, + recognize(tuple(( + tag("["), + many_till( + self.increase_nesting(self.partial_span(Self::url_chars_base)), + tag("]"), + ), + ))), + recognize(tuple(( + tag("("), + many_till( + self.increase_nesting(self.partial_span(Self::url_chars_base)), + tag(")"), + ), + ))), + recognize(one_of(".,_/:%#$&?!~=+-@")), + ))(input) + } + + #[inline] + fn url_chars<'a, 'b, F>( + &'b self, + mut terminator: F, + spaces: bool, + ) -> impl FnMut(Span<'a>) -> IResult, Span<'a>> + 'b + where + F: Parser, Span<'a>, nom::error::Error>> + 'b, + { + move |input| { + recognize(many1_count(tuple(( + not(tuple((space1, eof))), + not(tuple((space1, tag("\"")))), + not(tuple((opt(space1), |input| terminator.parse(input)))), + alt(( + |input| self.url_chars_base(input), + if spaces { space1 } else { fail }, + )), + ))))(input) + } + } } #[cfg(test)] mod test { - use crate::{url_chars, Context, Span, Token}; + use crate::{Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT}; use nom::bytes::complete::tag; use std::collections::HashMap; fn parse_full(string: &str) -> Token { - Context.full(Span::new(string)).unwrap().1.merged() + Context::default() + 
.full(Span::new_extra(string, SpanMeta::default())) + .unwrap() + .1 + .merged() } #[test] fn parse_url_chars() { + let ctx = Context::default(); + assert_eq!( - url_chars(tag(")"), true)(Span::new( - "https://en.wikipedia.org/wiki/Sandbox_(computer_security))" + ctx.url_chars(tag(")"), true)(Span::new_extra( + "https://en.wikipedia.org/wiki/Sandbox_(computer_security))", + SpanMeta::default() )) .unwrap() .1 @@ -1278,8 +1435,9 @@ mod test { ); assert_eq!( - url_chars(tag(")"), true)(Span::new( - "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))" + ctx.url_chars(tag(")"), true)(Span::new_extra( + "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))", + SpanMeta::default() )) .unwrap() .1 @@ -1288,26 +1446,35 @@ mod test { ); assert_eq!( - url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among_Us ")) - .unwrap() - .1 - .into_fragment(), + ctx.url_chars(tag(")"), true)(Span::new_extra( + "https://cs.wikipedia.org/wiki/Among_Us ", + SpanMeta::default() + )) + .unwrap() + .1 + .into_fragment(), "https://cs.wikipedia.org/wiki/Among_Us", ); assert_eq!( - url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among Us )")) - .unwrap() - .1 - .into_fragment(), + ctx.url_chars(tag(")"), true)(Span::new_extra( + "https://cs.wikipedia.org/wiki/Among Us )", + SpanMeta::default() + )) + .unwrap() + .1 + .into_fragment(), "https://cs.wikipedia.org/wiki/Among Us" ); assert_eq!( - url_chars(tag(")"), false)(Span::new("https://en.wikipedia.org/wiki/Among Us )")) - .unwrap() - .1 - .into_fragment(), + ctx.url_chars(tag(")"), false)(Span::new_extra( + "https://en.wikipedia.org/wiki/Among Us )", + SpanMeta::default() + )) + .unwrap() + .1 + .into_fragment(), "https://en.wikipedia.org/wiki/Among" ); } @@ -1593,6 +1760,23 @@ text"# ); } + #[test] + fn limit_nesting() { + let mut tok = Token::PlainText(" test ".into()); + for _ in 0..DEFAULT_DEPTH_LIMIT { + tok = Token::Bold(Box::new(tok)); + } + + assert_eq!( + parse_full( + 
&("".repeat(DEFAULT_DEPTH_LIMIT) + + " test " + + &*"".repeat(DEFAULT_DEPTH_LIMIT)) + ), + tok + ); + } + #[test] fn parse_mention() { assert_eq!( From 42fa83c6e248070cc8cd31ef03ce9310b1b87410 Mon Sep 17 00:00:00 2001 From: Natty Date: Mon, 23 Oct 2023 23:52:02 +0200 Subject: [PATCH 22/23] MMM: Fixed hashtag parsing --- magnetar_mmm_parser/src/lib.rs | 35 ++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 2f76532..4806587 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -7,7 +7,7 @@ use nom::character::complete::{ satisfy, space1, tab, }; use nom::combinator::{eof, fail, map, not, opt, recognize}; -use nom::error::{ErrorKind, ParseError}; +use nom::error::ErrorKind; use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1}; use nom::sequence::tuple; use nom::{IResult, Offset, Parser, Slice}; @@ -277,6 +277,14 @@ fn alpha1_unicode(input: Span) -> IResult { recognize(many1_count(satisfy(char::is_alphanumeric)))(input) } +#[inline] +fn space1_unicode(input: Span) -> IResult { + recognize(many1_count(tuple(( + not(line_ending), + satisfy(char::is_whitespace), + ))))(input) +} + #[inline] fn alphanumeric1_unicode(input: Span) -> IResult { recognize(many1_count(satisfy(char::is_alphanumeric)))(input) @@ -1293,7 +1301,12 @@ impl Context { } fn tag_hashtag<'a>(&self, input: Span<'a>) -> IResult, Token> { - // TODO: Skip when preceded by alphanumerics + let (input, maybe_preceded) = + opt(recognize(tuple((alphanumeric1_unicode, tag("#")))))(input)?; + + if let Some(preceded) = maybe_preceded { + return Ok((input, Token::PlainText(preceded.into_fragment().into()))); + } let (input, _) = tag("#")(input)?; @@ -1347,8 +1360,8 @@ impl Context { tag(")"), ))), recognize(tuple(( - not(space1), - not_line_ending, + not(space1_unicode), + not(line_ending), not(one_of(".,:;!?#?/[]【】()「」()<>")), anychar, ))), 
@@ -1699,6 +1712,20 @@ text"# ]) ); + assert_eq!( + parse_full("test #hashtag tail"), + Token::Sequence(vec![ + Token::PlainText("test ".into()), + Token::Hashtag("hashtag".into()), + Token::PlainText(" tail".into()) + ]) + ); + + assert_eq!( + parse_full("not#hashtag tail"), + Token::PlainText("not#hashtag tail".into()) + ); + assert_eq!( parse_full(""), Token::UrlNoEmbed("https://example.com".into()) From c4a8ebebf38a766bfc41f9b303d3bb090140ea3c Mon Sep 17 00:00:00 2001 From: Natty Date: Tue, 24 Oct 2023 00:27:54 +0200 Subject: [PATCH 23/23] MMM: Janky outer flanking rules implementation --- magnetar_mmm_parser/src/lib.rs | 37 ++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 4806587..26661e6 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -6,7 +6,7 @@ use nom::character::complete::{ alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of, satisfy, space1, tab, }; -use nom::combinator::{eof, fail, map, not, opt, recognize}; +use nom::combinator::{eof, fail, map, not, opt, peek, recognize}; use nom::error::ErrorKind; use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1}; use nom::sequence::tuple; @@ -272,11 +272,6 @@ fn collect_char_sequence( move |chars| func(chars.collect()) } -#[inline] -fn alpha1_unicode(input: Span) -> IResult { - recognize(many1_count(satisfy(char::is_alphanumeric)))(input) -} - #[inline] fn space1_unicode(input: Span) -> IResult { recognize(many1_count(tuple(( @@ -778,6 +773,14 @@ impl Context { } } + if let FlankingRule::Strict = opening_rule { + let (input, pre) = + opt(recognize(tuple((alphanumeric1_unicode, &opening_tag))))(input)?; + if let Some(pre_text) = pre { + return Ok((input, Token::PlainText(pre_text.into_fragment().into()))); + } + } + let begin = input; let (post_open, _) = opening_tag(input)?; @@ -840,8 +843,12 @@ 
impl Context { true }; - // TODO: Unfinished flanking rules - let correct_flanking = correct_left_flanking && correct_right_flanking; + let (input, alphanum) = opt(peek(alphanumeric1_unicode))(input)?; + let correct_right_outer = + alphanum.is_none() || !matches!(closing_rule, FlankingRule::Strict); + + let correct_flanking = + correct_left_flanking && correct_right_flanking && correct_right_outer; if !correct_flanking { return Ok(( @@ -1514,6 +1521,20 @@ mod test { Token::PlainText("* italic *".into()) ); + assert_eq!( + parse_full("snake_case_variable"), + Token::PlainText("snake_case_variable".into()) + ); + + assert_eq!( + parse_full("intra*word*italic"), + Token::Sequence(vec![ + Token::PlainText("intra".into()), + Token::Italic(Box::new(Token::PlainText("word".into()))), + Token::PlainText("italic".into()) + ]) + ); + assert_eq!( parse_full(r#"_ italic *"#), Token::PlainText("_ italic *".into())