From 86d5c87e9a20a05583d831bfef65fe6bcf6413bc Mon Sep 17 00:00:00 2001 From: Natty Date: Mon, 16 Oct 2023 23:45:45 +0200 Subject: [PATCH] MMM: Nesting-limited parsing --- Cargo.lock | 1 + magnetar_mmm_parser/Cargo.toml | 1 + magnetar_mmm_parser/src/lib.rs | 438 +++++++++++++++++++++++---------- 3 files changed, 313 insertions(+), 127 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cb3905d..35e50cd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1649,6 +1649,7 @@ dependencies = [ "emojis", "nom", "nom_locate", + "tracing", "unicode-segmentation", ] diff --git a/magnetar_mmm_parser/Cargo.toml b/magnetar_mmm_parser/Cargo.toml index 14e36f7..d7b9b2d 100644 --- a/magnetar_mmm_parser/Cargo.toml +++ b/magnetar_mmm_parser/Cargo.toml @@ -10,4 +10,5 @@ emojis = { workspace = true } nom = { workspace = true } nom_locate = { workspace = true } compact_str = { workspace = true } +tracing = { workspace = true } unicode-segmentation = { workspace = true } diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index b940145..2f76532 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -7,14 +7,15 @@ use nom::character::complete::{ satisfy, space1, tab, }; use nom::combinator::{eof, fail, map, not, opt, recognize}; -use nom::error::ErrorKind; +use nom::error::{ErrorKind, ParseError}; use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1}; use nom::sequence::tuple; -use nom::{IResult, Offset, Slice}; +use nom::{IResult, Offset, Parser, Slice}; use nom_locate::LocatedSpan; use std::collections::HashMap; use std::convert::{identity, Infallible}; use std::marker::PhantomData; +use tracing::trace; use unicode_segmentation::UnicodeSegmentation; #[derive(Copy, Clone, Debug, Eq, PartialEq)] @@ -217,7 +218,18 @@ impl Token { } } -type Span<'a> = LocatedSpan<&'a str>; +#[derive(Debug, Default, Copy, Clone)] +pub struct SpanMeta { + depth: usize, +} + +impl SpanMeta { + fn new(depth: usize) -> Self { + Self { depth } + } +} + +type Span<'a> = LocatedSpan<&'a str, SpanMeta>; trait SliceOffset { fn up_to(&self, other: &Self) -> Self; @@ -300,7 +312,10 @@ fn spliced<'a>( type NE = nom::Err; type NomError<'x> = nom::error::Error>; - let quote_span = Span::new(&combined); + let quote_span = Span::new_extra( + &combined, + segments.first().map_or(SpanMeta::new(0), |s| s.extra), + ); let (input, inner) = match func(quote_span) { Ok(s) => s, Err(e) => { @@ -311,7 +326,10 @@ fn spliced<'a>( let offset = offset_new - offset_seg_new; let offset_orig = offset + seg_parent.location_offset(); Err(NE::Error(NomError::new( - Span::new(&parent.into_fragment()[offset_orig..]), + Span::new_extra( + &parent.into_fragment()[offset_orig..], + seg_parent.extra, + ), e.code, ))) } else { @@ -405,9 +423,53 @@ impl<'a, T: Fn(Span<'a>) -> IResult, Span<'a>>> From for FlankingDel } } -pub struct Context; +pub struct Context { + depth_limit: usize, +} + +const DEFAULT_DEPTH_LIMIT: usize = 24; + +impl Default for Context { + fn default() -> Self { + Context::new(DEFAULT_DEPTH_LIMIT) + } +} impl Context { + pub fn new(depth_limit: usize) -> Self { + Self { depth_limit } + } + + pub fn parse_full(&self, input: &str) -> Token { + match self.full(Span::new_extra(input, SpanMeta::default())) { + Ok((_, t)) => t.merged(), + Err(e) => { + trace!(input = input, "Full parser fail: {:?}", e); + Token::PlainText(e.to_compact_string()) + } + } + } + + pub fn parse_inline(&self, input: &str) -> Token { + match self.full(Span::new_extra(input, SpanMeta::default())) { + Ok((_, t)) => t.merged(), + Err(e) => { + trace!(input = input, "Inline parser fail: {:?}", e); + Token::PlainText(e.to_compact_string()) + } + } + } + + pub fn parse_ui(&self, input: &str) -> Token { + match self.inline_ui(Span::new_extra(input, SpanMeta::default())) { + Ok((_, t)) => t.merged(), + Err(e) => { + trace!(input = input, "Inline parser fail: {:?}", e); + Token::PlainText(e.to_compact_string()) + } + } + } + #[inline] fn partial( &self, @@ -416,6 +478,14 @@ impl Context { move |input| func(self, input) } + #[inline] + fn partial_span( + &self, + func: impl for<'a> Fn(&Self, Span<'a>) -> IResult, Span<'a>> + 'static, + ) -> impl for<'a> Fn(Span<'a>) -> IResult, Span<'a>> + '_ { + move |input| func(self, input) + } + pub fn full<'a>(&self, input: Span<'a>) -> IResult, Token> { map(many1(self.partial(Self::full_single)), Token::Sequence)(input) } @@ -431,6 +501,17 @@ impl Context { )(input) } + fn inline_ui<'a>(&self, input: Span<'a>) -> IResult, Token> { + map( + many1(alt(( + self.partial(Self::unicode_emoji), + self.partial(Self::shortcode_emoji), + self.partial(Self::tag_raw_text), + ))), + Token::Sequence, + )(input) + } + fn base_bold_italic<'a>(&self, input: Span<'a>) -> IResult, Token> { alt(( self.partial(Self::tag_bold_italic_asterisk), @@ -444,69 +525,72 @@ impl Context { fn full_single<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, token) = alt(( - self.partial(Self::unicode_emoji), - alt(( + self.increase_nesting(alt(( + self.partial(Self::unicode_emoji), self.partial(Self::tag_block_center), self.partial(Self::tag_small), self.partial(Self::tag_plain), self.partial(Self::tag_bold), self.partial(Self::tag_italic), self.partial(Self::tag_strikethrough), - )), - self.partial(Self::url_no_embed), - self.partial(Self::base_bold_italic), - self.partial(Self::tag_block_code), - self.partial(Self::tag_inline_code), - self.partial(Self::tag_quote), - self.partial(Self::tag_block_math), - self.partial(Self::tag_inline_math), - self.partial(Self::tag_strikethrough_tilde), - self.partial(Self::tag_func), - self.partial(Self::tag_mention), - self.partial(Self::tag_hashtag), - self.partial(Self::shortcode_emoji), - self.partial(Self::link), - self.partial(Self::raw_url), + self.partial(Self::url_no_embed), + self.partial(Self::base_bold_italic), + self.partial(Self::tag_block_code), + self.partial(Self::tag_inline_code), + self.partial(Self::tag_quote), + self.partial(Self::tag_block_math), + self.partial(Self::tag_inline_math), + self.partial(Self::tag_strikethrough_tilde), + self.partial(Self::tag_func), + self.partial(Self::tag_mention), + self.partial(Self::tag_hashtag), + self.partial(Self::shortcode_emoji), + self.partial(Self::link), + self.partial(Self::raw_url), + ))), self.partial(Self::tag_raw_text), ))(input)?; Ok((input, token)) } fn inline_single<'a>(&self, input: Span<'a>) -> IResult, Token> { - let (input, token) = alt(( - self.partial(Self::unicode_emoji), - self.partial(Self::tag_small), - self.partial(Self::tag_plain), - self.partial(Self::tag_bold), - self.partial(Self::tag_italic), - self.partial(Self::tag_strikethrough), - self.partial(Self::url_no_embed), - self.partial(Self::base_bold_italic), - self.partial(Self::tag_inline_code), - self.partial(Self::tag_inline_math), - self.partial(Self::tag_strikethrough_tilde), - self.partial(Self::tag_func), - self.partial(Self::tag_mention), - self.partial(Self::tag_hashtag), - self.partial(Self::shortcode_emoji), - self.partial(Self::link), - self.partial(Self::raw_url), + alt(( + self.increase_nesting(alt(( + self.partial(Self::unicode_emoji), + self.partial(Self::tag_small), + self.partial(Self::tag_plain), + self.partial(Self::tag_bold), + self.partial(Self::tag_italic), + self.partial(Self::tag_strikethrough), + self.partial(Self::url_no_embed), + self.partial(Self::base_bold_italic), + self.partial(Self::tag_inline_code), + self.partial(Self::tag_inline_math), + self.partial(Self::tag_strikethrough_tilde), + self.partial(Self::tag_func), + self.partial(Self::tag_mention), + self.partial(Self::tag_hashtag), + self.partial(Self::shortcode_emoji), + self.partial(Self::link), + self.partial(Self::raw_url), + ))), self.partial(Self::tag_raw_text), - ))(input)?; - Ok((input, token)) + ))(input) } fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, token) = alt(( - self.partial(Self::unicode_emoji), - self.partial(Self::url_no_embed), - self.partial(Self::tag_inline_code), - self.partial(Self::tag_inline_math), - self.partial(Self::tag_func), - self.partial(Self::tag_mention), - self.partial(Self::tag_hashtag), - self.partial(Self::shortcode_emoji), - self.partial(Self::raw_url), + self.increase_nesting(alt(( + self.partial(Self::unicode_emoji), + self.partial(Self::url_no_embed), + self.partial(Self::tag_inline_code), + self.partial(Self::tag_inline_math), + self.partial(Self::tag_func), + self.partial(Self::tag_mention), + self.partial(Self::tag_hashtag), + self.partial(Self::shortcode_emoji), + self.partial(Self::raw_url), + ))), self.partial(Self::tag_raw_text), ))(input)?; Ok((input, token)) @@ -514,16 +598,18 @@ impl Context { fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, token) = alt(( - self.partial(Self::unicode_emoji), - self.partial(Self::tag_small), - self.partial(Self::tag_plain), - self.partial(Self::tag_bold), - self.partial(Self::tag_italic), - self.partial(Self::tag_strikethrough), - self.partial(Self::base_bold_italic), - self.partial(Self::tag_strikethrough_tilde), - self.partial(Self::tag_func), - self.partial(Self::shortcode_emoji), + self.increase_nesting(alt(( + self.partial(Self::unicode_emoji), + self.partial(Self::tag_small), + self.partial(Self::tag_plain), + self.partial(Self::tag_bold), + self.partial(Self::tag_italic), + self.partial(Self::tag_strikethrough), + self.partial(Self::base_bold_italic), + self.partial(Self::tag_strikethrough_tilde), + self.partial(Self::tag_func), + self.partial(Self::shortcode_emoji), + ))), self.partial(Self::tag_raw_text), ))(input)?; Ok((input, token)) @@ -1056,8 +1142,11 @@ impl Context { fn raw_url<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, url_span) = recognize(tuple(( - protocol, - url_chars(|input| not(url_chars_base)(input), false), + self.partial_span(Self::protocol), + self.url_chars( + |input| recognize(not(self.partial_span(Self::url_chars_base)))(input), + false, + ), )))(input)?; let url = url_span.into_fragment(); @@ -1075,7 +1164,10 @@ impl Context { fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, _) = tag("<")(input)?; - let (input, url_span) = recognize(tuple((protocol, url_chars(tag(">"), true))))(input)?; + let (input, url_span) = recognize(tuple(( + self.partial_span(Self::protocol), + self.url_chars(tag(">"), true), + )))(input)?; let (input, _) = tag(">")(input)?; Ok(( @@ -1090,7 +1182,10 @@ impl Context { let (input, _) = not(tag("["))(input)?; let (input, (label_tok, _)) = many_till(self.partial(Self::inline_label_safe_single), tag("]("))(input)?; - let (input, url_span) = recognize(tuple((protocol, url_chars(tag(")"), true))))(input)?; + let (input, url_span) = recognize(tuple(( + self.partial_span(Self::protocol), + self.url_chars(tag(")"), true), + )))(input)?; let (input, _) = tag(")")(input)?; Ok(( @@ -1202,74 +1297,136 @@ impl Context { let (input, _) = tag("#")(input)?; - let (input, hashtag_text) = - map(recognize(many1(hashtag_chars)), Span::into_fragment)(input)?; + let (input, hashtag_text) = map( + recognize(many1(self.partial_span(Self::hashtag_chars))), + Span::into_fragment, + )(input)?; Ok((input, Token::Hashtag(hashtag_text.into()))) } -} -#[inline] -fn hashtag_chars(input: Span) -> IResult { - recognize(alt(( - recognize(tuple((tag("("), hashtag_chars, tag(")")))), - recognize(tuple((tag("["), hashtag_chars, tag("]")))), - recognize(tuple((tag("「"), hashtag_chars, tag("」")))), - recognize(tuple((tag("("), hashtag_chars, tag(")")))), - recognize(tuple(( - not(space1), - not_line_ending, - not(one_of(".,:;!?#?/[]【】()「」()<>")), - anychar, - ))), - )))(input) -} + #[inline] + fn increase_nesting<'a, 'b, O, F>( + &'b self, + mut func: F, + ) -> impl FnMut(Span<'a>) -> IResult, O> + 'b + where + F: Parser, O, nom::error::Error>> + 'b, + { + move |mut input| { + if input.extra.depth >= self.depth_limit { + return fail(input); + } -#[inline] -fn protocol(input: Span) -> IResult { - alt((tag("https://"), tag("http://")))(input) -} + input.extra.depth += 1; + func.parse(input) + } + } -#[inline] -fn url_chars_base(input: Span) -> IResult { - alt(( - alphanumeric1_unicode, - recognize(tuple((tag("["), many_till(url_chars_base, tag("]"))))), - recognize(tuple((tag("("), many_till(url_chars_base, tag(")"))))), - recognize(one_of(".,_/:%#$&?!~=+-@")), - ))(input) -} + #[inline] + fn hashtag_chars<'a>(&self, input: Span<'a>) -> IResult, Span<'a>> { + recognize(alt(( + recognize(tuple(( + tag("("), + self.increase_nesting(self.partial_span(Self::hashtag_chars)), + tag(")"), + ))), + recognize(tuple(( + tag("["), + self.increase_nesting(self.partial_span(Self::hashtag_chars)), + tag("]"), + ))), + recognize(tuple(( + tag("「"), + self.increase_nesting(self.partial_span(Self::hashtag_chars)), + tag("」"), + ))), + recognize(tuple(( + tag("("), + self.increase_nesting(self.partial_span(Self::hashtag_chars)), + tag(")"), + ))), + recognize(tuple(( + not(space1), + not_line_ending, + not(one_of(".,:;!?#?/[]【】()「」()<>")), + anychar, + ))), + )))(input) + } -#[inline] -fn url_chars<'a, T: 'a>( - terminator: impl Fn(Span<'a>) -> IResult, T> + 'a, - spaces: bool, -) -> impl FnMut(Span<'a>) -> IResult, Span<'a>> + 'a { - let chars = tuple(( - not(tuple((space1, eof))), - not(tuple((space1, tag("\"")))), - not(tuple((opt(space1), terminator))), - alt((url_chars_base, if spaces { space1 } else { fail })), - )); + #[inline] + fn protocol<'a>(&self, input: Span<'a>) -> IResult, Span<'a>> { + alt((tag("https://"), tag("http://")))(input) + } - recognize(many1_count(chars)) + #[inline] + fn url_chars_base<'a>(&self, input: Span<'a>) -> IResult, Span<'a>> { + alt(( + alphanumeric1_unicode, + recognize(tuple(( + tag("["), + many_till( + self.increase_nesting(self.partial_span(Self::url_chars_base)), + tag("]"), + ), + ))), + recognize(tuple(( + tag("("), + many_till( + self.increase_nesting(self.partial_span(Self::url_chars_base)), + tag(")"), + ), + ))), + recognize(one_of(".,_/:%#$&?!~=+-@")), + ))(input) + } + + #[inline] + fn url_chars<'a, 'b, F>( + &'b self, + mut terminator: F, + spaces: bool, + ) -> impl FnMut(Span<'a>) -> IResult, Span<'a>> + 'b + where + F: Parser, Span<'a>, nom::error::Error>> + 'b, + { + move |input| { + recognize(many1_count(tuple(( + not(tuple((space1, eof))), + not(tuple((space1, tag("\"")))), + not(tuple((opt(space1), |input| terminator.parse(input)))), + alt(( + |input| self.url_chars_base(input), + if spaces { space1 } else { fail }, + )), + ))))(input) + } + } } #[cfg(test)] mod test { - use crate::{url_chars, Context, Span, Token}; + use crate::{Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT}; use nom::bytes::complete::tag; use std::collections::HashMap; fn parse_full(string: &str) -> Token { - Context.full(Span::new(string)).unwrap().1.merged() + Context::default() + .full(Span::new_extra(string, SpanMeta::default())) + .unwrap() + .1 + .merged() } #[test] fn parse_url_chars() { + let ctx = Context::default(); + assert_eq!( - url_chars(tag(")"), true)(Span::new( - "https://en.wikipedia.org/wiki/Sandbox_(computer_security))" + ctx.url_chars(tag(")"), true)(Span::new_extra( + "https://en.wikipedia.org/wiki/Sandbox_(computer_security))", + SpanMeta::default() )) .unwrap() .1 @@ -1278,8 +1435,9 @@ mod test { ); assert_eq!( - url_chars(tag(")"), true)(Span::new( - "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))" + ctx.url_chars(tag(")"), true)(Span::new_extra( + "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))", + SpanMeta::default() )) .unwrap() .1 @@ -1288,26 +1446,35 @@ mod test { ); assert_eq!( - url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among_Us ")) - .unwrap() - .1 - .into_fragment(), + ctx.url_chars(tag(")"), true)(Span::new_extra( + "https://cs.wikipedia.org/wiki/Among_Us ", + SpanMeta::default() + )) + .unwrap() + .1 + .into_fragment(), "https://cs.wikipedia.org/wiki/Among_Us", ); assert_eq!( - url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among Us )")) - .unwrap() - .1 - .into_fragment(), + ctx.url_chars(tag(")"), true)(Span::new_extra( + "https://cs.wikipedia.org/wiki/Among Us )", + SpanMeta::default() + )) + .unwrap() + .1 + .into_fragment(), "https://cs.wikipedia.org/wiki/Among Us" ); assert_eq!( - url_chars(tag(")"), false)(Span::new("https://en.wikipedia.org/wiki/Among Us )")) - .unwrap() - .1 - .into_fragment(), + ctx.url_chars(tag(")"), false)(Span::new_extra( + "https://en.wikipedia.org/wiki/Among Us )", + SpanMeta::default() + )) + .unwrap() + .1 + .into_fragment(), "https://en.wikipedia.org/wiki/Among" ); } @@ -1593,6 +1760,23 @@ text"# ); } + #[test] + fn limit_nesting() { + let mut tok = Token::PlainText(" test ".into()); + for _ in 0..DEFAULT_DEPTH_LIMIT { + tok = Token::Bold(Box::new(tok)); + } + + assert_eq!( + parse_full( + &("".repeat(DEFAULT_DEPTH_LIMIT) + + " test " + + &*"".repeat(DEFAULT_DEPTH_LIMIT)) + ), + tok + ); + } + #[test] fn parse_mention() { assert_eq!(