MMM: Nesting-limited parsing

This commit is contained in:
Natty 2023-10-16 23:45:45 +02:00
parent 23a63f2fe9
commit 86d5c87e9a
Signed by: natty
GPG Key ID: BF6CB659ADEE60EC
3 changed files with 313 additions and 127 deletions

1
Cargo.lock generated
View File

@ -1649,6 +1649,7 @@ dependencies = [
"emojis", "emojis",
"nom", "nom",
"nom_locate", "nom_locate",
"tracing",
"unicode-segmentation", "unicode-segmentation",
] ]

View File

@ -10,4 +10,5 @@ emojis = { workspace = true }
nom = { workspace = true } nom = { workspace = true }
nom_locate = { workspace = true } nom_locate = { workspace = true }
compact_str = { workspace = true } compact_str = { workspace = true }
tracing = { workspace = true }
unicode-segmentation = { workspace = true } unicode-segmentation = { workspace = true }

View File

@ -7,14 +7,15 @@ use nom::character::complete::{
satisfy, space1, tab, satisfy, space1, tab,
}; };
use nom::combinator::{eof, fail, map, not, opt, recognize}; use nom::combinator::{eof, fail, map, not, opt, recognize};
use nom::error::ErrorKind; use nom::error::{ErrorKind, ParseError};
use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1}; use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1};
use nom::sequence::tuple; use nom::sequence::tuple;
use nom::{IResult, Offset, Slice}; use nom::{IResult, Offset, Parser, Slice};
use nom_locate::LocatedSpan; use nom_locate::LocatedSpan;
use std::collections::HashMap; use std::collections::HashMap;
use std::convert::{identity, Infallible}; use std::convert::{identity, Infallible};
use std::marker::PhantomData; use std::marker::PhantomData;
use tracing::trace;
use unicode_segmentation::UnicodeSegmentation; use unicode_segmentation::UnicodeSegmentation;
#[derive(Copy, Clone, Debug, Eq, PartialEq)] #[derive(Copy, Clone, Debug, Eq, PartialEq)]
@ -217,7 +218,18 @@ impl Token {
} }
} }
type Span<'a> = LocatedSpan<&'a str>; #[derive(Debug, Default, Copy, Clone)]
/// Extra parser state attached to every `Span` through `LocatedSpan`'s `extra` slot.
pub struct SpanMeta {
    /// Nesting depth at which the span is currently being parsed.
    depth: usize,
}

impl SpanMeta {
    /// Creates span metadata positioned at the given nesting `depth`.
    fn new(depth: usize) -> Self {
        SpanMeta { depth }
    }
}
type Span<'a> = LocatedSpan<&'a str, SpanMeta>;
trait SliceOffset { trait SliceOffset {
fn up_to(&self, other: &Self) -> Self; fn up_to(&self, other: &Self) -> Self;
@ -300,7 +312,10 @@ fn spliced<'a>(
type NE<E> = nom::Err<E>; type NE<E> = nom::Err<E>;
type NomError<'x> = nom::error::Error<Span<'x>>; type NomError<'x> = nom::error::Error<Span<'x>>;
let quote_span = Span::new(&combined); let quote_span = Span::new_extra(
&combined,
segments.first().map_or(SpanMeta::new(0), |s| s.extra),
);
let (input, inner) = match func(quote_span) { let (input, inner) = match func(quote_span) {
Ok(s) => s, Ok(s) => s,
Err(e) => { Err(e) => {
@ -311,7 +326,10 @@ fn spliced<'a>(
let offset = offset_new - offset_seg_new; let offset = offset_new - offset_seg_new;
let offset_orig = offset + seg_parent.location_offset(); let offset_orig = offset + seg_parent.location_offset();
Err(NE::Error(NomError::new( Err(NE::Error(NomError::new(
Span::new(&parent.into_fragment()[offset_orig..]), Span::new_extra(
&parent.into_fragment()[offset_orig..],
seg_parent.extra,
),
e.code, e.code,
))) )))
} else { } else {
@ -405,9 +423,53 @@ impl<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>> From<T> for FlankingDel
} }
} }
pub struct Context; pub struct Context {
depth_limit: usize,
}
/// Nesting depth limit used by `Context::default()`.
const DEFAULT_DEPTH_LIMIT: usize = 24;
/// A default `Context` is limited to `DEFAULT_DEPTH_LIMIT` levels of nesting.
impl Default for Context {
    fn default() -> Self {
        Self::new(DEFAULT_DEPTH_LIMIT)
    }
}
impl Context { impl Context {
pub fn new(depth_limit: usize) -> Self {
Self { depth_limit }
}
/// Parses `input` with the full grammar and returns the merged token tree.
///
/// Parsing starts at depth zero (`SpanMeta::default()`). If the parser fails,
/// the failure is logged at trace level and the error's string form is
/// returned wrapped in `Token::PlainText`.
///
/// NOTE(review): the fallback carries the *error text*, not `input` — confirm
/// that callers really want the diagnostic rather than the original text.
pub fn parse_full(&self, input: &str) -> Token {
    let span = Span::new_extra(input, SpanMeta::default());

    let err = match self.full(span) {
        Ok((_, token)) => return token.merged(),
        Err(err) => err,
    };

    trace!(input = input, "Full parser fail: {:?}", err);
    Token::PlainText(err.to_compact_string())
}
pub fn parse_inline(&self, input: &str) -> Token {
match self.full(Span::new_extra(input, SpanMeta::default())) {
Ok((_, t)) => t.merged(),
Err(e) => {
trace!(input = input, "Inline parser fail: {:?}", e);
Token::PlainText(e.to_compact_string())
}
}
}
/// Parses `input` with the UI grammar (`inline_ui`) and returns the merged
/// token tree.
///
/// If the parser fails, the failure is logged at trace level and the error's
/// string form is returned wrapped in `Token::PlainText`.
pub fn parse_ui(&self, input: &str) -> Token {
    match self.inline_ui(Span::new_extra(input, SpanMeta::default())) {
        Ok((_, t)) => t.merged(),
        Err(e) => {
            // The message used to read "Inline parser fail", which made UI
            // failures indistinguishable from `parse_inline` failures in logs.
            trace!(input = input, "UI parser fail: {:?}", e);
            Token::PlainText(e.to_compact_string())
        }
    }
}
#[inline] #[inline]
fn partial( fn partial(
&self, &self,
@ -416,6 +478,14 @@ impl Context {
move |input| func(self, input) move |input| func(self, input)
} }
/// Binds a span-returning parser method to this context.
///
/// The result is a plain closure over the input span, so it can be handed to
/// nom combinators that expect a parser rather than a method.
#[inline]
fn partial_span(
    &self,
    func: impl for<'a> Fn(&Self, Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'static,
) -> impl for<'a> Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>> + '_ {
    move |span| func(self, span)
}
pub fn full<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> { pub fn full<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
map(many1(self.partial(Self::full_single)), Token::Sequence)(input) map(many1(self.partial(Self::full_single)), Token::Sequence)(input)
} }
@ -431,6 +501,17 @@ impl Context {
)(input) )(input)
} }
/// Parses one or more UI tokens and wraps them in a `Token::Sequence`.
///
/// Each token is tried, in order, as a unicode emoji, a shortcode emoji, or
/// raw text; no other constructs are attempted.
fn inline_ui<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let single = alt((
        self.partial(Self::unicode_emoji),
        self.partial(Self::shortcode_emoji),
        self.partial(Self::tag_raw_text),
    ));

    map(many1(single), Token::Sequence)(input)
}
fn base_bold_italic<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> { fn base_bold_italic<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
alt(( alt((
self.partial(Self::tag_bold_italic_asterisk), self.partial(Self::tag_bold_italic_asterisk),
@ -444,69 +525,72 @@ impl Context {
fn full_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> { fn full_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let (input, token) = alt(( let (input, token) = alt((
self.partial(Self::unicode_emoji), self.increase_nesting(alt((
alt(( self.partial(Self::unicode_emoji),
self.partial(Self::tag_block_center), self.partial(Self::tag_block_center),
self.partial(Self::tag_small), self.partial(Self::tag_small),
self.partial(Self::tag_plain), self.partial(Self::tag_plain),
self.partial(Self::tag_bold), self.partial(Self::tag_bold),
self.partial(Self::tag_italic), self.partial(Self::tag_italic),
self.partial(Self::tag_strikethrough), self.partial(Self::tag_strikethrough),
)), self.partial(Self::url_no_embed),
self.partial(Self::url_no_embed), self.partial(Self::base_bold_italic),
self.partial(Self::base_bold_italic), self.partial(Self::tag_block_code),
self.partial(Self::tag_block_code), self.partial(Self::tag_inline_code),
self.partial(Self::tag_inline_code), self.partial(Self::tag_quote),
self.partial(Self::tag_quote), self.partial(Self::tag_block_math),
self.partial(Self::tag_block_math), self.partial(Self::tag_inline_math),
self.partial(Self::tag_inline_math), self.partial(Self::tag_strikethrough_tilde),
self.partial(Self::tag_strikethrough_tilde), self.partial(Self::tag_func),
self.partial(Self::tag_func), self.partial(Self::tag_mention),
self.partial(Self::tag_mention), self.partial(Self::tag_hashtag),
self.partial(Self::tag_hashtag), self.partial(Self::shortcode_emoji),
self.partial(Self::shortcode_emoji), self.partial(Self::link),
self.partial(Self::link), self.partial(Self::raw_url),
self.partial(Self::raw_url), ))),
self.partial(Self::tag_raw_text), self.partial(Self::tag_raw_text),
))(input)?; ))(input)?;
Ok((input, token)) Ok((input, token))
} }
fn inline_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> { fn inline_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let (input, token) = alt(( alt((
self.partial(Self::unicode_emoji), self.increase_nesting(alt((
self.partial(Self::tag_small), self.partial(Self::unicode_emoji),
self.partial(Self::tag_plain), self.partial(Self::tag_small),
self.partial(Self::tag_bold), self.partial(Self::tag_plain),
self.partial(Self::tag_italic), self.partial(Self::tag_bold),
self.partial(Self::tag_strikethrough), self.partial(Self::tag_italic),
self.partial(Self::url_no_embed), self.partial(Self::tag_strikethrough),
self.partial(Self::base_bold_italic), self.partial(Self::url_no_embed),
self.partial(Self::tag_inline_code), self.partial(Self::base_bold_italic),
self.partial(Self::tag_inline_math), self.partial(Self::tag_inline_code),
self.partial(Self::tag_strikethrough_tilde), self.partial(Self::tag_inline_math),
self.partial(Self::tag_func), self.partial(Self::tag_strikethrough_tilde),
self.partial(Self::tag_mention), self.partial(Self::tag_func),
self.partial(Self::tag_hashtag), self.partial(Self::tag_mention),
self.partial(Self::shortcode_emoji), self.partial(Self::tag_hashtag),
self.partial(Self::link), self.partial(Self::shortcode_emoji),
self.partial(Self::raw_url), self.partial(Self::link),
self.partial(Self::raw_url),
))),
self.partial(Self::tag_raw_text), self.partial(Self::tag_raw_text),
))(input)?; ))(input)
Ok((input, token))
} }
fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> { fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let (input, token) = alt(( let (input, token) = alt((
self.partial(Self::unicode_emoji), self.increase_nesting(alt((
self.partial(Self::url_no_embed), self.partial(Self::unicode_emoji),
self.partial(Self::tag_inline_code), self.partial(Self::url_no_embed),
self.partial(Self::tag_inline_math), self.partial(Self::tag_inline_code),
self.partial(Self::tag_func), self.partial(Self::tag_inline_math),
self.partial(Self::tag_mention), self.partial(Self::tag_func),
self.partial(Self::tag_hashtag), self.partial(Self::tag_mention),
self.partial(Self::shortcode_emoji), self.partial(Self::tag_hashtag),
self.partial(Self::raw_url), self.partial(Self::shortcode_emoji),
self.partial(Self::raw_url),
))),
self.partial(Self::tag_raw_text), self.partial(Self::tag_raw_text),
))(input)?; ))(input)?;
Ok((input, token)) Ok((input, token))
@ -514,16 +598,18 @@ impl Context {
fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> { fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let (input, token) = alt(( let (input, token) = alt((
self.partial(Self::unicode_emoji), self.increase_nesting(alt((
self.partial(Self::tag_small), self.partial(Self::unicode_emoji),
self.partial(Self::tag_plain), self.partial(Self::tag_small),
self.partial(Self::tag_bold), self.partial(Self::tag_plain),
self.partial(Self::tag_italic), self.partial(Self::tag_bold),
self.partial(Self::tag_strikethrough), self.partial(Self::tag_italic),
self.partial(Self::base_bold_italic), self.partial(Self::tag_strikethrough),
self.partial(Self::tag_strikethrough_tilde), self.partial(Self::base_bold_italic),
self.partial(Self::tag_func), self.partial(Self::tag_strikethrough_tilde),
self.partial(Self::shortcode_emoji), self.partial(Self::tag_func),
self.partial(Self::shortcode_emoji),
))),
self.partial(Self::tag_raw_text), self.partial(Self::tag_raw_text),
))(input)?; ))(input)?;
Ok((input, token)) Ok((input, token))
@ -1056,8 +1142,11 @@ impl Context {
fn raw_url<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> { fn raw_url<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let (input, url_span) = recognize(tuple(( let (input, url_span) = recognize(tuple((
protocol, self.partial_span(Self::protocol),
url_chars(|input| not(url_chars_base)(input), false), self.url_chars(
|input| recognize(not(self.partial_span(Self::url_chars_base)))(input),
false,
),
)))(input)?; )))(input)?;
let url = url_span.into_fragment(); let url = url_span.into_fragment();
@ -1075,7 +1164,10 @@ impl Context {
fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> { fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let (input, _) = tag("<")(input)?; let (input, _) = tag("<")(input)?;
let (input, url_span) = recognize(tuple((protocol, url_chars(tag(">"), true))))(input)?; let (input, url_span) = recognize(tuple((
self.partial_span(Self::protocol),
self.url_chars(tag(">"), true),
)))(input)?;
let (input, _) = tag(">")(input)?; let (input, _) = tag(">")(input)?;
Ok(( Ok((
@ -1090,7 +1182,10 @@ impl Context {
let (input, _) = not(tag("["))(input)?; let (input, _) = not(tag("["))(input)?;
let (input, (label_tok, _)) = let (input, (label_tok, _)) =
many_till(self.partial(Self::inline_label_safe_single), tag("]("))(input)?; many_till(self.partial(Self::inline_label_safe_single), tag("]("))(input)?;
let (input, url_span) = recognize(tuple((protocol, url_chars(tag(")"), true))))(input)?; let (input, url_span) = recognize(tuple((
self.partial_span(Self::protocol),
self.url_chars(tag(")"), true),
)))(input)?;
let (input, _) = tag(")")(input)?; let (input, _) = tag(")")(input)?;
Ok(( Ok((
@ -1202,74 +1297,136 @@ impl Context {
let (input, _) = tag("#")(input)?; let (input, _) = tag("#")(input)?;
let (input, hashtag_text) = let (input, hashtag_text) = map(
map(recognize(many1(hashtag_chars)), Span::into_fragment)(input)?; recognize(many1(self.partial_span(Self::hashtag_chars))),
Span::into_fragment,
)(input)?;
Ok((input, Token::Hashtag(hashtag_text.into()))) Ok((input, Token::Hashtag(hashtag_text.into())))
} }
}
#[inline] #[inline]
fn hashtag_chars(input: Span) -> IResult<Span, Span> { fn increase_nesting<'a, 'b, O, F>(
recognize(alt(( &'b self,
recognize(tuple((tag("("), hashtag_chars, tag(")")))), mut func: F,
recognize(tuple((tag("["), hashtag_chars, tag("]")))), ) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, O> + 'b
recognize(tuple((tag(""), hashtag_chars, tag("")))), where
recognize(tuple((tag(""), hashtag_chars, tag("")))), F: Parser<Span<'a>, O, nom::error::Error<Span<'a>>> + 'b,
recognize(tuple(( {
not(space1), move |mut input| {
not_line_ending, if input.extra.depth >= self.depth_limit {
not(one_of(".,:;!?#?/[]【】()「」()<>")), return fail(input);
anychar, }
))),
)))(input)
}
#[inline] input.extra.depth += 1;
fn protocol(input: Span) -> IResult<Span, Span> { func.parse(input)
alt((tag("https://"), tag("http://")))(input) }
} }
#[inline] #[inline]
fn url_chars_base(input: Span) -> IResult<Span, Span> { fn hashtag_chars<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
alt(( recognize(alt((
alphanumeric1_unicode, recognize(tuple((
recognize(tuple((tag("["), many_till(url_chars_base, tag("]"))))), tag("("),
recognize(tuple((tag("("), many_till(url_chars_base, tag(")"))))), self.increase_nesting(self.partial_span(Self::hashtag_chars)),
recognize(one_of(".,_/:%#$&?!~=+-@")), tag(")"),
))(input) ))),
} recognize(tuple((
tag("["),
self.increase_nesting(self.partial_span(Self::hashtag_chars)),
tag("]"),
))),
recognize(tuple((
tag(""),
self.increase_nesting(self.partial_span(Self::hashtag_chars)),
tag(""),
))),
recognize(tuple((
tag(""),
self.increase_nesting(self.partial_span(Self::hashtag_chars)),
tag(""),
))),
recognize(tuple((
not(space1),
not_line_ending,
not(one_of(".,:;!?#?/[]【】()「」()<>")),
anychar,
))),
)))(input)
}
#[inline] #[inline]
fn url_chars<'a, T: 'a>( fn protocol<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
terminator: impl Fn(Span<'a>) -> IResult<Span<'a>, T> + 'a, alt((tag("https://"), tag("http://")))(input)
spaces: bool, }
) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'a {
let chars = tuple((
not(tuple((space1, eof))),
not(tuple((space1, tag("\"")))),
not(tuple((opt(space1), terminator))),
alt((url_chars_base, if spaces { space1 } else { fail })),
));
recognize(many1_count(chars)) #[inline]
/// Recognises one unit of URL text and returns the span it covers.
///
/// A unit is, in order of preference: whatever `alphanumeric1_unicode`
/// accepts, a `[`…`]` group, a `(`…`)` group, or a single character from
/// `.,_/:%#$&?!~=+-@`. The contents of a bracketed group are parsed by this
/// same function, wrapped in `increase_nesting`.
fn url_chars_base<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
alt((
alphanumeric1_unicode,
// `[` followed by nested URL units up to and including the closing `]`.
recognize(tuple((
tag("["),
many_till(
self.increase_nesting(self.partial_span(Self::url_chars_base)),
tag("]"),
),
))),
// `(` followed by nested URL units up to and including the closing `)`.
recognize(tuple((
tag("("),
many_till(
self.increase_nesting(self.partial_span(Self::url_chars_base)),
tag(")"),
),
))),
recognize(one_of(".,_/:%#$&?!~=+-@")),
))(input)
}
/// Builds a parser that recognises a run of one or more URL units and
/// returns the span they cover.
///
/// Before consuming each unit the parser rejects three look-aheads:
/// `space1` followed by end of input, `space1` followed by `"`, and optional
/// `space1` followed by `terminator`. It then consumes either one
/// `url_chars_base` match or, only when `spaces` is true, a `space1` run.
#[inline]
fn url_chars<'a, 'b, F>(
&'b self,
mut terminator: F,
spaces: bool,
) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'b
where
F: Parser<Span<'a>, Span<'a>, nom::error::Error<Span<'a>>> + 'b,
{
move |input| {
recognize(many1_count(tuple((
// The three `not(...)` checks are look-aheads: they consume nothing.
not(tuple((space1, eof))),
not(tuple((space1, tag("\"")))),
not(tuple((opt(space1), |input| terminator.parse(input)))),
alt((
|input| self.url_chars_base(input),
// With `spaces == false` this branch always fails, so whitespace ends the run.
if spaces { space1 } else { fail },
)),
))))(input)
}
}
} }
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use crate::{url_chars, Context, Span, Token}; use crate::{Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT};
use nom::bytes::complete::tag; use nom::bytes::complete::tag;
use std::collections::HashMap; use std::collections::HashMap;
fn parse_full(string: &str) -> Token { fn parse_full(string: &str) -> Token {
Context.full(Span::new(string)).unwrap().1.merged() Context::default()
.full(Span::new_extra(string, SpanMeta::default()))
.unwrap()
.1
.merged()
} }
#[test] #[test]
fn parse_url_chars() { fn parse_url_chars() {
let ctx = Context::default();
assert_eq!( assert_eq!(
url_chars(tag(")"), true)(Span::new( ctx.url_chars(tag(")"), true)(Span::new_extra(
"https://en.wikipedia.org/wiki/Sandbox_(computer_security))" "https://en.wikipedia.org/wiki/Sandbox_(computer_security))",
SpanMeta::default()
)) ))
.unwrap() .unwrap()
.1 .1
@ -1278,8 +1435,9 @@ mod test {
); );
assert_eq!( assert_eq!(
url_chars(tag(")"), true)(Span::new( ctx.url_chars(tag(")"), true)(Span::new_extra(
"https://en.wikipedia.org/wiki/Sandbox_(computer_security)))" "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))",
SpanMeta::default()
)) ))
.unwrap() .unwrap()
.1 .1
@ -1288,26 +1446,35 @@ mod test {
); );
assert_eq!( assert_eq!(
url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among_Us ")) ctx.url_chars(tag(")"), true)(Span::new_extra(
.unwrap() "https://cs.wikipedia.org/wiki/Among_Us ",
.1 SpanMeta::default()
.into_fragment(), ))
.unwrap()
.1
.into_fragment(),
"https://cs.wikipedia.org/wiki/Among_Us", "https://cs.wikipedia.org/wiki/Among_Us",
); );
assert_eq!( assert_eq!(
url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among Us )")) ctx.url_chars(tag(")"), true)(Span::new_extra(
.unwrap() "https://cs.wikipedia.org/wiki/Among Us )",
.1 SpanMeta::default()
.into_fragment(), ))
.unwrap()
.1
.into_fragment(),
"https://cs.wikipedia.org/wiki/Among Us" "https://cs.wikipedia.org/wiki/Among Us"
); );
assert_eq!( assert_eq!(
url_chars(tag(")"), false)(Span::new("https://en.wikipedia.org/wiki/Among Us )")) ctx.url_chars(tag(")"), false)(Span::new_extra(
.unwrap() "https://en.wikipedia.org/wiki/Among Us )",
.1 SpanMeta::default()
.into_fragment(), ))
.unwrap()
.1
.into_fragment(),
"https://en.wikipedia.org/wiki/Among" "https://en.wikipedia.org/wiki/Among"
); );
} }
@ -1593,6 +1760,23 @@ text</center>"#
); );
} }
/// Formatting nested deeper than `DEFAULT_DEPTH_LIMIT` must be kept as plain
/// text instead of being parsed.
#[test]
fn limit_nesting() {
    let innermost = " <s><i>test</i></s> ";

    // Expected tree: the innermost text, untouched, wrapped in one `Bold`
    // per permitted nesting level.
    let expected = (0..DEFAULT_DEPTH_LIMIT).fold(Token::PlainText(innermost.into()), |tok, _| {
        Token::Bold(Box::new(tok))
    });

    let source = format!(
        "{}{}{}",
        "<b>".repeat(DEFAULT_DEPTH_LIMIT),
        innermost,
        "</b>".repeat(DEFAULT_DEPTH_LIMIT)
    );

    assert_eq!(parse_full(&source), expected);
}
#[test] #[test]
fn parse_mention() { fn parse_mention() {
assert_eq!( assert_eq!(