MMM: Nesting-limited parsing

This commit is contained in:
Natty 2023-10-16 23:45:45 +02:00
parent 23a63f2fe9
commit 86d5c87e9a
Signed by: natty
GPG Key ID: BF6CB659ADEE60EC
3 changed files with 313 additions and 127 deletions

1
Cargo.lock generated
View File

@ -1649,6 +1649,7 @@ dependencies = [
"emojis",
"nom",
"nom_locate",
"tracing",
"unicode-segmentation",
]

View File

@ -10,4 +10,5 @@ emojis = { workspace = true }
nom = { workspace = true }
nom_locate = { workspace = true }
compact_str = { workspace = true }
tracing = { workspace = true }
unicode-segmentation = { workspace = true }

View File

@ -7,14 +7,15 @@ use nom::character::complete::{
satisfy, space1, tab,
};
use nom::combinator::{eof, fail, map, not, opt, recognize};
use nom::error::ErrorKind;
use nom::error::{ErrorKind, ParseError};
use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1};
use nom::sequence::tuple;
use nom::{IResult, Offset, Slice};
use nom::{IResult, Offset, Parser, Slice};
use nom_locate::LocatedSpan;
use std::collections::HashMap;
use std::convert::{identity, Infallible};
use std::marker::PhantomData;
use tracing::trace;
use unicode_segmentation::UnicodeSegmentation;
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
@ -217,7 +218,18 @@ impl Token {
}
}
type Span<'a> = LocatedSpan<&'a str>;
/// Extra state attached to every parser span.
///
/// Holds the nesting depth recorded for the span; `Default` yields a depth of zero.
#[derive(Copy, Clone, Debug, Default)]
pub struct SpanMeta {
    /// Nesting level recorded on the span.
    depth: usize,
}

impl SpanMeta {
    /// Builds span metadata that records the given nesting `depth`.
    fn new(depth: usize) -> Self {
        SpanMeta { depth }
    }
}
type Span<'a> = LocatedSpan<&'a str, SpanMeta>;
trait SliceOffset {
fn up_to(&self, other: &Self) -> Self;
@ -300,7 +312,10 @@ fn spliced<'a>(
type NE<E> = nom::Err<E>;
type NomError<'x> = nom::error::Error<Span<'x>>;
let quote_span = Span::new(&combined);
let quote_span = Span::new_extra(
&combined,
segments.first().map_or(SpanMeta::new(0), |s| s.extra),
);
let (input, inner) = match func(quote_span) {
Ok(s) => s,
Err(e) => {
@ -311,7 +326,10 @@ fn spliced<'a>(
let offset = offset_new - offset_seg_new;
let offset_orig = offset + seg_parent.location_offset();
Err(NE::Error(NomError::new(
Span::new(&parent.into_fragment()[offset_orig..]),
Span::new_extra(
&parent.into_fragment()[offset_orig..],
seg_parent.extra,
),
e.code,
)))
} else {
@ -405,9 +423,53 @@ impl<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>> From<T> for FlankingDel
}
}
pub struct Context;
/// Parser configuration shared by the parsing entry points.
pub struct Context {
    /// Span depth at which nesting-limited parsers stop matching.
    depth_limit: usize,
}

/// Depth limit applied by `Context::default()`.
const DEFAULT_DEPTH_LIMIT: usize = 24;

impl Default for Context {
    /// Creates a context limited to `DEFAULT_DEPTH_LIMIT` nesting levels.
    fn default() -> Self {
        Self::new(DEFAULT_DEPTH_LIMIT)
    }
}
impl Context {
pub fn new(depth_limit: usize) -> Self {
Self { depth_limit }
}
/// Parses `input` with the full parser and returns the merged token tree.
///
/// If the parser fails, the failure is traced and the untouched input is returned as a single
/// `Token::PlainText`. The caller therefore always gets the original text back rather than a
/// rendering of the parser error.
pub fn parse_full(&self, input: &str) -> Token {
    match self.full(Span::new_extra(input, SpanMeta::default())) {
        Ok((_, t)) => t.merged(),
        Err(e) => {
            trace!(input = input, "Full parser fail: {:?}", e);
            // The error text is diagnostic output, not content: fall back to the raw input.
            Token::PlainText(input.into())
        }
    }
}
/// Parses `input` and returns the merged token tree.
///
/// If the parser fails, the failure is traced and the untouched input is returned as a single
/// `Token::PlainText`. The caller therefore always gets the original text back rather than a
/// rendering of the parser error.
pub fn parse_inline(&self, input: &str) -> Token {
    // NOTE(review): this entry point invokes `self.full`, the same parser `parse_full` uses,
    // although its name and trace message say "inline" -- confirm whether a dedicated inline
    // parser was intended here.
    match self.full(Span::new_extra(input, SpanMeta::default())) {
        Ok((_, t)) => t.merged(),
        Err(e) => {
            trace!(input = input, "Inline parser fail: {:?}", e);
            // The error text is diagnostic output, not content: fall back to the raw input.
            Token::PlainText(input.into())
        }
    }
}
/// Parses `input` with the `inline_ui` parser and returns the merged token tree.
///
/// If the parser fails, the failure is traced and the untouched input is returned as a single
/// `Token::PlainText`. The caller therefore always gets the original text back rather than a
/// rendering of the parser error.
pub fn parse_ui(&self, input: &str) -> Token {
    match self.inline_ui(Span::new_extra(input, SpanMeta::default())) {
        Ok((_, t)) => t.merged(),
        Err(e) => {
            // Name the parser that actually failed so traces can be told apart.
            trace!(input = input, "UI parser fail: {:?}", e);
            // The error text is diagnostic output, not content: fall back to the raw input.
            Token::PlainText(input.into())
        }
    }
}
#[inline]
fn partial(
&self,
@ -416,6 +478,14 @@ impl Context {
move |input| func(self, input)
}
/// Binds a span-producing parser method to this context.
///
/// The result is a plain parser function over `Span` that forwards to `func` with `self`
/// supplied as the first argument.
#[inline]
fn partial_span(
    &self,
    func: impl for<'a> Fn(&Self, Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'static,
) -> impl for<'a> Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>> + '_ {
    move |span| func(self, span)
}
/// Parses one or more `full_single` tokens and wraps them in a `Token::Sequence`.
pub fn full<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let single = self.partial(Self::full_single);
    let (rest, tokens) = many1(single)(input)?;
    Ok((rest, Token::Sequence(tokens)))
}
@ -431,6 +501,17 @@ impl Context {
)(input)
}
/// Parses one or more UI tokens and wraps them in a `Token::Sequence`.
///
/// Each token is tried as a Unicode emoji, then a shortcode emoji, then raw text.
fn inline_ui<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let single = alt((
        self.partial(Self::unicode_emoji),
        self.partial(Self::shortcode_emoji),
        self.partial(Self::tag_raw_text),
    ));
    let (rest, tokens) = many1(single)(input)?;
    Ok((rest, Token::Sequence(tokens)))
}
fn base_bold_italic<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
alt((
self.partial(Self::tag_bold_italic_asterisk),
@ -444,69 +525,72 @@ impl Context {
/// Parses a single token of the full syntax.
///
/// The alternatives are tried in the order listed; `tag_raw_text` comes last and acts as the
/// fallback when none of the structured parsers match.
///
/// NOTE(review): this listing appears to interleave the alternative list from before and after
/// the introduction of `increase_nesting` (the same parsers occur twice and the parentheses do
/// not balance) -- confirm against the actual file before relying on it.
fn full_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let (input, token) = alt((
self.partial(Self::unicode_emoji),
alt((
// Structured parsers run one nesting level deeper and are refused past the depth limit.
self.increase_nesting(alt((
self.partial(Self::unicode_emoji),
self.partial(Self::tag_block_center),
self.partial(Self::tag_small),
self.partial(Self::tag_plain),
self.partial(Self::tag_bold),
self.partial(Self::tag_italic),
self.partial(Self::tag_strikethrough),
)),
self.partial(Self::url_no_embed),
self.partial(Self::base_bold_italic),
self.partial(Self::tag_block_code),
self.partial(Self::tag_inline_code),
self.partial(Self::tag_quote),
self.partial(Self::tag_block_math),
self.partial(Self::tag_inline_math),
self.partial(Self::tag_strikethrough_tilde),
self.partial(Self::tag_func),
self.partial(Self::tag_mention),
self.partial(Self::tag_hashtag),
self.partial(Self::shortcode_emoji),
self.partial(Self::link),
self.partial(Self::raw_url),
self.partial(Self::url_no_embed),
self.partial(Self::base_bold_italic),
self.partial(Self::tag_block_code),
self.partial(Self::tag_inline_code),
self.partial(Self::tag_quote),
self.partial(Self::tag_block_math),
self.partial(Self::tag_inline_math),
self.partial(Self::tag_strikethrough_tilde),
self.partial(Self::tag_func),
self.partial(Self::tag_mention),
self.partial(Self::tag_hashtag),
self.partial(Self::shortcode_emoji),
self.partial(Self::link),
self.partial(Self::raw_url),
))),
// Raw text sits outside `increase_nesting`, so it is still available at the depth limit.
self.partial(Self::tag_raw_text),
))(input)?;
Ok((input, token))
}
/// Parses a single token of the inline syntax.
///
/// The alternatives are tried in the order listed; `tag_raw_text` comes last and acts as the
/// fallback when none of the structured parsers match.
///
/// NOTE(review): this listing appears to interleave the body from before and after the
/// introduction of `increase_nesting` (the parser list occurs twice and there are two different
/// tail expressions) -- confirm against the actual file before relying on it.
fn inline_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let (input, token) = alt((
self.partial(Self::unicode_emoji),
self.partial(Self::tag_small),
self.partial(Self::tag_plain),
self.partial(Self::tag_bold),
self.partial(Self::tag_italic),
self.partial(Self::tag_strikethrough),
self.partial(Self::url_no_embed),
self.partial(Self::base_bold_italic),
self.partial(Self::tag_inline_code),
self.partial(Self::tag_inline_math),
self.partial(Self::tag_strikethrough_tilde),
self.partial(Self::tag_func),
self.partial(Self::tag_mention),
self.partial(Self::tag_hashtag),
self.partial(Self::shortcode_emoji),
self.partial(Self::link),
self.partial(Self::raw_url),
alt((
// Structured parsers run one nesting level deeper and are refused past the depth limit.
self.increase_nesting(alt((
self.partial(Self::unicode_emoji),
self.partial(Self::tag_small),
self.partial(Self::tag_plain),
self.partial(Self::tag_bold),
self.partial(Self::tag_italic),
self.partial(Self::tag_strikethrough),
self.partial(Self::url_no_embed),
self.partial(Self::base_bold_italic),
self.partial(Self::tag_inline_code),
self.partial(Self::tag_inline_math),
self.partial(Self::tag_strikethrough_tilde),
self.partial(Self::tag_func),
self.partial(Self::tag_mention),
self.partial(Self::tag_hashtag),
self.partial(Self::shortcode_emoji),
self.partial(Self::link),
self.partial(Self::raw_url),
))),
// Raw text sits outside `increase_nesting`, so it is still available at the depth limit.
self.partial(Self::tag_raw_text),
))(input)?;
Ok((input, token))
))(input)
}
fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let (input, token) = alt((
self.partial(Self::unicode_emoji),
self.partial(Self::url_no_embed),
self.partial(Self::tag_inline_code),
self.partial(Self::tag_inline_math),
self.partial(Self::tag_func),
self.partial(Self::tag_mention),
self.partial(Self::tag_hashtag),
self.partial(Self::shortcode_emoji),
self.partial(Self::raw_url),
self.increase_nesting(alt((
self.partial(Self::unicode_emoji),
self.partial(Self::url_no_embed),
self.partial(Self::tag_inline_code),
self.partial(Self::tag_inline_math),
self.partial(Self::tag_func),
self.partial(Self::tag_mention),
self.partial(Self::tag_hashtag),
self.partial(Self::shortcode_emoji),
self.partial(Self::raw_url),
))),
self.partial(Self::tag_raw_text),
))(input)?;
Ok((input, token))
@ -514,16 +598,18 @@ impl Context {
fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let (input, token) = alt((
self.partial(Self::unicode_emoji),
self.partial(Self::tag_small),
self.partial(Self::tag_plain),
self.partial(Self::tag_bold),
self.partial(Self::tag_italic),
self.partial(Self::tag_strikethrough),
self.partial(Self::base_bold_italic),
self.partial(Self::tag_strikethrough_tilde),
self.partial(Self::tag_func),
self.partial(Self::shortcode_emoji),
self.increase_nesting(alt((
self.partial(Self::unicode_emoji),
self.partial(Self::tag_small),
self.partial(Self::tag_plain),
self.partial(Self::tag_bold),
self.partial(Self::tag_italic),
self.partial(Self::tag_strikethrough),
self.partial(Self::base_bold_italic),
self.partial(Self::tag_strikethrough_tilde),
self.partial(Self::tag_func),
self.partial(Self::shortcode_emoji),
))),
self.partial(Self::tag_raw_text),
))(input)?;
Ok((input, token))
@ -1056,8 +1142,11 @@ impl Context {
fn raw_url<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let (input, url_span) = recognize(tuple((
protocol,
url_chars(|input| not(url_chars_base)(input), false),
self.partial_span(Self::protocol),
self.url_chars(
|input| recognize(not(self.partial_span(Self::url_chars_base)))(input),
false,
),
)))(input)?;
let url = url_span.into_fragment();
@ -1075,7 +1164,10 @@ impl Context {
fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let (input, _) = tag("<")(input)?;
let (input, url_span) = recognize(tuple((protocol, url_chars(tag(">"), true))))(input)?;
let (input, url_span) = recognize(tuple((
self.partial_span(Self::protocol),
self.url_chars(tag(">"), true),
)))(input)?;
let (input, _) = tag(">")(input)?;
Ok((
@ -1090,7 +1182,10 @@ impl Context {
let (input, _) = not(tag("["))(input)?;
let (input, (label_tok, _)) =
many_till(self.partial(Self::inline_label_safe_single), tag("]("))(input)?;
let (input, url_span) = recognize(tuple((protocol, url_chars(tag(")"), true))))(input)?;
let (input, url_span) = recognize(tuple((
self.partial_span(Self::protocol),
self.url_chars(tag(")"), true),
)))(input)?;
let (input, _) = tag(")")(input)?;
Ok((
@ -1202,74 +1297,136 @@ impl Context {
let (input, _) = tag("#")(input)?;
let (input, hashtag_text) =
map(recognize(many1(hashtag_chars)), Span::into_fragment)(input)?;
let (input, hashtag_text) = map(
recognize(many1(self.partial_span(Self::hashtag_chars))),
Span::into_fragment,
)(input)?;
Ok((input, Token::Hashtag(hashtag_text.into())))
}
}
/// Recognizes one unit of hashtag text.
///
/// A unit is either a bracketed group -- `(...)`, `[...]`, `「...」` or `（...）` -- that contains
/// a nested unit, or a single character that is not whitespace and not one of the excluded
/// punctuation or bracket characters.
///
/// The two CJK bracket pairs are spelled out explicitly. An empty `tag("")` matches without
/// consuming anything, which would make the bracket branches recurse without making progress.
#[inline]
fn hashtag_chars(input: Span) -> IResult<Span, Span> {
    recognize(alt((
        recognize(tuple((tag("("), hashtag_chars, tag(")")))),
        recognize(tuple((tag("["), hashtag_chars, tag("]")))),
        recognize(tuple((tag("「"), hashtag_chars, tag("」")))),
        recognize(tuple((tag("（"), hashtag_chars, tag("）")))),
        recognize(tuple((
            not(space1),
            // NOTE(review): `not_line_ending` consumes up to the end of the line rather than
            // acting as a negative lookahead -- confirm this is intended.
            not_line_ending,
            // Every bracket used as a group delimiter above must be excluded here, otherwise a
            // closing bracket would be swallowed as ordinary hashtag text.
            // NOTE(review): the duplicated `?` may originally have been a full-width `？`.
            not(one_of(".,:;!?#?/[]【】()「」（）<>")),
            anychar,
        ))),
    )))(input)
}
/// Wraps `func` so that it runs one nesting level deeper and is refused past the depth limit.
///
/// The returned parser fails immediately when the depth recorded on the incoming span has
/// already reached `self.depth_limit`. Otherwise it increments the depth on the span handed to
/// `func`, so parsers invoked from inside `func` observe the deeper level.
///
/// On success the remaining input is reset to the caller's depth. The remainder is derived from
/// the deepened input and carries its metadata along. Without the reset, every successful parse
/// would leave the rest of the text one level deeper, and a long run of sibling tokens would hit
/// the limit even though nothing is nested.
#[inline]
fn increase_nesting<'a, 'b, O, F>(
    &'b self,
    mut func: F,
) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, O> + 'b
where
    F: Parser<Span<'a>, O, nom::error::Error<Span<'a>>> + 'b,
{
    move |mut input| {
        if input.extra.depth >= self.depth_limit {
            return fail(input);
        }

        // NOTE(review): this nested item is unrelated to nesting control and is not called from
        // this closure; it looks like a leftover of the former free `protocol` function. It is
        // kept verbatim -- confirm whether it can be removed.
        #[inline]
        fn protocol(input: Span) -> IResult<Span, Span> {
            alt((tag("https://"), tag("http://")))(input)
        }

        // Remember the caller's level so it can be restored exactly, whatever `func` returns.
        let caller_depth = input.extra.depth;
        input.extra.depth += 1;

        let (mut rest, output) = func.parse(input)?;
        rest.extra.depth = caller_depth;
        Ok((rest, output))
    }
}
/// Recognizes one building block of a URL.
///
/// Tried in order: a run of alphanumeric characters, a `[...]` group, a `(...)` group (both
/// groups recurse into this parser until their closing bracket), or a single permitted
/// punctuation character.
#[inline]
fn url_chars_base(input: Span) -> IResult<Span, Span> {
    let bracketed = recognize(tuple((tag("["), many_till(url_chars_base, tag("]")))));
    let parenthesized = recognize(tuple((tag("("), many_till(url_chars_base, tag(")")))));
    let punctuation = recognize(one_of(".,_/:%#$&?!~=+-@"));

    alt((alphanumeric1_unicode, bracketed, parenthesized, punctuation))(input)
}
/// Recognizes one unit of hashtag text.
///
/// A unit is either a bracketed group -- `(...)`, `[...]`, `「...」` or `（...）` -- that contains
/// a nested unit parsed one nesting level deeper, or a single character that is not whitespace
/// and not one of the excluded punctuation or bracket characters.
///
/// The two CJK bracket pairs are spelled out explicitly. An empty `tag("")` matches without
/// consuming anything, so those branches would recurse without making progress until the depth
/// limit is reached, at every position of the input.
#[inline]
fn hashtag_chars<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
    recognize(alt((
        recognize(tuple((
            tag("("),
            self.increase_nesting(self.partial_span(Self::hashtag_chars)),
            tag(")"),
        ))),
        recognize(tuple((
            tag("["),
            self.increase_nesting(self.partial_span(Self::hashtag_chars)),
            tag("]"),
        ))),
        recognize(tuple((
            tag("「"),
            self.increase_nesting(self.partial_span(Self::hashtag_chars)),
            tag("」"),
        ))),
        recognize(tuple((
            tag("（"),
            self.increase_nesting(self.partial_span(Self::hashtag_chars)),
            tag("）"),
        ))),
        recognize(tuple((
            not(space1),
            // NOTE(review): `not_line_ending` consumes up to the end of the line rather than
            // acting as a negative lookahead -- confirm this is intended.
            not_line_ending,
            // Every bracket used as a group delimiter above must be excluded here, otherwise a
            // closing bracket would be swallowed as ordinary hashtag text.
            // NOTE(review): the duplicated `?` may originally have been a full-width `？`.
            not(one_of(".,:;!?#?/[]【】()「」（）<>")),
            anychar,
        ))),
    )))(input)
}
#[inline]
fn url_chars<'a, T: 'a>(
terminator: impl Fn(Span<'a>) -> IResult<Span<'a>, T> + 'a,
spaces: bool,
) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'a {
let chars = tuple((
not(tuple((space1, eof))),
not(tuple((space1, tag("\"")))),
not(tuple((opt(space1), terminator))),
alt((url_chars_base, if spaces { space1 } else { fail })),
));
/// Recognizes the URL scheme prefix: `https://` is tried first, then `http://`.
#[inline]
fn protocol<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
    let secure = tag("https://");
    let plain = tag("http://");
    alt((secure, plain))(input)
}
recognize(many1_count(chars))
/// Recognizes one building block of a URL.
///
/// Tried in order: a run of alphanumeric characters, a `[...]` group, a `(...)` group, or a
/// single permitted punctuation character. The contents of a group are parsed by this same
/// parser, wrapped in `increase_nesting`, until the closing bracket is found.
#[inline]
fn url_chars_base<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
    let bracketed = recognize(tuple((
        tag("["),
        many_till(
            self.increase_nesting(self.partial_span(Self::url_chars_base)),
            tag("]"),
        ),
    )));
    let parenthesized = recognize(tuple((
        tag("("),
        many_till(
            self.increase_nesting(self.partial_span(Self::url_chars_base)),
            tag(")"),
        ),
    )));
    let punctuation = recognize(one_of(".,_/:%#$&?!~=+-@"));

    alt((alphanumeric1_unicode, bracketed, parenthesized, punctuation))(input)
}
/// Builds a parser that recognizes the body of a URL up to, but not including, `terminator`.
///
/// The parser repeats one or more times. Each repetition first checks three negative
/// lookaheads: no whitespace running to the end of input, no whitespace followed by `"`, and no
/// optional whitespace followed by `terminator`. It then consumes one `url_chars_base` unit or,
/// when `spaces` is true, a run of whitespace.
#[inline]
fn url_chars<'a, 'b, F>(
    &'b self,
    mut terminator: F,
    spaces: bool,
) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'b
where
    F: Parser<Span<'a>, Span<'a>, nom::error::Error<Span<'a>>> + 'b,
{
    move |input| {
        let not_trailing_space = not(tuple((space1, eof)));
        let not_before_quote = not(tuple((space1, tag("\""))));
        let not_terminator = not(tuple((opt(space1), |rest| terminator.parse(rest))));
        let url_unit = alt((
            |rest| self.url_chars_base(rest),
            if spaces { space1 } else { fail },
        ));

        recognize(many1_count(tuple((
            not_trailing_space,
            not_before_quote,
            not_terminator,
            url_unit,
        ))))(input)
    }
}
}
#[cfg(test)]
mod test {
use crate::{url_chars, Context, Span, Token};
use crate::{Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT};
use nom::bytes::complete::tag;
use std::collections::HashMap;
/// Test helper: runs the full parser over `string` and returns the merged token, panicking
/// (via `unwrap`) if parsing fails.
///
/// NOTE(review): the body appears to contain both the pre-change call chain (`Context.full`
/// with `Span::new`) and the post-change one (`Context::default()` with `Span::new_extra`) --
/// confirm against the actual file.
fn parse_full(string: &str) -> Token {
Context.full(Span::new(string)).unwrap().1.merged()
Context::default()
.full(Span::new_extra(string, SpanMeta::default()))
.unwrap()
.1
.merged()
}
#[test]
fn parse_url_chars() {
let ctx = Context::default();
assert_eq!(
url_chars(tag(")"), true)(Span::new(
"https://en.wikipedia.org/wiki/Sandbox_(computer_security))"
ctx.url_chars(tag(")"), true)(Span::new_extra(
"https://en.wikipedia.org/wiki/Sandbox_(computer_security))",
SpanMeta::default()
))
.unwrap()
.1
@ -1278,8 +1435,9 @@ mod test {
);
assert_eq!(
url_chars(tag(")"), true)(Span::new(
"https://en.wikipedia.org/wiki/Sandbox_(computer_security)))"
ctx.url_chars(tag(")"), true)(Span::new_extra(
"https://en.wikipedia.org/wiki/Sandbox_(computer_security)))",
SpanMeta::default()
))
.unwrap()
.1
@ -1288,26 +1446,35 @@ mod test {
);
assert_eq!(
url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among_Us "))
.unwrap()
.1
.into_fragment(),
ctx.url_chars(tag(")"), true)(Span::new_extra(
"https://cs.wikipedia.org/wiki/Among_Us ",
SpanMeta::default()
))
.unwrap()
.1
.into_fragment(),
"https://cs.wikipedia.org/wiki/Among_Us",
);
assert_eq!(
url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among Us )"))
.unwrap()
.1
.into_fragment(),
ctx.url_chars(tag(")"), true)(Span::new_extra(
"https://cs.wikipedia.org/wiki/Among Us )",
SpanMeta::default()
))
.unwrap()
.1
.into_fragment(),
"https://cs.wikipedia.org/wiki/Among Us"
);
assert_eq!(
url_chars(tag(")"), false)(Span::new("https://en.wikipedia.org/wiki/Among Us )"))
.unwrap()
.1
.into_fragment(),
ctx.url_chars(tag(")"), false)(Span::new_extra(
"https://en.wikipedia.org/wiki/Among Us )",
SpanMeta::default()
))
.unwrap()
.1
.into_fragment(),
"https://en.wikipedia.org/wiki/Among"
);
}
@ -1593,6 +1760,23 @@ text</center>"#
);
}
/// `DEFAULT_DEPTH_LIMIT` nested `<b>` tags must all be parsed, while the markup inside the
/// innermost one is left as plain text.
#[test]
fn limit_nesting() {
    let innermost = " <s><i>test</i></s> ";

    // Expected tree: the raw inner text wrapped in one `Bold` per permitted nesting level.
    let expected = (0..DEFAULT_DEPTH_LIMIT)
        .fold(Token::PlainText(innermost.into()), |tok, _| {
            Token::Bold(Box::new(tok))
        });

    let source = format!(
        "{}{}{}",
        "<b>".repeat(DEFAULT_DEPTH_LIMIT),
        innermost,
        "</b>".repeat(DEFAULT_DEPTH_LIMIT)
    );

    assert_eq!(parse_full(&source), expected);
}
#[test]
fn parse_mention() {
assert_eq!(