MMM: Nesting-limited parsing
This commit is contained in:
parent
23a63f2fe9
commit
86d5c87e9a
|
@ -1649,6 +1649,7 @@ dependencies = [
|
|||
"emojis",
|
||||
"nom",
|
||||
"nom_locate",
|
||||
"tracing",
|
||||
"unicode-segmentation",
|
||||
]
|
||||
|
||||
|
|
|
@ -10,4 +10,5 @@ emojis = { workspace = true }
|
|||
nom = { workspace = true }
|
||||
nom_locate = { workspace = true }
|
||||
compact_str = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
unicode-segmentation = { workspace = true }
|
||||
|
|
|
@ -7,14 +7,15 @@ use nom::character::complete::{
|
|||
satisfy, space1, tab,
|
||||
};
|
||||
use nom::combinator::{eof, fail, map, not, opt, recognize};
|
||||
use nom::error::ErrorKind;
|
||||
use nom::error::{ErrorKind, ParseError};
|
||||
use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1};
|
||||
use nom::sequence::tuple;
|
||||
use nom::{IResult, Offset, Slice};
|
||||
use nom::{IResult, Offset, Parser, Slice};
|
||||
use nom_locate::LocatedSpan;
|
||||
use std::collections::HashMap;
|
||||
use std::convert::{identity, Infallible};
|
||||
use std::marker::PhantomData;
|
||||
use tracing::trace;
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
|
||||
|
@ -217,7 +218,18 @@ impl Token {
|
|||
}
|
||||
}
|
||||
|
||||
type Span<'a> = LocatedSpan<&'a str>;
|
||||
/// Extra state carried by every `Span` (it is the `extra` type parameter of the
/// `LocatedSpan` alias) while parsing.
///
/// Deriving `Default` makes a freshly created span start at depth 0.
#[derive(Debug, Default, Copy, Clone)]
|
||||
pub struct SpanMeta {
|
||||
// Current nesting depth; `Context::increase_nesting` compares it against the
// depth limit and increments it.
depth: usize,
|
||||
}
|
||||
|
||||
impl SpanMeta {
|
||||
/// Creates metadata with an explicit starting depth.
fn new(depth: usize) -> Self {
|
||||
Self { depth }
|
||||
}
|
||||
}
|
||||
|
||||
type Span<'a> = LocatedSpan<&'a str, SpanMeta>;
|
||||
|
||||
trait SliceOffset {
|
||||
fn up_to(&self, other: &Self) -> Self;
|
||||
|
@ -300,7 +312,10 @@ fn spliced<'a>(
|
|||
type NE<E> = nom::Err<E>;
|
||||
type NomError<'x> = nom::error::Error<Span<'x>>;
|
||||
|
||||
let quote_span = Span::new(&combined);
|
||||
let quote_span = Span::new_extra(
|
||||
&combined,
|
||||
segments.first().map_or(SpanMeta::new(0), |s| s.extra),
|
||||
);
|
||||
let (input, inner) = match func(quote_span) {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
|
@ -311,7 +326,10 @@ fn spliced<'a>(
|
|||
let offset = offset_new - offset_seg_new;
|
||||
let offset_orig = offset + seg_parent.location_offset();
|
||||
Err(NE::Error(NomError::new(
|
||||
Span::new(&parent.into_fragment()[offset_orig..]),
|
||||
Span::new_extra(
|
||||
&parent.into_fragment()[offset_orig..],
|
||||
seg_parent.extra,
|
||||
),
|
||||
e.code,
|
||||
)))
|
||||
} else {
|
||||
|
@ -405,9 +423,53 @@ impl<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>> From<T> for FlankingDel
|
|||
}
|
||||
}
|
||||
|
||||
pub struct Context;
|
||||
/// Parser context; holds the configuration shared by all parsing methods.
pub struct Context {
|
||||
// Depth at which `increase_nesting` starts failing instead of running the
// wrapped parser (the check there is `depth >= depth_limit`).
depth_limit: usize,
|
||||
}
|
||||
|
||||
/// Depth limit used by `Context::default()`.
const DEFAULT_DEPTH_LIMIT: usize = 24;
|
||||
|
||||
impl Default for Context {
|
||||
/// Builds a context limited to `DEFAULT_DEPTH_LIMIT` nesting levels.
fn default() -> Self {
|
||||
Context::new(DEFAULT_DEPTH_LIMIT)
|
||||
}
|
||||
}
|
||||
|
||||
impl Context {
|
||||
/// Creates a context whose nesting-limited parsers stop once a span's depth
/// reaches `depth_limit`.
pub fn new(depth_limit: usize) -> Self {
|
||||
Self { depth_limit }
|
||||
}
|
||||
|
||||
/// Runs the `full` parser over `input`, starting at depth 0, and returns the
/// merged token.
///
/// This never returns an error: a parser failure is logged with `trace!` and
/// turned into a `Token::PlainText`.
pub fn parse_full(&self, input: &str) -> Token {
|
||||
match self.full(Span::new_extra(input, SpanMeta::default())) {
|
||||
// The unparsed remainder (first tuple element) is discarded.
Ok((_, t)) => t.merged(),
|
||||
Err(e) => {
|
||||
trace!(input = input, "Full parser fail: {:?}", e);
|
||||
// NOTE(review): the fallback text is the error's string form, not
// `input` — confirm that showing the parser error is intended.
Token::PlainText(e.to_compact_string())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses `input` starting at depth 0 and returns the merged token.
///
/// This never returns an error: a parser failure is logged with `trace!` and
/// turned into a `Token::PlainText`.
pub fn parse_inline(&self, input: &str) -> Token {
|
||||
// NOTE(review): this calls `self.full`, exactly like `parse_full`; the name
// and the log message suggest an inline-only parser was meant — TODO confirm.
match self.full(Span::new_extra(input, SpanMeta::default())) {
|
||||
// The unparsed remainder (first tuple element) is discarded.
Ok((_, t)) => t.merged(),
|
||||
Err(e) => {
|
||||
trace!(input = input, "Inline parser fail: {:?}", e);
|
||||
// NOTE(review): the fallback text is the error's string form, not
// `input` — confirm that showing the parser error is intended.
Token::PlainText(e.to_compact_string())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Runs the `inline_ui` parser over `input`, starting at depth 0, and returns
/// the merged token.
///
/// This never returns an error: a parser failure is logged with `trace!` and
/// turned into a `Token::PlainText`.
pub fn parse_ui(&self, input: &str) -> Token {
|
||||
match self.inline_ui(Span::new_extra(input, SpanMeta::default())) {
|
||||
// The unparsed remainder (first tuple element) is discarded.
Ok((_, t)) => t.merged(),
|
||||
Err(e) => {
|
||||
// NOTE(review): same message as in `parse_inline`, so the two cannot be
// told apart in logs — confirm whether that is intended.
trace!(input = input, "Inline parser fail: {:?}", e);
|
||||
// NOTE(review): the fallback text is the error's string form, not
// `input` — confirm that showing the parser error is intended.
Token::PlainText(e.to_compact_string())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn partial(
|
||||
&self,
|
||||
|
@ -416,6 +478,14 @@ impl Context {
|
|||
move |input| func(self, input)
|
||||
}
|
||||
|
||||
/// Binds `self` as the first argument of `func`, yielding a closure that takes
/// only the input span and can be handed to nom combinators.
///
/// Used for methods whose output is a `Span`, such as `protocol`,
/// `url_chars_base` and `hashtag_chars`.
#[inline]
|
||||
fn partial_span(
|
||||
&self,
|
||||
func: impl for<'a> Fn(&Self, Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'static,
|
||||
) -> impl for<'a> Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>> + '_ {
|
||||
// The returned closure borrows `self`, hence the `'_` bound above.
move |input| func(self, input)
|
||||
}
|
||||
|
||||
/// Applies `full_single` one or more times and wraps the collected tokens in
/// `Token::Sequence`.
///
/// Because `many1` is used, this fails when `full_single` does not match even once.
pub fn full<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
|
||||
map(many1(self.partial(Self::full_single)), Token::Sequence)(input)
|
||||
}
|
||||
|
@ -431,6 +501,17 @@ impl Context {
|
|||
)(input)
|
||||
}
|
||||
|
||||
/// Restricted parser behind `parse_ui`: repeatedly tries `unicode_emoji`,
/// `shortcode_emoji` and `tag_raw_text` (in that order) and wraps the results
/// in `Token::Sequence`.
///
/// Because `many1` is used, this fails when none of the alternatives matches
/// at least once.
fn inline_ui<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
|
||||
map(
|
||||
many1(alt((
|
||||
self.partial(Self::unicode_emoji),
|
||||
self.partial(Self::shortcode_emoji),
|
||||
self.partial(Self::tag_raw_text),
|
||||
))),
|
||||
Token::Sequence,
|
||||
)(input)
|
||||
}
|
||||
|
||||
fn base_bold_italic<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
|
||||
alt((
|
||||
self.partial(Self::tag_bold_italic_asterisk),
|
||||
|
@ -444,15 +525,14 @@ impl Context {
|
|||
|
||||
fn full_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
|
||||
let (input, token) = alt((
|
||||
self.increase_nesting(alt((
|
||||
self.partial(Self::unicode_emoji),
|
||||
alt((
|
||||
self.partial(Self::tag_block_center),
|
||||
self.partial(Self::tag_small),
|
||||
self.partial(Self::tag_plain),
|
||||
self.partial(Self::tag_bold),
|
||||
self.partial(Self::tag_italic),
|
||||
self.partial(Self::tag_strikethrough),
|
||||
)),
|
||||
self.partial(Self::url_no_embed),
|
||||
self.partial(Self::base_bold_italic),
|
||||
self.partial(Self::tag_block_code),
|
||||
|
@ -467,13 +547,15 @@ impl Context {
|
|||
self.partial(Self::shortcode_emoji),
|
||||
self.partial(Self::link),
|
||||
self.partial(Self::raw_url),
|
||||
))),
|
||||
self.partial(Self::tag_raw_text),
|
||||
))(input)?;
|
||||
Ok((input, token))
|
||||
}
|
||||
|
||||
fn inline_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
|
||||
let (input, token) = alt((
|
||||
alt((
|
||||
self.increase_nesting(alt((
|
||||
self.partial(Self::unicode_emoji),
|
||||
self.partial(Self::tag_small),
|
||||
self.partial(Self::tag_plain),
|
||||
|
@ -491,13 +573,14 @@ impl Context {
|
|||
self.partial(Self::shortcode_emoji),
|
||||
self.partial(Self::link),
|
||||
self.partial(Self::raw_url),
|
||||
))),
|
||||
self.partial(Self::tag_raw_text),
|
||||
))(input)?;
|
||||
Ok((input, token))
|
||||
))(input)
|
||||
}
|
||||
|
||||
fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
|
||||
let (input, token) = alt((
|
||||
self.increase_nesting(alt((
|
||||
self.partial(Self::unicode_emoji),
|
||||
self.partial(Self::url_no_embed),
|
||||
self.partial(Self::tag_inline_code),
|
||||
|
@ -507,6 +590,7 @@ impl Context {
|
|||
self.partial(Self::tag_hashtag),
|
||||
self.partial(Self::shortcode_emoji),
|
||||
self.partial(Self::raw_url),
|
||||
))),
|
||||
self.partial(Self::tag_raw_text),
|
||||
))(input)?;
|
||||
Ok((input, token))
|
||||
|
@ -514,6 +598,7 @@ impl Context {
|
|||
|
||||
fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
|
||||
let (input, token) = alt((
|
||||
self.increase_nesting(alt((
|
||||
self.partial(Self::unicode_emoji),
|
||||
self.partial(Self::tag_small),
|
||||
self.partial(Self::tag_plain),
|
||||
|
@ -524,6 +609,7 @@ impl Context {
|
|||
self.partial(Self::tag_strikethrough_tilde),
|
||||
self.partial(Self::tag_func),
|
||||
self.partial(Self::shortcode_emoji),
|
||||
))),
|
||||
self.partial(Self::tag_raw_text),
|
||||
))(input)?;
|
||||
Ok((input, token))
|
||||
|
@ -1056,8 +1142,11 @@ impl Context {
|
|||
|
||||
fn raw_url<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
|
||||
let (input, url_span) = recognize(tuple((
|
||||
protocol,
|
||||
url_chars(|input| not(url_chars_base)(input), false),
|
||||
self.partial_span(Self::protocol),
|
||||
self.url_chars(
|
||||
|input| recognize(not(self.partial_span(Self::url_chars_base)))(input),
|
||||
false,
|
||||
),
|
||||
)))(input)?;
|
||||
|
||||
let url = url_span.into_fragment();
|
||||
|
@ -1075,7 +1164,10 @@ impl Context {
|
|||
|
||||
fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
|
||||
let (input, _) = tag("<")(input)?;
|
||||
let (input, url_span) = recognize(tuple((protocol, url_chars(tag(">"), true))))(input)?;
|
||||
let (input, url_span) = recognize(tuple((
|
||||
self.partial_span(Self::protocol),
|
||||
self.url_chars(tag(">"), true),
|
||||
)))(input)?;
|
||||
let (input, _) = tag(">")(input)?;
|
||||
|
||||
Ok((
|
||||
|
@ -1090,7 +1182,10 @@ impl Context {
|
|||
let (input, _) = not(tag("["))(input)?;
|
||||
let (input, (label_tok, _)) =
|
||||
many_till(self.partial(Self::inline_label_safe_single), tag("]("))(input)?;
|
||||
let (input, url_span) = recognize(tuple((protocol, url_chars(tag(")"), true))))(input)?;
|
||||
let (input, url_span) = recognize(tuple((
|
||||
self.partial_span(Self::protocol),
|
||||
self.url_chars(tag(")"), true),
|
||||
)))(input)?;
|
||||
let (input, _) = tag(")")(input)?;
|
||||
|
||||
Ok((
|
||||
|
@ -1202,20 +1297,55 @@ impl Context {
|
|||
|
||||
let (input, _) = tag("#")(input)?;
|
||||
|
||||
let (input, hashtag_text) =
|
||||
map(recognize(many1(hashtag_chars)), Span::into_fragment)(input)?;
|
||||
let (input, hashtag_text) = map(
|
||||
recognize(many1(self.partial_span(Self::hashtag_chars))),
|
||||
Span::into_fragment,
|
||||
)(input)?;
|
||||
|
||||
Ok((input, Token::Hashtag(hashtag_text.into())))
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn hashtag_chars(input: Span) -> IResult<Span, Span> {
|
||||
#[inline]
|
||||
fn increase_nesting<'a, 'b, O, F>(
|
||||
&'b self,
|
||||
mut func: F,
|
||||
) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, O> + 'b
|
||||
where
|
||||
F: Parser<Span<'a>, O, nom::error::Error<Span<'a>>> + 'b,
|
||||
{
|
||||
move |mut input| {
|
||||
if input.extra.depth >= self.depth_limit {
|
||||
return fail(input);
|
||||
}
|
||||
|
||||
input.extra.depth += 1;
|
||||
func.parse(input)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn hashtag_chars<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
|
||||
recognize(alt((
|
||||
recognize(tuple((tag("("), hashtag_chars, tag(")")))),
|
||||
recognize(tuple((tag("["), hashtag_chars, tag("]")))),
|
||||
recognize(tuple((tag("「"), hashtag_chars, tag("」")))),
|
||||
recognize(tuple((tag("("), hashtag_chars, tag(")")))),
|
||||
recognize(tuple((
|
||||
tag("("),
|
||||
self.increase_nesting(self.partial_span(Self::hashtag_chars)),
|
||||
tag(")"),
|
||||
))),
|
||||
recognize(tuple((
|
||||
tag("["),
|
||||
self.increase_nesting(self.partial_span(Self::hashtag_chars)),
|
||||
tag("]"),
|
||||
))),
|
||||
recognize(tuple((
|
||||
tag("「"),
|
||||
self.increase_nesting(self.partial_span(Self::hashtag_chars)),
|
||||
tag("」"),
|
||||
))),
|
||||
recognize(tuple((
|
||||
tag("("),
|
||||
self.increase_nesting(self.partial_span(Self::hashtag_chars)),
|
||||
tag(")"),
|
||||
))),
|
||||
recognize(tuple((
|
||||
not(space1),
|
||||
not_line_ending,
|
||||
|
@ -1223,53 +1353,80 @@ fn hashtag_chars(input: Span) -> IResult<Span, Span> {
|
|||
anychar,
|
||||
))),
|
||||
)))(input)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn protocol(input: Span) -> IResult<Span, Span> {
|
||||
#[inline]
|
||||
fn protocol<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
|
||||
alt((tag("https://"), tag("http://")))(input)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn url_chars_base(input: Span) -> IResult<Span, Span> {
|
||||
#[inline]
|
||||
fn url_chars_base<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
|
||||
alt((
|
||||
alphanumeric1_unicode,
|
||||
recognize(tuple((tag("["), many_till(url_chars_base, tag("]"))))),
|
||||
recognize(tuple((tag("("), many_till(url_chars_base, tag(")"))))),
|
||||
recognize(tuple((
|
||||
tag("["),
|
||||
many_till(
|
||||
self.increase_nesting(self.partial_span(Self::url_chars_base)),
|
||||
tag("]"),
|
||||
),
|
||||
))),
|
||||
recognize(tuple((
|
||||
tag("("),
|
||||
many_till(
|
||||
self.increase_nesting(self.partial_span(Self::url_chars_base)),
|
||||
tag(")"),
|
||||
),
|
||||
))),
|
||||
recognize(one_of(".,_/:%#$&?!~=+-@")),
|
||||
))(input)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn url_chars<'a, T: 'a>(
|
||||
terminator: impl Fn(Span<'a>) -> IResult<Span<'a>, T> + 'a,
|
||||
#[inline]
|
||||
fn url_chars<'a, 'b, F>(
|
||||
&'b self,
|
||||
mut terminator: F,
|
||||
spaces: bool,
|
||||
) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'a {
|
||||
let chars = tuple((
|
||||
) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'b
|
||||
where
|
||||
F: Parser<Span<'a>, Span<'a>, nom::error::Error<Span<'a>>> + 'b,
|
||||
{
|
||||
move |input| {
|
||||
recognize(many1_count(tuple((
|
||||
not(tuple((space1, eof))),
|
||||
not(tuple((space1, tag("\"")))),
|
||||
not(tuple((opt(space1), terminator))),
|
||||
alt((url_chars_base, if spaces { space1 } else { fail })),
|
||||
));
|
||||
|
||||
recognize(many1_count(chars))
|
||||
not(tuple((opt(space1), |input| terminator.parse(input)))),
|
||||
alt((
|
||||
|input| self.url_chars_base(input),
|
||||
if spaces { space1 } else { fail },
|
||||
)),
|
||||
))))(input)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use crate::{url_chars, Context, Span, Token};
|
||||
use crate::{Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT};
|
||||
use nom::bytes::complete::tag;
|
||||
use std::collections::HashMap;
|
||||
|
||||
fn parse_full(string: &str) -> Token {
|
||||
Context.full(Span::new(string)).unwrap().1.merged()
|
||||
Context::default()
|
||||
.full(Span::new_extra(string, SpanMeta::default()))
|
||||
.unwrap()
|
||||
.1
|
||||
.merged()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_url_chars() {
|
||||
let ctx = Context::default();
|
||||
|
||||
assert_eq!(
|
||||
url_chars(tag(")"), true)(Span::new(
|
||||
"https://en.wikipedia.org/wiki/Sandbox_(computer_security))"
|
||||
ctx.url_chars(tag(")"), true)(Span::new_extra(
|
||||
"https://en.wikipedia.org/wiki/Sandbox_(computer_security))",
|
||||
SpanMeta::default()
|
||||
))
|
||||
.unwrap()
|
||||
.1
|
||||
|
@ -1278,8 +1435,9 @@ mod test {
|
|||
);
|
||||
|
||||
assert_eq!(
|
||||
url_chars(tag(")"), true)(Span::new(
|
||||
"https://en.wikipedia.org/wiki/Sandbox_(computer_security)))"
|
||||
ctx.url_chars(tag(")"), true)(Span::new_extra(
|
||||
"https://en.wikipedia.org/wiki/Sandbox_(computer_security)))",
|
||||
SpanMeta::default()
|
||||
))
|
||||
.unwrap()
|
||||
.1
|
||||
|
@ -1288,7 +1446,10 @@ mod test {
|
|||
);
|
||||
|
||||
assert_eq!(
|
||||
url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among_Us "))
|
||||
ctx.url_chars(tag(")"), true)(Span::new_extra(
|
||||
"https://cs.wikipedia.org/wiki/Among_Us ",
|
||||
SpanMeta::default()
|
||||
))
|
||||
.unwrap()
|
||||
.1
|
||||
.into_fragment(),
|
||||
|
@ -1296,7 +1457,10 @@ mod test {
|
|||
);
|
||||
|
||||
assert_eq!(
|
||||
url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among Us )"))
|
||||
ctx.url_chars(tag(")"), true)(Span::new_extra(
|
||||
"https://cs.wikipedia.org/wiki/Among Us )",
|
||||
SpanMeta::default()
|
||||
))
|
||||
.unwrap()
|
||||
.1
|
||||
.into_fragment(),
|
||||
|
@ -1304,7 +1468,10 @@ mod test {
|
|||
);
|
||||
|
||||
assert_eq!(
|
||||
url_chars(tag(")"), false)(Span::new("https://en.wikipedia.org/wiki/Among Us )"))
|
||||
ctx.url_chars(tag(")"), false)(Span::new_extra(
|
||||
"https://en.wikipedia.org/wiki/Among Us )",
|
||||
SpanMeta::default()
|
||||
))
|
||||
.unwrap()
|
||||
.1
|
||||
.into_fragment(),
|
||||
|
@ -1593,6 +1760,23 @@ text</center>"#
|
|||
);
|
||||
}
|
||||
|
||||
// Checks the depth limit: with `DEFAULT_DEPTH_LIMIT` nested `<b>` tags, the
// `<s><i>` markup inside them is expected to come back as plain text.
#[test]
|
||||
fn limit_nesting() {
|
||||
// Expected result: the innermost text, markup included, as plain text...
let mut tok = Token::PlainText(" <s><i>test</i></s> ".into());
|
||||
// ...wrapped in one `Bold` per `<b>` tag in the input.
for _ in 0..DEFAULT_DEPTH_LIMIT {
|
||||
tok = Token::Bold(Box::new(tok));
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
parse_full(
|
||||
// Input: N opening tags, the inner markup, N closing tags.
&("<b>".repeat(DEFAULT_DEPTH_LIMIT)
|
||||
+ " <s><i>test</i></s> "
|
||||
+ &*"</b>".repeat(DEFAULT_DEPTH_LIMIT))
|
||||
),
|
||||
tok
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_mention() {
|
||||
assert_eq!(
|
||||
|
|
Loading…
Reference in New Issue