Fixed parsing complexity issues in MMM
ci/woodpecker/push/ociImagePush Pipeline is running Details

This commit is contained in:
Natty 2024-05-22 03:42:50 +02:00
parent b74f2d69a4
commit 78c93f3c20
Signed by: natty
GPG Key ID: BF6CB659ADEE60EC
1 changed file with 151 additions and 111 deletions

View File

@ -1,5 +1,11 @@
use std::collections::HashMap;
use std::convert::{identity, Infallible};
use std::io::{Cursor, Write};
use std::marker::PhantomData;
use compact_str::{CompactString, ToCompactString}; use compact_str::{CompactString, ToCompactString};
use either::Either; use either::Either;
use nom::{IResult, Offset, Parser, Slice};
use nom::branch::alt; use nom::branch::alt;
use nom::bytes::complete::{tag, tag_no_case}; use nom::bytes::complete::{tag, tag_no_case};
use nom::character::complete::{ use nom::character::complete::{
@ -10,14 +16,9 @@ use nom::combinator::{eof, fail, map, not, opt, peek, recognize};
use nom::error::ErrorKind; use nom::error::ErrorKind;
use nom::multi::{many0_count, many1, many1_count, many_till, separated_list1}; use nom::multi::{many0_count, many1, many1_count, many_till, separated_list1};
use nom::sequence::tuple; use nom::sequence::tuple;
use nom::{IResult, Offset, Parser, Slice};
use nom_locate::LocatedSpan; use nom_locate::LocatedSpan;
use quick_xml::events::{BytesText, Event}; use quick_xml::events::{BytesText, Event};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::convert::{identity, Infallible};
use std::io::{Cursor, Write};
use std::marker::PhantomData;
use strum::IntoStaticStr; use strum::IntoStaticStr;
use tracing::trace; use tracing::trace;
use unicode_segmentation::UnicodeSegmentation; use unicode_segmentation::UnicodeSegmentation;
@ -436,6 +437,20 @@ pub fn to_xml_string(token: &Token) -> quick_xml::Result<String> {
Ok(String::from_utf8(writer.into_inner().into_inner())?) Ok(String::from_utf8(writer.into_inner().into_inner())?)
} }
/// Reports whether `input` starts at the beginning of a line.
///
/// A span counts as a line start when it sits at offset 0, or when the byte
/// directly in front of its fragment is `\n`. Only that single byte is read,
/// so the check never has to search backwards for a line separator.
pub fn janky_is_line_begin(input: Span<'_>) -> bool {
    if input.location_offset() == 0 {
        // Nothing precedes the fragment, so there is no byte to inspect.
        return true;
    }

    let fragment_start = input.fragment().as_bytes().as_ptr();

    // SAFETY: This is knowingly fragile. It is sound only while nom-locate
    // keeps every fragment a subslice of the original input; together with a
    // non-zero offset that means at least one readable byte of the same
    // buffer lies directly before `fragment_start`.
    // NOTE(review): a span whose offset does not describe its real position
    // inside a larger buffer would break this assumption -- confirm that no
    // caller constructs one.
    let preceding_byte = unsafe { *fragment_start.sub(1) };

    preceding_byte == b'\n'
}
#[derive(Debug, Default, Copy, Clone)] #[derive(Debug, Default, Copy, Clone)]
pub struct SpanMeta { pub struct SpanMeta {
depth: usize, depth: usize,
@ -500,7 +515,12 @@ fn space1_unicode(input: Span) -> IResult<Span, Span> {
#[inline] #[inline]
fn alphanumeric1_unicode(input: Span) -> IResult<Span, Span> { fn alphanumeric1_unicode(input: Span) -> IResult<Span, Span> {
recognize(many1_count(satisfy(char::is_alphanumeric)))(input) recognize(many1_count(char_alphanumeric_unicode))(input)
}
#[inline]
fn char_alphanumeric_unicode(input: Span) -> IResult<Span, char> {
satisfy(char::is_alphanumeric)(input)
} }
fn spliced<'a>( fn spliced<'a>(
@ -877,7 +897,7 @@ impl Context {
let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?; let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?;
if let (None, None) = leading_spaces { if let (None, None) = leading_spaces {
if input.get_column() != 1 { if !janky_is_line_begin(input) {
return fail(input); return fail(input);
} }
} }
@ -915,7 +935,7 @@ impl Context {
let (input, _) = opt(line_ending)(input)?; let (input, _) = opt(line_ending)(input)?;
if input.get_column() != 1 { if !janky_is_line_begin(input) {
return fail(input); return fail(input);
} }
@ -938,7 +958,7 @@ impl Context {
let (input, _) = opt(line_ending)(input)?; let (input, _) = opt(line_ending)(input)?;
if input.get_column() != 1 { if !janky_is_line_begin(input) {
return fail(input); return fail(input);
} }
@ -980,7 +1000,7 @@ impl Context {
let (input, _) = opt(line_ending)(input)?; let (input, _) = opt(line_ending)(input)?;
if input.get_column() != 1 { if !janky_is_line_begin(input) {
return fail(input); return fail(input);
} }
@ -1024,7 +1044,7 @@ impl Context {
move |input| { move |input| {
if let FlankingRule::Strict = opening_rule { if let FlankingRule::Strict = opening_rule {
let (input, pre) = opt(recognize(tuple(( let (input, pre) = opt(recognize(tuple((
alphanumeric1_unicode, char_alphanumeric_unicode,
opt(tag("\\")), opt(tag("\\")),
&opening_tag, &opening_tag,
peek(not(alt((recognize(satisfy(|c| c.is_whitespace())), eof)))), peek(not(alt((recognize(satisfy(|c| c.is_whitespace())), eof)))),
@ -1072,7 +1092,7 @@ impl Context {
input, input,
Token::Sequence(vec![ Token::Sequence(vec![
Token::PlainText(begin.fragment_between(&post_open).into()), Token::PlainText(begin.fragment_between(&post_open).into()),
((fallback.collector)(&mut inner)), (fallback.collector)(&mut inner),
Token::PlainText(closing.into_fragment().into()), Token::PlainText(closing.into_fragment().into()),
]), ]),
)); ));
@ -1355,7 +1375,7 @@ impl Context {
Matcher::new( Matcher::new(
&move |input| { &move |input| {
map( map(
tuple(((not(line_ending)), self.partial(Self::inline_single))), tuple((not(line_ending), self.partial(Self::inline_single))),
|(_, captured)| captured, |(_, captured)| captured,
)(input) )(input)
}, },
@ -1365,7 +1385,7 @@ impl Context {
&move |input| { &move |input| {
map( map(
tuple(( tuple((
(not(line_ending)), not(line_ending),
self.partial(Self::inline_non_formatting_single), self.partial(Self::inline_non_formatting_single),
)), )),
|(_, captured)| captured, |(_, captured)| captured,
@ -1492,18 +1512,7 @@ impl Context {
)) ))
} }
fn shortcode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> { fn shortcode_emoji_inner<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
if let (plain_out, Some(plain)) = map(
opt(recognize(tuple((
alphanumeric1_unicode,
self.partial(Self::shortcode_emoji),
)))),
|o| o.map(Span::into_fragment),
)(input)?
{
return Ok((plain_out, Token::PlainText(plain.into())));
}
let (input, _) = tag(":")(input)?; let (input, _) = tag(":")(input)?;
let (input, shortcode) = map( let (input, shortcode) = map(
recognize(many1(alt(( recognize(many1(alt((
@ -1534,11 +1543,11 @@ impl Context {
)) ))
} }
fn tag_mention<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> { fn shortcode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
if let (plain_out, Some(plain)) = map( if let (plain_out, Some(plain)) = map(
opt(recognize(tuple(( opt(recognize(tuple((
alt((tag("\\"), alphanumeric1_unicode)), char_alphanumeric_unicode,
self.partial(Self::tag_mention), self.partial(Self::shortcode_emoji_inner),
)))), )))),
|o| o.map(Span::into_fragment), |o| o.map(Span::into_fragment),
)(input)? )(input)?
@ -1546,6 +1555,10 @@ impl Context {
return Ok((plain_out, Token::PlainText(plain.into()))); return Ok((plain_out, Token::PlainText(plain.into())));
} }
self.shortcode_emoji_inner(input)
}
fn tag_mention_inner<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let tags = one_of("@!"); let tags = one_of("@!");
let (input, mention_type) = map(tags, |c| match c { let (input, mention_type) = map(tags, |c| match c {
'@' => MentionType::User, '@' => MentionType::User,
@ -1591,9 +1604,24 @@ impl Context {
)) ))
} }
/// Parses a mention, unless it is glued to a preceding backslash or
/// alphanumeric character, in which case the whole run becomes plain text.
fn tag_mention<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    // One backslash or one alphanumeric character, immediately followed by
    // something that `tag_mention_inner` accepts.
    let glued_mention = recognize(tuple((
        alt((tag("\\"), recognize(char_alphanumeric_unicode))),
        self.partial(Self::tag_mention_inner),
    )));

    match opt(glued_mention)(input)? {
        (rest, Some(literal)) => Ok((rest, Token::PlainText(literal.into_fragment().into()))),
        // No such prefix: parse the mention from the untouched start position.
        (_, None) => self.tag_mention_inner(input),
    }
}
fn tag_hashtag<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> { fn tag_hashtag<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let (input, maybe_preceded) = let (input, maybe_preceded) =
opt(recognize(tuple((alphanumeric1_unicode, tag("#")))))(input)?; opt(recognize(tuple((char_alphanumeric_unicode, tag("#")))))(input)?;
if let Some(preceded) = maybe_preceded { if let Some(preceded) = maybe_preceded {
return Ok((input, Token::PlainText(preceded.into_fragment().into()))); return Ok((input, Token::PlainText(preceded.into_fragment().into())));
@ -1714,10 +1742,12 @@ impl Context {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use crate::{to_xml_string, Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT};
use nom::bytes::complete::tag;
use std::collections::HashMap; use std::collections::HashMap;
use nom::bytes::complete::tag;
use crate::{Context, DEFAULT_DEPTH_LIMIT, Span, SpanMeta, to_xml_string, Token};
fn parse_full(string: &str) -> Token { fn parse_full(string: &str) -> Token {
Context::default() Context::default()
.full(Span::new_extra(string, SpanMeta::default())) .full(Span::new_extra(string, SpanMeta::default()))
@ -1738,7 +1768,7 @@ mod test {
assert_eq!( assert_eq!(
ctx.url_chars(tag(")"), true)(Span::new_extra( ctx.url_chars(tag(")"), true)(Span::new_extra(
"https://en.wikipedia.org/wiki/Sandbox_(computer_security))", "https://en.wikipedia.org/wiki/Sandbox_(computer_security))",
SpanMeta::default() SpanMeta::default(),
)) ))
.unwrap() .unwrap()
.1 .1
@ -1771,7 +1801,7 @@ mod test {
assert_eq!( assert_eq!(
ctx.url_chars(tag(")"), true)(Span::new_extra( ctx.url_chars(tag(")"), true)(Span::new_extra(
"https://cs.wikipedia.org/wiki/Among Us )", "https://cs.wikipedia.org/wiki/Among Us )",
SpanMeta::default() SpanMeta::default(),
)) ))
.unwrap() .unwrap()
.1 .1
@ -1782,7 +1812,7 @@ mod test {
assert_eq!( assert_eq!(
ctx.url_chars(tag(")"), false)(Span::new_extra( ctx.url_chars(tag(")"), false)(Span::new_extra(
"https://en.wikipedia.org/wiki/Among Us )", "https://en.wikipedia.org/wiki/Among Us )",
SpanMeta::default() SpanMeta::default(),
)) ))
.unwrap() .unwrap()
.1 .1
@ -1823,7 +1853,7 @@ mod test {
Token::Sequence(vec![ Token::Sequence(vec![
Token::PlainText("intra".into()), Token::PlainText("intra".into()),
Token::Italic(Box::new(Token::PlainText("word".into()))), Token::Italic(Box::new(Token::PlainText("word".into()))),
Token::PlainText("italic".into()) Token::PlainText("italic".into()),
]) ])
); );
@ -1836,7 +1866,7 @@ mod test {
parse_full(r#"long text with a *footnote <b>text</b>"#), parse_full(r#"long text with a *footnote <b>text</b>"#),
Token::Sequence(vec![ Token::Sequence(vec![
Token::PlainText("long text with a *footnote ".into()), Token::PlainText("long text with a *footnote ".into()),
Token::Bold(Box::new(Token::PlainText("text".into()))) Token::Bold(Box::new(Token::PlainText("text".into()))),
]) ])
); );
@ -1888,7 +1918,7 @@ mod test {
parse_full("~~*hello\nworld*"), parse_full("~~*hello\nworld*"),
Token::Sequence(vec![ Token::Sequence(vec![
Token::PlainText("~~".into()), Token::PlainText("~~".into()),
Token::Italic(Box::new(Token::PlainText("hello\nworld".into()))) Token::Italic(Box::new(Token::PlainText("hello\nworld".into()))),
]) ])
) )
} }
@ -1900,7 +1930,7 @@ mod test {
Token::Sequence(vec![ Token::Sequence(vec![
Token::PlainText("aaa".into()), Token::PlainText("aaa".into()),
Token::Italic(Box::new(Token::PlainText("iii".into()))), Token::Italic(Box::new(Token::PlainText("iii".into()))),
Token::PlainText("bbb".into()) Token::PlainText("bbb".into()),
]) ])
); );
@ -1914,7 +1944,7 @@ mod test {
Token::Sequence(vec![ Token::Sequence(vec![
Token::PlainText("aaa\n".into()), Token::PlainText("aaa\n".into()),
Token::Italic(Box::new(Token::PlainText("iii".into()))), Token::Italic(Box::new(Token::PlainText("iii".into()))),
Token::PlainText("\nbbb".into()) Token::PlainText("\nbbb".into()),
]) ])
); );
@ -1955,6 +1985,16 @@ mod test {
); );
} }
/// Runs `parse_full` on 20000-fold repetitions of short fragments.
///
/// The parse results are discarded; the test has no assertions of its own.
#[test]
fn parse_long() {
    for unit in ["A", "*A", "@A"] {
        let document = unit.repeat(20000);
        parse_full(&document);
    }
}
#[test] #[test]
fn parse_complex() { fn parse_complex() {
assert_eq!( assert_eq!(
@ -2015,7 +2055,7 @@ text</center>"#
Token::PlainText("centered\n".into()), Token::PlainText("centered\n".into()),
Token::UnicodeEmoji("🦋".into()), Token::UnicodeEmoji("🦋".into()),
Token::UnicodeEmoji("🏳️‍⚧️".into()), Token::UnicodeEmoji("🏳️‍⚧️".into()),
Token::PlainText("\ntext".into()) Token::PlainText("\ntext".into()),
]))) ])))
); );
@ -2102,7 +2142,7 @@ text</center>"#
parse_full("IPv4 test: <https://0>"), parse_full("IPv4 test: <https://0>"),
Token::Sequence(vec![ Token::Sequence(vec![
Token::PlainText("IPv4 test: ".into()), Token::PlainText("IPv4 test: ".into()),
Token::UrlNoEmbed("https://0".into()) Token::UrlNoEmbed("https://0".into()),
]) ])
); );
@ -2110,7 +2150,7 @@ text</center>"#
parse_full("IPv4 test: <https://127.0.0.1>"), parse_full("IPv4 test: <https://127.0.0.1>"),
Token::Sequence(vec![ Token::Sequence(vec![
Token::PlainText("IPv4 test: ".into()), Token::PlainText("IPv4 test: ".into()),
Token::UrlNoEmbed("https://127.0.0.1".into()) Token::UrlNoEmbed("https://127.0.0.1".into()),
]) ])
); );
@ -2118,7 +2158,7 @@ text</center>"#
parse_full("IPv6 test: <https://[::2f:1]/nya>"), parse_full("IPv6 test: <https://[::2f:1]/nya>"),
Token::Sequence(vec![ Token::Sequence(vec![
Token::PlainText("IPv6 test: ".into()), Token::PlainText("IPv6 test: ".into()),
Token::UrlNoEmbed("https://[::2f:1]/nya".into()) Token::UrlNoEmbed("https://[::2f:1]/nya".into()),
]) ])
); );
@ -2126,7 +2166,7 @@ text</center>"#
parse_full("IPv6 test: https://[::2f:1]/nya"), parse_full("IPv6 test: https://[::2f:1]/nya"),
Token::Sequence(vec![ Token::Sequence(vec![
Token::PlainText("IPv6 test: ".into()), Token::PlainText("IPv6 test: ".into()),
Token::UrlRaw("https://[::2f:1]/nya".into()) Token::UrlRaw("https://[::2f:1]/nya".into()),
]) ])
); );
@ -2135,7 +2175,7 @@ text</center>"#
parse_full("IDN test: https://www.háčkyčárky.cz/"), parse_full("IDN test: https://www.háčkyčárky.cz/"),
Token::Sequence(vec![ Token::Sequence(vec![
Token::PlainText("IDN test: ".into()), Token::PlainText("IDN test: ".into()),
Token::UrlRaw("https://www.háčkyčárky.cz/".into()) Token::UrlRaw("https://www.háčkyčárky.cz/".into()),
]) ])
); );
@ -2146,8 +2186,8 @@ text</center>"#
Token::Link { Token::Link {
label: Box::new(Token::PlainText("label".into())), label: Box::new(Token::PlainText("label".into())),
href: "https://example.com".into(), href: "https://example.com".into(),
embed: true embed: true,
} },
]) ])
); );
@ -2156,7 +2196,7 @@ text</center>"#
Token::Sequence(vec![ Token::Sequence(vec![
Token::PlainText("test ".into()), Token::PlainText("test ".into()),
Token::Hashtag("hashtag".into()), Token::Hashtag("hashtag".into()),
Token::PlainText(" tail".into()) Token::PlainText(" tail".into()),
]) ])
); );
@ -2175,7 +2215,7 @@ text</center>"#
parse_full("<https://example.com/><https://awawa.gay/>"), parse_full("<https://example.com/><https://awawa.gay/>"),
Token::Sequence(vec![ Token::Sequence(vec![
Token::UrlNoEmbed("https://example.com/".into()), Token::UrlNoEmbed("https://example.com/".into()),
Token::UrlNoEmbed("https://awawa.gay/".into()) Token::UrlNoEmbed("https://awawa.gay/".into()),
]) ])
); );
@ -2186,8 +2226,8 @@ text</center>"#
Token::Link { Token::Link {
label: Box::new(Token::PlainText("label".into())), label: Box::new(Token::PlainText("label".into())),
href: "https://awawa.gay".into(), href: "https://awawa.gay".into(),
embed: false embed: false,
} },
]) ])
); );
@ -2198,9 +2238,9 @@ text</center>"#
Token::Link { Token::Link {
label: Box::new(Token::PlainText("label".into())), label: Box::new(Token::PlainText("label".into())),
href: "https://awawa.gay".into(), href: "https://awawa.gay".into(),
embed: false embed: false,
}, },
Token::PlainText("test".into()) Token::PlainText("test".into()),
]) ])
); );
@ -2211,9 +2251,9 @@ text</center>"#
Token::Link { Token::Link {
label: Box::new(Token::PlainText("label".into())), label: Box::new(Token::PlainText("label".into())),
href: "https://awawa.gay".into(), href: "https://awawa.gay".into(),
embed: false embed: false,
}, },
Token::PlainText(")".into()) Token::PlainText(")".into()),
]) ])
); );
@ -2250,7 +2290,7 @@ text</center>"#
Token::Mention { Token::Mention {
mention_type: crate::MentionType::User, mention_type: crate::MentionType::User,
name: "tag".into(), name: "tag".into(),
host: None host: None,
} }
); );
@ -2266,9 +2306,9 @@ text</center>"#
Token::Mention { Token::Mention {
mention_type: crate::MentionType::User, mention_type: crate::MentionType::User,
name: "tag".into(), name: "tag".into(),
host: None host: None,
}, },
Token::PlainText(" fgahjsdkd".into()) Token::PlainText(" fgahjsdkd".into()),
]) ])
); );
@ -2279,9 +2319,9 @@ text</center>"#
Token::Mention { Token::Mention {
mention_type: crate::MentionType::User, mention_type: crate::MentionType::User,
name: "tag".into(), name: "tag".into(),
host: None host: None,
}, },
Token::PlainText("@ fgahjsdkd".into()) Token::PlainText("@ fgahjsdkd".into()),
]) ])
); );
@ -2292,9 +2332,9 @@ text</center>"#
Token::Mention { Token::Mention {
mention_type: crate::MentionType::User, mention_type: crate::MentionType::User,
name: "tag".into(), name: "tag".into(),
host: Some("domain".into()) host: Some("domain".into()),
}, },
Token::PlainText(" bbbbb".into()) Token::PlainText(" bbbbb".into()),
]) ])
); );
@ -2305,9 +2345,9 @@ text</center>"#
Token::Mention { Token::Mention {
mention_type: crate::MentionType::User, mention_type: crate::MentionType::User,
name: "tag".into(), name: "tag".into(),
host: Some("domain".into()) host: Some("domain".into()),
}, },
Token::PlainText(", test".into()) Token::PlainText(", test".into()),
]) ])
); );
@ -2318,9 +2358,9 @@ text</center>"#
Token::Mention { Token::Mention {
mention_type: crate::MentionType::User, mention_type: crate::MentionType::User,
name: "tag".into(), name: "tag".into(),
host: Some("domain.gay".into()) host: Some("domain.gay".into()),
}, },
Token::PlainText(". test".into()) Token::PlainText(". test".into()),
]) ])
); );
@ -2331,9 +2371,9 @@ text</center>"#
Token::Mention { Token::Mention {
mention_type: crate::MentionType::User, mention_type: crate::MentionType::User,
name: "tag".into(), name: "tag".into(),
host: Some("domain".into()) host: Some("domain".into()),
}, },
Token::PlainText("? test".into()) Token::PlainText("? test".into()),
]) ])
); );
@ -2344,9 +2384,9 @@ text</center>"#
Token::Mention { Token::Mention {
mention_type: crate::MentionType::Community, mention_type: crate::MentionType::Community,
name: "tag".into(), name: "tag".into(),
host: Some("domain.com".into()) host: Some("domain.com".into()),
}, },
Token::PlainText(" test".into()) Token::PlainText(" test".into()),
]) ])
); );
@ -2366,7 +2406,7 @@ text</center>"#
parse_full(":bottom:"), parse_full(":bottom:"),
Token::ShortcodeEmoji { Token::ShortcodeEmoji {
shortcode: "bottom".into(), shortcode: "bottom".into(),
host: None host: None,
} }
); );
@ -2375,12 +2415,12 @@ text</center>"#
Token::Sequence(vec![ Token::Sequence(vec![
Token::ShortcodeEmoji { Token::ShortcodeEmoji {
shortcode: "bottom".into(), shortcode: "bottom".into(),
host: None host: None,
}, },
Token::ShortcodeEmoji { Token::ShortcodeEmoji {
shortcode: "blobfox".into(), shortcode: "blobfox".into(),
host: None host: None,
} },
]) ])
); );
@ -2388,7 +2428,7 @@ text</center>"#
parse_full(":bottom@magnetar.social:"), parse_full(":bottom@magnetar.social:"),
Token::ShortcodeEmoji { Token::ShortcodeEmoji {
shortcode: "bottom".into(), shortcode: "bottom".into(),
host: Some("magnetar.social".into()) host: Some("magnetar.social".into()),
} }
); );
@ -2436,7 +2476,7 @@ text</center>"#
Token::Sequence(vec![ Token::Sequence(vec![
Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag
Token::PlainText("\u{0200c}".into()), // ZWNJ Token::PlainText("\u{0200c}".into()), // ZWNJ
Token::UnicodeEmoji("\u{026a7}\u{0fe0f}".into()) // Trans symbol Token::UnicodeEmoji("\u{026a7}\u{0fe0f}".into()), // Trans symbol
]) ])
); );