diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs
index 19a9235..8f2563f 100644
--- a/magnetar_mmm_parser/src/lib.rs
+++ b/magnetar_mmm_parser/src/lib.rs
@@ -1,5 +1,11 @@
+use std::collections::HashMap;
+use std::convert::{identity, Infallible};
+use std::io::{Cursor, Write};
+use std::marker::PhantomData;
+
 use compact_str::{CompactString, ToCompactString};
 use either::Either;
+use nom::{IResult, Offset, Parser, Slice};
 use nom::branch::alt;
 use nom::bytes::complete::{tag, tag_no_case};
 use nom::character::complete::{
@@ -10,14 +16,9 @@ use nom::combinator::{eof, fail, map, not, opt, peek, recognize};
 use nom::error::ErrorKind;
 use nom::multi::{many0_count, many1, many1_count, many_till, separated_list1};
 use nom::sequence::tuple;
-use nom::{IResult, Offset, Parser, Slice};
 use nom_locate::LocatedSpan;
 use quick_xml::events::{BytesText, Event};
 use serde::{Deserialize, Serialize};
-use std::collections::HashMap;
-use std::convert::{identity, Infallible};
-use std::io::{Cursor, Write};
-use std::marker::PhantomData;
 use strum::IntoStaticStr;
 use tracing::trace;
 use unicode_segmentation::UnicodeSegmentation;
@@ -436,6 +437,20 @@ pub fn to_xml_string(token: &Token) -> quick_xml::Result<String> {
     Ok(String::from_utf8(writer.into_inner().into_inner())?)
 }
 
+pub fn janky_is_line_begin(input: Span<'_>) -> bool {
+    let offset = input.location_offset();
+
+    // VERY BAD
+    // Safety: This is very janky, but hopefully will work as long as nom-locate keeps the invariant of fragments being subslices of the input
+    // We do this to avoid scanning the entire input for a line separator when we just need the previous byte
+    offset == 0 || unsafe {
+        let frag_bytes = input.fragment().as_bytes();
+        let frag_ptr = frag_bytes.as_ptr();
+        let prev_byte = frag_ptr.offset(-1);
+        matches!(*prev_byte, b'\n')
+    }
+}
+
 #[derive(Debug, Default, Copy, Clone)]
 pub struct SpanMeta {
     depth: usize,
@@ -453,8 +468,8 @@ trait SliceOffset {
     fn up_to(&self, other: &Self) -> Self;
 
     fn fragment_between<'a>(&self, other: &Self) -> &'a str
-    where
-        Self: 'a;
+        where
+            Self: 'a;
 }
 
 impl SliceOffset for Span<'_> {
@@ -463,8 +478,8 @@ fn up_to(&self, other: &Self) -> Self {
     }
 
     fn fragment_between<'a>(&self, other: &Self) -> &'a str
-    where
-        Self: 'a,
+        where
+            Self: 'a,
     {
         self.up_to(other).into_fragment()
     }
@@ -479,14 +494,14 @@ fn boxing_token(func: impl Fn(Box<Token>) -> Token) -> impl Fn(Token) -> Token {
 fn collect_sequence(
     func: impl Fn(Vec<Token>) -> Token,
     transform: impl Fn(Token) -> Token,
-) -> impl Fn(&mut dyn Iterator<Item = Token>) -> Token {
+) -> impl Fn(&mut dyn Iterator<Item=Token>) -> Token {
     move |tokens| transform(func(tokens.collect()))
 }
 
 #[inline]
 fn collect_char_sequence(
     func: impl Fn(String) -> Token,
-) -> impl Fn(&mut dyn Iterator<Item = char>) -> Token {
+) -> impl Fn(&mut dyn Iterator<Item=char>) -> Token {
     move |chars| func(chars.collect())
 }
 
@@ -500,7 +515,15 @@ fn space1_unicode(input: Span) -> IResult<Span, Span> {
 
 #[inline]
 fn alphanumeric1_unicode(input: Span) -> IResult<Span, Span> {
-    recognize(many1_count(satisfy(char::is_alphanumeric)))(input)
+    recognize(many1_count(char_alphanumeric_unicode))(input)
+}
+
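+/// Matches a single Unicode alphanumeric character.
+/// Factored out of `alphanumeric1_unicode` so that parsers which only need to
+/// look at one character at a time can share the same predicate.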
+#[inline]
+fn char_alphanumeric_unicode(input: Span) -> IResult<Span, char> {
+    satisfy(char::is_alphanumeric)(input)
 }
 
 fn spliced<'a>(
@@ -583,7 +603,7 @@ fn space(input: Span) -> IResult<Span, Token> {
 #[derive(Copy, Clone)]
 struct Matcher<'a, 'b, T: Clone> {
     matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a),
-    collector: &'a (dyn Fn(&mut dyn Iterator<Item = Token>) -> Token + 'a),
+    collector: &'a (dyn Fn(&mut dyn Iterator<Item=Token>) -> Token + 'a),
     _phantom_closure: PhantomData<&'a ()>,
     _phantom_data: PhantomData<&'b ()>,
     _phantom_output: PhantomData<fn() -> T>,
@@ -592,7 +612,7 @@ struct Matcher<'a, 'b, T: Clone> {
 impl<'a, 'b, T: Clone> Matcher<'a, 'b, T> {
     fn new(
         matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a),
-        collector: &'a (dyn Fn(&mut dyn Iterator<Item = Token>) -> Token + 'a),
+        collector: &'a (dyn Fn(&mut dyn Iterator<Item=Token>) -> Token + 'a),
     ) -> Self {
         Self {
             matcher_inner,
@@ -631,7 +651,7 @@ struct FlankingDelim<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>>(
 );
 
 impl<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>> From<(T, FlankingRule)>
-    for FlankingDelim<'a, T>
+for FlankingDelim<'a, T>
 {
     fn from((func, rule): (T, FlankingRule)) -> Self {
         FlankingDelim(func, rule, PhantomData)
@@ -750,7 +770,7 @@ impl Context {
                 )),
                 eof,
             )
-            .map(|v| v.0),
+                .map(|v| v.0),
             Token::Sequence,
         )(input)
     }
@@ -765,7 +785,7 @@ impl Context {
                 )),
                 eof,
             )
-            .map(|v| v.0),
+                .map(|v| v.0),
             Token::Sequence,
         )(input)
     }
@@ -877,7 +897,7 @@ impl Context {
         let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?;
 
         if let (None, None) = leading_spaces {
-            if input.get_column() != 1 {
+            if !janky_is_line_begin(input) {
                 return fail(input);
             }
         }
@@ -894,10 +914,10 @@ impl Context {
         if quote_lines.len() == 1
             && quote_lines
-                .iter()
-                .map(Span::fragment)
-                .copied()
-                .any(&str::is_empty)
+            .iter()
+            .map(Span::fragment)
+            .copied()
+            .any(&str::is_empty)
         {
             return fail(input);
         }
 
@@ -915,7 +935,7 @@ impl Context {
 
         let (input, _) = opt(line_ending)(input)?;
 
-        if input.get_column() != 1 {
+        if !janky_is_line_begin(input) {
             return fail(input);
         }
 
@@ -938,7 +958,7 @@ impl Context {
 
         let (input, _) = opt(line_ending)(input)?;
 
-        if input.get_column() != 1 {
+        if !janky_is_line_begin(input) {
             return fail(input);
         }
 
@@ -980,7 +1000,7 @@ impl Context {
 
         let (input, _) = opt(line_ending)(input)?;
 
-        if input.get_column() != 1 {
+        if !janky_is_line_begin(input) {
             return fail(input);
         }
 
@@ -1014,9 +1034,9 @@ impl Context {
         matcher: Matcher<'a, 'b, T>,
         fallback: Matcher<'a, 'b, S>,
     ) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token> + '_
-    where
-        FOpen: Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
-        FClose: Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
+        where
+            FOpen: Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
+            FClose: Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
     {
         let FlankingDelim(opening_tag, opening_rule, ..) = opening_tag.into();
         let FlankingDelim(closing_tag, closing_rule, ..) = closing_tag.into();
@@ -1024,7 +1044,7 @@ impl Context {
         move |input| {
             if let FlankingRule::Strict = opening_rule {
                 let (input, pre) = opt(recognize(tuple((
-                    alphanumeric1_unicode,
+                    char_alphanumeric_unicode,
                     opt(tag("\\")),
                     &opening_tag,
                     peek(not(alt((recognize(satisfy(|c| c.is_whitespace())), eof)))),
@@ -1072,7 +1092,7 @@ impl Context {
                     input,
                     Token::Sequence(vec![
                         Token::PlainText(begin.fragment_between(&post_open).into()),
-                        ((fallback.collector)(&mut inner)),
+                        (fallback.collector)(&mut inner),
                         Token::PlainText(closing.into_fragment().into()),
                     ]),
                 ));
@@ -1355,7 +1375,7 @@ impl Context {
             Matcher::new(
                 &move |input| {
                     map(
-                        tuple(((not(line_ending)), self.partial(Self::inline_single))),
+                        tuple((not(line_ending), self.partial(Self::inline_single))),
                         |(_, captured)| captured,
                     )(input)
                 },
@@ -1365,7 +1385,7 @@ impl Context {
                 &move |input| {
                     map(
                         tuple((
-                            (not(line_ending)),
+                            not(line_ending),
                             self.partial(Self::inline_non_formatting_single),
                         )),
                         |(_, captured)| captured,
@@ -1492,18 +1512,9 @@ impl Context {
         ))
     }
 
-    fn shortcode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
-        if let (plain_out, Some(plain)) = map(
-            opt(recognize(tuple((
-                alphanumeric1_unicode,
-                self.partial(Self::shortcode_emoji),
-            )))),
-            |o| o.map(Span::into_fragment),
-        )(input)?
-        {
-            return Ok((plain_out, Token::PlainText(plain.into())));
-        }
-
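+    /// The `:shortcode:` parser proper. `shortcode_emoji` wraps this with a
+    /// plain-text fast path for shortcodes preceded by an alphanumeric character.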
+    fn shortcode_emoji_inner<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
         let (input, _) = tag(":")(input)?;
         let (input, shortcode) = map(
             recognize(many1(alt((
@@ -1534,11 +1543,11 @@ impl Context {
         ))
     }
 
-    fn tag_mention<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
+    fn shortcode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
         if let (plain_out, Some(plain)) = map(
             opt(recognize(tuple((
-                alt((tag("\\"), alphanumeric1_unicode)),
-                self.partial(Self::tag_mention),
+                char_alphanumeric_unicode,
+                self.partial(Self::shortcode_emoji_inner),
             )))),
             |o| o.map(Span::into_fragment),
         )(input)?
@@ -1546,6 +1555,10 @@ impl Context {
             return Ok((plain_out, Token::PlainText(plain.into())));
         }
 
+        self.shortcode_emoji_inner(input)
+    }
+
+    fn tag_mention_inner<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
         let tags = one_of("@!");
         let (input, mention_type) = map(tags, |c| match c {
             '@' => MentionType::User,
@@ -1591,9 +1604,26 @@ impl Context {
         ))
     }
 
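+    /// A mention directly preceded by an alphanumeric character or a backslash
+    /// escape is not a mention; consume it and emit it as plain text instead.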
+    fn tag_mention<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
+        if let (plain_out, Some(plain)) = map(
+            opt(recognize(tuple((
+                alt((tag("\\"), recognize(char_alphanumeric_unicode))),
+                self.partial(Self::tag_mention_inner),
+            )))),
+            |o| o.map(Span::into_fragment),
+        )(input)?
+        {
+            return Ok((plain_out, Token::PlainText(plain.into())));
+        }
+
+        self.tag_mention_inner(input)
+    }
+
     fn tag_hashtag<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
         let (input, maybe_preceded) =
-            opt(recognize(tuple((alphanumeric1_unicode, tag("#")))))(input)?;
+            opt(recognize(tuple((char_alphanumeric_unicode, tag("#")))))(input)?;
 
         if let Some(preceded) = maybe_preceded {
             return Ok((input, Token::PlainText(preceded.into_fragment().into())));
@@ -1614,8 +1642,8 @@ impl Context {
         &'b self,
         mut func: F,
     ) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, O> + 'b
-    where
-        F: Parser<Span<'a>, O, nom::error::Error<Span<'a>>> + 'b,
+        where
+            F: Parser<Span<'a>, O, nom::error::Error<Span<'a>>> + 'b,
     {
         move |mut input| {
             if input.extra.depth >= self.depth_limit {
@@ -1695,8 +1723,8 @@ impl Context {
         mut terminator: F,
         spaces: bool,
     ) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'b
-    where
-        F: Parser<Span<'a>, Span<'a>, nom::error::Error<Span<'a>>> + 'b,
+        where
+            F: Parser<Span<'a>, Span<'a>, nom::error::Error<Span<'a>>> + 'b,
     {
         move |input| {
             recognize(many1_count(tuple((
@@ -1714,10 +1742,12 @@ impl Context {
 
 #[cfg(test)]
 mod test {
-    use crate::{to_xml_string, Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT};
-    use nom::bytes::complete::tag;
     use std::collections::HashMap;
 
+    use nom::bytes::complete::tag;
+
+    use crate::{Context, DEFAULT_DEPTH_LIMIT, Span, SpanMeta, to_xml_string, Token};
+
     fn parse_full(string: &str) -> Token {
         Context::default()
             .full(Span::new_extra(string, SpanMeta::default()))
@@ -1738,11 +1768,11 @@ mod test {
         assert_eq!(
             ctx.url_chars(tag(")"), true)(Span::new_extra(
                 "https://en.wikipedia.org/wiki/Sandbox_(computer_security))",
-                SpanMeta::default()
+                SpanMeta::default(),
             ))
-            .unwrap()
-            .1
-            .into_fragment(),
+                .unwrap()
+                .1
+                .into_fragment(),
             "https://en.wikipedia.org/wiki/Sandbox_(computer_security)"
         );
 
@@ -1771,22 +1801,22 @@ mod test {
         assert_eq!(
             ctx.url_chars(tag(")"), true)(Span::new_extra(
                 "https://cs.wikipedia.org/wiki/Among Us )",
-                SpanMeta::default()
+                SpanMeta::default(),
             ))
-            .unwrap()
-            .1
-            .into_fragment(),
+                .unwrap()
+                .1
+                .into_fragment(),
             "https://cs.wikipedia.org/wiki/Among Us"
         );
 
         assert_eq!(
             ctx.url_chars(tag(")"), false)(Span::new_extra(
                 "https://en.wikipedia.org/wiki/Among Us )",
-                SpanMeta::default()
+                SpanMeta::default(),
             ))
-            .unwrap()
-            .1
-            .into_fragment(),
+                .unwrap()
+                .1
+                .into_fragment(),
             "https://en.wikipedia.org/wiki/Among"
         );
     }
@@ -1823,7 +1853,7 @@ mod test {
             Token::Sequence(vec![
                 Token::PlainText("intra".into()),
                 Token::Italic(Box::new(Token::PlainText("word".into()))),
-                Token::PlainText("italic".into())
+                Token::PlainText("italic".into()),
             ])
         );
 
@@ -1836,7 +1866,7 @@ mod test {
             parse_full(r#"long text with a *footnote text"#),
             Token::Sequence(vec![
                 Token::PlainText("long text with a *footnote ".into()),
-                Token::Bold(Box::new(Token::PlainText("text".into())))
+                Token::Bold(Box::new(Token::PlainText("text".into()))),
             ])
        );
 
@@ -1888,7 +1918,7 @@ mod test {
             parse_full("~~*hello\nworld*"),
             Token::Sequence(vec![
                 Token::PlainText("~~".into()),
-                Token::Italic(Box::new(Token::PlainText("hello\nworld".into())))
+                Token::Italic(Box::new(Token::PlainText("hello\nworld".into()))),
             ])
         )
     }
@@ -1900,7 +1930,7 @@ mod test {
             Token::Sequence(vec![
                 Token::PlainText("aaa".into()),
                 Token::Italic(Box::new(Token::PlainText("iii".into()))),
-                Token::PlainText("bbb".into())
+                Token::PlainText("bbb".into()),
             ])
         );
 
@@ -1914,7 +1944,7 @@ mod test {
             Token::Sequence(vec![
                 Token::PlainText("aaa\n".into()),
                 Token::Italic(Box::new(Token::PlainText("iii".into()))),
-                Token::PlainText("\nbbb".into())
+                Token::PlainText("\nbbb".into()),
             ])
         );
 
@@ -1955,6 +1985,18 @@ mod test {
         );
     }
 
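+    // Smoke test: long, repetitive inputs should parse to completion without
+    // exhausting the stack.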
Token::PlainText("\nbbb".into()), ]) ); @@ -1955,6 +1985,16 @@ mod test { ); } + #[test] + fn parse_long() { + parse_full(&"A".repeat(20000)); + + + parse_full(&"*A".repeat(20000)); + + parse_full(&"@A".repeat(20000)); + } + #[test] fn parse_complex() { assert_eq!( @@ -2015,7 +2055,7 @@ text"# Token::PlainText("centered\n".into()), Token::UnicodeEmoji("🦋".into()), Token::UnicodeEmoji("🏳️‍⚧️".into()), - Token::PlainText("\ntext".into()) + Token::PlainText("\ntext".into()), ]))) ); @@ -2102,7 +2142,7 @@ text"# parse_full("IPv4 test: "), Token::Sequence(vec![ Token::PlainText("IPv4 test: ".into()), - Token::UrlNoEmbed("https://0".into()) + Token::UrlNoEmbed("https://0".into()), ]) ); @@ -2110,7 +2150,7 @@ text"# parse_full("IPv4 test: "), Token::Sequence(vec![ Token::PlainText("IPv4 test: ".into()), - Token::UrlNoEmbed("https://127.0.0.1".into()) + Token::UrlNoEmbed("https://127.0.0.1".into()), ]) ); @@ -2118,7 +2158,7 @@ text"# parse_full("IPv6 test: "), Token::Sequence(vec![ Token::PlainText("IPv6 test: ".into()), - Token::UrlNoEmbed("https://[::2f:1]/nya".into()) + Token::UrlNoEmbed("https://[::2f:1]/nya".into()), ]) ); @@ -2126,7 +2166,7 @@ text"# parse_full("IPv6 test: https://[::2f:1]/nya"), Token::Sequence(vec![ Token::PlainText("IPv6 test: ".into()), - Token::UrlRaw("https://[::2f:1]/nya".into()) + Token::UrlRaw("https://[::2f:1]/nya".into()), ]) ); @@ -2135,7 +2175,7 @@ text"# parse_full("IDN test: https://www.háčkyčárky.cz/"), Token::Sequence(vec![ Token::PlainText("IDN test: ".into()), - Token::UrlRaw("https://www.háčkyčárky.cz/".into()) + Token::UrlRaw("https://www.háčkyčárky.cz/".into()), ]) ); @@ -2146,8 +2186,8 @@ text"# Token::Link { label: Box::new(Token::PlainText("label".into())), href: "https://example.com".into(), - embed: true - } + embed: true, + }, ]) ); @@ -2156,7 +2196,7 @@ text"# Token::Sequence(vec![ Token::PlainText("test ".into()), Token::Hashtag("hashtag".into()), - Token::PlainText(" tail".into()) + Token::PlainText(" tail".into()), ]) ); @@ -2175,7 +2215,7 @@ text"# parse_full(""), Token::Sequence(vec![ Token::UrlNoEmbed("https://example.com/".into()), - Token::UrlNoEmbed("https://awawa.gay/".into()) + Token::UrlNoEmbed("https://awawa.gay/".into()), ]) ); @@ -2186,8 +2226,8 @@ text"# Token::Link { label: Box::new(Token::PlainText("label".into())), href: "https://awawa.gay".into(), - embed: false - } + embed: false, + }, ]) ); @@ -2198,9 +2238,9 @@ text"# Token::Link { label: Box::new(Token::PlainText("label".into())), href: "https://awawa.gay".into(), - embed: false + embed: false, }, - Token::PlainText("test".into()) + Token::PlainText("test".into()), ]) ); @@ -2211,9 +2251,9 @@ text"# Token::Link { label: Box::new(Token::PlainText("label".into())), href: "https://awawa.gay".into(), - embed: false + embed: false, }, - Token::PlainText(")".into()) + Token::PlainText(")".into()), ]) ); @@ -2250,7 +2290,7 @@ text"# Token::Mention { mention_type: crate::MentionType::User, name: "tag".into(), - host: None + host: None, } ); @@ -2266,9 +2306,9 @@ text"# Token::Mention { mention_type: crate::MentionType::User, name: "tag".into(), - host: None + host: None, }, - Token::PlainText(" fgahjsdkd".into()) + Token::PlainText(" fgahjsdkd".into()), ]) ); @@ -2279,9 +2319,9 @@ text"# Token::Mention { mention_type: crate::MentionType::User, name: "tag".into(), - host: None + host: None, }, - Token::PlainText("@ fgahjsdkd".into()) + Token::PlainText("@ fgahjsdkd".into()), ]) ); @@ -2292,9 +2332,9 @@ text"# Token::Mention { mention_type: crate::MentionType::User, name: 
"tag".into(), - host: Some("domain".into()) + host: Some("domain".into()), }, - Token::PlainText(" bbbbb".into()) + Token::PlainText(" bbbbb".into()), ]) ); @@ -2305,9 +2345,9 @@ text"# Token::Mention { mention_type: crate::MentionType::User, name: "tag".into(), - host: Some("domain".into()) + host: Some("domain".into()), }, - Token::PlainText(", test".into()) + Token::PlainText(", test".into()), ]) ); @@ -2318,9 +2358,9 @@ text"# Token::Mention { mention_type: crate::MentionType::User, name: "tag".into(), - host: Some("domain.gay".into()) + host: Some("domain.gay".into()), }, - Token::PlainText(". test".into()) + Token::PlainText(". test".into()), ]) ); @@ -2331,9 +2371,9 @@ text"# Token::Mention { mention_type: crate::MentionType::User, name: "tag".into(), - host: Some("domain".into()) + host: Some("domain".into()), }, - Token::PlainText("? test".into()) + Token::PlainText("? test".into()), ]) ); @@ -2344,9 +2384,9 @@ text"# Token::Mention { mention_type: crate::MentionType::Community, name: "tag".into(), - host: Some("domain.com".into()) + host: Some("domain.com".into()), }, - Token::PlainText(" test".into()) + Token::PlainText(" test".into()), ]) ); @@ -2366,7 +2406,7 @@ text"# parse_full(":bottom:"), Token::ShortcodeEmoji { shortcode: "bottom".into(), - host: None + host: None, } ); @@ -2375,12 +2415,12 @@ text"# Token::Sequence(vec![ Token::ShortcodeEmoji { shortcode: "bottom".into(), - host: None + host: None, }, Token::ShortcodeEmoji { shortcode: "blobfox".into(), - host: None - } + host: None, + }, ]) ); @@ -2388,7 +2428,7 @@ text"# parse_full(":bottom@magnetar.social:"), Token::ShortcodeEmoji { shortcode: "bottom".into(), - host: Some("magnetar.social".into()) + host: Some("magnetar.social".into()), } ); @@ -2436,7 +2476,7 @@ text"# Token::Sequence(vec![ Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag Token::PlainText("\u{0200c}".into()), // ZWNJ - Token::UnicodeEmoji("\u{026a7}\u{0fe0f}".into()) // Trans symbol + Token::UnicodeEmoji("\u{026a7}\u{0fe0f}".into()), // Trans symbol ]) ); @@ -2460,7 +2500,7 @@ text"# &to_xml_string(&parse_full( "@natty $[spin.speed=0.5s 🥺]:cat_attack: test" )) - .unwrap(), + .unwrap(), r#" 🥺cat_attack test"# ); @@ -2468,7 +2508,7 @@ text"# &to_xml_string(&parse_full( "Ring Galaxy AM 0644 741 from Hubble\nCredits: AURA, STScI, J. Higdon, Cornell, ESA, #NASA\n#nature #space #astrophotography" )) - .unwrap(), + .unwrap(), r#"Ring Galaxy AM 0644 741 from Hubble Credits: AURA, STScI, J. Higdon, Cornell, ESA, NASA nature space astrophotography"# @@ -2481,7 +2521,7 @@ Credits: AURA, STScI, J. Higdon, Cornell, ESA, NASA var x = undefined; ``` "# )) - .unwrap(), + .unwrap(), "var x = undefined;" ); }