From c4fd99fa45990cd68dfa4465c4635e4744a7efd1 Mon Sep 17 00:00:00 2001 From: Natty Date: Sat, 7 Oct 2023 21:22:21 +0200 Subject: [PATCH] Stricter URL parsing --- magnetar_mmm_parser/src/lib.rs | 129 ++++++++++++++++++++------------- 1 file changed, 80 insertions(+), 49 deletions(-) diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 63e55c5..6f1bf94 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -991,7 +991,10 @@ impl Context { let (input, url_span) = recognize(tuple((protocol, url_chars(tag(">"), true))))(input)?; let (input, _) = tag(">")(input)?; - Ok((input, Token::UrlRaw(Cow::from(url_span.into_fragment())))) + Ok(( + input, + Token::UrlNoEmbed(Cow::from(url_span.into_fragment())), + )) } fn link<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> { @@ -1120,7 +1123,12 @@ fn protocol(input: Span) -> IResult<Span, Span> { #[inline] fn url_chars_base(input: Span) -> IResult<Span, Span> { - recognize(alt((alpha1, recognize(one_of(".,_/:%#$&?!~=+-()[]@")))))(input) + recognize(alt(( + alpha1, + recognize(tuple((tag("["), many_till(url_chars_base, tag("]"))))), + recognize(tuple((tag("("), many_till(url_chars_base, tag(")"))))), + recognize(one_of(".,_/:%#$&?!~=+-@")), + )))(input) } #[inline] @@ -1128,26 +1136,10 @@ fn url_chars<'a, T: 'a>( terminator: impl Fn(Span<'a>) -> IResult<Span<'a>, T> + 'a, spaces: bool, ) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'a { - let terminating = move |input| { - tuple(( - &terminator, - alt(( - space1, - line_ending, - eof, - recognize(one_of("([<'\"")), - recognize(tuple(( - alt((alpha1, recognize(one_of("*")))), - alt((space1, line_ending, eof)), - ))), - )), - ))(input) - }; - let chars = tuple(( not(tuple((space1, eof))), not(tuple((space1, tag("\"")))), - not(tuple((opt(space1), terminating))), + not(tuple((opt(space1), terminator))), alt((url_chars_base, if spaces { space1 } else { fail })), )); @@ -1167,49 +1159,48 @@ mod test { #[test] fn parse_url_chars() { - let test1 = 
"https://en.wikipedia.org/wiki/Sandbox_(computer_security))"; assert_eq!( + url_chars(tag(")"), true)(Span::new( + "https://en.wikipedia.org/wiki/Sandbox_(computer_security))" + )) + .unwrap() + .1 + .into_fragment(), + "https://en.wikipedia.org/wiki/Sandbox_(computer_security)" + ); + + assert_eq!( + url_chars(tag(")"), true)(Span::new( + "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))" + )) + .unwrap() + .1 + .into_fragment(), "https://en.wikipedia.org/wiki/Sandbox_(computer_security)", - url_chars(tag(")"), true)(Span::new(test1)) - .unwrap() - .1 - .into_fragment() ); - let test2 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))"; assert_eq!( - "https://en.wikipedia.org/wiki/Sandbox_(computer_security))", - url_chars(tag(")"), true)(Span::new(test2)) + url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among_Us ")) .unwrap() .1 - .into_fragment() - ); - - let test3 = "https://en.wikipedia.org/wiki/("; - assert_eq!( - test3, - url_chars(tag(")"), true)(Span::new(test3)) - .unwrap() - .1 - .into_fragment() - ); - - let test4 = "https://cs.wikipedia.org/wiki/Among_Us "; - assert_eq!( + .into_fragment(), "https://cs.wikipedia.org/wiki/Among_Us", - url_chars(tag(")"), true)(Span::new(test4)) - .unwrap() - .1 - .into_fragment() ); - let test5 = "https://cs.wikipedia.org/wiki/Among Us )"; assert_eq!( - "https://cs.wikipedia.org/wiki/Among Us", - url_chars(tag(")"), true)(Span::new(test5)) + url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among Us )")) .unwrap() .1 - .into_fragment() + .into_fragment(), + "https://cs.wikipedia.org/wiki/Among Us" + ); + + assert_eq!( + url_chars(tag(")"), false)(Span::new("https://en.wikipedia.org/wiki/Among Us )")) + .unwrap() + .1 + .into_fragment(), + "https://en.wikipedia.org/wiki/Among" ); } @@ -1381,6 +1372,20 @@ text"# ]) ); + assert_eq!( + parse_full("<https://example.com>"), + Token::UrlNoEmbed("https://example.com".into()) + ); + + // Adjacent links okay + assert_eq!( + parse_full("<https://example.com/><https://awawa.gay/>"), + 
Token::Sequence(vec![ + Token::UrlNoEmbed("https://example.com/".into()), + Token::UrlNoEmbed("https://awawa.gay/".into()) + ]) + ); + assert_eq!( parse_full("Link test: ?[label](https://awawa.gay)"), Token::Sequence(vec![ @@ -1393,6 +1398,32 @@ text"# ]) ); + assert_eq!( + parse_full("Link test: ?[label](https://awawa.gay)test"), + Token::Sequence(vec![ + Token::PlainText("Link test: ".into()), + Token::Link { + label: Box::new(Token::PlainText("label".into())), + href: "https://awawa.gay".into(), + embed: false + }, + Token::PlainText("test".into()) + ]) + ); + + assert_eq!( + parse_full("Link test: (?[label](https://awawa.gay))"), + Token::Sequence(vec![ + Token::PlainText("Link test: (".into()), + Token::Link { + label: Box::new(Token::PlainText("label".into())), + href: "https://awawa.gay".into(), + embed: false + }, + Token::PlainText(")".into()) + ]) + ); + assert_eq!( parse_full("Link test: ?[label](https://awawa.gay"), // Missing closing bracket Token::Sequence(vec![