Stricter URL parsing

This commit is contained in:
Natty 2023-10-07 21:22:21 +02:00
parent d2bc679740
commit c4fd99fa45
Signed by: natty
GPG Key ID: BF6CB659ADEE60EC
1 changed file with 80 additions and 49 deletions

View File

@ -991,7 +991,10 @@ impl Context {
let (input, url_span) = recognize(tuple((protocol, url_chars(tag(">"), true))))(input)?;
let (input, _) = tag(">")(input)?;
Ok((input, Token::UrlRaw(Cow::from(url_span.into_fragment()))))
Ok((
input,
Token::UrlNoEmbed(Cow::from(url_span.into_fragment())),
))
}
fn link<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
@ -1120,7 +1123,12 @@ fn protocol(input: Span) -> IResult<Span, Span> {
/// Recognizes a single unit of URL text and yields the consumed span
/// (every alternative is wrapped in `recognize`).
///
/// NOTE(review): two bodies appear back to back below, a one-line form and a
/// multi-line form. This looks like the pre- and post-change versions of the
/// function merged from a diff; confirm which one is live before relying on
/// either description.
#[inline]
fn url_chars_base(input: Span) -> IResult<Span, Span> {
// One-line form: an `alpha1` run, or any one character from the set, which
// here includes `(`, `)`, `[` and `]` as free-standing characters.
recognize(alt((alpha1, recognize(one_of(".,_/:%#$&?!~=+-()[]@")))))(input)
// Multi-line form: brackets and parentheses are no longer in the one-character
// set. They only match as a group that opens with `[` / `(` and then repeats
// `url_chars_base` (recursively) until the matching `]` / `)` is found.
recognize(alt((
alpha1,
recognize(tuple((tag("["), many_till(url_chars_base, tag("]"))))),
recognize(tuple((tag("("), many_till(url_chars_base, tag(")"))))),
recognize(one_of(".,_/:%#$&?!~=+-@")),
)))(input)
}
#[inline]
@ -1128,26 +1136,10 @@ fn url_chars<'a, T: 'a>(
terminator: impl Fn(Span<'a>) -> IResult<Span<'a>, T> + 'a,
spaces: bool,
) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'a {
let terminating = move |input| {
tuple((
&terminator,
alt((
space1,
line_ending,
eof,
recognize(one_of("([<'\"")),
recognize(tuple((
alt((alpha1, recognize(one_of("*")))),
alt((space1, line_ending, eof)),
))),
)),
))(input)
};
let chars = tuple((
not(tuple((space1, eof))),
not(tuple((space1, tag("\"")))),
not(tuple((opt(space1), terminating))),
not(tuple((opt(space1), terminator))),
alt((url_chars_base, if spaces { space1 } else { fail })),
));
@ -1167,49 +1159,48 @@ mod test {
// Exercises `url_chars` with a `)` terminator against URLs that contain
// parenthesized path segments, trailing `)` characters, an unclosed `(`,
// and embedded or trailing spaces.
//
// NOTE(review): the assertions below are not self-consistent. Argument order
// flips between (actual, expected) and (expected, actual), and some
// `assert_eq!(` openings are followed directly by a stray `.into_fragment(),`
// line. This looks like old and new versions of the test interleaved from a
// diff; confirm against the real file before editing.
#[test]
fn parse_url_chars() {
// One extra `)` after the parenthesized segment.
let test1 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security))";
assert_eq!(
url_chars(tag(")"), true)(Span::new(
"https://en.wikipedia.org/wiki/Sandbox_(computer_security))"
))
.unwrap()
.1
.into_fragment(),
"https://en.wikipedia.org/wiki/Sandbox_(computer_security)"
);
assert_eq!(
url_chars(tag(")"), true)(Span::new(
"https://en.wikipedia.org/wiki/Sandbox_(computer_security)))"
))
.unwrap()
.1
.into_fragment(),
"https://en.wikipedia.org/wiki/Sandbox_(computer_security)",
url_chars(tag(")"), true)(Span::new(test1))
.unwrap()
.1
.into_fragment()
);
// Two extra `)` after the parenthesized segment.
let test2 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))";
assert_eq!(
"https://en.wikipedia.org/wiki/Sandbox_(computer_security))",
url_chars(tag(")"), true)(Span::new(test2))
url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among_Us "))
.unwrap()
.1
.into_fragment()
);
// Unclosed `(` at the end: the whole input is the expected match.
let test3 = "https://en.wikipedia.org/wiki/(";
assert_eq!(
test3,
url_chars(tag(")"), true)(Span::new(test3))
.unwrap()
.1
.into_fragment()
);
// Trailing space: the expected match excludes it.
let test4 = "https://cs.wikipedia.org/wiki/Among_Us ";
assert_eq!(
.into_fragment(),
"https://cs.wikipedia.org/wiki/Among_Us",
url_chars(tag(")"), true)(Span::new(test4))
.unwrap()
.1
.into_fragment()
);
// `spaces` is true: the expected match keeps the inner space but drops the
// trailing " )".
let test5 = "https://cs.wikipedia.org/wiki/Among Us )";
assert_eq!(
"https://cs.wikipedia.org/wiki/Among Us",
url_chars(tag(")"), true)(Span::new(test5))
url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among Us )"))
.unwrap()
.1
.into_fragment()
.into_fragment(),
"https://cs.wikipedia.org/wiki/Among Us"
);
// `spaces` is false: the expected match stops at the first space.
assert_eq!(
url_chars(tag(")"), false)(Span::new("https://en.wikipedia.org/wiki/Among Us )"))
.unwrap()
.1
.into_fragment(),
"https://en.wikipedia.org/wiki/Among"
);
}
@ -1381,6 +1372,20 @@ text</center>"#
])
);
assert_eq!(
parse_full("<https://example.com>"),
Token::UrlNoEmbed("https://example.com".into())
);
// Adjacent links okay
assert_eq!(
parse_full("<https://example.com/><https://awawa.gay/>"),
Token::Sequence(vec![
Token::UrlNoEmbed("https://example.com/".into()),
Token::UrlNoEmbed("https://awawa.gay/".into())
])
);
assert_eq!(
parse_full("Link test: ?[label](https://awawa.gay)"),
Token::Sequence(vec![
@ -1393,6 +1398,32 @@ text</center>"#
])
);
assert_eq!(
parse_full("Link test: ?[label](https://awawa.gay)test"),
Token::Sequence(vec![
Token::PlainText("Link test: ".into()),
Token::Link {
label: Box::new(Token::PlainText("label".into())),
href: "https://awawa.gay".into(),
embed: false
},
Token::PlainText("test".into())
])
);
assert_eq!(
parse_full("Link test: (?[label](https://awawa.gay))"),
Token::Sequence(vec![
Token::PlainText("Link test: (".into()),
Token::Link {
label: Box::new(Token::PlainText("label".into())),
href: "https://awawa.gay".into(),
embed: false
},
Token::PlainText(")".into())
])
);
assert_eq!(
parse_full("Link test: ?[label](https://awawa.gay"), // Missing closing bracket
Token::Sequence(vec![