From 9b26691ff41e3418041691b2025bcca847390159 Mon Sep 17 00:00:00 2001
From: Natty
Date: Thu, 5 Oct 2023 19:09:26 +0200
Subject: [PATCH] Implemented URL parsing

---
Review notes (this section is not part of the commit message):
 - url: a trailing `.`, `,` or `?` stripped from the token is no longer
   dropped from the input; parsing resumes right after the kept part.
 - url_no_embed: produces Token::UrlNoEmbed instead of Token::UrlRaw.
 - link: the label no longer swallows `](...)` (greedy not_line_ending),
   and the URL terminator is `)` rather than `]`.
 - url_chars_base: digits are accepted inside URLs.
 - Doc comments added to the new items; one digit regression test added.
 - The post-image blob id on the `index` line predates these changes.

 magnetar_mmm_parser/src/lib.rs | 220 +++++++++++++++++++++++++++++++--
 1 file changed, 211 insertions(+), 9 deletions(-)

diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs
index 48a9e17..62d7116 100644
--- a/magnetar_mmm_parser/src/lib.rs
+++ b/magnetar_mmm_parser/src/lib.rs
@@ -1,10 +1,10 @@
 use nom::branch::alt;
 use nom::bytes::complete::tag;
-use nom::character::complete;
 use nom::character::complete::{
-    alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, tab,
+    alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of, space1,
+    tab,
 };
-use nom::combinator::{fail, not, opt, recognize};
+use nom::combinator::{eof, fail, not, opt, recognize};
 use nom::error::ErrorKind;
 use nom::multi::{many0, many0_count, many1, many1_count, separated_list1};
 use nom::sequence::tuple;
@@ -28,6 +28,16 @@ pub enum Token<'a> {
     PlainTag(Cow<'a, str>),
     InlineCode(Cow<'a, str>),
     InlineMath(Cow<'a, str>),
+    /// A bare `http://` or `https://` URL found in running text.
+    UrlRaw(Cow<'a, str>),
+    /// A URL that was written inside angle brackets, `<https://...>`.
+    UrlNoEmbed(Cow<'a, str>),
+    /// A `[label](href)` link; `embed` is `false` when the link was prefixed with `?`.
+    Link {
+        label: Cow<'a, str>,
+        href: Cow<'a, str>,
+        embed: bool,
+    },
     BlockCode {
         lang: Option<Cow<'a, str>>,
         inner: Cow<'a, str>,
@@ -56,6 +66,13 @@ impl Token<'_> {
             Token::PlainTag(tag) => Token::PlainTag(Cow::Owned(tag.clone().into_owned())),
             Token::InlineCode(code) => Token::InlineCode(Cow::Owned(code.clone().into_owned())),
             Token::InlineMath(math) => Token::InlineMath(Cow::Owned(math.clone().into_owned())),
+            Token::UrlRaw(url) => Token::UrlRaw(Cow::Owned(url.clone().into_owned())),
+            Token::UrlNoEmbed(url) => Token::UrlNoEmbed(Cow::Owned(url.clone().into_owned())),
+            Token::Link { embed, label, href } => Token::Link {
+                embed: *embed,
+                label: Cow::Owned(label.clone().into_owned()),
+                href: Cow::Owned(href.clone().into_owned()),
+            },
             Token::BlockCode { inner, lang } => Token::BlockCode {
                 lang: lang.as_ref().map(|l| Cow::Owned(l.clone().into_owned())),
                 inner: Cow::Owned(inner.clone().into_owned()),
@@ -184,12 +201,8 @@ fn spliced<'a>(
 }
 
 fn space(input: Span) -> IResult<Span, Token> {
-    let start = input;
-    let (input, _) = alt((complete::char('\u{0020}'), complete::char('\u{3000}'), tab))(input)?;
-    Ok((
-        input,
-        Token::PlainText(start.fragment_between(&input).into()),
-    ))
+    let (input, frag) = recognize(alt((one_char('\u{0020}'), one_char('\u{3000}'), tab)))(input)?;
+    Ok((input, Token::PlainText(frag.into_fragment().into())))
 }
 
 struct Context;
@@ -213,6 +226,12 @@ impl Context {
         Ok((input, token))
     }
 
+    /// Inline parser that only tries `tag_small` and then `text` (no URL or link alternatives).
+    fn inline_no_link<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, token) = alt((self.partial(Self::tag_small), self.partial(Self::text)))(input)?;
+        Ok((input, token))
+    }
+
     fn tag_quote<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?;
 
@@ -550,4 +569,187 @@ impl Context {
             Token::PlainText(before.fragment_between(&input).into()),
         ))
     }
+
+    /// Parses a bare `http://` or `https://` URL into [`Token::UrlRaw`].
+    ///
+    /// A single trailing `.`, `,` or `?` is taken to be sentence punctuation: it is left out of
+    /// the token and stays in the remaining input, so the text that follows still contains it.
+    fn url<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (_, url_span) = recognize(tuple((
+            protocol,
+            url_chars(|input| not(url_chars_base)(input), false),
+        )))(input)?;
+
+        let url = url_span.into_fragment();
+
+        // Strip punctuation at the end of sentences that might have been consumed as a part
+        // of the URL.
+        let url_len = if matches!(url.as_bytes().last(), Some(b'.' | b',' | b'?')) {
+            url.len() - 1
+        } else {
+            url.len()
+        };
+
+        // Continue right after the kept part so the stripped character is not dropped.
+        Ok((
+            input.slice(url_len..),
+            Token::UrlRaw(Cow::from(url.slice(..url_len))),
+        ))
+    }
+
+    /// Parses a URL wrapped in angle brackets, `<https://...>`, into [`Token::UrlNoEmbed`].
+    ///
+    /// The brackets are consumed but are not part of the token.
+    fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, _) = tag("<")(input)?;
+        let (input, url_span) = recognize(tuple((protocol, url_chars(tag(">"), true))))(input)?;
+        let (input, _) = tag(">")(input)?;
+
+        Ok((input, Token::UrlNoEmbed(Cow::from(url_span.into_fragment()))))
+    }
+
+    /// Parses a `[label](url)` link into [`Token::Link`].
+    ///
+    /// A `?` directly before the opening bracket sets `embed` to `false`. The label is
+    /// non-empty, does not start with `[` and does not span a line ending.
+    fn link<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, no_embed) = opt(tag("?"))(input)?;
+        let (input, _) = tag("[")(input)?;
+        let (input, _) = not(tag("["))(input)?;
+        // Advance one character at a time: `not_line_ending` is greedy and would swallow the
+        // closing `](` along with the rest of the line.
+        let (input, label_span) = recognize(many1(tuple((
+            not(tag("](")),
+            not(line_ending),
+            anychar,
+        ))))(input)?;
+        let (input, _) = tag("]")(input)?;
+        let (input, _) = tag("(")(input)?;
+        // The URL is closed by `)`, so that is the terminator `url_chars` has to look for.
+        let (input, url_span) = recognize(tuple((protocol, url_chars(tag(")"), true))))(input)?;
+        let (input, _) = tag(")")(input)?;
+
+        Ok((
+            input,
+            Token::Link {
+                label: label_span.into_fragment().into(),
+                href: url_span.into_fragment().into(),
+                embed: no_embed.is_none(),
+            },
+        ))
+    }
+}
+
+/// Matches the scheme prefix of a URL: `https://` or `http://`.
+#[inline]
+fn protocol(input: Span) -> IResult<Span, Span> {
+    alt((tag("https://"), tag("http://")))(input)
+}
+
+/// Matches a run of ASCII letters and digits, or one punctuation character, that may occur
+/// inside a URL.
+#[inline]
+fn url_chars_base(input: Span) -> IResult<Span, Span> {
+    recognize(alt((alphanumeric1, recognize(one_of(".,_/:%#$&?!~=+-()[]@")))))(input)
+}
+
+/// Builds a parser for the body of a URL.
+///
+/// The URL stops in front of a match of `terminator` (optionally preceded by spaces or tabs)
+/// when that match is directly followed by spaces or tabs, a line ending, the end of input,
+/// one of `([<'"`, or a run of letters or a `*` that is itself followed by one of the first
+/// three. With `spaces` set, spaces and tabs are accepted inside the URL, except at the very
+/// end of the input or directly in front of a `"`.
+#[inline]
+fn url_chars<'a, T: 'a>(
+    terminator: impl Fn(Span<'a>) -> IResult<Span<'a>, T> + 'a,
+    spaces: bool,
+) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'a {
+    let terminating = move |input| {
+        tuple((
+            &terminator,
+            alt((
+                space1,
+                line_ending,
+                eof,
+                recognize(one_of("([<'\"")),
+                recognize(tuple((
+                    alt((alpha1, recognize(one_of("*")))),
+                    alt((space1, line_ending, eof)),
+                ))),
+            )),
+        ))(input)
+    };
+
+    let chars = tuple((
+        not(tuple((space1, eof))),
+        not(tuple((space1, tag("\"")))),
+        not(tuple((opt(space1), terminating))),
+        alt((url_chars_base, if spaces { space1 } else { fail })),
+    ));
+
+    recognize(many1_count(chars))
+}
+
+#[cfg(test)]
+mod test {
+    use crate::{url_chars, Span};
+    use nom::bytes::complete::tag;
+
+    #[test]
+    fn parse_url_chars() {
+        let test1 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security))";
+        assert_eq!(
+            "https://en.wikipedia.org/wiki/Sandbox_(computer_security)",
+            url_chars(tag(")"), true)(Span::new(test1))
+                .unwrap()
+                .1
+                .into_fragment()
+        );
+
+        let test2 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))";
+        assert_eq!(
+            "https://en.wikipedia.org/wiki/Sandbox_(computer_security))",
+            url_chars(tag(")"), true)(Span::new(test2))
+                .unwrap()
+                .1
+                .into_fragment()
+        );
+
+        let test3 = "https://en.wikipedia.org/wiki/(";
+        assert_eq!(
+            test3,
+            url_chars(tag(")"), true)(Span::new(test3))
+                .unwrap()
+                .1
+                .into_fragment()
+        );
+
+        let test4 = "https://cs.wikipedia.org/wiki/Among_Us ";
+        assert_eq!(
+            "https://cs.wikipedia.org/wiki/Among_Us",
+            url_chars(tag(")"), true)(Span::new(test4))
+                .unwrap()
+                .1
+                .into_fragment()
+        );
+
+        let test5 = "https://cs.wikipedia.org/wiki/Among Us )";
+        assert_eq!(
+            "https://cs.wikipedia.org/wiki/Among Us",
+            url_chars(tag(")"), true)(Span::new(test5))
+                .unwrap()
+                .1
+                .into_fragment()
+        );
+
+        let test6 = "https://example.com/2023/page1 ";
+        assert_eq!(
+            "https://example.com/2023/page1",
+            url_chars(tag(")"), true)(Span::new(test6))
+                .unwrap()
+                .1
+                .into_fragment()
+        );
+    }
 }