Implemented URL parsing
parent 24d44632e0
commit 9b26691ff4

@@ -1,10 +1,11 @@
 use nom::branch::alt;
 use nom::bytes::complete::tag;
 use nom::character::complete;
 use nom::character::complete::{
-    alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, tab,
+    alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of, space1,
+    tab,
 };
-use nom::combinator::{fail, not, opt, recognize};
+use nom::combinator::{eof, fail, not, opt, recognize};
 use nom::error::ErrorKind;
 use nom::multi::{many0, many0_count, many1, many1_count, separated_list1};
 use nom::sequence::tuple;
@@ -28,6 +28,13 @@ pub enum Token<'a> {
     PlainTag(Cow<'a, str>),
     InlineCode(Cow<'a, str>),
     InlineMath(Cow<'a, str>),
+    UrlRaw(Cow<'a, str>),
+    UrlNoEmbed(Cow<'a, str>),
+    Link {
+        label: Cow<'a, str>,
+        href: Cow<'a, str>,
+        embed: bool,
+    },
     BlockCode {
         lang: Option<Cow<'a, str>>,
         inner: Cow<'a, str>,
@@ -56,6 +63,13 @@ impl Token<'_> {
             Token::PlainTag(tag) => Token::PlainTag(Cow::Owned(tag.clone().into_owned())),
             Token::InlineCode(code) => Token::InlineCode(Cow::Owned(code.clone().into_owned())),
             Token::InlineMath(math) => Token::InlineMath(Cow::Owned(math.clone().into_owned())),
+            Token::UrlRaw(url) => Token::UrlRaw(Cow::Owned(url.clone().into_owned())),
+            Token::UrlNoEmbed(url) => Token::UrlNoEmbed(Cow::Owned(url.clone().into_owned())),
+            Token::Link { embed, label, href } => Token::Link {
+                embed: *embed,
+                label: Cow::Owned(label.clone().into_owned()),
+                href: Cow::Owned(href.clone().into_owned()),
+            },
             Token::BlockCode { inner, lang } => Token::BlockCode {
                 lang: lang.as_ref().map(|l| Cow::Owned(l.clone().into_owned())),
                 inner: Cow::Owned(inner.clone().into_owned()),
@@ -184,12 +198,8 @@ fn spliced<'a>(
 }

 fn space(input: Span) -> IResult<Span, Token> {
-    let start = input;
-    let (input, _) = alt((complete::char('\u{0020}'), complete::char('\u{3000}'), tab))(input)?;
-    Ok((
-        input,
-        Token::PlainText(start.fragment_between(&input).into()),
-    ))
+    let (input, frag) = recognize(alt((one_char('\u{0020}'), one_char('\u{3000}'), tab)))(input)?;
+    Ok((input, Token::PlainText(frag.into_fragment().into())))
 }

 struct Context;
@@ -213,6 +223,11 @@ impl Context {
         Ok((input, token))
     }

+    fn inline_no_link<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, token) = alt((self.partial(Self::tag_small), self.partial(Self::text)))(input)?;
+        Ok((input, token))
+    }
+
     fn tag_quote<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?;

@@ -550,4 +565,147 @@ impl Context {
             Token::PlainText(before.fragment_between(&input).into()),
         ))
     }
+
+    fn url<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, url_span) = recognize(tuple((
+            protocol,
+            url_chars(|input| not(url_chars_base)(input), false),
+        )))(input)?;
+
+        let url = url_span.into_fragment();
+        let url_bytes = url.as_bytes();
+
+        // Strip punctuation at the end of sentences that might have been consumed as a part of the URL
+        let final_url = if matches!(url_bytes.last(), Some(b'.' | b',' | b'?')) {
+            url.slice(..url.len() - 1)
+        } else {
+            url
+        };
+
+        Ok((input, Token::UrlRaw(Cow::from(final_url))))
+    }
+
+    fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, _) = tag("<")(input)?;
+        let (input, url_span) = recognize(tuple((protocol, url_chars(tag(">"), true))))(input)?;
+        let (input, _) = tag(">")(input)?;
+
+        Ok((input, Token::UrlNoEmbed(Cow::from(url_span.into_fragment()))))
+    }
+
+    fn link<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, no_embed) = opt(tag("?"))(input)?;
+        let (input, _) = tag("[")(input)?;
+        let (input, _) = not(tag("["))(input)?;
+        let (input, label_span) =
+            recognize(many1(tuple((not(tag("](")), not_line_ending))))(input)?;
+        let (input, _) = tag("]")(input)?;
+        let (input, _) = tag("(")(input)?;
+        let (input, url_span) = recognize(tuple((protocol, url_chars(tag(")"), true))))(input)?;
+        let (input, _) = tag(")")(input)?;
+
+        Ok((
+            input,
+            Token::Link {
+                label: label_span.into_fragment().into(),
+                href: url_span.into_fragment().into(),
+                embed: no_embed.is_none(),
+            },
+        ))
+    }
 }
+
+#[inline]
+fn protocol(input: Span) -> IResult<Span, Span> {
+    alt((tag("https://"), tag("http://")))(input)
+}
+
+#[inline]
+fn url_chars_base(input: Span) -> IResult<Span, Span> {
+    recognize(alt((alpha1, recognize(one_of(".,_/:%#$&?!~=+-()[]@")))))(input)
+}
+
+#[inline]
+fn url_chars<'a, T: 'a>(
+    terminator: impl Fn(Span<'a>) -> IResult<Span<'a>, T> + 'a,
+    spaces: bool,
+) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'a {
+    let terminating = move |input| {
+        tuple((
+            &terminator,
+            alt((
+                space1,
+                line_ending,
+                eof,
+                recognize(one_of("([<'\"")),
+                recognize(tuple((
+                    alt((alpha1, recognize(one_of("*")))),
+                    alt((space1, line_ending, eof)),
+                ))),
+            )),
+        ))(input)
+    };
+
+    let chars = tuple((
+        not(tuple((space1, eof))),
+        not(tuple((space1, tag("\"")))),
+        not(tuple((opt(space1), terminating))),
+        alt((url_chars_base, if spaces { space1 } else { fail })),
+    ));
+
+    recognize(many1_count(chars))
+}
+
+#[cfg(test)]
+mod test {
+    use crate::{url_chars, Span};
+    use nom::bytes::complete::tag;
+
+    #[test]
+    fn parse_url_chars() {
+        let test1 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security))";
+        assert_eq!(
+            "https://en.wikipedia.org/wiki/Sandbox_(computer_security)",
+            url_chars(tag(")"), true)(Span::new(test1))
+                .unwrap()
+                .1
+                .into_fragment()
+        );
+
+        let test2 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))";
+        assert_eq!(
+            "https://en.wikipedia.org/wiki/Sandbox_(computer_security))",
+            url_chars(tag(")"), true)(Span::new(test2))
+                .unwrap()
+                .1
+                .into_fragment()
+        );
+
+        let test3 = "https://en.wikipedia.org/wiki/(";
+        assert_eq!(
+            test3,
+            url_chars(tag(")"), true)(Span::new(test3))
+                .unwrap()
+                .1
+                .into_fragment()
+        );
+
+        let test4 = "https://cs.wikipedia.org/wiki/Among_Us ";
+        assert_eq!(
+            "https://cs.wikipedia.org/wiki/Among_Us",
+            url_chars(tag(")"), true)(Span::new(test4))
+                .unwrap()
+                .1
+                .into_fragment()
+        );
+
+        let test5 = "https://cs.wikipedia.org/wiki/Among Us )";
+        assert_eq!(
+            "https://cs.wikipedia.org/wiki/Among Us",
+            url_chars(tag(")"), true)(Span::new(test5))
+                .unwrap()
+                .1
+                .into_fragment()
+        );
+    }
+}
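
The core behaviour of url above (recognise a protocol plus URL characters, then drop one trailing '.', ',' or '?' so sentence punctuation is not swallowed) can be shown with a self-contained simplification over plain &str. This is a sketch of the idea only: the helper names bare_url and is_url_char are illustrative, take_while1 stands in for the url_chars machinery, and, like the real parser, the trimmed character is not given back to the remaining input.

use nom::branch::alt;
use nom::bytes::complete::{tag, take_while1};
use nom::combinator::recognize;
use nom::sequence::tuple;
use nom::IResult;

// Roughly the character set accepted by url_chars_base above.
fn is_url_char(c: char) -> bool {
    c.is_ascii_alphanumeric() || ".,_/:%#$&?!~=+-()[]@".contains(c)
}

// Recognise "http(s)://" plus URL characters, then strip one trailing
// '.', ',' or '?' from the reported URL text.
fn bare_url(input: &str) -> IResult<&str, &str> {
    let (rest, matched) = recognize(tuple((
        alt((tag("https://"), tag("http://"))),
        take_while1(is_url_char),
    )))(input)?;

    let trimmed = if matched.ends_with(|c: char| matches!(c, '.' | ',' | '?')) {
        &matched[..matched.len() - 1]
    } else {
        matched
    };
    Ok((rest, trimmed))
}

fn main() {
    let (_, url) = bare_url("https://example.com/page. Next sentence.").unwrap();
    assert_eq!(url, "https://example.com/page");
}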