From d0d977e6ebbf7676b26c1a76ab5890baa2389910 Mon Sep 17 00:00:00 2001
From: Natty
Date: Sun, 8 Oct 2023 22:15:55 +0200
Subject: [PATCH] Fixed URL parsing and initial flanking rules implementation

---
 magnetar_mmm_parser/src/lib.rs | 388 +++++++++++++++++++++++++++------
 1 file changed, 322 insertions(+), 66 deletions(-)

diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs
index 7385d65..d270760 100644
--- a/magnetar_mmm_parser/src/lib.rs
+++ b/magnetar_mmm_parser/src/lib.rs
@@ -1,19 +1,20 @@
 use either::Either;
 use nom::branch::alt;
-use nom::bytes::complete::tag;
+use nom::bytes::complete::{tag, tag_no_case};
 use nom::character::complete::{
-    alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of, space1,
-    tab,
+    alpha1, alphanumeric1, anychar, char as one_char, char, line_ending, not_line_ending, one_of,
+    satisfy, space1, tab,
 };
 use nom::combinator::{eof, fail, map, not, opt, recognize};
 use nom::error::ErrorKind;
 use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1};
 use nom::sequence::tuple;
-use nom::{IResult, Offset, Slice};
+use nom::{Compare, IResult, Offset, Slice};
 use nom_locate::LocatedSpan;
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::convert::{identity, Infallible};
+use std::marker::PhantomData;
 use unicode_segmentation::UnicodeSegmentation;
 
 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
@@ -73,6 +74,80 @@ pub enum Token<'a> {
 }
 
 impl Token<'_> {
+    fn str_content_left(&self) -> Option<&str> {
+        match self {
+            Token::PlainText(text) => Some(text.as_ref()),
+            Token::Sequence(tokens) => tokens.first().and_then(Token::str_content_left),
+            Token::Quote(inner) => inner.str_content_left(),
+            Token::Small(inner) => inner.str_content_left(),
+            Token::BoldItalic(inner) => inner.str_content_left(),
+            Token::Bold(inner) => inner.str_content_left(),
+            Token::Italic(inner) => inner.str_content_left(),
+            Token::Center(inner) => inner.str_content_left(),
+            Token::Strikethrough(inner) => inner.str_content_left(),
+            Token::PlainTag(tag) => Some(tag.as_ref()),
+            Token::UrlRaw(url) => Some(url.as_ref()),
+            Token::UrlNoEmbed(url) => Some(url.as_ref()),
+            Token::Link { label, .. } => label.str_content_left(),
+            Token::Function { inner, .. } => inner.str_content_left(),
+            Token::Mention { name, .. } => Some(name.as_ref()),
+            Token::UnicodeEmoji(code) => Some(code.as_ref()),
+            Token::ShortcodeEmoji(_) => None,
+            Token::Hashtag(tag) => Some(tag.as_ref()),
+            _ => None,
+        }
+    }
+
+    fn str_content_right(&self) -> Option<&str> {
+        match self {
+            Token::PlainText(text) => Some(text.as_ref()),
+            Token::Sequence(tokens) => tokens.last().and_then(Token::str_content_right),
+            Token::Quote(inner) => inner.str_content_right(),
+            Token::Small(inner) => inner.str_content_right(),
+            Token::BoldItalic(inner) => inner.str_content_right(),
+            Token::Bold(inner) => inner.str_content_right(),
+            Token::Italic(inner) => inner.str_content_right(),
+            Token::Center(inner) => inner.str_content_right(),
+            Token::Strikethrough(inner) => inner.str_content_right(),
+            Token::PlainTag(tag) => Some(tag.as_ref()),
+            Token::UrlRaw(url) => Some(url.as_ref()),
+            Token::UrlNoEmbed(url) => Some(url.as_ref()),
+            Token::Link { label, .. } => label.str_content_right(),
+            Token::Function { inner, .. } => inner.str_content_right(),
+            Token::Mention { name, .. } => Some(name.as_ref()),
+            Token::UnicodeEmoji(code) => Some(code.as_ref()),
+            Token::Hashtag(tag) => Some(tag.as_ref()),
+            _ => None,
+        }
+    }
+
+    fn inner(&self) -> Token {
+        match self {
+            plain @ Token::PlainText(_) => plain.clone(),
+            sequence @ Token::Sequence(_) => sequence.clone(),
+            Token::Quote(inner) => inner.inner(),
+            Token::Small(inner) => inner.inner(),
+            Token::BoldItalic(inner) => inner.inner(),
+            Token::Bold(inner) => inner.inner(),
+            Token::Italic(inner) => inner.inner(),
+            Token::Center(inner) => inner.inner(),
+            Token::Strikethrough(inner) => inner.inner(),
+            Token::PlainTag(text) => Token::PlainText(text.clone()),
+            Token::InlineCode(code) => Token::PlainText(code.clone()),
+            Token::InlineMath(math) => Token::PlainText(math.clone()),
+            Token::UrlRaw(url) => Token::PlainText(url.clone()),
+            Token::UrlNoEmbed(url) => Token::PlainText(url.clone()),
+            Token::Link { label, .. } => label.inner(),
+            Token::BlockCode { inner, .. } => Token::PlainText(inner.clone()),
+            Token::BlockMath(math) => Token::PlainText(math.clone()),
+            Token::Function { inner, .. } => inner.inner(),
+            Token::Mention { name, .. } => Token::PlainText(name.clone()),
+            Token::UnicodeEmoji(code) => Token::PlainText(code.clone()),
+            Token::ShortcodeEmoji(shortcode) => Token::PlainText(shortcode.clone()),
+            Token::Hashtag(tag) => Token::PlainText(tag.clone()),
+        }
+    }
+
     fn owned(&self) -> Token<'static> {
         match self {
             Token::PlainText(text) => Token::PlainText(Cow::Owned(text.clone().into_owned())),
@@ -129,7 +204,7 @@ impl Token<'_> {
             Token::ShortcodeEmoji(shortcode) => {
                 Token::ShortcodeEmoji(Cow::Owned(shortcode.clone().into_owned()))
             }
-            Token::Hashtag(url) => Token::Hashtag(Cow::Owned(url.clone().into_owned())),
+            Token::Hashtag(tag) => Token::Hashtag(Cow::Owned(tag.clone().into_owned())),
         }
     }
 
@@ -245,6 +320,16 @@ fn collect_char_sequence<'a>(
     move |chars| func(Cow::Owned(chars.collect()))
 }
 
+#[inline]
+fn alpha1_unicode(input: Span) -> IResult<Span, Span> {
+    recognize(many1_count(satisfy(char::is_alphabetic)))(input)
+}
+
+#[inline]
+fn alphanumeric1_unicode(input: Span) -> IResult<Span, Span> {
+    recognize(many1_count(satisfy(char::is_alphanumeric)))(input)
+}
+
 fn spliced<'a>(
     segments: &[Span<'a>],
     func: impl Fn(Span) -> IResult<Span, Token>,
@@ -316,15 +401,16 @@ fn space(input: Span) -> IResult<Span, Token> {
     Ok((input, Token::PlainText(frag.into_fragment().into())))
 }
 
-struct Matcher<'a, 'b, T> {
+#[derive(Copy, Clone)]
+struct Matcher<'a, 'b, T: Clone> {
     matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a),
     collector: &'a (dyn Fn(&mut dyn Iterator<Item = Token<'b>>) -> Token<'b> + 'a),
-    _phantom_closure: std::marker::PhantomData<&'a ()>,
-    _phantom_data: std::marker::PhantomData<&'b ()>,
-    _phantom_output: std::marker::PhantomData<fn() -> T>,
+    _phantom_closure: PhantomData<&'a ()>,
+    _phantom_data: PhantomData<&'b ()>,
+    _phantom_output: PhantomData<fn() -> T>,
 }
 
-impl<'a, 'b, T> Matcher<'a, 'b, T> {
+impl<'a, 'b, T: Clone> Matcher<'a, 'b, T> {
     fn new(
         matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a),
         collector: &'a (dyn Fn(&mut dyn Iterator<Item = Token<'b>>) -> Token<'b> + 'a),
     ) -> Self {
         Self {
             matcher_inner,
             collector,
-            _phantom_closure: std::marker::PhantomData,
-            _phantom_data: std::marker::PhantomData,
-            _phantom_output: std::marker::PhantomData,
+            _phantom_closure: PhantomData,
+            _phantom_data: PhantomData,
+            _phantom_output: PhantomData,
         }
     }
 }
 
@@ -345,33 +431,60 @@ impl<'a, 'b> Matcher<'a, 'b, Infallible> {
         Self {
             matcher_inner: &fail::<_, Infallible, _>,
             collector: &|_| unreachable!(),
-            _phantom_closure: std::marker::PhantomData,
-            _phantom_data: std::marker::PhantomData,
-            _phantom_output: std::marker::PhantomData,
+            _phantom_closure: PhantomData,
+            _phantom_data: PhantomData,
+            _phantom_output: PhantomData,
         }
     }
 }
 
-struct Context;
+#[derive(Copy, Clone, Debug)]
+enum FlankingRule {
+    Lenient,
+    Strict,
+    DontCare,
+}
+
+struct FlankingDelim<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>>(
+    T,
+    FlankingRule,
+    PhantomData<&'a ()>,
+);
+
+impl<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>> From<(T, FlankingRule)>
+    for FlankingDelim<'a, T>
+{
+    fn from((func, rule): (T, FlankingRule)) -> Self {
+        FlankingDelim(func, rule, PhantomData)
+    }
+}
+
+impl<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>> From<T> for FlankingDelim<'a, T> {
+    fn from(func: T) -> Self {
+        FlankingDelim(func, FlankingRule::DontCare, PhantomData)
+    }
+}
+
+pub struct Context;
 
 impl Context {
     #[inline]
-    const fn partial(
+    fn partial(
         &self,
         func: impl for<'a> Fn(&Self, Span<'a>) -> IResult<Span<'a>, Token<'a>> + 'static,
     ) -> impl for<'a> Fn(Span<'a>) -> IResult<Span<'a>, Token<'a>> + '_ {
         move |input| func(self, input)
     }
 
-    fn full<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+    pub fn full<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         map(many1(self.partial(Self::full_single)), Token::Sequence)(input)
     }
 
-    fn inline<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+    pub fn inline<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         map(many1(self.partial(Self::inline_single)), Token::Sequence)(input)
     }
 
-    fn inline_label_safe<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+    pub fn inline_label_safe<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         map(
             many1(self.partial(Self::inline_label_safe_single)),
             Token::Sequence,
@@ -606,14 +719,21 @@ impl Context {
     }
 
     #[inline]
-    fn tag_delimited<'a, 'b: 'a, T, S>(
+    fn tag_delimited<'a, 'b: 'a, T: Clone, S: Clone, FOpen, FClose>(
         &'a self,
-        opening_tag: impl Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
-        closing_tag: impl Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
+        opening_tag: impl Into<FlankingDelim<'b, FOpen>> + 'a,
+        closing_tag: impl Into<FlankingDelim<'b, FClose>> + 'a,
         escape: bool,
         matcher: Matcher<'a, 'b, T>,
         fallback: Matcher<'a, 'b, S>,
-    ) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token<'b>> + '_ {
+    ) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token<'b>> + '_
+    where
+        FOpen: Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
+        FClose: Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
+    {
+        let FlankingDelim(opening_tag, opening_rule, ..) = opening_tag.into();
+        let FlankingDelim(closing_tag, closing_rule, ..) = closing_tag.into();
+
         move |input| {
             if escape {
                 if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), &opening_tag))(input) {
@@ -662,10 +782,44 @@ impl Context {
                 ));
             }
 
-            let (input, (inner, _)) = res?;
+            let (input, (inner, closing)) = res?;
 
             let mut inner = inner.into_iter().map(|(_, t)| t);
 
-            Ok((input, (matcher.collector)(&mut inner)))
+            let inner_tok = (matcher.collector)(&mut inner);
+
+            let correct_left_flanking =
+                if let FlankingRule::Lenient | FlankingRule::Strict = opening_rule {
+                    let text_left = inner_tok.str_content_left();
+
+                    !(text_left.is_some_and(|s| s.starts_with(char::is_whitespace))
+                        || text_left.is_none())
+                } else {
+                    true
+                };
+
+            let correct_right_flanking =
+                if let FlankingRule::Lenient | FlankingRule::Strict = closing_rule {
+                    let text_right = inner_tok.str_content_right();
+
+                    !(text_right.is_some_and(|s| s.ends_with(char::is_whitespace))
+                        || text_right.is_none())
+                } else {
+                    true
+                };
+
+            // TODO: Unfinished flanking rules
+            let correct_flanking = correct_left_flanking && correct_right_flanking;
+
+            if !correct_flanking {
+                return Ok((
+                    input,
+                    Token::Sequence(vec![
+                        Token::PlainText(begin.fragment_between(&post_open).into()),
+                        inner_tok.inner().owned(),
+                        Token::PlainText(closing.into_fragment().into()),
+                    ]),
+                ));
+            }
+
+            Ok((input, Token::Sequence(vec![inner_tok])))
         }
     }
 
@@ -720,12 +874,12 @@ impl Context {
     }
 
     fn tag_plain<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
-        let opening_tag = &tag("<plain>");
-        let closing_tag = &tag("</plain>");
+        let opening_tag = &tag("<plain>");
+        let closing_tag = &tag("</plain>");
 
         let (input, _) = opening_tag(input)?;
         let (input, text) = map(
-            recognize(many1(tuple((not_line_ending, not(closing_tag))))),
+            recognize(many1(tuple((not(line_ending), not(closing_tag), anychar)))),
             Span::into_fragment,
         )(input)?;
         let (input, _) = closing_tag(input)?;
@@ -735,8 +889,8 @@ impl Context {
 
         Ok((input, Token::PlainTag(text.into())))
     }
 
     fn tag_small<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            tag("<small>"),
-            tag("</small>"),
+            tag_no_case("<small>"),
+            tag_no_case("</small>"),
             false,
             Matcher::new(
                 &self.partial(Self::inline_single),
                 &collect_sequence(Token::Small),
             ),
             Matcher::reject(),
         )(input)
     }
 
-    // TODO: CommonMark flanking rules
     fn tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            tag("***"),
-            tag("***"),
+            (tag("***"), FlankingRule::Lenient),
+            (tag("***"), FlankingRule::Lenient),
             true,
             Matcher::new(
                 &self.partial(Self::inline_single),
                 &collect_sequence(Token::BoldItalic),
             ),
             Matcher::reject(),
         )(input)
     }
 
-    // TODO: CommonMark flanking rules
     fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            tag("___"),
-            tag("___"),
+            (tag("___"), FlankingRule::Strict),
+            (tag("___"), FlankingRule::Strict),
             true,
             Matcher::new(
                 &self.partial(Self::inline_single),
                 &collect_sequence(Token::BoldItalic),
             ),
             Matcher::reject(),
         )(input)
     }
 
     fn tag_bold<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            tag("<b>"),
-            tag("</b>"),
+            tag_no_case("<b>"),
+            tag_no_case("</b>"),
             false,
             Matcher::new(
                 &self.partial(Self::inline_single),
                 &collect_sequence(Token::Bold),
             ),
             Matcher::reject(),
         )(input)
     }
 
-    // TODO: CommonMark flanking rules
     fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            tag("**"),
-            tag("**"),
+            (tag("**"), FlankingRule::Lenient),
+            (tag("**"), FlankingRule::Lenient),
             true,
             Matcher::new(
                 &self.partial(Self::inline_single),
                 &collect_sequence(Token::Bold),
             ),
             Matcher::reject(),
         )(input)
     }
 
-    // TODO: CommonMark flanking rules
     fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         self.tag_delimited(
-            tag("__"),
-            tag("__"),
(tag("__"), FlankingRule::Strict), + (tag("__"), FlankingRule::Strict), true, Matcher::new( &self.partial(Self::inline_single), @@ -835,8 +985,8 @@ impl Context { fn tag_italic<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag(""), - tag(""), + tag_no_case(""), + tag_no_case(""), false, Matcher::new( &self.partial(Self::inline_single), @@ -849,11 +999,10 @@ impl Context { )(input) } - // TODO: CommonMark flanking rules fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag("*"), - tag("*"), + (tag("*"), FlankingRule::Lenient), + (tag("*"), FlankingRule::Lenient), true, Matcher::new( &self.partial(Self::inline_single), @@ -866,11 +1015,10 @@ impl Context { )(input) } - // TODO: CommonMark flanking rules fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag("_"), - tag("_"), + (tag("_"), FlankingRule::Strict), + (tag("_"), FlankingRule::Strict), true, Matcher::new( &self.partial(Self::inline_single), @@ -885,8 +1033,8 @@ impl Context { fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag(""), - tag(""), + tag_no_case(""), + tag_no_case(""), false, Matcher::new( &self.partial(Self::inline_single), @@ -899,11 +1047,10 @@ impl Context { )(input) } - // TODO: CommonMark flanking rules fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( - tag("~~"), - tag("~~"), + (tag("~~"), FlankingRule::Lenient), + (tag("~~"), FlankingRule::Lenient), true, Matcher::new( &move |input| { @@ -1037,20 +1184,42 @@ impl Context { } fn shortcode_emoji<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { - // TODO: Fail when preceded by alphanumerics + if let (plain_out, Some(plain)) = map( + opt(recognize(tuple(( + alphanumeric1_unicode, + self.partial(Self::shortcode_emoji), + )))), + |o| o.map(Span::into_fragment), + )(input)? + { + return Ok((plain_out, Token::PlainText(plain.into()))); + } + let (input, _) = tag(":")(input)?; let (input, shortcode) = map( - recognize(many1(alt((alphanumeric1, recognize(one_of("_+-")))))), + recognize(many1(alt(( + alphanumeric1_unicode, + recognize(one_of("_+-")), + )))), Span::into_fragment, )(input)?; let (input, _) = tag(":")(input)?; - let (input, _) = not(alphanumeric1)(input)?; + let (input, _) = not(alphanumeric1_unicode)(input)?; Ok((input, Token::ShortcodeEmoji(shortcode.into()))) } fn tag_mention<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { - // TODO: Escaping and skip when preceded by alphanumerics + if let (plain_out, Some(plain)) = map( + opt(recognize(tuple(( + alt((tag("\\"), alphanumeric1_unicode)), + self.partial(Self::tag_mention), + )))), + |o| o.map(Span::into_fragment), + )(input)? 
+        {
+            return Ok((plain_out, Token::PlainText(plain.into())));
+        }
 
         let tags = one_of("@!");
         let (input, mention_type) = map(tags, |c| match c {
@@ -1123,12 +1292,12 @@ fn protocol(input: Span) -> IResult<Span, Span> {
 
 #[inline]
 fn url_chars_base(input: Span) -> IResult<Span, Span> {
-    recognize(alt((
-        alpha1,
+    alt((
+        alphanumeric1_unicode,
         recognize(tuple((tag("["), many_till(url_chars_base, tag("]"))))),
         recognize(tuple((tag("("), many_till(url_chars_base, tag(")"))))),
         recognize(one_of(".,_/:%#$&?!~=+-@")),
-    )))(input)
+    ))(input)
 }
 
 #[inline]
@@ -1221,6 +1390,21 @@ mod test {
             Token::Italic(Box::new(Token::PlainText("italic".into()))),
         );
 
+        assert_eq!(
+            parse_full(r#"* italic *"#),
+            Token::PlainText("* italic *".into())
+        );
+
+        assert_eq!(
+            parse_full(r#"_ italic *"#),
+            Token::PlainText("_ italic *".into())
+        );
+
+        assert_eq!(
+            parse_full(r#"*"italic"*"#),
+            Token::Italic(Box::new(Token::PlainText("\"italic\"".into())))
+        );
+
         assert_eq!(
             parse_full(r#"not code `code` also not code"#),
             Token::Sequence(vec![
@@ -1356,6 +1540,47 @@ text"#
 
     #[test]
     fn parse_link() {
+        assert_eq!(
+            parse_full("IPv4 test: <https://0>"),
+            Token::Sequence(vec![
+                Token::PlainText("IPv4 test: ".into()),
+                Token::UrlNoEmbed("https://0".into())
+            ])
+        );
+
+        assert_eq!(
+            parse_full("IPv4 test: <https://127.0.0.1>"),
+            Token::Sequence(vec![
+                Token::PlainText("IPv4 test: ".into()),
+                Token::UrlNoEmbed("https://127.0.0.1".into())
+            ])
+        );
+
+        assert_eq!(
+            parse_full("IPv6 test: <https://[::2f:1]/nya>"),
+            Token::Sequence(vec![
+                Token::PlainText("IPv6 test: ".into()),
+                Token::UrlNoEmbed("https://[::2f:1]/nya".into())
+            ])
+        );
+
+        assert_eq!(
+            parse_full("IPv6 test: https://[::2f:1]/nya"),
+            Token::Sequence(vec![
+                Token::PlainText("IPv6 test: ".into()),
+                Token::UrlRaw("https://[::2f:1]/nya".into())
+            ])
+        );
+
+        // IDNs
+        assert_eq!(
+            parse_full("IDN test: https://www.háčkyčárky.cz/"),
+            Token::Sequence(vec![
+                Token::PlainText("IDN test: ".into()),
+                Token::UrlRaw("https://www.háčkyčárky.cz/".into())
+            ])
+        );
+
         assert_eq!(
             parse_full("Link test: [label](https://example.com)"),
             Token::Sequence(vec![
@@ -1440,6 +1665,11 @@ text"#
             }
         );
 
+        assert_eq!(
+            parse_full("email@notactuallyamenmtion.org"),
+            Token::PlainText("email@notactuallyamenmtion.org".into())
+        );
+
         assert_eq!(
             parse_full("hgsjlkdsa @tag fgahjsdkd"),
             Token::Sequence(vec![
@@ -1532,6 +1762,32 @@ text"#
         );
     }
 
+    #[test]
+    fn parse_shortcodes() {
+        assert_eq!(
+            parse_full(":bottom:"),
+            Token::ShortcodeEmoji("bottom".into())
+        );
+
+        assert_eq!(
+            parse_full(":bottom::blobfox:"),
+            Token::Sequence(vec![
+                Token::ShortcodeEmoji("bottom".into()),
+                Token::ShortcodeEmoji("blobfox".into())
+            ])
+        );
+
+        assert_eq!(
+            parse_full(":bottom:blobfox"),
+            Token::PlainText(":bottom:blobfox".into())
+        );
+
+        assert_eq!(
+            parse_full("bottom:blobfox:"),
+            Token::PlainText("bottom:blobfox:".into())
+        );
+    }
+
     #[test]
     fn parse_emoji() {
         assert_eq!(