diff --git a/Cargo.lock b/Cargo.lock index 95e3419..35e50cd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -411,6 +411,12 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "bytecount" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" + [[package]] name = "byteorder" version = "1.4.3" @@ -456,6 +462,15 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a4f925191b4367301851c6d99b09890311d74b0d43f274c0b34c86d308a3663" +[[package]] +name = "castaway" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a17ed5635fc8536268e5d4de1e22e81ac34419e5f052d4d51f4e01dcc263fcc" +dependencies = [ + "rustversion", +] + [[package]] name = "cc" version = "1.0.81" @@ -578,6 +593,19 @@ dependencies = [ "tokio-util", ] +[[package]] +name = "compact_str" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f86b9c4c00838774a6d902ef931eff7470720c51d90c2e32cfe15dc304737b3f" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "ryu", + "static_assertions", +] + [[package]] name = "const-oid" version = "0.9.4" @@ -742,6 +770,15 @@ dependencies = [ "serde", ] +[[package]] +name = "emojis" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee61eb945bff65ee7d19d157d39c67c33290ff0742907413fd5eefd29edc979" +dependencies = [ + "phf", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -1603,6 +1640,19 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "mmm_parser" +version = "0.2.1-alpha" +dependencies = [ + "compact_str", + "either", + "emojis", + "nom", + "nom_locate", + "tracing", + "unicode-segmentation", +] + [[package]] name = "nom" version = "7.1.3" @@ -1613,6 +1663,17 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nom_locate" +version = "4.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e3c83c053b0713da60c5b8de47fe8e494fe3ece5267b2f23090a07a053ba8f3" +dependencies = [ + "bytecount", + "memchr", + "nom", +] + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -2474,18 +2535,18 @@ checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" [[package]] name = "serde" -version = "1.0.180" +version = "1.0.188" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea67f183f058fe88a4e3ec6e2788e003840893b91bac4559cabedd00863b3ed" +checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.180" +version = "1.0.188" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24e744d7782b686ab3b73267ef05697159cc0e5abbed3f47f9933165e5219036" +checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 95a4fe6..9828764 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ members = [ "fe_calckey", "magnetar_common", "magnetar_sdk", + "magnetar_mmm_parser", "core" ] @@ -27,7 +28,10 @@ axum = "0.6" cached = "0.46" cfg-if = "1" chrono = "0.4" +compact_str = "0.7" dotenvy = "0.15" +either = "1.9" +emojis = "0.6" futures-core = "0.3" futures-util = "0.3" headers = "0.3" @@ -36,8 +40,11 @@ hyper = "0.14" js-sys = "0.3" log = "0.4" miette = "5.9" +nom = "7" +nom_locate = "4" percent-encoding = 
"2.2" redis = "0.23" +regex = "1.9" reqwest = "0.11" sea-orm = "0.12" sea-orm-migration = "0.12" @@ -98,4 +105,4 @@ toml = { workspace = true } unicode-segmentation = { workspace = true } [profile.release] -lto = true \ No newline at end of file +lto = true diff --git a/magnetar_mmm_parser/Cargo.toml b/magnetar_mmm_parser/Cargo.toml new file mode 100644 index 0000000..d7b9b2d --- /dev/null +++ b/magnetar_mmm_parser/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "mmm_parser" +version.workspace = true +edition.workspace = true +license = "MIT OR Apache-2.0" + +[dependencies] +either = { workspace = true } +emojis = { workspace = true } +nom = { workspace = true } +nom_locate = { workspace = true } +compact_str = { workspace = true } +tracing = { workspace = true } +unicode-segmentation = { workspace = true } diff --git a/magnetar_mmm_parser/README.md b/magnetar_mmm_parser/README.md new file mode 100644 index 0000000..92dff9b --- /dev/null +++ b/magnetar_mmm_parser/README.md @@ -0,0 +1,5 @@ +# MMM + +Magnetar {marinated, modified} Markdown? + +#TODO: Finish docs \ No newline at end of file diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs new file mode 100644 index 0000000..26661e6 --- /dev/null +++ b/magnetar_mmm_parser/src/lib.rs @@ -0,0 +1,2007 @@ +use compact_str::{CompactString, ToCompactString}; +use either::Either; +use nom::branch::alt; +use nom::bytes::complete::{tag, tag_no_case}; +use nom::character::complete::{ + alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of, + satisfy, space1, tab, +}; +use nom::combinator::{eof, fail, map, not, opt, peek, recognize}; +use nom::error::ErrorKind; +use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1}; +use nom::sequence::tuple; +use nom::{IResult, Offset, Parser, Slice}; +use nom_locate::LocatedSpan; +use std::collections::HashMap; +use std::convert::{identity, Infallible}; +use std::marker::PhantomData; +use tracing::trace; +use unicode_segmentation::UnicodeSegmentation; + +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum MentionType { + Community, + User, +} + +impl MentionType { + pub fn to_char(&self) -> char { + match self { + MentionType::Community => '!', + MentionType::User => '@', + } + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Token { + PlainText(CompactString), + Sequence(Vec), + Quote(Box), + Small(Box), + BoldItalic(Box), + Bold(Box), + Italic(Box), + Center(Box), + Strikethrough(Box), + PlainTag(String), + InlineCode(String), + InlineMath(String), + UrlRaw(String), + UrlNoEmbed(String), + Link { + label: Box, + href: String, + embed: bool, + }, + BlockCode { + lang: Option, + inner: String, + }, + BlockMath(String), + Function { + name: String, + params: HashMap>, + inner: Box, + }, + Mention { + name: String, + host: Option, + mention_type: MentionType, + }, + UnicodeEmoji(String), + ShortcodeEmoji(String), + Hashtag(String), +} + +impl Token { + fn str_content_left(&self) -> Option<&str> { + match self { + Token::PlainText(text) => Some(text.as_ref()), + Token::Sequence(tokens) => tokens.first().and_then(Token::str_content_left), + Token::Quote(inner) => inner.str_content_left(), + Token::Small(inner) => inner.str_content_left(), + Token::BoldItalic(inner) => inner.str_content_left(), + Token::Bold(inner) => inner.str_content_left(), + Token::Italic(inner) => inner.str_content_left(), + Token::Center(inner) => inner.str_content_left(), + Token::Strikethrough(inner) => inner.str_content_left(), + 
Token::PlainTag(tag) => Some(tag.as_ref()), + Token::UrlRaw(url) => Some(url.as_ref()), + Token::UrlNoEmbed(url) => Some(url.as_ref()), + Token::Link { label, .. } => label.str_content_left(), + Token::Function { inner, .. } => inner.str_content_left(), + Token::Mention { name, .. } => Some(name.as_ref()), + Token::UnicodeEmoji(code) => Some(code.as_ref()), + Token::ShortcodeEmoji(_) => None, + Token::Hashtag(tag) => Some(tag.as_ref()), + _ => None, + } + } + + fn str_content_right(&self) -> Option<&str> { + match self { + Token::PlainText(text) => Some(text.as_ref()), + Token::Sequence(tokens) => tokens.last().and_then(Token::str_content_right), + Token::Quote(inner) => inner.str_content_right(), + Token::Small(inner) => inner.str_content_right(), + Token::BoldItalic(inner) => inner.str_content_right(), + Token::Bold(inner) => inner.str_content_right(), + Token::Italic(inner) => inner.str_content_right(), + Token::Center(inner) => inner.str_content_right(), + Token::Strikethrough(inner) => inner.str_content_right(), + Token::PlainTag(tag) => Some(tag.as_ref()), + Token::UrlRaw(url) => Some(url.as_ref()), + Token::UrlNoEmbed(url) => Some(url.as_ref()), + Token::Link { label, .. } => label.str_content_right(), + Token::Function { inner, .. } => inner.str_content_right(), + Token::Mention { name, .. } => Some(name.as_ref()), + Token::UnicodeEmoji(code) => Some(code.as_ref()), + Token::Hashtag(tag) => Some(tag.as_ref()), + _ => None, + } + } + + fn inner(&self) -> Token { + match self { + plain @ Token::PlainText(_) => plain.clone(), + sequence @ Token::Sequence(_) => sequence.clone(), + Token::Quote(inner) => inner.inner(), + Token::Small(inner) => inner.inner(), + Token::BoldItalic(inner) => inner.inner(), + Token::Bold(inner) => inner.inner(), + Token::Italic(inner) => inner.inner(), + Token::Center(inner) => inner.inner(), + Token::Strikethrough(inner) => inner.inner(), + Token::PlainTag(text) => Token::PlainText(text.clone().into()), + Token::InlineCode(code) => Token::PlainText(code.clone().into()), + Token::InlineMath(math) => Token::PlainText(math.clone().into()), + Token::UrlRaw(url) => Token::PlainText(url.clone().into()), + Token::UrlNoEmbed(url) => Token::PlainText(url.clone().into()), + Token::Link { label, .. } => label.inner(), + Token::BlockCode { inner, .. } => Token::PlainText(inner.clone().into()), + Token::BlockMath(math) => Token::PlainText(math.clone().into()), + Token::Function { inner, .. } => inner.inner(), + Token::Mention { name, .. 
} => Token::PlainText(name.clone().into()), + Token::UnicodeEmoji(code) => Token::PlainText(code.clone().into()), + Token::ShortcodeEmoji(shortcode) => Token::PlainText(shortcode.clone().into()), + Token::Hashtag(tag) => Token::PlainText(tag.clone().into()), + } + } + + fn merged(&self) -> Token { + match self { + Token::Sequence(tokens) => { + let tokens_multi = tokens.iter().fold(Vec::new(), |mut acc, tok| { + if let Some(Token::PlainText(last)) = acc.last_mut() { + if let Token::PlainText(tok_text) = tok { + *last += tok_text.as_ref(); + + return acc; + } + } + + if let Token::Sequence(seq) = tok { + let items = seq.iter().map(Token::merged).flat_map(|t| match t { + Token::Sequence(seq) => Either::Left(seq.into_iter()), + other => Either::Right(std::iter::once(other)), + }); + + for item in items { + if let Some(Token::PlainText(last)) = acc.last_mut() { + if let Token::PlainText(tok_text) = item { + *last += tok_text.as_ref(); + + continue; + } + } + + acc.push(item); + } + + return acc; + } + + acc.push(tok.merged()); + acc + }); + + if tokens_multi.len() == 1 { + return tokens_multi.into_iter().next().unwrap(); + } + + Token::Sequence(tokens_multi) + } + Token::Quote(inner) => Token::Quote(Box::new(inner.merged())), + Token::Small(inner) => Token::Small(Box::new(inner.merged())), + Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.merged())), + Token::Bold(inner) => Token::Bold(Box::new(inner.merged())), + Token::Italic(inner) => Token::Italic(Box::new(inner.merged())), + Token::Center(inner) => Token::Center(Box::new(inner.merged())), + Token::Strikethrough(inner) => Token::Strikethrough(Box::new(inner.merged())), + Token::Link { embed, label, href } => Token::Link { + label: Box::new(label.merged()), + href: href.clone(), + embed: *embed, + }, + Token::Function { + name, + params, + inner, + } => Token::Function { + name: name.clone(), + params: params.clone(), + inner: Box::new(inner.merged()), + }, + other => other.clone(), + } + } +} + +#[derive(Debug, Default, Copy, Clone)] +pub struct SpanMeta { + depth: usize, +} + +impl SpanMeta { + fn new(depth: usize) -> Self { + Self { depth } + } +} + +type Span<'a> = LocatedSpan<&'a str, SpanMeta>; + +trait SliceOffset { + fn up_to(&self, other: &Self) -> Self; + + fn fragment_between<'a>(&self, other: &Self) -> &'a str + where + Self: 'a; +} + +impl SliceOffset for Span<'_> { + fn up_to(&self, other: &Self) -> Self { + self.slice(..self.offset(other)) + } + + fn fragment_between<'a>(&self, other: &Self) -> &'a str + where + Self: 'a, + { + self.up_to(other).into_fragment() + } +} + +#[inline] +fn boxing_token(func: impl Fn(Box) -> Token) -> impl Fn(Token) -> Token { + move |tokens| func(Box::new(tokens)) +} + +#[inline] +fn collect_sequence( + func: impl Fn(Vec) -> Token, + transform: impl Fn(Token) -> Token, +) -> impl Fn(&mut dyn Iterator) -> Token { + move |tokens| transform(func(tokens.collect())) +} + +#[inline] +fn collect_char_sequence( + func: impl Fn(String) -> Token, +) -> impl Fn(&mut dyn Iterator) -> Token { + move |chars| func(chars.collect()) +} + +#[inline] +fn space1_unicode(input: Span) -> IResult { + recognize(many1_count(tuple(( + not(line_ending), + satisfy(char::is_whitespace), + ))))(input) +} + +#[inline] +fn alphanumeric1_unicode(input: Span) -> IResult { + recognize(many1_count(satisfy(char::is_alphanumeric)))(input) +} + +fn spliced<'a>( + segments: &[Span<'a>], + func: impl Fn(Span) -> IResult, + parent: Span<'a>, +) -> IResult, Token, nom::error::Error>> { + let combined = segments + .iter() + 
.copied() + .map(Span::into_fragment) + .collect::>() + .join("\n"); + let cum_offset_combined = segments + .iter() + .scan(0, |acc, &x| { + *acc += x.len(); + Some(*acc) + }) + .collect::>(); + let current_seg = |input: Span| { + cum_offset_combined + .iter() + .enumerate() + .take_while(|(_, &o)| o > input.location_offset()) + .map(|(i, o)| (segments[i], o)) + .last() + }; + + type NE = nom::Err; + type NomError<'x> = nom::error::Error>; + + let quote_span = Span::new_extra( + &combined, + segments.first().map_or(SpanMeta::new(0), |s| s.extra), + ); + let (input, inner) = match func(quote_span) { + Ok(s) => s, + Err(e) => { + return match e { + NE::Error(e) => { + let offset_new = e.input.location_offset(); + if let Some((seg_parent, offset_seg_new)) = current_seg(e.input) { + let offset = offset_new - offset_seg_new; + let offset_orig = offset + seg_parent.location_offset(); + Err(NE::Error(NomError::new( + Span::new_extra( + &parent.into_fragment()[offset_orig..], + seg_parent.extra, + ), + e.code, + ))) + } else { + // ??? + Err(NE::Failure(NomError::new(parent, ErrorKind::Fail))) + } + } + NE::Failure(e) => Err(NE::Error(NomError::new(parent, e.code))), + NE::Incomplete(i) => Err(NE::Incomplete(i)), + }; + } + }; + + let out = if let Some((seg_parent, offset_seg_new)) = current_seg(input) { + let offset = input.location_offset() - offset_seg_new; + let offset_orig = offset + seg_parent.location_offset(); + parent.slice(offset_orig..) + } else { + parent + }; + + Ok((out, inner)) +} + +fn space(input: Span) -> IResult { + let (input, frag) = recognize(alt((one_char('\u{0020}'), one_char('\u{3000}'), tab)))(input)?; + Ok((input, Token::PlainText(frag.into_fragment().into()))) +} + +#[derive(Copy, Clone)] +struct Matcher<'a, 'b, T: Clone> { + matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult, T> + 'a), + collector: &'a (dyn Fn(&mut dyn Iterator) -> Token + 'a), + _phantom_closure: PhantomData<&'a ()>, + _phantom_data: PhantomData<&'b ()>, + _phantom_output: PhantomData T>, +} + +impl<'a, 'b, T: Clone> Matcher<'a, 'b, T> { + fn new( + matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult, T> + 'a), + collector: &'a (dyn Fn(&mut dyn Iterator) -> Token + 'a), + ) -> Self { + Self { + matcher_inner, + collector, + _phantom_closure: PhantomData, + _phantom_data: PhantomData, + _phantom_output: PhantomData, + } + } +} + +impl<'a, 'b> Matcher<'a, 'b, Infallible> { + // Don't break this invariant, else a monster will come at night and eat all your socks + fn reject() -> Self { + Self { + matcher_inner: &fail::<_, Infallible, _>, + collector: &|_| unreachable!(), + _phantom_closure: PhantomData, + _phantom_data: PhantomData, + _phantom_output: PhantomData, + } + } +} + +#[derive(Copy, Clone, Debug)] +enum FlankingRule { + Lenient, + Strict, + DontCare, +} + +struct FlankingDelim<'a, T: Fn(Span<'a>) -> IResult, Span<'a>>>( + T, + FlankingRule, + PhantomData<&'a ()>, +); + +impl<'a, T: Fn(Span<'a>) -> IResult, Span<'a>>> From<(T, FlankingRule)> + for FlankingDelim<'a, T> +{ + fn from((func, rule): (T, FlankingRule)) -> Self { + FlankingDelim(func, rule, PhantomData) + } +} + +impl<'a, T: Fn(Span<'a>) -> IResult, Span<'a>>> From for FlankingDelim<'a, T> { + fn from(func: T) -> Self { + FlankingDelim(func, FlankingRule::DontCare, PhantomData) + } +} + +pub struct Context { + depth_limit: usize, +} + +const DEFAULT_DEPTH_LIMIT: usize = 24; + +impl Default for Context { + fn default() -> Self { + Context::new(DEFAULT_DEPTH_LIMIT) + } +} + +impl Context { + pub fn new(depth_limit: usize) -> Self { + Self { 
depth_limit } + } + + pub fn parse_full(&self, input: &str) -> Token { + match self.full(Span::new_extra(input, SpanMeta::default())) { + Ok((_, t)) => t.merged(), + Err(e) => { + trace!(input = input, "Full parser fail: {:?}", e); + Token::PlainText(e.to_compact_string()) + } + } + } + + pub fn parse_inline(&self, input: &str) -> Token { + match self.inline(Span::new_extra(input, SpanMeta::default())) { + Ok((_, t)) => t.merged(), + Err(e) => { + trace!(input = input, "Inline parser fail: {:?}", e); + Token::PlainText(e.to_compact_string()) + } + } + } + + pub fn parse_ui(&self, input: &str) -> Token { + match self.inline_ui(Span::new_extra(input, SpanMeta::default())) { + Ok((_, t)) => t.merged(), + Err(e) => { + trace!(input = input, "Inline parser fail: {:?}", e); + Token::PlainText(e.to_compact_string()) + } + } + } + + #[inline] + fn partial( + &self, + func: impl for<'a> Fn(&Self, Span<'a>) -> IResult, Token> + 'static, + ) -> impl for<'a> Fn(Span<'a>) -> IResult, Token> + '_ { + move |input| func(self, input) + } + + #[inline] + fn partial_span( + &self, + func: impl for<'a> Fn(&Self, Span<'a>) -> IResult, Span<'a>> + 'static, + ) -> impl for<'a> Fn(Span<'a>) -> IResult, Span<'a>> + '_ { + move |input| func(self, input) + } + + pub fn full<'a>(&self, input: Span<'a>) -> IResult, Token> { + map(many1(self.partial(Self::full_single)), Token::Sequence)(input) + } + + pub fn inline<'a>(&self, input: Span<'a>) -> IResult, Token> { + map(many1(self.partial(Self::inline_single)), Token::Sequence)(input) + } + + pub fn inline_label_safe<'a>(&self, input: Span<'a>) -> IResult, Token> { + map( + many1(self.partial(Self::inline_label_safe_single)), + Token::Sequence, + )(input) + } + + fn inline_ui<'a>(&self, input: Span<'a>) -> IResult, Token> { + map( + many1(alt(( + self.partial(Self::unicode_emoji), + self.partial(Self::shortcode_emoji), + self.partial(Self::tag_raw_text), + ))), + Token::Sequence, + )(input) + } + + fn base_bold_italic<'a>(&self, input: Span<'a>) -> IResult, Token> { + alt(( + self.partial(Self::tag_bold_italic_asterisk), + self.partial(Self::tag_bold_italic_underscore), + self.partial(Self::tag_bold_asterisk), + self.partial(Self::tag_italic_asterisk), + self.partial(Self::tag_bold_underscore), + self.partial(Self::tag_italic_underscore), + ))(input) + } + + fn full_single<'a>(&self, input: Span<'a>) -> IResult, Token> { + let (input, token) = alt(( + self.increase_nesting(alt(( + self.partial(Self::unicode_emoji), + self.partial(Self::tag_block_center), + self.partial(Self::tag_small), + self.partial(Self::tag_plain), + self.partial(Self::tag_bold), + self.partial(Self::tag_italic), + self.partial(Self::tag_strikethrough), + self.partial(Self::url_no_embed), + self.partial(Self::base_bold_italic), + self.partial(Self::tag_block_code), + self.partial(Self::tag_inline_code), + self.partial(Self::tag_quote), + self.partial(Self::tag_block_math), + self.partial(Self::tag_inline_math), + self.partial(Self::tag_strikethrough_tilde), + self.partial(Self::tag_func), + self.partial(Self::tag_mention), + self.partial(Self::tag_hashtag), + self.partial(Self::shortcode_emoji), + self.partial(Self::link), + self.partial(Self::raw_url), + ))), + self.partial(Self::tag_raw_text), + ))(input)?; + Ok((input, token)) + } + + fn inline_single<'a>(&self, input: Span<'a>) -> IResult, Token> { + alt(( + self.increase_nesting(alt(( + self.partial(Self::unicode_emoji), + self.partial(Self::tag_small), + self.partial(Self::tag_plain), + self.partial(Self::tag_bold), +
self.partial(Self::tag_italic), + self.partial(Self::tag_strikethrough), + self.partial(Self::url_no_embed), + self.partial(Self::base_bold_italic), + self.partial(Self::tag_inline_code), + self.partial(Self::tag_inline_math), + self.partial(Self::tag_strikethrough_tilde), + self.partial(Self::tag_func), + self.partial(Self::tag_mention), + self.partial(Self::tag_hashtag), + self.partial(Self::shortcode_emoji), + self.partial(Self::link), + self.partial(Self::raw_url), + ))), + self.partial(Self::tag_raw_text), + ))(input) + } + + fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult, Token> { + let (input, token) = alt(( + self.increase_nesting(alt(( + self.partial(Self::unicode_emoji), + self.partial(Self::url_no_embed), + self.partial(Self::tag_inline_code), + self.partial(Self::tag_inline_math), + self.partial(Self::tag_func), + self.partial(Self::tag_mention), + self.partial(Self::tag_hashtag), + self.partial(Self::shortcode_emoji), + self.partial(Self::raw_url), + ))), + self.partial(Self::tag_raw_text), + ))(input)?; + Ok((input, token)) + } + + fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult, Token> { + let (input, token) = alt(( + self.increase_nesting(alt(( + self.partial(Self::unicode_emoji), + self.partial(Self::tag_small), + self.partial(Self::tag_plain), + self.partial(Self::tag_bold), + self.partial(Self::tag_italic), + self.partial(Self::tag_strikethrough), + self.partial(Self::base_bold_italic), + self.partial(Self::tag_strikethrough_tilde), + self.partial(Self::tag_func), + self.partial(Self::shortcode_emoji), + ))), + self.partial(Self::tag_raw_text), + ))(input)?; + Ok((input, token)) + } + + fn tag_quote<'a>(&self, input: Span<'a>) -> IResult, Token> { + let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?; + + if let (None, None) = leading_spaces { + if input.get_column() != 1 { + return fail(input); + } + } + + let quote_line = |input| tuple((tag(">"), opt(space), not_line_ending))(input); + + let orig_input = input; + let (input, lines) = separated_list1(line_ending, quote_line)(input)?; + + let quote_lines = lines + .into_iter() + .map(|(_, _, text)| text) + .collect::>(); + + if quote_lines.len() == 1 + && quote_lines + .iter() + .map(Span::fragment) + .copied() + .any(&str::is_empty) + { + return fail(input); + } + + let (_, inner) = spliced(&quote_lines, self.partial(Self::full), orig_input)?; + + let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?; + + Ok((input, Token::Quote(Box::new(inner)))) + } + + fn tag_block_center<'a>(&self, input: Span<'a>) -> IResult, Token> { + let tag_start = &tag("
"); + let tag_end = &tag("
"); + + let (input, _) = opt(line_ending)(input)?; + + if input.get_column() != 1 { + return fail(input); + } + + let (input, _) = tag_start(input)?; + let (input, _) = opt(line_ending)(input)?; + + let (input, (center_seq, _)) = many_till( + self.partial(Self::inline_single), + tuple((opt(space1), opt(line_ending), tag_end)), + )(input)?; + + Ok(( + input, + boxing_token(Token::Center)(Token::Sequence(center_seq)), + )) + } + + fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult, Token> { + let delim = &tag("```"); + + let (input, _) = opt(line_ending)(input)?; + + if input.get_column() != 1 { + return fail(input); + } + + let (input, _) = delim(input)?; + let (input, lang) = opt(map( + recognize(many1(tuple((not(delim), not_line_ending)))), + Span::into_fragment, + ))(input)?; + let (input, _) = line_ending(input)?; + + let (input, code) = map( + recognize(many1_count(tuple(( + not(tuple((line_ending, delim))), + anychar, + )))), + Span::into_fragment, + )(input)?; + + let (input, _) = line_ending(input)?; + let (input, _) = delim(input)?; + let (input, _) = many0(space)(input)?; + let (input, _) = not(not(line_ending))(input)?; + let (input, _) = opt(line_ending)(input)?; + + Ok(( + input, + Token::BlockCode { + lang: lang.map(<&str>::into), + inner: code.into(), + }, + )) + } + + fn tag_block_math<'a>(&self, input: Span<'a>) -> IResult, Token> { + let start = &tag("\\["); + let end = &tag("\\]"); + + let (input, _) = opt(line_ending)(input)?; + + if input.get_column() != 1 { + return fail(input); + } + + let (input, _) = start(input)?; + let (input, _) = opt(line_ending)(input)?; + + let (input, math_span) = recognize(many1_count(tuple(( + not(tuple((opt(line_ending), end))), + not_line_ending, + ))))(input)?; + + let (input, _) = opt(line_ending)(input)?; + let (input, _) = end(input)?; + let (input, _) = many0(space)(input)?; + let (input, _) = not(not_line_ending)(input)?; + let (input, _) = opt(line_ending)(input)?; + + Ok(( + input, + Token::BlockMath(math_span.into_fragment().to_string()), + )) + } + + #[inline] + fn tag_delimited<'a, 'b: 'a, T: Clone, S: Clone, FOpen, FClose>( + &'a self, + opening_tag: impl Into> + 'a, + closing_tag: impl Into> + 'a, + escape: bool, + matcher: Matcher<'a, 'b, T>, + fallback: Matcher<'a, 'b, S>, + ) -> impl Fn(Span<'b>) -> IResult, Token> + '_ + where + FOpen: Fn(Span<'b>) -> IResult, Span<'b>> + 'a, + FClose: Fn(Span<'b>) -> IResult, Span<'b>> + 'a, + { + let FlankingDelim(opening_tag, opening_rule, ..) = opening_tag.into(); + let FlankingDelim(closing_tag, closing_rule, ..) = closing_tag.into(); + + move |input| { + if escape { + if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), &opening_tag))(input) { + return Ok(( + input_escaped, + Token::PlainText(mark.fragment().to_string().into()), + )); + } + } + + if let FlankingRule::Strict = opening_rule { + let (input, pre) = + opt(recognize(tuple((alphanumeric1_unicode, &opening_tag))))(input)?; + if let Some(pre_text) = pre { + return Ok((input, Token::PlainText(pre_text.into_fragment().into()))); + } + } + + let begin = input; + let (post_open, _) = opening_tag(input)?; + + let res = tuple(( + many1(tuple((not(&closing_tag), &matcher.matcher_inner))), + &closing_tag, + ))(post_open); + + if let Err(nom::Err::Error(nom::error::Error { + input: input_past_err, + .. 
+ })) = res + { + let res_fallback = tuple(( + many1(tuple((not(&closing_tag), &fallback.matcher_inner))), + &closing_tag, + ))(post_open); + + if res_fallback.is_err() { + return Ok(( + input_past_err, + Token::PlainText(begin.fragment_between(&input_past_err).into()), + )); + } + + let (input, (inner, closing)) = res_fallback.unwrap(); + let mut inner = inner.into_iter().map(|(_, t)| t); + + return Ok(( + input, + Token::Sequence(vec![ + Token::PlainText(begin.fragment_between(&post_open).into()), + ((fallback.collector)(&mut inner)), + Token::PlainText(closing.into_fragment().into()), + ]), + )); + } + + let (input, (inner, closing)) = res?; + let mut inner = inner.into_iter().map(|(_, t)| t); + + let inner_tok = (matcher.collector)(&mut inner); + + let correct_left_flanking = + if let FlankingRule::Lenient | FlankingRule::Strict = opening_rule { + let text_left = inner_tok.str_content_left(); + + !(text_left.is_some_and(|s| s.starts_with(char::is_whitespace)) + || text_left.is_none()) + } else { + true + }; + + let correct_right_flanking = + if let FlankingRule::Lenient | FlankingRule::Strict = closing_rule { + let text_right = inner_tok.str_content_right(); + !(text_right.is_some_and(|s| s.ends_with(char::is_whitespace)) + || text_right.is_none()) + } else { + true + }; + + let (input, alphanum) = opt(peek(alphanumeric1_unicode))(input)?; + let correct_right_outer = + alphanum.is_none() || !matches!(closing_rule, FlankingRule::Strict); + + let correct_flanking = + correct_left_flanking && correct_right_flanking && correct_right_outer; + + if !correct_flanking { + return Ok(( + input, + Token::Sequence(vec![ + Token::PlainText(begin.fragment_between(&post_open).into()), + inner_tok.inner(), + Token::PlainText(closing.into_fragment().into()), + ]), + )); + } + Ok((input, Token::Sequence(vec![inner_tok]))) + } + } + + fn tag_func<'a>(&self, input: Span<'a>) -> IResult, Token> { + let (input, _) = tag("$[")(input)?; + + let func_ident = |input| { + recognize(tuple(( + many1_count(alt((alpha1, tag("_")))), + many0_count(alt((alphanumeric1, tag("_")))), + )))(input) + }; + + let param_value = recognize(many1_count(alt(( + alphanumeric1, + tag("."), + tag("-"), + tag("_"), + )))); + + let (input, func_name) = map(func_ident, Span::into_fragment)(input)?; + + let arg = tuple((func_ident, opt(tuple((tag("="), param_value))))); + + let (input, args) = + opt(tuple((one_char('.'), separated_list1(one_char(','), arg))))(input)?; + + let args_out = args.map_or_else(HashMap::new, |(_, items)| { + items + .into_iter() + .map(|(k, v)| { + ( + k.into_fragment().to_string(), + v.map(|(_, val)| val.into_fragment().to_string()), + ) + }) + .collect::>() + }); + + let (input, _) = opt(space)(input)?; + + let (input, (inner, _)) = many_till(self.partial(Self::inline_single), tag("]"))(input)?; + + Ok(( + input, + Token::Function { + name: func_name.to_string(), + params: args_out, + inner: Box::new(Token::Sequence(inner)), + }, + )) + } + + fn tag_plain<'a>(&self, input: Span<'a>) -> IResult, Token> { + let opening_tag = &tag("<plain>"); + let closing_tag = &tag("</plain>"); + + let (input, _) = opening_tag(input)?; + let (input, text) = map( + recognize(many1(tuple((not(line_ending), not(closing_tag), anychar)))), + Span::into_fragment, + )(input)?; + let (input, _) = closing_tag(input)?; + + Ok((input, Token::PlainTag(text.into()))) + } + + fn tag_small<'a>(&self, input: Span<'a>) -> IResult, Token> { + self.tag_delimited( + tag_no_case("<small>"), + tag_no_case("</small>"), + false, + Matcher::new( +
&self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::Small)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), + )(input) + } + + fn tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token> { + self.tag_delimited( + (tag("***"), FlankingRule::Lenient), + (tag("***"), FlankingRule::Lenient), + true, + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::BoldItalic)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), + )(input) + } + + fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult, Token> { + self.tag_delimited( + (tag("___"), FlankingRule::Strict), + (tag("___"), FlankingRule::Strict), + true, + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::BoldItalic)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), + )(input) + } + + fn tag_bold<'a>(&self, input: Span<'a>) -> IResult, Token> { + self.tag_delimited( + tag_no_case("<b>"), + tag_no_case("</b>"), + false, + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::Bold)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), + )(input) + } + + fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token> { + self.tag_delimited( + (tag("**"), FlankingRule::Lenient), + (tag("**"), FlankingRule::Lenient), + true, + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::Bold)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), + )(input) + } + + fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult, Token> { + self.tag_delimited( + (tag("__"), FlankingRule::Strict), + (tag("__"), FlankingRule::Strict), + true, + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::Bold)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), + )(input) + } + + fn tag_italic<'a>(&self, input: Span<'a>) -> IResult, Token> { + self.tag_delimited( + tag_no_case("<i>"), + tag_no_case("</i>"), + false, + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::Italic)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), + )(input) + } + + fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token> { + self.tag_delimited( + (tag("*"), FlankingRule::Lenient), + (tag("*"), FlankingRule::Lenient), + true, + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::Italic)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), + )(input) + } + + fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult, Token> { + self.tag_delimited( + (tag("_"), FlankingRule::Strict), + (tag("_"), FlankingRule::Strict), + true, + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence,
boxing_token(Token::Italic)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), + )(input) + } + + fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult, Token> { + self.tag_delimited( + tag_no_case("<s>"), + tag_no_case("</s>"), + false, + Matcher::new( + &self.partial(Self::inline_single), + &collect_sequence(Token::Sequence, boxing_token(Token::Strikethrough)), + ), + Matcher::new( + &self.partial(Self::inline_non_formatting_single), + &collect_sequence(Token::Sequence, identity), + ), + )(input) + } + + fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult, Token> { + self.tag_delimited( + (tag("~~"), FlankingRule::Lenient), + (tag("~~"), FlankingRule::Lenient), + true, + Matcher::new( + &move |input| { + map( + tuple(((not(line_ending)), self.partial(Self::inline_single))), + |(_, captured)| captured, + )(input) + }, + &collect_sequence(Token::Sequence, boxing_token(Token::Strikethrough)), + ), + Matcher::new( + &move |input| { + map( + tuple(( + (not(line_ending)), + self.partial(Self::inline_non_formatting_single), + )), + |(_, captured)| captured, + )(input) + }, + &collect_sequence(Token::Sequence, identity), + ), + )(input) + } + + fn tag_inline_code<'a>(&self, input: Span<'a>) -> IResult, Token> { + self.tag_delimited( + tag("`"), + |input| alt((tag("`"), tag("´")))(input), + true, + Matcher::new( + &move |input| { + map( + tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar)), + |(_, captured)| captured, + )(input) + }, + &collect_char_sequence(Token::InlineCode), + ), + Matcher::reject(), + )(input) + } + + fn tag_inline_math<'a>(&self, input: Span<'a>) -> IResult, Token> { + self.tag_delimited( + tag("\\("), + tag("\\)"), + false, + Matcher::new( + &move |input| { + map(tuple((not(line_ending), anychar)), |(_, captured)| captured)(input) + }, + &collect_char_sequence(Token::InlineMath), + ), + Matcher::reject(), + )(input) + } + + fn tag_raw_text<'a>(&self, input: Span<'a>) -> IResult, Token> { + let (input, text) = anychar(input)?; + Ok((input, Token::PlainText(text.to_compact_string()))) + } + + fn raw_url<'a>(&self, input: Span<'a>) -> IResult, Token> { + let (input, url_span) = recognize(tuple(( + self.partial_span(Self::protocol), + self.url_chars( + |input| recognize(not(self.partial_span(Self::url_chars_base)))(input), + false, + ), + )))(input)?; + + let url = url_span.into_fragment(); + let url_bytes = url.as_bytes(); + + // Strip punctuation at the end of sentences that might have been consumed as a part of the URL + let final_url = if matches!(url_bytes.last(), Some(b'.'
| b',' | b'?')) { + url.slice(..url.len() - 1) + } else { + url + }; + + Ok((input, Token::UrlRaw(final_url.to_string()))) + } + + fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult, Token> { + let (input, _) = tag("<")(input)?; + let (input, url_span) = recognize(tuple(( + self.partial_span(Self::protocol), + self.url_chars(tag(">"), true), + )))(input)?; + let (input, _) = tag(">")(input)?; + + Ok(( + input, + Token::UrlNoEmbed(url_span.into_fragment().to_string()), + )) + } + + fn link<'a>(&self, input: Span<'a>) -> IResult, Token> { + let (input, no_embed) = opt(tag("?"))(input)?; + let (input, _) = tag("[")(input)?; + let (input, _) = not(tag("["))(input)?; + let (input, (label_tok, _)) = + many_till(self.partial(Self::inline_label_safe_single), tag("]("))(input)?; + let (input, url_span) = recognize(tuple(( + self.partial_span(Self::protocol), + self.url_chars(tag(")"), true), + )))(input)?; + let (input, _) = tag(")")(input)?; + + Ok(( + input, + Token::Link { + label: Box::new(Token::Sequence(label_tok)), + href: url_span.into_fragment().into(), + embed: no_embed.is_none(), + }, + )) + } + + fn unicode_emoji<'a>(&self, input: Span<'a>) -> IResult, Token> { + let frag = input.fragment(); + let Some(grapheme) = frag.graphemes(true).next() else { + return fail(input); + }; + + let grapheme = grapheme.trim_end_matches(|c| c == '\u{200c}' || c == '\u{200d}'); + + let emoji = emojis::get(grapheme); + + if emoji.is_none() { + return fail(input); + } + + Ok(( + input.slice(grapheme.len()..), + Token::UnicodeEmoji(grapheme.into()), + )) + } + + fn shortcode_emoji<'a>(&self, input: Span<'a>) -> IResult, Token> { + if let (plain_out, Some(plain)) = map( + opt(recognize(tuple(( + alphanumeric1_unicode, + self.partial(Self::shortcode_emoji), + )))), + |o| o.map(Span::into_fragment), + )(input)? + { + return Ok((plain_out, Token::PlainText(plain.into()))); + } + + let (input, _) = tag(":")(input)?; + let (input, shortcode) = map( + recognize(many1(alt(( + alphanumeric1_unicode, + recognize(one_of("_+-")), + )))), + Span::into_fragment, + )(input)?; + let (input, _) = tag(":")(input)?; + let (input, _) = not(alphanumeric1_unicode)(input)?; + + Ok((input, Token::ShortcodeEmoji(shortcode.into()))) + } + + fn tag_mention<'a>(&self, input: Span<'a>) -> IResult, Token> { + if let (plain_out, Some(plain)) = map( + opt(recognize(tuple(( + alt((tag("\\"), alphanumeric1_unicode)), + self.partial(Self::tag_mention), + )))), + |o| o.map(Span::into_fragment), + )(input)? + { + return Ok((plain_out, Token::PlainText(plain.into()))); + } + + let tags = one_of("@!"); + let (input, mention_type) = map(tags, |c| match c { + '@' => MentionType::User, + '!' => MentionType::Community, + _ => unreachable!(), + })(input)?; + + let (input, name) = map( + recognize(many1(alt((alphanumeric1, recognize(one_of("-_")))))), + Span::into_fragment, + )(input)?; + + let before = input; + let (_, host) = map( + opt(tuple(( + tag("@"), + map( + recognize(many1(alt((alphanumeric1, recognize(one_of("-_.")))))), + Span::into_fragment, + ), + ))), + |maybe_tag_host| maybe_tag_host.map(|(_, host)| host), + )(input)?; + + let host = host.map(|h| h.trim_end_matches(|c| matches!(c, '.' 
| '-' | '_'))); + + Ok(( + host.map(|c| before.slice(c.len() + 1..)).unwrap_or(before), + Token::Mention { + mention_type, + name: name.into(), + host: host.map(|h| h.into()), + }, + )) + } + + fn tag_hashtag<'a>(&self, input: Span<'a>) -> IResult, Token> { + let (input, maybe_preceded) = + opt(recognize(tuple((alphanumeric1_unicode, tag("#")))))(input)?; + + if let Some(preceded) = maybe_preceded { + return Ok((input, Token::PlainText(preceded.into_fragment().into()))); + } + + let (input, _) = tag("#")(input)?; + + let (input, hashtag_text) = map( + recognize(many1(self.partial_span(Self::hashtag_chars))), + Span::into_fragment, + )(input)?; + + Ok((input, Token::Hashtag(hashtag_text.into()))) + } + + #[inline] + fn increase_nesting<'a, 'b, O, F>( + &'b self, + mut func: F, + ) -> impl FnMut(Span<'a>) -> IResult, O> + 'b + where + F: Parser, O, nom::error::Error>> + 'b, + { + move |mut input| { + if input.extra.depth >= self.depth_limit { + return fail(input); + } + + input.extra.depth += 1; + func.parse(input) + } + } + + #[inline] + fn hashtag_chars<'a>(&self, input: Span<'a>) -> IResult, Span<'a>> { + recognize(alt(( + recognize(tuple(( + tag("("), + self.increase_nesting(self.partial_span(Self::hashtag_chars)), + tag(")"), + ))), + recognize(tuple(( + tag("["), + self.increase_nesting(self.partial_span(Self::hashtag_chars)), + tag("]"), + ))), + recognize(tuple(( + tag("「"), + self.increase_nesting(self.partial_span(Self::hashtag_chars)), + tag("」"), + ))), + recognize(tuple(( + tag("("), + self.increase_nesting(self.partial_span(Self::hashtag_chars)), + tag(")"), + ))), + recognize(tuple(( + not(space1_unicode), + not(line_ending), + not(one_of(".,:;!?#?/[]【】()「」()<>")), + anychar, + ))), + )))(input) + } + + #[inline] + fn protocol<'a>(&self, input: Span<'a>) -> IResult, Span<'a>> { + alt((tag("https://"), tag("http://")))(input) + } + + #[inline] + fn url_chars_base<'a>(&self, input: Span<'a>) -> IResult, Span<'a>> { + alt(( + alphanumeric1_unicode, + recognize(tuple(( + tag("["), + many_till( + self.increase_nesting(self.partial_span(Self::url_chars_base)), + tag("]"), + ), + ))), + recognize(tuple(( + tag("("), + many_till( + self.increase_nesting(self.partial_span(Self::url_chars_base)), + tag(")"), + ), + ))), + recognize(one_of(".,_/:%#$&?!~=+-@")), + ))(input) + } + + #[inline] + fn url_chars<'a, 'b, F>( + &'b self, + mut terminator: F, + spaces: bool, + ) -> impl FnMut(Span<'a>) -> IResult, Span<'a>> + 'b + where + F: Parser, Span<'a>, nom::error::Error>> + 'b, + { + move |input| { + recognize(many1_count(tuple(( + not(tuple((space1, eof))), + not(tuple((space1, tag("\"")))), + not(tuple((opt(space1), |input| terminator.parse(input)))), + alt(( + |input| self.url_chars_base(input), + if spaces { space1 } else { fail }, + )), + ))))(input) + } + } +} + +#[cfg(test)] +mod test { + use crate::{Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT}; + use nom::bytes::complete::tag; + use std::collections::HashMap; + + fn parse_full(string: &str) -> Token { + Context::default() + .full(Span::new_extra(string, SpanMeta::default())) + .unwrap() + .1 + .merged() + } + + #[test] + fn parse_url_chars() { + let ctx = Context::default(); + + assert_eq!( + ctx.url_chars(tag(")"), true)(Span::new_extra( + "https://en.wikipedia.org/wiki/Sandbox_(computer_security))", + SpanMeta::default() + )) + .unwrap() + .1 + .into_fragment(), + "https://en.wikipedia.org/wiki/Sandbox_(computer_security)" + ); + + assert_eq!( + ctx.url_chars(tag(")"), true)(Span::new_extra( + 
"https://en.wikipedia.org/wiki/Sandbox_(computer_security)))", + SpanMeta::default() + )) + .unwrap() + .1 + .into_fragment(), + "https://en.wikipedia.org/wiki/Sandbox_(computer_security)", + ); + + assert_eq!( + ctx.url_chars(tag(")"), true)(Span::new_extra( + "https://cs.wikipedia.org/wiki/Among_Us ", + SpanMeta::default() + )) + .unwrap() + .1 + .into_fragment(), + "https://cs.wikipedia.org/wiki/Among_Us", + ); + + assert_eq!( + ctx.url_chars(tag(")"), true)(Span::new_extra( + "https://cs.wikipedia.org/wiki/Among Us )", + SpanMeta::default() + )) + .unwrap() + .1 + .into_fragment(), + "https://cs.wikipedia.org/wiki/Among Us" + ); + + assert_eq!( + ctx.url_chars(tag(")"), false)(Span::new_extra( + "https://en.wikipedia.org/wiki/Among Us )", + SpanMeta::default() + )) + .unwrap() + .1 + .into_fragment(), + "https://en.wikipedia.org/wiki/Among" + ); + } + + #[test] + fn parse_formatting() { + assert_eq!( + parse_full(r#"~~stikethrough~~"#), + Token::Strikethrough(Box::new(Token::PlainText("stikethrough".into()))), + ); + + assert_eq!( + parse_full(r#"**bold**"#), + Token::Bold(Box::new(Token::PlainText("bold".into()))), + ); + + assert_eq!( + parse_full(r#"*italic*"#), + Token::Italic(Box::new(Token::PlainText("italic".into()))), + ); + + assert_eq!( + parse_full(r#"* italic *"#), + Token::PlainText("* italic *".into()) + ); + + assert_eq!( + parse_full("snake_case_variable"), + Token::PlainText("snake_case_variable".into()) + ); + + assert_eq!( + parse_full("intra*word*italic"), + Token::Sequence(vec![ + Token::PlainText("intra".into()), + Token::Italic(Box::new(Token::PlainText("word".into()))), + Token::PlainText("italic".into()) + ]) + ); + + assert_eq!( + parse_full(r#"_ italic *"#), + Token::PlainText("_ italic *".into()) + ); + + assert_eq!( + parse_full(r#"*"italic"*"#), + Token::Italic(Box::new(Token::PlainText("\"italic\"".into()))) + ); + + assert_eq!( + parse_full(r#"not code `code` also not code"#), + Token::Sequence(vec![ + Token::PlainText("not code ".into()), + Token::InlineCode("code".into()), + Token::PlainText(" also not code".into()) + ]), + ); + + assert_eq!( + parse_full(r#"not code `code` also `not code"#), + Token::Sequence(vec![ + Token::PlainText("not code ".into()), + Token::InlineCode("code".into()), + Token::PlainText(" also `not code".into()) + ]), + ); + + assert_eq!( + parse_full(r#"not code `*not bold*` also not code"#), + Token::Sequence(vec![ + Token::PlainText("not code ".into()), + Token::InlineCode("*not bold*".into()), + Token::PlainText(" also not code".into()) + ]), + ); + + assert_eq!( + parse_full(r#"***bold italic***"#), + Token::BoldItalic(Box::new(Token::PlainText("bold italic".into()))) + ); + + assert_eq!( + parse_full(r#"bold italic"#), + Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText( + "bold italic".into() + ))))) + ); + } + + #[test] + fn parse_complex() { + assert_eq!( + parse_full( + r#"
<center>centered +🦋🏳️‍⚧️ +text</center>
"# + ), + Token::Center(Box::new(Token::Sequence(vec![ + Token::PlainText("centered\n".into()), + Token::UnicodeEmoji("🦋".into()), + Token::UnicodeEmoji("🏳️‍⚧️".into()), + Token::PlainText("\ntext".into()) + ]))) + ); + + assert_eq!( + parse_full( + r#">
<center>centered +> 👩🏽‍🤝‍👩🏼 +> text</center>
"# + ), + Token::Quote(Box::new(Token::Center(Box::new(Token::Sequence(vec![ + Token::PlainText("centered\n".into()), + Token::UnicodeEmoji("👩🏽‍🤝‍👩🏼".into()), + Token::PlainText("\ntext".into()) + ]))))), + ); + + assert_eq!( + parse_full(r#"$[x2 $[sparkle 🥺]💜$[spin.y,speed=5s ❤️]🦊]"#), + Token::Function { + name: "x2".into(), + params: HashMap::new(), + inner: Box::new(Token::Sequence(vec![ + Token::Function { + name: "sparkle".into(), + params: HashMap::new(), + inner: Box::new(Token::UnicodeEmoji("🥺".into())), + }, + Token::UnicodeEmoji("💜".into()), + Token::Function { + name: "spin".into(), + params: { + let mut params = HashMap::new(); + params.insert("y".into(), None); + params.insert("speed".into(), Some("5s".into())); + params + }, + inner: Box::new(Token::UnicodeEmoji("❤️".into())), + }, + Token::UnicodeEmoji("🦊".into()), + ])) + }, + ); + + assert_eq!( + parse_full(r#"bold @tag1 @tag2 italic"#), + Token::Sequence(vec![ + Token::PlainText("bold ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag1".into(), + host: None + }, + Token::PlainText(" ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag2".into(), + host: None + }, + Token::PlainText(" italic".into()) + ]), + ); + + assert_eq!( + parse_full( + r#" +> test +> +> italic +> +>> Nested quote +"# + ), + Token::Quote(Box::new(Token::Sequence(vec![ + Token::PlainText("test\n".into()), + Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))), + Token::Quote(Box::new(Token::PlainText("Nested quote".into()))) + ]))), + ); + } + + #[test] + fn parse_link() { + assert_eq!( + parse_full("IPv4 test: "), + Token::Sequence(vec![ + Token::PlainText("IPv4 test: ".into()), + Token::UrlNoEmbed("https://0".into()) + ]) + ); + + assert_eq!( + parse_full("IPv4 test: "), + Token::Sequence(vec![ + Token::PlainText("IPv4 test: ".into()), + Token::UrlNoEmbed("https://127.0.0.1".into()) + ]) + ); + + assert_eq!( + parse_full("IPv6 test: "), + Token::Sequence(vec![ + Token::PlainText("IPv6 test: ".into()), + Token::UrlNoEmbed("https://[::2f:1]/nya".into()) + ]) + ); + + assert_eq!( + parse_full("IPv6 test: https://[::2f:1]/nya"), + Token::Sequence(vec![ + Token::PlainText("IPv6 test: ".into()), + Token::UrlRaw("https://[::2f:1]/nya".into()) + ]) + ); + + // IDNs + assert_eq!( + parse_full("IDN test: https://www.háčkyčárky.cz/"), + Token::Sequence(vec![ + Token::PlainText("IDN test: ".into()), + Token::UrlRaw("https://www.háčkyčárky.cz/".into()) + ]) + ); + + assert_eq!( + parse_full("Link test: [label](https://example.com)"), + Token::Sequence(vec![ + Token::PlainText("Link test: ".into()), + Token::Link { + label: Box::new(Token::PlainText("label".into())), + href: "https://example.com".into(), + embed: true + } + ]) + ); + + assert_eq!( + parse_full("test #hashtag tail"), + Token::Sequence(vec![ + Token::PlainText("test ".into()), + Token::Hashtag("hashtag".into()), + Token::PlainText(" tail".into()) + ]) + ); + + assert_eq!( + parse_full("not#hashtag tail"), + Token::PlainText("not#hashtag tail".into()) + ); + + assert_eq!( + parse_full(""), + Token::UrlNoEmbed("https://example.com".into()) + ); + + // Adjacent links okay + assert_eq!( + parse_full(""), + Token::Sequence(vec![ + Token::UrlNoEmbed("https://example.com/".into()), + Token::UrlNoEmbed("https://awawa.gay/".into()) + ]) + ); + + assert_eq!( + parse_full("Link test: ?[label](https://awawa.gay)"), + Token::Sequence(vec![ + Token::PlainText("Link test: ".into()), + Token::Link { + label: 
Box::new(Token::PlainText("label".into())), + href: "https://awawa.gay".into(), + embed: false + } + ]) + ); + + assert_eq!( + parse_full("Link test: ?[label](https://awawa.gay)test"), + Token::Sequence(vec![ + Token::PlainText("Link test: ".into()), + Token::Link { + label: Box::new(Token::PlainText("label".into())), + href: "https://awawa.gay".into(), + embed: false + }, + Token::PlainText("test".into()) + ]) + ); + + assert_eq!( + parse_full("Link test: (?[label](https://awawa.gay))"), + Token::Sequence(vec![ + Token::PlainText("Link test: (".into()), + Token::Link { + label: Box::new(Token::PlainText("label".into())), + href: "https://awawa.gay".into(), + embed: false + }, + Token::PlainText(")".into()) + ]) + ); + + assert_eq!( + parse_full("Link test: ?[label](https://awawa.gay"), // Missing closing bracket + Token::Sequence(vec![ + Token::PlainText("Link test: ?[label](".into()), + Token::UrlRaw("https://awawa.gay".into()), + ]) + ); + } + + #[test] + fn limit_nesting() { + let mut tok = Token::PlainText(" test ".into()); + for _ in 0..DEFAULT_DEPTH_LIMIT { + tok = Token::Bold(Box::new(tok)); + } + + assert_eq!( + parse_full( + &("".repeat(DEFAULT_DEPTH_LIMIT) + + " test " + + &*"".repeat(DEFAULT_DEPTH_LIMIT)) + ), + tok + ); + } + + #[test] + fn parse_mention() { + assert_eq!( + parse_full("@tag"), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: None + } + ); + + assert_eq!( + parse_full("email@notactuallyamenmtion.org"), + Token::PlainText("email@notactuallyamenmtion.org".into()) + ); + + assert_eq!( + parse_full("hgsjlkdsa @tag fgahjsdkd"), + Token::Sequence(vec![ + Token::PlainText("hgsjlkdsa ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: None + }, + Token::PlainText(" fgahjsdkd".into()) + ]) + ); + + assert_eq!( + parse_full("hgsjlkdsa @tag@ fgahjsdkd"), + Token::Sequence(vec![ + Token::PlainText("hgsjlkdsa ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: None + }, + Token::PlainText("@ fgahjsdkd".into()) + ]) + ); + + assert_eq!( + parse_full("aaaa @tag@domain bbbbb"), + Token::Sequence(vec![ + Token::PlainText("aaaa ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: Some("domain".into()) + }, + Token::PlainText(" bbbbb".into()) + ]) + ); + + assert_eq!( + parse_full("test @tag@domain, test"), + Token::Sequence(vec![ + Token::PlainText("test ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: Some("domain".into()) + }, + Token::PlainText(", test".into()) + ]) + ); + + assert_eq!( + parse_full("test @tag@domain.gay. test"), + Token::Sequence(vec![ + Token::PlainText("test ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: Some("domain.gay".into()) + }, + Token::PlainText(". test".into()) + ]) + ); + + assert_eq!( + parse_full("test @tag@domain? test"), + Token::Sequence(vec![ + Token::PlainText("test ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: Some("domain".into()) + }, + Token::PlainText("? 
test".into()) + ]) + ); + + assert_eq!( + parse_full("test !tag@domain.com test"), + Token::Sequence(vec![ + Token::PlainText("test ".into()), + Token::Mention { + mention_type: crate::MentionType::Community, + name: "tag".into(), + host: Some("domain.com".into()) + }, + Token::PlainText(" test".into()) + ]) + ); + } + + #[test] + fn parse_shortcodes() { + assert_eq!( + parse_full(":bottom:"), + Token::ShortcodeEmoji("bottom".into()) + ); + + assert_eq!( + parse_full(":bottom::blobfox:"), + Token::Sequence(vec![ + Token::ShortcodeEmoji("bottom".into()), + Token::ShortcodeEmoji("blobfox".into()) + ]) + ); + + assert_eq!( + parse_full(":bottom:blobfox"), + Token::PlainText(":bottom:blobfox".into()) + ); + + assert_eq!( + parse_full("bottom:blobfox:"), + Token::PlainText("bottom:blobfox:".into()) + ); + } + + #[test] + fn parse_emoji() { + assert_eq!( + parse_full("🥺💜❤️🦊"), + Token::Sequence( + vec!["🥺", "💜", "❤️", "🦊"] + .into_iter() + .map(str::to_string) + .map(Token::UnicodeEmoji) + .collect::>() + ) + ); + + // Trans flag, ZWJ + assert_eq!( + parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}"), + Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}".into()) + ); + + assert_eq!( + parse_full("\u{0200d}\u{1f3f3}\u{0fe0f}"), + Token::Sequence(vec![ + Token::PlainText("\u{0200d}".into()), // ZWJ + Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag + ]) + ); + + // Trans flag, ZWNJ + assert_eq!( + parse_full("\u{1f3f3}\u{0fe0f}\u{0200c}\u{026a7}\u{0fe0f}"), + Token::Sequence(vec![ + Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag + Token::PlainText("\u{0200c}".into()), // ZWNJ + Token::UnicodeEmoji("\u{026a7}\u{0fe0f}".into()) // Trans symbol + ]) + ); + + assert_eq!( + parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{0200d}\u{0200d}"), + Token::Sequence(vec![ + Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag + Token::PlainText("\u{0200d}\u{0200d}\u{0200d}".into()), // ZWJ + ]) + ); + } +}