use std::collections::HashMap;
use std::convert::{identity, Infallible};
use std::io::{Cursor, Write};
use std::marker::PhantomData;

use compact_str::{CompactString, ToCompactString};
use either::Either;
use nom::{IResult, Offset, Parser, Slice};
use nom::branch::alt;
use nom::bytes::complete::{tag, tag_no_case};
use nom::character::complete::{
    alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of,
    satisfy, space1, tab,
};
use nom::combinator::{eof, fail, map, not, opt, peek, recognize};
use nom::error::ErrorKind;
use nom::multi::{many0_count, many1, many1_count, many_till, separated_list1};
use nom::sequence::tuple;
use nom_locate::LocatedSpan;
use quick_xml::events::{BytesText, Event};
use serde::{Deserialize, Serialize};
use strum::IntoStaticStr;
use tracing::trace;
use unicode_segmentation::UnicodeSegmentation;

#[derive(Copy, Clone, Debug, Eq, PartialEq, Deserialize, Serialize, IntoStaticStr)]
// The alternative would be to implement a serde serializer for this one enum, but that's disgusting
#[strum(serialize_all = "snake_case")]
#[serde(rename_all = "snake_case")]
pub enum MentionType {
    Community,
    User,
    MatrixUser,
}

impl MentionType {
    pub fn to_char(&self) -> char {
        match self {
            MentionType::Community => '!',
            MentionType::User => '@',
            MentionType::MatrixUser => ':',
        }
    }

    pub fn separator(&self) -> char {
        match self {
            MentionType::Community | MentionType::User => '@',
            MentionType::MatrixUser => ':',
        }
    }
}

#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)]
pub enum Token {
    PlainText(CompactString),
    Sequence(Vec<Token>),
    Quote(Box<Token>),
    Small(Box<Token>),
    BoldItalic(Box<Token>),
    Bold(Box<Token>),
    Italic(Box<Token>),
    Center(Box<Token>),
    Strikethrough(Box<Token>),
    PlainTag(String),
    InlineCode(String),
    InlineMath(String),
    UrlRaw(String),
    UrlNoEmbed(String),
    Link {
        label: Box<Token>,
        href: String,
        embed: bool,
    },
    BlockCode {
        lang: Option<String>,
        inner: String,
    },
    BlockMath(String),
    Function {
        name: String,
        params: HashMap<String, Option<String>>,
        inner: Box<Token>,
    },
    Mention {
        name: String,
        host: Option<String>,
        mention_type: MentionType,
    },
    UnicodeEmoji(String),
    ShortcodeEmoji {
        shortcode: String,
        host: Option<String>,
    },
    Hashtag(String),
}

impl Token {
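    // The two edge helpers below return the plain-text content at the left/right
    // edge of a token tree; `tag_delimited` uses them for its flanking checks.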
    fn str_content_left(&self) -> Option<&str> {
        match self {
            Token::PlainText(text) => Some(text.as_ref()),
            Token::Sequence(tokens) => tokens.first().and_then(Token::str_content_left),
            Token::Quote(inner) => inner.str_content_left(),
            Token::Small(inner) => inner.str_content_left(),
            Token::BoldItalic(inner) => inner.str_content_left(),
            Token::Bold(inner) => inner.str_content_left(),
            Token::Italic(inner) => inner.str_content_left(),
            Token::Center(inner) => inner.str_content_left(),
            Token::Strikethrough(inner) => inner.str_content_left(),
            Token::PlainTag(tag) => Some(tag.as_ref()),
            Token::UrlRaw(url) => Some(url.as_ref()),
            Token::UrlNoEmbed(url) => Some(url.as_ref()),
            Token::Link { label, .. } => label.str_content_left(),
            Token::Function { inner, .. } => inner.str_content_left(),
            Token::Mention { name, .. } => Some(name.as_ref()),
            Token::UnicodeEmoji(code) => Some(code.as_ref()),
            Token::Hashtag(tag) => Some(tag.as_ref()),
            _ => None,
        }
    }

    fn str_content_right(&self) -> Option<&str> {
        match self {
            Token::PlainText(text) => Some(text.as_ref()),
            Token::Sequence(tokens) => tokens.last().and_then(Token::str_content_right),
            Token::Quote(inner) => inner.str_content_right(),
            Token::Small(inner) => inner.str_content_right(),
            Token::BoldItalic(inner) => inner.str_content_right(),
            Token::Bold(inner) => inner.str_content_right(),
            Token::Italic(inner) => inner.str_content_right(),
            Token::Center(inner) => inner.str_content_right(),
            Token::Strikethrough(inner) => inner.str_content_right(),
            Token::PlainTag(tag) => Some(tag.as_ref()),
            Token::UrlRaw(url) => Some(url.as_ref()),
            Token::UrlNoEmbed(url) => Some(url.as_ref()),
            Token::Link { label, .. } => label.str_content_right(),
            Token::Function { inner, .. } => inner.str_content_right(),
            Token::Mention { name, .. } => Some(name.as_ref()),
            Token::UnicodeEmoji(code) => Some(code.as_ref()),
            Token::Hashtag(tag) => Some(tag.as_ref()),
            _ => None,
        }
    }

    fn inner(&self) -> Token {
        match self {
            plain @ Token::PlainText(_) => plain.clone(),
            sequence @ Token::Sequence(_) => sequence.clone(),
            Token::Quote(inner) => inner.inner(),
            Token::Small(inner) => inner.inner(),
            Token::BoldItalic(inner) => inner.inner(),
            Token::Bold(inner) => inner.inner(),
            Token::Italic(inner) => inner.inner(),
            Token::Center(inner) => inner.inner(),
            Token::Strikethrough(inner) => inner.inner(),
            Token::PlainTag(text) => Token::PlainText(text.clone().into()),
            Token::InlineCode(code) => Token::PlainText(code.clone().into()),
            Token::InlineMath(math) => Token::PlainText(math.clone().into()),
            Token::UrlRaw(url) => Token::PlainText(url.clone().into()),
            Token::UrlNoEmbed(url) => Token::PlainText(url.clone().into()),
            Token::Link { label, .. } => label.inner(),
            Token::BlockCode { inner, .. } => Token::PlainText(inner.clone().into()),
            Token::BlockMath(math) => Token::PlainText(math.clone().into()),
            Token::Function { inner, .. } => inner.inner(),
            Token::Mention { name, .. } => Token::PlainText(name.clone().into()),
            Token::UnicodeEmoji(code) => Token::PlainText(code.clone().into()),
            Token::ShortcodeEmoji { shortcode, .. } => Token::PlainText(shortcode.clone().into()),
            Token::Hashtag(tag) => Token::PlainText(tag.clone().into()),
        }
    }
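    /// Recursively merges adjacent [`Token::PlainText`] items and flattens nested
    /// [`Token::Sequence`]s, collapsing single-item sequences into the item itself.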
    fn merged(&self) -> Token {
        match self {
            Token::Sequence(tokens) => {
                let tokens_multi = tokens.iter().fold(Vec::new(), |mut acc, tok| {
                    if let Some(Token::PlainText(last)) = acc.last_mut() {
                        if let Token::PlainText(tok_text) = tok {
                            *last += tok_text.as_ref();
                            return acc;
                        }
                    }

                    if let Token::Sequence(seq) = tok {
                        let items = seq.iter().map(Token::merged).flat_map(|t| match t {
                            Token::Sequence(seq) => Either::Left(seq.into_iter()),
                            other => Either::Right(std::iter::once(other)),
                        });

                        for item in items {
                            if let Some(Token::PlainText(last)) = acc.last_mut() {
                                if let Token::PlainText(tok_text) = item {
                                    *last += tok_text.as_ref();
                                    continue;
                                }
                            }

                            acc.push(item);
                        }

                        return acc;
                    }

                    acc.push(tok.merged());
                    acc
                });

                if tokens_multi.len() == 1 {
                    return tokens_multi.into_iter().next().unwrap();
                }

                Token::Sequence(tokens_multi)
            }
            Token::Quote(inner) => Token::Quote(Box::new(inner.merged())),
            Token::Small(inner) => Token::Small(Box::new(inner.merged())),
            Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.merged())),
            Token::Bold(inner) => Token::Bold(Box::new(inner.merged())),
            Token::Italic(inner) => Token::Italic(Box::new(inner.merged())),
            Token::Center(inner) => Token::Center(Box::new(inner.merged())),
            Token::Strikethrough(inner) => Token::Strikethrough(Box::new(inner.merged())),
            Token::Link { embed, label, href } => Token::Link {
                label: Box::new(label.merged()),
                href: href.clone(),
                embed: *embed,
            },
            Token::Function {
                name,
                params,
                inner,
            } => Token::Function {
                name: name.clone(),
                params: params.clone(),
                inner: Box::new(inner.merged()),
            },
            other => other.clone(),
        }
    }

    pub fn walk_map_collect<T>(&self, func: &impl Fn(&Token) -> Option<T>, out: &mut Vec<T>) {
        if let Some(v) = func(self) {
            out.push(v)
        }

        match self {
            Token::Sequence(items) => {
                items.iter().for_each(|tok| tok.walk_map_collect(func, out));
            }
            Token::Quote(inner)
            | Token::Small(inner)
            | Token::BoldItalic(inner)
            | Token::Bold(inner)
            | Token::Italic(inner)
            | Token::Center(inner)
            | Token::Function { inner, .. }
            | Token::Link { label: inner, .. }
            | Token::Strikethrough(inner) => inner.walk_map_collect(func, out),
            _ => {}
        }
    }

    pub fn walk_speech_transform(&mut self, func: &impl Fn(&mut CompactString)) {
        match self {
            Token::Sequence(items) => {
                items
                    .iter_mut()
                    .for_each(|tok| tok.walk_speech_transform(func));
            }
            Token::Small(inner)
            | Token::BoldItalic(inner)
            | Token::Bold(inner)
            | Token::Italic(inner)
            | Token::Center(inner)
            | Token::Function { inner, .. }
            | Token::Strikethrough(inner) => inner.walk_speech_transform(func),
            Token::PlainText(text) => func(text),
            _ => {}
        }
    }
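    /// Serializes this token as XML events on `writer`; used by [`to_xml_string`].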
    fn write<T: Write>(&self, writer: &mut quick_xml::Writer<T>) -> quick_xml::Result<()> {
        match self {
            Token::PlainText(plain) => {
                writer.write_event(Event::Text(BytesText::new(plain.as_str())))?;
            }
            Token::Sequence(sequence) => {
                sequence.iter().try_for_each(|item| item.write(writer))?;
            }
            Token::Quote(inner) => {
                writer
                    .create_element("quote")
                    .write_inner_content(|w| inner.write(w))?;
            }
            Token::Small(inner) => {
                writer
                    .create_element("small")
                    .write_inner_content(|w| inner.write(w))?;
            }
            Token::BoldItalic(inner) => {
                writer
                    .create_element("b")
                    .write_inner_content::<_, quick_xml::Error>(|w| {
                        w.create_element("i")
                            .write_inner_content(|w| inner.write(w))?;

                        Ok(())
                    })?;
            }
            Token::Bold(inner) => {
                writer
                    .create_element("b")
                    .write_inner_content(|w| inner.write(w))?;
            }
            Token::Italic(inner) => {
                writer
                    .create_element("i")
                    .write_inner_content(|w| inner.write(w))?;
            }
            Token::Center(inner) => {
                writer
                    .create_element("center")
                    .write_inner_content(|w| inner.write(w))?;
            }
            Token::Strikethrough(inner) => {
                writer
                    .create_element("s")
                    .write_inner_content(|w| inner.write(w))?;
            }
            Token::PlainTag(plain) => {
                writer.write_event(Event::Text(BytesText::new(plain.as_str())))?;
            }
            Token::InlineCode(code) => {
                writer
                    .create_element("inline-code")
                    .write_text_content(BytesText::new(code))?;
            }
            Token::InlineMath(math) => {
                writer
                    .create_element("inline-math")
                    .write_text_content(BytesText::new(math))?;
            }
            Token::UrlRaw(url) => {
                writer
                    .create_element("a")
                    .with_attribute(("href", url.as_str()))
                    .write_text_content(BytesText::new(url))?;
            }
            Token::UrlNoEmbed(url) => {
                writer
                    .create_element("a")
                    .with_attribute(("href", url.as_str()))
                    .with_attribute(("embed", "false"))
                    .write_text_content(BytesText::new(url))?;
            }
            Token::Link { label, href, embed } => {
                writer
                    .create_element("a")
                    .with_attribute(("href", href.as_str()))
                    .with_attribute(("embed", if *embed { "true" } else { "false" }))
                    .write_inner_content(|w| label.write(w))?;
            }
            Token::BlockCode { inner, lang } => {
                let mut ew = writer.create_element("code");

                if let Some(language) = lang {
                    ew = ew.with_attribute(("lang", language.as_str()));
                }

                ew.write_text_content(BytesText::new(inner))?;
            }
            Token::BlockMath(math) => {
                writer
                    .create_element("math")
                    .write_text_content(BytesText::new(math))?;
            }
            Token::Function {
                inner,
                name,
                params,
            } => {
                let mut ew = writer
                    .create_element("fn")
                    .with_attribute(("name", name.as_str()));

                for (k, v) in params {
                    ew = ew
                        .with_attribute((format!("arg-{k}").as_str(), v.as_deref().unwrap_or("")));
                }

                ew.write_inner_content(|w| inner.write(w))?;
            }
            Token::Mention {
                name,
                host,
                mention_type,
            } => {
                let mut ew = writer
                    .create_element("mention")
                    .with_attribute(("name", name.as_str()))
                    .with_attribute(("type", mention_type.into()));

                if let Some(host) = host {
                    ew = ew.with_attribute(("host", host.as_str()));
                }

                ew.write_empty()?;
            }
            Token::UnicodeEmoji(text) => {
                writer
                    .create_element("ue")
                    .write_text_content(BytesText::new(text))?;
            }
            Token::ShortcodeEmoji { shortcode, host } => {
                let mut ew = writer.create_element("ee");

                if let Some(host) = host {
                    ew = ew.with_attribute(("host", host.as_str()));
                }

                ew.write_text_content(BytesText::new(shortcode))?;
            }
            Token::Hashtag(tag) => {
                writer
                    .create_element("hashtag")
                    .write_text_content(BytesText::new(tag.as_str()))?;
            }
        }

        Ok(())
    }
}
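/// Serializes a [`Token`] tree as XML, wrapped in a root `<mmm>` element.
///
/// A minimal round-trip sketch (the expected output shape follows the
/// `xml_serialization` test at the bottom of this file):
///
/// ```ignore
/// let tree = Context::default().parse_full("**hi**");
/// // Bold is serialized as <b>…</b> inside the <mmm> root.
/// assert_eq!(to_xml_string(&tree).unwrap(), "<mmm><b>hi</b></mmm>");
/// ```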
pub fn to_xml_string(token: &Token) -> quick_xml::Result<String> {
    let mut writer = quick_xml::Writer::new(Cursor::new(Vec::new()));

    writer
        .create_element("mmm")
        .write_inner_content(|writer| token.write(writer))?;

    Ok(String::from_utf8(writer.into_inner().into_inner())?)
}

pub fn janky_is_line_begin(input: Span<'_>) -> bool {
    let offset = input.location_offset();

    // VERY BAD
    // Safety: This is very janky, but hopefully will work as long as nom-locate keeps the
    // invariant of fragments being subslices of the input. We do this to avoid scanning the
    // entire input for a line separator when we just need the previous byte.
    offset == 0
        || unsafe {
            let frag_bytes = input.fragment().as_bytes();
            let frag_ptr = frag_bytes.as_ptr();
            let prev_byte = frag_ptr.offset(-1);
            matches!(*prev_byte, b'\n')
        }
}

#[derive(Debug, Default, Copy, Clone)]
pub struct SpanMeta {
    depth: usize,
}

impl SpanMeta {
    fn new(depth: usize) -> Self {
        Self { depth }
    }
}

type Span<'a> = LocatedSpan<&'a str, SpanMeta>;

trait SliceOffset {
    fn up_to(&self, other: &Self) -> Self;

    fn fragment_between<'a>(&self, other: &Self) -> &'a str
    where
        Self: 'a;
}

impl SliceOffset for Span<'_> {
    fn up_to(&self, other: &Self) -> Self {
        self.slice(..self.offset(other))
    }

    fn fragment_between<'a>(&self, other: &Self) -> &'a str
    where
        Self: 'a,
    {
        self.up_to(other).into_fragment()
    }
}

#[inline]
fn boxing_token(func: impl Fn(Box<Token>) -> Token) -> impl Fn(Token) -> Token {
    move |tokens| func(Box::new(tokens))
}

#[inline]
fn collect_sequence<T>(
    func: impl Fn(Vec<T>) -> Token,
    transform: impl Fn(Token) -> Token,
) -> impl Fn(&mut dyn Iterator<Item = T>) -> Token {
    move |tokens| transform(func(tokens.collect()))
}

#[inline]
fn collect_char_sequence(
    func: impl Fn(String) -> Token,
) -> impl Fn(&mut dyn Iterator<Item = char>) -> Token {
    move |chars| func(chars.collect())
}

#[inline]
fn space1_unicode(input: Span) -> IResult<Span, Span> {
    recognize(many1_count(tuple((
        not(line_ending),
        satisfy(char::is_whitespace),
    ))))(input)
}

#[inline]
fn alphanumeric1_unicode(input: Span) -> IResult<Span, Span> {
    recognize(many1_count(char_alphanumeric_unicode))(input)
}

#[inline]
fn char_alphanumeric_unicode(input: Span) -> IResult<Span, char> {
    satisfy(char::is_alphanumeric)(input)
}
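// `spliced` re-parses a set of span fragments (e.g. the stripped bodies of "> " quote
// lines) as if they were one contiguous input joined by newlines, then maps the offsets
// of the result (and of any parse error) back onto the original parent span.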
fn spliced<'a>(
    segments: &[Span<'a>],
    func: impl Fn(Span) -> IResult<Span, Token>,
    parent: Span<'a>,
) -> IResult<Span<'a>, Token, nom::error::Error<Span<'a>>> {
    let combined = segments
        .iter()
        .copied()
        .map(Span::into_fragment)
        .collect::<Vec<_>>()
        .join("\n");
    let cum_offset_combined = segments
        .iter()
        .scan(0, |acc, &x| {
            *acc += x.len();
            Some(*acc)
        })
        .collect::<Vec<_>>();
    let current_seg = |input: Span| {
        cum_offset_combined
            .iter()
            .enumerate()
            .take_while(|(_, &o)| o > input.location_offset())
            .map(|(i, o)| (segments[i], o))
            .last()
    };

    type NE<E> = nom::Err<E>;
    type NomError<'x> = nom::error::Error<Span<'x>>;

    let spliced_span = Span::new_extra(
        &combined,
        segments.first().map_or(SpanMeta::new(0), |s| s.extra),
    );

    let (input, inner) = match func(spliced_span) {
        Ok(s) => s,
        Err(e) => {
            return match e {
                NE::Error(e) => {
                    let offset_new = e.input.location_offset();
                    if let Some((seg_parent, offset_seg_new)) = current_seg(e.input) {
                        let offset = offset_new - offset_seg_new;
                        let offset_orig = offset + seg_parent.location_offset();
                        Err(NE::Error(NomError::new(
                            Span::new_extra(
                                &parent.into_fragment()[offset_orig..],
                                seg_parent.extra,
                            ),
                            e.code,
                        )))
                    } else {
                        // ???
                        Err(NE::Failure(NomError::new(parent, ErrorKind::Fail)))
                    }
                }
                NE::Failure(e) => Err(NE::Error(NomError::new(parent, e.code))),
                NE::Incomplete(i) => Err(NE::Incomplete(i)),
            };
        }
    };

    let out = if let Some((seg_parent, offset_seg_new)) = current_seg(input) {
        let offset = input.location_offset() - offset_seg_new;
        let offset_orig = offset + seg_parent.location_offset();
        parent.slice(offset_orig..)
    } else {
        parent
    };

    Ok((out, inner))
}

fn space(input: Span) -> IResult<Span, Token> {
    let (input, frag) = recognize(alt((one_char('\u{0020}'), one_char('\u{3000}'), tab)))(input)?;

    Ok((input, Token::PlainText(frag.into_fragment().into())))
}

#[derive(Copy, Clone)]
struct Matcher<'a, 'b, T: Clone> {
    matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a),
    collector: &'a (dyn Fn(&mut dyn Iterator<Item = T>) -> Token + 'a),
    _phantom_closure: PhantomData<&'a ()>,
    _phantom_data: PhantomData<&'b ()>,
    _phantom_output: PhantomData<fn() -> T>,
}

impl<'a, 'b, T: Clone> Matcher<'a, 'b, T> {
    fn new(
        matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a),
        collector: &'a (dyn Fn(&mut dyn Iterator<Item = T>) -> Token + 'a),
    ) -> Self {
        Self {
            matcher_inner,
            collector,
            _phantom_closure: PhantomData,
            _phantom_data: PhantomData,
            _phantom_output: PhantomData,
        }
    }
}

impl<'a, 'b> Matcher<'a, 'b, Infallible> {
    // Don't break this invariant, else a monster will come at night and eat all your socks
    fn reject() -> Self {
        Self {
            matcher_inner: &fail::<_, Infallible, _>,
            collector: &|_| unreachable!(),
            _phantom_closure: PhantomData,
            _phantom_data: PhantomData,
            _phantom_output: PhantomData,
        }
    }
}

#[derive(Copy, Clone, Debug)]
enum FlankingRule {
    Lenient,
    Strict,
    DontCare,
}

struct FlankingDelim<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>>(
    T,
    FlankingRule,
    PhantomData<&'a ()>,
);

impl<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>> From<(T, FlankingRule)>
    for FlankingDelim<'a, T>
{
    fn from((func, rule): (T, FlankingRule)) -> Self {
        FlankingDelim(func, rule, PhantomData)
    }
}

impl<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>> From<T> for FlankingDelim<'a, T> {
    fn from(func: T) -> Self {
        FlankingDelim(func, FlankingRule::DontCare, PhantomData)
    }
}

pub struct Context {
    depth_limit: usize,
}

const DEFAULT_DEPTH_LIMIT: usize = 24;

impl Default for Context {
    fn default() -> Self {
        Context::new(DEFAULT_DEPTH_LIMIT)
    }
}

impl Context {
    pub fn new(depth_limit: usize) -> Self {
        Self { depth_limit }
    }

    pub fn parse_full(&self, input: &str) -> Token {
        match self.full(Span::new_extra(input, SpanMeta::default())) {
            Ok((_, t)) => t.merged(),
            Err(e) => {
                trace!(input = input, "Full parser fail: {:?}", e);
                Token::PlainText(e.to_compact_string())
            }
        }
    }

    pub fn parse_inline(&self, input: &str) -> Token {
        match self.inline(Span::new_extra(input, SpanMeta::default())) {
            Ok((_, t)) => t.merged(),
            Err(e) => {
                trace!(input = input, "Inline parser fail: {:?}", e);
                Token::PlainText(e.to_compact_string())
            }
        }
    }

    pub fn parse_ui(&self, input: &str) -> Token {
        match self.inline_ui(Span::new_extra(input, SpanMeta::default())) {
            Ok((_, t)) => t.merged(),
            Err(e) => {
                trace!(input = input, "Inline parser fail: {:?}", e);
                Token::PlainText(e.to_compact_string())
            }
        }
    }

    pub fn parse_profile_fields(&self, input: &str) -> Token {
        match self.inline_profile_fields(Span::new_extra(input, SpanMeta::default())) {
            Ok((_, t)) => t.merged(),
            Err(e) => {
                trace!(input = input, "Profile field parser fail: {:?}", e);
                Token::PlainText(e.to_compact_string())
            }
        }
    }

    #[inline]
    fn partial(
        &self,
        func: impl for<'a> Fn(&Self, Span<'a>) -> IResult<Span<'a>, Token> + 'static,
    ) -> impl for<'a> Fn(Span<'a>) -> IResult<Span<'a>, Token> + '_ {
        move |input| func(self, input)
    }

    #[inline]
    fn partial_span(
        &self,
        func: impl for<'a> Fn(&Self, Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'static,
    ) -> impl for<'a> Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>> + '_ {
        move |input| func(self, input)
    }
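    // Entry points: `full` accepts every construct, `inline` everything except
    // block-level constructs, and `inline_label_safe` additionally excludes URLs,
    // links, mentions and hashtags so link labels cannot nest interactive content.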
    pub fn full<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        map(
            many_till(self.partial(Self::full_single), eof).map(|v| v.0),
            Token::Sequence,
        )(input)
    }

    pub fn inline<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        map(
            many_till(self.partial(Self::inline_single), eof).map(|v| v.0),
            Token::Sequence,
        )(input)
    }

    pub fn inline_label_safe<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        map(
            many_till(self.partial(Self::inline_label_safe_single), eof).map(|v| v.0),
            Token::Sequence,
        )(input)
    }

    fn inline_profile_fields<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        map(
            many_till(
                alt((
                    self.partial(Self::unicode_emoji),
                    self.partial(Self::tag_mention),
                    self.partial(Self::tag_hashtag),
                    self.partial(Self::raw_url),
                    self.partial(Self::tag_raw_text),
                )),
                eof,
            )
            .map(|v| v.0),
            Token::Sequence,
        )(input)
    }

    fn inline_ui<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        map(
            many_till(
                alt((
                    self.partial(Self::unicode_emoji),
                    self.partial(Self::shortcode_emoji),
                    self.partial(Self::tag_raw_text),
                )),
                eof,
            )
            .map(|v| v.0),
            Token::Sequence,
        )(input)
    }

    fn base_bold_italic<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        alt((
            self.partial(Self::tag_bold_italic_asterisk),
            self.partial(Self::tag_bold_italic_underscore),
            self.partial(Self::tag_bold_asterisk),
            self.partial(Self::tag_italic_asterisk),
            self.partial(Self::tag_bold_underscore),
            self.partial(Self::tag_italic_underscore),
        ))(input)
    }

    fn full_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let (input, token) = alt((
            self.increase_nesting(alt((
                self.partial(Self::unicode_emoji),
                self.partial(Self::tag_block_center),
                self.partial(Self::tag_small),
                self.partial(Self::tag_plain),
                self.partial(Self::tag_bold),
                self.partial(Self::tag_italic),
                self.partial(Self::tag_strikethrough),
                self.partial(Self::url_no_embed),
                self.partial(Self::base_bold_italic),
                self.partial(Self::tag_block_code),
                self.partial(Self::tag_inline_code),
                self.partial(Self::tag_quote),
                self.partial(Self::tag_block_math),
                self.partial(Self::tag_inline_math),
                self.partial(Self::tag_strikethrough_tilde),
                self.partial(Self::tag_func),
                self.partial(Self::tag_mention),
                self.partial(Self::tag_hashtag),
                self.partial(Self::shortcode_emoji),
                self.partial(Self::link),
                self.partial(Self::raw_url),
            ))),
            self.partial(Self::tag_raw_text),
        ))(input)?;
        Ok((input, token))
    }

    fn inline_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        alt((
            self.increase_nesting(alt((
                self.partial(Self::unicode_emoji),
                self.partial(Self::tag_small),
                self.partial(Self::tag_plain),
                self.partial(Self::tag_bold),
                self.partial(Self::tag_italic),
                self.partial(Self::tag_strikethrough),
                self.partial(Self::url_no_embed),
                self.partial(Self::base_bold_italic),
                self.partial(Self::tag_inline_code),
                self.partial(Self::tag_inline_math),
                self.partial(Self::tag_strikethrough_tilde),
                self.partial(Self::tag_func),
                self.partial(Self::tag_mention),
                self.partial(Self::tag_hashtag),
                self.partial(Self::shortcode_emoji),
                self.partial(Self::link),
                self.partial(Self::raw_url),
            ))),
            self.partial(Self::tag_raw_text),
        ))(input)
    }

    fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let (input, token) = alt((
            self.increase_nesting(alt((
                self.partial(Self::unicode_emoji),
                self.partial(Self::url_no_embed),
                self.partial(Self::tag_inline_code),
                self.partial(Self::tag_inline_math),
                self.partial(Self::tag_func),
                self.partial(Self::tag_mention),
                self.partial(Self::tag_hashtag),
                self.partial(Self::shortcode_emoji),
                self.partial(Self::raw_url),
            ))),
            self.partial(Self::tag_raw_text),
        ))(input)?;
        Ok((input, token))
    }
    fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let (input, token) = alt((
            self.increase_nesting(alt((
                self.partial(Self::unicode_emoji),
                self.partial(Self::tag_small),
                self.partial(Self::tag_plain),
                self.partial(Self::tag_bold),
                self.partial(Self::tag_italic),
                self.partial(Self::tag_strikethrough),
                self.partial(Self::base_bold_italic),
                self.partial(Self::tag_strikethrough_tilde),
                self.partial(Self::tag_func),
                self.partial(Self::shortcode_emoji),
            ))),
            self.partial(Self::tag_raw_text),
        ))(input)?;
        Ok((input, token))
    }

    fn tag_quote<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?;

        if let (None, None) = leading_spaces {
            if !janky_is_line_begin(input) {
                return fail(input);
            }
        }

        let quote_line = |input| tuple((tag(">"), opt(space), not_line_ending))(input);

        let orig_input = input;
        let (input, lines) = separated_list1(line_ending, quote_line)(input)?;

        let quote_lines = lines
            .into_iter()
            .map(|(_, _, text)| text)
            .collect::<Vec<_>>();

        if quote_lines.len() == 1
            && quote_lines
                .iter()
                .map(Span::fragment)
                .copied()
                .any(&str::is_empty)
        {
            return fail(input);
        }

        let (_, inner) = spliced(&quote_lines, self.partial(Self::full), orig_input)?;

        let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?;

        Ok((input, Token::Quote(Box::new(inner))))
    }

    fn tag_block_center<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let tag_start = &tag("<center>");
        let tag_end = &tag("</center>");

        let (input, _) = opt(line_ending)(input)?;

        if !janky_is_line_begin(input) {
            return fail(input);
        }

        let (input, _) = tag_start(input)?;
        let (input, _) = opt(line_ending)(input)?;

        let (input, (center_seq, _)) = many_till(
            self.partial(Self::inline_single),
            tuple((opt(space1), opt(line_ending), tag_end)),
        )(input)?;

        Ok((
            input,
            boxing_token(Token::Center)(Token::Sequence(center_seq)),
        ))
    }

    fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let delim = &tag("```");

        let (input, _) = opt(line_ending)(input)?;

        if !janky_is_line_begin(input) {
            return fail(input);
        }

        let (input, _) = delim(input)?;
        let (input, lang) = opt(map(
            recognize(many1(tuple((not(delim), not(line_ending), anychar)))),
            Span::into_fragment,
        ))(input)?;
        let (input, _) = line_ending(input)?;

        let (input, code) = map(
            recognize(many1_count(tuple((
                not(tuple((line_ending, delim))),
                anychar,
            )))),
            Span::into_fragment,
        )(input)?;

        let (input, _) = line_ending(input)?;
        let (input, _) = delim(input)?;
        // Trailing whitespace after the triple backtick
        let (input, _) = opt(space1_unicode)(input)?;
        // If we got this far, the next character should be a line ending
        let (input, _) = not(tuple((not(line_ending), anychar)))(input)?;
        let (input, _) = opt(line_ending)(input)?;

        Ok((
            input,
            Token::BlockCode {
                lang: lang.map(<&str>::into),
                inner: code.into(),
            },
        ))
    }

    fn tag_block_math<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let start = &tag("\\[");
        let end = &tag("\\]");

        let (input, _) = opt(line_ending)(input)?;

        if !janky_is_line_begin(input) {
            return fail(input);
        }

        let (input, _) = start(input)?;
        let (input, _) = opt(line_ending)(input)?;
        let (input, _) = opt(space1_unicode)(input)?;

        let (input, math_span) = map(
            many_till(anychar, tuple((opt(space1_unicode), opt(line_ending), end))),
            |v| v.0,
        )(input)?;

        // Trailing whitespace after the closing delim
        let (input, _) = opt(space1_unicode)(input)?;
        // If we got this far, the next character should be a line ending
        let (input, _) = not(tuple((not(line_ending), anychar)))(input)?;
        let (input, _) = opt(line_ending)(input)?;

        Ok((
            input,
            Token::BlockMath(math_span.into_iter().collect::<String>()),
        ))
    }

    // Generic parser for paired-delimiter syntaxes ("**", "<b>"/"</b>", "~~", ...).
    // `matcher` parses the run between `opening_tag` and `closing_tag`, `fallback` is
    // tried when the primary matcher cannot consume it, and the `FlankingRule`s decide
    // whether a candidate run counts as markup at all: `Strict` delimiters surrounded
    // by alphanumerics (e.g. the underscores in "snake_case_variable") stay plain text,
    // while `Lenient` only rejects runs whose contents begin or end with whitespace.
    #[inline]
    fn tag_delimited<'a, 'b: 'a, T: Clone, S: Clone, FOpen, FClose>(
        &'a self,
        opening_tag: impl Into<FlankingDelim<'b, FOpen>> + 'a,
        closing_tag: impl Into<FlankingDelim<'b, FClose>> + 'a,
        escape: bool,
        matcher: Matcher<'a, 'b, T>,
        fallback: Matcher<'a, 'b, S>,
    ) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token> + '_
    where
        FOpen: Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
        FClose: Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
    {
        let FlankingDelim(opening_tag, opening_rule, ..) = opening_tag.into();
        let FlankingDelim(closing_tag, closing_rule, ..) = closing_tag.into();

        move |input| {
            if let FlankingRule::Strict = opening_rule {
                let (input, pre) = opt(recognize(tuple((
                    char_alphanumeric_unicode,
                    opt(tag("\\")),
                    &opening_tag,
                    peek(not(alt((recognize(satisfy(|c| c.is_whitespace())), eof)))),
                ))))(input)?;

                if let Some(pre_text) = pre {
                    return Ok((input, Token::PlainText(pre_text.into_fragment().into())));
                }
            }

            if escape {
                if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), &opening_tag))(input) {
                    return Ok((
                        input_escaped,
                        Token::PlainText(mark.fragment().to_string().into()),
                    ));
                }
            }

            let begin = input;
            let (post_open, _) = opening_tag(input)?;

            let res = tuple((
                many1(tuple((not(&closing_tag), &matcher.matcher_inner))),
                &closing_tag,
            ))(post_open);

            if let Err(nom::Err::Error(nom::error::Error { .. })) = res {
                let res_fallback = tuple((
                    many1(tuple((not(&closing_tag), &fallback.matcher_inner))),
                    &closing_tag,
                ))(post_open);

                if res_fallback.is_err() {
                    return Ok((
                        post_open,
                        Token::PlainText(begin.fragment_between(&post_open).into()),
                    ));
                }

                let (input, (inner, closing)) = res_fallback.unwrap();
                let mut inner = inner.into_iter().map(|(_, t)| t);

                return Ok((
                    input,
                    Token::Sequence(vec![
                        Token::PlainText(begin.fragment_between(&post_open).into()),
                        (fallback.collector)(&mut inner),
                        Token::PlainText(closing.into_fragment().into()),
                    ]),
                ));
            }

            let (input, (inner, closing)) = res?;
            let mut inner = inner.into_iter().map(|(_, t)| t);

            let inner_tok = (matcher.collector)(&mut inner);

            let correct_left_flanking =
                if let FlankingRule::Lenient | FlankingRule::Strict = opening_rule {
                    let text_left = inner_tok.str_content_left();

                    !(text_left.is_some_and(|s| s.starts_with(char::is_whitespace))
                        || text_left.is_none())
                } else {
                    true
                };

            let correct_right_flanking =
                if let FlankingRule::Lenient | FlankingRule::Strict = closing_rule {
                    let text_right = inner_tok.str_content_right();

                    !(text_right.is_some_and(|s| s.ends_with(char::is_whitespace))
                        || text_right.is_none())
                } else {
                    true
                };

            let (input, alphanum) = opt(peek(alphanumeric1_unicode))(input)?;
            let correct_right_outer =
                alphanum.is_none() || !matches!(closing_rule, FlankingRule::Strict);

            let correct_flanking =
                correct_left_flanking && correct_right_flanking && correct_right_outer;

            if !correct_flanking {
                return Ok((
                    input,
                    Token::Sequence(vec![
                        Token::PlainText(begin.fragment_between(&post_open).into()),
                        inner_tok.inner(),
                        Token::PlainText(closing.into_fragment().into()),
                    ]),
                ));
            }

            Ok((input, Token::Sequence(vec![inner_tok])))
        }
    }

    fn tag_func<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let (input, _) = tag("$[")(input)?;

        let func_ident = |input| {
            recognize(tuple((
                many1_count(alt((alpha1, tag("_")))),
                many0_count(alt((alphanumeric1, tag("_")))),
            )))(input)
        };

        let arg_value = recognize(many1_count(alt((
            alphanumeric1,
            tag("."),
            tag("-"),
            tag("_"),
        ))));

        let (input, func_name) = map(func_ident, Span::into_fragment)(input)?;

        let arg = tuple((func_ident, opt(tuple((tag("="), arg_value)))));

        let (input, args) =
            opt(tuple((one_char('.'), separated_list1(one_char(','), arg))))(input)?;

        let args_out = args.map_or_else(HashMap::new, |(_, items)| {
            items
                .into_iter()
                .map(|(k, v)| {
                    (
                        k.into_fragment().to_string(),
                        v.map(|(_, val)| val.into_fragment().to_string()),
                    )
                })
                .collect::<HashMap<_, _>>()
        });

        let (input, _) = opt(space)(input)?;

        let (input, (inner, _)) = many_till(self.partial(Self::inline_single), tag("]"))(input)?;

        Ok((
            input,
            Token::Function {
                name: func_name.to_string(),
                params: args_out,
                inner: Box::new(Token::Sequence(inner)),
            },
        ))
    }

    fn tag_plain<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let opening_tag = &tag("<plain>");
        let closing_tag = &tag("</plain>");

        let (input, _) = opening_tag(input)?;
        let (input, text) = map(
            recognize(many1(tuple((not(line_ending), not(closing_tag), anychar)))),
            Span::into_fragment,
        )(input)?;
        let (input, _) = closing_tag(input)?;

        Ok((input, Token::PlainTag(text.into())))
    }

    fn tag_small<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        self.tag_delimited(
            tag_no_case("<small>"),
            tag_no_case("</small>"),
            false,
            Matcher::new(
                &self.partial(Self::inline_single),
                &collect_sequence(Token::Sequence, boxing_token(Token::Small)),
            ),
            Matcher::new(
                &self.partial(Self::inline_non_formatting_single),
                &collect_sequence(Token::Sequence, identity),
            ),
        )(input)
    }
    fn tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        self.tag_delimited(
            (tag("***"), FlankingRule::Lenient),
            (tag("***"), FlankingRule::Lenient),
            true,
            Matcher::new(
                &self.partial(Self::inline_single),
                &collect_sequence(Token::Sequence, boxing_token(Token::BoldItalic)),
            ),
            Matcher::new(
                &self.partial(Self::inline_non_formatting_single),
                &collect_sequence(Token::Sequence, identity),
            ),
        )(input)
    }

    fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        self.tag_delimited(
            (tag("___"), FlankingRule::Strict),
            (tag("___"), FlankingRule::Strict),
            true,
            Matcher::new(
                &self.partial(Self::inline_single),
                &collect_sequence(Token::Sequence, boxing_token(Token::BoldItalic)),
            ),
            Matcher::new(
                &self.partial(Self::inline_non_formatting_single),
                &collect_sequence(Token::Sequence, identity),
            ),
        )(input)
    }

    fn tag_bold<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        self.tag_delimited(
            tag_no_case("<b>"),
            tag_no_case("</b>"),
            false,
            Matcher::new(
                &self.partial(Self::inline_single),
                &collect_sequence(Token::Sequence, boxing_token(Token::Bold)),
            ),
            Matcher::new(
                &self.partial(Self::inline_non_formatting_single),
                &collect_sequence(Token::Sequence, identity),
            ),
        )(input)
    }

    fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        self.tag_delimited(
            (tag("**"), FlankingRule::Lenient),
            (tag("**"), FlankingRule::Lenient),
            true,
            Matcher::new(
                &self.partial(Self::inline_single),
                &collect_sequence(Token::Sequence, boxing_token(Token::Bold)),
            ),
            Matcher::new(
                &self.partial(Self::inline_non_formatting_single),
                &collect_sequence(Token::Sequence, identity),
            ),
        )(input)
    }

    fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        self.tag_delimited(
            (tag("__"), FlankingRule::Strict),
            (tag("__"), FlankingRule::Strict),
            true,
            Matcher::new(
                &self.partial(Self::inline_single),
                &collect_sequence(Token::Sequence, boxing_token(Token::Bold)),
            ),
            Matcher::new(
                &self.partial(Self::inline_non_formatting_single),
                &collect_sequence(Token::Sequence, identity),
            ),
        )(input)
    }

    fn tag_italic<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        self.tag_delimited(
            tag_no_case("<i>"),
            tag_no_case("</i>"),
            false,
            Matcher::new(
                &self.partial(Self::inline_single),
                &collect_sequence(Token::Sequence, boxing_token(Token::Italic)),
            ),
            Matcher::new(
                &self.partial(Self::inline_non_formatting_single),
                &collect_sequence(Token::Sequence, identity),
            ),
        )(input)
    }

    fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        self.tag_delimited(
            (tag("*"), FlankingRule::Lenient),
            (tag("*"), FlankingRule::Lenient),
            true,
            Matcher::new(
                &self.partial(Self::inline_single),
                &collect_sequence(Token::Sequence, boxing_token(Token::Italic)),
            ),
            Matcher::new(
                &self.partial(Self::inline_non_formatting_single),
                &collect_sequence(Token::Sequence, identity),
            ),
        )(input)
    }

    fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        self.tag_delimited(
            (tag("_"), FlankingRule::Strict),
            (tag("_"), FlankingRule::Strict),
            true,
            Matcher::new(
                &self.partial(Self::inline_single),
                &collect_sequence(Token::Sequence, boxing_token(Token::Italic)),
            ),
            Matcher::new(
                &self.partial(Self::inline_non_formatting_single),
                &collect_sequence(Token::Sequence, identity),
            ),
        )(input)
    }

    fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        self.tag_delimited(
            tag_no_case("<s>"),
            tag_no_case("</s>"),
            false,
            Matcher::new(
                &self.partial(Self::inline_single),
                &collect_sequence(Token::Sequence, boxing_token(Token::Strikethrough)),
            ),
            Matcher::new(
                &self.partial(Self::inline_non_formatting_single),
                &collect_sequence(Token::Sequence, identity),
            ),
        )(input)
    }
    fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        self.tag_delimited(
            (tag("~~"), FlankingRule::Lenient),
            (tag("~~"), FlankingRule::Lenient),
            true,
            Matcher::new(
                &move |input| {
                    map(
                        tuple((not(line_ending), self.partial(Self::inline_single))),
                        |(_, captured)| captured,
                    )(input)
                },
                &collect_sequence(Token::Sequence, boxing_token(Token::Strikethrough)),
            ),
            Matcher::new(
                &move |input| {
                    map(
                        tuple((
                            not(line_ending),
                            self.partial(Self::inline_non_formatting_single),
                        )),
                        |(_, captured)| captured,
                    )(input)
                },
                &collect_sequence(Token::Sequence, identity),
            ),
        )(input)
    }

    fn tag_inline_code<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        self.tag_delimited(
            tag("`"),
            |input| alt((tag("`"), tag("´")))(input),
            true,
            Matcher::new(
                &move |input| {
                    map(
                        tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar)),
                        |(_, captured)| captured,
                    )(input)
                },
                &collect_char_sequence(Token::InlineCode),
            ),
            Matcher::reject(),
        )(input)
    }

    fn tag_inline_math<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        self.tag_delimited(
            tag("\\("),
            tag("\\)"),
            false,
            Matcher::new(
                &move |input| {
                    map(tuple((not(line_ending), anychar)), |(_, captured)| captured)(input)
                },
                &collect_char_sequence(Token::InlineMath),
            ),
            Matcher::reject(),
        )(input)
    }

    fn tag_raw_text<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let (input, text) = anychar(input)?;
        Ok((input, Token::PlainText(text.to_compact_string())))
    }
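    // URL parsing: a protocol prefix followed by a run of URL characters, where
    // bracket pairs are matched recursively (Wikipedia-style "(...)" paths) and
    // sentence-final punctuation is stripped after the fact.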
    fn raw_url<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let (input, url_span) = recognize(tuple((
            self.partial_span(Self::protocol),
            self.url_chars(
                |input| recognize(not(self.partial_span(Self::url_chars_base)))(input),
                false,
            ),
        )))(input)?;

        let url = url_span.into_fragment();
        let url_bytes = url.as_bytes();

        // Strip punctuation at the end of sentences that might have been consumed as a part of the URL
        let final_url = if matches!(url_bytes.last(), Some(b'.' | b',' | b'?')) {
            url.slice(..url.len() - 1)
        } else {
            url
        };

        Ok((input, Token::UrlRaw(final_url.to_string())))
    }

    fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let (input, _) = tag("<")(input)?;
        let (input, url_span) = recognize(tuple((
            self.partial_span(Self::protocol),
            self.url_chars(tag(">"), true),
        )))(input)?;
        let (input, _) = tag(">")(input)?;

        Ok((
            input,
            Token::UrlNoEmbed(url_span.into_fragment().to_string()),
        ))
    }

    fn link<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let (input, no_embed) = opt(tag("?"))(input)?;
        let (input, _) = tag("[")(input)?;
        let (input, _) = not(tag("["))(input)?;
        let (input, (label_tok, _)) =
            many_till(self.partial(Self::inline_label_safe_single), tag("]("))(input)?;
        let (input, url_span) = recognize(tuple((
            self.partial_span(Self::protocol),
            self.url_chars(tag(")"), true),
        )))(input)?;
        let (input, _) = tag(")")(input)?;

        Ok((
            input,
            Token::Link {
                label: Box::new(Token::Sequence(label_tok)),
                href: url_span.into_fragment().into(),
                embed: no_embed.is_none(),
            },
        ))
    }

    fn unicode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let frag = input.fragment();
        let Some(grapheme) = frag.graphemes(true).next() else {
            return fail(input);
        };

        let grapheme = grapheme.trim_end_matches(|c| c == '\u{200c}' || c == '\u{200d}');

        let emoji = emojis::get(grapheme);

        if emoji.is_none() {
            return fail(input);
        }

        Ok((
            input.slice(grapheme.len()..),
            Token::UnicodeEmoji(grapheme.into()),
        ))
    }

    fn shortcode_emoji_inner<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let (input, _) = tag(":")(input)?;
        let (input, shortcode) = map(
            recognize(many1(alt((
                alphanumeric1_unicode,
                recognize(one_of("_+-")),
            )))),
            Span::into_fragment,
        )(input)?;
        let (input, host) = opt(map(
            tuple((
                tag("@"),
                map(
                    recognize(many1(alt((alphanumeric1, recognize(one_of("-.")))))),
                    Span::into_fragment,
                ),
            )),
            |(_at, host)| host,
        ))(input)?;
        let (input, _) = tag(":")(input)?;
        let (input, _) = not(alphanumeric1_unicode)(input)?;

        Ok((
            input,
            Token::ShortcodeEmoji {
                shortcode: shortcode.into(),
                host: host.map(str::to_string),
            },
        ))
    }

    fn shortcode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        if let (plain_out, Some(plain)) = map(
            opt(recognize(tuple((
                char_alphanumeric_unicode,
                self.partial(Self::shortcode_emoji_inner),
            )))),
            |o| o.map(Span::into_fragment),
        )(input)?
        {
            return Ok((plain_out, Token::PlainText(plain.into())));
        }

        self.shortcode_emoji_inner(input)
    }
    fn tag_mention_inner<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let tags = one_of("@!");
        let (input, mention_type) = map(tags, |c| match c {
            '@' => MentionType::User,
            '!' => MentionType::Community,
            _ => unreachable!(),
        })(input)?;

        let (input, name) = map(
            recognize(many1(alt((alphanumeric1, recognize(one_of("-_")))))),
            Span::into_fragment,
        )(input)?;

        let before = input;
        let (_, host_opt) = opt(tuple((
            one_of(if matches!(mention_type, MentionType::User) {
                "@:"
            } else {
                "@"
            }),
            map(
                recognize(many1(alt((alphanumeric1, recognize(one_of("-_.")))))),
                Span::into_fragment,
            ),
        )))(input)?;

        // Promote tags with a colon separator to Matrix handles
        let mention_type = if let Some((':', _)) = host_opt {
            MentionType::MatrixUser
        } else {
            mention_type
        };

        let host =
            host_opt.map(|(_, name)| name.trim_end_matches(|c| matches!(c, '.' | '-' | '_')));

        let input = host.map(|c| before.slice(c.len() + 1..)).unwrap_or(before);

        Ok((
            input,
            Token::Mention {
                mention_type,
                name: name.into(),
                host: host.map(|h| h.into()),
            },
        ))
    }

    fn tag_mention<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        if let (plain_out, Some(plain)) = map(
            opt(recognize(tuple((
                alt((tag("\\"), recognize(char_alphanumeric_unicode))),
                self.partial(Self::tag_mention_inner),
            )))),
            |o| o.map(Span::into_fragment),
        )(input)?
        {
            return Ok((plain_out, Token::PlainText(plain.into())));
        }

        self.tag_mention_inner(input)
    }

    fn tag_hashtag<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
        let (input, maybe_preceded) =
            opt(recognize(tuple((char_alphanumeric_unicode, tag("#")))))(input)?;

        if let Some(preceded) = maybe_preceded {
            return Ok((input, Token::PlainText(preceded.into_fragment().into())));
        }

        let (input, _) = tag("#")(input)?;

        let (input, hashtag_text) = map(
            recognize(many1(self.partial_span(Self::hashtag_chars))),
            Span::into_fragment,
        )(input)?;

        Ok((input, Token::Hashtag(hashtag_text.into())))
    }

    #[inline]
    fn increase_nesting<'a, 'b, O, F>(
        &'b self,
        mut func: F,
    ) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, O> + 'b
    where
        F: Parser<Span<'a>, O, nom::error::Error<Span<'a>>> + 'b,
    {
        move |mut input| {
            if input.extra.depth >= self.depth_limit {
                return fail(input);
            }

            input.extra.depth += 1;
            func.parse(input).map(|mut v| {
                v.0.extra.depth -= 1;
                v
            })
        }
    }

    #[inline]
    fn hashtag_chars<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
        recognize(alt((
            recognize(tuple((
                tag("("),
                self.increase_nesting(self.partial_span(Self::hashtag_chars)),
                tag(")"),
            ))),
            recognize(tuple((
                tag("["),
                self.increase_nesting(self.partial_span(Self::hashtag_chars)),
                tag("]"),
            ))),
            recognize(tuple((
                tag("「"),
                self.increase_nesting(self.partial_span(Self::hashtag_chars)),
                tag("」"),
            ))),
            recognize(tuple((
                tag("（"),
                self.increase_nesting(self.partial_span(Self::hashtag_chars)),
                tag("）"),
            ))),
            recognize(tuple((
                not(space1_unicode),
                not(line_ending),
                not(one_of(".,:;!?#?/[]【】()「」（）<>")),
                anychar,
            ))),
        )))(input)
    }

    #[inline]
    fn protocol<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
        alt((tag("https://"), tag("http://")))(input)
    }

    #[inline]
    fn url_chars_base<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
        alt((
            alphanumeric1_unicode,
            recognize(tuple((
                tag("["),
                many_till(
                    self.increase_nesting(self.partial_span(Self::url_chars_base)),
                    tag("]"),
                ),
            ))),
            recognize(tuple((
                tag("("),
                many_till(
                    self.increase_nesting(self.partial_span(Self::url_chars_base)),
                    tag(")"),
                ),
            ))),
            recognize(one_of(".,_/:%#$&?!~=+-@")),
        ))(input)
    }

    #[inline]
    fn url_chars<'a, 'b, F>(
        &'b self,
        mut terminator: F,
        spaces: bool,
    ) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'b
    where
        F: Parser<Span<'a>, Span<'a>, nom::error::Error<Span<'a>>> + 'b,
    {
        move |input| {
            recognize(many1_count(tuple((
                not(tuple((space1, eof))),
                not(tuple((space1, tag("\"")))),
                not(tuple((opt(space1), |input| terminator.parse(input)))),
                alt((
                    |input| self.url_chars_base(input),
                    if spaces { space1 } else { fail },
                )),
            ))))(input)
        }
    }
}
#[cfg(test)]
mod test {
    use std::collections::HashMap;

    use nom::bytes::complete::tag;

    use crate::{Context, DEFAULT_DEPTH_LIMIT, Span, SpanMeta, to_xml_string, Token};

    fn parse_full(string: &str) -> Token {
        Context::default()
            .full(Span::new_extra(string, SpanMeta::default()))
            .unwrap()
            .1
            .merged()
    }

    #[test]
    fn parse_empty() {
        assert_eq!(parse_full(""), Token::Sequence(vec![]));
    }

    #[test]
    fn parse_url_chars() {
        let ctx = Context::default();

        assert_eq!(
            ctx.url_chars(tag(")"), true)(Span::new_extra(
                "https://en.wikipedia.org/wiki/Sandbox_(computer_security))",
                SpanMeta::default(),
            ))
            .unwrap()
            .1
            .into_fragment(),
            "https://en.wikipedia.org/wiki/Sandbox_(computer_security)"
        );

        assert_eq!(
            ctx.url_chars(tag(")"), true)(Span::new_extra(
                "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))",
                SpanMeta::default()
            ))
            .unwrap()
            .1
            .into_fragment(),
            "https://en.wikipedia.org/wiki/Sandbox_(computer_security)",
        );

        assert_eq!(
            ctx.url_chars(tag(")"), true)(Span::new_extra(
                "https://cs.wikipedia.org/wiki/Among_Us ",
                SpanMeta::default()
            ))
            .unwrap()
            .1
            .into_fragment(),
            "https://cs.wikipedia.org/wiki/Among_Us",
        );

        assert_eq!(
            ctx.url_chars(tag(")"), true)(Span::new_extra(
                "https://cs.wikipedia.org/wiki/Among Us )",
                SpanMeta::default(),
            ))
            .unwrap()
            .1
            .into_fragment(),
            "https://cs.wikipedia.org/wiki/Among Us"
        );

        assert_eq!(
            ctx.url_chars(tag(")"), false)(Span::new_extra(
                "https://en.wikipedia.org/wiki/Among Us )",
                SpanMeta::default(),
            ))
            .unwrap()
            .1
            .into_fragment(),
            "https://en.wikipedia.org/wiki/Among"
        );
    }

    #[test]
    fn parse_formatting() {
        assert_eq!(
            parse_full(r#"~~stikethrough~~"#),
            Token::Strikethrough(Box::new(Token::PlainText("stikethrough".into()))),
        );

        assert_eq!(
            parse_full(r#"**bold**"#),
            Token::Bold(Box::new(Token::PlainText("bold".into()))),
        );

        assert_eq!(
            parse_full(r#"*italic*"#),
            Token::Italic(Box::new(Token::PlainText("italic".into()))),
        );

        assert_eq!(
            parse_full(r#"* italic *"#),
            Token::PlainText("* italic *".into())
        );

        assert_eq!(
            parse_full("snake_case_variable"),
            Token::PlainText("snake_case_variable".into())
        );

        assert_eq!(
            parse_full("intra*word*italic"),
            Token::Sequence(vec![
                Token::PlainText("intra".into()),
                Token::Italic(Box::new(Token::PlainText("word".into()))),
                Token::PlainText("italic".into()),
            ])
        );

        assert_eq!(
            parse_full(r#"_ italic *"#),
            Token::PlainText("_ italic *".into())
        );

        assert_eq!(
            parse_full(r#"long text with a *footnote <b>text</b>"#),
            Token::Sequence(vec![
                Token::PlainText("long text with a *footnote ".into()),
                Token::Bold(Box::new(Token::PlainText("text".into()))),
            ])
        );

        assert_eq!(
            parse_full(r#"*"italic"*"#),
            Token::Italic(Box::new(Token::PlainText("\"italic\"".into())))
        );

        assert_eq!(
            parse_full(r#"not code `code` also not code"#),
            Token::Sequence(vec![
                Token::PlainText("not code ".into()),
                Token::InlineCode("code".into()),
                Token::PlainText(" also not code".into())
            ]),
        );

        assert_eq!(
            parse_full(r#"not code `code` also `not code"#),
            Token::Sequence(vec![
                Token::PlainText("not code ".into()),
                Token::InlineCode("code".into()),
                Token::PlainText(" also `not code".into())
            ]),
        );

        assert_eq!(
            parse_full(r#"not code `*not bold*` also not code"#),
            Token::Sequence(vec![
                Token::PlainText("not code ".into()),
                Token::InlineCode("*not bold*".into()),
                Token::PlainText(" also not code".into())
            ]),
        );

        assert_eq!(
            parse_full(r#"***bold italic***"#),
            Token::BoldItalic(Box::new(Token::PlainText("bold italic".into())))
        );

        assert_eq!(
            parse_full(r#"<b><i>bold italic</i></b>"#),
            Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText(
                "bold italic".into()
            )))))
        );

        assert_eq!(
            parse_full("~~*hello\nworld*"),
            Token::Sequence(vec![
                Token::PlainText("~~".into()),
                Token::Italic(Box::new(Token::PlainText("hello\nworld".into()))),
            ])
        )
    }

    #[test]
    fn parse_flanking() {
        assert_eq!(
            parse_full(r#"aaa*iii*bbb"#),
            Token::Sequence(vec![
                Token::PlainText("aaa".into()),
                Token::Italic(Box::new(Token::PlainText("iii".into()))),
                Token::PlainText("bbb".into()),
            ])
        );

        assert_eq!(
            parse_full(r#"aaa_nnn_bbb"#),
            Token::PlainText("aaa_nnn_bbb".into())
        );

        assert_eq!(
            parse_full("aaa\n_iii_\nbbb"),
            Token::Sequence(vec![
                Token::PlainText("aaa\n".into()),
                Token::Italic(Box::new(Token::PlainText("iii".into()))),
                Token::PlainText("\nbbb".into()),
            ])
        );

        assert_eq!(
            parse_full(r#"*iii*"#),
            Token::Italic(Box::new(Token::PlainText("iii".into())))
        );

        assert_eq!(
            parse_full(r#"_iii_"#),
            Token::Italic(Box::new(Token::PlainText("iii".into())))
        );

        assert_eq!(
            parse_full(r#"aaa*iii*"#),
            Token::Sequence(vec![
                Token::PlainText("aaa".into()),
                Token::Italic(Box::new(Token::PlainText("iii".into()))),
            ])
        );

        assert_eq!(
            parse_full(r#"*iii*bbb"#),
            Token::Sequence(vec![
                Token::Italic(Box::new(Token::PlainText("iii".into()))),
                Token::PlainText("bbb".into()),
            ])
        );

        assert_eq!(
            parse_full(r#"aaa_nnn_"#),
            Token::PlainText("aaa_nnn_".into())
        );

        assert_eq!(
            parse_full(r#"_nnn_bbb"#),
            Token::PlainText("_nnn_bbb".into())
        );
    }

    #[test]
    fn parse_long() {
        parse_full(&"A".repeat(20000));

        parse_full(&"*A".repeat(20000));

        parse_full(&"@A".repeat(20000));
    }

    #[test]
    fn parse_complex() {
        assert_eq!(
            parse_full(r"\( nya^3 \)"),
            Token::InlineMath(" nya^3 ".to_string())
        );

        assert_eq!(
            parse_full("\\( nya^3 \n \\)"),
            Token::PlainText("\\( nya^3 \n \\)".into())
        );

        assert_eq!(
            parse_full(r"`AbstractProxyFactoryBean`"),
            Token::InlineCode("AbstractProxyFactoryBean".to_string())
        );

        assert_eq!(
            parse_full("`let x = \n 5;`"),
            Token::PlainText("`let x = \n 5;`".into())
        );

        assert_eq!(
            parse_full(
                r#"
```js
var x = undefined;
```"#
            ),
            Token::BlockCode {
                lang: Some("js".to_string()),
                inner: "var x = undefined;".to_string(),
            }
        );

        assert_eq!(
            parse_full(
                r"
\[
a^2 + b^2 = c^2
\]"
            ),
            Token::BlockMath("a^2 + b^2 = c^2".to_string())
        );

        assert_eq!(
            parse_full(r"\[ x^2 + y^2 = z^2 \]"),
            Token::BlockMath("x^2 + y^2 = z^2".to_string())
        );
        assert_eq!(
            parse_full(
                r#"<center>centered
🦋🏳️‍⚧️
text</center>"#
            ),
            Token::Center(Box::new(Token::Sequence(vec![
                Token::PlainText("centered\n".into()),
                Token::UnicodeEmoji("🦋".into()),
                Token::UnicodeEmoji("🏳️‍⚧️".into()),
                Token::PlainText("\ntext".into()),
            ])))
        );

        assert_eq!(
            parse_full(
                r#"> <center>centered
> 👩🏽‍🤝‍👩🏼
> text</center>"#
            ),
            Token::Quote(Box::new(Token::Center(Box::new(Token::Sequence(vec![
                Token::PlainText("centered\n".into()),
                Token::UnicodeEmoji("👩🏽‍🤝‍👩🏼".into()),
                Token::PlainText("\ntext".into())
            ]))))),
        );

        assert_eq!(
            parse_full(r#"$[x2 $[sparkle 🥺]💜$[spin.y,speed=5s ❤️]🦊]"#),
            Token::Function {
                name: "x2".into(),
                params: HashMap::new(),
                inner: Box::new(Token::Sequence(vec![
                    Token::Function {
                        name: "sparkle".into(),
                        params: HashMap::new(),
                        inner: Box::new(Token::UnicodeEmoji("🥺".into())),
                    },
                    Token::UnicodeEmoji("💜".into()),
                    Token::Function {
                        name: "spin".into(),
                        params: {
                            let mut params = HashMap::new();
                            params.insert("y".into(), None);
                            params.insert("speed".into(), Some("5s".into()));
                            params
                        },
                        inner: Box::new(Token::UnicodeEmoji("❤️".into())),
                    },
                    Token::UnicodeEmoji("🦊".into()),
                ]))
            },
        );

        assert_eq!(
            parse_full(r#"bold @tag1 @tag2 italic"#),
            Token::Sequence(vec![
                Token::PlainText("bold ".into()),
                Token::Mention {
                    mention_type: crate::MentionType::User,
                    name: "tag1".into(),
                    host: None
                },
                Token::PlainText(" ".into()),
                Token::Mention {
                    mention_type: crate::MentionType::User,
                    name: "tag2".into(),
                    host: None
                },
                Token::PlainText(" italic".into())
            ]),
        );

        assert_eq!(
            parse_full(
                r#"
> test
> <i>
> italic
> </i>
>> Nested quote
"#
            ),
            Token::Quote(Box::new(Token::Sequence(vec![
                Token::PlainText("test\n".into()),
                Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))),
                Token::Quote(Box::new(Token::PlainText("Nested quote".into())))
            ]))),
        );
    }

    #[test]
    fn parse_link() {
        assert_eq!(
            parse_full("IPv4 test: <https://0>"),
            Token::Sequence(vec![
                Token::PlainText("IPv4 test: ".into()),
                Token::UrlNoEmbed("https://0".into()),
            ])
        );

        assert_eq!(
            parse_full("IPv4 test: <https://127.0.0.1>"),
            Token::Sequence(vec![
                Token::PlainText("IPv4 test: ".into()),
                Token::UrlNoEmbed("https://127.0.0.1".into()),
            ])
        );

        assert_eq!(
            parse_full("IPv6 test: <https://[::2f:1]/nya>"),
            Token::Sequence(vec![
                Token::PlainText("IPv6 test: ".into()),
                Token::UrlNoEmbed("https://[::2f:1]/nya".into()),
            ])
        );

        assert_eq!(
            parse_full("IPv6 test: https://[::2f:1]/nya"),
            Token::Sequence(vec![
                Token::PlainText("IPv6 test: ".into()),
                Token::UrlRaw("https://[::2f:1]/nya".into()),
            ])
        );

        // IDNs
        assert_eq!(
            parse_full("IDN test: https://www.háčkyčárky.cz/"),
            Token::Sequence(vec![
                Token::PlainText("IDN test: ".into()),
                Token::UrlRaw("https://www.háčkyčárky.cz/".into()),
            ])
        );

        assert_eq!(
            parse_full("Link test: [label](https://example.com)"),
            Token::Sequence(vec![
                Token::PlainText("Link test: ".into()),
                Token::Link {
                    label: Box::new(Token::PlainText("label".into())),
                    href: "https://example.com".into(),
                    embed: true,
                },
            ])
        );

        assert_eq!(
            parse_full("test #hashtag tail"),
            Token::Sequence(vec![
                Token::PlainText("test ".into()),
                Token::Hashtag("hashtag".into()),
                Token::PlainText(" tail".into()),
            ])
        );

        assert_eq!(
            parse_full("not#hashtag tail"),
            Token::PlainText("not#hashtag tail".into())
        );

        assert_eq!(
            parse_full("<https://example.com>"),
            Token::UrlNoEmbed("https://example.com".into())
        );

        // Adjacent links okay
        assert_eq!(
            parse_full("<https://example.com/><https://awawa.gay/>"),
            Token::Sequence(vec![
                Token::UrlNoEmbed("https://example.com/".into()),
                Token::UrlNoEmbed("https://awawa.gay/".into()),
            ])
        );

        assert_eq!(
            parse_full("Link test: ?[label](https://awawa.gay)"),
            Token::Sequence(vec![
                Token::PlainText("Link test: ".into()),
                Token::Link {
                    label: Box::new(Token::PlainText("label".into())),
                    href: "https://awawa.gay".into(),
                    embed: false,
                },
            ])
        );
        assert_eq!(
            parse_full("Link test: ?[label](https://awawa.gay)test"),
            Token::Sequence(vec![
                Token::PlainText("Link test: ".into()),
                Token::Link {
                    label: Box::new(Token::PlainText("label".into())),
                    href: "https://awawa.gay".into(),
                    embed: false,
                },
                Token::PlainText("test".into()),
            ])
        );

        assert_eq!(
            parse_full("Link test: (?[label](https://awawa.gay))"),
            Token::Sequence(vec![
                Token::PlainText("Link test: (".into()),
                Token::Link {
                    label: Box::new(Token::PlainText("label".into())),
                    href: "https://awawa.gay".into(),
                    embed: false,
                },
                Token::PlainText(")".into()),
            ])
        );

        assert_eq!(
            parse_full("Link test: ?[label](https://awawa.gay"), // Missing closing bracket
            Token::Sequence(vec![
                Token::PlainText("Link test: ?[label](".into()),
                Token::UrlRaw("https://awawa.gay".into()),
            ])
        );
    }

    #[test]
    fn limit_nesting() {
        let mut tok = Token::PlainText(" test ".into());
        for _ in 0..DEFAULT_DEPTH_LIMIT {
            tok = Token::Bold(Box::new(tok));
        }

        assert_eq!(
            parse_full(
                &("<b>".repeat(DEFAULT_DEPTH_LIMIT)
                    + " test "
                    + &*"</b>".repeat(DEFAULT_DEPTH_LIMIT))
            ),
            tok
        );
    }

    #[test]
    fn parse_mention() {
        assert_eq!(
            parse_full("@tag"),
            Token::Mention {
                mention_type: crate::MentionType::User,
                name: "tag".into(),
                host: None,
            }
        );

        assert_eq!(
            parse_full("email@notactuallyamenmtion.org"),
            Token::PlainText("email@notactuallyamenmtion.org".into())
        );

        assert_eq!(
            parse_full("hgsjlkdsa @tag fgahjsdkd"),
            Token::Sequence(vec![
                Token::PlainText("hgsjlkdsa ".into()),
                Token::Mention {
                    mention_type: crate::MentionType::User,
                    name: "tag".into(),
                    host: None,
                },
                Token::PlainText(" fgahjsdkd".into()),
            ])
        );

        assert_eq!(
            parse_full("hgsjlkdsa @tag@ fgahjsdkd"),
            Token::Sequence(vec![
                Token::PlainText("hgsjlkdsa ".into()),
                Token::Mention {
                    mention_type: crate::MentionType::User,
                    name: "tag".into(),
                    host: None,
                },
                Token::PlainText("@ fgahjsdkd".into()),
            ])
        );

        assert_eq!(
            parse_full("aaaa @tag@domain bbbbb"),
            Token::Sequence(vec![
                Token::PlainText("aaaa ".into()),
                Token::Mention {
                    mention_type: crate::MentionType::User,
                    name: "tag".into(),
                    host: Some("domain".into()),
                },
                Token::PlainText(" bbbbb".into()),
            ])
        );

        assert_eq!(
            parse_full("test @tag@domain, test"),
            Token::Sequence(vec![
                Token::PlainText("test ".into()),
                Token::Mention {
                    mention_type: crate::MentionType::User,
                    name: "tag".into(),
                    host: Some("domain".into()),
                },
                Token::PlainText(", test".into()),
            ])
        );

        assert_eq!(
            parse_full("test @tag@domain.gay. test"),
            Token::Sequence(vec![
                Token::PlainText("test ".into()),
                Token::Mention {
                    mention_type: crate::MentionType::User,
                    name: "tag".into(),
                    host: Some("domain.gay".into()),
                },
                Token::PlainText(". test".into()),
            ])
        );

        assert_eq!(
            parse_full("test @tag@domain? test"),
            Token::Sequence(vec![
                Token::PlainText("test ".into()),
                Token::Mention {
                    mention_type: crate::MentionType::User,
                    name: "tag".into(),
                    host: Some("domain".into()),
                },
                Token::PlainText("? test".into()),
            ])
        );

        assert_eq!(
            parse_full("test !tag@domain.com test"),
            Token::Sequence(vec![
                Token::PlainText("test ".into()),
                Token::Mention {
                    mention_type: crate::MentionType::Community,
                    name: "tag".into(),
                    host: Some("domain.com".into()),
                },
                Token::PlainText(" test".into()),
            ])
        );

        assert_eq!(
            parse_full("@tag:domain.com"),
            Token::Mention {
                mention_type: crate::MentionType::MatrixUser,
                name: "tag".into(),
                host: Some("domain.com".into())
            },
        );
    }

    #[test]
    fn parse_shortcodes() {
        assert_eq!(
            parse_full(":bottom:"),
            Token::ShortcodeEmoji {
                shortcode: "bottom".into(),
                host: None,
            }
        );

        assert_eq!(
            parse_full(":bottom::blobfox:"),
            Token::Sequence(vec![
                Token::ShortcodeEmoji {
                    shortcode: "bottom".into(),
                    host: None,
                },
                Token::ShortcodeEmoji {
                    shortcode: "blobfox".into(),
                    host: None,
                },
            ])
        );

        assert_eq!(
            parse_full(":bottom@magnetar.social:"),
            Token::ShortcodeEmoji {
                shortcode: "bottom".into(),
                host: Some("magnetar.social".into()),
            }
        );

        assert_eq!(
            parse_full(":bottom:blobfox"),
            Token::PlainText(":bottom:blobfox".into())
        );

        assert_eq!(
            parse_full("bottom:blobfox:"),
            Token::PlainText("bottom:blobfox:".into())
        );
    }

    #[test]
    fn parse_emoji() {
        assert_eq!(
            parse_full("🥺💜❤️🦊"),
            Token::Sequence(
                vec!["🥺", "💜", "❤️", "🦊"]
                    .into_iter()
                    .map(str::to_string)
                    .map(Token::UnicodeEmoji)
                    .collect::<Vec<_>>()
            )
        );

        // Trans flag, ZWJ
        assert_eq!(
            parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}"),
            Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}".into())
        );

        assert_eq!(
            parse_full("\u{0200d}\u{1f3f3}\u{0fe0f}"),
            Token::Sequence(vec![
                Token::PlainText("\u{0200d}".into()), // ZWJ
                Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag
            ])
        );

        // Trans flag, ZWNJ
        assert_eq!(
            parse_full("\u{1f3f3}\u{0fe0f}\u{0200c}\u{026a7}\u{0fe0f}"),
            Token::Sequence(vec![
                Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag
                Token::PlainText("\u{0200c}".into()),             // ZWNJ
                Token::UnicodeEmoji("\u{026a7}\u{0fe0f}".into()), // Trans symbol
            ])
        );

        assert_eq!(
            parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{0200d}\u{0200d}"),
            Token::Sequence(vec![
                Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag
                Token::PlainText("\u{0200d}\u{0200d}\u{0200d}".into()), // ZWJ
            ])
        );
    }

    #[test]
    fn xml_serialization() {
        assert_eq!(
            &to_xml_string(&parse_full("***nyaaa***")).unwrap(),
            r#"<mmm><b><i>nyaaa</i></b></mmm>"#
        );

        assert_eq!(
            &to_xml_string(&parse_full(
                "@natty $[spin.speed=0.5s 🥺]:cat_attack: test"
            ))
            .unwrap(),
            r#"<mmm><mention name="natty" type="user"/> <fn name="spin" arg-speed="0.5s"><ue>🥺</ue></fn><ee>cat_attack</ee> test</mmm>"#
        );

        assert_eq!(
            &to_xml_string(&parse_full(
                "Ring Galaxy AM 0644 741 from Hubble\nCredits: AURA, STScI, J. Higdon, Cornell, ESA, #NASA\n#nature #space #astrophotography"
            ))
            .unwrap(),
            r#"<mmm>Ring Galaxy AM 0644 741 from Hubble
Credits: AURA, STScI, J. Higdon, Cornell, ESA, <hashtag>NASA</hashtag>
<hashtag>nature</hashtag> <hashtag>space</hashtag> <hashtag>astrophotography</hashtag></mmm>"#
        );

        assert_eq!(
            &to_xml_string(&parse_full(
                r#"
```js
var x = undefined;
```
"#
            ))
            .unwrap(),
            "<mmm><code lang=\"js\">var x = undefined;</code></mmm>"
        );
    }
}