use either::Either; use nom::branch::alt; use nom::bytes::complete::tag; use nom::character::complete::{ alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of, space1, tab, }; use nom::combinator::{eof, fail, map, not, opt, recognize}; use nom::error::ErrorKind; use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1}; use nom::sequence::tuple; use nom::{IResult, Offset, Slice}; use nom_locate::LocatedSpan; use std::borrow::Cow; use std::collections::HashMap; use std::convert::{identity, Infallible}; use unicode_segmentation::UnicodeSegmentation; #[derive(Copy, Clone, Debug, Eq, PartialEq)] pub enum MentionType { Community, User, } impl MentionType { pub fn to_char(&self) -> char { match self { MentionType::Community => '!', MentionType::User => '@', } } } #[derive(Clone, Debug, Eq, PartialEq)] pub enum Token<'a> { PlainText(Cow<'a, str>), Sequence(Vec>), Quote(Box>), Small(Box>), BoldItalic(Box>), Bold(Box>), Italic(Box>), Center(Box>), Strikethrough(Box>), PlainTag(Cow<'a, str>), InlineCode(Cow<'a, str>), InlineMath(Cow<'a, str>), UrlRaw(Cow<'a, str>), UrlNoEmbed(Cow<'a, str>), Link { label: Box>, href: Cow<'a, str>, embed: bool, }, BlockCode { lang: Option>, inner: Cow<'a, str>, }, BlockMath(Cow<'a, str>), Function { name: Cow<'a, str>, params: HashMap, Option>>, inner: Box>, }, Mention { name: Cow<'a, str>, host: Option>, mention_type: MentionType, }, UnicodeEmoji(Cow<'a, str>), ShortcodeEmoji(Cow<'a, str>), Hashtag(Cow<'a, str>), } impl Token<'_> { fn owned(&self) -> Token<'static> { match self { Token::PlainText(text) => Token::PlainText(Cow::Owned(text.clone().into_owned())), Token::Sequence(tokens) => Token::Sequence(tokens.iter().map(Token::owned).collect()), Token::Quote(inner) => Token::Quote(Box::new(inner.owned())), Token::Small(inner) => Token::Small(Box::new(inner.owned())), Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.owned())), Token::Bold(inner) => Token::Bold(Box::new(inner.owned())), Token::Italic(inner) => Token::Italic(Box::new(inner.owned())), Token::Center(inner) => Token::Center(Box::new(inner.owned())), Token::Strikethrough(inner) => Token::Strikethrough(Box::new(inner.owned())), Token::PlainTag(tag) => Token::PlainTag(Cow::Owned(tag.clone().into_owned())), Token::InlineCode(code) => Token::InlineCode(Cow::Owned(code.clone().into_owned())), Token::InlineMath(math) => Token::InlineMath(Cow::Owned(math.clone().into_owned())), Token::UrlRaw(url) => Token::UrlRaw(Cow::Owned(url.clone().into_owned())), Token::UrlNoEmbed(url) => Token::UrlNoEmbed(Cow::Owned(url.clone().into_owned())), Token::Link { embed, label, href } => Token::Link { embed: *embed, label: Box::new(label.owned()), href: Cow::Owned(href.clone().into_owned()), }, Token::BlockCode { inner, lang } => Token::BlockCode { lang: lang.as_ref().map(|l| Cow::Owned(l.clone().into_owned())), inner: Cow::Owned(inner.clone().into_owned()), }, Token::BlockMath(math) => Token::BlockMath(Cow::Owned(math.clone().into_owned())), Token::Function { name, params, inner, } => Token::Function { name: Cow::Owned(name.clone().into_owned()), params: params .iter() .map(|(k, v)| { ( Cow::Owned(k.clone().into_owned()), v.as_ref().map(|val| Cow::Owned(val.clone().into_owned())), ) }) .collect(), inner: Box::new(inner.owned()), }, Token::Mention { name, host, mention_type, } => Token::Mention { name: Cow::Owned(name.clone().into_owned()), host: host.as_ref().map(|v| Cow::Owned(v.clone().into_owned())), mention_type: *mention_type, }, Token::UnicodeEmoji(code) => Token::UnicodeEmoji(Cow::Owned(code.clone().into_owned())), Token::ShortcodeEmoji(shortcode) => { Token::ShortcodeEmoji(Cow::Owned(shortcode.clone().into_owned())) } Token::Hashtag(url) => Token::Hashtag(Cow::Owned(url.clone().into_owned())), } } fn merged(&self) -> Token { match self { Token::Sequence(tokens) => { let tokens_multi = tokens.iter().fold(Vec::new(), |mut acc, tok| { if let Some(Token::PlainText(last)) = acc.last_mut() { if let Token::PlainText(tok_text) = tok { *last = Cow::from(last.to_string() + tok_text.as_ref()); return acc; } } if let Token::Sequence(seq) = tok { let items = seq.iter().map(Token::merged).flat_map(|t| match t { Token::Sequence(seq) => Either::Left(seq.into_iter()), other => Either::Right(std::iter::once(other)), }); for item in items { if let Some(Token::PlainText(last)) = acc.last_mut() { if let Token::PlainText(tok_text) = item { *last = Cow::from(last.to_string() + tok_text.as_ref()); continue; } } acc.push(item); } return acc; } acc.push(tok.merged()); acc }); if tokens_multi.len() == 1 { return tokens_multi.into_iter().next().unwrap(); } Token::Sequence(tokens_multi) } Token::Quote(inner) => Token::Quote(Box::new(inner.merged())), Token::Small(inner) => Token::Small(Box::new(inner.merged())), Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.merged())), Token::Bold(inner) => Token::Bold(Box::new(inner.merged())), Token::Italic(inner) => Token::Italic(Box::new(inner.merged())), Token::Center(inner) => Token::Center(Box::new(inner.merged())), Token::Strikethrough(inner) => Token::Strikethrough(Box::new(inner.merged())), Token::Link { embed, label, href } => Token::Link { label: Box::new(label.merged()), href: href.clone(), embed: *embed, }, Token::Function { name, params, inner, } => Token::Function { name: name.clone(), params: params.clone(), inner: Box::new(inner.merged()), }, other => other.clone(), } } } type Span<'a> = LocatedSpan<&'a str>; trait SliceOffset { fn up_to(&self, other: &Self) -> Self; fn fragment_between<'a>(&self, other: &Self) -> &'a str where Self: 'a; } impl SliceOffset for Span<'_> { fn up_to(&self, other: &Self) -> Self { self.slice(..self.offset(other)) } fn fragment_between<'a>(&self, other: &Self) -> &'a str where Self: 'a, { self.up_to(other).into_fragment() } } #[inline] fn boxing_token<'a>(func: impl Fn(Box>) -> Token<'a>) -> impl Fn(Token<'a>) -> Token<'a> { move |tokens| func(Box::new(tokens)) } #[inline] fn collect_sequence<'a, T>( func: impl Fn(Vec) -> Token<'a>, transform: impl Fn(Token<'a>) -> Token<'a>, ) -> impl Fn(&mut dyn Iterator) -> Token<'a> { move |tokens| transform(func(tokens.collect())) } #[inline] fn collect_char_sequence<'a>( func: impl Fn(Cow<'a, str>) -> Token<'a>, ) -> impl Fn(&mut dyn Iterator) -> Token<'a> { move |chars| func(Cow::Owned(chars.collect())) } fn spliced<'a>( segments: &[Span<'a>], func: impl Fn(Span) -> IResult, parent: Span<'a>, ) -> IResult, Token<'static>, nom::error::Error>> { let combined = segments .iter() .copied() .map(Span::into_fragment) .collect::>() .join("\n"); let cum_offset_combined = segments .iter() .scan(0, |acc, &x| { *acc += x.len(); Some(*acc) }) .collect::>(); let current_seg = |input: Span| { cum_offset_combined .iter() .enumerate() .take_while(|(_, &o)| o > input.location_offset()) .map(|(i, o)| (segments[i], o)) .last() }; type NE = nom::Err; type NomError<'x> = nom::error::Error>; let quote_span = Span::new(&combined); let (input, inner) = match func(quote_span) { Ok((input, token)) => (input, token.owned()), Err(e) => { return match e { NE::Error(e) => { let offset_new = e.input.location_offset(); if let Some((seg_parent, offset_seg_new)) = current_seg(e.input) { let offset = offset_new - offset_seg_new; let offset_orig = offset + seg_parent.location_offset(); Err(NE::Error(NomError::new( Span::new(&parent.into_fragment()[offset_orig..]), e.code, ))) } else { // ??? Err(NE::Failure(NomError::new(parent, ErrorKind::Fail))) } } NE::Failure(e) => Err(NE::Error(NomError::new(parent, e.code))), NE::Incomplete(i) => Err(NE::Incomplete(i)), }; } }; let out = if let Some((seg_parent, offset_seg_new)) = current_seg(input) { let offset = input.location_offset() - offset_seg_new; let offset_orig = offset + seg_parent.location_offset(); parent.slice(offset_orig..) } else { parent }; Ok((out, inner.owned())) } fn space(input: Span) -> IResult { let (input, frag) = recognize(alt((one_char('\u{0020}'), one_char('\u{3000}'), tab)))(input)?; Ok((input, Token::PlainText(frag.into_fragment().into()))) } struct Matcher<'a, 'b, T> { matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult, T> + 'a), collector: &'a (dyn Fn(&mut dyn Iterator) -> Token<'b> + 'a), _phantom_closure: std::marker::PhantomData<&'a ()>, _phantom_data: std::marker::PhantomData<&'b ()>, _phantom_output: std::marker::PhantomData T>, } impl<'a, 'b, T> Matcher<'a, 'b, T> { fn new( matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult, T> + 'a), collector: &'a (dyn Fn(&mut dyn Iterator) -> Token<'b> + 'a), ) -> Self { Self { matcher_inner, collector, _phantom_closure: std::marker::PhantomData, _phantom_data: std::marker::PhantomData, _phantom_output: std::marker::PhantomData, } } } impl<'a, 'b> Matcher<'a, 'b, Infallible> { // Don't break this invariant, else a monster will come at night and eat all your socks fn reject() -> Self { Self { matcher_inner: &fail::<_, Infallible, _>, collector: &|_| unreachable!(), _phantom_closure: std::marker::PhantomData, _phantom_data: std::marker::PhantomData, _phantom_output: std::marker::PhantomData, } } } struct Context; impl Context { #[inline] const fn partial( &self, func: impl for<'a> Fn(&Self, Span<'a>) -> IResult, Token<'a>> + 'static, ) -> impl for<'a> Fn(Span<'a>) -> IResult, Token<'a>> + '_ { move |input| func(self, input) } fn full<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { map(many1(self.partial(Self::full_single)), Token::Sequence)(input) } fn inline<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { map(many1(self.partial(Self::inline_single)), Token::Sequence)(input) } fn inline_label_safe<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { map( many1(self.partial(Self::inline_label_safe_single)), Token::Sequence, )(input) } fn base_bold_italic<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { alt(( self.partial(Self::tag_bold_italic_asterisk), self.partial(Self::tag_bold_italic_underscore), self.partial(Self::tag_bold_asterisk), self.partial(Self::tag_italic_asterisk), self.partial(Self::tag_bold_underscore), self.partial(Self::tag_italic_underscore), ))(input) } fn full_single<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let (input, token) = alt(( self.partial(Self::unicode_emoji), alt(( self.partial(Self::tag_block_center), self.partial(Self::tag_small), self.partial(Self::tag_plain), self.partial(Self::tag_bold), self.partial(Self::tag_italic), self.partial(Self::tag_strikethrough), )), self.partial(Self::url_no_embed), self.partial(Self::base_bold_italic), self.partial(Self::tag_block_code), self.partial(Self::tag_inline_code), self.partial(Self::tag_quote), self.partial(Self::tag_block_math), self.partial(Self::tag_inline_math), self.partial(Self::tag_strikethrough_tilde), self.partial(Self::tag_func), self.partial(Self::tag_mention), self.partial(Self::tag_hashtag), self.partial(Self::shortcode_emoji), self.partial(Self::link), self.partial(Self::raw_url), self.partial(Self::text), ))(input)?; Ok((input, token)) } fn inline_single<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let (input, token) = alt(( self.partial(Self::unicode_emoji), self.partial(Self::tag_small), self.partial(Self::tag_plain), self.partial(Self::tag_bold), self.partial(Self::tag_italic), self.partial(Self::tag_strikethrough), self.partial(Self::url_no_embed), self.partial(Self::base_bold_italic), self.partial(Self::tag_inline_code), self.partial(Self::tag_inline_math), self.partial(Self::tag_strikethrough_tilde), self.partial(Self::tag_func), self.partial(Self::tag_mention), self.partial(Self::tag_hashtag), self.partial(Self::shortcode_emoji), self.partial(Self::link), self.partial(Self::raw_url), self.partial(Self::text), ))(input)?; Ok((input, token)) } fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let (input, token) = alt(( self.partial(Self::unicode_emoji), self.partial(Self::url_no_embed), self.partial(Self::tag_inline_code), self.partial(Self::tag_inline_math), self.partial(Self::tag_func), self.partial(Self::tag_mention), self.partial(Self::tag_hashtag), self.partial(Self::shortcode_emoji), self.partial(Self::raw_url), self.partial(Self::text), ))(input)?; Ok((input, token)) } fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let (input, token) = alt(( self.partial(Self::unicode_emoji), self.partial(Self::tag_small), self.partial(Self::tag_plain), self.partial(Self::tag_bold), self.partial(Self::tag_italic), self.partial(Self::tag_strikethrough), self.partial(Self::base_bold_italic), self.partial(Self::tag_strikethrough_tilde), self.partial(Self::tag_func), self.partial(Self::shortcode_emoji), self.partial(Self::text), ))(input)?; Ok((input, token)) } fn tag_quote<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?; if let (None, None) = leading_spaces { if input.get_column() != 1 { return fail(input); } } let quote_line = |input| tuple((tag(">"), opt(space), not_line_ending))(input); let orig_input = input; let (input, lines) = separated_list1(line_ending, quote_line)(input)?; let quote_lines = lines .into_iter() .map(|(_, _, text)| text) .collect::>(); if quote_lines.len() == 1 && quote_lines .iter() .map(Span::fragment) .copied() .any(&str::is_empty) { return fail(input); } let (_, inner) = spliced("e_lines, self.partial(Self::full), orig_input)?; let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?; Ok((input, Token::Quote(Box::new(inner)))) } fn tag_block_center<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let tag_start = &tag("
"); let tag_end = &tag("
"); let (input, _) = opt(line_ending)(input)?; if input.get_column() != 1 { return fail(input); } let (input, _) = tag_start(input)?; let (input, _) = opt(line_ending)(input)?; let (input, (center_seq, _)) = many_till( self.partial(Self::inline_single), tuple((opt(space1), opt(line_ending), tag_end)), )(input)?; Ok(( input, boxing_token(Token::Center)(Token::Sequence(center_seq)), )) } fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let delim = &tag("```"); let (input, _) = opt(line_ending)(input)?; if input.get_column() != 1 { return fail(input); } let (input, _) = delim(input)?; let (input, lang) = opt(map( recognize(many1(tuple((not(delim), not_line_ending)))), Span::into_fragment, ))(input)?; let (input, _) = line_ending(input)?; let (input, code) = map( recognize(many1_count(tuple(( not(tuple((line_ending, delim))), anychar, )))), Span::into_fragment, )(input)?; let (input, _) = line_ending(input)?; let (input, _) = delim(input)?; let (input, _) = many0(space)(input)?; let (input, _) = not(not(line_ending))(input)?; let (input, _) = opt(line_ending)(input)?; Ok(( input, Token::BlockCode { lang: lang.map(<&str>::into), inner: code.into(), }, )) } fn tag_block_math<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let start = &tag("\\["); let end = &tag("\\]"); let (input, _) = opt(line_ending)(input)?; if input.get_column() != 1 { return fail(input); } let (input, _) = start(input)?; let (input, _) = opt(line_ending)(input)?; let (input, math_span) = recognize(many1_count(tuple(( not(tuple((opt(line_ending), end))), not_line_ending, ))))(input)?; let (input, _) = opt(line_ending)(input)?; let (input, _) = end(input)?; let (input, _) = many0(space)(input)?; let (input, _) = not(not_line_ending)(input)?; let (input, _) = opt(line_ending)(input)?; Ok(( input, Token::BlockMath(Cow::Borrowed(math_span.into_fragment())), )) } #[inline] fn tag_delimited<'a, 'b: 'a, T, S>( &'a self, opening_tag: impl Fn(Span<'b>) -> IResult, Span<'b>> + 'a, closing_tag: impl Fn(Span<'b>) -> IResult, Span<'b>> + 'a, escape: bool, matcher: Matcher<'a, 'b, T>, fallback: Matcher<'a, 'b, S>, ) -> impl Fn(Span<'b>) -> IResult, Token<'b>> + '_ { move |input| { if escape { if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), &opening_tag))(input) { return Ok(( input_escaped, Token::PlainText(Cow::Borrowed(mark.fragment())), )); } } let begin = input; let (post_open, _) = opening_tag(input)?; let res = tuple(( many1(tuple((not(&closing_tag), &matcher.matcher_inner))), &closing_tag, ))(post_open); if let Err(nom::Err::Error(nom::error::Error { input: input_past_err, .. })) = res { let res_fallback = tuple(( many1(tuple((not(&closing_tag), &fallback.matcher_inner))), &closing_tag, ))(post_open); if res_fallback.is_err() { return Ok(( input_past_err, Token::PlainText(begin.fragment_between(&input_past_err).into()), )); } let (input, (inner, closing)) = res_fallback.unwrap(); let mut inner = inner.into_iter().map(|(_, t)| t); return Ok(( input, Token::Sequence(vec![ Token::PlainText(begin.fragment_between(&post_open).into()), ((fallback.collector)(&mut inner)), Token::PlainText(closing.into_fragment().into()), ]), )); } let (input, (inner, _)) = res?; let mut inner = inner.into_iter().map(|(_, t)| t); Ok((input, (matcher.collector)(&mut inner))) } } fn tag_func<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let (input, _) = tag("$[")(input)?; let func_ident = |input| { recognize(tuple(( many1_count(alt((alpha1, tag("_")))), many0_count(alt((alphanumeric1, tag("_")))), )))(input) }; let param_value = recognize(many1_count(alt(( alphanumeric1, tag("."), tag("-"), tag("_"), )))); let (input, func_name) = map(func_ident, Span::into_fragment)(input)?; let arg = tuple((func_ident, opt(tuple((tag("="), param_value))))); let (input, args) = opt(tuple((one_char('.'), separated_list1(one_char(','), arg))))(input)?; let args_out = args.map_or_else(HashMap::new, |(_, items)| { items .into_iter() .map(|(k, v)| { ( Cow::from(k.into_fragment()), v.map(|(_, val)| Cow::from(val.into_fragment())), ) }) .collect::>() }); let (input, _) = opt(space)(input)?; let (input, (inner, _)) = many_till(self.partial(Self::inline_single), tag("]"))(input)?; Ok(( input, Token::Function { name: Cow::from(func_name), params: args_out, inner: Box::new(Token::Sequence(inner)), }, )) } fn tag_plain<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let opening_tag = &tag(""); let closing_tag = &tag(""); let (input, _) = opening_tag(input)?; let (input, text) = map( recognize(many1(tuple((not_line_ending, not(closing_tag))))), Span::into_fragment, )(input)?; let (input, _) = closing_tag(input)?; Ok((input, Token::PlainTag(text.into()))) } fn tag_small<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( tag(""), tag(""), false, Matcher::new( &self.partial(Self::inline_single), &collect_sequence(Token::Sequence, boxing_token(Token::Small)), ), Matcher::new( &self.partial(Self::inline_non_formatting_single), &collect_sequence(Token::Sequence, identity), ), )(input) } // TODO: CommonMark flanking rules fn tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( tag("***"), tag("***"), true, Matcher::new( &self.partial(Self::inline_single), &collect_sequence(Token::Sequence, boxing_token(Token::BoldItalic)), ), Matcher::new( &self.partial(Self::inline_non_formatting_single), &collect_sequence(Token::Sequence, identity), ), )(input) } // TODO: CommonMark flanking rules fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( tag("___"), tag("___"), true, Matcher::new( &self.partial(Self::inline_single), &collect_sequence(Token::Sequence, boxing_token(Token::BoldItalic)), ), Matcher::new( &self.partial(Self::inline_non_formatting_single), &collect_sequence(Token::Sequence, identity), ), )(input) } fn tag_bold<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( tag(""), tag(""), false, Matcher::new( &self.partial(Self::inline_single), &collect_sequence(Token::Sequence, boxing_token(Token::Bold)), ), Matcher::new( &self.partial(Self::inline_non_formatting_single), &collect_sequence(Token::Sequence, identity), ), )(input) } // TODO: CommonMark flanking rules fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( tag("**"), tag("**"), true, Matcher::new( &self.partial(Self::inline_single), &collect_sequence(Token::Sequence, boxing_token(Token::Bold)), ), Matcher::new( &self.partial(Self::inline_non_formatting_single), &collect_sequence(Token::Sequence, identity), ), )(input) } // TODO: CommonMark flanking rules fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( tag("__"), tag("__"), true, Matcher::new( &self.partial(Self::inline_single), &collect_sequence(Token::Sequence, boxing_token(Token::Bold)), ), Matcher::new( &self.partial(Self::inline_non_formatting_single), &collect_sequence(Token::Sequence, identity), ), )(input) } fn tag_italic<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( tag(""), tag(""), false, Matcher::new( &self.partial(Self::inline_single), &collect_sequence(Token::Sequence, boxing_token(Token::Italic)), ), Matcher::new( &self.partial(Self::inline_non_formatting_single), &collect_sequence(Token::Sequence, identity), ), )(input) } // TODO: CommonMark flanking rules fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( tag("*"), tag("*"), true, Matcher::new( &self.partial(Self::inline_single), &collect_sequence(Token::Sequence, boxing_token(Token::Italic)), ), Matcher::new( &self.partial(Self::inline_non_formatting_single), &collect_sequence(Token::Sequence, identity), ), )(input) } // TODO: CommonMark flanking rules fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( tag("_"), tag("_"), true, Matcher::new( &self.partial(Self::inline_single), &collect_sequence(Token::Sequence, boxing_token(Token::Italic)), ), Matcher::new( &self.partial(Self::inline_non_formatting_single), &collect_sequence(Token::Sequence, identity), ), )(input) } fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( tag(""), tag(""), false, Matcher::new( &self.partial(Self::inline_single), &collect_sequence(Token::Sequence, boxing_token(Token::Strikethrough)), ), Matcher::new( &self.partial(Self::inline_non_formatting_single), &collect_sequence(Token::Sequence, identity), ), )(input) } // TODO: CommonMark flanking rules fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( tag("~~"), tag("~~"), true, Matcher::new( &move |input| { map( tuple(((not(line_ending)), self.partial(Self::inline_single))), |(_, captured)| captured, )(input) }, &collect_sequence(Token::Sequence, boxing_token(Token::Strikethrough)), ), Matcher::new( &move |input| { map( tuple(( (not(line_ending)), self.partial(Self::inline_non_formatting_single), )), |(_, captured)| captured, )(input) }, &collect_sequence(Token::Sequence, identity), ), )(input) } fn tag_inline_code<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( tag("`"), |input| alt((tag("`"), tag("´")))(input), true, Matcher::new( &move |input| { map( tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar)), |(_, captured)| captured, )(input) }, &collect_char_sequence(Token::InlineCode), ), Matcher::reject(), )(input) } fn tag_inline_math<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { self.tag_delimited( tag("\\("), tag("\\)"), false, Matcher::new( &move |input| { map(tuple((not(line_ending), anychar)), |(_, captured)| captured)(input) }, &collect_char_sequence(Token::InlineMath), ), Matcher::reject(), )(input) } fn text<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let (input, text) = map(recognize(anychar), Span::into_fragment)(input)?; Ok((input, Token::PlainText(text.into()))) } fn raw_url<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let (input, url_span) = recognize(tuple(( protocol, url_chars(|input| not(url_chars_base)(input), false), )))(input)?; let url = url_span.into_fragment(); let url_bytes = url.as_bytes(); // Strip punctuation at the end of sentences that might have been consumed as a part of the URL let final_url = if matches!(url_bytes.last(), Some(b'.' | b',' | b'?')) { url.slice(..url.len() - 1) } else { url }; Ok((input, Token::UrlRaw(Cow::from(final_url)))) } fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let (input, _) = tag("<")(input)?; let (input, url_span) = recognize(tuple((protocol, url_chars(tag(">"), true))))(input)?; let (input, _) = tag(">")(input)?; Ok(( input, Token::UrlNoEmbed(Cow::from(url_span.into_fragment())), )) } fn link<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let (input, no_embed) = opt(tag("?"))(input)?; let (input, _) = tag("[")(input)?; let (input, _) = not(tag("["))(input)?; let (input, (label_tok, _)) = many_till(self.partial(Self::inline_label_safe_single), tag("]("))(input)?; let (input, url_span) = recognize(tuple((protocol, url_chars(tag(")"), true))))(input)?; let (input, _) = tag(")")(input)?; Ok(( input, Token::Link { label: Box::new(Token::Sequence(label_tok)), href: url_span.into_fragment().into(), embed: no_embed.is_none(), }, )) } fn unicode_emoji<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { let frag = input.fragment(); let Some(grapheme) = frag.graphemes(true).next() else { return fail(input); }; let grapheme = grapheme.trim_end_matches(|c| c == '\u{200c}' || c == '\u{200d}'); let emoji = emojis::get(grapheme); if emoji.is_none() { return fail(input); } Ok(( input.slice(grapheme.len()..), Token::UnicodeEmoji(grapheme.into()), )) } fn shortcode_emoji<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { // TODO: Fail when preceded by alphanumerics let (input, _) = tag(":")(input)?; let (input, shortcode) = map( recognize(many1(alt((alphanumeric1, recognize(one_of("_+-")))))), Span::into_fragment, )(input)?; let (input, _) = tag(":")(input)?; let (input, _) = not(alphanumeric1)(input)?; Ok((input, Token::ShortcodeEmoji(shortcode.into()))) } fn tag_mention<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { // TODO: Escaping and skip when preceded by alphanumerics let tags = one_of("@!"); let (input, mention_type) = map(tags, |c| match c { '@' => MentionType::User, '!' => MentionType::Community, _ => unreachable!(), })(input)?; let (input, name) = map( recognize(many1(alt((alphanumeric1, recognize(one_of("-_")))))), Span::into_fragment, )(input)?; let before = input; let (_, host) = map( opt(tuple(( tag("@"), map( recognize(many1(alt((alphanumeric1, recognize(one_of("-_.")))))), Span::into_fragment, ), ))), |maybe_tag_host| maybe_tag_host.map(|(_, host)| host), )(input)?; let host = host.map(|h| h.trim_end_matches(|c| matches!(c, '.' | '-' | '_'))); Ok(( host.map(|c| before.slice(c.len() + 1..)).unwrap_or(before), Token::Mention { mention_type, name: name.into(), host: host.map(|h| h.into()), }, )) } fn tag_hashtag<'a>(&self, input: Span<'a>) -> IResult, Token<'a>> { // TODO: Skip when preceded by alphanumerics let (input, _) = tag("#")(input)?; let (input, hashtag_text) = map(recognize(many1(hashtag_chars)), Span::into_fragment)(input)?; Ok((input, Token::Hashtag(hashtag_text.into()))) } } #[inline] fn hashtag_chars(input: Span) -> IResult { recognize(alt(( recognize(tuple((tag("("), hashtag_chars, tag(")")))), recognize(tuple((tag("["), hashtag_chars, tag("]")))), recognize(tuple((tag("「"), hashtag_chars, tag("」")))), recognize(tuple((tag("("), hashtag_chars, tag(")")))), recognize(tuple(( not(space1), not_line_ending, not(one_of(".,:;!?#?/[]【】()「」()<>")), anychar, ))), )))(input) } #[inline] fn protocol(input: Span) -> IResult { alt((tag("https://"), tag("http://")))(input) } #[inline] fn url_chars_base(input: Span) -> IResult { recognize(alt(( alpha1, recognize(tuple((tag("["), many_till(url_chars_base, tag("]"))))), recognize(tuple((tag("("), many_till(url_chars_base, tag(")"))))), recognize(one_of(".,_/:%#$&?!~=+-@")), )))(input) } #[inline] fn url_chars<'a, T: 'a>( terminator: impl Fn(Span<'a>) -> IResult, T> + 'a, spaces: bool, ) -> impl FnMut(Span<'a>) -> IResult, Span<'a>> + 'a { let chars = tuple(( not(tuple((space1, eof))), not(tuple((space1, tag("\"")))), not(tuple((opt(space1), terminator))), alt((url_chars_base, if spaces { space1 } else { fail })), )); recognize(many1_count(chars)) } #[cfg(test)] mod test { use crate::{url_chars, Context, Span, Token}; use nom::bytes::complete::tag; use std::borrow::Cow; use std::collections::HashMap; fn parse_full(string: &str) -> Token { Context.full(Span::new(string)).unwrap().1.merged().owned() } #[test] fn parse_url_chars() { assert_eq!( url_chars(tag(")"), true)(Span::new( "https://en.wikipedia.org/wiki/Sandbox_(computer_security))" )) .unwrap() .1 .into_fragment(), "https://en.wikipedia.org/wiki/Sandbox_(computer_security)" ); assert_eq!( url_chars(tag(")"), true)(Span::new( "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))" )) .unwrap() .1 .into_fragment(), "https://en.wikipedia.org/wiki/Sandbox_(computer_security)", ); assert_eq!( url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among_Us ")) .unwrap() .1 .into_fragment(), "https://cs.wikipedia.org/wiki/Among_Us", ); assert_eq!( url_chars(tag(")"), true)(Span::new("https://cs.wikipedia.org/wiki/Among Us )")) .unwrap() .1 .into_fragment(), "https://cs.wikipedia.org/wiki/Among Us" ); assert_eq!( url_chars(tag(")"), false)(Span::new("https://en.wikipedia.org/wiki/Among Us )")) .unwrap() .1 .into_fragment(), "https://en.wikipedia.org/wiki/Among" ); } #[test] fn parse_formatting() { assert_eq!( parse_full(r#"~~stikethrough~~"#), Token::Strikethrough(Box::new(Token::PlainText("stikethrough".into()))), ); assert_eq!( parse_full(r#"**bold**"#), Token::Bold(Box::new(Token::PlainText("bold".into()))), ); assert_eq!( parse_full(r#"*italic*"#), Token::Italic(Box::new(Token::PlainText("italic".into()))), ); assert_eq!( parse_full(r#"not code `code` also not code"#), Token::Sequence(vec![ Token::PlainText("not code ".into()), Token::InlineCode("code".into()), Token::PlainText(" also not code".into()) ]), ); assert_eq!( parse_full(r#"not code `code` also `not code"#), Token::Sequence(vec![ Token::PlainText("not code ".into()), Token::InlineCode("code".into()), Token::PlainText(" also `not code".into()) ]), ); assert_eq!( parse_full(r#"not code `*not bold*` also not code"#), Token::Sequence(vec![ Token::PlainText("not code ".into()), Token::InlineCode("*not bold*".into()), Token::PlainText(" also not code".into()) ]), ); assert_eq!( parse_full(r#"***bold italic***"#), Token::BoldItalic(Box::new(Token::PlainText("bold italic".into()))) ); assert_eq!( parse_full(r#"bold italic"#), Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText( "bold italic".into() ))))) ); } #[test] fn parse_complex() { assert_eq!( parse_full( r#"
centered 🦋🏳️‍⚧️ text
"# ), Token::Center(Box::new(Token::Sequence(vec![ Token::PlainText("centered\n".into()), Token::UnicodeEmoji("🦋".into()), Token::UnicodeEmoji("🏳️‍⚧️".into()), Token::PlainText("\ntext".into()) ]))) ); assert_eq!( parse_full( r#">
centered > 👩🏽‍🤝‍👩🏼 > text
"# ), Token::Quote(Box::new(Token::Center(Box::new(Token::Sequence(vec![ Token::PlainText("centered\n".into()), Token::UnicodeEmoji("👩🏽‍🤝‍👩🏼".into()), Token::PlainText("\ntext".into()) ]))))), ); assert_eq!( parse_full(r#"$[x2 $[sparkle 🥺]💜$[spin.y,speed=5s ❤️]🦊]"#), Token::Function { name: "x2".into(), params: HashMap::new(), inner: Box::new(Token::Sequence(vec![ Token::Function { name: "sparkle".into(), params: HashMap::new(), inner: Box::new(Token::UnicodeEmoji("🥺".into())), }, Token::UnicodeEmoji("💜".into()), Token::Function { name: "spin".into(), params: { let mut params = HashMap::new(); params.insert("y".into(), None); params.insert("speed".into(), Some("5s".into())); params }, inner: Box::new(Token::UnicodeEmoji("❤️".into())), }, Token::UnicodeEmoji("🦊".into()), ])) }, ); assert_eq!( parse_full(r#"bold @tag1 @tag2 italic"#), Token::Sequence(vec![ Token::PlainText("bold ".into()), Token::Mention { mention_type: crate::MentionType::User, name: "tag1".into(), host: None }, Token::PlainText(" ".into()), Token::Mention { mention_type: crate::MentionType::User, name: "tag2".into(), host: None }, Token::PlainText(" italic".into()) ]), ); assert_eq!( parse_full( r#" > test > > italic > >> Nested quote "# ), Token::Quote(Box::new(Token::Sequence(vec![ Token::PlainText("test\n".into()), Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))), Token::Quote(Box::new(Token::PlainText("Nested quote".into()))) ]))), ); } #[test] fn parse_link() { assert_eq!( parse_full("Link test: [label](https://example.com)"), Token::Sequence(vec![ Token::PlainText("Link test: ".into()), Token::Link { label: Box::new(Token::PlainText("label".into())), href: "https://example.com".into(), embed: true } ]) ); assert_eq!( parse_full(""), Token::UrlNoEmbed("https://example.com".into()) ); // Adjacent links okay assert_eq!( parse_full(""), Token::Sequence(vec![ Token::UrlNoEmbed("https://example.com/".into()), Token::UrlNoEmbed("https://awawa.gay/".into()) ]) ); assert_eq!( parse_full("Link test: ?[label](https://awawa.gay)"), Token::Sequence(vec![ Token::PlainText("Link test: ".into()), Token::Link { label: Box::new(Token::PlainText("label".into())), href: "https://awawa.gay".into(), embed: false } ]) ); assert_eq!( parse_full("Link test: ?[label](https://awawa.gay)test"), Token::Sequence(vec![ Token::PlainText("Link test: ".into()), Token::Link { label: Box::new(Token::PlainText("label".into())), href: "https://awawa.gay".into(), embed: false }, Token::PlainText("test".into()) ]) ); assert_eq!( parse_full("Link test: (?[label](https://awawa.gay))"), Token::Sequence(vec![ Token::PlainText("Link test: (".into()), Token::Link { label: Box::new(Token::PlainText("label".into())), href: "https://awawa.gay".into(), embed: false }, Token::PlainText(")".into()) ]) ); assert_eq!( parse_full("Link test: ?[label](https://awawa.gay"), // Missing closing bracket Token::Sequence(vec![ Token::PlainText("Link test: ?[label](".into()), Token::UrlRaw("https://awawa.gay".into()), ]) ); } #[test] fn parse_mention() { assert_eq!( parse_full("@tag"), Token::Mention { mention_type: crate::MentionType::User, name: "tag".into(), host: None } ); assert_eq!( parse_full("hgsjlkdsa @tag fgahjsdkd"), Token::Sequence(vec![ Token::PlainText("hgsjlkdsa ".into()), Token::Mention { mention_type: crate::MentionType::User, name: "tag".into(), host: None }, Token::PlainText(" fgahjsdkd".into()) ]) ); assert_eq!( parse_full("hgsjlkdsa @tag@ fgahjsdkd"), Token::Sequence(vec![ Token::PlainText("hgsjlkdsa ".into()), Token::Mention { mention_type: crate::MentionType::User, name: "tag".into(), host: None }, Token::PlainText("@ fgahjsdkd".into()) ]) ); assert_eq!( parse_full("aaaa @tag@domain bbbbb"), Token::Sequence(vec![ Token::PlainText("aaaa ".into()), Token::Mention { mention_type: crate::MentionType::User, name: "tag".into(), host: Some("domain".into()) }, Token::PlainText(" bbbbb".into()) ]) ); assert_eq!( parse_full("test @tag@domain, test"), Token::Sequence(vec![ Token::PlainText("test ".into()), Token::Mention { mention_type: crate::MentionType::User, name: "tag".into(), host: Some("domain".into()) }, Token::PlainText(", test".into()) ]) ); assert_eq!( parse_full("test @tag@domain.gay. test"), Token::Sequence(vec![ Token::PlainText("test ".into()), Token::Mention { mention_type: crate::MentionType::User, name: "tag".into(), host: Some("domain.gay".into()) }, Token::PlainText(". test".into()) ]) ); assert_eq!( parse_full("test @tag@domain? test"), Token::Sequence(vec![ Token::PlainText("test ".into()), Token::Mention { mention_type: crate::MentionType::User, name: "tag".into(), host: Some("domain".into()) }, Token::PlainText("? test".into()) ]) ); assert_eq!( parse_full("test !tag@domain.com test"), Token::Sequence(vec![ Token::PlainText("test ".into()), Token::Mention { mention_type: crate::MentionType::Community, name: "tag".into(), host: Some("domain.com".into()) }, Token::PlainText(" test".into()) ]) ); } #[test] fn parse_emoji() { assert_eq!( parse_full("🥺💜❤️🦊"), Token::Sequence( vec!["🥺", "💜", "❤️", "🦊"] .into_iter() .map(<&str as Into>>::into) .map(Token::UnicodeEmoji) .collect::>() ) ); // Trans flag, ZWJ assert_eq!( parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}"), Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}".into()) ); assert_eq!( parse_full("\u{0200d}\u{1f3f3}\u{0fe0f}"), Token::Sequence(vec![ Token::PlainText("\u{0200d}".into()), // ZWJ Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag ]) ); // Trans flag, ZWNJ assert_eq!( parse_full("\u{1f3f3}\u{0fe0f}\u{0200c}\u{026a7}\u{0fe0f}"), Token::Sequence(vec![ Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag Token::PlainText("\u{0200c}".into()), // ZWNJ Token::UnicodeEmoji("\u{026a7}\u{0fe0f}".into()) // Trans symbol ]) ); assert_eq!( parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{0200d}\u{0200d}"), Token::Sequence(vec![ Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag Token::PlainText("\u{0200d}\u{0200d}\u{0200d}".into()), // ZWJ ]) ); } }