// magnetar/magnetar_mmm_parser/src/lib.rs
//
// NOTE: the original file carried a "confusable Unicode characters" warning;
// some non-ASCII string literals below may have been mangled in transit.
use either::Either;
use nom::branch::alt;
use nom::bytes::complete::tag;
use nom::character::complete::{
alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of, space1,
tab,
};
use nom::combinator::{eof, fail, map, not, opt, recognize};
use nom::error::ErrorKind;
use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1};
use nom::sequence::tuple;
use nom::{IResult, Offset, Slice};
use nom_locate::LocatedSpan;
use std::borrow::Cow;
use std::collections::HashMap;
use std::convert::identity;
use unicode_segmentation::UnicodeSegmentation;
/// Discriminates the two kinds of mentions the parser recognizes.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum MentionType {
    Community,
    User,
}

impl MentionType {
    /// Returns the sigil character that introduces this mention type
    /// (`!` for communities, `@` for users).
    pub fn to_char(&self) -> char {
        if let MentionType::Community = self {
            '!'
        } else {
            '@'
        }
    }
}
/// A node of the parsed MMM syntax tree.
///
/// Borrows from the source text wherever possible; [`Token::owned`] produces
/// a deep `'static` copy.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Token<'a> {
/// Literal, unformatted text.
PlainText(Cow<'a, str>),
/// An ordered list of child tokens.
Sequence(Vec<Token<'a>>),
/// A `> ...` quote block.
Quote(Box<Token<'a>>),
/// Shrunken text.
Small(Box<Token<'a>>),
/// Enlarged text.
Big(Box<Token<'a>>),
/// `***...***` / `___...___`.
BoldItalic(Box<Token<'a>>),
/// `**...**`, `__...__` or `<b>...</b>`.
Bold(Box<Token<'a>>),
/// `*...*`, `_..._` or `<i>...</i>`.
Italic(Box<Token<'a>>),
/// A `<center>...</center>` block.
Center(Box<Token<'a>>),
/// `~~...~~` or `<s>...</s>`.
Strikethrough(Box<Token<'a>>),
/// Text rendered verbatim, exempt from further MMM parsing.
PlainTag(Cow<'a, str>),
/// An inline code span.
InlineCode(Cow<'a, str>),
/// A `\(...\)` inline math span.
InlineMath(Cow<'a, str>),
/// A bare URL.
UrlRaw(Cow<'a, str>),
/// A URL wrapped in angle brackets; should not produce an embed.
UrlNoEmbed(Cow<'a, str>),
/// A `[label](href)` link; `embed` is false for the `?[...](...)` form.
Link {
label: Cow<'a, str>,
href: Cow<'a, str>,
embed: bool,
},
/// A fenced code block with an optional language tag on the fence line.
BlockCode {
lang: Option<Cow<'a, str>>,
inner: Cow<'a, str>,
},
/// A `\[...\]` math block.
BlockMath(Cow<'a, str>),
/// An MFM function: `$[name.key,key=value inner]`.
Function {
name: Cow<'a, str>,
params: HashMap<Cow<'a, str>, Option<Cow<'a, str>>>,
inner: Box<Token<'a>>,
},
/// An `@user[@host]` or `!community[@host]` mention.
Mention {
name: Cow<'a, str>,
host: Option<Cow<'a, str>>,
mention_type: MentionType,
},
/// A literal Unicode emoji grapheme cluster.
UnicodeEmoji(Cow<'a, str>),
/// A `:shortcode:` emoji.
ShortcodeEmoji(Cow<'a, str>),
/// A `#hashtag`.
Hashtag(Cow<'a, str>),
}
impl Token<'_> {
    /// Deep-copies the token tree, promoting every borrowed string to an
    /// owned one so the result has a `'static` lifetime.
    fn owned(&self) -> Token<'static> {
        // Clones a Cow string into a fully owned, 'static one.
        fn own(text: &Cow<'_, str>) -> Cow<'static, str> {
            Cow::Owned(text.as_ref().to_owned())
        }
        match self {
            Token::PlainText(text) => Token::PlainText(own(text)),
            Token::Sequence(tokens) => Token::Sequence(tokens.iter().map(Token::owned).collect()),
            Token::Quote(inner) => Token::Quote(Box::new(inner.owned())),
            Token::Small(inner) => Token::Small(Box::new(inner.owned())),
            Token::Big(inner) => Token::Big(Box::new(inner.owned())),
            Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.owned())),
            Token::Bold(inner) => Token::Bold(Box::new(inner.owned())),
            Token::Italic(inner) => Token::Italic(Box::new(inner.owned())),
            Token::Center(inner) => Token::Center(Box::new(inner.owned())),
            Token::Strikethrough(inner) => Token::Strikethrough(Box::new(inner.owned())),
            Token::PlainTag(tag) => Token::PlainTag(own(tag)),
            Token::InlineCode(code) => Token::InlineCode(own(code)),
            Token::InlineMath(math) => Token::InlineMath(own(math)),
            Token::UrlRaw(url) => Token::UrlRaw(own(url)),
            Token::UrlNoEmbed(url) => Token::UrlNoEmbed(own(url)),
            Token::Link { embed, label, href } => Token::Link {
                embed: *embed,
                label: own(label),
                href: own(href),
            },
            Token::BlockCode { inner, lang } => Token::BlockCode {
                lang: lang.as_ref().map(own),
                inner: own(inner),
            },
            Token::BlockMath(math) => Token::BlockMath(own(math)),
            Token::Function {
                name,
                params,
                inner,
            } => Token::Function {
                name: own(name),
                params: params
                    .iter()
                    .map(|(k, v)| (own(k), v.as_ref().map(own)))
                    .collect(),
                inner: Box::new(inner.owned()),
            },
            Token::Mention {
                name,
                host,
                mention_type,
            } => Token::Mention {
                name: own(name),
                host: host.as_ref().map(own),
                mention_type: *mention_type,
            },
            Token::UnicodeEmoji(code) => Token::UnicodeEmoji(own(code)),
            Token::ShortcodeEmoji(shortcode) => Token::ShortcodeEmoji(own(shortcode)),
            Token::Hashtag(tag) => Token::Hashtag(own(tag)),
        }
    }

    /// Normalizes the tree: flattens nested [`Token::Sequence`]s one level,
    /// fuses adjacent [`Token::PlainText`] tokens, and unwraps a sequence of
    /// exactly one element. Non-sequence containers recurse into their inner
    /// token.
    fn merged(&self) -> Token {
        match self {
            Token::Sequence(tokens) => {
                let tokens_multi = tokens.iter().fold(Vec::new(), |mut acc, tok| {
                    if let Some(Token::PlainText(last)) = acc.last_mut() {
                        if let Token::PlainText(tok_text) = tok {
                            // Append in place instead of rebuilding the whole
                            // string each time (the previous
                            // `to_string() + ...` made repeated fusion
                            // accidentally O(n^2)).
                            last.to_mut().push_str(tok_text.as_ref());
                            return acc;
                        }
                    }
                    if let Token::Sequence(seq) = tok {
                        // Merge the child sequence first, then splice its
                        // items (flattening one level of nesting).
                        let items = seq.iter().map(Token::merged).flat_map(|t| match t {
                            Token::Sequence(seq) => Either::Left(seq.into_iter()),
                            other => Either::Right(std::iter::once(other)),
                        });
                        for item in items {
                            if let Some(Token::PlainText(last)) = acc.last_mut() {
                                if let Token::PlainText(tok_text) = item {
                                    last.to_mut().push_str(tok_text.as_ref());
                                    continue;
                                }
                            }
                            acc.push(item);
                        }
                        return acc;
                    }
                    acc.push(tok.merged());
                    acc
                });
                // A sequence of one collapses to its only element.
                if tokens_multi.len() == 1 {
                    return tokens_multi.into_iter().next().unwrap();
                }
                Token::Sequence(tokens_multi)
            }
            Token::Quote(inner) => Token::Quote(Box::new(inner.merged())),
            Token::Small(inner) => Token::Small(Box::new(inner.merged())),
            Token::Big(inner) => Token::Big(Box::new(inner.merged())),
            Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.merged())),
            Token::Bold(inner) => Token::Bold(Box::new(inner.merged())),
            Token::Italic(inner) => Token::Italic(Box::new(inner.merged())),
            Token::Center(inner) => Token::Center(Box::new(inner.merged())),
            Token::Strikethrough(inner) => Token::Strikethrough(Box::new(inner.merged())),
            Token::Function {
                name,
                params,
                inner,
            } => Token::Function {
                name: name.clone(),
                params: params.clone(),
                inner: Box::new(inner.merged()),
            },
            other => other.clone(),
        }
    }
}
/// The span type used by all parsers: a `&str` with position tracking.
type Span<'a> = LocatedSpan<&'a str>;
/// Helpers for slicing a span relative to another span over the same buffer.
trait SliceOffset {
/// Returns the prefix of `self` that ends where `other` begins.
fn up_to(&self, other: &Self) -> Self;
/// Returns the text between the start of `self` and the start of `other`.
/// NOTE(review): `other` must point into the same underlying buffer as
/// `self`, otherwise the computed offset is meaningless — confirm all
/// call sites uphold this.
fn fragment_between<'a>(&self, other: &Self) -> &'a str
where
Self: 'a;
}
impl SliceOffset for Span<'_> {
fn up_to(&self, other: &Self) -> Self {
self.slice(..self.offset(other))
}
fn fragment_between<'a>(&self, other: &Self) -> &'a str
where
Self: 'a,
{
self.up_to(other).into_fragment()
}
}
/// Adapts a `Box<Token> -> Token` constructor into one that accepts a bare
/// `Token`, boxing it on the way in.
#[inline]
fn boxing_token<'a>(func: impl Fn(Box<Token<'a>>) -> Token<'a>) -> impl Fn(Token<'a>) -> Token<'a> {
    move |token| {
        let boxed = Box::new(token);
        func(boxed)
    }
}
/// Adapts a `Cow<str> -> Token` constructor into one that accepts the
/// `Vec<char>` produced by character-level parsers.
#[inline]
fn collect_char_sequence<'a>(
    func: impl Fn(Cow<'a, str>) -> Token<'a>,
) -> impl Fn(Vec<char>) -> Token<'a> {
    move |chars| {
        let text: String = chars.into_iter().collect();
        func(Cow::Owned(text))
    }
}
/// Re-parses a set of line fragments (e.g. the stripped bodies of quote
/// lines) as one contiguous document, then maps the resulting position back
/// into the original `parent` span.
///
/// The segments are joined with `\n` before being handed to `func`; the
/// returned token is deep-copied to `'static` because it borrows from the
/// temporary joined string.
fn spliced<'a>(
segments: &[Span<'a>],
func: impl Fn(Span) -> IResult<Span, Token>,
parent: Span<'a>,
) -> IResult<Span<'a>, Token<'static>, nom::error::Error<Span<'a>>> {
let combined = segments
.iter()
.copied()
.map(Span::into_fragment)
.collect::<Vec<_>>()
.join("\n");
// Cumulative end offsets of each segment inside `combined`.
// NOTE(review): these sums do not account for the `\n` separators added by
// `join`, so offsets drift by one per preceding segment — confirm whether
// the position mapping below is intentionally approximate.
let cum_offset_combined = segments
.iter()
.scan(0, |acc, &x| {
*acc += x.len();
Some(*acc)
})
.collect::<Vec<_>>();
// Locates the segment corresponding to an offset into `combined`.
// NOTE(review): `take_while(o > offset).last()` keeps a *prefix* of
// segments whose end offset exceeds the input offset and then takes the
// last of them — verify this selects the intended segment.
let current_seg = |input: Span| {
cum_offset_combined
.iter()
.enumerate()
.take_while(|(_, &o)| o > input.location_offset())
.map(|(i, o)| (segments[i], o))
.last()
};
type NE<E> = nom::Err<E>;
type NomError<'x> = nom::error::Error<Span<'x>>;
let quote_span = Span::new(&combined);
let (input, inner) = match func(quote_span) {
Ok((input, token)) => (input, token.owned()),
Err(e) => {
// Translate error positions from `combined` back into `parent` so
// callers see a location in the real input.
return match e {
NE::Error(e) => {
let offset_new = e.input.location_offset();
if let Some((seg_parent, offset_seg_new)) = current_seg(e.input) {
let offset = offset_new - offset_seg_new;
let offset_orig = offset + seg_parent.location_offset();
Err(NE::Error(NomError::new(
Span::new(&parent.into_fragment()[offset_orig..]),
e.code,
)))
} else {
// ???
Err(NE::Failure(NomError::new(parent, ErrorKind::Fail)))
}
}
NE::Failure(e) => Err(NE::Error(NomError::new(parent, e.code))),
NE::Incomplete(i) => Err(NE::Incomplete(i)),
};
}
};
// Map the unconsumed remainder back into the parent span as well.
let out = if let Some((seg_parent, offset_seg_new)) = current_seg(input) {
let offset = input.location_offset() - offset_seg_new;
let offset_orig = offset + seg_parent.location_offset();
parent.slice(offset_orig..)
} else {
parent
};
Ok((out, inner.owned()))
}
fn space(input: Span) -> IResult<Span, Token> {
let (input, frag) = recognize(alt((one_char('\u{0020}'), one_char('\u{3000}'), tab)))(input)?;
Ok((input, Token::PlainText(frag.into_fragment().into())))
}
/// Zero-sized parser context; all MMM combinators hang off it as methods.
struct Context;
impl Context {
/// Binds `self` to a two-argument parser method, producing a plain
/// `Fn(Span) -> IResult` closure that can be handed to nom combinators.
#[inline]
const fn partial(
&self,
func: impl for<'a> Fn(&Self, Span<'a>) -> IResult<Span<'a>, Token<'a>> + 'static,
) -> impl for<'a> Fn(Span<'a>) -> IResult<Span<'a>, Token<'a>> + '_ {
move |input| func(self, input)
}
/// Parses a full document: one or more block- or inline-level tokens.
fn full<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
map(many1(self.partial(Self::full_single)), Token::Sequence)(input)
}
/// Parses inline content: one or more inline-level tokens (no blocks).
fn inline<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
map(many1(self.partial(Self::inline_single)), Token::Sequence)(input)
}
/// Parses inline content that is safe inside a link label (no URLs, links
/// or mentions).
fn inline_label_safe<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
map(
many1(self.partial(Self::inline_label_safe_single)),
Token::Sequence,
)(input)
}
/// Tries the Markdown-style bold/italic delimiters, longest variants first
/// so `***` is not consumed as `*` + `**`.
fn base_bold_italic<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
alt((
self.partial(Self::tag_bold_italic_asterisk),
self.partial(Self::tag_bold_italic_underscore),
self.partial(Self::tag_bold_asterisk),
self.partial(Self::tag_italic_asterisk),
self.partial(Self::tag_bold_underscore),
self.partial(Self::tag_italic_underscore),
))(input)
}
/// One token at document level. Order matters: block constructs are tried
/// before inline ones, and `text` (single char) is the catch-all fallback.
fn full_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let (input, token) = alt((
self.partial(Self::unicode_emoji),
self.partial(Self::tag_block_center),
self.partial(Self::tag_small),
self.partial(Self::tag_plain),
self.partial(Self::tag_bold),
self.partial(Self::tag_italic),
self.partial(Self::tag_strikethrough),
self.partial(Self::url_no_embed),
self.partial(Self::base_bold_italic),
self.partial(Self::tag_block_code),
self.partial(Self::tag_inline_code),
self.partial(Self::tag_quote),
self.partial(Self::tag_block_math),
self.partial(Self::tag_inline_math),
self.partial(Self::tag_strikethrough_tilde),
self.partial(Self::tag_func),
self.partial(Self::tag_mention),
self.partial(Self::tag_hashtag),
self.partial(Self::shortcode_emoji),
self.partial(Self::raw_url),
self.partial(Self::text),
))(input)?;
Ok((input, token))
}
/// One token at inline level: like `full_single` but without block
/// constructs (center, code/math blocks, quotes).
fn inline_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let (input, token) = alt((
self.partial(Self::unicode_emoji),
self.partial(Self::tag_small),
self.partial(Self::tag_plain),
self.partial(Self::tag_bold),
self.partial(Self::tag_italic),
self.partial(Self::tag_strikethrough),
self.partial(Self::url_no_embed),
self.partial(Self::base_bold_italic),
self.partial(Self::tag_inline_code),
self.partial(Self::tag_inline_math),
self.partial(Self::tag_strikethrough_tilde),
self.partial(Self::tag_func),
self.partial(Self::tag_mention),
self.partial(Self::tag_hashtag),
self.partial(Self::shortcode_emoji),
self.partial(Self::raw_url),
self.partial(Self::text),
))(input)?;
Ok((input, token))
}
/// One inline token with all formatting (bold/italic/etc.) disabled; used
/// as the fallback matcher inside formatting spans.
fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let (input, token) = alt((
self.partial(Self::unicode_emoji),
self.partial(Self::url_no_embed),
self.partial(Self::tag_inline_code),
self.partial(Self::tag_inline_math),
self.partial(Self::tag_func),
self.partial(Self::tag_mention),
self.partial(Self::tag_hashtag),
self.partial(Self::shortcode_emoji),
self.partial(Self::raw_url),
self.partial(Self::text),
))(input)?;
Ok((input, token))
}
/// One inline token that is safe inside a link label: formatting and
/// emoji only — no URLs, links, mentions or hashtags.
fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let (input, token) = alt((
self.partial(Self::unicode_emoji),
self.partial(Self::tag_small),
self.partial(Self::tag_plain),
self.partial(Self::tag_bold),
self.partial(Self::tag_italic),
self.partial(Self::tag_strikethrough),
self.partial(Self::base_bold_italic),
self.partial(Self::tag_strikethrough_tilde),
self.partial(Self::tag_func),
self.partial(Self::shortcode_emoji),
self.partial(Self::text),
))(input)?;
Ok((input, token))
}
/// Parses a quote block: one or more consecutive `> ...` lines. The
/// stripped line bodies are re-parsed as a full document via `spliced`, so
/// quotes may contain any other construct (including nested quotes).
fn tag_quote<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
// Accept up to two leading line endings; otherwise the quote must start
// at the beginning of a line (column 1).
let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?;
if let (None, None) = leading_spaces {
if input.get_column() != 1 {
return fail(input);
}
}
// One quote line: `>`, optional single space, rest of the line.
let quote_line = |input| tuple((tag(">"), opt(space), not_line_ending))(input);
let orig_input = input;
let (input, lines) = separated_list1(line_ending, quote_line)(input)?;
let quote_lines = lines
.into_iter()
.map(|(_, _, text)| text)
.collect::<Vec<_>>();
// Reject a quote consisting of exactly one empty line (bare `>`).
if quote_lines.len() == 1
&& quote_lines
.iter()
.map(Span::fragment)
.copied()
.any(&str::is_empty)
{
return fail(input);
}
// Re-parse the collected line bodies as one contiguous document.
let (_, inner) = spliced(&quote_lines, self.partial(Self::full), orig_input)?;
let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?;
Ok((input, Token::Quote(Box::new(inner))))
}
/// Parses a `<center>...</center>` block. The opening tag must start at
/// column 1; after the closing tag only spaces and a line ending may follow.
fn tag_block_center<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let tag_start = &tag("<center>");
let tag_end = &tag("</center>");
let (input, _) = opt(line_ending)(input)?;
if input.get_column() != 1 {
return fail(input);
}
let (input, _) = tag_start(input)?;
let (input, _) = opt(line_ending)(input)?;
// Inline content until the closing tag (optionally on its own line).
let (input, (center_seq, _)) = many_till(
self.partial(Self::inline_single),
tuple((opt(line_ending), tag_end)),
)(input)?;
let (input, _) = many0(space)(input)?;
// Double negation: succeed only if a line ending follows, without
// consuming it. NOTE(review): this makes a trailing newline mandatory
// even at end of input — confirm that is intended.
let (input, _) = not(not(line_ending))(input)?;
let (input, _) = opt(line_ending)(input)?;
Ok((
input,
boxing_token(Token::Center)(Token::Sequence(center_seq)),
))
}
/// Parses a fenced code block (triple-backtick fences) with an optional
/// language tag on the opening fence line. The content is kept verbatim.
fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let delim = &tag("```");
let (input, _) = opt(line_ending)(input)?;
// The opening fence must start at column 1.
if input.get_column() != 1 {
return fail(input);
}
let (input, _) = delim(input)?;
// Optional language tag: the remainder of the fence line.
let (input, lang) = opt(map(
recognize(many1(tuple((not(delim), not_line_ending)))),
Span::into_fragment,
))(input)?;
let (input, _) = line_ending(input)?;
// Everything up to a line starting with the closing fence.
let (input, code) = map(
recognize(many1_count(tuple((
not(tuple((line_ending, delim))),
anychar,
)))),
Span::into_fragment,
)(input)?;
let (input, _) = line_ending(input)?;
let (input, _) = delim(input)?;
let (input, _) = many0(space)(input)?;
// Require (but do not consume) a line ending after the closing fence.
let (input, _) = not(not(line_ending))(input)?;
let (input, _) = opt(line_ending)(input)?;
Ok((
input,
Token::BlockCode {
lang: lang.map(<&str>::into),
inner: code.into(),
},
))
}
/// Parses a display-math block delimited by `\[` and `\]`, each optionally
/// on its own line. The content is kept verbatim.
fn tag_block_math<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let start = &tag("\\[");
    let end = &tag("\\]");
    let (input, _) = opt(line_ending)(input)?;
    // The opening delimiter must start at column 1.
    if input.get_column() != 1 {
        return fail(input);
    }
    let (input, _) = start(input)?;
    let (input, _) = opt(line_ending)(input)?;
    // Content up to (but excluding) the closing `\]`, which may be
    // preceded by a line ending.
    let (input, math_span) = recognize(many1_count(tuple((
        not(tuple((opt(line_ending), end))),
        not_line_ending,
    ))))(input)?;
    let (input, _) = opt(line_ending)(input)?;
    let (input, _) = end(input)?;
    let (input, _) = many0(space)(input)?;
    // Fix: this was `not(not_line_ending)`, which always fails — nom's
    // `not_line_ending` succeeds (possibly with an empty match) on any
    // input, so the parser could never accept a block. The double-negated
    // lookahead below requires a line ending without consuming it, matching
    // the equivalent checks in `tag_block_center` and `tag_block_code`.
    let (input, _) = not(not(line_ending))(input)?;
    let (input, _) = opt(line_ending)(input)?;
    Ok((
        input,
        Token::BlockMath(Cow::Borrowed(math_span.into_fragment())),
    ))
}
/// Generic delimited-span parser used by the formatting tags.
///
/// Runs `matcher_inner` between `start` and `end`. If that fails to reach
/// the closing delimiter, it retries with `matcher_inner_fallback`; if even
/// that fails, the consumed prefix degrades gracefully to plain text instead
/// of producing a parse error. With `escape`, a backslash before `start`
/// yields the delimiter itself as plain text.
#[inline]
fn tag_delimited<'a, 'b: 'a, T>(
&'a self,
start: &'b str,
end: &'b str,
escape: bool,
matcher_inner: impl Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a,
matcher_inner_fallback: impl Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a,
collector: impl Fn(Vec<T>) -> Token<'b> + 'a,
mapper: impl Fn(Token<'b>) -> Token<'b> + 'a,
) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token<'b>> + '_ {
move |input| {
let opening_tag = &tag(start);
let closing_tag = &tag(end);
if escape {
// `\<start>` is emitted literally rather than opening a span.
if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), opening_tag))(input) {
return Ok((input_escaped, Token::PlainText(Cow::Borrowed(&mark))));
}
}
let begin = input;
let (post_open, _) = opening_tag(input)?;
// Primary attempt: inner matcher until the closing delimiter.
let res = tuple((
many1(tuple((not(closing_tag), &matcher_inner))),
closing_tag,
))(post_open);
if let Err(nom::Err::Error(nom::error::Error {
input: input_past_err,
..
})) = res
{
// Primary matcher failed; retry with the fallback matcher.
let res_fallback = tuple((
many1(tuple((not(closing_tag), &matcher_inner_fallback))),
closing_tag,
))(post_open);
if res_fallback.is_err() {
// No closing delimiter found at all: emit everything consumed
// so far as plain text.
return Ok((
input_past_err,
Token::PlainText(begin.fragment_between(&input_past_err).into()),
));
}
// Fallback succeeded: emit the opening delimiter as text, the
// collected inner tokens, and the closing delimiter as text.
let (input, (inner, closing)) = res_fallback.unwrap();
let inner = inner.into_iter().map(|(_, t)| t).collect::<Vec<_>>();
return Ok((
input,
Token::Sequence(vec![
Token::PlainText(begin.fragment_between(&post_open).into()),
collector(inner),
Token::PlainText(closing.into_fragment().into()),
]),
));
}
let (input, (inner, _)) = res?;
let inner = inner.into_iter().map(|(_, t)| t).collect::<Vec<_>>();
Ok((input, mapper(collector(inner))))
}
}
/// Parses an MFM function: `$[name.key1,key2=value inner]`, where the
/// parameter list and the leading space before the body are optional.
fn tag_func<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let (input, _) = tag("$[")(input)?;
// Identifier: starts with letters or underscores, continues with
// alphanumerics or underscores.
let func_ident = |input| {
recognize(tuple((
many1_count(alt((alpha1, tag("_")))),
many0_count(alt((alphanumeric1, tag("_")))),
)))(input)
};
// Parameter values may contain dots, dashes and underscores
// (e.g. "5s", "0.5").
let param_value = recognize(many1_count(alt((
alphanumeric1,
tag("."),
tag("-"),
tag("_"),
))));
let (input, func_name) = map(func_ident, Span::into_fragment)(input)?;
// Optional `.key[,key=value]…` parameter list.
let arg = tuple((func_ident, opt(tuple((tag("="), param_value)))));
let (input, args) =
opt(tuple((one_char('.'), separated_list1(one_char(','), arg))))(input)?;
// Valueless keys map to None.
let args_out = args.map_or_else(HashMap::new, |(_, items)| {
items
.into_iter()
.map(|(k, v)| {
(
Cow::from(k.into_fragment()),
v.map(|(_, val)| Cow::from(val.into_fragment())),
)
})
.collect::<HashMap<_, _>>()
});
let (input, _) = opt(space)(input)?;
// Everything up to the closing `]` is the function body.
let (input, (inner, _)) = many_till(self.partial(Self::inline_single), tag("]"))(input)?;
Ok((
input,
Token::Function {
name: Cow::from(func_name),
params: args_out,
inner: Box::new(Token::Sequence(inner)),
},
))
}
fn tag_plain<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let opening_tag = &tag("<small>");
let closing_tag = &tag("</small>");
let (input, _) = opening_tag(input)?;
let (input, text) = map(
recognize(many1(tuple((not_line_ending, not(closing_tag))))),
Span::into_fragment,
)(input)?;
let (input, _) = closing_tag(input)?;
Ok((input, Token::PlainTag(text.into())))
}
/// `<small>...</small>` — shrunken text.
fn tag_small<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
"<small>",
"</small>",
false,
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Small),
)(input)
}
/// `***...***` — bold italic.
// TODO: CommonMark flanking rules
fn tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
"***",
"***",
true,
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::BoldItalic),
)(input)
}
/// `___...___` — bold italic.
// TODO: CommonMark flanking rules
fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
"___",
"___",
true,
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::BoldItalic),
)(input)
}
/// `<b>...</b>` — bold.
fn tag_bold<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
"<b>",
"</b>",
false,
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Bold),
)(input)
}
/// `**...**` — bold.
// TODO: CommonMark flanking rules
fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
"**",
"**",
true,
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Bold),
)(input)
}
/// `__...__` — bold.
// TODO: CommonMark flanking rules
fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
"__",
"__",
true,
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Bold),
)(input)
}
/// `<i>...</i>` — italic.
fn tag_italic<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
"<i>",
"</i>",
false,
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Italic),
)(input)
}
/// `*...*` — italic.
// TODO: CommonMark flanking rules
fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
"*",
"*",
true,
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Italic),
)(input)
}
/// `_..._` — italic.
// TODO: CommonMark flanking rules
fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
"_",
"_",
true,
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Italic),
)(input)
}
/// `<s>...</s>` — strikethrough.
fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
"<s>",
"</s>",
false,
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Strikethrough),
)(input)
}
/// `~~...~~` — strikethrough; the inner content may not span lines.
// TODO: CommonMark flanking rules
fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
"~~",
"~~",
true,
move |input| {
tuple((not_line_ending, self.partial(Self::inline_single)))(input)
.map(|(i, t)| (i, t.1))
},
move |input| {
tuple((
not_line_ending,
self.partial(Self::inline_non_formatting_single),
))(input)
.map(|(i, t)| (i, t.1))
},
Token::Sequence,
boxing_token(Token::Strikethrough),
)(input)
}
/// Parses an inline code span delimited by backticks. The content may not
/// contain backticks, acute accents, or line endings.
fn tag_inline_code<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    // Fix: the closing delimiter was the empty string (a character lost to
    // the file's Unicode mangling). An empty `end` makes `tag_delimited`'s
    // closing tag match immediately, so the span could never capture
    // anything; restore the backtick. NOTE(review): the inner matcher also
    // rejects U+00B4 (´) — confirm the intended closing delimiter against
    // the upstream grammar.
    self.tag_delimited(
        "`",
        "`",
        true,
        move |input| {
            tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar))(input)
                .map(|(i, (_skip, c))| (i, c))
        },
        fail,
        collect_char_sequence(Token::InlineCode),
        identity,
    )(input)
}
/// Parses an inline math span: `\(...\)` on a single line.
fn tag_inline_math<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    // Fix: this previously collected into `Token::InlineCode`, mislabeling
    // math as code and leaving the `Token::InlineMath` variant unused.
    self.tag_delimited(
        "\\(",
        "\\)",
        false,
        move |input| tuple((not(line_ending), anychar))(input).map(|(i, (_skip, c))| (i, c)),
        fail,
        collect_char_sequence(Token::InlineMath),
        identity,
    )(input)
}
/// Catch-all fallback: consumes exactly one character as plain text.
fn text<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let (rest, ch_span) = recognize(anychar)(input)?;
    let fragment = ch_span.into_fragment();
    Ok((rest, Token::PlainText(fragment.into())))
}
/// Parses a bare `http(s)://...` URL, trimming one trailing `.`/`,`/`?`
/// that most likely belongs to the surrounding sentence, not the URL.
fn raw_url<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let (input, url_span) = recognize(tuple((
        protocol,
        url_chars(|input| not(url_chars_base)(input), false),
    )))(input)?;
    let url = url_span.into_fragment();
    // Strip sentence punctuation that was consumed as part of the URL.
    let final_url = match url.as_bytes().last() {
        Some(b'.' | b',' | b'?') => url.slice(..url.len() - 1),
        _ => url,
    };
    Ok((input, Token::UrlRaw(Cow::from(final_url))))
}
/// Parses an angle-bracketed URL (`<https://...>`), which should be shown
/// without generating an embed.
fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let (input, _) = tag("<")(input)?;
    let (input, url_span) = recognize(tuple((protocol, url_chars(tag(">"), true))))(input)?;
    let (input, _) = tag(">")(input)?;
    // Fix: this returned `Token::UrlRaw`, silently dropping the no-embed
    // semantics and leaving the `Token::UrlNoEmbed` variant unused.
    Ok((input, Token::UrlNoEmbed(Cow::from(url_span.into_fragment()))))
}
/// Parses `[label](url)` and the no-embed variant `?[label](url)`.
///
/// NOTE(review): this parser is not referenced by any of the `*_single`
/// dispatch lists above — confirm whether it should be wired into
/// `inline_single`/`full_single`.
fn link<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let (input, no_embed) = opt(tag("?"))(input)?;
    let (input, _) = tag("[")(input)?;
    let (input, _) = not(tag("["))(input)?;
    // Label: label-safe inline tokens up to the `](` separator.
    let (input, label_span) = recognize(many1(tuple((
        not(tag("](")),
        self.partial(Self::inline_label_safe_single),
    ))))(input)?;
    let (input, _) = tag("]")(input)?;
    let (input, _) = tag("(")(input)?;
    // Fix: the URL terminator lookahead was `tag("]")`, but the URL here is
    // closed by `)`. Since `)` is in the URL character set, `url_chars`
    // consumed it and the trailing `tag(")")` could never match; use the
    // actual closing delimiter, mirroring `url_no_embed`.
    let (input, url_span) = recognize(tuple((protocol, url_chars(tag(")"), true))))(input)?;
    let (input, _) = tag(")")(input)?;
    Ok((
        input,
        Token::Link {
            label: label_span.into_fragment().into(),
            href: url_span.into_fragment().into(),
            embed: no_embed.is_none(),
        },
    ))
}
/// Parses a single Unicode emoji: takes the first extended grapheme cluster
/// and accepts it only if the `emojis` database knows it.
fn unicode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let mut graphemes = input.fragment().graphemes(true);
    match graphemes.next() {
        Some(grapheme) if emojis::get(grapheme).is_some() => Ok((
            input.slice(grapheme.len()..),
            Token::UnicodeEmoji(grapheme.into()),
        )),
        _ => fail(input),
    }
}
/// Parses a `:shortcode:` emoji; the code may contain alphanumerics and
/// `_`, `+`, `-`, and must not be immediately followed by an alphanumeric.
fn shortcode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    // TODO: Fail when preceded by alphanumerics
    let shortcode_char = alt((alphanumeric1, recognize(one_of("_+-"))));
    let (input, _) = tag(":")(input)?;
    let (input, code) = map(recognize(many1(shortcode_char)), Span::into_fragment)(input)?;
    let (input, _) = tag(":")(input)?;
    let (input, _) = not(alphanumeric1)(input)?;
    Ok((input, Token::ShortcodeEmoji(code.into())))
}
/// Parses a mention: `@user`, `!community`, optionally followed by `@host`.
fn tag_mention<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
// TODO: Escaping and skip when preceded by alphanumerics
let tags = one_of("@!");
// The sigil selects the mention type.
let (input, mention_type) = map(tags, |c| match c {
'@' => MentionType::User,
'!' => MentionType::Community,
_ => unreachable!(),
})(input)?;
// Local part: alphanumerics plus `-` and `_`.
let (input, name) = map(
recognize(many1(alt((alphanumeric1, recognize(one_of("-_")))))),
Span::into_fragment,
)(input)?;
// Optional `@host` part. NOTE(review): dots are not accepted here, so a
// multi-label hostname like `example.com` only matches up to the first
// label — confirm whether `.` should be allowed.
let (input, host) = map(
opt(tuple((
tag("@"),
map(
recognize(many1(alt((alphanumeric1, recognize(one_of("-_")))))),
Span::into_fragment,
),
))),
|maybe_tag_host| maybe_tag_host.map(|(_, host)| host),
)(input)?;
Ok((
input,
Token::Mention {
mention_type,
name: name.into(),
host: host.map(|h| h.into()),
},
))
}
/// Parses a `#hashtag`; see `hashtag_chars` for which characters and
/// balanced bracket groups the tag body may contain.
fn tag_hashtag<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    // TODO: Skip when preceded by alphanumerics
    let (input, _) = tag("#")(input)?;
    let (input, text) = map(recognize(many1(hashtag_chars)), Span::into_fragment)(input)?;
    Ok((input, Token::Hashtag(text.into())))
}
}
#[inline]
fn hashtag_chars(input: Span) -> IResult<Span, Span> {
recognize(alt((
recognize(tuple((tag("("), hashtag_chars, tag(")")))),
recognize(tuple((tag("["), hashtag_chars, tag("]")))),
recognize(tuple((tag(""), hashtag_chars, tag("")))),
recognize(tuple((tag(""), hashtag_chars, tag("")))),
recognize(tuple((
not(space1),
not_line_ending,
not(one_of(".,:;!?#?/[]【】()「」()<>")),
anychar,
))),
)))(input)
}
#[inline]
fn protocol(input: Span) -> IResult<Span, Span> {
alt((tag("https://"), tag("http://")))(input)
}
/// Matches URL body characters: a run of ASCII letters or a single one of
/// the listed punctuation/symbol characters.
/// NOTE(review): digits are not matched here, so a URL containing a digit
/// terminates at that digit — confirm whether `alphanumeric1` was intended
/// instead of `alpha1`.
#[inline]
fn url_chars_base(input: Span) -> IResult<Span, Span> {
recognize(alt((alpha1, recognize(one_of(".,_/:%#$&?!~=+-()[]@")))))(input)
}
/// Builds a parser for the body of a URL (after the protocol).
///
/// `terminator` parses the surrounding context's closing delimiter (e.g.
/// `)` or `>`); consumption stops before a terminator that is followed by
/// whitespace/EOF-like input. `spaces` controls whether literal whitespace
/// may appear inside the URL (used by the bracketed forms).
#[inline]
fn url_chars<'a, T: 'a>(
terminator: impl Fn(Span<'a>) -> IResult<Span<'a>, T> + 'a,
spaces: bool,
) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'a {
// A terminator only terminates when followed by whitespace, EOF, an
// opening quote/bracket, or a short trailing word then whitespace/EOF.
let terminating = move |input| {
tuple((
&terminator,
alt((
space1,
line_ending,
eof,
recognize(one_of("([<'\"")),
recognize(tuple((
alt((alpha1, recognize(one_of("*")))),
alt((space1, line_ending, eof)),
))),
)),
))(input)
};
// One URL character: not trailing whitespace, not whitespace before a
// quote, not (optionally space-prefixed) terminating sequence; then a URL
// base character, or whitespace when `spaces` is allowed.
let chars = tuple((
not(tuple((space1, eof))),
not(tuple((space1, tag("\"")))),
not(tuple((opt(space1), terminating))),
alt((url_chars_base, if spaces { space1 } else { fail })),
));
recognize(many1_count(chars))
}
#[cfg(test)]
mod test {
use crate::{url_chars, Context, Span, Token};
use nom::bytes::complete::tag;
use nom::multi::many1;
use std::borrow::Cow;
use std::collections::HashMap;
/// URL termination heuristics: balanced parentheses stay part of the URL;
/// trailing delimiters and whitespace end it.
#[test]
fn parse_url_chars() {
let test1 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security))";
assert_eq!(
"https://en.wikipedia.org/wiki/Sandbox_(computer_security)",
url_chars(tag(")"), true)(Span::new(test1))
.unwrap()
.1
.into_fragment()
);
let test2 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))";
assert_eq!(
"https://en.wikipedia.org/wiki/Sandbox_(computer_security))",
url_chars(tag(")"), true)(Span::new(test2))
.unwrap()
.1
.into_fragment()
);
let test3 = "https://en.wikipedia.org/wiki/(";
assert_eq!(
test3,
url_chars(tag(")"), true)(Span::new(test3))
.unwrap()
.1
.into_fragment()
);
let test4 = "https://cs.wikipedia.org/wiki/Among_Us ";
assert_eq!(
"https://cs.wikipedia.org/wiki/Among_Us",
url_chars(tag(")"), true)(Span::new(test4))
.unwrap()
.1
.into_fragment()
);
let test5 = "https://cs.wikipedia.org/wiki/Among Us )";
assert_eq!(
"https://cs.wikipedia.org/wiki/Among Us",
url_chars(tag(")"), true)(Span::new(test5))
.unwrap()
.1
.into_fragment()
);
}
/// End-to-end parses: nested MFM functions with parameters, the
/// bold-italic variants, interleaved (unbalanced) HTML-style tags
/// degrading to plain text, and nested quote blocks.
#[test]
fn parse_complex() {
let emoji = r#"$[x2 $[sparkle 🥺]💜$[spin.y,speed=5s ❤️]🦊]"#;
assert_eq!(
Token::Function {
name: "x2".into(),
params: HashMap::new(),
inner: Box::new(Token::Sequence(vec![
Token::Function {
name: "sparkle".into(),
params: HashMap::new(),
inner: Box::new(Token::UnicodeEmoji("🥺".into())),
},
Token::UnicodeEmoji("💜".into()),
Token::Function {
name: "spin".into(),
params: {
let mut params = HashMap::new();
params.insert("y".into(), None);
params.insert("speed".into(), Some("5s".into()));
params
},
inner: Box::new(Token::UnicodeEmoji("❤️".into())),
},
Token::UnicodeEmoji("🦊".into()),
]))
},
Context.full(Span::new(emoji)).unwrap().1.merged()
);
let bold_italic = r#"***bold italic***"#;
assert_eq!(
Token::BoldItalic(Box::new(Token::PlainText("bold italic".into()))),
Context.full(Span::new(bold_italic)).unwrap().1.merged()
);
let bold_italic_tag = r#"<b><i>bold italic</i></b>"#;
assert_eq!(
Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText(
"bold italic".into()
))))),
Context.full(Span::new(bold_italic_tag)).unwrap().1.merged()
);
// Mis-nested tags degrade to plain text around the mentions.
assert_eq!(
Token::Sequence(vec![
Token::PlainText("<b>bold ".into()),
Token::Mention {
mention_type: crate::MentionType::User,
name: "tag1".into(),
host: None
},
Token::PlainText(" <i> ".into()),
Token::Mention {
mention_type: crate::MentionType::User,
name: "tag2".into(),
host: None
},
Token::PlainText(" </b>italic</i>".into())
]),
Context
.full(Span::new(r#"<b>bold @tag1 <i> @tag2 </b>italic</i>"#))
.unwrap()
.1
.merged()
);
let quote = r#"
> test
> <i>
> italic
> </i>
>> Nested quote
"#;
assert_eq!(
Token::Quote(Box::new(Token::Sequence(vec![
Token::PlainText("test\n".into()),
Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))),
Token::Quote(Box::new(Token::PlainText("Nested quote".into())))
]))),
Context.full(Span::new(quote)).unwrap().1.merged()
);
}
/// Unicode emoji are split per extended grapheme cluster (note the
/// multi-codepoint ❤️).
#[test]
fn parse_emoji() {
let test = "🥺💜❤️🦊";
let ctx = Context;
let tokens = many1(ctx.partial(Context::unicode_emoji))(Span::from(test)).unwrap();
assert_eq!(
vec!["🥺", "💜", "❤️", "🦊"]
.into_iter()
.map(<&str as Into<Cow<_>>>::into)
.map(Token::UnicodeEmoji)
.collect::<Vec<_>>(),
tokens.1
);
}
}