//! magnetar_mmm_parser — parser for MMM/MFM-style markup (magnetar/magnetar_mmm_parser/src/lib.rs).
//!
//! NOTE(review): this header replaces non-source residue from a web copy
//! ("812 lines / 25 KiB / Raw Blame History" and a Unicode-ambiguity banner).
//! The ambiguous character the banner referred to is the acute accent U+00B4
//! ("´") used as an inline-code delimiter guard in `tag_inline_code`.

use nom::branch::alt;
use nom::bytes::complete::tag;
use nom::character::complete::{
alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of, space1,
tab,
};
use nom::combinator::{eof, fail, map, not, opt, recognize};
use nom::error::ErrorKind;
use nom::multi::{many0, many0_count, many1, many1_count, separated_list1};
use nom::sequence::tuple;
use nom::{IResult, Offset, Slice};
use nom_locate::LocatedSpan;
use std::borrow::Cow;
use std::collections::HashMap;
use unicode_segmentation::UnicodeSegmentation;
/// The sigil class of a mention: a community (`!name`) or a user (`@name`).
#[derive(Copy, Clone, Debug)]
pub enum MentionType {
    Community,
    User,
}

impl MentionType {
    /// Returns the sigil character that introduces this kind of mention.
    pub fn to_char(&self) -> char {
        if matches!(self, MentionType::Community) {
            '!'
        } else {
            '@'
        }
    }
}
/// A node of the parsed markup tree.
///
/// String payloads borrow from the parser input where possible; use
/// `Token::owned` to detach a tree into `'static` data.
#[derive(Clone, Debug)]
pub enum Token<'a> {
/// Literal text without markup.
PlainText(Cow<'a, str>),
/// An ordered run of sibling tokens.
Sequence(Vec<Token<'a>>),
/// A block quote (`> …` lines).
Quote(Box<Token<'a>>),
Small(Box<Token<'a>>),
Big(Box<Token<'a>>),
BoldItalic(Box<Token<'a>>),
Bold(Box<Token<'a>>),
Italic(Box<Token<'a>>),
/// A `<center>…</center>` block.
Center(Box<Token<'a>>),
Strikethrough(Box<Token<'a>>),
/// A `<plain>`-style tag whose contents are not parsed further.
PlainTag(Cow<'a, str>),
/// Inline code span (backtick-delimited).
InlineCode(Cow<'a, str>),
/// Inline math span (`\( … \)`).
InlineMath(Cow<'a, str>),
/// A bare URL that may produce an embed preview.
UrlRaw(Cow<'a, str>),
/// A `<url>`-style URL that must not produce an embed preview.
UrlNoEmbed(Cow<'a, str>),
/// A Markdown-style `[label](href)` link; `embed` is false for `?[…](…)`.
Link {
label: Cow<'a, str>,
href: Cow<'a, str>,
embed: bool,
},
/// A fenced code block with an optional language tag.
BlockCode {
lang: Option<Cow<'a, str>>,
inner: Cow<'a, str>,
},
/// A display-math block (`\[ … \]`).
BlockMath(Cow<'a, str>),
/// An MFM function: `$[name.k=v,k2 inner]`.
Function {
name: Cow<'a, str>,
params: HashMap<Cow<'a, str>, Option<Cow<'a, str>>>,
inner: Box<Token<'a>>,
},
/// A user or community mention, with an optional remote host part.
Mention {
name: Cow<'a, str>,
host: Option<Cow<'a, str>>,
mention_type: MentionType,
},
/// A single emoji grapheme cluster.
UnicodeEmoji(Cow<'a, str>),
/// A `:shortcode:` emoji name.
ShortcodeEmoji(Cow<'a, str>),
}
impl Token<'_> {
fn owned(&self) -> Token<'static> {
match self {
Token::PlainText(text) => Token::PlainText(Cow::Owned(text.clone().into_owned())),
Token::Sequence(tokens) => Token::Sequence(tokens.iter().map(Token::owned).collect()),
Token::Quote(inner) => Token::Quote(Box::new(inner.owned())),
Token::Small(inner) => Token::Small(Box::new(inner.owned())),
Token::Big(inner) => Token::Big(Box::new(inner.owned())),
Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.owned())),
Token::Bold(inner) => Token::Bold(Box::new(inner.owned())),
Token::Italic(inner) => Token::Italic(Box::new(inner.owned())),
Token::Center(inner) => Token::Center(Box::new(inner.owned())),
Token::Strikethrough(inner) => Token::Strikethrough(Box::new(inner.owned())),
Token::PlainTag(tag) => Token::PlainTag(Cow::Owned(tag.clone().into_owned())),
Token::InlineCode(code) => Token::InlineCode(Cow::Owned(code.clone().into_owned())),
Token::InlineMath(math) => Token::InlineMath(Cow::Owned(math.clone().into_owned())),
Token::UrlRaw(url) => Token::UrlRaw(Cow::Owned(url.clone().into_owned())),
Token::UrlNoEmbed(url) => Token::UrlNoEmbed(Cow::Owned(url.clone().into_owned())),
Token::Link { embed, label, href } => Token::Link {
embed: *embed,
label: Cow::Owned(label.clone().into_owned()),
href: Cow::Owned(href.clone().into_owned()),
},
Token::BlockCode { inner, lang } => Token::BlockCode {
lang: lang.as_ref().map(|l| Cow::Owned(l.clone().into_owned())),
inner: Cow::Owned(inner.clone().into_owned()),
},
Token::BlockMath(math) => Token::BlockMath(Cow::Owned(math.clone().into_owned())),
Token::Function {
name,
params,
inner,
} => Token::Function {
name: Cow::Owned(name.clone().into_owned()),
params: params
.iter()
.map(|(k, v)| {
(
Cow::Owned(k.clone().into_owned()),
v.as_ref().map(|val| Cow::Owned(val.clone().into_owned())),
)
})
.collect(),
inner: Box::new(inner.owned()),
},
Token::Mention {
name,
host,
mention_type,
} => Token::Mention {
name: Cow::Owned(name.clone().into_owned()),
host: host.as_ref().map(|v| Cow::Owned(v.clone().into_owned())),
mention_type: *mention_type,
},
Token::UnicodeEmoji(code) => Token::UnicodeEmoji(Cow::Owned(code.clone().into_owned())),
Token::ShortcodeEmoji(shortcode) => {
Token::ShortcodeEmoji(Cow::Owned(shortcode.clone().into_owned()))
}
}
}
}
// Parser input/remainder type: a &str carrying byte-offset, line and column info.
type Span<'a> = LocatedSpan<&'a str>;
/// Helpers for taking the part of a span that precedes another span
/// derived from it.
trait SliceOffset {
/// The prefix of `self` ending where `other` begins.
fn up_to(&self, other: &Self) -> Self;
/// Same as `up_to`, but as a plain string fragment.
fn fragment_between<'a>(&self, other: &Self) -> &'a str
where
Self: 'a;
}
impl SliceOffset for Span<'_> {
fn up_to(&self, other: &Self) -> Self {
// `offset` gives the byte distance from `self` to `other`; `other` must
// point into the same original input or this is meaningless.
self.slice(..self.offset(other))
}
fn fragment_between<'a>(&self, other: &Self) -> &'a str
where
Self: 'a,
{
self.up_to(other).into_fragment()
}
}
/// Adapts a boxed-token constructor (e.g. `Token::Bold`) so it can be fed a
/// plain vector of tokens, which gets wrapped in a boxed `Token::Sequence`.
const fn boxing_sequence<'a>(
    func: impl Fn(Box<Token<'a>>) -> Token<'a>,
) -> impl Fn(Vec<Token<'a>>) -> Token<'a> {
    move |tokens| {
        let seq = Token::Sequence(tokens);
        func(Box::new(seq))
    }
}
/// Adapts a `Cow<str>`-taking token constructor (e.g. `Token::InlineCode`)
/// so it can be fed the vector of chars produced by a char-level parser.
const fn collect_char_sequence<'a>(
    func: impl Fn(Cow<'a, str>) -> Token<'a>,
) -> impl Fn(Vec<char>) -> Token<'a> {
    move |chars| {
        let collected: String = chars.into_iter().collect();
        func(Cow::Owned(collected))
    }
}
/// Runs `func` over the concatenation of `segments` (e.g. the text of the
/// individual `> ` quote lines), then maps the remainder back onto `parent`,
/// the original un-spliced input. The parsed token is made `'static` because
/// it borrows from the temporary concatenated string.
fn spliced<'a>(
segments: &[Span<'a>],
func: impl Fn(Span) -> IResult<Span, Token>,
output_mapper: impl Fn(Box<Token<'static>>) -> Token<'static>,
parent: Span<'a>,
) -> IResult<Span<'a>, Token<'static>, nom::error::Error<Span<'a>>> {
// The segments joined into one contiguous buffer that `func` parses.
let combined = segments
.iter()
.copied()
.map(Span::into_fragment)
.collect::<String>();
// cum_offset_combined[i] = end offset (exclusive) of segment i inside `combined`.
let cum_offset_combined = segments
.iter()
.scan(0, |acc, &x| {
*acc += x.len();
Some(*acc)
})
.collect::<Vec<_>>();
// Maps a position in `combined` back to (original segment, cumulative offset).
// NOTE(review): `filter(o >= offset).last()` always selects the FINAL segment
// (cumulative offsets are ascending), and the `offset_new - offset_seg_new`
// subtractions below can then underflow usize; a first-match (`.next()` /
// `find`) and the segment's START offset look like what was intended — confirm.
let current_seg = |input: Span| {
cum_offset_combined
.iter()
.enumerate()
.filter(|(_, &o)| o >= input.location_offset())
.map(|(i, o)| (segments[i], o))
.last()
};
type NE<E> = nom::Err<E>;
type NomError<'x> = nom::error::Error<Span<'x>>;
let quote_span = Span::new(&combined);
let (input, inner) = match func(quote_span) {
// `.owned()` detaches the token from `combined`, which dies with this frame.
Ok((input, token)) => (input, token.owned()),
Err(e) => {
// Translate error positions from `combined` coordinates back into
// `parent` coordinates so callers see a sensible location.
return match e {
NE::Error(e) => {
let offset_new = e.input.location_offset();
if let Some((seg_parent, offset_seg_new)) = current_seg(e.input) {
let offset = offset_new - offset_seg_new;
let offset_orig = offset + seg_parent.location_offset();
Err(NE::Error(NomError::new(
Span::new(&parent.into_fragment()[offset_orig..]),
e.code,
)))
} else {
// ???
Err(NE::Failure(NomError::new(parent, ErrorKind::Fail)))
}
}
// A hard failure inside the splice is downgraded to a soft error
// positioned at the start of the parent input.
NE::Failure(e) => Err(NE::Error(NomError::new(parent, e.code))),
NE::Incomplete(i) => Err(NE::Incomplete(i)),
};
}
};
// Map the post-parse remainder position back onto `parent` as well.
let out = if let Some((seg_parent, offset_seg_new)) = current_seg(input) {
let offset = input.location_offset() - offset_seg_new;
let offset_orig = offset + seg_parent.location_offset();
parent.slice(offset_orig..)
} else {
parent
};
Ok((out, output_mapper(Box::new(inner.owned()))))
}
/// Parses a single ASCII space, ideographic space (U+3000), or tab, and
/// yields it unchanged as a plain-text token.
fn space(input: Span) -> IResult<Span, Token> {
    let any_space = alt((one_char('\u{0020}'), one_char('\u{3000}'), tab));
    let (rest, frag) = recognize(any_space)(input)?;
    Ok((rest, Token::PlainText(frag.into_fragment().into())))
}
// Parser context (currently stateless); its methods implement the grammar.
struct Context;
impl Context {
/// Adapts a `(&self, Span) -> IResult` method into a standalone closure so
/// it can be handed to nom combinators such as `alt`.
#[inline]
const fn partial<'a>(
&self,
func: impl Fn(&Self, Span<'a>) -> IResult<Span<'a>, Token<'a>> + 'static,
) -> impl Fn(Span<'a>) -> IResult<Span<'a>, Token<'a>> + '_ {
move |input| func(self, input)
}
/// Top-level (block) parser entry point. Currently only quotes are wired up.
fn root<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    alt((self.partial(Self::tag_quote),))(input)
}
/// Inline-level parser entry point; falls back to single-char plain text.
fn inline<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    alt((self.partial(Self::tag_small), self.partial(Self::text)))(input)
}
/// Inline-level parser for contexts where links must not nest.
/// NOTE(review): currently identical to `inline`; presumably the link
/// parsers will be added only to `inline` — confirm once they are wired up.
fn inline_no_link<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    alt((self.partial(Self::tag_small), self.partial(Self::text)))(input)
}
/// Parses a block quote: one or more consecutive lines starting with `>`.
fn tag_quote<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
// A quote is either preceded by up to two line endings or starts at column 0.
let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?;
if let (None, None) = leading_spaces {
if input.get_column() != 0 {
return fail(input);
}
}
// One quote line: `>`, optional single space, then the rest of the line.
let quote_line = |input| tuple((tag(">"), opt(space), not_line_ending))(input);
let orig_input = input;
let (input, lines) = separated_list1(line_ending, quote_line)(input)?;
// Keep only the text after the `>` marker of each line.
let quote_lines = lines
.into_iter()
.map(|(_, _, text)| text)
.collect::<Vec<_>>();
// Reject a quote consisting of exactly one empty line (a bare `>`).
if quote_lines.len() == 1
&& quote_lines
.iter()
.map(Span::fragment)
.copied()
.any(&str::is_empty)
{
return fail(input);
}
// Re-parse the joined quote body.
// NOTE(review): the inner parser passed here is `space`, which only matches
// a single whitespace char — looks like a placeholder for `self.root` /
// `self.inline`; confirm. Also, `spliced` already wraps its result via
// `Token::Quote`, and the result is wrapped in `Token::Quote` AGAIN below,
// producing a doubly-nested quote — confirm intent.
let (_, inner) = spliced(&quote_lines, space, Token::Quote, orig_input)?;
let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?;
Ok((input, Token::Quote(Box::new(inner))))
}
/// Parses a `<center>…</center>` block whose opening tag starts a line.
fn tag_block_center<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let tag_start = &tag("<center>");
let tag_end = &tag("</center>");
let (input, _) = opt(line_ending)(input)?;
// The opening tag must sit at the start of a line.
if input.get_column() != 0 {
return fail(input);
}
let (input, _) = tag_start(input)?;
let (input, _) = opt(line_ending)(input)?;
// Collect inline tokens until the closing tag (optionally newline-preceded).
let (input, center_seq) = many0(tuple((
not(tuple((opt(line_ending), tag_end))),
self.partial(Self::inline),
)))(input)?;
let (input, _) = opt(line_ending)(input)?;
let (input, _) = tag_end(input)?;
// Only trailing whitespace may follow the closing tag on its line.
let (input, _) = many0(space)(input)?;
// NOTE(review): `not(not_line_ending)` — `not_line_ending` also matches the
// empty string, so it always succeeds and this `not(...)` can never succeed;
// this parser therefore appears to always fail here. Likely intended to
// assert end-of-line differently — confirm.
let (input, _) = not(not_line_ending)(input)?;
let (input, _) = opt(line_ending)(input)?;
let tokens = center_seq.into_iter().map(|(_, v)| v).collect::<Vec<_>>();
Ok((input, boxing_sequence(Token::Center)(tokens)))
}
/// Parses a display-math block: `\[` … `\]`, each delimiter on its own line.
fn tag_block_math<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let start = &tag("\\[");
let end = &tag("\\]");
let (input, _) = opt(line_ending)(input)?;
// The opening delimiter must sit at the start of a line.
if input.get_column() != 0 {
return fail(input);
}
let (input, _) = start(input)?;
let (input, _) = opt(line_ending)(input)?;
// Everything up to the closing delimiter (which may be newline-preceded)
// is captured verbatim as the math source.
let (input, math_span) = recognize(many1_count(tuple((
not(tuple((opt(line_ending), end))),
not_line_ending,
))))(input)?;
let (input, _) = opt(line_ending)(input)?;
let (input, _) = end(input)?;
let (input, _) = many0(space)(input)?;
// NOTE(review): `not(not_line_ending)` always fails (the inner parser also
// matches the empty string) — same issue as in `tag_block_center`; confirm.
let (input, _) = not(not_line_ending)(input)?;
let (input, _) = opt(line_ending)(input)?;
Ok((
input,
Token::BlockMath(Cow::Borrowed(math_span.into_fragment())),
))
}
/// Builds a parser for a `start`…`end` delimited construct.
///
/// * `escape` — when true, `\` immediately before the opening tag makes the
///   tag parse as plain text (the backslash itself is dropped).
/// * `matcher_inner` — parses one unit of content; it only runs while the
///   closing tag does not match at the current position.
/// * `mapper` — combines the collected units into the final token.
///
/// If the opening tag matches but no closing tag is ever found, the opening
/// tag alone is yielded as plain text and parsing resumes right after it.
const fn tag_delimited<'a, 'b: 'a, T>(
&'a self,
start: &'b str,
end: &'b str,
escape: bool,
matcher_inner: impl Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a,
mapper: impl Fn(Vec<T>) -> Token<'b> + 'a,
) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token<'b>> + '_ {
move |input| {
let opening_tag = &tag(start);
let closing_tag = &tag(end);
if escape {
// `\<start>` → the start marker as literal text.
if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), opening_tag))(input) {
return Ok((input_escaped, Token::PlainText(Cow::Borrowed(&mark))));
}
}
let begin = input;
let (post_open, _) = opening_tag(input)?;
// At least one inner unit, then the closing tag.
let res = tuple((
many1(tuple((not(closing_tag), &matcher_inner))),
closing_tag,
))(post_open);
// Unterminated: degrade gracefully to plain text of the opening tag only.
if let Err(nom::Err::Error(nom::error::Error { .. })) = res {
return Ok((
post_open,
Token::PlainText(begin.fragment_between(&post_open).into()),
));
}
let (input, (inner, _)) = res?;
let inner = inner.into_iter().map(|(_, t)| t).collect::<Vec<_>>();
Ok((input, mapper(inner)))
}
}
/// Parses an MFM function: `$[name.param=value,param2 inner]`.
fn tag_func<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let (input, _) = tag("$[")(input)?;
// Identifier: starts with a letter or `_`, continues alphanumeric or `_`.
let func_ident = |input| {
recognize(tuple((
many1_count(alt((alpha1, tag("_")))),
many0_count(alt((alphanumeric1, tag("_")))),
)))(input)
};
// Parameter values may contain alphanumerics plus `.`, `-`, `_`.
let param_value = recognize(many1_count(alt((
alphanumeric1,
tag("."),
tag("-"),
tag("_"),
))));
let (input, func_name_span) = func_ident(input)?;
let func_name = func_name_span.into_fragment();
// `key` or `key=value` pairs, introduced by `.` and separated by `,`.
let arg = tuple((func_ident, opt(tuple((tag("="), param_value)))));
let (input, args) =
opt(tuple((one_char('.'), separated_list1(one_char(','), arg))))(input)?;
let args_out = args.map_or_else(HashMap::new, |(_, items)| {
items
.into_iter()
.map(|(k, v)| {
(
Cow::from(k.into_fragment()),
v.map(|(_, val)| Cow::from(val.into_fragment())),
)
})
.collect::<HashMap<_, _>>()
});
// NOTE(review): no whitespace is consumed between the name/args and the
// body, and `inline` yields exactly ONE token, so multi-token bodies will
// not reach the closing `]` — this looks like work in progress; confirm.
let (input, inner) = self.partial(Self::inline)(input)?;
let (input, _) = tag("]")(input)?;
Ok((
input,
Token::Function {
name: Cow::from(func_name),
params: args_out,
inner: Box::new(inner),
},
))
}
/// `<small>…</small>` → [`Token::Small`].
fn tag_small<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let contents = self.partial(Self::inline);
    let wrap = boxing_sequence(Token::Small);
    self.tag_delimited("<small>", "</small>", false, contents, wrap)(input)
}
// TODO: CommonMark flanking rules
/// `***bold italic***` → [`Token::BoldItalic`] (escapable with `\`).
fn tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let contents = self.partial(Self::inline);
    let wrap = boxing_sequence(Token::BoldItalic);
    self.tag_delimited("***", "***", true, contents, wrap)(input)
}
// TODO: CommonMark flanking rules
fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
"___",
"___",
true,
self.partial(Self::inline),
boxing_sequence(Token::BoldItalic),
)(input)
}
/// `<b>…</b>` → [`Token::Bold`].
fn tag_bold<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let contents = self.partial(Self::inline);
    let wrap = boxing_sequence(Token::Bold);
    self.tag_delimited("<b>", "</b>", false, contents, wrap)(input)
}
// TODO: CommonMark flanking rules
/// `**bold**` → [`Token::Bold`] (escapable with `\`).
fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let contents = self.partial(Self::inline);
    let wrap = boxing_sequence(Token::Bold);
    self.tag_delimited("**", "**", true, contents, wrap)(input)
}
// TODO: CommonMark flanking rules
/// `__bold__` → [`Token::Bold`] (escapable with `\`).
fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let contents = self.partial(Self::inline);
    let wrap = boxing_sequence(Token::Bold);
    self.tag_delimited("__", "__", true, contents, wrap)(input)
}
/// `<i>…</i>` → [`Token::Italic`].
fn tag_italic<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let contents = self.partial(Self::inline);
    let wrap = boxing_sequence(Token::Italic);
    self.tag_delimited("<i>", "</i>", false, contents, wrap)(input)
}
// TODO: CommonMark flanking rules
/// `*italic*` → [`Token::Italic`] (escapable with `\`).
fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let contents = self.partial(Self::inline);
    let wrap = boxing_sequence(Token::Italic);
    self.tag_delimited("*", "*", true, contents, wrap)(input)
}
// TODO: CommonMark flanking rules
/// `_italic_` → [`Token::Italic`] (escapable with `\`).
fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let contents = self.partial(Self::inline);
    let wrap = boxing_sequence(Token::Italic);
    self.tag_delimited("_", "_", true, contents, wrap)(input)
}
/// `<s>…</s>` → [`Token::Strikethrough`].
fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let contents = self.partial(Self::inline);
    let wrap = boxing_sequence(Token::Strikethrough);
    self.tag_delimited("<s>", "</s>", false, contents, wrap)(input)
}
// TODO: CommonMark flanking rules
fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
"~~",
"~~",
true,
move |input| {
tuple((not_line_ending, self.partial(Self::inline)))(input).map(|(i, t)| (i, t.1))
},
boxing_sequence(Token::Strikethrough),
)(input)
}
/// Inline code: `` `code` `` → [`Token::InlineCode`]; a backtick, an acute
/// accent (U+00B4), or a line ending terminates the span.
fn tag_inline_code<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    self.tag_delimited(
        "`",
        // Fix: the closing delimiter was "" (the empty string). `tag("")`
        // matches immediately, so `not(closing_tag)` always failed and the
        // inner matcher never ran — inline code could never parse. The span
        // is closed by a backtick, like it is opened.
        "`",
        true,
        move |input| {
            tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar))(input)
                .map(|(i, (_skip, c))| (i, c))
        },
        collect_char_sequence(Token::InlineCode),
    )(input)
}
fn tag_inline_math<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
"\\(",
"\\)",
false,
move |input| tuple((not_line_ending, anychar))(input).map(|(i, (_skip, c))| (i, c)),
collect_char_sequence(Token::InlineMath),
)(input)
}
fn text<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let before = input;
let (input, _) = anychar(input)?;
Ok((
input,
Token::PlainText(before.fragment_between(&input).into()),
))
}
/// Parses a bare `http(s)://` URL, trimming at most one trailing `.`, `,`
/// or `?` that most likely belongs to the surrounding sentence.
fn url<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let terminator = |input| not(url_chars_base)(input);
    let (input, url_span) = recognize(tuple((protocol, url_chars(terminator, false))))(input)?;
    let url = url_span.into_fragment();
    // Strip punctuation at the end of sentences that might have been consumed as a part of the URL
    let final_url = match url.as_bytes().last() {
        Some(b'.' | b',' | b'?') => &url[..url.len() - 1],
        _ => url,
    };
    Ok((input, Token::UrlRaw(Cow::from(final_url))))
}
/// Parses `<https://…>`: a URL that must not produce an embed preview.
fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    let (input, _) = tag("<")(input)?;
    let (input, url_span) = recognize(tuple((protocol, url_chars(tag(">"), true))))(input)?;
    let (input, _) = tag(">")(input)?;
    // Fix: this returned `Token::UrlRaw`, which both lost the no-embed
    // semantics and left the `Token::UrlNoEmbed` variant unconstructed.
    Ok((input, Token::UrlNoEmbed(Cow::from(url_span.into_fragment()))))
}
fn link<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let (input, no_embed) = opt(tag("?"))(input)?;
let (input, _) = tag("[")(input)?;
let (input, _) = not(tag("["))(input)?;
let (input, label_span) =
recognize(many1(tuple((not(tag("](")), not_line_ending))))(input)?;
let (input, _) = tag("]")(input)?;
let (input, _) = tag("(")(input)?;
let (input, url_span) = recognize(tuple((protocol, url_chars(tag("]"), true))))(input)?;
let (input, _) = tag(")")(input)?;
Ok((
input,
Token::Link {
label: label_span.into_fragment().into(),
href: url_span.into_fragment().into(),
embed: no_embed.is_none(),
},
))
}
/// Consumes the first grapheme cluster if (and only if) it is a known
/// Unicode emoji.
fn unicode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    match input.fragment().graphemes(true).next() {
        Some(grapheme) if emojis::get(grapheme).is_some() => Ok((
            input.slice(grapheme.len()..),
            Token::UnicodeEmoji(grapheme.into()),
        )),
        _ => fail(input),
    }
}
/// Parses a user (`@name`) or community (`!name`) mention with an optional
/// `@host` suffix, e.g. `@user@example.com`.
fn mention<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    // TODO: Escaping and skip when preceded by alphanumerics
    // Shared (previously duplicated) name parser: alphanumerics plus `-`/`_`.
    let name_chars =
        |input| recognize(many1(alt((alphanumeric1, recognize(one_of("-_"))))))(input);
    let (input, mention_type) = map(one_of("@!"), |c| match c {
        '@' => MentionType::User,
        '!' => MentionType::Community,
        _ => unreachable!(),
    })(input)?;
    let (input, name) = map(name_chars, Span::into_fragment)(input)?;
    // Fix: the host part also needs `.` so dotted domains (`example.com`)
    // parse; previously the host stopped at the first dot.
    let host_chars =
        |input| recognize(many1(alt((alphanumeric1, recognize(one_of("-_."))))))(input);
    let (input, host) = map(
        opt(tuple((tag("@"), map(host_chars, Span::into_fragment)))),
        |maybe_tag_host| maybe_tag_host.map(|(_, host)| host),
    )(input)?;
    Ok((
        input,
        Token::Mention {
            mention_type,
            name: name.into(),
            host: host.map(|h| h.into()),
        },
    ))
}
}
/// Matches the URL scheme prefix: `https://` or `http://`.
#[inline]
fn protocol(input: Span) -> IResult<Span, Span> {
    let https = tag("https://");
    let http = tag("http://");
    alt((https, http))(input)
}
/// Matches one URL-safe unit: a run of ASCII letters, or a single one of the
/// listed punctuation characters.
/// NOTE(review): digits are not matched here (`alpha1`, not `alphanumeric1`),
/// so URLs containing digits terminate early — confirm whether intentional.
#[inline]
fn url_chars_base(input: Span) -> IResult<Span, Span> {
recognize(alt((alpha1, recognize(one_of(".,_/:%#$&?!~=+-()[]@")))))(input)
}
/// Builds a parser for the body of a URL.
///
/// * `terminator` — a parser whose match (followed by whitespace/EOF/etc.)
///   marks the end of the URL, e.g. the closing bracket of a link.
/// * `spaces` — when true, spaces may appear inside the URL (used for
///   bracketed URLs where the closing delimiter is unambiguous).
#[inline]
fn url_chars<'a, T: 'a>(
terminator: impl Fn(Span<'a>) -> IResult<Span<'a>, T> + 'a,
spaces: bool,
) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'a {
// "We are at the end of the URL": the terminator followed by whitespace,
// EOF, an opening quote/bracket, or a short word then whitespace/EOF.
let terminating = move |input| {
tuple((
&terminator,
alt((
space1,
line_ending,
eof,
recognize(one_of("([<'\"")),
recognize(tuple((
alt((alpha1, recognize(one_of("*")))),
alt((space1, line_ending, eof)),
))),
)),
))(input)
};
// One URL character: not trailing whitespace, not whitespace before a
// quote, not the terminating sequence; then a URL-safe char (or a space,
// when `spaces` is enabled).
let chars = tuple((
not(tuple((space1, eof))),
not(tuple((space1, tag("\"")))),
not(tuple((opt(space1), terminating))),
alt((url_chars_base, if spaces { space1 } else { fail })),
));
recognize(many1_count(chars))
}
#[cfg(test)]
mod test {
use crate::{url_chars, Context, Span};
use nom::bytes::complete::tag;
use nom::multi::many1;
#[test]
fn parse_url_chars() {
// A balanced "(...)" inside the path is kept; the final unbalanced ")"
// is treated as the terminator and left unconsumed.
let test1 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security))";
assert_eq!(
"https://en.wikipedia.org/wiki/Sandbox_(computer_security)",
url_chars(tag(")"), true)(Span::new(test1))
.unwrap()
.1
.into_fragment()
);
// With two trailing parens, only the last one terminates.
let test2 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))";
assert_eq!(
"https://en.wikipedia.org/wiki/Sandbox_(computer_security))",
url_chars(tag(")"), true)(Span::new(test2))
.unwrap()
.1
.into_fragment()
);
// A lone "(" at the very end of input is consumed as part of the URL.
let test3 = "https://en.wikipedia.org/wiki/(";
assert_eq!(
test3,
url_chars(tag(")"), true)(Span::new(test3))
.unwrap()
.1
.into_fragment()
);
// Trailing whitespace is never part of the URL.
let test4 = "https://cs.wikipedia.org/wiki/Among_Us ";
assert_eq!(
"https://cs.wikipedia.org/wiki/Among_Us",
url_chars(tag(")"), true)(Span::new(test4))
.unwrap()
.1
.into_fragment()
);
// With spaces allowed, the URL runs up to the space before the terminator.
let test5 = "https://cs.wikipedia.org/wiki/Among Us )";
assert_eq!(
"https://cs.wikipedia.org/wiki/Among Us",
url_chars(tag(")"), true)(Span::new(test5))
.unwrap()
.1
.into_fragment()
);
}
#[test]
fn parse_emoji() {
// Smoke test: each grapheme in the string is recognized as an emoji.
let test = "🥺💜❤️🦊";
let ctx = Context;
let tokens = many1(ctx.partial(Context::unicode_emoji))(Span::from(test)).unwrap();
println!("{:#?}", tokens.1)
}
}