magnetar/magnetar_mmm_parser/src/lib.rs

406 lines
13 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use nom::branch::alt;
use nom::bytes::complete::tag;
use nom::character::complete;
use nom::character::complete::{anychar, line_ending, not_line_ending, tab};
use nom::combinator::{fail, not, opt};
use nom::error::ErrorKind;
use nom::multi::{many1, separated_list1};
use nom::sequence::tuple;
use nom::{IResult, Offset, Slice};
use nom_locate::LocatedSpan;
use std::borrow::Cow;
/// A node of the parsed markup tree.
///
/// Leaf variants carry their text as a [`Cow`] so they can either borrow from the
/// parser input or own their contents (see `Token::owned`). Container variants box
/// a single child, which is typically a [`Token::Sequence`].
#[derive(Clone, Debug, Eq, PartialEq)]
enum Token<'a> {
    /// Literal text with no markup meaning.
    PlainText(Cow<'a, str>),
    /// An ordered run of sibling tokens.
    Sequence(Vec<Token<'a>>),
    /// A block quote (lines introduced by `>`).
    Quote(Box<Token<'a>>),
    /// Content wrapped in `<small>` … `</small>`.
    Small(Box<Token<'a>>),
    /// "Big" content. No parser in this file produces it yet.
    Big(Box<Token<'a>>),
    /// Content that is both bold and italic (`***` / `___`).
    BoldItalic(Box<Token<'a>>),
    /// Bold content (`<b>`, `**`, `__`).
    Bold(Box<Token<'a>>),
    /// Italic content (`<i>`, `*`, `_`).
    Italic(Box<Token<'a>>),
    /// Centered content. No parser in this file produces it yet.
    Center(Box<Token<'a>>),
    /// Struck-through content (`<s>`, `~~`).
    Strikethrough(Box<Token<'a>>),
    /// A tag kept as plain text. No parser in this file produces it yet.
    PlainTag(Cow<'a, str>),
    /// Inline code delimited by backticks.
    InlineCode(Cow<'a, str>),
    /// Inline math delimited by `\(` … `\)`.
    InlineMath(Cow<'a, str>),
}
impl Token<'_> {
    /// Returns a deep copy of this token tree that owns all of its text.
    ///
    /// Every string payload is copied into a fresh `String` and every child is
    /// converted recursively, so the result has a `'static` lifetime and no longer
    /// borrows from the parser input.
    fn owned(&self) -> Token<'static> {
        /// Copies borrowed or owned text into a new owned `Cow`.
        fn detach(text: &str) -> Cow<'static, str> {
            Cow::Owned(text.to_owned())
        }

        /// Recursively converts a boxed child and re-boxes the result.
        fn detach_boxed(child: &Token<'_>) -> Box<Token<'static>> {
            Box::new(child.owned())
        }

        match self {
            Token::PlainText(text) => Token::PlainText(detach(text)),
            Token::Sequence(children) => {
                Token::Sequence(children.iter().map(|child| child.owned()).collect())
            }
            Token::Quote(child) => Token::Quote(detach_boxed(child)),
            Token::Small(child) => Token::Small(detach_boxed(child)),
            Token::Big(child) => Token::Big(detach_boxed(child)),
            Token::BoldItalic(child) => Token::BoldItalic(detach_boxed(child)),
            Token::Bold(child) => Token::Bold(detach_boxed(child)),
            Token::Italic(child) => Token::Italic(detach_boxed(child)),
            Token::Center(child) => Token::Center(detach_boxed(child)),
            Token::Strikethrough(child) => Token::Strikethrough(detach_boxed(child)),
            Token::PlainTag(name) => Token::PlainTag(detach(name)),
            Token::InlineCode(code) => Token::InlineCode(detach(code)),
            Token::InlineMath(math) => Token::InlineMath(detach(math)),
        }
    }
}
/// Input type shared by the parsers in this file: a string slice wrapped in
/// nom_locate's `LocatedSpan`, which the code below queries for byte offsets
/// (`location_offset`) and columns (`get_column`).
type Span<'a> = LocatedSpan<&'a str>;
/// Helpers for recovering the stretch of input that lies between two positions.
///
/// `self` is the earlier position and `other` a later position in the same input
/// (typically the input before and after running a parser).
trait SliceOffset {
/// Returns the prefix of `self` that ends where `other` begins.
fn up_to(&self, other: &Self) -> Self;
/// Returns the same prefix as `up_to`, but as a bare string slice.
fn fragment_between<'a>(&self, other: &Self) -> &'a str
where
Self: 'a;
}
impl SliceOffset for Span<'_> {
    /// Cuts `self` off at the position where `other` starts.
    fn up_to(&self, other: &Self) -> Self {
        let consumed = self.offset(other);
        self.slice(..consumed)
    }

    /// Returns the text between the start of `self` and the start of `other`.
    fn fragment_between<'a>(&self, other: &Self) -> &'a str
    where
        Self: 'a,
    {
        let between = self.up_to(other);
        between.into_fragment()
    }
}
/// Adapts a single-child token constructor (e.g. `Token::Bold`) into a function
/// that accepts a list of tokens.
///
/// The returned closure wraps the list in a `Token::Sequence`, boxes it and hands
/// it to `func`.
const fn boxing_sequence<'a>(
    func: impl Fn(Box<Token<'a>>) -> Token<'a>,
) -> impl Fn(Vec<Token<'a>>) -> Token<'a> {
    move |children| {
        let sequence = Token::Sequence(children);
        func(Box::new(sequence))
    }
}
/// Adapts a text-carrying token constructor (e.g. `Token::InlineCode`) into a
/// function that accepts the individual characters matched by a parser.
///
/// The returned closure concatenates the characters into an owned `String` and
/// passes it to `func`.
const fn collect_char_sequence<'a>(
    func: impl Fn(Cow<'a, str>) -> Token<'a>,
) -> impl Fn(Vec<char>) -> Token<'a> {
    move |chars| {
        let text: String = chars.into_iter().collect();
        func(Cow::Owned(text))
    }
}
/// Runs `func` over the concatenation of `segments` and maps the outcome back
/// onto `parent`.
///
/// `segments` are non-contiguous pieces of `parent` (for example the text of each
/// quoted line with its `>` prefix removed). Their fragments are joined, without
/// any separator, into one temporary string that `func` parses. Because the
/// temporary string dies at the end of this function, the resulting token is
/// converted with `Token::owned`.
///
/// # Parameters
/// * `segments` – pieces of `parent`, in order. Each must start at or after the
///   start of `parent`.
/// * `func` – the parser to run over the joined text.
/// * `parent` – the span that the remaining input and error positions are expressed in.
///
/// # Returns
/// On success, the token produced by `func` (returned as-is; wrapping it is the
/// caller's job) together with `parent` advanced to the position corresponding
/// to where `func` stopped. If `segments` is empty, `parent` is returned unchanged.
///
/// # Errors
/// * A recoverable error from `func` is re-anchored at the matching position in
///   `parent`. If no position can be computed (no segments), a `Failure` anchored
///   at `parent` is returned instead.
/// * A `Failure` from `func` is reported as a recoverable `Error` at `parent`.
///   NOTE(review): this downgrade is kept from the original; confirm it is intended.
/// * `Incomplete` is passed through unchanged.
fn spliced<'a>(
    segments: &[Span<'a>],
    func: impl Fn(Span) -> IResult<Span, Token>,
    parent: Span<'a>,
) -> IResult<Span<'a>, Token<'static>, nom::error::Error<Span<'a>>> {
    type NE<E> = nom::Err<E>;
    type NomError<'x> = nom::error::Error<Span<'x>>;

    let combined = segments
        .iter()
        .copied()
        .map(Span::into_fragment)
        .collect::<String>();

    // For every segment: (segment, start offset, end offset) inside `combined`.
    let bounds = segments
        .iter()
        .scan(0usize, |end, seg| {
            let start = *end;
            *end += seg.len();
            Some((*seg, start, *end))
        })
        .collect::<Vec<_>>();

    // Translates a byte offset inside `combined` into an offset relative to the
    // start of `parent`. The owning segment is the FIRST one whose end is at or
    // past the position; `start <= pos` then holds, so the subtraction is safe.
    let to_parent_offset = |pos: usize| {
        bounds
            .iter()
            .find(|&&(_, _, end)| end >= pos)
            .map(|&(seg, start, _)| parent.offset(&seg) + (pos - start))
    };

    let spliced_input = Span::new(&combined);
    let (rest, inner) = match func(spliced_input) {
        Ok((rest, token)) => (rest, token.owned()),
        Err(NE::Error(e)) => {
            return Err(match to_parent_offset(e.input.location_offset()) {
                // Slicing `parent` (rather than building a new span) keeps the
                // line/offset information of the reported position intact.
                Some(offset) => NE::Error(NomError::new(parent.slice(offset..), e.code)),
                None => NE::Failure(NomError::new(parent, ErrorKind::Fail)),
            });
        }
        Err(NE::Failure(e)) => return Err(NE::Error(NomError::new(parent, e.code))),
        Err(NE::Incomplete(needed)) => return Err(NE::Incomplete(needed)),
    };

    let out = to_parent_offset(rest.location_offset())
        .map_or(parent, |offset| parent.slice(offset..));
    Ok((out, inner))
}
/// Parses exactly one blank character: an ASCII space (U+0020), an ideographic
/// space (U+3000) or a tab, and returns it as `Token::PlainText`.
fn space(input: Span) -> IResult<Span, Token> {
    alt((complete::char('\u{0020}'), complete::char('\u{3000}'), tab))(input).map(
        |(rest, _)| {
            let matched = input.fragment_between(&rest);
            (rest, Token::PlainText(matched.into()))
        },
    )
}
/// Receiver for the parser methods below. It is a unit struct with no fields,
/// so it currently carries no state.
struct Context;
impl Context {
    /// Binds a parser method to this context.
    ///
    /// Turns `func(&self, input)` into a one-argument parser `input -> result`
    /// that can be handed to nom combinators.
    const fn partial<'a>(
        &self,
        func: impl Fn(&Self, Span<'a>) -> IResult<Span<'a>, Token<'a>> + 'static,
    ) -> impl Fn(Span<'a>) -> IResult<Span<'a>, Token<'a>> + '_ {
        move |input| func(self, input)
    }

    /// Parses one block-level element. Only block quotes are wired in so far.
    fn root<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        alt((self.partial(Self::tag_quote),))(input)
    }

    /// Parses one inline element: a `<small>` tag, or failing that a single
    /// character of plain text.
    fn inline<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        alt((self.partial(Self::tag_small), self.partial(Self::text)))(input)
    }

    /// Parses a block quote: one or more consecutive lines starting with `>`.
    ///
    /// Up to two line endings are consumed before and after the quote. When no
    /// leading line ending is present, the quote must begin at the start of a
    /// line. A lone `>` with no text is not a quote.
    fn tag_quote<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        let (input, leading_breaks) = tuple((opt(line_ending), opt(line_ending)))(input)?;

        if let (None, None) = leading_breaks {
            // nom_locate columns are 1-based: column 1 is the start of a line.
            if input.get_column() != 1 {
                return fail(input);
            }
        }

        let quote_line = |input| tuple((tag(">"), opt(space), not_line_ending))(input);

        let orig_input = input;
        let (input, lines) = separated_list1(line_ending, quote_line)(input)?;

        let quote_lines = lines
            .into_iter()
            .map(|(_, _, text)| text)
            .collect::<Vec<_>>();

        if let [only] = quote_lines.as_slice() {
            if only.fragment().is_empty() {
                return fail(input);
            }
        }

        // NOTE(review): the quoted text is parsed with `space` only, which looks
        // like a placeholder for a fuller inner parser — confirm.
        let (_, inner) = spliced(&quote_lines, space, orig_input)?;

        let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?;

        Ok((input, Token::Quote(Box::new(inner))))
    }

    /// Builds a parser for content enclosed between `start` and `end`.
    ///
    /// # Parameters
    /// * `start` / `end` – opening and closing delimiters.
    /// * `escape` – when `true`, a backslash directly followed by `start` is
    ///   consumed and yields the delimiter as plain text.
    /// * `matcher_inner` – parser applied repeatedly (at least once) to the
    ///   content, stopping as soon as `end` is next.
    /// * `mapper` – converts the collected inner results into the final token.
    ///
    /// # Returns
    /// A parser that fails if the input does not begin with `start`. If `start`
    /// matches but the content or `end` does not (recoverable error), only the
    /// opening delimiter is consumed and returned as plain text. Failures and
    /// `Incomplete` from the inner parsers are propagated.
    const fn tag_delimited<'a, 'b: 'a, T>(
        &'a self,
        start: &'b str,
        end: &'b str,
        escape: bool,
        matcher_inner: impl Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a,
        mapper: impl Fn(Vec<T>) -> Token<'b> + 'a,
    ) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token<'b>> + '_ {
        move |input| {
            let opening_tag = &tag(start);
            let closing_tag = &tag(end);

            if escape {
                if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), opening_tag))(input) {
                    // `into_fragment` yields a slice of the input itself; borrowing
                    // the local `mark` would not live long enough.
                    return Ok((
                        input_escaped,
                        Token::PlainText(Cow::Borrowed(mark.into_fragment())),
                    ));
                }
            }

            let begin = input;
            let (post_open, _) = opening_tag(input)?;

            let res = tuple((
                many1(tuple((not(closing_tag), &matcher_inner))),
                closing_tag,
            ))(post_open);

            // Unterminated or empty: treat the opening delimiter as literal text.
            if let Err(nom::Err::Error(nom::error::Error { .. })) = res {
                return Ok((
                    post_open,
                    Token::PlainText(begin.fragment_between(&post_open).into()),
                ));
            }

            let (input, (inner, _)) = res?;
            let inner = inner.into_iter().map(|(_, t)| t).collect::<Vec<_>>();

            Ok((input, mapper(inner)))
        }
    }

    /// Parses `<small>…</small>` into `Token::Small`.
    fn tag_small<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        self.tag_delimited(
            "<small>",
            "</small>",
            false,
            self.partial(Self::inline),
            boxing_sequence(Token::Small),
        )(input)
    }

    /// Parses `***…***` into `Token::BoldItalic`.
    // TODO: CommonMark flanking rules
    fn tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        self.tag_delimited(
            "***",
            "***",
            true,
            self.partial(Self::inline),
            boxing_sequence(Token::BoldItalic),
        )(input)
    }

    /// Parses `___…___` into `Token::BoldItalic`.
    // TODO: CommonMark flanking rules
    fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        self.tag_delimited(
            "___",
            "___",
            true,
            self.partial(Self::inline),
            boxing_sequence(Token::BoldItalic),
        )(input)
    }

    /// Parses `<b>…</b>` into `Token::Bold`.
    fn tag_bold<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        self.tag_delimited(
            "<b>",
            "</b>",
            false,
            self.partial(Self::inline),
            boxing_sequence(Token::Bold),
        )(input)
    }

    /// Parses `**…**` into `Token::Bold`.
    // TODO: CommonMark flanking rules
    fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        self.tag_delimited(
            "**",
            "**",
            true,
            self.partial(Self::inline),
            boxing_sequence(Token::Bold),
        )(input)
    }

    /// Parses `__…__` into `Token::Bold`.
    // TODO: CommonMark flanking rules
    fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        self.tag_delimited(
            "__",
            "__",
            true,
            self.partial(Self::inline),
            boxing_sequence(Token::Bold),
        )(input)
    }

    /// Parses `<i>…</i>` into `Token::Italic`.
    fn tag_italic<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        self.tag_delimited(
            "<i>",
            "</i>",
            false,
            self.partial(Self::inline),
            boxing_sequence(Token::Italic),
        )(input)
    }

    /// Parses `*…*` into `Token::Italic`.
    // TODO: CommonMark flanking rules
    fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        self.tag_delimited(
            "*",
            "*",
            true,
            self.partial(Self::inline),
            boxing_sequence(Token::Italic),
        )(input)
    }

    /// Parses `_…_` into `Token::Italic`.
    // TODO: CommonMark flanking rules
    fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        self.tag_delimited(
            "_",
            "_",
            true,
            self.partial(Self::inline),
            boxing_sequence(Token::Italic),
        )(input)
    }

    /// Parses `<s>…</s>` into `Token::Strikethrough`.
    fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        self.tag_delimited(
            "<s>",
            "</s>",
            false,
            self.partial(Self::inline),
            boxing_sequence(Token::Strikethrough),
        )(input)
    }

    /// Parses `~~…~~` into `Token::Strikethrough`. The content may not contain a
    /// line ending.
    // TODO: CommonMark flanking rules
    fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        self.tag_delimited(
            "~~",
            "~~",
            true,
            move |input| {
                // `not(line_ending)` is a zero-width lookahead. `not_line_ending`
                // would swallow the rest of the line, closing `~~` included.
                tuple((not(line_ending), self.partial(Self::inline)))(input)
                    .map(|(rest, (_, token))| (rest, token))
            },
            boxing_sequence(Token::Strikethrough),
        )(input)
    }

    /// Parses `` `…` `` into `Token::InlineCode`. The content may not contain a
    /// backtick, an acute accent (`´`) or a line ending.
    fn tag_inline_code<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        self.tag_delimited(
            "`",
            // An empty closing delimiter matches everywhere, which makes the
            // "not at the closing delimiter" check fail before any content is read.
            "`",
            true,
            move |input| {
                tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar))(input)
                    .map(|(i, (_skip, c))| (i, c))
            },
            collect_char_sequence(Token::InlineCode),
        )(input)
    }

    /// Parses `\(…\)` into `Token::InlineMath`. The content may not contain a
    /// line ending.
    fn tag_inline_math<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        self.tag_delimited(
            "\\(",
            "\\)",
            false,
            // Zero-width lookahead, then one character; see `tag_strikethrough_tilde`.
            move |input| tuple((not(line_ending), anychar))(input).map(|(i, (_skip, c))| (i, c)),
            collect_char_sequence(Token::InlineMath),
        )(input)
    }

    /// Consumes exactly one character and returns it as `Token::PlainText`.
    /// Fails on empty input.
    fn text<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        let before = input;
        let (input, _) = anychar(input)?;
        Ok((
            input,
            Token::PlainText(before.fragment_between(&input).into()),
        ))
    }
}