Janky sequence unnesting, plus an attempt to salvage nested parsing in incorrect formatting tags

Natty 2023-10-07 01:46:20 +02:00
parent 453891ddf4
commit 703e1191c2
Signed by: natty
GPG Key ID: BF6CB659ADEE60EC
4 changed files with 200 additions and 48 deletions
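The bulk of the change is in the mmm_parser tokenizer below: Token::merged() now flattens nested Sequence tokens one level and concatenates adjacent PlainText runs, and tag_delimited() gains a fallback path for malformed formatting tags. As a rough standalone sketch of the unnesting step only (the cut-down Token enum and the merge_children helper are illustrative stand-ins, not the crate's actual API):

use either::Either;
use std::borrow::Cow;

#[derive(Debug, PartialEq)]
enum Token<'a> {
    PlainText(Cow<'a, str>),
    Sequence(Vec<Token<'a>>),
    Bold(Box<Token<'a>>),
}

fn merge_children<'a>(children: Vec<Token<'a>>) -> Vec<Token<'a>> {
    // Either lets the two match arms return different iterator types while
    // still satisfying flat_map's requirement of a single return type.
    let items = children.into_iter().flat_map(|t| match t {
        Token::Sequence(seq) => Either::Left(seq.into_iter()),
        other => Either::Right(std::iter::once(other)),
    });

    let mut acc = Vec::new();
    for item in items {
        // Append to the previous PlainText token instead of pushing a new one.
        if let Some(Token::PlainText(last)) = acc.last_mut() {
            if let Token::PlainText(text) = item {
                *last = Cow::from(last.to_string() + text.as_ref());
                continue;
            }
        }
        acc.push(item);
    }
    acc
}

fn main() {
    let children = vec![
        Token::PlainText("a".into()),
        Token::Sequence(vec![
            Token::PlainText("b".into()),
            Token::Bold(Box::new(Token::PlainText("c".into()))),
        ]),
        Token::PlainText("d".into()),
    ];
    // The nested sequence is unnested, "a" and "b" merge, "d" stays separate.
    assert_eq!(
        merge_children(children),
        vec![
            Token::PlainText("ab".into()),
            Token::Bold(Box::new(Token::PlainText("c".into()))),
            Token::PlainText("d".into()),
        ]
    );
}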

Cargo.lock (generated)

@@ -1622,6 +1622,7 @@ dependencies = [
name = "mmm_parser"
version = "0.2.1-alpha"
dependencies = [
"either",
"emojis",
"nom",
"nom_locate",


@@ -29,6 +29,7 @@ cached = "0.46"
cfg-if = "1"
chrono = "0.4"
dotenvy = "0.15"
either = "1.9"
emojis = "0.6"
futures-core = "0.3"
futures-util = "0.3"


@@ -5,6 +5,7 @@ edition.workspace = true
license = "MIT OR Apache-2.0"
[dependencies]
either = { workspace = true }
emojis = { workspace = true }
nom = { workspace = true }
nom_locate = { workspace = true }


@@ -1,3 +1,4 @@
use either::Either;
use nom::branch::alt;
use nom::bytes::complete::tag;
use nom::character::complete::{
@@ -12,6 +13,7 @@ use nom::{IResult, Offset, Slice};
use nom_locate::LocatedSpan;
use std::borrow::Cow;
use std::collections::HashMap;
use std::convert::identity;
use unicode_segmentation::UnicodeSegmentation;
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
@@ -145,6 +147,27 @@ impl Token<'_> {
}
}
if let Token::Sequence(seq) = tok {
let items = seq.iter().map(Token::merged).flat_map(|t| match t {
Token::Sequence(seq) => Either::Left(seq.into_iter()),
other => Either::Right(std::iter::once(other)),
});
for item in items {
if let Some(Token::PlainText(last)) = acc.last_mut() {
if let Token::PlainText(tok_text) = item {
*last = Cow::from(last.to_string() + tok_text.as_ref());
continue;
}
}
acc.push(item);
}
return acc;
}
acc.push(tok.merged());
acc
});
@@ -200,13 +223,13 @@ impl SliceOffset for Span<'_> {
}
}
const fn boxing_sequence<'a>(
func: impl Fn(Box<Token<'a>>) -> Token<'a>,
) -> impl Fn(Vec<Token<'a>>) -> Token<'a> {
move |tokens| func(Box::new(Token::Sequence(tokens)))
#[inline]
fn boxing_token<'a>(func: impl Fn(Box<Token<'a>>) -> Token<'a>) -> impl Fn(Token<'a>) -> Token<'a> {
move |tokens| func(Box::new(tokens))
}
const fn collect_char_sequence<'a>(
#[inline]
fn collect_char_sequence<'a>(
func: impl Fn(Cow<'a, str>) -> Token<'a>,
) -> impl Fn(Vec<char>) -> Token<'a> {
move |chars| func(Cow::Owned(chars.into_iter().collect()))
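boxing_sequence, which wrapped a Vec of tokens in Token::Sequence before boxing, is replaced by boxing_token, which boxes whatever single token it is given; call sites that still want a sequence now wrap it explicitly. A tiny sketch of the new shape, using a cut-down Token enum for illustration:

use std::borrow::Cow;

#[derive(Debug, PartialEq)]
enum Token<'a> {
    PlainText(Cow<'a, str>),
    Sequence(Vec<Token<'a>>),
    Bold(Box<Token<'a>>),
}

// Wraps any single token in a boxing constructor such as Token::Bold.
fn boxing_token<'a>(
    func: impl Fn(Box<Token<'a>>) -> Token<'a>,
) -> impl Fn(Token<'a>) -> Token<'a> {
    move |token| func(Box::new(token))
}

fn main() {
    // The sequence wrapping now happens at the call site, mirroring
    // boxing_token(Token::Center)(Token::Sequence(center_seq)) in the diff.
    let bold = boxing_token(Token::Bold)(Token::Sequence(vec![
        Token::PlainText("hi".into()),
    ]));
    assert_eq!(
        bold,
        Token::Bold(Box::new(Token::Sequence(vec![Token::PlainText("hi".into())])))
    );
}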
@@ -215,14 +238,14 @@ const fn collect_char_sequence<'a>(
fn spliced<'a>(
segments: &[Span<'a>],
func: impl Fn(Span) -> IResult<Span, Token>,
output_mapper: impl Fn(Box<Token<'static>>) -> Token<'static>,
parent: Span<'a>,
) -> IResult<Span<'a>, Token<'static>, nom::error::Error<Span<'a>>> {
let combined = segments
.iter()
.copied()
.map(Span::into_fragment)
.collect::<String>();
.collect::<Vec<_>>()
.join("\n");
let cum_offset_combined = segments
.iter()
.scan(0, |acc, &x| {
@@ -234,7 +257,7 @@ fn spliced<'a>(
cum_offset_combined
.iter()
.enumerate()
.filter(|(_, &o)| o >= input.location_offset())
.take_while(|(_, &o)| o > input.location_offset())
.map(|(i, o)| (segments[i], o))
.last()
};
@@ -275,7 +298,7 @@
parent
};
Ok((out, output_mapper(Box::new(inner.owned()))))
Ok((out, inner.owned()))
}
fn space(input: Span) -> IResult<Span, Token> {
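In spliced(), the quote-body fragments are now re-joined with newlines rather than concatenated into one String, so line boundaries survive into the combined input that the inner parser sees (the new quote test below relies on this for an <i>…</i> block spanning several quoted lines). The difference in isolation:

fn main() {
    let segments = ["first line", "second line"];
    // Before: fragments were collected straight into one String.
    assert_eq!(segments.concat(), "first linesecond line");
    // After: they are joined with '\n', preserving the original line boundaries.
    assert_eq!(segments.join("\n"), "first line\nsecond line");
}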
@@ -370,6 +393,22 @@ impl Context {
Ok((input, token))
}
fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let (input, token) = alt((
self.partial(Self::unicode_emoji),
self.partial(Self::url_no_embed),
self.partial(Self::tag_inline_code),
self.partial(Self::tag_inline_math),
self.partial(Self::tag_func),
self.partial(Self::tag_mention),
self.partial(Self::tag_hashtag),
self.partial(Self::shortcode_emoji),
self.partial(Self::raw_url),
self.partial(Self::text),
))(input)?;
Ok((input, token))
}
fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let (input, token) = alt((
self.partial(Self::unicode_emoji),
@@ -416,12 +455,7 @@ impl Context {
return fail(input);
}
let (_, inner) = spliced(
&quote_lines,
self.partial(Self::full),
Token::Quote,
orig_input,
)?;
let (_, inner) = spliced(&quote_lines, self.partial(Self::full), orig_input)?;
let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?;
@@ -450,7 +484,10 @@
let (input, _) = not(not(line_ending))(input)?;
let (input, _) = opt(line_ending)(input)?;
Ok((input, boxing_sequence(Token::Center)(center_seq)))
Ok((
input,
boxing_token(Token::Center)(Token::Sequence(center_seq)),
))
}
fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
@@ -522,13 +559,16 @@ impl Context {
))
}
const fn tag_delimited<'a, 'b: 'a, T>(
#[inline]
fn tag_delimited<'a, 'b: 'a, T>(
&'a self,
start: &'b str,
end: &'b str,
escape: bool,
matcher_inner: impl Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a,
mapper: impl Fn(Vec<T>) -> Token<'b> + 'a,
matcher_inner_fallback: impl Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a,
collector: impl Fn(Vec<T>) -> Token<'b> + 'a,
mapper: impl Fn(Token<'b>) -> Token<'b> + 'a,
) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token<'b>> + '_ {
move |input| {
let opening_tag = &tag(start);
@@ -548,18 +588,40 @@ impl Context {
closing_tag,
))(post_open);
if let Err(nom::Err::Error(nom::error::Error { .. })) = res {
if let Err(nom::Err::Error(nom::error::Error {
input: input_past_err,
..
})) = res
{
let res_fallback = tuple((
many1(tuple((not(closing_tag), &matcher_inner_fallback))),
closing_tag,
))(post_open);
if res_fallback.is_err() {
return Ok((
input_past_err,
Token::PlainText(begin.fragment_between(&input_past_err).into()),
));
}
let (input, (inner, closing)) = res_fallback.unwrap();
let inner = inner.into_iter().map(|(_, t)| t).collect::<Vec<_>>();
return Ok((
post_open,
Token::PlainText(begin.fragment_between(&post_open).into()),
input,
Token::Sequence(vec![
Token::PlainText(begin.fragment_between(&post_open).into()),
collector(inner),
Token::PlainText(closing.into_fragment().into()),
]),
));
}
let (input, (inner, _)) = res?;
let inner = inner.into_iter().map(|(_, t)| t).collect::<Vec<_>>();
Ok((input, mapper(inner)))
Ok((input, mapper(collector(inner))))
}
}
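This is the "salvage" half of the commit: when the strict inner matcher cannot reach the closing tag, tag_delimited retries with a non-formatting fallback matcher, and if that also fails it downgrades the consumed region to plain text rather than failing the whole parse. A much-condensed sketch of the degrade-to-plain-text idea over plain &str parsers (strict, salvage, and Piece are illustrative names, not the crate's API; the real code additionally wraps a successful fallback pass in a Sequence with the literal opening and closing tags kept as PlainText):

use nom::branch::alt;
use nom::bytes::complete::{tag, take_until};
use nom::sequence::delimited;
use nom::IResult;

#[derive(Debug, PartialEq)]
enum Piece<'a> {
    Styled(&'a str), // properly delimited content
    Plain(&'a str),  // salvaged literal text
}

// Strict path: "<b>" ... "</b>" with the closing tag required.
fn strict(input: &str) -> IResult<&str, Piece> {
    delimited(tag("<b>"), take_until("</b>"), tag("</b>"))(input)
        .map(|(rest, inner)| (rest, Piece::Styled(inner)))
}

// Fallback: if the closing tag never appears, keep the opening tag as plain
// text so the rest of the input can still be handled by other rules.
fn salvage(input: &str) -> IResult<&str, Piece> {
    tag("<b>")(input).map(|(rest, open)| (rest, Piece::Plain(open)))
}

fn bold_or_plain(input: &str) -> IResult<&str, Piece> {
    alt((strict, salvage))(input)
}

fn main() {
    assert_eq!(
        bold_or_plain("<b>bold</b> rest"),
        Ok((" rest", Piece::Styled("bold")))
    );
    // No closing tag: "<b>" is downgraded to plain text instead of an error.
    assert_eq!(
        bold_or_plain("<b>never closed"),
        Ok(("never closed", Piece::Plain("<b>")))
    );
}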
@@ -632,8 +694,10 @@ impl Context {
"<small>",
"</small>",
false,
self.partial(Self::inline),
boxing_sequence(Token::Small),
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Small),
)(input)
}
@@ -643,8 +707,10 @@ impl Context {
"***",
"***",
true,
self.partial(Self::inline),
boxing_sequence(Token::BoldItalic),
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::BoldItalic),
)(input)
}
@@ -654,8 +720,10 @@ impl Context {
"___",
"___",
true,
self.partial(Self::inline),
boxing_sequence(Token::BoldItalic),
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::BoldItalic),
)(input)
}
@@ -664,8 +732,10 @@ impl Context {
"<b>",
"</b>",
false,
self.partial(Self::inline),
boxing_sequence(Token::Bold),
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Bold),
)(input)
}
@@ -675,8 +745,10 @@ impl Context {
"**",
"**",
true,
self.partial(Self::inline),
boxing_sequence(Token::Bold),
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Bold),
)(input)
}
@@ -686,8 +758,10 @@ impl Context {
"__",
"__",
true,
self.partial(Self::inline),
boxing_sequence(Token::Bold),
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Bold),
)(input)
}
@@ -696,8 +770,10 @@ impl Context {
"<i>",
"</i>",
false,
self.partial(Self::inline),
boxing_sequence(Token::Italic),
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Italic),
)(input)
}
@@ -707,8 +783,10 @@ impl Context {
"*",
"*",
true,
self.partial(Self::inline),
boxing_sequence(Token::Italic),
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Italic),
)(input)
}
@@ -718,8 +796,10 @@ impl Context {
"_",
"_",
true,
self.partial(Self::inline),
boxing_sequence(Token::Italic),
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Italic),
)(input)
}
@@ -728,8 +808,10 @@ impl Context {
"<s>",
"</s>",
false,
self.partial(Self::inline),
boxing_sequence(Token::Strikethrough),
self.partial(Self::inline_single),
self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Strikethrough),
)(input)
}
@@ -740,9 +822,18 @@ impl Context {
"~~",
true,
move |input| {
tuple((not_line_ending, self.partial(Self::inline)))(input).map(|(i, t)| (i, t.1))
tuple((not_line_ending, self.partial(Self::inline_single)))(input)
.map(|(i, t)| (i, t.1))
},
boxing_sequence(Token::Strikethrough),
move |input| {
tuple((
not_line_ending,
self.partial(Self::inline_non_formatting_single),
))(input)
.map(|(i, t)| (i, t.1))
},
Token::Sequence,
boxing_token(Token::Strikethrough),
)(input)
}
@@ -755,7 +846,9 @@ impl Context {
tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar))(input)
.map(|(i, (_skip, c))| (i, c))
},
fail,
collect_char_sequence(Token::InlineCode),
identity,
)(input)
}
@@ -764,8 +857,10 @@ impl Context {
"\\(",
"\\)",
false,
move |input| tuple((not_line_ending, anychar))(input).map(|(i, (_skip, c))| (i, c)),
collect_char_sequence(Token::InlineMath),
move |input| tuple((not(line_ending), anychar))(input).map(|(i, (_skip, c))| (i, c)),
fail,
collect_char_sequence(Token::InlineCode),
identity,
)(input)
}
@@ -1044,7 +1139,61 @@ mod test {
]))
},
Context.full(Span::new(emoji)).unwrap().1.merged()
)
);
let bold_italic = r#"***bold italic***"#;
assert_eq!(
Token::BoldItalic(Box::new(Token::PlainText("bold italic".into()))),
Context.full(Span::new(bold_italic)).unwrap().1.merged()
);
let bold_italic_tag = r#"<b><i>bold italic</i></b>"#;
assert_eq!(
Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText(
"bold italic".into()
))))),
Context.full(Span::new(bold_italic_tag)).unwrap().1.merged()
);
assert_eq!(
Token::Sequence(vec![
Token::PlainText("<b>bold ".into()),
Token::Mention {
mention_type: crate::MentionType::User,
name: "tag1".into(),
host: None
},
Token::PlainText(" <i> ".into()),
Token::Mention {
mention_type: crate::MentionType::User,
name: "tag2".into(),
host: None
},
Token::PlainText(" </b>italic</i>".into())
]),
Context
.full(Span::new(r#"<b>bold @tag1 <i> @tag2 </b>italic</i>"#))
.unwrap()
.1
.merged()
);
let quote = r#"
> test
> <i>
> italic
> </i>
>> Nested quote
"#;
assert_eq!(
Token::Quote(Box::new(Token::Sequence(vec![
Token::PlainText("test\n".into()),
Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))),
Token::Quote(Box::new(Token::PlainText("Nested quote".into())))
]))),
Context.full(Span::new(quote)).unwrap().1.merged()
);
}
#[test]