Janky sequence unnesting, and an attempt to salvage nested parsing inside incorrectly formatted tags

This commit is contained in:
Natty 2023-10-07 01:46:20 +02:00
parent 453891ddf4
commit 703e1191c2
Signed by: natty
GPG Key ID: BF6CB659ADEE60EC
4 changed files with 200 additions and 48 deletions

1
Cargo.lock generated
View File

@@ -1622,6 +1622,7 @@ dependencies = [
name = "mmm_parser" name = "mmm_parser"
version = "0.2.1-alpha" version = "0.2.1-alpha"
dependencies = [ dependencies = [
"either",
"emojis", "emojis",
"nom", "nom",
"nom_locate", "nom_locate",

View File

@@ -29,6 +29,7 @@ cached = "0.46"
cfg-if = "1" cfg-if = "1"
chrono = "0.4" chrono = "0.4"
dotenvy = "0.15" dotenvy = "0.15"
either = "1.9"
emojis = "0.6" emojis = "0.6"
futures-core = "0.3" futures-core = "0.3"
futures-util = "0.3" futures-util = "0.3"

View File

@@ -5,6 +5,7 @@ edition.workspace = true
license = "MIT OR Apache-2.0" license = "MIT OR Apache-2.0"
[dependencies] [dependencies]
either = { workspace = true }
emojis = { workspace = true } emojis = { workspace = true }
nom = { workspace = true } nom = { workspace = true }
nom_locate = { workspace = true } nom_locate = { workspace = true }

View File

@@ -1,3 +1,4 @@
use either::Either;
use nom::branch::alt; use nom::branch::alt;
use nom::bytes::complete::tag; use nom::bytes::complete::tag;
use nom::character::complete::{ use nom::character::complete::{
@@ -12,6 +13,7 @@ use nom::{IResult, Offset, Slice};
use nom_locate::LocatedSpan; use nom_locate::LocatedSpan;
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::HashMap; use std::collections::HashMap;
use std::convert::identity;
use unicode_segmentation::UnicodeSegmentation; use unicode_segmentation::UnicodeSegmentation;
#[derive(Copy, Clone, Debug, Eq, PartialEq)] #[derive(Copy, Clone, Debug, Eq, PartialEq)]
@@ -145,6 +147,27 @@ impl Token<'_> {
} }
} }
if let Token::Sequence(seq) = tok {
let items = seq.iter().map(Token::merged).flat_map(|t| match t {
Token::Sequence(seq) => Either::Left(seq.into_iter()),
other => Either::Right(std::iter::once(other)),
});
for item in items {
if let Some(Token::PlainText(last)) = acc.last_mut() {
if let Token::PlainText(tok_text) = item {
*last = Cow::from(last.to_string() + tok_text.as_ref());
continue;
}
}
acc.push(item);
}
return acc;
}
acc.push(tok.merged()); acc.push(tok.merged());
acc acc
}); });
@@ -200,13 +223,13 @@ impl SliceOffset for Span<'_> {
} }
} }
const fn boxing_sequence<'a>( #[inline]
func: impl Fn(Box<Token<'a>>) -> Token<'a>, fn boxing_token<'a>(func: impl Fn(Box<Token<'a>>) -> Token<'a>) -> impl Fn(Token<'a>) -> Token<'a> {
) -> impl Fn(Vec<Token<'a>>) -> Token<'a> { move |tokens| func(Box::new(tokens))
move |tokens| func(Box::new(Token::Sequence(tokens)))
} }
const fn collect_char_sequence<'a>( #[inline]
fn collect_char_sequence<'a>(
func: impl Fn(Cow<'a, str>) -> Token<'a>, func: impl Fn(Cow<'a, str>) -> Token<'a>,
) -> impl Fn(Vec<char>) -> Token<'a> { ) -> impl Fn(Vec<char>) -> Token<'a> {
move |chars| func(Cow::Owned(chars.into_iter().collect())) move |chars| func(Cow::Owned(chars.into_iter().collect()))
@@ -215,14 +238,14 @@ const fn collect_char_sequence<'a>(
fn spliced<'a>( fn spliced<'a>(
segments: &[Span<'a>], segments: &[Span<'a>],
func: impl Fn(Span) -> IResult<Span, Token>, func: impl Fn(Span) -> IResult<Span, Token>,
output_mapper: impl Fn(Box<Token<'static>>) -> Token<'static>,
parent: Span<'a>, parent: Span<'a>,
) -> IResult<Span<'a>, Token<'static>, nom::error::Error<Span<'a>>> { ) -> IResult<Span<'a>, Token<'static>, nom::error::Error<Span<'a>>> {
let combined = segments let combined = segments
.iter() .iter()
.copied() .copied()
.map(Span::into_fragment) .map(Span::into_fragment)
.collect::<String>(); .collect::<Vec<_>>()
.join("\n");
let cum_offset_combined = segments let cum_offset_combined = segments
.iter() .iter()
.scan(0, |acc, &x| { .scan(0, |acc, &x| {
@@ -234,7 +257,7 @@ fn spliced<'a>(
cum_offset_combined cum_offset_combined
.iter() .iter()
.enumerate() .enumerate()
.filter(|(_, &o)| o >= input.location_offset()) .take_while(|(_, &o)| o > input.location_offset())
.map(|(i, o)| (segments[i], o)) .map(|(i, o)| (segments[i], o))
.last() .last()
}; };
@@ -275,7 +298,7 @@ fn spliced<'a>(
parent parent
}; };
Ok((out, output_mapper(Box::new(inner.owned())))) Ok((out, inner.owned()))
} }
fn space(input: Span) -> IResult<Span, Token> { fn space(input: Span) -> IResult<Span, Token> {
@@ -370,6 +393,22 @@ impl Context {
Ok((input, token)) Ok((input, token))
} }
fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let (input, token) = alt((
self.partial(Self::unicode_emoji),
self.partial(Self::url_no_embed),
self.partial(Self::tag_inline_code),
self.partial(Self::tag_inline_math),
self.partial(Self::tag_func),
self.partial(Self::tag_mention),
self.partial(Self::tag_hashtag),
self.partial(Self::shortcode_emoji),
self.partial(Self::raw_url),
self.partial(Self::text),
))(input)?;
Ok((input, token))
}
fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> { fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
let (input, token) = alt(( let (input, token) = alt((
self.partial(Self::unicode_emoji), self.partial(Self::unicode_emoji),
@@ -416,12 +455,7 @@ impl Context {
return fail(input); return fail(input);
} }
let (_, inner) = spliced( let (_, inner) = spliced(&quote_lines, self.partial(Self::full), orig_input)?;
&quote_lines,
self.partial(Self::full),
Token::Quote,
orig_input,
)?;
let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?; let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?;
@@ -450,7 +484,10 @@ impl Context {
let (input, _) = not(not(line_ending))(input)?; let (input, _) = not(not(line_ending))(input)?;
let (input, _) = opt(line_ending)(input)?; let (input, _) = opt(line_ending)(input)?;
Ok((input, boxing_sequence(Token::Center)(center_seq))) Ok((
input,
boxing_token(Token::Center)(Token::Sequence(center_seq)),
))
} }
fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> { fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
@@ -522,13 +559,16 @@ impl Context {
)) ))
} }
const fn tag_delimited<'a, 'b: 'a, T>( #[inline]
fn tag_delimited<'a, 'b: 'a, T>(
&'a self, &'a self,
start: &'b str, start: &'b str,
end: &'b str, end: &'b str,
escape: bool, escape: bool,
matcher_inner: impl Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a, matcher_inner: impl Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a,
mapper: impl Fn(Vec<T>) -> Token<'b> + 'a, matcher_inner_fallback: impl Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a,
collector: impl Fn(Vec<T>) -> Token<'b> + 'a,
mapper: impl Fn(Token<'b>) -> Token<'b> + 'a,
) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token<'b>> + '_ { ) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token<'b>> + '_ {
move |input| { move |input| {
let opening_tag = &tag(start); let opening_tag = &tag(start);
@@ -548,18 +588,40 @@ impl Context {
closing_tag, closing_tag,
))(post_open); ))(post_open);
if let Err(nom::Err::Error(nom::error::Error { .. })) = res { if let Err(nom::Err::Error(nom::error::Error {
input: input_past_err,
..
})) = res
{
let res_fallback = tuple((
many1(tuple((not(closing_tag), &matcher_inner_fallback))),
closing_tag,
))(post_open);
if res_fallback.is_err() {
return Ok((
input_past_err,
Token::PlainText(begin.fragment_between(&input_past_err).into()),
));
}
let (input, (inner, closing)) = res_fallback.unwrap();
let inner = inner.into_iter().map(|(_, t)| t).collect::<Vec<_>>();
return Ok(( return Ok((
post_open, input,
Token::PlainText(begin.fragment_between(&post_open).into()), Token::Sequence(vec![
Token::PlainText(begin.fragment_between(&post_open).into()),
collector(inner),
Token::PlainText(closing.into_fragment().into()),
]),
)); ));
} }
let (input, (inner, _)) = res?; let (input, (inner, _)) = res?;
let inner = inner.into_iter().map(|(_, t)| t).collect::<Vec<_>>(); let inner = inner.into_iter().map(|(_, t)| t).collect::<Vec<_>>();
Ok((input, mapper(inner))) Ok((input, mapper(collector(inner))))
} }
} }
@@ -632,8 +694,10 @@ impl Context {
"<small>", "<small>",
"</small>", "</small>",
false, false,
self.partial(Self::inline), self.partial(Self::inline_single),
boxing_sequence(Token::Small), self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Small),
)(input) )(input)
} }
@@ -643,8 +707,10 @@ impl Context {
"***", "***",
"***", "***",
true, true,
self.partial(Self::inline), self.partial(Self::inline_single),
boxing_sequence(Token::BoldItalic), self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::BoldItalic),
)(input) )(input)
} }
@@ -654,8 +720,10 @@ impl Context {
"___", "___",
"___", "___",
true, true,
self.partial(Self::inline), self.partial(Self::inline_single),
boxing_sequence(Token::BoldItalic), self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::BoldItalic),
)(input) )(input)
} }
@@ -664,8 +732,10 @@ impl Context {
"<b>", "<b>",
"</b>", "</b>",
false, false,
self.partial(Self::inline), self.partial(Self::inline_single),
boxing_sequence(Token::Bold), self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Bold),
)(input) )(input)
} }
@@ -675,8 +745,10 @@ impl Context {
"**", "**",
"**", "**",
true, true,
self.partial(Self::inline), self.partial(Self::inline_single),
boxing_sequence(Token::Bold), self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Bold),
)(input) )(input)
} }
@@ -686,8 +758,10 @@ impl Context {
"__", "__",
"__", "__",
true, true,
self.partial(Self::inline), self.partial(Self::inline_single),
boxing_sequence(Token::Bold), self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Bold),
)(input) )(input)
} }
@@ -696,8 +770,10 @@ impl Context {
"<i>", "<i>",
"</i>", "</i>",
false, false,
self.partial(Self::inline), self.partial(Self::inline_single),
boxing_sequence(Token::Italic), self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Italic),
)(input) )(input)
} }
@@ -707,8 +783,10 @@ impl Context {
"*", "*",
"*", "*",
true, true,
self.partial(Self::inline), self.partial(Self::inline_single),
boxing_sequence(Token::Italic), self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Italic),
)(input) )(input)
} }
@@ -718,8 +796,10 @@ impl Context {
"_", "_",
"_", "_",
true, true,
self.partial(Self::inline), self.partial(Self::inline_single),
boxing_sequence(Token::Italic), self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Italic),
)(input) )(input)
} }
@@ -728,8 +808,10 @@ impl Context {
"<s>", "<s>",
"</s>", "</s>",
false, false,
self.partial(Self::inline), self.partial(Self::inline_single),
boxing_sequence(Token::Strikethrough), self.partial(Self::inline_non_formatting_single),
Token::Sequence,
boxing_token(Token::Strikethrough),
)(input) )(input)
} }
@@ -740,9 +822,18 @@ impl Context {
"~~", "~~",
true, true,
move |input| { move |input| {
tuple((not_line_ending, self.partial(Self::inline)))(input).map(|(i, t)| (i, t.1)) tuple((not_line_ending, self.partial(Self::inline_single)))(input)
.map(|(i, t)| (i, t.1))
}, },
boxing_sequence(Token::Strikethrough), move |input| {
tuple((
not_line_ending,
self.partial(Self::inline_non_formatting_single),
))(input)
.map(|(i, t)| (i, t.1))
},
Token::Sequence,
boxing_token(Token::Strikethrough),
)(input) )(input)
} }
@@ -755,7 +846,9 @@ impl Context {
tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar))(input) tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar))(input)
.map(|(i, (_skip, c))| (i, c)) .map(|(i, (_skip, c))| (i, c))
}, },
fail,
collect_char_sequence(Token::InlineCode), collect_char_sequence(Token::InlineCode),
identity,
)(input) )(input)
} }
@@ -764,8 +857,10 @@ impl Context {
"\\(", "\\(",
"\\)", "\\)",
false, false,
move |input| tuple((not_line_ending, anychar))(input).map(|(i, (_skip, c))| (i, c)), move |input| tuple((not(line_ending), anychar))(input).map(|(i, (_skip, c))| (i, c)),
collect_char_sequence(Token::InlineMath), fail,
collect_char_sequence(Token::InlineCode),
identity,
)(input) )(input)
} }
@@ -1044,7 +1139,61 @@ mod test {
])) ]))
}, },
Context.full(Span::new(emoji)).unwrap().1.merged() Context.full(Span::new(emoji)).unwrap().1.merged()
) );
let bold_italic = r#"***bold italic***"#;
assert_eq!(
Token::BoldItalic(Box::new(Token::PlainText("bold italic".into()))),
Context.full(Span::new(bold_italic)).unwrap().1.merged()
);
let bold_italic_tag = r#"<b><i>bold italic</i></b>"#;
assert_eq!(
Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText(
"bold italic".into()
))))),
Context.full(Span::new(bold_italic_tag)).unwrap().1.merged()
);
assert_eq!(
Token::Sequence(vec![
Token::PlainText("<b>bold ".into()),
Token::Mention {
mention_type: crate::MentionType::User,
name: "tag1".into(),
host: None
},
Token::PlainText(" <i> ".into()),
Token::Mention {
mention_type: crate::MentionType::User,
name: "tag2".into(),
host: None
},
Token::PlainText(" </b>italic</i>".into())
]),
Context
.full(Span::new(r#"<b>bold @tag1 <i> @tag2 </b>italic</i>"#))
.unwrap()
.1
.merged()
);
let quote = r#"
> test
> <i>
> italic
> </i>
>> Nested quote
"#;
assert_eq!(
Token::Quote(Box::new(Token::Sequence(vec![
Token::PlainText("test\n".into()),
Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))),
Token::Quote(Box::new(Token::PlainText("Nested quote".into())))
]))),
Context.full(Span::new(quote)).unwrap().1.merged()
);
} }
#[test] #[test]