Janky sequence unnesting and attempting to salvage nested parsing in incorrect formatting tags
This commit is contained in:
parent
453891ddf4
commit
703e1191c2
|
@ -1622,6 +1622,7 @@ dependencies = [
|
|||
name = "mmm_parser"
|
||||
version = "0.2.1-alpha"
|
||||
dependencies = [
|
||||
"either",
|
||||
"emojis",
|
||||
"nom",
|
||||
"nom_locate",
|
||||
|
|
|
@ -29,6 +29,7 @@ cached = "0.46"
|
|||
cfg-if = "1"
|
||||
chrono = "0.4"
|
||||
dotenvy = "0.15"
|
||||
either = "1.9"
|
||||
emojis = "0.6"
|
||||
futures-core = "0.3"
|
||||
futures-util = "0.3"
|
||||
|
|
|
@ -5,6 +5,7 @@ edition.workspace = true
|
|||
license = "MIT OR Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
either = { workspace = true }
|
||||
emojis = { workspace = true }
|
||||
nom = { workspace = true }
|
||||
nom_locate = { workspace = true }
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
use either::Either;
|
||||
use nom::branch::alt;
|
||||
use nom::bytes::complete::tag;
|
||||
use nom::character::complete::{
|
||||
|
@ -12,6 +13,7 @@ use nom::{IResult, Offset, Slice};
|
|||
use nom_locate::LocatedSpan;
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::convert::identity;
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
|
||||
|
@ -145,6 +147,27 @@ impl Token<'_> {
|
|||
}
|
||||
}
|
||||
|
||||
if let Token::Sequence(seq) = tok {
|
||||
let items = seq.iter().map(Token::merged).flat_map(|t| match t {
|
||||
Token::Sequence(seq) => Either::Left(seq.into_iter()),
|
||||
other => Either::Right(std::iter::once(other)),
|
||||
});
|
||||
|
||||
for item in items {
|
||||
if let Some(Token::PlainText(last)) = acc.last_mut() {
|
||||
if let Token::PlainText(tok_text) = item {
|
||||
*last = Cow::from(last.to_string() + tok_text.as_ref());
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
acc.push(item);
|
||||
}
|
||||
|
||||
return acc;
|
||||
}
|
||||
|
||||
acc.push(tok.merged());
|
||||
acc
|
||||
});
|
||||
|
@ -200,13 +223,13 @@ impl SliceOffset for Span<'_> {
|
|||
}
|
||||
}
|
||||
|
||||
const fn boxing_sequence<'a>(
|
||||
func: impl Fn(Box<Token<'a>>) -> Token<'a>,
|
||||
) -> impl Fn(Vec<Token<'a>>) -> Token<'a> {
|
||||
move |tokens| func(Box::new(Token::Sequence(tokens)))
|
||||
#[inline]
|
||||
fn boxing_token<'a>(func: impl Fn(Box<Token<'a>>) -> Token<'a>) -> impl Fn(Token<'a>) -> Token<'a> {
|
||||
move |tokens| func(Box::new(tokens))
|
||||
}
|
||||
|
||||
const fn collect_char_sequence<'a>(
|
||||
#[inline]
|
||||
fn collect_char_sequence<'a>(
|
||||
func: impl Fn(Cow<'a, str>) -> Token<'a>,
|
||||
) -> impl Fn(Vec<char>) -> Token<'a> {
|
||||
move |chars| func(Cow::Owned(chars.into_iter().collect()))
|
||||
|
@ -215,14 +238,14 @@ const fn collect_char_sequence<'a>(
|
|||
fn spliced<'a>(
|
||||
segments: &[Span<'a>],
|
||||
func: impl Fn(Span) -> IResult<Span, Token>,
|
||||
output_mapper: impl Fn(Box<Token<'static>>) -> Token<'static>,
|
||||
parent: Span<'a>,
|
||||
) -> IResult<Span<'a>, Token<'static>, nom::error::Error<Span<'a>>> {
|
||||
let combined = segments
|
||||
.iter()
|
||||
.copied()
|
||||
.map(Span::into_fragment)
|
||||
.collect::<String>();
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let cum_offset_combined = segments
|
||||
.iter()
|
||||
.scan(0, |acc, &x| {
|
||||
|
@ -234,7 +257,7 @@ fn spliced<'a>(
|
|||
cum_offset_combined
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, &o)| o >= input.location_offset())
|
||||
.take_while(|(_, &o)| o > input.location_offset())
|
||||
.map(|(i, o)| (segments[i], o))
|
||||
.last()
|
||||
};
|
||||
|
@ -275,7 +298,7 @@ fn spliced<'a>(
|
|||
parent
|
||||
};
|
||||
|
||||
Ok((out, output_mapper(Box::new(inner.owned()))))
|
||||
Ok((out, inner.owned()))
|
||||
}
|
||||
|
||||
fn space(input: Span) -> IResult<Span, Token> {
|
||||
|
@ -370,6 +393,22 @@ impl Context {
|
|||
Ok((input, token))
|
||||
}
|
||||
|
||||
fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
|
||||
let (input, token) = alt((
|
||||
self.partial(Self::unicode_emoji),
|
||||
self.partial(Self::url_no_embed),
|
||||
self.partial(Self::tag_inline_code),
|
||||
self.partial(Self::tag_inline_math),
|
||||
self.partial(Self::tag_func),
|
||||
self.partial(Self::tag_mention),
|
||||
self.partial(Self::tag_hashtag),
|
||||
self.partial(Self::shortcode_emoji),
|
||||
self.partial(Self::raw_url),
|
||||
self.partial(Self::text),
|
||||
))(input)?;
|
||||
Ok((input, token))
|
||||
}
|
||||
|
||||
fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
|
||||
let (input, token) = alt((
|
||||
self.partial(Self::unicode_emoji),
|
||||
|
@ -416,12 +455,7 @@ impl Context {
|
|||
return fail(input);
|
||||
}
|
||||
|
||||
let (_, inner) = spliced(
|
||||
"e_lines,
|
||||
self.partial(Self::full),
|
||||
Token::Quote,
|
||||
orig_input,
|
||||
)?;
|
||||
let (_, inner) = spliced("e_lines, self.partial(Self::full), orig_input)?;
|
||||
|
||||
let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?;
|
||||
|
||||
|
@ -450,7 +484,10 @@ impl Context {
|
|||
let (input, _) = not(not(line_ending))(input)?;
|
||||
let (input, _) = opt(line_ending)(input)?;
|
||||
|
||||
Ok((input, boxing_sequence(Token::Center)(center_seq)))
|
||||
Ok((
|
||||
input,
|
||||
boxing_token(Token::Center)(Token::Sequence(center_seq)),
|
||||
))
|
||||
}
|
||||
|
||||
fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
|
||||
|
@ -522,13 +559,16 @@ impl Context {
|
|||
))
|
||||
}
|
||||
|
||||
const fn tag_delimited<'a, 'b: 'a, T>(
|
||||
#[inline]
|
||||
fn tag_delimited<'a, 'b: 'a, T>(
|
||||
&'a self,
|
||||
start: &'b str,
|
||||
end: &'b str,
|
||||
escape: bool,
|
||||
matcher_inner: impl Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a,
|
||||
mapper: impl Fn(Vec<T>) -> Token<'b> + 'a,
|
||||
matcher_inner_fallback: impl Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a,
|
||||
collector: impl Fn(Vec<T>) -> Token<'b> + 'a,
|
||||
mapper: impl Fn(Token<'b>) -> Token<'b> + 'a,
|
||||
) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token<'b>> + '_ {
|
||||
move |input| {
|
||||
let opening_tag = &tag(start);
|
||||
|
@ -548,18 +588,40 @@ impl Context {
|
|||
closing_tag,
|
||||
))(post_open);
|
||||
|
||||
if let Err(nom::Err::Error(nom::error::Error { .. })) = res {
|
||||
if let Err(nom::Err::Error(nom::error::Error {
|
||||
input: input_past_err,
|
||||
..
|
||||
})) = res
|
||||
{
|
||||
let res_fallback = tuple((
|
||||
many1(tuple((not(closing_tag), &matcher_inner_fallback))),
|
||||
closing_tag,
|
||||
))(post_open);
|
||||
|
||||
if res_fallback.is_err() {
|
||||
return Ok((
|
||||
input_past_err,
|
||||
Token::PlainText(begin.fragment_between(&input_past_err).into()),
|
||||
));
|
||||
}
|
||||
|
||||
let (input, (inner, closing)) = res_fallback.unwrap();
|
||||
let inner = inner.into_iter().map(|(_, t)| t).collect::<Vec<_>>();
|
||||
|
||||
return Ok((
|
||||
post_open,
|
||||
Token::PlainText(begin.fragment_between(&post_open).into()),
|
||||
input,
|
||||
Token::Sequence(vec![
|
||||
Token::PlainText(begin.fragment_between(&post_open).into()),
|
||||
collector(inner),
|
||||
Token::PlainText(closing.into_fragment().into()),
|
||||
]),
|
||||
));
|
||||
}
|
||||
|
||||
let (input, (inner, _)) = res?;
|
||||
|
||||
let inner = inner.into_iter().map(|(_, t)| t).collect::<Vec<_>>();
|
||||
|
||||
Ok((input, mapper(inner)))
|
||||
Ok((input, mapper(collector(inner))))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -632,8 +694,10 @@ impl Context {
|
|||
"<small>",
|
||||
"</small>",
|
||||
false,
|
||||
self.partial(Self::inline),
|
||||
boxing_sequence(Token::Small),
|
||||
self.partial(Self::inline_single),
|
||||
self.partial(Self::inline_non_formatting_single),
|
||||
Token::Sequence,
|
||||
boxing_token(Token::Small),
|
||||
)(input)
|
||||
}
|
||||
|
||||
|
@ -643,8 +707,10 @@ impl Context {
|
|||
"***",
|
||||
"***",
|
||||
true,
|
||||
self.partial(Self::inline),
|
||||
boxing_sequence(Token::BoldItalic),
|
||||
self.partial(Self::inline_single),
|
||||
self.partial(Self::inline_non_formatting_single),
|
||||
Token::Sequence,
|
||||
boxing_token(Token::BoldItalic),
|
||||
)(input)
|
||||
}
|
||||
|
||||
|
@ -654,8 +720,10 @@ impl Context {
|
|||
"___",
|
||||
"___",
|
||||
true,
|
||||
self.partial(Self::inline),
|
||||
boxing_sequence(Token::BoldItalic),
|
||||
self.partial(Self::inline_single),
|
||||
self.partial(Self::inline_non_formatting_single),
|
||||
Token::Sequence,
|
||||
boxing_token(Token::BoldItalic),
|
||||
)(input)
|
||||
}
|
||||
|
||||
|
@ -664,8 +732,10 @@ impl Context {
|
|||
"<b>",
|
||||
"</b>",
|
||||
false,
|
||||
self.partial(Self::inline),
|
||||
boxing_sequence(Token::Bold),
|
||||
self.partial(Self::inline_single),
|
||||
self.partial(Self::inline_non_formatting_single),
|
||||
Token::Sequence,
|
||||
boxing_token(Token::Bold),
|
||||
)(input)
|
||||
}
|
||||
|
||||
|
@ -675,8 +745,10 @@ impl Context {
|
|||
"**",
|
||||
"**",
|
||||
true,
|
||||
self.partial(Self::inline),
|
||||
boxing_sequence(Token::Bold),
|
||||
self.partial(Self::inline_single),
|
||||
self.partial(Self::inline_non_formatting_single),
|
||||
Token::Sequence,
|
||||
boxing_token(Token::Bold),
|
||||
)(input)
|
||||
}
|
||||
|
||||
|
@ -686,8 +758,10 @@ impl Context {
|
|||
"__",
|
||||
"__",
|
||||
true,
|
||||
self.partial(Self::inline),
|
||||
boxing_sequence(Token::Bold),
|
||||
self.partial(Self::inline_single),
|
||||
self.partial(Self::inline_non_formatting_single),
|
||||
Token::Sequence,
|
||||
boxing_token(Token::Bold),
|
||||
)(input)
|
||||
}
|
||||
|
||||
|
@ -696,8 +770,10 @@ impl Context {
|
|||
"<i>",
|
||||
"</i>",
|
||||
false,
|
||||
self.partial(Self::inline),
|
||||
boxing_sequence(Token::Italic),
|
||||
self.partial(Self::inline_single),
|
||||
self.partial(Self::inline_non_formatting_single),
|
||||
Token::Sequence,
|
||||
boxing_token(Token::Italic),
|
||||
)(input)
|
||||
}
|
||||
|
||||
|
@ -707,8 +783,10 @@ impl Context {
|
|||
"*",
|
||||
"*",
|
||||
true,
|
||||
self.partial(Self::inline),
|
||||
boxing_sequence(Token::Italic),
|
||||
self.partial(Self::inline_single),
|
||||
self.partial(Self::inline_non_formatting_single),
|
||||
Token::Sequence,
|
||||
boxing_token(Token::Italic),
|
||||
)(input)
|
||||
}
|
||||
|
||||
|
@ -718,8 +796,10 @@ impl Context {
|
|||
"_",
|
||||
"_",
|
||||
true,
|
||||
self.partial(Self::inline),
|
||||
boxing_sequence(Token::Italic),
|
||||
self.partial(Self::inline_single),
|
||||
self.partial(Self::inline_non_formatting_single),
|
||||
Token::Sequence,
|
||||
boxing_token(Token::Italic),
|
||||
)(input)
|
||||
}
|
||||
|
||||
|
@ -728,8 +808,10 @@ impl Context {
|
|||
"<s>",
|
||||
"</s>",
|
||||
false,
|
||||
self.partial(Self::inline),
|
||||
boxing_sequence(Token::Strikethrough),
|
||||
self.partial(Self::inline_single),
|
||||
self.partial(Self::inline_non_formatting_single),
|
||||
Token::Sequence,
|
||||
boxing_token(Token::Strikethrough),
|
||||
)(input)
|
||||
}
|
||||
|
||||
|
@ -740,9 +822,18 @@ impl Context {
|
|||
"~~",
|
||||
true,
|
||||
move |input| {
|
||||
tuple((not_line_ending, self.partial(Self::inline)))(input).map(|(i, t)| (i, t.1))
|
||||
tuple((not_line_ending, self.partial(Self::inline_single)))(input)
|
||||
.map(|(i, t)| (i, t.1))
|
||||
},
|
||||
boxing_sequence(Token::Strikethrough),
|
||||
move |input| {
|
||||
tuple((
|
||||
not_line_ending,
|
||||
self.partial(Self::inline_non_formatting_single),
|
||||
))(input)
|
||||
.map(|(i, t)| (i, t.1))
|
||||
},
|
||||
Token::Sequence,
|
||||
boxing_token(Token::Strikethrough),
|
||||
)(input)
|
||||
}
|
||||
|
||||
|
@ -755,7 +846,9 @@ impl Context {
|
|||
tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar))(input)
|
||||
.map(|(i, (_skip, c))| (i, c))
|
||||
},
|
||||
fail,
|
||||
collect_char_sequence(Token::InlineCode),
|
||||
identity,
|
||||
)(input)
|
||||
}
|
||||
|
||||
|
@ -764,8 +857,10 @@ impl Context {
|
|||
"\\(",
|
||||
"\\)",
|
||||
false,
|
||||
move |input| tuple((not_line_ending, anychar))(input).map(|(i, (_skip, c))| (i, c)),
|
||||
collect_char_sequence(Token::InlineMath),
|
||||
move |input| tuple((not(line_ending), anychar))(input).map(|(i, (_skip, c))| (i, c)),
|
||||
fail,
|
||||
collect_char_sequence(Token::InlineCode),
|
||||
identity,
|
||||
)(input)
|
||||
}
|
||||
|
||||
|
@ -1044,7 +1139,61 @@ mod test {
|
|||
]))
|
||||
},
|
||||
Context.full(Span::new(emoji)).unwrap().1.merged()
|
||||
)
|
||||
);
|
||||
|
||||
let bold_italic = r#"***bold italic***"#;
|
||||
assert_eq!(
|
||||
Token::BoldItalic(Box::new(Token::PlainText("bold italic".into()))),
|
||||
Context.full(Span::new(bold_italic)).unwrap().1.merged()
|
||||
);
|
||||
|
||||
let bold_italic_tag = r#"<b><i>bold italic</i></b>"#;
|
||||
assert_eq!(
|
||||
Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText(
|
||||
"bold italic".into()
|
||||
))))),
|
||||
Context.full(Span::new(bold_italic_tag)).unwrap().1.merged()
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
Token::Sequence(vec![
|
||||
Token::PlainText("<b>bold ".into()),
|
||||
Token::Mention {
|
||||
mention_type: crate::MentionType::User,
|
||||
name: "tag1".into(),
|
||||
host: None
|
||||
},
|
||||
Token::PlainText(" <i> ".into()),
|
||||
Token::Mention {
|
||||
mention_type: crate::MentionType::User,
|
||||
name: "tag2".into(),
|
||||
host: None
|
||||
},
|
||||
Token::PlainText(" </b>italic</i>".into())
|
||||
]),
|
||||
Context
|
||||
.full(Span::new(r#"<b>bold @tag1 <i> @tag2 </b>italic</i>"#))
|
||||
.unwrap()
|
||||
.1
|
||||
.merged()
|
||||
);
|
||||
|
||||
let quote = r#"
|
||||
> test
|
||||
> <i>
|
||||
> italic
|
||||
> </i>
|
||||
>> Nested quote
|
||||
"#;
|
||||
|
||||
assert_eq!(
|
||||
Token::Quote(Box::new(Token::Sequence(vec![
|
||||
Token::PlainText("test\n".into()),
|
||||
Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))),
|
||||
Token::Quote(Box::new(Token::PlainText("Nested quote".into())))
|
||||
]))),
|
||||
Context.full(Span::new(quote)).unwrap().1.merged()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
Loading…
Reference in New Issue