From 9f62c72f29f768f6538355310bcd819fb5cb3082 Mon Sep 17 00:00:00 2001 From: Natty Date: Sun, 8 Dec 2024 00:04:48 +0100 Subject: [PATCH] Switch towards recursive ascent --- Cargo.lock | 22 +- magnetar_mmm_parser/Cargo.toml | 4 +- magnetar_mmm_parser/src/lib.rs | 1012 +---------------------- magnetar_mmm_parser/src/output_types.rs | 261 ++++++ magnetar_mmm_parser/src/parser.rs | 157 ++++ magnetar_mmm_parser/src/test.rs | 145 ++-- magnetar_mmm_parser/src/types.rs | 120 +++ magnetar_mmm_parser/src/xml_write.rs | 3 +- 8 files changed, 633 insertions(+), 1091 deletions(-) create mode 100644 magnetar_mmm_parser/src/output_types.rs create mode 100644 magnetar_mmm_parser/src/parser.rs create mode 100644 magnetar_mmm_parser/src/types.rs diff --git a/Cargo.lock b/Cargo.lock index 343d5ad..5402dc8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -435,12 +435,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "bytecount" -version = "0.6.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" - [[package]] name = "byteorder" version = "1.5.0" @@ -632,7 +626,6 @@ dependencies = [ "itoa", "rustversion", "ryu", - "serde", "static_assertions", ] @@ -2054,13 +2047,11 @@ dependencies = [ name = "magnetar_mmm_parser" version = "0.3.0-alpha" dependencies = [ - "compact_str", "either", "emojis", - "nom", - "nom_locate", "quick-xml", "serde", + "smallvec", "strum", "tracing", "unicode-segmentation", @@ -2325,17 +2316,6 @@ dependencies = [ "minimal-lexical", ] -[[package]] -name = "nom_locate" -version = "4.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e3c83c053b0713da60c5b8de47fe8e494fe3ece5267b2f23090a07a053ba8f3" -dependencies = [ - "bytecount", - "memchr", - "nom", -] - [[package]] name = "nu-ansi-term" version = "0.46.0" diff --git a/magnetar_mmm_parser/Cargo.toml b/magnetar_mmm_parser/Cargo.toml index 07fab72..f78823d 100644 --- a/magnetar_mmm_parser/Cargo.toml +++ b/magnetar_mmm_parser/Cargo.toml @@ -11,10 +11,8 @@ xml = ["dep:quick-xml"] [dependencies] either = { workspace = true } emojis = { workspace = true } -nom = { workspace = true } -nom_locate = { workspace = true } -compact_str = { workspace = true, features = ["serde"] } serde = { workspace = true, features = ["derive"] } +smallvec = { workspace = true } strum = { workspace = true, features = ["derive"] } tracing = { workspace = true } unicode-segmentation = { workspace = true } diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 5face12..5bfb907 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -1,577 +1,61 @@ +mod parser; mod test; mod xml_write; +mod types; +mod output_types; use std::collections::HashMap; -use std::convert::{identity, Infallible}; -use std::marker::PhantomData; - -use compact_str::{CompactString, ToCompactString}; -use either::Either; -use nom::branch::alt; -use nom::bytes::complete::{tag, tag_no_case}; -use nom::character::complete::{ - alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of, - satisfy, space1, tab, -}; -use nom::combinator::{eof, fail, map, not, opt, peek, recognize}; -use nom::error::ErrorKind; -use nom::multi::{many0_count, many1, many1_count, many_till, separated_list1}; -use nom::sequence::tuple; -use nom::{IResult, Offset, Parser, Slice}; -use nom_locate::LocatedSpan; -use serde::{Deserialize, Serialize}; -use strum::IntoStaticStr; use tracing::trace; use 
unicode_segmentation::UnicodeSegmentation; -#[derive(Copy, Clone, Debug, Eq, PartialEq, Deserialize, Serialize, IntoStaticStr)] -// The alternative would be to implement a serde serializer for this one enum, but that's disgusting -#[strum(serialize_all = "snake_case")] -#[serde(rename_all = "snake_case")] -pub enum MentionType { - Community, - User, - MatrixUser, -} +use crate::output_types::Token; -impl MentionType { - pub fn to_char(&self) -> char { - match self { - MentionType::Community => '!', - MentionType::User => '@', - MentionType::MatrixUser => ':', - } - } - - pub fn separator(&self) -> char { - match self { - MentionType::Community | MentionType::User => '@', - MentionType::MatrixUser => ':', - } - } -} - -#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] -pub enum Token { - PlainText(CompactString), - Sequence(Vec), - Quote(Box), - Small(Box), - Bold(Box), - Italic(Box), - Center(Box), - Strikethrough(Box), - PlainTag(String), - InlineCode(String), - InlineMath(String), - UrlRaw(String), - UrlNoEmbed(String), - Link { - label: Box, - href: String, - embed: bool, - }, - BlockCode { - lang: Option, - inner: String, - }, - BlockMath(String), - Function { - name: String, - params: HashMap>, - inner: Box, - }, - Mention { - name: String, - host: Option, - mention_type: MentionType, - }, - UnicodeEmoji(String), - ShortcodeEmoji { - shortcode: String, - host: Option, - }, - Hashtag(String), -} - -impl Token { - fn str_content_left(&self) -> Option<&str> { - match self { - Token::PlainText(text) => Some(text.as_ref()), - Token::Sequence(tokens) => tokens.first().and_then(Token::str_content_left), - Token::Quote(inner) => inner.str_content_left(), - Token::Small(inner) => inner.str_content_left(), - Token::Bold(inner) => inner.str_content_left(), - Token::Italic(inner) => inner.str_content_left(), - Token::Center(inner) => inner.str_content_left(), - Token::Strikethrough(inner) => inner.str_content_left(), - Token::PlainTag(tag) => Some(tag.as_ref()), - Token::UrlRaw(url) => Some(url.as_ref()), - Token::UrlNoEmbed(url) => Some(url.as_ref()), - Token::Link { label, .. } => label.str_content_left(), - Token::Function { inner, .. } => inner.str_content_left(), - Token::Mention { name, .. } => Some(name.as_ref()), - Token::UnicodeEmoji(code) => Some(code.as_ref()), - Token::Hashtag(tag) => Some(tag.as_ref()), - _ => None, - } - } - - fn str_content_right(&self) -> Option<&str> { - match self { - Token::PlainText(text) => Some(text.as_ref()), - Token::Sequence(tokens) => tokens.last().and_then(Token::str_content_right), - Token::Quote(inner) => inner.str_content_right(), - Token::Small(inner) => inner.str_content_right(), - Token::Bold(inner) => inner.str_content_right(), - Token::Italic(inner) => inner.str_content_right(), - Token::Center(inner) => inner.str_content_right(), - Token::Strikethrough(inner) => inner.str_content_right(), - Token::PlainTag(tag) => Some(tag.as_ref()), - Token::UrlRaw(url) => Some(url.as_ref()), - Token::UrlNoEmbed(url) => Some(url.as_ref()), - Token::Link { label, .. } => label.str_content_right(), - Token::Function { inner, .. } => inner.str_content_right(), - Token::Mention { name, .. 
} => Some(name.as_ref()), - Token::UnicodeEmoji(code) => Some(code.as_ref()), - Token::Hashtag(tag) => Some(tag.as_ref()), - _ => None, - } - } - - fn inner(&self) -> Token { - match self { - plain @ Token::PlainText(_) => plain.clone(), - sequence @ Token::Sequence(_) => sequence.clone(), - Token::Quote(inner) => inner.inner(), - Token::Small(inner) => inner.inner(), - Token::Bold(inner) => inner.inner(), - Token::Italic(inner) => inner.inner(), - Token::Center(inner) => inner.inner(), - Token::Strikethrough(inner) => inner.inner(), - Token::PlainTag(text) => Token::PlainText(text.clone().into()), - Token::InlineCode(code) => Token::PlainText(code.clone().into()), - Token::InlineMath(math) => Token::PlainText(math.clone().into()), - Token::UrlRaw(url) => Token::PlainText(url.clone().into()), - Token::UrlNoEmbed(url) => Token::PlainText(url.clone().into()), - Token::Link { label, .. } => label.inner(), - Token::BlockCode { inner, .. } => Token::PlainText(inner.clone().into()), - Token::BlockMath(math) => Token::PlainText(math.clone().into()), - Token::Function { inner, .. } => inner.inner(), - Token::Mention { name, .. } => Token::PlainText(name.clone().into()), - Token::UnicodeEmoji(code) => Token::PlainText(code.clone().into()), - Token::ShortcodeEmoji { shortcode, .. } => Token::PlainText(shortcode.clone().into()), - Token::Hashtag(tag) => Token::PlainText(tag.clone().into()), - } - } - - fn merged(&self) -> Token { - match self { - Token::Sequence(tokens) => { - let tokens_multi = tokens.iter().fold(Vec::new(), |mut acc, tok| { - if let Some(Token::PlainText(last)) = acc.last_mut() { - if let Token::PlainText(tok_text) = tok { - *last += tok_text.as_ref(); - - return acc; - } - } - - if let Token::Sequence(seq) = tok { - let items = seq.iter().map(Token::merged).flat_map(|t| match t { - Token::Sequence(seq) => Either::Left(seq.into_iter()), - other => Either::Right(std::iter::once(other)), - }); - - for item in items { - if let Some(Token::PlainText(last)) = acc.last_mut() { - if let Token::PlainText(tok_text) = item { - *last += tok_text.as_ref(); - - continue; - } - } - - acc.push(item); - } - - return acc; - } - - acc.push(tok.merged()); - acc - }); - - if tokens_multi.len() == 1 { - return tokens_multi.into_iter().next().unwrap(); - } - - Token::Sequence(tokens_multi) - } - Token::Quote(inner) => Token::Quote(Box::new(inner.merged())), - Token::Small(inner) => Token::Small(Box::new(inner.merged())), - Token::Bold(inner) => Token::Bold(Box::new(inner.merged())), - Token::Italic(inner) => Token::Italic(Box::new(inner.merged())), - Token::Center(inner) => Token::Center(Box::new(inner.merged())), - Token::Strikethrough(inner) => Token::Strikethrough(Box::new(inner.merged())), - Token::Link { embed, label, href } => Token::Link { - label: Box::new(label.merged()), - href: href.clone(), - embed: *embed, - }, - Token::Function { - name, - params, - inner, - } => Token::Function { - name: name.clone(), - params: params.clone(), - inner: Box::new(inner.merged()), - }, - other => other.clone(), - } - } - - pub fn walk_map_collect(&self, func: &impl Fn(&Token) -> Option, out: &mut Vec) { - if let Some(v) = func(self) { - out.push(v) - } - - match self { - Token::Sequence(items) => { - items.iter().for_each(|tok| tok.walk_map_collect(func, out)); - } - Token::Quote(inner) - | Token::Small(inner) - | Token::Bold(inner) - | Token::Italic(inner) - | Token::Center(inner) - | Token::Function { inner, .. } - | Token::Link { label: inner, .. 
} - | Token::Strikethrough(inner) => inner.walk_map_collect(func, out), - _ => {} - } - } - - pub fn walk_speech_transform(&mut self, func: &impl Fn(&mut CompactString)) { - match self { - Token::Sequence(items) => { - items - .iter_mut() - .for_each(|tok| tok.walk_speech_transform(func)); - } - Token::Small(inner) - | Token::Bold(inner) - | Token::Italic(inner) - | Token::Center(inner) - | Token::Function { inner, .. } - | Token::Strikethrough(inner) => inner.walk_speech_transform(func), - Token::PlainText(text) => func(text), - _ => {} - } - } -} - -pub fn janky_is_line_begin(input: Span<'_>) -> bool { - let offset = input.location_offset(); - - // VERY BAD - // Safety: This is very janky, but hopefully will work as long as nom-locate keeps the invariant of fragments being subslices of the input - // We do this to avoid scanning the entire input for a line separator when we just need the previous byte - offset == 0 - || unsafe { - let frag_bytes = input.fragment().as_bytes(); - let frag_ptr = frag_bytes.as_ptr(); - let prev_byte = frag_ptr.offset(-1); - matches!(*prev_byte, b'\n') - } -} - -#[derive(Debug, Default, Copy, Clone)] -pub struct SpanMeta { - depth: usize, -} - -impl SpanMeta { - fn new(depth: usize) -> Self { - Self { depth } - } -} - -type Span<'a> = LocatedSpan<&'a str, SpanMeta>; - -trait SliceOffset { - fn up_to(&self, other: &Self) -> Self; - - fn fragment_between<'a>(&self, other: &Self) -> &'a str - where - Self: 'a; -} - -impl SliceOffset for Span<'_> { - fn up_to(&self, other: &Self) -> Self { - self.slice(..self.offset(other)) - } - - fn fragment_between<'a>(&self, other: &Self) -> &'a str - where - Self: 'a, - { - self.up_to(other).into_fragment() - } -} - -#[inline] -fn boxing_token(func: impl Fn(Box) -> Token) -> impl Fn(Token) -> Token { - move |tokens| func(Box::new(tokens)) -} - -#[inline] -fn collect_sequence( - func: impl Fn(Vec) -> Token, - transform: impl Fn(Token) -> Token, -) -> impl Fn(&mut dyn Iterator) -> Token { - move |tokens| transform(func(tokens.collect())) -} - -#[inline] -fn collect_char_sequence( - func: impl Fn(String) -> Token, -) -> impl Fn(&mut dyn Iterator) -> Token { - move |chars| func(chars.collect()) -} - -#[inline] -fn space1_unicode(input: Span) -> IResult { - recognize(many1_count(tuple(( - not(line_ending), - satisfy(char::is_whitespace), - ))))(input) -} - -#[inline] -fn alphanumeric1_unicode(input: Span) -> IResult { - recognize(many1_count(char_alphanumeric_unicode))(input) -} - -#[inline] -fn char_alphanumeric_unicode(input: Span) -> IResult { - satisfy(char::is_alphanumeric)(input) -} - -fn spliced<'a>( - segments: &[Span<'a>], - func: impl Fn(Span) -> IResult, - parent: Span<'a>, -) -> IResult, Token, nom::error::Error>> { - let combined = segments - .iter() - .copied() - .map(Span::into_fragment) - .collect::>() - .join("\n"); - let cum_offset_combined = segments - .iter() - .scan(0, |acc, &x| { - *acc += x.len(); - Some(*acc) - }) - .collect::>(); - let current_seg = |input: Span| { - cum_offset_combined - .iter() - .enumerate() - .take_while(|(_, &o)| o > input.location_offset()) - .map(|(i, o)| (segments[i], o)) - .last() - }; - - type NE = nom::Err; - type NomError<'x> = nom::error::Error>; - - let spliced_span = Span::new_extra( - &combined, - segments.first().map_or(SpanMeta::new(0), |s| s.extra), - ); - let (input, inner) = match func(spliced_span) { - Ok(s) => s, +pub fn parse_full(input: &str) -> Token { + match self.full(Span::new_extra(input, SpanMeta::default())) { + Ok((_, t)) => t.merged(), Err(e) => { - 
return match e { - NE::Error(e) => { - let offset_new = e.input.location_offset(); - if let Some((seg_parent, offset_seg_new)) = current_seg(e.input) { - let offset = offset_new - offset_seg_new; - let offset_orig = offset + seg_parent.location_offset(); - Err(NE::Error(NomError::new( - Span::new_extra( - &parent.into_fragment()[offset_orig..], - seg_parent.extra, - ), - e.code, - ))) - } else { - // ??? - Err(NE::Failure(NomError::new(parent, ErrorKind::Fail))) - } - } - NE::Failure(e) => Err(NE::Error(NomError::new(parent, e.code))), - NE::Incomplete(i) => Err(NE::Incomplete(i)), - }; - } - }; - - let out = if let Some((seg_parent, offset_seg_new)) = current_seg(input) { - let offset = input.location_offset() - offset_seg_new; - let offset_orig = offset + seg_parent.location_offset(); - parent.slice(offset_orig..) - } else { - parent - }; - - Ok((out, inner)) -} - -fn space(input: Span) -> IResult { - let (input, frag) = recognize(alt((one_char('\u{0020}'), one_char('\u{3000}'), tab)))(input)?; - Ok((input, Token::PlainText(frag.into_fragment().into()))) -} - -#[derive(Copy, Clone)] -struct Matcher<'a, 'b, T: Clone> { - matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult, T> + 'a), - collector: &'a (dyn Fn(&mut dyn Iterator) -> Token + 'a), - _phantom_closure: PhantomData<&'a ()>, - _phantom_data: PhantomData<&'b ()>, - _phantom_output: PhantomData T>, -} - -impl<'a, 'b, T: Clone> Matcher<'a, 'b, T> { - fn new( - matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult, T> + 'a), - collector: &'a (dyn Fn(&mut dyn Iterator) -> Token + 'a), - ) -> Self { - Self { - matcher_inner, - collector, - _phantom_closure: PhantomData, - _phantom_data: PhantomData, - _phantom_output: PhantomData, + trace!(input = input, "Full parser fail: {:?}", e); + Token::PlainText(e.to_compact_string()) } } } -impl<'a, 'b> Matcher<'a, 'b, Infallible> { - // Don't break this invariant, else a monster will come at night and eat all your socks - fn reject() -> Self { - Self { - matcher_inner: &fail::<_, Infallible, _>, - collector: &|_| unreachable!(), - _phantom_closure: PhantomData, - _phantom_data: PhantomData, - _phantom_output: PhantomData, +pub fn parse_inline(input: &str) -> Token { + match self.full(Span::new_extra(input, SpanMeta::default())) { + Ok((_, t)) => t.merged(), + Err(e) => { + trace!(input = input, "Inline parser fail: {:?}", e); + Token::PlainText(e.to_compact_string()) } } } -#[derive(Copy, Clone, Debug)] -enum FlankingRule { - Lenient, - Strict, - DontCare, -} - -struct FlankingDelim<'a, T: Fn(Span<'a>) -> IResult, Span<'a>>>( - T, - FlankingRule, - PhantomData<&'a ()>, -); - -impl<'a, T: Fn(Span<'a>) -> IResult, Span<'a>>> From<(T, FlankingRule)> - for FlankingDelim<'a, T> -{ - fn from((func, rule): (T, FlankingRule)) -> Self { - FlankingDelim(func, rule, PhantomData) +pub fn parse_ui(input: &str) -> Token { + match self.inline_ui(Span::new_extra(input, SpanMeta::default())) { + Ok((_, t)) => t.merged(), + Err(e) => { + trace!(input = input, "Inline parser fail: {:?}", e); + Token::PlainText(e.to_compact_string()) + } } } -impl<'a, T: Fn(Span<'a>) -> IResult, Span<'a>>> From for FlankingDelim<'a, T> { - fn from(func: T) -> Self { - FlankingDelim(func, FlankingRule::DontCare, PhantomData) +pub fn parse_profile_fields(input: &str) -> Token { + match self.inline_profile_fields(Span::new_extra(input, SpanMeta::default())) { + Ok((_, t)) => t.merged(), + Err(e) => { + trace!(input = input, "Profile field parser fail: {:?}", e); + Token::PlainText(e.to_compact_string()) + } } } -pub struct Context { - 
depth_limit: usize, -} - -const DEFAULT_DEPTH_LIMIT: usize = 24; - -impl Default for Context { - fn default() -> Self { - Context::new(DEFAULT_DEPTH_LIMIT) - } -} impl Context { pub fn new(depth_limit: usize) -> Self { Self { depth_limit } } - pub fn parse_full(&self, input: &str) -> Token { - match self.full(Span::new_extra(input, SpanMeta::default())) { - Ok((_, t)) => t.merged(), - Err(e) => { - trace!(input = input, "Full parser fail: {:?}", e); - Token::PlainText(e.to_compact_string()) - } - } - } - - pub fn parse_inline(&self, input: &str) -> Token { - match self.full(Span::new_extra(input, SpanMeta::default())) { - Ok((_, t)) => t.merged(), - Err(e) => { - trace!(input = input, "Inline parser fail: {:?}", e); - Token::PlainText(e.to_compact_string()) - } - } - } - - pub fn parse_ui(&self, input: &str) -> Token { - match self.inline_ui(Span::new_extra(input, SpanMeta::default())) { - Ok((_, t)) => t.merged(), - Err(e) => { - trace!(input = input, "Inline parser fail: {:?}", e); - Token::PlainText(e.to_compact_string()) - } - } - } - - pub fn parse_profile_fields(&self, input: &str) -> Token { - match self.inline_profile_fields(Span::new_extra(input, SpanMeta::default())) { - Ok((_, t)) => t.merged(), - Err(e) => { - trace!(input = input, "Profile field parser fail: {:?}", e); - Token::PlainText(e.to_compact_string()) - } - } - } - - #[inline] - fn partial( - &self, - func: impl for<'a> Fn(&Self, Span<'a>) -> IResult, Token> + 'static, - ) -> impl for<'a> Fn(Span<'a>) -> IResult, Token> + '_ { - move |input| func(self, input) - } - - #[inline] - fn partial_span( - &self, - func: impl for<'a> Fn(&Self, Span<'a>) -> IResult, Span<'a>> + 'static, - ) -> impl for<'a> Fn(Span<'a>) -> IResult, Span<'a>> + '_ { - move |input| func(self, input) - } pub fn full<'a>(&self, input: Span<'a>) -> IResult, Token> { map( @@ -606,7 +90,7 @@ impl Context { )), eof, ) - .map(|v| v.0), + .map(|v| v.0), Token::Sequence, )(input) } @@ -621,7 +105,7 @@ impl Context { )), eof, ) - .map(|v| v.0), + .map(|v| v.0), Token::Sequence, )(input) } @@ -727,66 +211,6 @@ impl Context { Ok((input, token)) } - fn tag_quote<'a>(&self, input: Span<'a>) -> IResult, Token> { - let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?; - - if let (None, None) = leading_spaces { - if !janky_is_line_begin(input) { - return fail(input); - } - } - - let quote_line = |input| tuple((tag(">"), opt(space), not_line_ending))(input); - - let orig_input = input; - let (input, lines) = separated_list1(line_ending, quote_line)(input)?; - - let quote_lines = lines - .into_iter() - .map(|(_, _, text)| text) - .collect::>(); - - if quote_lines.len() == 1 - && quote_lines - .iter() - .map(Span::fragment) - .copied() - .any(&str::is_empty) - { - return fail(input); - } - - let (_, inner) = spliced("e_lines, self.partial(Self::full), orig_input)?; - - let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?; - - Ok((input, Token::Quote(Box::new(inner)))) - } - - fn tag_block_center<'a>(&self, input: Span<'a>) -> IResult, Token> { - let tag_start = &tag("
"); - let tag_end = &tag("
"); - - let (input, _) = opt(line_ending)(input)?; - - if !janky_is_line_begin(input) { - return fail(input); - } - - let (input, _) = tag_start(input)?; - let (input, _) = opt(line_ending)(input)?; - - let (input, (center_seq, _)) = many_till( - self.partial(Self::inline_single), - tuple((opt(space1), opt(line_ending), tag_end)), - )(input)?; - - Ok(( - input, - boxing_token(Token::Center)(Token::Sequence(center_seq)), - )) - } - fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult, Token> { let delim = &tag("```"); @@ -859,124 +283,6 @@ impl Context { )) } - #[inline] - fn tag_delimited<'a, 'b: 'a, T: Clone, S: Clone, FOpen, FClose>( - &'a self, - opening_tag: impl Into> + 'a, - closing_tag: impl Into> + 'a, - escape: bool, - matcher: Matcher<'a, 'b, T>, - fallback: Matcher<'a, 'b, S>, - ) -> impl Fn(Span<'b>) -> IResult, Token> + 'a - where - FOpen: Fn(Span<'b>) -> IResult, Span<'b>> + 'a, - FClose: Fn(Span<'b>) -> IResult, Span<'b>> + 'a, - { - let FlankingDelim(opening_tag, opening_rule, ..) = opening_tag.into(); - let FlankingDelim(closing_tag, closing_rule, ..) = closing_tag.into(); - - move |input| { - if let FlankingRule::Strict = opening_rule { - let (input, pre) = opt(recognize(tuple(( - char_alphanumeric_unicode, - opt(tag("\\")), - &opening_tag, - peek(not(alt((recognize(satisfy(|c| c.is_whitespace())), eof)))), - ))))(input)?; - - if let Some(pre_text) = pre { - return Ok((input, Token::PlainText(pre_text.into_fragment().into()))); - } - } - - if escape { - if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), &opening_tag))(input) { - return Ok(( - input_escaped, - Token::PlainText(mark.fragment().to_string().into()), - )); - } - } - - let begin = input; - let (post_open, _) = opening_tag(input)?; - - let res = tuple(( - many1(tuple((not(&closing_tag), &matcher.matcher_inner))), - &closing_tag, - ))(post_open); - - if let Err(nom::Err::Error(nom::error::Error { .. 
})) = res { - let res_fallback = tuple(( - many1(tuple((not(&closing_tag), &fallback.matcher_inner))), - &closing_tag, - ))(post_open); - - if res_fallback.is_err() { - return Ok(( - post_open, - Token::PlainText(begin.fragment_between(&post_open).into()), - )); - } - - let (input, (inner, closing)) = res_fallback.unwrap(); - let mut inner = inner.into_iter().map(|(_, t)| t); - - return Ok(( - input, - Token::Sequence(vec![ - Token::PlainText(begin.fragment_between(&post_open).into()), - (fallback.collector)(&mut inner), - Token::PlainText(closing.into_fragment().into()), - ]), - )); - } - - let (input, (inner, closing)) = res?; - let mut inner = inner.into_iter().map(|(_, t)| t); - - let inner_tok = (matcher.collector)(&mut inner); - - let correct_left_flanking = - if let FlankingRule::Lenient | FlankingRule::Strict = opening_rule { - let text_left = inner_tok.str_content_left(); - - !(text_left.is_some_and(|s| s.starts_with(char::is_whitespace)) - || text_left.is_none()) - } else { - true - }; - - let correct_right_flanking = - if let FlankingRule::Lenient | FlankingRule::Strict = closing_rule { - let text_right = inner_tok.str_content_right(); - !(text_right.is_some_and(|s| s.ends_with(char::is_whitespace)) - || text_right.is_none()) - } else { - true - }; - - let (input, alphanum) = opt(peek(alphanumeric1_unicode))(input)?; - let correct_right_outer = - alphanum.is_none() || !matches!(closing_rule, FlankingRule::Strict); - - let correct_flanking = - correct_left_flanking && correct_right_flanking && correct_right_outer; - - if !correct_flanking { - return Ok(( - input, - Token::Sequence(vec![ - Token::PlainText(begin.fragment_between(&post_open).into()), - inner_tok.inner(), - Token::PlainText(closing.into_fragment().into()), - ]), - )); - } - Ok((input, Token::Sequence(vec![inner_tok]))) - } - } - fn tag_func<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, _) = tag("$[")(input)?; @@ -1027,215 +333,6 @@ impl Context { )) } - fn tag_plain<'a>(&self, input: Span<'a>) -> IResult, Token> { - let opening_tag = &tag(""); - let closing_tag = &tag(""); - - let (input, _) = opening_tag(input)?; - let (input, text) = map( - recognize(many1(tuple((not(line_ending), not(closing_tag), anychar)))), - Span::into_fragment, - )(input)?; - let (input, _) = closing_tag(input)?; - - Ok((input, Token::PlainTag(text.into()))) - } - - fn tag_small<'a>(&self, input: Span<'a>) -> IResult, Token> { - self.tag_delimited( - tag_no_case(""), - tag_no_case(""), - false, - Matcher::new( - &self.partial(Self::inline_single), - &collect_sequence(Token::Sequence, boxing_token(Token::Small)), - ), - Matcher::new( - &self.partial(Self::inline_non_formatting_single), - &collect_sequence(Token::Sequence, identity), - ), - )(input) - } - - fn tag_bold<'a>(&self, input: Span<'a>) -> IResult, Token> { - self.tag_delimited( - tag_no_case(""), - tag_no_case(""), - false, - Matcher::new( - &self.partial(Self::inline_single), - &collect_sequence(Token::Sequence, boxing_token(Token::Bold)), - ), - Matcher::new( - &self.partial(Self::inline_non_formatting_single), - &collect_sequence(Token::Sequence, identity), - ), - )(input) - } - - fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token> { - self.tag_delimited( - (tag("**"), FlankingRule::Lenient), - (tag("**"), FlankingRule::Lenient), - true, - Matcher::new( - &self.partial(Self::inline_single), - &collect_sequence(Token::Sequence, boxing_token(Token::Bold)), - ), - Matcher::new( - &self.partial(Self::inline_non_formatting_single), - 
&collect_sequence(Token::Sequence, identity), - ), - )(input) - } - - fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult, Token> { - self.tag_delimited( - (tag("__"), FlankingRule::Strict), - (tag("__"), FlankingRule::Strict), - true, - Matcher::new( - &self.partial(Self::inline_single), - &collect_sequence(Token::Sequence, boxing_token(Token::Bold)), - ), - Matcher::new( - &self.partial(Self::inline_non_formatting_single), - &collect_sequence(Token::Sequence, identity), - ), - )(input) - } - - fn tag_italic<'a>(&self, input: Span<'a>) -> IResult, Token> { - self.tag_delimited( - tag_no_case(""), - tag_no_case(""), - false, - Matcher::new( - &self.partial(Self::inline_single), - &collect_sequence(Token::Sequence, boxing_token(Token::Italic)), - ), - Matcher::new( - &self.partial(Self::inline_non_formatting_single), - &collect_sequence(Token::Sequence, identity), - ), - )(input) - } - - fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult, Token> { - self.tag_delimited( - (tag("*"), FlankingRule::Lenient), - (tag("*"), FlankingRule::Lenient), - true, - Matcher::new( - &self.partial(Self::inline_single), - &collect_sequence(Token::Sequence, boxing_token(Token::Italic)), - ), - Matcher::new( - &self.partial(Self::inline_non_formatting_single), - &collect_sequence(Token::Sequence, identity), - ), - )(input) - } - - fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult, Token> { - self.tag_delimited( - (tag("_"), FlankingRule::Strict), - (tag("_"), FlankingRule::Strict), - true, - Matcher::new( - &self.partial(Self::inline_single), - &collect_sequence(Token::Sequence, boxing_token(Token::Italic)), - ), - Matcher::new( - &self.partial(Self::inline_non_formatting_single), - &collect_sequence(Token::Sequence, identity), - ), - )(input) - } - - fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult, Token> { - self.tag_delimited( - tag_no_case(""), - tag_no_case(""), - false, - Matcher::new( - &self.partial(Self::inline_single), - &collect_sequence(Token::Sequence, boxing_token(Token::Strikethrough)), - ), - Matcher::new( - &self.partial(Self::inline_non_formatting_single), - &collect_sequence(Token::Sequence, identity), - ), - )(input) - } - - fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult, Token> { - self.tag_delimited( - (tag("~~"), FlankingRule::Lenient), - (tag("~~"), FlankingRule::Lenient), - true, - Matcher::new( - &move |input| { - map( - tuple((not(line_ending), self.partial(Self::inline_single))), - |(_, captured)| captured, - )(input) - }, - &collect_sequence(Token::Sequence, boxing_token(Token::Strikethrough)), - ), - Matcher::new( - &move |input| { - map( - tuple(( - not(line_ending), - self.partial(Self::inline_non_formatting_single), - )), - |(_, captured)| captured, - )(input) - }, - &collect_sequence(Token::Sequence, identity), - ), - )(input) - } - - fn tag_inline_code<'a>(&self, input: Span<'a>) -> IResult, Token> { - self.tag_delimited( - tag("`"), - |input| alt((tag("`"), tag("ยด")))(input), - true, - Matcher::new( - &move |input| { - map( - tuple((not(alt((tag("`"), tag("ยด"), line_ending))), anychar)), - |(_, captured)| captured, - )(input) - }, - &collect_char_sequence(Token::InlineCode), - ), - Matcher::reject(), - )(input) - } - - fn tag_inline_math<'a>(&self, input: Span<'a>) -> IResult, Token> { - self.tag_delimited( - tag("\\("), - tag("\\)"), - false, - Matcher::new( - &move |input| { - map(tuple((not(line_ending), anychar)), |(_, captured)| captured)(input) - }, - &collect_char_sequence(Token::InlineMath), - ), - 
Matcher::reject(), - )(input) - } - - fn tag_raw_text<'a>(&self, input: Span<'a>) -> IResult, Token> { - let (input, text) = anychar(input)?; - Ok((input, Token::PlainText(text.to_compact_string()))) - } - fn raw_url<'a>(&self, input: Span<'a>) -> IResult, Token> { let (input, url_span) = recognize(tuple(( self.partial_span(Self::protocol), @@ -1438,27 +535,6 @@ impl Context { Ok((input, Token::Hashtag(hashtag_text.into()))) } - #[inline] - fn increase_nesting<'a, 'b, O, F>( - &'b self, - mut func: F, - ) -> impl FnMut(Span<'a>) -> IResult, O> + 'b - where - F: Parser, O, nom::error::Error>> + 'b, - { - move |mut input| { - if input.extra.depth >= self.depth_limit { - return fail(input); - } - - input.extra.depth += 1; - func.parse(input).map(|mut v| { - v.0.extra.depth -= 1; - v - }) - } - } - #[inline] fn hashtag_chars<'a>(&self, input: Span<'a>) -> IResult, Span<'a>> { recognize(alt(( @@ -1496,32 +572,6 @@ impl Context { alt((tag("https://"), tag("http://")))(input) } - #[inline] - fn url_chars_base<'a>(&self, input: Span<'a>) -> IResult, Span<'a>> { - alt(( - recognize(tuple(( - tag("["), - many_till( - self.increase_nesting(self.partial_span(Self::url_chars_base)), - tag("]"), - ), - ))), - recognize(tuple(( - tag("("), - many_till( - self.increase_nesting(self.partial_span(Self::url_chars_base)), - tag(")"), - ), - ))), - recognize(tuple(( - not(satisfy(char::is_control)), - not(satisfy(char::is_whitespace)), - not(one_of(")]>")), - anychar, - ))), - ))(input) - } - #[inline] fn url_chars<'a, 'b, F>( &'b self, diff --git a/magnetar_mmm_parser/src/output_types.rs b/magnetar_mmm_parser/src/output_types.rs new file mode 100644 index 0000000..e07726d --- /dev/null +++ b/magnetar_mmm_parser/src/output_types.rs @@ -0,0 +1,261 @@ +use either::Either; +use serde::{Deserialize, Serialize}; +use std::borrow::Cow; +use std::collections::HashMap; +use strum::IntoStaticStr; + +#[derive(Debug, Clone, Deserialize, Serialize, Eq, PartialEq)] +pub enum Token<'a> { + PlainText(Cow<'a, str>), + Sequence(Vec>), + Quote(Vec>), + Small(Vec>), + BoldItalic(Vec>), + Bold(Vec>), + Italic(Vec>), + Center(Vec>), + Strikethrough(Vec>), + PlainTag(String), + InlineCode(String), + InlineMath(String), + UrlRaw(String), + UrlNoEmbed(String), + Link { + label: Vec>, + href: String, + }, + LinkNoEmbed { + label: Vec>, + href: String, + }, + BlockCode { + lang: Option, + inner: String, + }, + BlockMath(String), + Function { + name: String, + params: HashMap>, + inner: Vec>, + }, + Mention { + name: String, + host: Option, + mention_type: MentionType, + }, + UnicodeEmoji(String), + ShortcodeEmoji { + shortcode: String, + host: Option, + }, + Hashtag(String), +} + +#[derive(Copy, Clone, Debug, Eq, PartialEq, Deserialize, Serialize, IntoStaticStr)] +// The alternative would be to implement a serde serializer for this one enum, but that's disgusting +#[strum(serialize_all = "snake_case")] +#[serde(rename_all = "snake_case")] +pub enum MentionType { + Community, + User, + MatrixUser, +} + +impl MentionType { + pub fn to_char(&self) -> char { + match self { + MentionType::Community => '!', + MentionType::User => '@', + MentionType::MatrixUser => ':', + } + } + + pub fn separator(&self) -> char { + match self { + MentionType::Community | MentionType::User => '@', + MentionType::MatrixUser => ':', + } + } +} + + +impl Token<'_> { + fn str_content_left(&self) -> Option<&str> { + match self { + Token::PlainText(text) => Some(text.as_ref()), + Token::Sequence(tokens) => tokens.first().and_then(Token::str_content_left), + 
Token::Quote(inner) => inner.first().and_then(Token::str_content_left),
+            Token::Small(inner) => inner.first().and_then(Token::str_content_left),
+            // BoldItalic and LinkNoEmbed are new in this commit; they traverse like their plain counterparts.
+            Token::BoldItalic(inner) => inner.first().and_then(Token::str_content_left),
+            Token::Bold(inner) => inner.first().and_then(Token::str_content_left),
+            Token::Italic(inner) => inner.first().and_then(Token::str_content_left),
+            Token::Center(inner) => inner.first().and_then(Token::str_content_left),
+            Token::Strikethrough(inner) => inner.first().and_then(Token::str_content_left),
+            Token::PlainTag(tag) => Some(tag.as_ref()),
+            Token::UrlRaw(url) => Some(url.as_ref()),
+            Token::UrlNoEmbed(url) => Some(url.as_ref()),
+            Token::Link { label, .. } | Token::LinkNoEmbed { label, .. } => {
+                label.first().and_then(Token::str_content_left)
+            }
+            Token::Function { inner, .. } => inner.first().and_then(Token::str_content_left),
+            Token::Mention { name, .. } => Some(name.as_ref()),
+            Token::UnicodeEmoji(code) => Some(code.as_ref()),
+            Token::Hashtag(tag) => Some(tag.as_ref()),
+            _ => None,
+        }
+    }
+
+    fn str_content_right(&self) -> Option<&str> {
+        match self {
+            Token::PlainText(text) => Some(text.as_ref()),
+            Token::Sequence(tokens) => tokens.last().and_then(Token::str_content_right),
+            Token::Quote(inner) => inner.last().and_then(Token::str_content_right),
+            Token::Small(inner) => inner.last().and_then(Token::str_content_right),
+            Token::BoldItalic(inner) => inner.last().and_then(Token::str_content_right),
+            Token::Bold(inner) => inner.last().and_then(Token::str_content_right),
+            Token::Italic(inner) => inner.last().and_then(Token::str_content_right),
+            Token::Center(inner) => inner.last().and_then(Token::str_content_right),
+            Token::Strikethrough(inner) => inner.last().and_then(Token::str_content_right),
+            Token::PlainTag(tag) => Some(tag.as_ref()),
+            Token::UrlRaw(url) => Some(url.as_ref()),
+            Token::UrlNoEmbed(url) => Some(url.as_ref()),
+            Token::Link { label, .. } | Token::LinkNoEmbed { label, .. } => {
+                label.last().and_then(Token::str_content_right)
+            }
+            Token::Function { inner, .. } => inner.last().and_then(Token::str_content_right),
+            Token::Mention { name, .. } => Some(name.as_ref()),
+            Token::UnicodeEmoji(code) => Some(code.as_ref()),
+            Token::Hashtag(tag) => Some(tag.as_ref()),
+            _ => None,
+        }
+    }
+
+    fn inner(&self) -> Token {
+        match self {
+            plain @ Token::PlainText(_) => plain.clone(),
+            sequence @ Token::Sequence(_) => sequence.clone(),
+            Token::Quote(inner)
+            | Token::Small(inner)
+            | Token::BoldItalic(inner)
+            | Token::Bold(inner)
+            | Token::Italic(inner)
+            | Token::Center(inner)
+            | Token::Strikethrough(inner)
+            | Token::Function { inner, .. } => Token::Sequence(inner.clone()),
+            Token::PlainTag(text) => Token::PlainText(text.clone().into()),
+            Token::InlineCode(code) => Token::PlainText(code.clone().into()),
+            Token::InlineMath(math) => Token::PlainText(math.clone().into()),
+            Token::UrlRaw(url) => Token::PlainText(url.clone().into()),
+            Token::UrlNoEmbed(url) => Token::PlainText(url.clone().into()),
+            Token::Link { label, .. } | Token::LinkNoEmbed { label, .. } => {
+                Token::Sequence(label.clone())
+            }
+            Token::BlockCode { inner, .. } => Token::PlainText(inner.clone().into()),
+            Token::BlockMath(math) => Token::PlainText(math.clone().into()),
+            Token::Mention { name, .. } => Token::PlainText(name.clone().into()),
+            Token::UnicodeEmoji(code) => Token::PlainText(code.clone().into()),
+            Token::ShortcodeEmoji { shortcode, .. } => Token::PlainText(shortcode.clone().into()),
+            Token::Hashtag(tag) => Token::PlainText(tag.clone().into()),
+        }
+    }
+
+    fn merged(&self) -> Token {
+        match self {
+            Token::Sequence(tokens) => {
+                // Merge adjacent plain-text runs and splice nested sequences flat.
+                let tokens_multi = tokens.iter().fold(Vec::new(), |mut acc, tok| {
+                    if let Some(Token::PlainText(last)) = acc.last_mut() {
+                        if let Token::PlainText(tok_text) = tok {
+                            last.to_mut().push_str(tok_text.as_ref());
+
+                            return acc;
+                        }
+                    }
+
+                    if let Token::Sequence(seq) = tok {
+                        let items = seq.iter().map(Token::merged).flat_map(|t| match t {
+                            Token::Sequence(seq) => Either::Left(seq.into_iter()),
+                            other => Either::Right(std::iter::once(other)),
+                        });
+
+                        for item in items {
+                            if let Some(Token::PlainText(last)) = acc.last_mut() {
+                                if let Token::PlainText(tok_text) = item {
+                                    last.to_mut().push_str(tok_text.as_ref());
+
+                                    continue;
+                                }
+                            }
+
+                            acc.push(item);
+                        }
+
+                        return acc;
+                    }
+
+                    acc.push(tok.merged());
+                    acc
+                });
+
+                if tokens_multi.len() == 1 {
+                    return tokens_multi.into_iter().next().unwrap();
+                }
+
+                Token::Sequence(tokens_multi)
+            }
+            Token::Quote(inner) => Token::Quote(inner.iter().map(Token::merged).collect()),
+            Token::Small(inner) => Token::Small(inner.iter().map(Token::merged).collect()),
+            Token::BoldItalic(inner) => Token::BoldItalic(inner.iter().map(Token::merged).collect()),
+            Token::Bold(inner) => Token::Bold(inner.iter().map(Token::merged).collect()),
+            Token::Italic(inner) => Token::Italic(inner.iter().map(Token::merged).collect()),
+            Token::Center(inner) => Token::Center(inner.iter().map(Token::merged).collect()),
+            Token::Strikethrough(inner) => {
+                Token::Strikethrough(inner.iter().map(Token::merged).collect())
+            }
+            Token::Link { label, href } => Token::Link {
+                label: label.iter().map(Token::merged).collect(),
+                href: href.clone(),
+            },
+            Token::LinkNoEmbed { label, href } => Token::LinkNoEmbed {
+                label: label.iter().map(Token::merged).collect(),
+                href: href.clone(),
+            },
+            Token::Function {
+                name,
+                params,
+                inner,
+            } => Token::Function {
+                name: name.clone(),
+                params: params.clone(),
+                inner: inner.iter().map(Token::merged).collect(),
+            },
+            other => other.clone(),
+        }
+    }
+
+    pub fn walk_map_collect<T>(&self, func: &impl Fn(&Token) -> Option<T>, out: &mut Vec<T>) {
+        if let Some(v) = func(self) {
+            out.push(v)
+        }
+
+        match self {
+            Token::Sequence(items)
+            | Token::Quote(items)
+            | Token::Small(items)
+            | Token::BoldItalic(items)
+            | Token::Bold(items)
+            | Token::Italic(items)
+            | Token::Center(items)
+            | Token::Function { inner: items, .. }
+            | Token::Link { label: items, .. }
+            | Token::LinkNoEmbed { label: items, .. }
+            | Token::Strikethrough(items) => {
+                items.iter().for_each(|tok| tok.walk_map_collect(func, out))
+            }
+            _ => {}
+        }
+    }
+
+    pub fn walk_speech_transform(&mut self, func: &impl Fn(&mut Cow<'_, str>)) {
+        match self {
+            Token::Sequence(items)
+            | Token::Small(items)
+            | Token::BoldItalic(items)
+            | Token::Bold(items)
+            | Token::Italic(items)
+            | Token::Center(items)
+            | Token::Function { inner: items, .. }
+            | Token::Strikethrough(items) => {
+                items
+                    .iter_mut()
+                    .for_each(|tok| tok.walk_speech_transform(func));
+            }
+            Token::PlainText(text) => func(text),
+            _ => {}
+        }
+    }
+}
diff --git a/magnetar_mmm_parser/src/parser.rs b/magnetar_mmm_parser/src/parser.rs
new file mode 100644
index 0000000..e707850
--- /dev/null
+++ b/magnetar_mmm_parser/src/parser.rs
@@ -0,0 +1,157 @@
+use crate::types::{Effect, Input, Parser, ParserCont, ParserRet, State};
+
+fn line_start<'a>(
+    state: &mut State,
+    inp: &mut impl Input<'a>,
+    _output: &'_ mut impl FnMut(Effect<'a>),
+    cont: impl ParserCont,
+) -> ParserRet {
+    // Recursive ascent: a state never calls into a sub-parser; it returns
+    // control through `cont`, naming the state(s) that should run next.
+    match inp.view().as_bytes() {
+        [b'>', b' ', ..] => cont.continue_with2((line_start, quote)),
+        [b'`', b'`', b'`', ..] => cont.continue_with(CodeBlock {}),
+        [b'\\', b'[', ..] => cont.continue_with(BlockMath {}),
+        [b'<', b'c', b'e', b'n', b't', b'e', b'r', b'>', ..] => cont.continue_with2((inline, center_tag_end)),
+        _ => cont.continue_with(inline),
+    }
+}
+
+fn inline<'a>(
+    state: &mut State,
+    inp: &mut impl Input<'a>,
+    _output: &'_ mut impl FnMut(Effect<'a>),
+    cont: impl ParserCont,
+) -> ParserRet {
+    // Every opener currently continues with `inline` as a placeholder; the
+    // dedicated delimiter/tag states still have to be wired up here.
+    match inp.view().as_bytes() {
+        [b'\n', ..] => cont.continue_with(line_start),
+        [b'<', b'b', b'>', ..] => cont.continue_with(inline),
+        [b'<', b's', b'>', ..] => cont.continue_with(inline),
+        [b'<', b'i', b'>', ..] => cont.continue_with(inline),
+        [b'<', b'p', b'l', b'a', b'i', b'n', b'>', ..] => cont.continue_with(inline),
+        [b'<', b's', b'm', b'a', b'l', b'l', b'>', ..] => cont.continue_with(inline),
+        [b'*', b'*', ..] => cont.continue_with(inline),
+        [b'_', b'_', ..] => cont.continue_with(inline),
+        [b'*', ..] => cont.continue_with(inline),
+        [b'_', ..] => cont.continue_with(inline),
+        [b'~', b'~', ..] => cont.continue_with(inline),
+        [b'`', ..] => cont.continue_with(inline),
+        [b'\\', b'(', ..] => cont.continue_with(inline),
+        // Fallthrough: plain text and emoji.
+        _ => cont.continue_with(text_or_emoji),
+    }
+}
+
+fn text_or_emoji<'a>(
+    state: &mut State,
+    input: &mut impl Input<'a>,
+    output: &'_ mut impl FnMut(Effect<'a>),
+    cont: impl ParserCont,
+) -> ParserRet {
+    // Consumes a single grapheme cluster and emits it, recognizing unicode
+    // emoji (with trailing zero-width (non-)joiners trimmed) along the way.
+    // The bare `return`s still need a terminal ParserRet value once one exists.
+    let Some(view) = input.next() else {
+        return;
+    };
+
+    let emoji_str = view.trim_end_matches(['\u{200c}', '\u{200d}']);
+    if emojis::get(emoji_str).is_some() {
+        output(Effect::Output(emoji_str));
+        return;
+    };
+
+    output(Effect::Output(view));
+}
+
+// The states below are stubs in this commit; their bodies are still to be
+// filled in as the migration off nom progresses.
+fn block_quote_end<'a>(
+    state: &mut State,
+    inp: &mut impl Input<'a>,
+    _output: &'_ mut impl FnMut(Effect<'a>),
+    cont: impl ParserCont,
+) -> ParserRet {}
+
+fn code_block_end<'a>(
+    state: &mut State,
+    inp: &mut impl Input<'a>,
+    _output: &'_ mut impl FnMut(Effect<'a>),
+    cont: impl ParserCont,
+) -> ParserRet {}
+
+fn block_math_end<'a>(
+    state: &mut State,
+    inp: &mut impl Input<'a>,
+    _output: &'_ mut impl FnMut(Effect<'a>),
+    cont: impl ParserCont,
+) -> ParserRet {}
+
+fn center_tag_end<'a>(
+    state: &mut State,
+    inp: &mut impl Input<'a>,
+    _output: &'_ mut impl FnMut(Effect<'a>),
+    cont: impl ParserCont,
+) -> ParserRet {}
+
+#[derive(Copy, Clone)]
+enum TagInlineKind {
+    TagSmall,
+    TagPlain,
+    TagBold,
+    TagItalic,
+    TagStrikethrough,
+}
+
+struct TagInline {
+    kind: TagInlineKind,
+}
+
+impl Parser for TagInline {}
+
+fn inline_math_end<'a>(
+    state: &mut State,
+    inp: &mut impl Input<'a>,
+    _output: &'_ mut impl FnMut(Effect<'a>),
+    cont: impl ParserCont,
+) -> ParserRet {}
+
+fn inline_code_end<'a>(
+    state: &mut State,
+    inp: &mut impl Input<'a>,
+    _output: &'_ mut impl FnMut(Effect<'a>),
+    cont: impl ParserCont,
+) -> ParserRet {}
+
+struct Url {}
+
+impl Parser for Url {
+    fn take<'a>(
+        &mut self,
+        state: &mut State,
+        input: &mut impl Input<'a>,
+        output: &'_ mut impl FnMut(Effect<'a>),
+        cont: impl ParserCont,
+    ) -> ParserRet {}
+}
+
+// Carried over from the nom implementation and not yet ported to the
+// recursive-ascent interface; the `&self` receiver and the nom combinators
+// are dangling in this module for now.
+#[inline]
+fn url_chars_base<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
+    alt((
+        recognize(tuple((
+            tag("["),
+            many_till(
+                self.increase_nesting(self.partial_span(Self::url_chars_base)),
+                tag("]"),
+            ),
+        ))),
+        recognize(tuple((
+            tag("("),
+            many_till(
+                self.increase_nesting(self.partial_span(Self::url_chars_base)),
+                tag(")"),
+            ),
+        ))),
+        recognize(tuple((
+            not(satisfy(char::is_control)),
+            not(satisfy(char::is_whitespace)),
+            not(one_of(")]>")),
+            anychar,
+        ))),
+    ))(input)
+}
diff --git a/magnetar_mmm_parser/src/test.rs b/magnetar_mmm_parser/src/test.rs
index 
5f3e5da..b47b1af 100644 --- a/magnetar_mmm_parser/src/test.rs +++ b/magnetar_mmm_parser/src/test.rs @@ -1,17 +1,8 @@ #![cfg(test)] use std::collections::HashMap; -use nom::bytes::complete::tag; - -use crate::{xml_write::to_xml_string, Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT}; - -fn parse_full(string: &str) -> Token { - Context::default() - .full(Span::new_extra(string, SpanMeta::default())) - .unwrap() - .1 - .merged() -} +use crate::output_types::{MentionType, Token}; +use crate::{parse_full, xml_write::to_xml_string}; #[test] fn parse_empty() { @@ -27,9 +18,9 @@ fn parse_url_chars() { "https://en.wikipedia.org/wiki/Sandbox_(computer_security))", SpanMeta::default(), )) - .unwrap() - .1 - .into_fragment(), + .unwrap() + .1 + .into_fragment(), "https://en.wikipedia.org/wiki/Sandbox_(computer_security)" ); @@ -60,9 +51,9 @@ fn parse_url_chars() { "https://cs.wikipedia.org/wiki/Among Us )", SpanMeta::default(), )) - .unwrap() - .1 - .into_fragment(), + .unwrap() + .1 + .into_fragment(), "https://cs.wikipedia.org/wiki/Among Us" ); @@ -71,9 +62,9 @@ fn parse_url_chars() { "https://en.wikipedia.org/wiki/Among Us )", SpanMeta::default(), )) - .unwrap() - .1 - .into_fragment(), + .unwrap() + .1 + .into_fragment(), "https://en.wikipedia.org/wiki/Among" ); } @@ -82,17 +73,17 @@ fn parse_url_chars() { fn parse_formatting() { assert_eq!( parse_full(r#"~~stikethrough~~"#), - Token::Strikethrough(Box::new(Token::PlainText("stikethrough".into()))), + Token::Strikethrough(vec![Token::PlainText("stikethrough".into())]), ); assert_eq!( parse_full(r#"**bold**"#), - Token::Bold(Box::new(Token::PlainText("bold".into()))), + Token::Bold(vec![Token::PlainText("bold".into())]), ); assert_eq!( parse_full(r#"*italic*"#), - Token::Italic(Box::new(Token::PlainText("italic".into()))), + Token::Italic(vec![Token::PlainText("italic".into())]), ); assert_eq!( @@ -109,7 +100,7 @@ fn parse_formatting() { parse_full("intra*word*italic"), Token::Sequence(vec![ Token::PlainText("intra".into()), - Token::Italic(Box::new(Token::PlainText("word".into()))), + Token::Italic(vec![Token::PlainText("word".into())]), Token::PlainText("italic".into()), ]) ); @@ -123,13 +114,13 @@ fn parse_formatting() { parse_full(r#"long text with a *footnote text"#), Token::Sequence(vec![ Token::PlainText("long text with a *footnote ".into()), - Token::Bold(Box::new(Token::PlainText("text".into()))), + Token::Bold(vec![Token::PlainText("text".into())]), ]) ); assert_eq!( parse_full(r#"*"italic"*"#), - Token::Italic(Box::new(Token::PlainText("\"italic\"".into()))) + Token::Italic(vec![Token::PlainText("\"italic\"".into())]) ); assert_eq!( @@ -161,23 +152,23 @@ fn parse_formatting() { assert_eq!( parse_full(r#"***bold italic***"#), - Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText( + Token::Bold(vec![Token::Italic(vec![Token::PlainText( "bold italic".into() - ))))) + )])]) ); assert_eq!( parse_full(r#"bold italic"#), - Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText( + Token::Bold(vec![Token::Italic(vec![Token::PlainText( "bold italic".into() - ))))) + )])]) ); assert_eq!( parse_full("~~*hello\nworld*"), Token::Sequence(vec![ Token::PlainText("~~".into()), - Token::Italic(Box::new(Token::PlainText("hello\nworld".into()))), + Token::Italic(vec![Token::PlainText("hello\nworld".into())]), ]) ) } @@ -188,7 +179,7 @@ fn parse_flanking() { parse_full(r#"aaa*iii*bbb"#), Token::Sequence(vec![ Token::PlainText("aaa".into()), - Token::Italic(Box::new(Token::PlainText("iii".into()))), + 
Token::Italic(vec![Token::PlainText("iii".into())]), Token::PlainText("bbb".into()), ]) ); @@ -202,33 +193,33 @@ fn parse_flanking() { parse_full("aaa\n_iii_\nbbb"), Token::Sequence(vec![ Token::PlainText("aaa\n".into()), - Token::Italic(Box::new(Token::PlainText("iii".into()))), + Token::Italic(vec![Token::PlainText("iii".into())]), Token::PlainText("\nbbb".into()), ]) ); assert_eq!( parse_full(r#"*iii*"#), - Token::Italic(Box::new(Token::PlainText("iii".into()))) + Token::Italic(vec![Token::PlainText("iii".into())]) ); assert_eq!( parse_full(r#"_iii_"#), - Token::Italic(Box::new(Token::PlainText("iii".into()))) + Token::Italic(vec![Token::PlainText("iii".into())]) ); assert_eq!( parse_full(r#"aaa*iii*"#), Token::Sequence(vec![ Token::PlainText("aaa".into()), - Token::Italic(Box::new(Token::PlainText("iii".into()))), + Token::Italic(vec![Token::PlainText("iii".into())]), ]) ); assert_eq!( parse_full(r#"*iii*bbb"#), Token::Sequence(vec![ - Token::Italic(Box::new(Token::PlainText("iii".into()))), + Token::Italic(vec![Token::PlainText("iii".into())]), Token::PlainText("bbb".into()), ]) ); @@ -309,12 +300,12 @@ a^2 + b^2 = c^2 ๐Ÿฆ‹๐Ÿณ๏ธโ€โšง๏ธ text"# ), - Token::Center(Box::new(Token::Sequence(vec![ + Token::Center(vec![ Token::PlainText("centered\n".into()), Token::UnicodeEmoji("๐Ÿฆ‹".into()), Token::UnicodeEmoji("๐Ÿณ๏ธโ€โšง๏ธ".into()), Token::PlainText("\ntext".into()), - ]))) + ]) ); assert_eq!( @@ -323,11 +314,11 @@ a^2 + b^2 = c^2 > ๐Ÿ‘ฉ๐Ÿฝโ€๐Ÿคโ€๐Ÿ‘ฉ๐Ÿผ > text"# ), - Token::Quote(Box::new(Token::Center(Box::new(Token::Sequence(vec![ + Token::Quote(vec![Token::Center(vec![ Token::PlainText("centered\n".into()), Token::UnicodeEmoji("๐Ÿ‘ฉ๐Ÿฝโ€๐Ÿคโ€๐Ÿ‘ฉ๐Ÿผ".into()), Token::PlainText("\ntext".into()) - ]))))), + ])]), ); assert_eq!( @@ -335,11 +326,11 @@ a^2 + b^2 = c^2 Token::Function { name: "x2".into(), params: HashMap::new(), - inner: Box::new(Token::Sequence(vec![ + inner: vec![ Token::Function { name: "sparkle".into(), params: HashMap::new(), - inner: Box::new(Token::UnicodeEmoji("๐Ÿฅบ".into())), + inner: vec![Token::UnicodeEmoji("๐Ÿฅบ".into())], }, Token::UnicodeEmoji("๐Ÿ’œ".into()), Token::Function { @@ -350,10 +341,10 @@ a^2 + b^2 = c^2 params.insert("speed".into(), Some("5s".into())); params }, - inner: Box::new(Token::UnicodeEmoji("โค๏ธ".into())), + inner: vec![Token::UnicodeEmoji("โค๏ธ".into())], }, Token::UnicodeEmoji("๐ŸฆŠ".into()), - ])) + ] }, ); @@ -362,13 +353,13 @@ a^2 + b^2 = c^2 Token::Sequence(vec![ Token::PlainText("bold ".into()), Token::Mention { - mention_type: crate::MentionType::User, + mention_type: MentionType::User, name: "tag1".into(), host: None }, Token::PlainText(" ".into()), Token::Mention { - mention_type: crate::MentionType::User, + mention_type: MentionType::User, name: "tag2".into(), host: None }, @@ -386,11 +377,11 @@ a^2 + b^2 = c^2 >> Nested quote "# ), - Token::Quote(Box::new(Token::Sequence(vec![ + Token::Quote(vec![ Token::PlainText("test\n".into()), - Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))), - Token::Quote(Box::new(Token::PlainText("Nested quote".into()))) - ]))), + Token::Italic(vec![Token::PlainText("\nitalic\n".into())]), + Token::Quote(vec![Token::PlainText("Nested quote".into())]) + ]), ); } @@ -442,9 +433,8 @@ fn parse_link() { Token::Sequence(vec![ Token::PlainText("Link test: ".into()), Token::Link { - label: Box::new(Token::PlainText("label".into())), - href: "https://example.com".into(), - embed: true, + label: vec![Token::PlainText("label".into())], + href: "https://example.com".into() }, 
]) ); @@ -481,10 +471,9 @@ fn parse_link() { parse_full("Link test: ?[label](https://awawa.gay)"), Token::Sequence(vec![ Token::PlainText("Link test: ".into()), - Token::Link { - label: Box::new(Token::PlainText("label".into())), + Token::LinkNoEmbed { + label: vec![Token::PlainText("label".into())], href: "https://awawa.gay".into(), - embed: false, }, ]) ); @@ -493,10 +482,9 @@ fn parse_link() { parse_full("Link test: ?[label](https://awawa.gay)test"), Token::Sequence(vec![ Token::PlainText("Link test: ".into()), - Token::Link { - label: Box::new(Token::PlainText("label".into())), + Token::LinkNoEmbed { + label: vec![Token::PlainText("label".into())], href: "https://awawa.gay".into(), - embed: false, }, Token::PlainText("test".into()), ]) @@ -506,10 +494,9 @@ fn parse_link() { parse_full("Link test: (?[label](https://awawa.gay))"), Token::Sequence(vec![ Token::PlainText("Link test: (".into()), - Token::Link { - label: Box::new(Token::PlainText("label".into())), + Token::LinkNoEmbed { + label: vec![Token::PlainText("label".into())], href: "https://awawa.gay".into(), - embed: false, }, Token::PlainText(")".into()), ]) @@ -546,7 +533,7 @@ fn parse_mention() { assert_eq!( parse_full("@tag"), Token::Mention { - mention_type: crate::MentionType::User, + mention_type: MentionType::User, name: "tag".into(), host: None, } @@ -562,7 +549,7 @@ fn parse_mention() { Token::Sequence(vec![ Token::PlainText("hgsjlkdsa ".into()), Token::Mention { - mention_type: crate::MentionType::User, + mention_type: MentionType::User, name: "tag".into(), host: None, }, @@ -575,7 +562,7 @@ fn parse_mention() { Token::Sequence(vec![ Token::PlainText("hgsjlkdsa ".into()), Token::Mention { - mention_type: crate::MentionType::User, + mention_type: MentionType::User, name: "tag".into(), host: None, }, @@ -588,7 +575,7 @@ fn parse_mention() { Token::Sequence(vec![ Token::PlainText("aaaa ".into()), Token::Mention { - mention_type: crate::MentionType::User, + mention_type: MentionType::User, name: "tag".into(), host: Some("domain".into()), }, @@ -601,7 +588,7 @@ fn parse_mention() { Token::Sequence(vec![ Token::PlainText("test ".into()), Token::Mention { - mention_type: crate::MentionType::User, + mention_type: MentionType::User, name: "tag".into(), host: Some("domain".into()), }, @@ -614,7 +601,7 @@ fn parse_mention() { Token::Sequence(vec![ Token::PlainText("test ".into()), Token::Mention { - mention_type: crate::MentionType::User, + mention_type: MentionType::User, name: "tag".into(), host: Some("domain.gay".into()), }, @@ -627,7 +614,7 @@ fn parse_mention() { Token::Sequence(vec![ Token::PlainText("test ".into()), Token::Mention { - mention_type: crate::MentionType::User, + mention_type: MentionType::User, name: "tag".into(), host: Some("domain".into()), }, @@ -640,7 +627,7 @@ fn parse_mention() { Token::Sequence(vec![ Token::PlainText("test ".into()), Token::Mention { - mention_type: crate::MentionType::Community, + mention_type: MentionType::Community, name: "tag".into(), host: Some("domain.com".into()), }, @@ -651,7 +638,7 @@ fn parse_mention() { assert_eq!( parse_full("@tag:domain.com"), Token::Mention { - mention_type: crate::MentionType::MatrixUser, + mention_type: MentionType::MatrixUser, name: "tag".into(), host: Some("domain.com".into()) }, @@ -758,20 +745,10 @@ fn xml_serialization() { &to_xml_string(&parse_full( "@natty $[spin.speed=0.5s ๐Ÿฅบ]:cat_attack: test" )) - .unwrap(), + .unwrap(), r#" ๐Ÿฅบcat_attack test"# ); - assert_eq!( - &to_xml_string(&parse_full( - "Ring Galaxy AM 0644 741 from Hubble\nCredits: 
AURA, STScI, J. Higdon, Cornell, ESA, #NASA\n#nature #space #astrophotography" - )) - .unwrap(), - r#"Ring Galaxy AM 0644 741 from Hubble -Credits: AURA, STScI, J. Higdon, Cornell, ESA, NASA -nature space astrophotography"# - ); - assert_eq!( &to_xml_string(&parse_full( r#" @@ -779,7 +756,7 @@ Credits: AURA, STScI, J. Higdon, Cornell, ESA, NASA var x = undefined; ``` "# )) - .unwrap(), + .unwrap(), "var x = undefined;" ); } diff --git a/magnetar_mmm_parser/src/types.rs b/magnetar_mmm_parser/src/types.rs new file mode 100644 index 0000000..1d2f8cd --- /dev/null +++ b/magnetar_mmm_parser/src/types.rs @@ -0,0 +1,120 @@ +use unicode_segmentation::{Graphemes, UnicodeSegmentation}; + +#[derive(Debug, Copy, Clone)] +pub(crate) struct ParseSpan<'a> { + pub(crate) source: &'a str, + pub(crate) offset: usize, + pub(crate) length: usize, +} + +impl ParseSpan<'_> { + pub(crate) fn concat(self, other: Self) -> Option { + if self.source != other.source { + panic!("Attempted to concat slices from different strings"); + } + + if self.offset + self.length != other.offset { + return None; + } + + Some(ParseSpan { + source: self.source, + offset: self.offset, + length: self.length + other.length, + }) + } + + pub(crate) fn spanned_source(&self) -> &str { + &self.source[self.offset..self.offset + self.length] + } +} + + +pub(crate) struct TokStream<'a>(ParseSpan<'a>, Graphemes<'a>); + +impl<'a> From<&'a str> for TokStream<'a> { + fn from(source: &'a str) -> Self { + TokStream( + ParseSpan { + source, + length: source.len(), + offset: 0, + }, + source.graphemes(true), + ) + } +} + +pub(crate) trait Input<'a> { + fn next(&mut self) -> Option<&'a str>; + fn view(&self) -> &'a str; +} + +impl<'a> Input<'a> for TokStream<'a> { + #[inline] + fn next(&mut self) -> Option<&'a str> { + if let Some(p) = self.1.next() { + let length = p.len(); + self.0.offset += length; + self.0.length -= length; + return Some(p); + } + + None + } + + #[inline] + fn view(&self) -> &'a str { + &self.0.source[self.0.offset..self.0.offset + self.0.length] + } +} + +#[derive(Debug, Copy, Clone)] +pub(crate) struct Lex<'a> { + pub(crate) token: &'a str, + pub(crate) span: ParseSpan<'a>, +} + +pub(crate) type OutTok<'a> = Lex<'a>; + +pub(crate) const MAX_DEPTH: usize = 24; + +#[derive(Debug, Default, Clone, Copy)] +pub(crate) struct State { + pub(crate) depth: usize, +} + +pub(crate) enum Effect<'a> { + Output(OutTok<'a>) +} + + +#[must_use] +pub(crate) struct ParserRet { + _private: (), +} + +pub(crate) trait ParserCont { + fn continue_with(self, to: impl Parser) -> ParserRet; + fn continue_with2(self, to: (impl Parser, impl Parser)) -> ParserRet; +} + +pub(crate) trait Parser { + fn take<'a>( + &mut self, + state: &mut State, + input: &mut impl Input<'a>, + handler: &'_ mut impl FnMut(Effect<'a>), + visitor: impl ParserCont, + ) -> ParserRet; +} + +impl Parser for fn(&mut State, &mut I, &'_ mut F, V) -> ParserRet { + fn take<'a>(&mut self, + state: &mut State, + input: &mut impl Input<'a>, + handler: &'_ mut impl FnMut(Effect<'a>), + visitor: impl ParserCont) -> ParserRet { + self(state, input, handler, visitor) + } +} diff --git a/magnetar_mmm_parser/src/xml_write.rs b/magnetar_mmm_parser/src/xml_write.rs index 6565807..2cdb3b6 100644 --- a/magnetar_mmm_parser/src/xml_write.rs +++ b/magnetar_mmm_parser/src/xml_write.rs @@ -1,9 +1,8 @@ use std::io::{Cursor, Write}; +use crate::output_types::Token; use quick_xml::events::{BytesText, Event}; -use crate::Token; - impl Token { fn write(&self, writer: &mut quick_xml::Writer) -> 
quick_xml::Result<()> { match self {
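
The core of this change is the `Parser`/`ParserCont` pair in types.rs: a parser state consumes graphemes from an `Input`, reports results through the `Effect` callback, and hands control to its successor via `cont.continue_with(...)` instead of calling sub-parsers, so nesting depth becomes ordinary data in `State` (bounded by `MAX_DEPTH`) rather than call-stack growth. The following is a minimal, self-contained sketch of that control flow; `Machine`, `Next`, and the driver loop are simplified stand-ins for illustration, not the crate's actual API:

// Illustrative stand-ins only; the real traits live in types.rs.
type Next = fn(&mut Machine);

struct Machine {
    rest: &'static str,
    todo: Vec<Next>,  // continuation stack: "what runs next"
    out: Vec<String>, // flat output stream, standing in for Effect::Output
}

fn line_start(m: &mut Machine) {
    // "> " opens a quote: schedule the quote terminator *under* the inline
    // state, mirroring cont.continue_with2((line_start, quote)).
    if let Some(stripped) = m.rest.strip_prefix("> ") {
        m.rest = stripped;
        m.out.push("[quote".to_string());
        m.todo.push(quote_end);
    }
    m.todo.push(inline);
}

fn inline(m: &mut Machine) {
    match m.rest.chars().next() {
        Some('\n') => {
            m.rest = &m.rest[1..];
            m.todo.push(line_start); // hand control back to the line parser
        }
        Some(c) => {
            m.out.push(c.to_string());
            m.rest = &m.rest[c.len_utf8()..];
            m.todo.push(inline); // loop by rescheduling, not by recursing
        }
        None => {} // end of input: schedule nothing, let the stack drain
    }
}

fn quote_end(m: &mut Machine) {
    m.out.push("]".to_string());
}

fn main() {
    let mut m = Machine {
        rest: "> hi",
        todo: vec![line_start],
        out: Vec::new(),
    };
    // The driver: pop and run states until no successor is scheduled.
    while let Some(state) = m.todo.pop() {
        state(&mut m);
    }
    assert_eq!(m.out, ["[quote", "h", "i", "]"]);
}

The same shape appears in parser.rs, where `line_start` schedules `(line_start, quote)` through `continue_with2`; tracking depth explicitly in `State::depth` is presumably what replaces the `increase_nesting` guard the old nom implementation needed to avoid unbounded recursion.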