Switch towards recursive ascent

Author: Natty, 2024-12-08 00:04:48 +01:00
parent f71429bfe0
commit 9f62c72f29
Signed by: natty (GPG Key ID: BF6CB659ADEE60EC)
8 changed files with 633 additions and 1091 deletions

Cargo.lock (generated)

@@ -435,12 +435,6 @@ dependencies = [
  "syn 1.0.109",
 ]
-
-[[package]]
-name = "bytecount"
-version = "0.6.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce"
 
 [[package]]
 name = "byteorder"
 version = "1.5.0"
@@ -632,7 +626,6 @@ dependencies = [
  "itoa",
  "rustversion",
  "ryu",
- "serde",
  "static_assertions",
 ]
@@ -2054,13 +2047,11 @@ dependencies = [
 name = "magnetar_mmm_parser"
 version = "0.3.0-alpha"
 dependencies = [
- "compact_str",
  "either",
  "emojis",
- "nom",
- "nom_locate",
  "quick-xml",
  "serde",
- "smallvec",
  "strum",
  "tracing",
  "unicode-segmentation",
@@ -2325,17 +2316,6 @@ dependencies = [
  "minimal-lexical",
 ]
-
-[[package]]
-name = "nom_locate"
-version = "4.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e3c83c053b0713da60c5b8de47fe8e494fe3ece5267b2f23090a07a053ba8f3"
-dependencies = [
- "bytecount",
- "memchr",
- "nom",
-]
 
 [[package]]
 name = "nu-ansi-term"
 version = "0.46.0"

Cargo.toml (magnetar_mmm_parser)

@@ -11,10 +11,8 @@ xml = ["dep:quick-xml"]
 [dependencies]
 either = { workspace = true }
 emojis = { workspace = true }
-nom = { workspace = true }
-nom_locate = { workspace = true }
-compact_str = { workspace = true, features = ["serde"] }
 serde = { workspace = true, features = ["derive"] }
-smallvec = { workspace = true }
 strum = { workspace = true, features = ["derive"] }
 tracing = { workspace = true }
 unicode-segmentation = { workspace = true }

File diff suppressed because it is too large.

View File

@ -0,0 +1,261 @@
use either::Either;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::collections::HashMap;
use strum::IntoStaticStr;
#[derive(Debug, Clone, Deserialize, Serialize, Eq, PartialEq)]
pub enum Token<'a> {
PlainText(Cow<'a, str>),
Sequence(Vec<Token<'a>>),
Quote(Vec<Token<'a>>),
Small(Vec<Token<'a>>),
BoldItalic(Vec<Token<'a>>),
Bold(Vec<Token<'a>>),
Italic(Vec<Token<'a>>),
Center(Vec<Token<'a>>),
Strikethrough(Vec<Token<'a>>),
PlainTag(String),
InlineCode(String),
InlineMath(String),
UrlRaw(String),
UrlNoEmbed(String),
Link {
label: Vec<Token<'a>>,
href: String,
},
LinkNoEmbed {
label: Vec<Token<'a>>,
href: String,
},
BlockCode {
lang: Option<String>,
inner: String,
},
BlockMath(String),
Function {
name: String,
params: HashMap<String, Option<String>>,
inner: Vec<Token<'a>>,
},
Mention {
name: String,
host: Option<String>,
mention_type: MentionType,
},
UnicodeEmoji(String),
ShortcodeEmoji {
shortcode: String,
host: Option<String>,
},
Hashtag(String),
}
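To make the shape of the new Vec-based tree concrete, here is a hand-built value (an illustrative sketch only, not something produced by running the parser):

// Illustrative only: a token tree of the shape this enum describes,
// built by hand for the input "hi **there** #mmm".
fn example_tree() -> Token<'static> {
    Token::Sequence(vec![
        Token::PlainText("hi ".into()),
        Token::Bold(vec![Token::PlainText("there".into())]),
        Token::PlainText(" ".into()),
        Token::Hashtag("mmm".to_string()),
    ])
}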
#[derive(Copy, Clone, Debug, Eq, PartialEq, Deserialize, Serialize, IntoStaticStr)]
// The alternative would be to implement a serde serializer for this one enum, but that's disgusting
#[strum(serialize_all = "snake_case")]
#[serde(rename_all = "snake_case")]
pub enum MentionType {
Community,
User,
MatrixUser,
}
impl MentionType {
pub fn to_char(&self) -> char {
match self {
MentionType::Community => '!',
MentionType::User => '@',
MentionType::MatrixUser => ':',
}
}
pub fn separator(&self) -> char {
match self {
MentionType::Community | MentionType::User => '@',
MentionType::MatrixUser => ':',
}
}
}
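A small sketch of how the variants map to strings and characters, assuming the strum and serde attributes above behave as configured (illustrative, not part of the committed file):

// Illustrative only: expected behaviour of the derives and accessors above.
fn mention_type_demo() {
    // strum's IntoStaticStr with serialize_all = "snake_case"
    let tag: &'static str = MentionType::MatrixUser.into();
    assert_eq!(tag, "matrix_user");

    // Leading sigil and name/host separator per variant.
    assert_eq!(MentionType::User.to_char(), '@');
    assert_eq!(MentionType::Community.to_char(), '!');
    assert_eq!(MentionType::User.separator(), '@');
    assert_eq!(MentionType::MatrixUser.separator(), ':');
}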
impl Token<'_> {
fn str_content_left(&self) -> Option<&str> {
match self {
Token::PlainText(text) => Some(text.as_ref()),
Token::Sequence(tokens) => tokens.first().and_then(Token::str_content_left),
Token::Quote(inner) => inner.first().and_then(Token::str_content_left),
Token::Small(inner) => inner.first().and_then(Token::str_content_left),
Token::Bold(inner) => inner.first().and_then(Token::str_content_left),
Token::Italic(inner) => inner.first().and_then(Token::str_content_left),
Token::Center(inner) => inner.first().and_then(Token::str_content_left),
Token::Strikethrough(inner) => inner.first().and_then(Token::str_content_left),
Token::PlainTag(tag) => Some(tag.as_ref()),
Token::UrlRaw(url) => Some(url.as_ref()),
Token::UrlNoEmbed(url) => Some(url.as_ref()),
Token::Link { label, .. } => label.first().and_then(Token::str_content_left),
Token::Function { inner, .. } => inner.first().and_then(Token::str_content_left),
Token::Mention { name, .. } => Some(name.as_ref()),
Token::UnicodeEmoji(code) => Some(code.as_ref()),
Token::Hashtag(tag) => Some(tag.as_ref()),
_ => None,
}
}
fn str_content_right(&self) -> Option<&str> {
match self {
Token::PlainText(text) => Some(text.as_ref()),
Token::Sequence(tokens) => tokens.last().and_then(Token::str_content_right),
Token::Quote(inner) => inner.last().and_then(Token::str_content_right),
Token::Small(inner) => inner.last().and_then(Token::str_content_right),
Token::Bold(inner) => inner.last().and_then(Token::str_content_right),
Token::Italic(inner) => inner.last().and_then(Token::str_content_right),
Token::Center(inner) => inner.last().and_then(Token::str_content_right),
Token::Strikethrough(inner) => inner.last().and_then(Token::str_content_right),
Token::PlainTag(tag) => Some(tag.as_ref()),
Token::UrlRaw(url) => Some(url.as_ref()),
Token::UrlNoEmbed(url) => Some(url.as_ref()),
Token::Link { label, .. } => label.last().and_then(Token::str_content_right),
Token::Function { inner, .. } => inner.last().and_then(Token::str_content_right),
Token::Mention { name, .. } => Some(name.as_ref()),
Token::UnicodeEmoji(code) => Some(code.as_ref()),
Token::Hashtag(tag) => Some(tag.as_ref()),
_ => None,
}
}
fn inner(&self) -> Token {
match self {
plain @ Token::PlainText(_) => plain.clone(),
sequence @ Token::Sequence(_) => sequence.clone(),
Token::Quote(inner) => Token::Sequence(inner.iter().map(Token::inner).collect()),
Token::Small(inner) => Token::Sequence(inner.iter().map(Token::inner).collect()),
Token::Bold(inner) => Token::Sequence(inner.iter().map(Token::inner).collect()),
Token::Italic(inner) => Token::Sequence(inner.iter().map(Token::inner).collect()),
Token::Center(inner) => Token::Sequence(inner.iter().map(Token::inner).collect()),
Token::Strikethrough(inner) => Token::Sequence(inner.iter().map(Token::inner).collect()),
Token::PlainTag(text) => Token::PlainText(text.clone().into()),
Token::InlineCode(code) => Token::PlainText(code.clone().into()),
Token::InlineMath(math) => Token::PlainText(math.clone().into()),
Token::UrlRaw(url) => Token::PlainText(url.clone().into()),
Token::UrlNoEmbed(url) => Token::PlainText(url.clone().into()),
Token::Link { label, .. } => Token::Sequence(label.iter().map(Token::inner).collect()),
Token::LinkNoEmbed { label, .. } => Token::Sequence(label.iter().map(Token::inner).collect()),
Token::BlockCode { inner, .. } => Token::PlainText(inner.clone().into()),
Token::BlockMath(math) => Token::PlainText(math.clone().into()),
Token::Function { inner, .. } => Token::Sequence(inner.iter().map(Token::inner).collect()),
Token::Mention { name, .. } => Token::PlainText(name.clone().into()),
Token::UnicodeEmoji(code) => Token::PlainText(code.clone().into()),
Token::ShortcodeEmoji { shortcode, .. } => Token::PlainText(shortcode.clone().into()),
Token::Hashtag(tag) => Token::PlainText(tag.clone().into()),
}
}
fn merged(&self) -> Token {
match self {
Token::Sequence(tokens) => {
let tokens_multi = tokens.iter().fold(Vec::new(), |mut acc, tok| {
if let Some(Token::PlainText(last)) = acc.last_mut() {
if let Token::PlainText(tok_text) = tok {
*last += tok_text.as_ref();
return acc;
}
}
if let Token::Sequence(seq) = tok {
let items = seq.iter().map(Token::merged).flat_map(|t| match t {
Token::Sequence(seq) => Either::Left(seq.into_iter()),
other => Either::Right(std::iter::once(other)),
});
for item in items {
if let Some(Token::PlainText(last)) = acc.last_mut() {
if let Token::PlainText(tok_text) = item {
*last += tok_text.as_ref();
continue;
}
}
acc.push(item);
}
return acc;
}
acc.push(tok.merged());
acc
});
if tokens_multi.len() == 1 {
return tokens_multi.into_iter().next().unwrap();
}
Token::Sequence(tokens_multi)
}
Token::Quote(inner) => Token::Quote(inner.iter().map(Token::merged).collect()),
Token::Small(inner) => Token::Small(inner.iter().map(Token::merged).collect()),
Token::Bold(inner) => Token::Bold(inner.iter().map(Token::merged).collect()),
Token::Italic(inner) => Token::Italic(inner.iter().map(Token::merged).collect()),
Token::Center(inner) => Token::Center(inner.iter().map(Token::merged).collect()),
Token::Strikethrough(inner) => Token::Strikethrough(inner.iter().map(Token::merged).collect()),
Token::Link { label, href } => Token::Link {
label: label.iter().map(Token::merged).collect(),
href: href.clone(),
},
Token::LinkNoEmbed { label, href } => Token::LinkNoEmbed {
label: label.iter().map(Token::merged).collect(),
href: href.clone(),
},
Token::Function {
name,
params,
inner,
} => Token::Function {
name: name.clone(),
params: params.clone(),
inner: inner.iter().map(Token::merged).collect(),
},
other => other.clone(),
}
}
pub fn walk_map_collect<T>(&self, func: &impl Fn(&Token) -> Option<T>, out: &mut Vec<T>) {
if let Some(v) = func(self) {
out.push(v)
}
match self {
Token::Sequence(items) => {
items.iter().for_each(|tok| tok.walk_map_collect(func, out));
}
Token::Quote(inner)
| Token::Small(inner)
| Token::Bold(inner)
| Token::Italic(inner)
| Token::Center(inner)
| Token::Function { inner, .. }
| Token::Link { label: inner, .. }
| Token::Strikethrough(inner) => {
inner.iter().for_each(|tok| tok.walk_map_collect(func, out))
}
_ => {}
}
}
pub fn walk_speech_transform(&mut self, func: &impl Fn(&mut Cow<'_, str>)) {
match self {
Token::Sequence(items) => {
items
.iter_mut()
.for_each(|tok| tok.walk_speech_transform(func));
}
Token::Small(inner)
| Token::Bold(inner)
| Token::Italic(inner)
| Token::Center(inner)
| Token::Function { inner, .. }
| Token::Strikethrough(inner) => {
inner.iter_mut().for_each(|tok| tok.walk_speech_transform(func))
}
Token::PlainText(text) => func(text),
_ => {}
}
}
}
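The two public walkers are the intended read and rewrite entry points for consumers of a parsed tree. A minimal sketch of their use, written as if it lived inside this crate (illustrative only, not part of the commit):

// Illustrative only: collect every hashtag, then upper-case all plain text in place.
fn collect_hashtags_and_shout(tree: &mut Token) -> Vec<String> {
    let mut tags = Vec::new();
    tree.walk_map_collect(
        &|tok| match tok {
            Token::Hashtag(tag) => Some(tag.clone()),
            _ => None,
        },
        &mut tags,
    );

    // walk_speech_transform visits every PlainText leaf mutably.
    tree.walk_speech_transform(&|text| {
        *text = Cow::Owned(text.to_uppercase());
    });

    tags
}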


@@ -0,0 +1,157 @@
use crate::types::{Effect, Input, Parser, ParserCont, ParserRet, State};
fn line_start<'a>(
state: &mut State,
inp: &mut impl Input<'a>,
_output: &'_ mut impl FnMut(Effect<'a>),
cont: impl ParserCont,
) -> ParserRet {
match inp.view().as_bytes() {
[b'>', b' ', ..] => cont.continue_with2((line_start, quote)),
[b'`', b'`', b'`', ..] => cont.continue_with(CodeBlock {}),
[b'\\', b'[', ..] => cont.continue_with(BlockMath {}),
[b'<', b'c', b'e', b'n', b't', b'e', b'r', b'>', ..] => cont.continue_with2((inline, center_tag_end)),
_ => cont.continue_with(inline)
}
}
fn inline<'a>(
state: &mut State,
inp: &mut impl Input<'a>,
_output: &'_ mut impl FnMut(Effect<'a>),
cont: impl ParserCont,
) -> ParserRet {
match inp.view().as_bytes() {
[b'\n', ..] => return cont.continue_with(line_start),
[b'<', b'b', b'>', ..] => return cont.continue_with(inline),
[b'<', b's', b'>', ..] => return cont.continue_with(inline),
[b'<', b'i', b'>', ..] => return cont.continue_with(inline),
[b'<', b'p', b'l', b'a', b'i', b'n', b'>', ..] => return cont.continue_with(inline),
[b'<', b's', b'm', b'a', b'l', b'l', b'>', ..] => return cont.continue_with(inline),
[b'*', b'*', ..] => return cont.continue_with(inline),
[b'_', b'_', ..] => return cont.continue_with(inline),
[b'*', ..] => return cont.continue_with(inline),
[b'_', ..] => return cont.continue_with(inline),
[b'~', b'~', ..] => return cont.continue_with(inline),
[b'`', ..] => return cont.continue_with(inline),
[b'\\', b'(', ..] => return cont.continue_with(inline),
_ => cont.continue_with(text_or_emoji),
}
}
fn text_or_emoji<'a>(
state: &mut State,
input: &mut impl Input<'a>,
output: &'_ mut impl FnMut(Effect<'a>),
cont: impl ParserCont,
) -> ParserRet {
let Some(view) = input.next() else {
return;
};
let emoji_str = view.trim_end_matches(['\u{200c}', '\u{200d}']);
if let Some(_) = emojis::get(emoji_str) {
output(Effect::Output(emoji_str));
return;
};
output(Effect::Output(view));
}
fn block_quote_end<'a>(
state: &mut State,
inp: &mut impl Input<'a>,
_output: &'_ mut impl FnMut(Effect<'a>),
cont: impl ParserCont,
) -> ParserRet { todo!() }
fn code_block_end<'a>(
state: &mut State,
inp: &mut impl Input<'a>,
_output: &'_ mut impl FnMut(Effect<'a>),
cont: impl ParserCont,
) -> ParserRet { todo!() }
fn block_math_end<'a>(
state: &mut State,
inp: &mut impl Input<'a>,
_output: &'_ mut impl FnMut(Effect<'a>),
cont: impl ParserCont,
) -> ParserRet { todo!() }
fn center_tag_end<'a>(
state: &mut State,
inp: &mut impl Input<'a>,
_output: &'_ mut impl FnMut(Effect<'a>),
cont: impl ParserCont,
) -> ParserRet { todo!() }
#[derive(Copy, Clone)]
enum TagInlineKind {
TagSmall,
TagPlain,
TagBold,
TagItalic,
TagStrikethrough,
}
struct TagInline {
kind: TagInlineKind,
}
impl Parser for TagInline {}
fn inline_math_end<'a>(
state: &mut State,
inp: &mut impl Input<'a>,
_output: &'_ mut impl FnMut(Effect<'a>),
cont: impl ParserCont,
) -> ParserRet { todo!() }
fn inline_code_end<'a>(
state: &mut State,
inp: &mut impl Input<'a>,
_output: &'_ mut impl FnMut(Effect<'a>),
cont: impl ParserCont,
) -> ParserRet { todo!() }
struct Url {}
impl Parser for Url {
fn take<'a>(
&mut self,
state: &mut State,
input: &mut impl Input<'a>,
output: &'_ mut impl FnMut(Effect<'a>),
cont: impl ParserCont,
) -> ParserRet {
todo!()
}
}
#[inline]
fn url_chars_base<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
alt((
recognize(tuple((
tag("["),
many_till(
self.increase_nesting(self.partial_span(Self::url_chars_base)),
tag("]"),
),
))),
recognize(tuple((
tag("("),
many_till(
self.increase_nesting(self.partial_span(Self::url_chars_base)),
tag(")"),
),
))),
recognize(tuple((
not(satisfy(char::is_control)),
not(satisfy(char::is_whitespace)),
not(one_of(")]>")),
anychar,
))),
))(input)
}


@@ -1,17 +1,8 @@
 #![cfg(test)]
 use std::collections::HashMap;
-use nom::bytes::complete::tag;
-use crate::{xml_write::to_xml_string, Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT};
-fn parse_full(string: &str) -> Token {
-    Context::default()
-        .full(Span::new_extra(string, SpanMeta::default()))
-        .unwrap()
-        .1
-        .merged()
-}
+use crate::output_types::{MentionType, Token};
+use crate::{parse_full, xml_write::to_xml_string};
 #[test]
 fn parse_empty() {
@@ -27,9 +18,9 @@ fn parse_url_chars() {
             "https://en.wikipedia.org/wiki/Sandbox_(computer_security))",
             SpanMeta::default(),
         ))
         .unwrap()
         .1
         .into_fragment(),
         "https://en.wikipedia.org/wiki/Sandbox_(computer_security)"
     );
@@ -60,9 +51,9 @@ fn parse_url_chars() {
             "https://cs.wikipedia.org/wiki/Among Us )",
             SpanMeta::default(),
         ))
         .unwrap()
         .1
         .into_fragment(),
         "https://cs.wikipedia.org/wiki/Among Us"
     );
@@ -71,9 +62,9 @@ fn parse_url_chars() {
             "https://en.wikipedia.org/wiki/Among Us )",
             SpanMeta::default(),
         ))
         .unwrap()
         .1
         .into_fragment(),
         "https://en.wikipedia.org/wiki/Among"
     );
 }
@@ -82,17 +73,17 @@ fn parse_url_chars() {
 fn parse_formatting() {
     assert_eq!(
         parse_full(r#"~~stikethrough~~"#),
-        Token::Strikethrough(Box::new(Token::PlainText("stikethrough".into()))),
+        Token::Strikethrough(vec![Token::PlainText("stikethrough".into())]),
     );
     assert_eq!(
         parse_full(r#"**bold**"#),
-        Token::Bold(Box::new(Token::PlainText("bold".into()))),
+        Token::Bold(vec![Token::PlainText("bold".into())]),
     );
     assert_eq!(
         parse_full(r#"*italic*"#),
-        Token::Italic(Box::new(Token::PlainText("italic".into()))),
+        Token::Italic(vec![Token::PlainText("italic".into())]),
     );
     assert_eq!(
@@ -109,7 +100,7 @@ fn parse_formatting() {
         parse_full("intra*word*italic"),
         Token::Sequence(vec![
             Token::PlainText("intra".into()),
-            Token::Italic(Box::new(Token::PlainText("word".into()))),
+            Token::Italic(vec![Token::PlainText("word".into())]),
             Token::PlainText("italic".into()),
         ])
     );
@@ -123,13 +114,13 @@ fn parse_formatting() {
         parse_full(r#"long text with a *footnote <b>text</b>"#),
         Token::Sequence(vec![
             Token::PlainText("long text with a *footnote ".into()),
-            Token::Bold(Box::new(Token::PlainText("text".into()))),
+            Token::Bold(vec![Token::PlainText("text".into())]),
         ])
     );
     assert_eq!(
         parse_full(r#"*"italic"*"#),
-        Token::Italic(Box::new(Token::PlainText("\"italic\"".into())))
+        Token::Italic(vec![Token::PlainText("\"italic\"".into())])
     );
     assert_eq!(
@@ -161,23 +152,23 @@ fn parse_formatting() {
     assert_eq!(
         parse_full(r#"***bold italic***"#),
-        Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText(
+        Token::Bold(vec![Token::Italic(vec![Token::PlainText(
             "bold italic".into()
-        )))))
+        )])])
     );
     assert_eq!(
         parse_full(r#"<b><i>bold italic</i></b>"#),
-        Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText(
+        Token::Bold(vec![Token::Italic(vec![Token::PlainText(
             "bold italic".into()
-        )))))
+        )])])
     );
     assert_eq!(
         parse_full("~~*hello\nworld*"),
         Token::Sequence(vec![
             Token::PlainText("~~".into()),
-            Token::Italic(Box::new(Token::PlainText("hello\nworld".into()))),
+            Token::Italic(vec![Token::PlainText("hello\nworld".into())]),
         ])
     )
 }
@@ -188,7 +179,7 @@ fn parse_flanking() {
         parse_full(r#"aaa*iii*bbb"#),
         Token::Sequence(vec![
             Token::PlainText("aaa".into()),
-            Token::Italic(Box::new(Token::PlainText("iii".into()))),
+            Token::Italic(vec![Token::PlainText("iii".into())]),
             Token::PlainText("bbb".into()),
         ])
     );
@@ -202,33 +193,33 @@ fn parse_flanking() {
         parse_full("aaa\n_iii_\nbbb"),
         Token::Sequence(vec![
             Token::PlainText("aaa\n".into()),
-            Token::Italic(Box::new(Token::PlainText("iii".into()))),
+            Token::Italic(vec![Token::PlainText("iii".into())]),
             Token::PlainText("\nbbb".into()),
         ])
     );
     assert_eq!(
         parse_full(r#"*iii*"#),
-        Token::Italic(Box::new(Token::PlainText("iii".into())))
+        Token::Italic(vec![Token::PlainText("iii".into())])
     );
     assert_eq!(
         parse_full(r#"_iii_"#),
-        Token::Italic(Box::new(Token::PlainText("iii".into())))
+        Token::Italic(vec![Token::PlainText("iii".into())])
     );
     assert_eq!(
         parse_full(r#"aaa*iii*"#),
         Token::Sequence(vec![
             Token::PlainText("aaa".into()),
-            Token::Italic(Box::new(Token::PlainText("iii".into()))),
+            Token::Italic(vec![Token::PlainText("iii".into())]),
         ])
     );
     assert_eq!(
         parse_full(r#"*iii*bbb"#),
         Token::Sequence(vec![
-            Token::Italic(Box::new(Token::PlainText("iii".into()))),
+            Token::Italic(vec![Token::PlainText("iii".into())]),
             Token::PlainText("bbb".into()),
         ])
     );
@@ -309,12 +300,12 @@ a^2 + b^2 = c^2
 🦋🏳️‍⚧️
 text</center>"#
         ),
-        Token::Center(Box::new(Token::Sequence(vec![
+        Token::Center(vec![
             Token::PlainText("centered\n".into()),
             Token::UnicodeEmoji("🦋".into()),
             Token::UnicodeEmoji("🏳️‍⚧️".into()),
             Token::PlainText("\ntext".into()),
-        ])))
+        ])
     );
     assert_eq!(
@@ -323,11 +314,11 @@ a^2 + b^2 = c^2
 > 👩🏽‍🤝‍👩🏼
 > text</center>"#
         ),
-        Token::Quote(Box::new(Token::Center(Box::new(Token::Sequence(vec![
+        Token::Quote(vec![Token::Center(vec![
             Token::PlainText("centered\n".into()),
             Token::UnicodeEmoji("👩🏽‍🤝‍👩🏼".into()),
             Token::PlainText("\ntext".into())
-        ]))))),
+        ])]),
     );
     assert_eq!(
@@ -335,11 +326,11 @@ a^2 + b^2 = c^2
         Token::Function {
             name: "x2".into(),
             params: HashMap::new(),
-            inner: Box::new(Token::Sequence(vec![
+            inner: vec![
                 Token::Function {
                     name: "sparkle".into(),
                     params: HashMap::new(),
-                    inner: Box::new(Token::UnicodeEmoji("🥺".into())),
+                    inner: vec![Token::UnicodeEmoji("🥺".into())],
                 },
                 Token::UnicodeEmoji("💜".into()),
                 Token::Function {
@@ -350,10 +341,10 @@ a^2 + b^2 = c^2
                         params.insert("speed".into(), Some("5s".into()));
                         params
                     },
-                    inner: Box::new(Token::UnicodeEmoji("❤️".into())),
+                    inner: vec![Token::UnicodeEmoji("❤️".into())],
                 },
                 Token::UnicodeEmoji("🦊".into()),
-            ]))
+            ]
         },
     );
@@ -362,13 +353,13 @@ a^2 + b^2 = c^2
         Token::Sequence(vec![
             Token::PlainText("<b>bold ".into()),
             Token::Mention {
-                mention_type: crate::MentionType::User,
+                mention_type: MentionType::User,
                 name: "tag1".into(),
                 host: None
             },
             Token::PlainText(" <i> ".into()),
             Token::Mention {
-                mention_type: crate::MentionType::User,
+                mention_type: MentionType::User,
                 name: "tag2".into(),
                 host: None
             },
@@ -386,11 +377,11 @@ a^2 + b^2 = c^2
 >> Nested quote
 "#
         ),
-        Token::Quote(Box::new(Token::Sequence(vec![
+        Token::Quote(vec![
             Token::PlainText("test\n".into()),
-            Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))),
-            Token::Quote(Box::new(Token::PlainText("Nested quote".into())))
-        ]))),
+            Token::Italic(vec![Token::PlainText("\nitalic\n".into())]),
+            Token::Quote(vec![Token::PlainText("Nested quote".into())])
+        ]),
     );
 }
@@ -442,9 +433,8 @@ fn parse_link() {
         Token::Sequence(vec![
             Token::PlainText("Link test: ".into()),
             Token::Link {
-                label: Box::new(Token::PlainText("label".into())),
-                href: "https://example.com".into(),
-                embed: true,
+                label: vec![Token::PlainText("label".into())],
+                href: "https://example.com".into()
             },
         ])
     );
@@ -481,10 +471,9 @@ fn parse_link() {
         parse_full("Link test: ?[label](https://awawa.gay)"),
         Token::Sequence(vec![
             Token::PlainText("Link test: ".into()),
-            Token::Link {
-                label: Box::new(Token::PlainText("label".into())),
+            Token::LinkNoEmbed {
+                label: vec![Token::PlainText("label".into())],
                 href: "https://awawa.gay".into(),
-                embed: false,
             },
         ])
     );
@@ -493,10 +482,9 @@ fn parse_link() {
         parse_full("Link test: ?[label](https://awawa.gay)test"),
         Token::Sequence(vec![
             Token::PlainText("Link test: ".into()),
-            Token::Link {
-                label: Box::new(Token::PlainText("label".into())),
+            Token::LinkNoEmbed {
+                label: vec![Token::PlainText("label".into())],
                 href: "https://awawa.gay".into(),
-                embed: false,
             },
             Token::PlainText("test".into()),
         ])
@@ -506,10 +494,9 @@ fn parse_link() {
         parse_full("Link test: (?[label](https://awawa.gay))"),
         Token::Sequence(vec![
             Token::PlainText("Link test: (".into()),
-            Token::Link {
-                label: Box::new(Token::PlainText("label".into())),
+            Token::LinkNoEmbed {
+                label: vec![Token::PlainText("label".into())],
                 href: "https://awawa.gay".into(),
-                embed: false,
             },
             Token::PlainText(")".into()),
         ])
@@ -546,7 +533,7 @@ fn parse_mention() {
     assert_eq!(
         parse_full("@tag"),
         Token::Mention {
-            mention_type: crate::MentionType::User,
+            mention_type: MentionType::User,
             name: "tag".into(),
             host: None,
         }
@@ -562,7 +549,7 @@ fn parse_mention() {
         Token::Sequence(vec![
             Token::PlainText("hgsjlkdsa ".into()),
             Token::Mention {
-                mention_type: crate::MentionType::User,
+                mention_type: MentionType::User,
                 name: "tag".into(),
                 host: None,
             },
@@ -575,7 +562,7 @@ fn parse_mention() {
         Token::Sequence(vec![
             Token::PlainText("hgsjlkdsa ".into()),
             Token::Mention {
-                mention_type: crate::MentionType::User,
+                mention_type: MentionType::User,
                 name: "tag".into(),
                 host: None,
             },
@@ -588,7 +575,7 @@ fn parse_mention() {
         Token::Sequence(vec![
             Token::PlainText("aaaa ".into()),
             Token::Mention {
-                mention_type: crate::MentionType::User,
+                mention_type: MentionType::User,
                 name: "tag".into(),
                 host: Some("domain".into()),
             },
@@ -601,7 +588,7 @@ fn parse_mention() {
         Token::Sequence(vec![
             Token::PlainText("test ".into()),
             Token::Mention {
-                mention_type: crate::MentionType::User,
+                mention_type: MentionType::User,
                 name: "tag".into(),
                 host: Some("domain".into()),
             },
@@ -614,7 +601,7 @@ fn parse_mention() {
         Token::Sequence(vec![
             Token::PlainText("test ".into()),
             Token::Mention {
-                mention_type: crate::MentionType::User,
+                mention_type: MentionType::User,
                 name: "tag".into(),
                 host: Some("domain.gay".into()),
             },
@@ -627,7 +614,7 @@ fn parse_mention() {
         Token::Sequence(vec![
             Token::PlainText("test ".into()),
             Token::Mention {
-                mention_type: crate::MentionType::User,
+                mention_type: MentionType::User,
                 name: "tag".into(),
                 host: Some("domain".into()),
             },
@@ -640,7 +627,7 @@ fn parse_mention() {
         Token::Sequence(vec![
             Token::PlainText("test ".into()),
             Token::Mention {
-                mention_type: crate::MentionType::Community,
+                mention_type: MentionType::Community,
                 name: "tag".into(),
                 host: Some("domain.com".into()),
             },
@@ -651,7 +638,7 @@ fn parse_mention() {
     assert_eq!(
         parse_full("@tag:domain.com"),
         Token::Mention {
-            mention_type: crate::MentionType::MatrixUser,
+            mention_type: MentionType::MatrixUser,
            name: "tag".into(),
            host: Some("domain.com".into())
         },
@@ -758,20 +745,10 @@ fn xml_serialization() {
         &to_xml_string(&parse_full(
             "@natty $[spin.speed=0.5s 🥺]:cat_attack: <plain>test</plain>"
         ))
         .unwrap(),
         r#"<mmm><mention name="natty" type="user"/> <fn name="spin" arg-speed="0.5s"><ue>🥺</ue></fn><ee>cat_attack</ee> test</mmm>"#
     );
-    assert_eq!(
-        &to_xml_string(&parse_full(
-            "Ring Galaxy AM 0644 741 from Hubble\nCredits: AURA, STScI, J. Higdon, Cornell, ESA, #NASA\n#nature #space #astrophotography"
-        ))
-        .unwrap(),
-        r#"<mmm>Ring Galaxy AM 0644 741 from Hubble
-Credits: AURA, STScI, J. Higdon, Cornell, ESA, <hashtag>NASA</hashtag>
-<hashtag>nature</hashtag> <hashtag>space</hashtag> <hashtag>astrophotography</hashtag></mmm>"#
-    );
     assert_eq!(
         &to_xml_string(&parse_full(
             r#"
@@ -779,7 +756,7 @@ Credits: AURA, STScI, J. Higdon, Cornell, ESA, <hashtag>NASA</hashtag>
 var x = undefined;
 ``` "#
         ))
         .unwrap(),
         "<mmm><code lang=\"js\">var x = undefined;</code></mmm>"
     );
 }

types.rs (new file)

@@ -0,0 +1,120 @@
use unicode_segmentation::{Graphemes, UnicodeSegmentation};
#[derive(Debug, Copy, Clone)]
pub(crate) struct ParseSpan<'a> {
pub(crate) source: &'a str,
pub(crate) offset: usize,
pub(crate) length: usize,
}
impl ParseSpan<'_> {
pub(crate) fn concat(self, other: Self) -> Option<Self> {
if self.source != other.source {
panic!("Attempted to concat slices from different strings");
}
if self.offset + self.length != other.offset {
return None;
}
Some(ParseSpan {
source: self.source,
offset: self.offset,
length: self.length + other.length,
})
}
pub(crate) fn spanned_source(&self) -> &str {
&self.source[self.offset..self.offset + self.length]
}
}
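A brief sketch of the intended concat contract (illustrative, not part of the committed file): spans over the same source join only when they touch end to start.

// Illustrative only: adjacent spans concatenate, gapped spans do not.
fn concat_demo() {
    let src = "hello world";
    let hello = ParseSpan { source: src, offset: 0, length: 5 };
    let space = ParseSpan { source: src, offset: 5, length: 1 };
    let world = ParseSpan { source: src, offset: 6, length: 5 };

    let joined = hello.concat(space).expect("adjacent spans concatenate");
    assert_eq!(joined.spanned_source(), "hello ");

    // `world` does not start where `hello` ends (6 != 0 + 5), so this is None.
    assert!(hello.concat(world).is_none());
}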
pub(crate) struct TokStream<'a>(ParseSpan<'a>, Graphemes<'a>);
impl<'a> From<&'a str> for TokStream<'a> {
fn from(source: &'a str) -> Self {
TokStream(
ParseSpan {
source,
length: source.len(),
offset: 0,
},
source.graphemes(true),
)
}
}
pub(crate) trait Input<'a> {
fn next(&mut self) -> Option<&'a str>;
fn view(&self) -> &'a str;
}
impl<'a> Input<'a> for TokStream<'a> {
#[inline]
fn next(&mut self) -> Option<&'a str> {
if let Some(p) = self.1.next() {
let length = p.len();
self.0.offset += length;
self.0.length -= length;
return Some(p);
}
None
}
#[inline]
fn view(&self) -> &'a str {
&self.0.source[self.0.offset..self.0.offset + self.0.length]
}
}
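The stream hands out one grapheme cluster at a time while view() always shows the unconsumed rest of the input; a short sketch of that contract (illustrative, not part of the committed file):

// Illustrative only: TokStream yields grapheme clusters and keeps view() in sync.
fn tok_stream_demo() {
    let mut stream = TokStream::from("a🏳️‍⚧️b");

    assert_eq!(stream.next(), Some("a"));
    // The flag is a single extended grapheme cluster, so it comes out in one piece.
    assert_eq!(stream.next(), Some("🏳️‍⚧️"));
    // Everything not yet consumed stays visible through view().
    assert_eq!(stream.view(), "b");
    assert_eq!(stream.next(), Some("b"));
    assert_eq!(stream.next(), None);
}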
#[derive(Debug, Copy, Clone)]
pub(crate) struct Lex<'a> {
pub(crate) token: &'a str,
pub(crate) span: ParseSpan<'a>,
}
pub(crate) type OutTok<'a> = Lex<'a>;
pub(crate) const MAX_DEPTH: usize = 24;
#[derive(Debug, Default, Clone, Copy)]
pub(crate) struct State {
pub(crate) depth: usize,
}
pub(crate) enum Effect<'a> {
Output(OutTok<'a>)
}
#[must_use]
pub(crate) struct ParserRet {
_private: (),
}
pub(crate) trait ParserCont {
fn continue_with(self, to: impl Parser) -> ParserRet;
fn continue_with2(self, to: (impl Parser, impl Parser)) -> ParserRet;
}
pub(crate) trait Parser {
fn take<'a>(
&mut self,
state: &mut State,
input: &mut impl Input<'a>,
handler: &'_ mut impl FnMut(Effect<'a>),
visitor: impl ParserCont,
) -> ParserRet;
}
impl<I, F, V> Parser for fn(&mut State, &mut I, &'_ mut F, V) -> ParserRet {
fn take<'a>(
&mut self,
state: &mut State,
input: &mut impl Input<'a>,
handler: &'_ mut impl FnMut(Effect<'a>),
visitor: impl ParserCont,
) -> ParserRet {
self(state, input, handler, visitor)
}
}

xml_write.rs

@@ -1,9 +1,8 @@
 use std::io::{Cursor, Write};
+use crate::output_types::Token;
 use quick_xml::events::{BytesText, Event};
-use crate::Token;
 impl Token {
     fn write<T: Write>(&self, writer: &mut quick_xml::Writer<T>) -> quick_xml::Result<()> {
         match self {