From f71429bfe08af6b72e1a1ba47bc8217723391e6a Mon Sep 17 00:00:00 2001 From: Natty Date: Thu, 21 Nov 2024 16:06:25 +0100 Subject: [PATCH] Restructured MMM parser and updated URL parsing --- magnetar_mmm_parser/src/lib.rs | 987 +-------------------------- magnetar_mmm_parser/src/test.rs | 785 +++++++++++++++++++++ magnetar_mmm_parser/src/xml_write.rs | 156 +++++ 3 files changed, 969 insertions(+), 959 deletions(-) create mode 100644 magnetar_mmm_parser/src/test.rs create mode 100644 magnetar_mmm_parser/src/xml_write.rs diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 18c6fa5..5face12 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -1,6 +1,8 @@ +mod test; +mod xml_write; + use std::collections::HashMap; use std::convert::{identity, Infallible}; -use std::io::{Cursor, Write}; use std::marker::PhantomData; use compact_str::{CompactString, ToCompactString}; @@ -17,7 +19,6 @@ use nom::multi::{many0_count, many1, many1_count, many_till, separated_list1}; use nom::sequence::tuple; use nom::{IResult, Offset, Parser, Slice}; use nom_locate::LocatedSpan; -use quick_xml::events::{BytesText, Event}; use serde::{Deserialize, Serialize}; use strum::IntoStaticStr; use tracing::trace; @@ -269,155 +270,6 @@ impl Token { _ => {} } } - - fn write(&self, writer: &mut quick_xml::Writer) -> quick_xml::Result<()> { - match self { - Token::PlainText(plain) => { - writer.write_event(Event::Text(BytesText::new(plain.as_str())))?; - } - Token::Sequence(sequence) => { - sequence.iter().try_for_each(|item| item.write(writer))?; - } - Token::Quote(inner) => { - writer - .create_element("quote") - .write_inner_content(|w| inner.write(w))?; - } - Token::Small(inner) => { - writer - .create_element("small") - .write_inner_content(|w| inner.write(w))?; - } - Token::Bold(inner) => { - writer - .create_element("b") - .write_inner_content(|w| inner.write(w))?; - } - Token::Italic(inner) => { - writer - .create_element("i") - 
.write_inner_content(|w| inner.write(w))?; - } - Token::Center(inner) => { - writer - .create_element("center") - .write_inner_content(|w| inner.write(w))?; - } - Token::Strikethrough(inner) => { - writer - .create_element("s") - .write_inner_content(|w| inner.write(w))?; - } - Token::PlainTag(plain) => { - writer.write_event(Event::Text(BytesText::new(plain.as_str())))?; - } - Token::InlineCode(code) => { - writer - .create_element("inline-code") - .write_text_content(BytesText::new(code))?; - } - Token::InlineMath(math) => { - writer - .create_element("inline-math") - .write_text_content(BytesText::new(math))?; - } - Token::UrlRaw(url) => { - writer - .create_element("a") - .with_attribute(("href", url.as_str())) - .write_text_content(BytesText::new(url))?; - } - Token::UrlNoEmbed(url) => { - writer - .create_element("a") - .with_attribute(("href", url.as_str())) - .with_attribute(("embed", "false")) - .write_text_content(BytesText::new(url))?; - } - Token::Link { label, href, embed } => { - writer - .create_element("a") - .with_attribute(("href", href.as_str())) - .with_attribute(("embed", if *embed { "true" } else { "false" })) - .write_inner_content(|w| label.write(w))?; - } - Token::BlockCode { inner, lang } => { - let mut ew = writer.create_element("code"); - - if let Some(language) = lang { - ew = ew.with_attribute(("lang", language.as_str())); - } - - ew.write_text_content(BytesText::new(inner))?; - } - Token::BlockMath(math) => { - writer - .create_element("math") - .write_text_content(BytesText::new(math))?; - } - Token::Function { - inner, - name, - params, - } => { - let mut ew = writer - .create_element("fn") - .with_attribute(("name", name.as_str())); - - for (k, v) in params { - ew = ew - .with_attribute((format!("arg-{k}").as_str(), v.as_deref().unwrap_or(""))); - } - - ew.write_inner_content(|w| inner.write(w))?; - } - Token::Mention { - name, - host, - mention_type, - } => { - let mut ew = writer - .create_element("mention") - 
.with_attribute(("name", name.as_str())) - .with_attribute(("type", mention_type.into())); - - if let Some(host) = host { - ew = ew.with_attribute(("host", host.as_str())); - } - - ew.write_empty()?; - } - Token::UnicodeEmoji(text) => { - writer - .create_element("ue") - .write_text_content(BytesText::new(text))?; - } - Token::ShortcodeEmoji { shortcode, host } => { - let mut ew = writer.create_element("ee"); - - if let Some(host) = host { - ew = ew.with_attribute(("host", host.as_str())); - } - - ew.write_text_content(BytesText::new(shortcode))?; - } - Token::Hashtag(tag) => { - writer - .create_element("hashtag") - .write_text_content(BytesText::new(tag.as_str()))?; - } - } - - Ok(()) - } -} - -pub fn to_xml_string(token: &Token) -> quick_xml::Result { - let mut writer = quick_xml::Writer::new(Cursor::new(Vec::new())); - writer - .create_element("mmm") - .write_inner_content(|writer| token.write(writer))?; - Ok(String::from_utf8(writer.into_inner().into_inner())?) } pub fn janky_is_line_begin(input: Span<'_>) -> bool { @@ -426,12 +278,13 @@ pub fn janky_is_line_begin(input: Span<'_>) -> bool { // VERY BAD // Safety: This is very janky, but hopefully will work as long as nom-locate keeps the invariant of fragments being subslices of the input // We do this to avoid scanning the entire input for a line separator when we just need the previous byte - offset == 0 || unsafe { - let frag_bytes = input.fragment().as_bytes(); - let frag_ptr = frag_bytes.as_ptr(); - let prev_byte = frag_ptr.offset(-1); - matches!(*prev_byte, b'\n') - } + offset == 0 + || unsafe { + let frag_bytes = input.fragment().as_bytes(); + let frag_ptr = frag_bytes.as_ptr(); + let prev_byte = frag_ptr.offset(-1); + matches!(*prev_byte, b'\n') + } } #[derive(Debug, Default, Copy, Clone)] @@ -477,14 +330,14 @@ fn boxing_token(func: impl Fn(Box) -> Token) -> impl Fn(Token) -> Token { fn collect_sequence( func: impl Fn(Vec) -> Token, transform: impl Fn(Token) -> Token, -) -> impl Fn(&mut dyn Iterator) 
-> Token { +) -> impl Fn(&mut dyn Iterator) -> Token { move |tokens| transform(func(tokens.collect())) } #[inline] fn collect_char_sequence( func: impl Fn(String) -> Token, -) -> impl Fn(&mut dyn Iterator) -> Token { +) -> impl Fn(&mut dyn Iterator) -> Token { move |chars| func(chars.collect()) } @@ -586,7 +439,7 @@ fn space(input: Span) -> IResult { #[derive(Copy, Clone)] struct Matcher<'a, 'b, T: Clone> { matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult, T> + 'a), - collector: &'a (dyn Fn(&mut dyn Iterator) -> Token + 'a), + collector: &'a (dyn Fn(&mut dyn Iterator) -> Token + 'a), _phantom_closure: PhantomData<&'a ()>, _phantom_data: PhantomData<&'b ()>, _phantom_output: PhantomData T>, @@ -595,7 +448,7 @@ struct Matcher<'a, 'b, T: Clone> { impl<'a, 'b, T: Clone> Matcher<'a, 'b, T> { fn new( matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult, T> + 'a), - collector: &'a (dyn Fn(&mut dyn Iterator) -> Token + 'a), + collector: &'a (dyn Fn(&mut dyn Iterator) -> Token + 'a), ) -> Self { Self { matcher_inner, @@ -634,7 +487,7 @@ struct FlankingDelim<'a, T: Fn(Span<'a>) -> IResult, Span<'a>>>( ); impl<'a, T: Fn(Span<'a>) -> IResult, Span<'a>>> From<(T, FlankingRule)> -for FlankingDelim<'a, T> + for FlankingDelim<'a, T> { fn from((func, rule): (T, FlankingRule)) -> Self { FlankingDelim(func, rule, PhantomData) @@ -753,7 +606,7 @@ impl Context { )), eof, ) - .map(|v| v.0), + .map(|v| v.0), Token::Sequence, )(input) } @@ -768,7 +621,7 @@ impl Context { )), eof, ) - .map(|v| v.0), + .map(|v| v.0), Token::Sequence, )(input) } @@ -895,10 +748,10 @@ impl Context { if quote_lines.len() == 1 && quote_lines - .iter() - .map(Span::fragment) - .copied() - .any(&str::is_empty) + .iter() + .map(Span::fragment) + .copied() + .any(&str::is_empty) { return fail(input); } @@ -1539,8 +1392,7 @@ impl Context { } else { mention_type }; - let host = - host_opt.map(|(_, name)| name.trim_end_matches(['.', '-', '_'])); + let host = host_opt.map(|(_, name)| name.trim_end_matches(['.', '-', '_'])); 
let input = host.map(|c| before.slice(c.len() + 1..)).unwrap_or(before); Ok(( @@ -1647,7 +1499,6 @@ impl Context { #[inline] fn url_chars_base<'a>(&self, input: Span<'a>) -> IResult, Span<'a>> { alt(( - alphanumeric1_unicode, recognize(tuple(( tag("["), many_till( @@ -1662,7 +1513,12 @@ impl Context { tag(")"), ), ))), - recognize(one_of(".,_/:%#$&?!~=+-@")), + recognize(tuple(( + not(satisfy(char::is_control)), + not(satisfy(char::is_whitespace)), + not(one_of(")]>")), + anychar, + ))), ))(input) } @@ -1688,790 +1544,3 @@ impl Context { } } } - -#[cfg(test)] -mod test { - use std::collections::HashMap; - - use nom::bytes::complete::tag; - - use crate::{to_xml_string, Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT}; - - fn parse_full(string: &str) -> Token { - Context::default() - .full(Span::new_extra(string, SpanMeta::default())) - .unwrap() - .1 - .merged() - } - - #[test] - fn parse_empty() { - assert_eq!(parse_full(""), Token::Sequence(vec![])); - } - - #[test] - fn parse_url_chars() { - let ctx = Context::default(); - - assert_eq!( - ctx.url_chars(tag(")"), true)(Span::new_extra( - "https://en.wikipedia.org/wiki/Sandbox_(computer_security))", - SpanMeta::default(), - )) - .unwrap() - .1 - .into_fragment(), - "https://en.wikipedia.org/wiki/Sandbox_(computer_security)" - ); - - assert_eq!( - ctx.url_chars(tag(")"), true)(Span::new_extra( - "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))", - SpanMeta::default() - )) - .unwrap() - .1 - .into_fragment(), - "https://en.wikipedia.org/wiki/Sandbox_(computer_security)", - ); - - assert_eq!( - ctx.url_chars(tag(")"), true)(Span::new_extra( - "https://cs.wikipedia.org/wiki/Among_Us ", - SpanMeta::default() - )) - .unwrap() - .1 - .into_fragment(), - "https://cs.wikipedia.org/wiki/Among_Us", - ); - - assert_eq!( - ctx.url_chars(tag(")"), true)(Span::new_extra( - "https://cs.wikipedia.org/wiki/Among Us )", - SpanMeta::default(), - )) - .unwrap() - .1 - .into_fragment(), - 
"https://cs.wikipedia.org/wiki/Among Us" - ); - - assert_eq!( - ctx.url_chars(tag(")"), false)(Span::new_extra( - "https://en.wikipedia.org/wiki/Among Us )", - SpanMeta::default(), - )) - .unwrap() - .1 - .into_fragment(), - "https://en.wikipedia.org/wiki/Among" - ); - } - - #[test] - fn parse_formatting() { - assert_eq!( - parse_full(r#"~~stikethrough~~"#), - Token::Strikethrough(Box::new(Token::PlainText("stikethrough".into()))), - ); - - assert_eq!( - parse_full(r#"**bold**"#), - Token::Bold(Box::new(Token::PlainText("bold".into()))), - ); - - assert_eq!( - parse_full(r#"*italic*"#), - Token::Italic(Box::new(Token::PlainText("italic".into()))), - ); - - assert_eq!( - parse_full(r#"* italic *"#), - Token::PlainText("* italic *".into()) - ); - - assert_eq!( - parse_full("snake_case_variable"), - Token::PlainText("snake_case_variable".into()) - ); - - assert_eq!( - parse_full("intra*word*italic"), - Token::Sequence(vec![ - Token::PlainText("intra".into()), - Token::Italic(Box::new(Token::PlainText("word".into()))), - Token::PlainText("italic".into()), - ]) - ); - - assert_eq!( - parse_full(r#"_ italic *"#), - Token::PlainText("_ italic *".into()) - ); - - assert_eq!( - parse_full(r#"long text with a *footnote text"#), - Token::Sequence(vec![ - Token::PlainText("long text with a *footnote ".into()), - Token::Bold(Box::new(Token::PlainText("text".into()))), - ]) - ); - - assert_eq!( - parse_full(r#"*"italic"*"#), - Token::Italic(Box::new(Token::PlainText("\"italic\"".into()))) - ); - - assert_eq!( - parse_full(r#"not code `code` also not code"#), - Token::Sequence(vec![ - Token::PlainText("not code ".into()), - Token::InlineCode("code".into()), - Token::PlainText(" also not code".into()) - ]), - ); - - assert_eq!( - parse_full(r#"not code `code` also `not code"#), - Token::Sequence(vec![ - Token::PlainText("not code ".into()), - Token::InlineCode("code".into()), - Token::PlainText(" also `not code".into()) - ]), - ); - - assert_eq!( - parse_full(r#"not code `*not 
bold*` also not code"#), - Token::Sequence(vec![ - Token::PlainText("not code ".into()), - Token::InlineCode("*not bold*".into()), - Token::PlainText(" also not code".into()) - ]), - ); - - assert_eq!( - parse_full(r#"***bold italic***"#), - Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText("bold italic".into()))))) - ); - - assert_eq!( - parse_full(r#"bold italic"#), - Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText( - "bold italic".into() - ))))) - ); - - assert_eq!( - parse_full("~~*hello\nworld*"), - Token::Sequence(vec![ - Token::PlainText("~~".into()), - Token::Italic(Box::new(Token::PlainText("hello\nworld".into()))), - ]) - ) - } - - #[test] - fn parse_flanking() { - assert_eq!( - parse_full(r#"aaa*iii*bbb"#), - Token::Sequence(vec![ - Token::PlainText("aaa".into()), - Token::Italic(Box::new(Token::PlainText("iii".into()))), - Token::PlainText("bbb".into()), - ]) - ); - - assert_eq!( - parse_full(r#"aaa_nnn_bbb"#), - Token::PlainText("aaa_nnn_bbb".into()) - ); - - assert_eq!( - parse_full("aaa\n_iii_\nbbb"), - Token::Sequence(vec![ - Token::PlainText("aaa\n".into()), - Token::Italic(Box::new(Token::PlainText("iii".into()))), - Token::PlainText("\nbbb".into()), - ]) - ); - - assert_eq!( - parse_full(r#"*iii*"#), - Token::Italic(Box::new(Token::PlainText("iii".into()))) - ); - - assert_eq!( - parse_full(r#"_iii_"#), - Token::Italic(Box::new(Token::PlainText("iii".into()))) - ); - - assert_eq!( - parse_full(r#"aaa*iii*"#), - Token::Sequence(vec![ - Token::PlainText("aaa".into()), - Token::Italic(Box::new(Token::PlainText("iii".into()))), - ]) - ); - - assert_eq!( - parse_full(r#"*iii*bbb"#), - Token::Sequence(vec![ - Token::Italic(Box::new(Token::PlainText("iii".into()))), - Token::PlainText("bbb".into()), - ]) - ); - - assert_eq!( - parse_full(r#"aaa_nnn_"#), - Token::PlainText("aaa_nnn_".into()) - ); - - assert_eq!( - parse_full(r#"_nnn_bbb"#), - Token::PlainText("_nnn_bbb".into()) - ); - } - - #[test] - fn parse_long() { - 
parse_full(&"A".repeat(20000)); - - - parse_full(&"*A".repeat(20000)); - - parse_full(&"@A".repeat(20000)); - } - - #[test] - fn parse_complex() { - assert_eq!( - parse_full(r"\( nya^3 \)"), - Token::InlineMath(" nya^3 ".to_string()) - ); - - assert_eq!( - parse_full("\\( nya^3 \n \\)"), - Token::PlainText("\\( nya^3 \n \\)".into()) - ); - - assert_eq!( - parse_full(r"`AbstractProxyFactoryBean`"), - Token::InlineCode("AbstractProxyFactoryBean".to_string()) - ); - - assert_eq!( - parse_full("`let x = \n 5;`"), - Token::PlainText("`let x = \n 5;`".into()) - ); - - assert_eq!( - parse_full( - r#" -```js -var x = undefined; -```"# - ), - Token::BlockCode { - lang: Some("js".to_string()), - inner: "var x = undefined;".to_string(), - } - ); - - assert_eq!( - parse_full( - r" -\[ -a^2 + b^2 = c^2 -\]" - ), - Token::BlockMath("a^2 + b^2 = c^2".to_string()) - ); - - assert_eq!( - parse_full(r"\[ x^2 + y^2 = z^2 \]"), - Token::BlockMath("x^2 + y^2 = z^2".to_string()) - ); - - assert_eq!( - parse_full( - r#"
centered -🦋🏳️‍⚧️ -text
"# - ), - Token::Center(Box::new(Token::Sequence(vec![ - Token::PlainText("centered\n".into()), - Token::UnicodeEmoji("🦋".into()), - Token::UnicodeEmoji("🏳️‍⚧️".into()), - Token::PlainText("\ntext".into()), - ]))) - ); - - assert_eq!( - parse_full( - r#">
centered -> 👩🏽‍🤝‍👩🏼 -> text
"# - ), - Token::Quote(Box::new(Token::Center(Box::new(Token::Sequence(vec![ - Token::PlainText("centered\n".into()), - Token::UnicodeEmoji("👩🏽‍🤝‍👩🏼".into()), - Token::PlainText("\ntext".into()) - ]))))), - ); - - assert_eq!( - parse_full(r#"$[x2 $[sparkle 🥺]💜$[spin.y,speed=5s ❤️]🦊]"#), - Token::Function { - name: "x2".into(), - params: HashMap::new(), - inner: Box::new(Token::Sequence(vec![ - Token::Function { - name: "sparkle".into(), - params: HashMap::new(), - inner: Box::new(Token::UnicodeEmoji("🥺".into())), - }, - Token::UnicodeEmoji("💜".into()), - Token::Function { - name: "spin".into(), - params: { - let mut params = HashMap::new(); - params.insert("y".into(), None); - params.insert("speed".into(), Some("5s".into())); - params - }, - inner: Box::new(Token::UnicodeEmoji("❤️".into())), - }, - Token::UnicodeEmoji("🦊".into()), - ])) - }, - ); - - assert_eq!( - parse_full(r#"bold @tag1 @tag2 italic"#), - Token::Sequence(vec![ - Token::PlainText("bold ".into()), - Token::Mention { - mention_type: crate::MentionType::User, - name: "tag1".into(), - host: None - }, - Token::PlainText(" ".into()), - Token::Mention { - mention_type: crate::MentionType::User, - name: "tag2".into(), - host: None - }, - Token::PlainText(" italic".into()) - ]), - ); - - assert_eq!( - parse_full( - r#" -> test -> -> italic -> ->> Nested quote -"# - ), - Token::Quote(Box::new(Token::Sequence(vec![ - Token::PlainText("test\n".into()), - Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))), - Token::Quote(Box::new(Token::PlainText("Nested quote".into()))) - ]))), - ); - } - - #[test] - fn parse_link() { - assert_eq!( - parse_full("IPv4 test: "), - Token::Sequence(vec![ - Token::PlainText("IPv4 test: ".into()), - Token::UrlNoEmbed("https://0".into()), - ]) - ); - - assert_eq!( - parse_full("IPv4 test: "), - Token::Sequence(vec![ - Token::PlainText("IPv4 test: ".into()), - Token::UrlNoEmbed("https://127.0.0.1".into()), - ]) - ); - - assert_eq!( - parse_full("IPv6 test: "), - 
Token::Sequence(vec![ - Token::PlainText("IPv6 test: ".into()), - Token::UrlNoEmbed("https://[::2f:1]/nya".into()), - ]) - ); - - assert_eq!( - parse_full("IPv6 test: https://[::2f:1]/nya"), - Token::Sequence(vec![ - Token::PlainText("IPv6 test: ".into()), - Token::UrlRaw("https://[::2f:1]/nya".into()), - ]) - ); - - // IDNs - assert_eq!( - parse_full("IDN test: https://www.háčkyčárky.cz/"), - Token::Sequence(vec![ - Token::PlainText("IDN test: ".into()), - Token::UrlRaw("https://www.háčkyčárky.cz/".into()), - ]) - ); - - assert_eq!( - parse_full("Link test: [label](https://example.com)"), - Token::Sequence(vec![ - Token::PlainText("Link test: ".into()), - Token::Link { - label: Box::new(Token::PlainText("label".into())), - href: "https://example.com".into(), - embed: true, - }, - ]) - ); - - assert_eq!( - parse_full("test #hashtag tail"), - Token::Sequence(vec![ - Token::PlainText("test ".into()), - Token::Hashtag("hashtag".into()), - Token::PlainText(" tail".into()), - ]) - ); - - assert_eq!( - parse_full("not#hashtag tail"), - Token::PlainText("not#hashtag tail".into()) - ); - - assert_eq!( - parse_full(""), - Token::UrlNoEmbed("https://example.com".into()) - ); - - // Adjacent links okay - assert_eq!( - parse_full(""), - Token::Sequence(vec![ - Token::UrlNoEmbed("https://example.com/".into()), - Token::UrlNoEmbed("https://awawa.gay/".into()), - ]) - ); - - assert_eq!( - parse_full("Link test: ?[label](https://awawa.gay)"), - Token::Sequence(vec![ - Token::PlainText("Link test: ".into()), - Token::Link { - label: Box::new(Token::PlainText("label".into())), - href: "https://awawa.gay".into(), - embed: false, - }, - ]) - ); - - assert_eq!( - parse_full("Link test: ?[label](https://awawa.gay)test"), - Token::Sequence(vec![ - Token::PlainText("Link test: ".into()), - Token::Link { - label: Box::new(Token::PlainText("label".into())), - href: "https://awawa.gay".into(), - embed: false, - }, - Token::PlainText("test".into()), - ]) - ); - - assert_eq!( - 
parse_full("Link test: (?[label](https://awawa.gay))"), - Token::Sequence(vec![ - Token::PlainText("Link test: (".into()), - Token::Link { - label: Box::new(Token::PlainText("label".into())), - href: "https://awawa.gay".into(), - embed: false, - }, - Token::PlainText(")".into()), - ]) - ); - - assert_eq!( - parse_full("Link test: ?[label](https://awawa.gay"), // Missing closing bracket - Token::Sequence(vec![ - Token::PlainText("Link test: ?[label](".into()), - Token::UrlRaw("https://awawa.gay".into()), - ]) - ); - } - - #[test] - fn limit_nesting() { - let mut tok = Token::PlainText(" test ".into()); - for _ in 0..DEFAULT_DEPTH_LIMIT { - tok = Token::Bold(Box::new(tok)); - } - - assert_eq!( - parse_full( - &("".repeat(DEFAULT_DEPTH_LIMIT) - + " test " - + &*"".repeat(DEFAULT_DEPTH_LIMIT)) - ), - tok - ); - } - - #[test] - fn parse_mention() { - assert_eq!( - parse_full("@tag"), - Token::Mention { - mention_type: crate::MentionType::User, - name: "tag".into(), - host: None, - } - ); - - assert_eq!( - parse_full("email@notactuallyamenmtion.org"), - Token::PlainText("email@notactuallyamenmtion.org".into()) - ); - - assert_eq!( - parse_full("hgsjlkdsa @tag fgahjsdkd"), - Token::Sequence(vec![ - Token::PlainText("hgsjlkdsa ".into()), - Token::Mention { - mention_type: crate::MentionType::User, - name: "tag".into(), - host: None, - }, - Token::PlainText(" fgahjsdkd".into()), - ]) - ); - - assert_eq!( - parse_full("hgsjlkdsa @tag@ fgahjsdkd"), - Token::Sequence(vec![ - Token::PlainText("hgsjlkdsa ".into()), - Token::Mention { - mention_type: crate::MentionType::User, - name: "tag".into(), - host: None, - }, - Token::PlainText("@ fgahjsdkd".into()), - ]) - ); - - assert_eq!( - parse_full("aaaa @tag@domain bbbbb"), - Token::Sequence(vec![ - Token::PlainText("aaaa ".into()), - Token::Mention { - mention_type: crate::MentionType::User, - name: "tag".into(), - host: Some("domain".into()), - }, - Token::PlainText(" bbbbb".into()), - ]) - ); - - assert_eq!( - parse_full("test 
@tag@domain, test"), - Token::Sequence(vec![ - Token::PlainText("test ".into()), - Token::Mention { - mention_type: crate::MentionType::User, - name: "tag".into(), - host: Some("domain".into()), - }, - Token::PlainText(", test".into()), - ]) - ); - - assert_eq!( - parse_full("test @tag@domain.gay. test"), - Token::Sequence(vec![ - Token::PlainText("test ".into()), - Token::Mention { - mention_type: crate::MentionType::User, - name: "tag".into(), - host: Some("domain.gay".into()), - }, - Token::PlainText(". test".into()), - ]) - ); - - assert_eq!( - parse_full("test @tag@domain? test"), - Token::Sequence(vec![ - Token::PlainText("test ".into()), - Token::Mention { - mention_type: crate::MentionType::User, - name: "tag".into(), - host: Some("domain".into()), - }, - Token::PlainText("? test".into()), - ]) - ); - - assert_eq!( - parse_full("test !tag@domain.com test"), - Token::Sequence(vec![ - Token::PlainText("test ".into()), - Token::Mention { - mention_type: crate::MentionType::Community, - name: "tag".into(), - host: Some("domain.com".into()), - }, - Token::PlainText(" test".into()), - ]) - ); - - assert_eq!( - parse_full("@tag:domain.com"), - Token::Mention { - mention_type: crate::MentionType::MatrixUser, - name: "tag".into(), - host: Some("domain.com".into()) - }, - ); - } - - #[test] - fn parse_shortcodes() { - assert_eq!( - parse_full(":bottom:"), - Token::ShortcodeEmoji { - shortcode: "bottom".into(), - host: None, - } - ); - - assert_eq!( - parse_full(":bottom::blobfox:"), - Token::Sequence(vec![ - Token::ShortcodeEmoji { - shortcode: "bottom".into(), - host: None, - }, - Token::ShortcodeEmoji { - shortcode: "blobfox".into(), - host: None, - }, - ]) - ); - - assert_eq!( - parse_full(":bottom@magnetar.social:"), - Token::ShortcodeEmoji { - shortcode: "bottom".into(), - host: Some("magnetar.social".into()), - } - ); - - assert_eq!( - parse_full(":bottom:blobfox"), - Token::PlainText(":bottom:blobfox".into()) - ); - - assert_eq!( - 
parse_full("bottom:blobfox:"), - Token::PlainText("bottom:blobfox:".into()) - ); - } - - #[test] - fn parse_emoji() { - assert_eq!( - parse_full("🥺💜❤️🦊"), - Token::Sequence( - vec!["🥺", "💜", "❤️", "🦊"] - .into_iter() - .map(str::to_string) - .map(Token::UnicodeEmoji) - .collect::>() - ) - ); - - // Trans flag, ZWJ - assert_eq!( - parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}"), - Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}".into()) - ); - - assert_eq!( - parse_full("\u{0200d}\u{1f3f3}\u{0fe0f}"), - Token::Sequence(vec![ - Token::PlainText("\u{0200d}".into()), // ZWJ - Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag - ]) - ); - - // Trans flag, ZWNJ - assert_eq!( - parse_full("\u{1f3f3}\u{0fe0f}\u{0200c}\u{026a7}\u{0fe0f}"), - Token::Sequence(vec![ - Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag - Token::PlainText("\u{0200c}".into()), // ZWNJ - Token::UnicodeEmoji("\u{026a7}\u{0fe0f}".into()), // Trans symbol - ]) - ); - - assert_eq!( - parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{0200d}\u{0200d}"), - Token::Sequence(vec![ - Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag - Token::PlainText("\u{0200d}\u{0200d}\u{0200d}".into()), // ZWJ - ]) - ); - } - - #[test] - fn xml_serialization() { - assert_eq!( - &to_xml_string(&parse_full("***nyaaa***")).unwrap(), - r#"nyaaa"# - ); - - assert_eq!( - &to_xml_string(&parse_full( - "@natty $[spin.speed=0.5s 🥺]:cat_attack: test" - )) - .unwrap(), - r#" 🥺cat_attack test"# - ); - - assert_eq!( - &to_xml_string(&parse_full( - "Ring Galaxy AM 0644 741 from Hubble\nCredits: AURA, STScI, J. Higdon, Cornell, ESA, #NASA\n#nature #space #astrophotography" - )) - .unwrap(), - r#"Ring Galaxy AM 0644 741 from Hubble -Credits: AURA, STScI, J. 
Higdon, Cornell, ESA, NASA -nature space astrophotography"# - ); - - assert_eq!( - &to_xml_string(&parse_full( - r#" -```js -var x = undefined; -``` "# - )) - .unwrap(), - "var x = undefined;" - ); - } -} diff --git a/magnetar_mmm_parser/src/test.rs b/magnetar_mmm_parser/src/test.rs new file mode 100644 index 0000000..5f3e5da --- /dev/null +++ b/magnetar_mmm_parser/src/test.rs @@ -0,0 +1,785 @@ +#![cfg(test)] +use std::collections::HashMap; + +use nom::bytes::complete::tag; + +use crate::{xml_write::to_xml_string, Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT}; + +fn parse_full(string: &str) -> Token { + Context::default() + .full(Span::new_extra(string, SpanMeta::default())) + .unwrap() + .1 + .merged() +} + +#[test] +fn parse_empty() { + assert_eq!(parse_full(""), Token::Sequence(vec![])); +} + +#[test] +fn parse_url_chars() { + let ctx = Context::default(); + + assert_eq!( + ctx.url_chars(tag(")"), true)(Span::new_extra( + "https://en.wikipedia.org/wiki/Sandbox_(computer_security))", + SpanMeta::default(), + )) + .unwrap() + .1 + .into_fragment(), + "https://en.wikipedia.org/wiki/Sandbox_(computer_security)" + ); + + assert_eq!( + ctx.url_chars(tag(")"), true)(Span::new_extra( + "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))", + SpanMeta::default() + )) + .unwrap() + .1 + .into_fragment(), + "https://en.wikipedia.org/wiki/Sandbox_(computer_security)", + ); + + assert_eq!( + ctx.url_chars(tag(")"), true)(Span::new_extra( + "https://cs.wikipedia.org/wiki/Among_Us ", + SpanMeta::default() + )) + .unwrap() + .1 + .into_fragment(), + "https://cs.wikipedia.org/wiki/Among_Us", + ); + + assert_eq!( + ctx.url_chars(tag(")"), true)(Span::new_extra( + "https://cs.wikipedia.org/wiki/Among Us )", + SpanMeta::default(), + )) + .unwrap() + .1 + .into_fragment(), + "https://cs.wikipedia.org/wiki/Among Us" + ); + + assert_eq!( + ctx.url_chars(tag(")"), false)(Span::new_extra( + "https://en.wikipedia.org/wiki/Among Us )", + SpanMeta::default(), + )) + 
.unwrap() + .1 + .into_fragment(), + "https://en.wikipedia.org/wiki/Among" + ); +} + +#[test] +fn parse_formatting() { + assert_eq!( + parse_full(r#"~~stikethrough~~"#), + Token::Strikethrough(Box::new(Token::PlainText("stikethrough".into()))), + ); + + assert_eq!( + parse_full(r#"**bold**"#), + Token::Bold(Box::new(Token::PlainText("bold".into()))), + ); + + assert_eq!( + parse_full(r#"*italic*"#), + Token::Italic(Box::new(Token::PlainText("italic".into()))), + ); + + assert_eq!( + parse_full(r#"* italic *"#), + Token::PlainText("* italic *".into()) + ); + + assert_eq!( + parse_full("snake_case_variable"), + Token::PlainText("snake_case_variable".into()) + ); + + assert_eq!( + parse_full("intra*word*italic"), + Token::Sequence(vec![ + Token::PlainText("intra".into()), + Token::Italic(Box::new(Token::PlainText("word".into()))), + Token::PlainText("italic".into()), + ]) + ); + + assert_eq!( + parse_full(r#"_ italic *"#), + Token::PlainText("_ italic *".into()) + ); + + assert_eq!( + parse_full(r#"long text with a *footnote text"#), + Token::Sequence(vec![ + Token::PlainText("long text with a *footnote ".into()), + Token::Bold(Box::new(Token::PlainText("text".into()))), + ]) + ); + + assert_eq!( + parse_full(r#"*"italic"*"#), + Token::Italic(Box::new(Token::PlainText("\"italic\"".into()))) + ); + + assert_eq!( + parse_full(r#"not code `code` also not code"#), + Token::Sequence(vec![ + Token::PlainText("not code ".into()), + Token::InlineCode("code".into()), + Token::PlainText(" also not code".into()) + ]), + ); + + assert_eq!( + parse_full(r#"not code `code` also `not code"#), + Token::Sequence(vec![ + Token::PlainText("not code ".into()), + Token::InlineCode("code".into()), + Token::PlainText(" also `not code".into()) + ]), + ); + + assert_eq!( + parse_full(r#"not code `*not bold*` also not code"#), + Token::Sequence(vec![ + Token::PlainText("not code ".into()), + Token::InlineCode("*not bold*".into()), + Token::PlainText(" also not code".into()) + ]), + ); + + 
assert_eq!( + parse_full(r#"***bold italic***"#), + Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText( + "bold italic".into() + ))))) + ); + + assert_eq!( + parse_full(r#"bold italic"#), + Token::Bold(Box::new(Token::Italic(Box::new(Token::PlainText( + "bold italic".into() + ))))) + ); + + assert_eq!( + parse_full("~~*hello\nworld*"), + Token::Sequence(vec![ + Token::PlainText("~~".into()), + Token::Italic(Box::new(Token::PlainText("hello\nworld".into()))), + ]) + ) +} + +#[test] +fn parse_flanking() { + assert_eq!( + parse_full(r#"aaa*iii*bbb"#), + Token::Sequence(vec![ + Token::PlainText("aaa".into()), + Token::Italic(Box::new(Token::PlainText("iii".into()))), + Token::PlainText("bbb".into()), + ]) + ); + + assert_eq!( + parse_full(r#"aaa_nnn_bbb"#), + Token::PlainText("aaa_nnn_bbb".into()) + ); + + assert_eq!( + parse_full("aaa\n_iii_\nbbb"), + Token::Sequence(vec![ + Token::PlainText("aaa\n".into()), + Token::Italic(Box::new(Token::PlainText("iii".into()))), + Token::PlainText("\nbbb".into()), + ]) + ); + + assert_eq!( + parse_full(r#"*iii*"#), + Token::Italic(Box::new(Token::PlainText("iii".into()))) + ); + + assert_eq!( + parse_full(r#"_iii_"#), + Token::Italic(Box::new(Token::PlainText("iii".into()))) + ); + + assert_eq!( + parse_full(r#"aaa*iii*"#), + Token::Sequence(vec![ + Token::PlainText("aaa".into()), + Token::Italic(Box::new(Token::PlainText("iii".into()))), + ]) + ); + + assert_eq!( + parse_full(r#"*iii*bbb"#), + Token::Sequence(vec![ + Token::Italic(Box::new(Token::PlainText("iii".into()))), + Token::PlainText("bbb".into()), + ]) + ); + + assert_eq!( + parse_full(r#"aaa_nnn_"#), + Token::PlainText("aaa_nnn_".into()) + ); + + assert_eq!( + parse_full(r#"_nnn_bbb"#), + Token::PlainText("_nnn_bbb".into()) + ); +} + +#[test] +fn parse_long() { + parse_full(&"A".repeat(20000)); + + parse_full(&"*A".repeat(20000)); + + parse_full(&"@A".repeat(20000)); +} + +#[test] +fn parse_complex() { + assert_eq!( + parse_full(r"\( nya^3 \)"), + 
Token::InlineMath(" nya^3 ".to_string()) + ); + + assert_eq!( + parse_full("\\( nya^3 \n \\)"), + Token::PlainText("\\( nya^3 \n \\)".into()) + ); + + assert_eq!( + parse_full(r"`AbstractProxyFactoryBean`"), + Token::InlineCode("AbstractProxyFactoryBean".to_string()) + ); + + assert_eq!( + parse_full("`let x = \n 5;`"), + Token::PlainText("`let x = \n 5;`".into()) + ); + + assert_eq!( + parse_full( + r#" +```js +var x = undefined; +```"# + ), + Token::BlockCode { + lang: Some("js".to_string()), + inner: "var x = undefined;".to_string(), + } + ); + + assert_eq!( + parse_full( + r" +\[ +a^2 + b^2 = c^2 +\]" + ), + Token::BlockMath("a^2 + b^2 = c^2".to_string()) + ); + + assert_eq!( + parse_full(r"\[ x^2 + y^2 = z^2 \]"), + Token::BlockMath("x^2 + y^2 = z^2".to_string()) + ); + + assert_eq!( + parse_full( + r#"
centered + 🦋🏳️‍⚧️ + text
"# + ), + Token::Center(Box::new(Token::Sequence(vec![ + Token::PlainText("centered\n".into()), + Token::UnicodeEmoji("🦋".into()), + Token::UnicodeEmoji("🏳️‍⚧️".into()), + Token::PlainText("\ntext".into()), + ]))) + ); + + assert_eq!( + parse_full( + r#">
centered +> 👩🏽‍🤝‍👩🏼 +> text
"# + ), + Token::Quote(Box::new(Token::Center(Box::new(Token::Sequence(vec![ + Token::PlainText("centered\n".into()), + Token::UnicodeEmoji("👩🏽‍🤝‍👩🏼".into()), + Token::PlainText("\ntext".into()) + ]))))), + ); + + assert_eq!( + parse_full(r#"$[x2 $[sparkle 🥺]💜$[spin.y,speed=5s ❤️]🦊]"#), + Token::Function { + name: "x2".into(), + params: HashMap::new(), + inner: Box::new(Token::Sequence(vec![ + Token::Function { + name: "sparkle".into(), + params: HashMap::new(), + inner: Box::new(Token::UnicodeEmoji("🥺".into())), + }, + Token::UnicodeEmoji("💜".into()), + Token::Function { + name: "spin".into(), + params: { + let mut params = HashMap::new(); + params.insert("y".into(), None); + params.insert("speed".into(), Some("5s".into())); + params + }, + inner: Box::new(Token::UnicodeEmoji("❤️".into())), + }, + Token::UnicodeEmoji("🦊".into()), + ])) + }, + ); + + assert_eq!( + parse_full(r#"bold @tag1 @tag2 italic"#), + Token::Sequence(vec![ + Token::PlainText("bold ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag1".into(), + host: None + }, + Token::PlainText(" ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag2".into(), + host: None + }, + Token::PlainText(" italic".into()) + ]), + ); + + assert_eq!( + parse_full( + r#" +> test +> +> italic +> +>> Nested quote +"# + ), + Token::Quote(Box::new(Token::Sequence(vec![ + Token::PlainText("test\n".into()), + Token::Italic(Box::new(Token::PlainText("\nitalic\n".into()))), + Token::Quote(Box::new(Token::PlainText("Nested quote".into()))) + ]))), + ); +} + +#[test] +fn parse_link() { + assert_eq!( + parse_full("IPv4 test: "), + Token::Sequence(vec![ + Token::PlainText("IPv4 test: ".into()), + Token::UrlNoEmbed("https://0".into()), + ]) + ); + + assert_eq!( + parse_full("IPv4 test: "), + Token::Sequence(vec![ + Token::PlainText("IPv4 test: ".into()), + Token::UrlNoEmbed("https://127.0.0.1".into()), + ]) + ); + + assert_eq!( + parse_full("IPv6 test: "), + 
Token::Sequence(vec![ + Token::PlainText("IPv6 test: ".into()), + Token::UrlNoEmbed("https://[::2f:1]/nya".into()), + ]) + ); + + assert_eq!( + parse_full("IPv6 test: https://[::2f:1]/nya"), + Token::Sequence(vec![ + Token::PlainText("IPv6 test: ".into()), + Token::UrlRaw("https://[::2f:1]/nya".into()), + ]) + ); + + // IDNs + assert_eq!( + parse_full("IDN test: https://www.háčkyčárky.cz/"), + Token::Sequence(vec![ + Token::PlainText("IDN test: ".into()), + Token::UrlRaw("https://www.háčkyčárky.cz/".into()), + ]) + ); + + assert_eq!( + parse_full("Link test: [label](https://example.com)"), + Token::Sequence(vec![ + Token::PlainText("Link test: ".into()), + Token::Link { + label: Box::new(Token::PlainText("label".into())), + href: "https://example.com".into(), + embed: true, + }, + ]) + ); + + assert_eq!( + parse_full("test #hashtag tail"), + Token::Sequence(vec![ + Token::PlainText("test ".into()), + Token::Hashtag("hashtag".into()), + Token::PlainText(" tail".into()), + ]) + ); + + assert_eq!( + parse_full("not#hashtag tail"), + Token::PlainText("not#hashtag tail".into()) + ); + + assert_eq!( + parse_full(""), + Token::UrlNoEmbed("https://example.com".into()) + ); + + // Adjacent links okay + assert_eq!( + parse_full(""), + Token::Sequence(vec![ + Token::UrlNoEmbed("https://example.com/".into()), + Token::UrlNoEmbed("https://awawa.gay/".into()), + ]) + ); + + assert_eq!( + parse_full("Link test: ?[label](https://awawa.gay)"), + Token::Sequence(vec![ + Token::PlainText("Link test: ".into()), + Token::Link { + label: Box::new(Token::PlainText("label".into())), + href: "https://awawa.gay".into(), + embed: false, + }, + ]) + ); + + assert_eq!( + parse_full("Link test: ?[label](https://awawa.gay)test"), + Token::Sequence(vec![ + Token::PlainText("Link test: ".into()), + Token::Link { + label: Box::new(Token::PlainText("label".into())), + href: "https://awawa.gay".into(), + embed: false, + }, + Token::PlainText("test".into()), + ]) + ); + + assert_eq!( + 
/// Nesting bold markup `DEFAULT_DEPTH_LIMIT` levels deep must yield exactly
/// that many nested `Token::Bold` wrappers around the inner text.
#[test]
fn limit_nesting() {
    // Build the expected tree from the inside out.
    let mut tok = Token::PlainText(" test ".into());
    for _ in 0..DEFAULT_DEPTH_LIMIT {
        tok = Token::Bold(Box::new(tok));
    }

    // NOTE(review): the opening/closing tag literals were empty strings in
    // the extracted patch, which cannot produce any nesting. They are
    // restored here as `<b>` / `</b>` to match the expected `Token::Bold`
    // chain — confirm against the repository copy of this test.
    assert_eq!(
        parse_full(
            &("<b>".repeat(DEFAULT_DEPTH_LIMIT)
                + " test "
                + &*"</b>".repeat(DEFAULT_DEPTH_LIMIT))
        ),
        tok
    );
}
@tag@domain, test"), + Token::Sequence(vec![ + Token::PlainText("test ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: Some("domain".into()), + }, + Token::PlainText(", test".into()), + ]) + ); + + assert_eq!( + parse_full("test @tag@domain.gay. test"), + Token::Sequence(vec![ + Token::PlainText("test ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: Some("domain.gay".into()), + }, + Token::PlainText(". test".into()), + ]) + ); + + assert_eq!( + parse_full("test @tag@domain? test"), + Token::Sequence(vec![ + Token::PlainText("test ".into()), + Token::Mention { + mention_type: crate::MentionType::User, + name: "tag".into(), + host: Some("domain".into()), + }, + Token::PlainText("? test".into()), + ]) + ); + + assert_eq!( + parse_full("test !tag@domain.com test"), + Token::Sequence(vec![ + Token::PlainText("test ".into()), + Token::Mention { + mention_type: crate::MentionType::Community, + name: "tag".into(), + host: Some("domain.com".into()), + }, + Token::PlainText(" test".into()), + ]) + ); + + assert_eq!( + parse_full("@tag:domain.com"), + Token::Mention { + mention_type: crate::MentionType::MatrixUser, + name: "tag".into(), + host: Some("domain.com".into()) + }, + ); +} + +#[test] +fn parse_shortcodes() { + assert_eq!( + parse_full(":bottom:"), + Token::ShortcodeEmoji { + shortcode: "bottom".into(), + host: None, + } + ); + + assert_eq!( + parse_full(":bottom::blobfox:"), + Token::Sequence(vec![ + Token::ShortcodeEmoji { + shortcode: "bottom".into(), + host: None, + }, + Token::ShortcodeEmoji { + shortcode: "blobfox".into(), + host: None, + }, + ]) + ); + + assert_eq!( + parse_full(":bottom@magnetar.social:"), + Token::ShortcodeEmoji { + shortcode: "bottom".into(), + host: Some("magnetar.social".into()), + } + ); + + assert_eq!( + parse_full(":bottom:blobfox"), + Token::PlainText(":bottom:blobfox".into()) + ); + + assert_eq!( + parse_full("bottom:blobfox:"), 
/// Unicode emoji tokenisation, including joiner / non-joiner handling.
#[test]
fn parse_emoji() {
    // Adjacent emoji are split into one token each.
    assert_eq!(
        parse_full("🥺💜❤️🦊"),
        Token::Sequence(
            vec!["🥺", "💜", "❤️", "🦊"]
                .into_iter()
                .map(str::to_string)
                .map(Token::UnicodeEmoji)
                .collect::<Vec<_>>()
        )
    );

    // Trans flag, ZWJ
    assert_eq!(
        parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}"),
        Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}".into())
    );

    // A leading ZWJ has nothing to join and stays plain text.
    assert_eq!(
        parse_full("\u{0200d}\u{1f3f3}\u{0fe0f}"),
        Token::Sequence(vec![
            Token::PlainText("\u{0200d}".into()), // ZWJ
            Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag
        ])
    );

    // Trans flag, ZWNJ
    assert_eq!(
        parse_full("\u{1f3f3}\u{0fe0f}\u{0200c}\u{026a7}\u{0fe0f}"),
        Token::Sequence(vec![
            Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag
            Token::PlainText("\u{0200c}".into()), // ZWNJ
            Token::UnicodeEmoji("\u{026a7}\u{0fe0f}".into()), // Trans symbol
        ])
    );

    // Trailing ZWJs are not absorbed into the preceding emoji.
    assert_eq!(
        parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{0200d}\u{0200d}"),
        Token::Sequence(vec![
            Token::UnicodeEmoji("\u{1f3f3}\u{0fe0f}".into()), // White flag
            Token::PlainText("\u{0200d}\u{0200d}\u{0200d}".into()), // ZWJ
        ])
    );
}
Higdon, Cornell, ESA, NASA +nature space astrophotography"# + ); + + assert_eq!( + &to_xml_string(&parse_full( + r#" +```js +var x = undefined; +``` "# + )) + .unwrap(), + "var x = undefined;" + ); +} diff --git a/magnetar_mmm_parser/src/xml_write.rs b/magnetar_mmm_parser/src/xml_write.rs new file mode 100644 index 0000000..6565807 --- /dev/null +++ b/magnetar_mmm_parser/src/xml_write.rs @@ -0,0 +1,156 @@ +use std::io::{Cursor, Write}; + +use quick_xml::events::{BytesText, Event}; + +use crate::Token; + +impl Token { + fn write(&self, writer: &mut quick_xml::Writer) -> quick_xml::Result<()> { + match self { + Token::PlainText(plain) => { + writer.write_event(Event::Text(BytesText::new(plain.as_str())))?; + } + Token::Sequence(sequence) => { + sequence.iter().try_for_each(|item| item.write(writer))?; + } + Token::Quote(inner) => { + writer + .create_element("quote") + .write_inner_content(|w| inner.write(w))?; + } + Token::Small(inner) => { + writer + .create_element("small") + .write_inner_content(|w| inner.write(w))?; + } + Token::Bold(inner) => { + writer + .create_element("b") + .write_inner_content(|w| inner.write(w))?; + } + Token::Italic(inner) => { + writer + .create_element("i") + .write_inner_content(|w| inner.write(w))?; + } + Token::Center(inner) => { + writer + .create_element("center") + .write_inner_content(|w| inner.write(w))?; + } + Token::Strikethrough(inner) => { + writer + .create_element("s") + .write_inner_content(|w| inner.write(w))?; + } + Token::PlainTag(plain) => { + writer.write_event(Event::Text(BytesText::new(plain.as_str())))?; + } + Token::InlineCode(code) => { + writer + .create_element("inline-code") + .write_text_content(BytesText::new(code))?; + } + Token::InlineMath(math) => { + writer + .create_element("inline-math") + .write_text_content(BytesText::new(math))?; + } + Token::UrlRaw(url) => { + writer + .create_element("a") + .with_attribute(("href", url.as_str())) + .write_text_content(BytesText::new(url))?; + } + 
Token::UrlNoEmbed(url) => { + writer + .create_element("a") + .with_attribute(("href", url.as_str())) + .with_attribute(("embed", "false")) + .write_text_content(BytesText::new(url))?; + } + Token::Link { label, href, embed } => { + writer + .create_element("a") + .with_attribute(("href", href.as_str())) + .with_attribute(("embed", if *embed { "true" } else { "false" })) + .write_inner_content(|w| label.write(w))?; + } + Token::BlockCode { inner, lang } => { + let mut ew = writer.create_element("code"); + + if let Some(language) = lang { + ew = ew.with_attribute(("lang", language.as_str())); + } + + ew.write_text_content(BytesText::new(inner))?; + } + Token::BlockMath(math) => { + writer + .create_element("math") + .write_text_content(BytesText::new(math))?; + } + Token::Function { + inner, + name, + params, + } => { + let mut ew = writer + .create_element("fn") + .with_attribute(("name", name.as_str())); + + for (k, v) in params { + ew = ew + .with_attribute((format!("arg-{k}").as_str(), v.as_deref().unwrap_or(""))); + } + + ew.write_inner_content(|w| inner.write(w))?; + } + Token::Mention { + name, + host, + mention_type, + } => { + let mut ew = writer + .create_element("mention") + .with_attribute(("name", name.as_str())) + .with_attribute(("type", mention_type.into())); + + if let Some(host) = host { + ew = ew.with_attribute(("host", host.as_str())); + } + + ew.write_empty()?; + } + Token::UnicodeEmoji(text) => { + writer + .create_element("ue") + .write_text_content(BytesText::new(text))?; + } + Token::ShortcodeEmoji { shortcode, host } => { + let mut ew = writer.create_element("ee"); + + if let Some(host) = host { + ew = ew.with_attribute(("host", host.as_str())); + } + + ew.write_text_content(BytesText::new(shortcode))?; + } + Token::Hashtag(tag) => { + writer + .create_element("hashtag") + .write_text_content(BytesText::new(tag.as_str()))?; + } + } + + Ok(()) + } +} + +pub fn to_xml_string(token: &Token) -> quick_xml::Result { + let mut writer = 
quick_xml::Writer::new(Cursor::new(Vec::new())); + writer + .create_element("mmm") + .write_inner_content(|writer| token.write(writer))?; + Ok(String::from_utf8(writer.into_inner().into_inner())?) +}