diff --git a/Cargo.lock b/Cargo.lock index 1112838..91a74f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1517,7 +1517,9 @@ dependencies = [ "emojis", "nom", "nom_locate", + "quick-xml", "serde", + "strum", "tracing", "unicode-segmentation", ] @@ -2073,6 +2075,16 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quote" version = "1.0.32" diff --git a/Cargo.toml b/Cargo.toml index 9828764..05886ae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,7 @@ miette = "5.9" nom = "7" nom_locate = "4" percent-encoding = "2.2" +quick-xml = "0.31" redis = "0.23" regex = "1.9" reqwest = "0.11" diff --git a/magnetar_mmm_parser/Cargo.toml b/magnetar_mmm_parser/Cargo.toml index bee8c3a..7e956a8 100644 --- a/magnetar_mmm_parser/Cargo.toml +++ b/magnetar_mmm_parser/Cargo.toml @@ -4,6 +4,10 @@ version.workspace = true edition.workspace = true license = "MIT OR Apache-2.0" +[features] +default = ["xml"] +xml = ["dep:quick-xml"] + [dependencies] either = { workspace = true } emojis = { workspace = true } @@ -11,5 +15,7 @@ nom = { workspace = true } nom_locate = { workspace = true } compact_str = { workspace = true, features = ["serde"] } serde = { workspace = true, features = ["derive"] } +strum = { workspace = true, features = ["derive"] } tracing = { workspace = true } unicode-segmentation = { workspace = true } +quick-xml = { workspace = true, optional = true, features = ["serialize"] } diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 811b157..797cc3a 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -8,18 +8,24 @@ use nom::character::complete::{ }; use nom::combinator::{eof, fail, map, not, opt, peek, recognize}; use nom::error::ErrorKind; -use nom::multi::{many0, 
many0_count, many1, many1_count, many_till, separated_list1}; +use nom::multi::{many0_count, many1, many1_count, many_till, separated_list1}; use nom::sequence::tuple; use nom::{IResult, Offset, Parser, Slice}; use nom_locate::LocatedSpan; +use quick_xml::events::{BytesText, Event}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::convert::{identity, Infallible}; +use std::io::{Cursor, Write}; use std::marker::PhantomData; +use strum::IntoStaticStr; use tracing::trace; use unicode_segmentation::UnicodeSegmentation; -#[derive(Copy, Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, Deserialize, Serialize, IntoStaticStr)] +// The alternative would be to implement a serde serializer for this one enum, but that's disgusting +#[strum(serialize_all = "snake_case")] +#[serde(rename_all = "snake_case")] pub enum MentionType { Community, User, @@ -217,6 +223,161 @@ impl Token { other => other.clone(), } } + + fn write<T: Write>(&self, writer: &mut quick_xml::Writer<T>) -> quick_xml::Result<()> { + match self { + Token::PlainText(plain) => { + writer.write_event(Event::Text(BytesText::new(plain.as_str())))?; + } + Token::Sequence(sequence) => { + sequence.iter().try_for_each(|item| item.write(writer))?; + } + Token::Quote(inner) => { + writer + .create_element("quote") + .write_inner_content(|w| inner.write(w))?; + } + Token::Small(inner) => { + writer + .create_element("small") + .write_inner_content(|w| inner.write(w))?; + } + Token::BoldItalic(inner) => { + writer + .create_element("b") + .write_inner_content::<_, quick_xml::Error>(|w| { + w.create_element("i") + .write_inner_content(|w| inner.write(w))?; + + Ok(()) + })?; + } + Token::Bold(inner) => { + writer + .create_element("b") + .write_inner_content(|w| inner.write(w))?; + } + Token::Italic(inner) => { + writer + .create_element("i") + .write_inner_content(|w| inner.write(w))?; + } + Token::Center(inner) => { + writer + .create_element("center") + 
.write_inner_content(|w| inner.write(w))?; + } + Token::Strikethrough(inner) => { + writer + .create_element("s") + .write_inner_content(|w| inner.write(w))?; + } + Token::PlainTag(plain) => { + writer.write_event(Event::Text(BytesText::new(plain.as_str())))?; + } + Token::InlineCode(code) => { + writer + .create_element("inline-code") + .write_text_content(BytesText::new(code))?; + } + Token::InlineMath(math) => { + writer + .create_element("inline-math") + .write_text_content(BytesText::new(math))?; + } + Token::UrlRaw(url) => { + writer + .create_element("a") + .with_attribute(("href", url.as_str())) + .write_text_content(BytesText::new(url))?; + } + Token::UrlNoEmbed(url) => { + writer + .create_element("a") + .with_attribute(("href", url.as_str())) + .with_attribute(("embed", "false")) + .write_text_content(BytesText::new(url))?; + } + Token::Link { label, href, embed } => { + writer + .create_element("a") + .with_attribute(("href", href.as_str())) + .with_attribute(("embed", if *embed { "true" } else { "false" })) + .write_inner_content(|w| label.write(w))?; + } + Token::BlockCode { inner, lang } => { + let mut ew = writer.create_element("code"); + + if let Some(language) = lang { + ew = ew.with_attribute(("lang", language.as_str())); + } + + ew.write_text_content(BytesText::new(inner))?; + } + Token::BlockMath(math) => { + writer + .create_element("math") + .write_text_content(BytesText::new(math))?; + } + Token::Function { + inner, + name, + params, + } => { + let mut ew = writer + .create_element("fn") + .with_attribute(("name", name.as_str())); + + for (k, v) in params { + ew = ew + .with_attribute((format!("arg-{k}").as_str(), v.as_deref().unwrap_or(""))); + } + + ew.write_inner_content(|w| inner.write(w))?; + } + Token::Mention { + name, + host, + mention_type, + } => { + let mut ew = writer + .create_element("mention") + .with_attribute(("name", name.as_str())) + .with_attribute(("type", mention_type.into())); + + if let Some(host) = host { + ew = 
ew.with_attribute(("host", host.as_str())); + } + + ew.write_empty()?; + } + Token::UnicodeEmoji(text) => { + writer + .create_element("ue") + .write_text_content(BytesText::new(text))?; + } + Token::ShortcodeEmoji(shortcode) => { + writer + .create_element("ee") + .write_text_content(BytesText::new(shortcode))?; + } + Token::Hashtag(tag) => { + writer + .create_element("hashtag") + .with_attribute(("tag", tag.as_str())).write_empty()?; + } + } + + Ok(()) + } +} + +pub fn to_xml_string(token: &Token) -> quick_xml::Result<String> { + let mut writer = quick_xml::Writer::new(Cursor::new(Vec::new())); + writer + .create_element("mmm") + .write_inner_content(|writer| token.write(writer))?; + Ok(String::from_utf8(writer.into_inner().into_inner())?) } #[derive(Debug, Default, Copy, Clone)] @@ -690,7 +851,7 @@ impl Context { let (input, _) = delim(input)?; let (input, lang) = opt(map( - recognize(many1(tuple((not(delim), not_line_ending)))), + recognize(many1(tuple((not(delim), not(line_ending), anychar)))), Span::into_fragment, ))(input)?; let (input, _) = line_ending(input)?; @@ -705,8 +866,10 @@ impl Context { let (input, _) = line_ending(input)?; let (input, _) = delim(input)?; - let (input, _) = many0(space)(input)?; - let (input, _) = not(not(line_ending))(input)?; + // Trailing whitespace after the triple backtick + let (input, _) = opt(space1_unicode)(input)?; + // If we got this far, the next character should be a line ending + let (input, _) = not(tuple((not(line_ending), anychar)))(input)?; let (input, _) = opt(line_ending)(input)?; Ok(( @@ -738,8 +901,10 @@ impl Context { let (input, _) = opt(line_ending)(input)?; let (input, _) = end(input)?; - let (input, _) = many0(space)(input)?; - let (input, _) = not(not_line_ending)(input)?; + // Trailing whitespace after the closing delim + let (input, _) = opt(space1_unicode)(input)?; + // If we got this far, the next character should be a line ending + let (input, _) = not(tuple((not(line_ending), anychar)))(input)?; let (input, _) = 
opt(line_ending)(input)?; Ok(( @@ -875,7 +1040,7 @@ impl Context { )))(input) }; - let param_value = recognize(many1_count(alt(( + let arg_value = recognize(many1_count(alt(( alphanumeric1, tag("."), tag("-"), @@ -884,7 +1049,7 @@ impl Context { let (input, func_name) = map(func_ident, Span::into_fragment)(input)?; - let arg = tuple((func_ident, opt(tuple((tag("="), param_value))))); + let arg = tuple((func_ident, opt(tuple((tag("="), arg_value))))); let (input, args) = opt(tuple((one_char('.'), separated_list1(one_char(','), arg))))(input)?; @@ -1428,7 +1593,7 @@ impl Context { #[cfg(test)] mod test { - use crate::{Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT}; + use crate::{to_xml_string, Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT}; use nom::bytes::complete::tag; use std::collections::HashMap; @@ -1588,6 +1753,49 @@ mod test { #[test] fn parse_complex() { + assert_eq!( + parse_full(r"\( nya^3 \)"), + Token::InlineMath(" nya^3 ".to_string()) + ); + + assert_eq!( + parse_full("\\( nya^3 \n \\)"), + Token::PlainText("\\( nya^3 \n \\)".into()) + ); + + assert_eq!( + parse_full(r"`AbstractProxyFactoryBean`"), + Token::InlineCode("AbstractProxyFactoryBean".to_string()) + ); + + assert_eq!( + parse_full("`let x = \n 5;`"), + Token::PlainText("`let x = \n 5;`".into()) + ); + + assert_eq!( + parse_full( + r#" +```js +var x = undefined; +```"# + ), + Token::BlockCode { + lang: Some("js".to_string()), + inner: "var x = undefined;".to_string(), + } + ); + + assert_eq!( + parse_full( + r" +\[ +a^2 + b^2 = c^2 +\]" + ), + Token::BlockMath("a^2 + b^2 = c^2".to_string()) + ); + assert_eq!( parse_full( r#"
centered @@ -2005,4 +2213,31 @@ text
"# ]) ); } + + #[test] + fn xml_serialization() { + assert_eq!( + &to_xml_string(&parse_full("***nyaaa***")).unwrap(), + r#"<mmm><b><i>nyaaa</i></b></mmm>"# + ); + + assert_eq!( + &to_xml_string(&parse_full( + "@natty $[spin.speed=0.5s 🥺]:cat_attack: test" + )) + .unwrap(), + r#"<mmm><mention name="natty" type="user"/> <fn name="spin" arg-speed="0.5s"><ue>🥺</ue></fn><ee>cat_attack</ee> test</mmm>"# + ); + + assert_eq!( + &to_xml_string(&parse_full( + r#" +```js +var x = undefined; +``` "# + )) + .unwrap(), + "<mmm><code lang=\"js\">var x = undefined;</code></mmm>" + ); + } }