diff --git a/Cargo.lock b/Cargo.lock index 35e50cd..91a74f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -603,6 +603,7 @@ dependencies = [ "cfg-if", "itoa", "ryu", + "serde", "static_assertions", ] @@ -1507,6 +1508,22 @@ dependencies = [ "serde_json", ] +[[package]] +name = "magnetar_mmm_parser" +version = "0.2.1-alpha" +dependencies = [ + "compact_str", + "either", + "emojis", + "nom", + "nom_locate", + "quick-xml", + "serde", + "strum", + "tracing", + "unicode-segmentation", +] + [[package]] name = "magnetar_nodeinfo" version = "0.2.1-alpha" @@ -1521,6 +1538,7 @@ version = "0.2.1-alpha" dependencies = [ "chrono", "http", + "magnetar_mmm_parser", "magnetar_sdk_macros", "serde", "serde_json", @@ -1640,19 +1658,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "mmm_parser" -version = "0.2.1-alpha" -dependencies = [ - "compact_str", - "either", - "emojis", - "nom", - "nom_locate", - "tracing", - "unicode-segmentation", -] - [[package]] name = "nom" version = "7.1.3" @@ -2070,6 +2075,16 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quote" version = "1.0.32" diff --git a/Cargo.toml b/Cargo.toml index 9828764..05886ae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,7 @@ miette = "5.9" nom = "7" nom_locate = "4" percent-encoding = "2.2" +quick-xml = "0.31" redis = "0.23" regex = "1.9" reqwest = "0.11" diff --git a/magnetar_mmm_parser/Cargo.toml b/magnetar_mmm_parser/Cargo.toml index d7b9b2d..7e956a8 100644 --- a/magnetar_mmm_parser/Cargo.toml +++ b/magnetar_mmm_parser/Cargo.toml @@ -1,14 +1,21 @@ [package] -name = "mmm_parser" +name = "magnetar_mmm_parser" version.workspace = true edition.workspace = true license = "MIT OR Apache-2.0" +[features] +default = ["xml"] +xml = ["dep:quick-xml"] + [dependencies] 
either = { workspace = true } emojis = { workspace = true } nom = { workspace = true } nom_locate = { workspace = true } -compact_str = { workspace = true } +compact_str = { workspace = true, features = ["serde"] } +serde = { workspace = true, features = ["derive"] } +strum = { workspace = true, features = ["derive"] } tracing = { workspace = true } unicode-segmentation = { workspace = true } +quick-xml = { workspace = true, optional = true, features = ["serialize"] } diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index 26661e6..25dfbc2 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -8,17 +8,24 @@ use nom::character::complete::{ }; use nom::combinator::{eof, fail, map, not, opt, peek, recognize}; use nom::error::ErrorKind; -use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1}; +use nom::multi::{many0_count, many1, many1_count, many_till, separated_list1}; use nom::sequence::tuple; use nom::{IResult, Offset, Parser, Slice}; use nom_locate::LocatedSpan; +use quick_xml::events::{BytesText, Event}; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::convert::{identity, Infallible}; +use std::io::{Cursor, Write}; use std::marker::PhantomData; +use strum::IntoStaticStr; use tracing::trace; use unicode_segmentation::UnicodeSegmentation; -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, Deserialize, Serialize, IntoStaticStr)] +// The alternative would be to implement a serde serializer for this one enum, but that's disgusting +#[strum(serialize_all = "snake_case")] +#[serde(rename_all = "snake_case")] pub enum MentionType { Community, User, @@ -33,7 +40,7 @@ impl MentionType { } } -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] pub enum Token { PlainText(CompactString), Sequence(Vec<Token>), @@ -216,6 +223,161 @@ impl Token { other => other.clone(), } } + + fn 
write<W: Write>(&self, writer: &mut quick_xml::Writer<W>) -> quick_xml::Result<()> { + match self { + Token::PlainText(plain) => { + writer.write_event(Event::Text(BytesText::new(plain.as_str())))?; + } + Token::Sequence(sequence) => { + sequence.iter().try_for_each(|item| item.write(writer))?; + } + Token::Quote(inner) => { + writer + .create_element("quote") + .write_inner_content(|w| inner.write(w))?; + } + Token::Small(inner) => { + writer + .create_element("small") + .write_inner_content(|w| inner.write(w))?; + } + Token::BoldItalic(inner) => { + writer + .create_element("b") + .write_inner_content::<_, quick_xml::Error>(|w| { + w.create_element("i") + .write_inner_content(|w| inner.write(w))?; + + Ok(()) + })?; + } + Token::Bold(inner) => { + writer + .create_element("b") + .write_inner_content(|w| inner.write(w))?; + } + Token::Italic(inner) => { + writer + .create_element("i") + .write_inner_content(|w| inner.write(w))?; + } + Token::Center(inner) => { + writer + .create_element("center") + .write_inner_content(|w| inner.write(w))?; + } + Token::Strikethrough(inner) => { + writer + .create_element("s") + .write_inner_content(|w| inner.write(w))?; + } + Token::PlainTag(plain) => { + writer.write_event(Event::Text(BytesText::new(plain.as_str())))?; + } + Token::InlineCode(code) => { + writer + .create_element("inline-code") + .write_text_content(BytesText::new(code))?; + } + Token::InlineMath(math) => { + writer + .create_element("inline-math") + .write_text_content(BytesText::new(math))?; + } + Token::UrlRaw(url) => { + writer + .create_element("a") + .with_attribute(("href", url.as_str())) + .write_text_content(BytesText::new(url))?; + } + Token::UrlNoEmbed(url) => { + writer + .create_element("a") + .with_attribute(("href", url.as_str())) + .with_attribute(("embed", "false")) + .write_text_content(BytesText::new(url))?; + } + Token::Link { label, href, embed } => { + writer + .create_element("a") + .with_attribute(("href", href.as_str())) + .with_attribute(("embed", if 
*embed { "true" } else { "false" })) + .write_inner_content(|w| label.write(w))?; + } + Token::BlockCode { inner, lang } => { + let mut ew = writer.create_element("code"); + + if let Some(language) = lang { + ew = ew.with_attribute(("lang", language.as_str())); + } + + ew.write_text_content(BytesText::new(inner))?; + } + Token::BlockMath(math) => { + writer + .create_element("math") + .write_text_content(BytesText::new(math))?; + } + Token::Function { + inner, + name, + params, + } => { + let mut ew = writer + .create_element("fn") + .with_attribute(("name", name.as_str())); + + for (k, v) in params { + ew = ew + .with_attribute((format!("arg-{k}").as_str(), v.as_deref().unwrap_or(""))); + } + + ew.write_inner_content(|w| inner.write(w))?; + } + Token::Mention { + name, + host, + mention_type, + } => { + let mut ew = writer + .create_element("mention") + .with_attribute(("name", name.as_str())) + .with_attribute(("type", mention_type.into())); + + if let Some(host) = host { + ew = ew.with_attribute(("host", host.as_str())); + } + + ew.write_empty()?; + } + Token::UnicodeEmoji(text) => { + writer + .create_element("ue") + .write_text_content(BytesText::new(text))?; + } + Token::ShortcodeEmoji(shortcode) => { + writer + .create_element("ee") + .write_text_content(BytesText::new(shortcode))?; + } + Token::Hashtag(tag) => { + writer + .create_element("hashtag") + .with_attribute(("tag", tag.as_str())).write_empty()?; + } + } + + Ok(()) + } +} + +pub fn to_xml_string(token: &Token) -> quick_xml::Result<String> { + let mut writer = quick_xml::Writer::new(Cursor::new(Vec::new())); + writer + .create_element("mmm") + .write_inner_content(|writer| token.write(writer))?; + Ok(String::from_utf8(writer.into_inner().into_inner())?) 
} #[derive(Debug, Default, Copy, Clone)] @@ -315,11 +477,11 @@ fn spliced<'a>( type NE = nom::Err; type NomError<'x> = nom::error::Error>; - let quote_span = Span::new_extra( + let spliced_span = Span::new_extra( &combined, segments.first().map_or(SpanMeta::new(0), |s| s.extra), ); - let (input, inner) = match func(quote_span) { + let (input, inner) = match func(spliced_span) { Ok(s) => s, Err(e) => { return match e { @@ -689,7 +851,7 @@ impl Context { let (input, _) = delim(input)?; let (input, lang) = opt(map( - recognize(many1(tuple((not(delim), not_line_ending)))), + recognize(many1(tuple((not(delim), not(line_ending), anychar)))), Span::into_fragment, ))(input)?; let (input, _) = line_ending(input)?; @@ -704,8 +866,10 @@ impl Context { let (input, _) = line_ending(input)?; let (input, _) = delim(input)?; - let (input, _) = many0(space)(input)?; - let (input, _) = not(not(line_ending))(input)?; + // Trailing whitespace after the triple backtick + let (input, _) = opt(space1_unicode)(input)?; + // If we got this far, the next character should be a line ending + let (input, _) = not(tuple((not(line_ending), anychar)))(input)?; let (input, _) = opt(line_ending)(input)?; Ok(( @@ -737,8 +901,10 @@ impl Context { let (input, _) = opt(line_ending)(input)?; let (input, _) = end(input)?; - let (input, _) = many0(space)(input)?; - let (input, _) = not(not_line_ending)(input)?; + // Trailing whitespace after the closing delim + let (input, _) = opt(space1_unicode)(input)?; + // If we got this far, the next character should be a line ending + let (input, _) = not(tuple((not(line_ending), anychar)))(input)?; let (input, _) = opt(line_ending)(input)?; Ok(( @@ -874,7 +1040,7 @@ impl Context { )))(input) }; - let param_value = recognize(many1_count(alt(( + let arg_value = recognize(many1_count(alt(( alphanumeric1, tag("."), tag("-"), @@ -883,7 +1049,7 @@ impl Context { let (input, func_name) = map(func_ident, Span::into_fragment)(input)?; - let arg = tuple((func_ident, 
opt(tuple((tag("="), param_value))))); + let arg = tuple((func_ident, opt(tuple((tag("="), arg_value))))); let (input, args) = opt(tuple((one_char('.'), separated_list1(one_char(','), arg))))(input)?; @@ -1427,7 +1593,7 @@ impl Context { #[cfg(test)] mod test { - use crate::{Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT}; + use crate::{to_xml_string, Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT}; use nom::bytes::complete::tag; use std::collections::HashMap; @@ -1583,10 +1749,58 @@ mod test { "bold italic".into() ))))) ); + + assert_eq!( + parse_full("~~*hello\nworld*"), + Token::PlainText("~~*hello\nworld*".into()) + ) } #[test] fn parse_complex() { + assert_eq!( + parse_full(r"\( nya^3 \)"), + Token::InlineMath(" nya^3 ".to_string()) + ); + + assert_eq!( + parse_full("\\( nya^3 \n \\)"), + Token::PlainText("\\( nya^3 \n \\)".into()) + ); + + assert_eq!( + parse_full(r"`AbstractProxyFactoryBean`"), + Token::InlineCode("AbstractProxyFactoryBean".to_string()) + ); + + assert_eq!( + parse_full("`let x = \n 5;`"), + Token::PlainText("`let x = \n 5;`".into()) + ); + + assert_eq!( + parse_full( + r#" +```js +var x = undefined; +```"# + ), + Token::BlockCode { + lang: Some("js".to_string()), + inner: "var x = undefined;".to_string(), + } + ); + + assert_eq!( + parse_full( + r" +\[ +a^2 + b^2 = c^2 +\]" + ), + Token::BlockMath("a^2 + b^2 = c^2".to_string()) + ); + assert_eq!( parse_full( r#"
centered @@ -2004,4 +2218,31 @@ text
"# ]) ); } + + #[test] + fn xml_serialization() { + assert_eq!( + &to_xml_string(&parse_full("***nyaaa***")).unwrap(), + r#"nyaaa"# + ); + + assert_eq!( + &to_xml_string(&parse_full( + "@natty $[spin.speed=0.5s 🥺]:cat_attack: test" + )) + .unwrap(), + r#" 🥺cat_attack test"# + ); + + assert_eq!( + &to_xml_string(&parse_full( + r#" +```js +var x = undefined; +``` "# + )) + .unwrap(), + "var x = undefined;" + ); + } } diff --git a/magnetar_sdk/Cargo.toml b/magnetar_sdk/Cargo.toml index 0bf402c..03a89d4 100644 --- a/magnetar_sdk/Cargo.toml +++ b/magnetar_sdk/Cargo.toml @@ -5,6 +5,7 @@ edition.workspace = true license = "MIT OR Apache-2.0" [dependencies] +magnetar_mmm_parser = { path = "../magnetar_mmm_parser" } magnetar_sdk_macros = { path = "./macros" } chrono = { workspace = true, features = ["serde"] } @@ -15,4 +16,4 @@ serde_json = { workspace = true } ts-rs = { workspace = true, features = ["chrono", "chrono-impl"] } -unicode-segmentation = { workspace = true } \ No newline at end of file +unicode-segmentation = { workspace = true } diff --git a/magnetar_sdk/src/lib.rs b/magnetar_sdk/src/lib.rs index a12afe6..79189ff 100644 --- a/magnetar_sdk/src/lib.rs +++ b/magnetar_sdk/src/lib.rs @@ -1,6 +1,7 @@ use serde::{Deserialize, Serialize}; use ts_rs::TS; +pub use magnetar_mmm_parser as mmm; pub mod endpoints; pub mod types; pub mod util_types;