diff --git a/Cargo.lock b/Cargo.lock index 9abfe30..aa58d5f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -748,6 +748,15 @@ dependencies = [ "serde", ] +[[package]] +name = "emojis" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee61eb945bff65ee7d19d157d39c67c33290ff0742907413fd5eefd29edc979" +dependencies = [ + "phf", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -1613,8 +1622,10 @@ dependencies = [ name = "mmm_parser" version = "0.2.1-alpha" dependencies = [ + "emojis", "nom", "nom_locate", + "unicode-segmentation", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index a7a960f..f504d67 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ cached = "0.46" cfg-if = "1" chrono = "0.4" dotenvy = "0.15" +emojis = "0.6" futures-core = "0.3" futures-util = "0.3" headers = "0.3" @@ -101,4 +102,4 @@ toml = { workspace = true } unicode-segmentation = { workspace = true } [profile.release] -lto = true \ No newline at end of file +lto = true diff --git a/magnetar_mmm_parser/Cargo.toml b/magnetar_mmm_parser/Cargo.toml index 8a07618..30c2bad 100644 --- a/magnetar_mmm_parser/Cargo.toml +++ b/magnetar_mmm_parser/Cargo.toml @@ -5,5 +5,7 @@ edition.workspace = true license = "MIT OR Apache-2.0" [dependencies] +emojis = { workspace = true } nom = { workspace = true } -nom_locate = { workspace = true } \ No newline at end of file +nom_locate = { workspace = true } +unicode-segmentation = { workspace = true } diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs index d4e9d6e..a3ddcd1 100644 --- a/magnetar_mmm_parser/src/lib.rs +++ b/magnetar_mmm_parser/src/lib.rs @@ -12,6 +12,7 @@ use nom::{IResult, Offset, Slice}; use nom_locate::LocatedSpan; use std::borrow::Cow; use std::collections::HashMap; +use unicode_segmentation::UnicodeSegmentation; #[derive(Copy, Clone, Debug)] pub enum MentionType { @@ -65,6 +66,8 @@ pub enum Token<'a> { host: Option<Cow<'a, str>>, mention_type: MentionType, }, + 
UnicodeEmoji(Cow<'a, str>), + ShortcodeEmoji(Cow<'a, str>), } impl Token<'_> { @@ -121,6 +124,10 @@ impl Token<'_> { host: host.as_ref().map(|v| Cow::Owned(v.clone().into_owned())), mention_type: *mention_type, }, + Token::UnicodeEmoji(code) => Token::UnicodeEmoji(Cow::Owned(code.clone().into_owned())), + Token::ShortcodeEmoji(shortcode) => { + Token::ShortcodeEmoji(Cow::Owned(shortcode.clone().into_owned())) + } } } } @@ -643,6 +650,24 @@ impl Context { )) } + fn unicode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> { + let frag = input.fragment(); + let Some(grapheme) = frag.graphemes(true).next() else { + return fail(input); + }; + + let emoji = emojis::get(grapheme); + + if emoji.is_none() { + return fail(input); + } + + Ok(( + input.slice(grapheme.len()..), + Token::UnicodeEmoji(grapheme.into()), + )) + } + fn mention<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> { // TODO: Escaping and skip when preceded by alphanumerics @@ -723,8 +748,9 @@ fn url_chars<'a, T: 'a>( #[cfg(test)] mod test { - use crate::{url_chars, Span}; + use crate::{url_chars, Context, Span}; use nom::bytes::complete::tag; + use nom::multi::many1; #[test] fn parse_url_chars() { @@ -773,4 +799,13 @@ mod test { .into_fragment() ); } + + #[test] + fn parse_emoji() { + let test = "🥺💜❤️🦊"; + let ctx = Context; + let tokens = many1(ctx.partial(Context::unicode_emoji))(Span::from(test)).unwrap(); + + println!("{:#?}", tokens.1) + } }