Emoji parsing

2023-10-05 21:21:23 +02:00 · 2023-10-05 21:21:23 +02:00 · 8009546bfe
parent 52dc491a47
commit 8009546bfe
4 changed files with 52 additions and 3 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -748,6 +748,15 @@ dependencies = [
 "serde",
 ]

+[[package]]
+name = "emojis"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ee61eb945bff65ee7d19d157d39c67c33290ff0742907413fd5eefd29edc979"
+dependencies = [
+ "phf",
+]
+
 [[package]]
 name = "equivalent"
 version = "1.0.1"
@ -1613,8 +1622,10 @@ dependencies = [
 name = "mmm_parser"
 version = "0.2.1-alpha"
 dependencies = [
+ "emojis",
 "nom",
 "nom_locate",
+ "unicode-segmentation",
 ]

 [[package]]
--- a/Cargo.toml
+++ b/Cargo.toml
@ -29,6 +29,7 @@ cached = "0.46"
 cfg-if = "1"
 chrono = "0.4"
 dotenvy = "0.15"
+emojis = "0.6"
 futures-core = "0.3"
 futures-util = "0.3"
 headers = "0.3"
@ -101,4 +102,4 @@ toml = { workspace = true }
 unicode-segmentation = { workspace = true }

 [profile.release]
-lto = true
+lto = true
--- a/magnetar_mmm_parser/Cargo.toml
+++ b/magnetar_mmm_parser/Cargo.toml
@ -5,5 +5,7 @@ edition.workspace = true
 license = "MIT OR Apache-2.0"

 [dependencies]
+emojis = { workspace = true }
 nom = { workspace = true }
-nom_locate = { workspace = true }
+nom_locate = { workspace = true }
+unicode-segmentation = { workspace = true }
--- a/magnetar_mmm_parser/src/lib.rs
+++ b/magnetar_mmm_parser/src/lib.rs
@ -12,6 +12,7 @@ use nom::{IResult, Offset, Slice};
 use nom_locate::LocatedSpan;
 use std::borrow::Cow;
 use std::collections::HashMap;
+use unicode_segmentation::UnicodeSegmentation;

 #[derive(Copy, Clone, Debug)]
 pub enum MentionType {
@ -65,6 +66,8 @@ pub enum Token<'a> {
        host: Option<Cow<'a, str>>,
        mention_type: MentionType,
    },
+    UnicodeEmoji(Cow<'a, str>),
+    ShortcodeEmoji(Cow<'a, str>),
 }

 impl Token<'_> {
@ -121,6 +124,10 @@ impl Token<'_> {
                host: host.as_ref().map(|v| Cow::Owned(v.clone().into_owned())),
                mention_type: *mention_type,
            },
+            Token::UnicodeEmoji(code) => Token::UnicodeEmoji(Cow::Owned(code.clone().into_owned())),
+            Token::ShortcodeEmoji(shortcode) => {
+                Token::ShortcodeEmoji(Cow::Owned(shortcode.clone().into_owned()))
+            }
        }
    }
 }
@ -643,6 +650,24 @@ impl Context {
        ))
    }

+    /// Parses a single Unicode emoji from the start of `input`.
+    ///
+    /// The first extended grapheme cluster of the input is looked up with
+    /// `emojis::get`. A parse failure is returned when the input is empty or the
+    /// cluster is not a known emoji; otherwise the rest starts after the cluster.
+    fn unicode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        // Look at a whole grapheme cluster so multi-codepoint emoji are kept together.
+        let first_cluster = input.fragment().graphemes(true).next();
+
+        match first_cluster.filter(|cluster| emojis::get(cluster).is_some()) {
+            Some(cluster) => Ok((
+                input.slice(cluster.len()..),
+                Token::UnicodeEmoji(cluster.into()),
+            )),
+            None => fail(input),
+        }
+    }
+
    fn mention<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
        // TODO: Escaping and skip when preceded by alphanumerics

@ -723,8 +748,9 @@ fn url_chars<'a, T: 'a>(

 #[cfg(test)]
 mod test {
-    use crate::{url_chars, Span};
+    use crate::{url_chars, Context, Span};
    use nom::bytes::complete::tag;
+    use nom::multi::many1;

    #[test]
    fn parse_url_chars() {
@ -773,4 +799,13 @@ mod test {
                .into_fragment()
        );
    }
+
+    #[test]
+    fn parse_emoji() {
+        let test = "🥺💜❤️🦊"; // four emoji back to back; each must become one token
+        let ctx = Context;
+        let (rest, tokens) = many1(ctx.partial(Context::unicode_emoji))(Span::from(test)).unwrap();
+        assert!(rest.fragment().is_empty(), "unparsed input left: {rest:?}");
+        assert_eq!(tokens.len(), 4, "unexpected tokens: {tokens:#?}");
+    }
 }