Emoji parsing

This commit is contained in:
Natty 2023-10-05 21:21:23 +02:00
parent 52dc491a47
commit 8009546bfe
Signed by: natty
GPG Key ID: BF6CB659ADEE60EC
4 changed files with 52 additions and 3 deletions

11
Cargo.lock generated
View File

@ -748,6 +748,15 @@ dependencies = [
"serde",
]
[[package]]
name = "emojis"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ee61eb945bff65ee7d19d157d39c67c33290ff0742907413fd5eefd29edc979"
dependencies = [
"phf",
]
[[package]]
name = "equivalent"
version = "1.0.1"
@ -1613,8 +1622,10 @@ dependencies = [
name = "mmm_parser"
version = "0.2.1-alpha"
dependencies = [
"emojis",
"nom",
"nom_locate",
"unicode-segmentation",
]
[[package]]

View File

@ -29,6 +29,7 @@ cached = "0.46"
cfg-if = "1"
chrono = "0.4"
dotenvy = "0.15"
emojis = "0.6"
futures-core = "0.3"
futures-util = "0.3"
headers = "0.3"
@ -101,4 +102,4 @@ toml = { workspace = true }
unicode-segmentation = { workspace = true }
[profile.release]
lto = true
lto = true

View File

@ -5,5 +5,7 @@ edition.workspace = true
license = "MIT OR Apache-2.0"
[dependencies]
emojis = { workspace = true }
nom = { workspace = true }
nom_locate = { workspace = true }
nom_locate = { workspace = true }
unicode-segmentation = { workspace = true }

View File

@ -12,6 +12,7 @@ use nom::{IResult, Offset, Slice};
use nom_locate::LocatedSpan;
use std::borrow::Cow;
use std::collections::HashMap;
use unicode_segmentation::UnicodeSegmentation;
#[derive(Copy, Clone, Debug)]
pub enum MentionType {
@ -65,6 +66,8 @@ pub enum Token<'a> {
host: Option<Cow<'a, str>>,
mention_type: MentionType,
},
UnicodeEmoji(Cow<'a, str>),
ShortcodeEmoji(Cow<'a, str>),
}
impl Token<'_> {
@ -121,6 +124,10 @@ impl Token<'_> {
host: host.as_ref().map(|v| Cow::Owned(v.clone().into_owned())),
mention_type: *mention_type,
},
Token::UnicodeEmoji(code) => Token::UnicodeEmoji(Cow::Owned(code.clone().into_owned())),
Token::ShortcodeEmoji(shortcode) => {
Token::ShortcodeEmoji(Cow::Owned(shortcode.clone().into_owned()))
}
}
}
}
@ -643,6 +650,24 @@ impl Context {
))
}
/// Parses a single Unicode emoji at the start of `input`.
///
/// The first extended grapheme cluster of the input is looked up with
/// `emojis::get`. If it is a known emoji, exactly that cluster is consumed
/// and returned as a [`Token::UnicodeEmoji`]; otherwise (including on empty
/// input) the parser bails out through `fail` with the untouched input.
fn unicode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
    // Operate on whole grapheme clusters so that multi-codepoint emoji
    // (e.g. a base character followed by a variation selector) stay intact.
    let candidate = input
        .fragment()
        .graphemes(true)
        .next()
        .filter(|cluster| emojis::get(cluster).is_some());

    match candidate {
        Some(cluster) => Ok((
            // Advance by the byte length of the matched cluster.
            input.slice(cluster.len()..),
            Token::UnicodeEmoji(cluster.into()),
        )),
        None => fail(input),
    }
}
fn mention<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
// TODO: Escaping and skip when preceded by alphanumerics
@ -723,8 +748,9 @@ fn url_chars<'a, T: 'a>(
#[cfg(test)]
mod test {
use crate::{url_chars, Span};
use crate::{url_chars, Context, Span};
use nom::bytes::complete::tag;
use nom::multi::many1;
#[test]
fn parse_url_chars() {
@ -773,4 +799,13 @@ mod test {
.into_fragment()
);
}
/// Checks that a run of Unicode emoji is tokenized one grapheme cluster at a
/// time, including an emoji that carries a variation selector.
#[test]
fn parse_emoji() {
    let test = "🥺💜❤️🦊";
    let expected = ["🥺", "💜", "❤️", "🦊"];
    let ctx = Context;

    let (rest, tokens) = many1(ctx.partial(Context::unicode_emoji))(Span::from(test)).unwrap();

    // The whole input consists of emoji, so nothing may be left over.
    assert!(
        rest.fragment().is_empty(),
        "unparsed input left over: {:?}",
        rest.fragment()
    );
    assert_eq!(
        tokens.len(),
        expected.len(),
        "unexpected token count: {tokens:#?}"
    );

    // Each token must be a Unicode emoji carrying exactly the source cluster.
    for (token, want) in tokens.iter().zip(expected.iter()) {
        assert!(
            matches!(token, crate::Token::UnicodeEmoji(code) if &**code == *want),
            "expected emoji {want:?}, got {token:?}"
        );
    }
}
}