Emoji parsing
This commit is contained in:
parent
52dc491a47
commit
8009546bfe
|
@ -748,6 +748,15 @@ dependencies = [
|
|||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "emojis"
|
||||
version = "0.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ee61eb945bff65ee7d19d157d39c67c33290ff0742907413fd5eefd29edc979"
|
||||
dependencies = [
|
||||
"phf",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "equivalent"
|
||||
version = "1.0.1"
|
||||
|
@ -1613,8 +1622,10 @@ dependencies = [
|
|||
name = "mmm_parser"
|
||||
version = "0.2.1-alpha"
|
||||
dependencies = [
|
||||
"emojis",
|
||||
"nom",
|
||||
"nom_locate",
|
||||
"unicode-segmentation",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
|
@ -29,6 +29,7 @@ cached = "0.46"
|
|||
cfg-if = "1"
|
||||
chrono = "0.4"
|
||||
dotenvy = "0.15"
|
||||
emojis = "0.6"
|
||||
futures-core = "0.3"
|
||||
futures-util = "0.3"
|
||||
headers = "0.3"
|
||||
|
@ -101,4 +102,4 @@ toml = { workspace = true }
|
|||
unicode-segmentation = { workspace = true }
|
||||
|
||||
[profile.release]
|
||||
lto = true
|
||||
lto = true
|
||||
|
|
|
@ -5,5 +5,7 @@ edition.workspace = true
|
|||
license = "MIT OR Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
emojis = { workspace = true }
|
||||
nom = { workspace = true }
|
||||
nom_locate = { workspace = true }
|
||||
nom_locate = { workspace = true }
|
||||
unicode-segmentation = { workspace = true }
|
||||
|
|
|
@ -12,6 +12,7 @@ use nom::{IResult, Offset, Slice};
|
|||
use nom_locate::LocatedSpan;
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub enum MentionType {
|
||||
|
@ -65,6 +66,8 @@ pub enum Token<'a> {
|
|||
host: Option<Cow<'a, str>>,
|
||||
mention_type: MentionType,
|
||||
},
|
||||
UnicodeEmoji(Cow<'a, str>),
|
||||
ShortcodeEmoji(Cow<'a, str>),
|
||||
}
|
||||
|
||||
impl Token<'_> {
|
||||
|
@ -121,6 +124,10 @@ impl Token<'_> {
|
|||
host: host.as_ref().map(|v| Cow::Owned(v.clone().into_owned())),
|
||||
mention_type: *mention_type,
|
||||
},
|
||||
Token::UnicodeEmoji(code) => Token::UnicodeEmoji(Cow::Owned(code.clone().into_owned())),
|
||||
Token::ShortcodeEmoji(shortcode) => {
|
||||
Token::ShortcodeEmoji(Cow::Owned(shortcode.clone().into_owned()))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -643,6 +650,24 @@ impl Context {
|
|||
))
|
||||
}
|
||||
|
||||
fn unicode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
|
||||
let frag = input.fragment();
|
||||
let Some(grapheme) = frag.graphemes(true).next() else {
|
||||
return fail(input);
|
||||
};
|
||||
|
||||
let emoji = emojis::get(grapheme);
|
||||
|
||||
if emoji.is_none() {
|
||||
return fail(input);
|
||||
}
|
||||
|
||||
Ok((
|
||||
input.slice(grapheme.len()..),
|
||||
Token::UnicodeEmoji(grapheme.into()),
|
||||
))
|
||||
}
|
||||
|
||||
fn mention<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
|
||||
// TODO: Escaping and skip when preceded by alphanumerics
|
||||
|
||||
|
@ -723,8 +748,9 @@ fn url_chars<'a, T: 'a>(
|
|||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use crate::{url_chars, Span};
|
||||
use crate::{url_chars, Context, Span};
|
||||
use nom::bytes::complete::tag;
|
||||
use nom::multi::many1;
|
||||
|
||||
#[test]
|
||||
fn parse_url_chars() {
|
||||
|
@ -773,4 +799,13 @@ mod test {
|
|||
.into_fragment()
|
||||
);
|
||||
}
|
||||
|
||||
/// Checks that a run of consecutive Unicode emoji is tokenized into one
/// `UnicodeEmoji` token per emoji, with nothing left unparsed.
#[test]
fn parse_emoji() {
    let test = "🥺💜❤️🦊";
    let ctx = Context;
    let (rest, tokens) =
        many1(ctx.partial(Context::unicode_emoji))(Span::from(test)).unwrap();

    // Kept for debugging with `cargo test -- --nocapture`.
    println!("{:#?}", tokens);

    // `many1` stops silently at the first non-emoji, so check explicitly
    // that the whole input was consumed.
    assert!(
        rest.fragment().is_empty(),
        "unparsed input left over: {:?}",
        rest.fragment()
    );

    // The heart carries a variation selector; it must still count as a
    // single emoji rather than being split or rejected.
    assert_eq!(tokens.len(), 4);
    assert!(tokens
        .iter()
        .all(|token| matches!(token, crate::Token::UnicodeEmoji(_))));
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue