Emoji parsing
This commit is contained in:
parent
52dc491a47
commit
8009546bfe
|
@ -748,6 +748,15 @@ dependencies = [
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "emojis"
|
||||||
|
version = "0.6.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4ee61eb945bff65ee7d19d157d39c67c33290ff0742907413fd5eefd29edc979"
|
||||||
|
dependencies = [
|
||||||
|
"phf",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "equivalent"
|
name = "equivalent"
|
||||||
version = "1.0.1"
|
version = "1.0.1"
|
||||||
|
@ -1613,8 +1622,10 @@ dependencies = [
|
||||||
name = "mmm_parser"
|
name = "mmm_parser"
|
||||||
version = "0.2.1-alpha"
|
version = "0.2.1-alpha"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"emojis",
|
||||||
"nom",
|
"nom",
|
||||||
"nom_locate",
|
"nom_locate",
|
||||||
|
"unicode-segmentation",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
|
@ -29,6 +29,7 @@ cached = "0.46"
|
||||||
cfg-if = "1"
|
cfg-if = "1"
|
||||||
chrono = "0.4"
|
chrono = "0.4"
|
||||||
dotenvy = "0.15"
|
dotenvy = "0.15"
|
||||||
|
emojis = "0.6"
|
||||||
futures-core = "0.3"
|
futures-core = "0.3"
|
||||||
futures-util = "0.3"
|
futures-util = "0.3"
|
||||||
headers = "0.3"
|
headers = "0.3"
|
||||||
|
|
|
@ -5,5 +5,7 @@ edition.workspace = true
|
||||||
license = "MIT OR Apache-2.0"
|
license = "MIT OR Apache-2.0"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
emojis = { workspace = true }
|
||||||
nom = { workspace = true }
|
nom = { workspace = true }
|
||||||
nom_locate = { workspace = true }
|
nom_locate = { workspace = true }
|
||||||
|
unicode-segmentation = { workspace = true }
|
||||||
|
|
|
@ -12,6 +12,7 @@ use nom::{IResult, Offset, Slice};
|
||||||
use nom_locate::LocatedSpan;
|
use nom_locate::LocatedSpan;
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
use unicode_segmentation::UnicodeSegmentation;
|
||||||
|
|
||||||
#[derive(Copy, Clone, Debug)]
|
#[derive(Copy, Clone, Debug)]
|
||||||
pub enum MentionType {
|
pub enum MentionType {
|
||||||
|
@ -65,6 +66,8 @@ pub enum Token<'a> {
|
||||||
host: Option<Cow<'a, str>>,
|
host: Option<Cow<'a, str>>,
|
||||||
mention_type: MentionType,
|
mention_type: MentionType,
|
||||||
},
|
},
|
||||||
|
UnicodeEmoji(Cow<'a, str>),
|
||||||
|
ShortcodeEmoji(Cow<'a, str>),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Token<'_> {
|
impl Token<'_> {
|
||||||
|
@ -121,6 +124,10 @@ impl Token<'_> {
|
||||||
host: host.as_ref().map(|v| Cow::Owned(v.clone().into_owned())),
|
host: host.as_ref().map(|v| Cow::Owned(v.clone().into_owned())),
|
||||||
mention_type: *mention_type,
|
mention_type: *mention_type,
|
||||||
},
|
},
|
||||||
|
Token::UnicodeEmoji(code) => Token::UnicodeEmoji(Cow::Owned(code.clone().into_owned())),
|
||||||
|
Token::ShortcodeEmoji(shortcode) => {
|
||||||
|
Token::ShortcodeEmoji(Cow::Owned(shortcode.clone().into_owned()))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -643,6 +650,24 @@ impl Context {
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn unicode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
|
||||||
|
let frag = input.fragment();
|
||||||
|
let Some(grapheme) = frag.graphemes(true).next() else {
|
||||||
|
return fail(input);
|
||||||
|
};
|
||||||
|
|
||||||
|
let emoji = emojis::get(grapheme);
|
||||||
|
|
||||||
|
if emoji.is_none() {
|
||||||
|
return fail(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok((
|
||||||
|
input.slice(grapheme.len()..),
|
||||||
|
Token::UnicodeEmoji(grapheme.into()),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
fn mention<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
|
fn mention<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
|
||||||
// TODO: Escaping and skip when preceded by alphanumerics
|
// TODO: Escaping and skip when preceded by alphanumerics
|
||||||
|
|
||||||
|
@ -723,8 +748,9 @@ fn url_chars<'a, T: 'a>(
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use crate::{url_chars, Span};
|
use crate::{url_chars, Context, Span};
|
||||||
use nom::bytes::complete::tag;
|
use nom::bytes::complete::tag;
|
||||||
|
use nom::multi::many1;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn parse_url_chars() {
|
fn parse_url_chars() {
|
||||||
|
@ -773,4 +799,13 @@ mod test {
|
||||||
.into_fragment()
|
.into_fragment()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Checks that a run of adjacent Unicode emoji is split into one
/// `Token::UnicodeEmoji` per emoji, with nothing left unparsed.
#[test]
fn parse_emoji() {
    let test = "🥺💜❤️🦊";
    let ctx = Context;
    let (rest, tokens) = many1(ctx.partial(Context::unicode_emoji))(Span::from(test)).unwrap();

    // `many1` stops silently at the first failure, so an unrecognised emoji
    // would show up as leftover input rather than as an error.
    assert!(
        rest.fragment().is_empty(),
        "unparsed input left over: {:?}",
        rest.fragment()
    );

    let parsed: Vec<&str> = tokens
        .iter()
        .map(|token| match token {
            crate::Token::UnicodeEmoji(emoji) => emoji.as_ref(),
            other => panic!("expected a Unicode emoji token, got {other:?}"),
        })
        .collect();

    // Four emoji in, four tokens out, and joining them reproduces the input.
    assert_eq!(parsed.len(), 4, "unexpected tokens: {parsed:?}");
    assert_eq!(parsed.concat(), test);
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue