diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs
index a3ddcd1..93f4da1 100644
--- a/magnetar_mmm_parser/src/lib.rs
+++ b/magnetar_mmm_parser/src/lib.rs
@@ -68,6 +68,7 @@ pub enum Token<'a> {
     },
     UnicodeEmoji(Cow<'a, str>),
     ShortcodeEmoji(Cow<'a, str>),
+    Hashtag(Cow<'a, str>),
 }
 
 impl Token<'_> {
@@ -128,6 +129,7 @@ impl Token<'_> {
             Token::ShortcodeEmoji(shortcode) => {
                 Token::ShortcodeEmoji(Cow::Owned(shortcode.clone().into_owned()))
             }
+            Token::Hashtag(hashtag) => Token::Hashtag(Cow::Owned(hashtag.clone().into_owned())),
         }
     }
 }
@@ -703,6 +705,49 @@ impl Context {
             },
         ))
     }
+
+    /// Parses a hashtag: a literal `#` followed by one or more [`hashtag_chars`] matches.
+    ///
+    /// On success, returns the remaining input and a [`Token::Hashtag`] holding the matched
+    /// text without the leading `#`. Fails if `input` does not start with `#` or if no
+    /// hashtag character follows it.
+    fn hashtag<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        // TODO: Skip when preceded by alphanumerics
+
+        let (input, _) = tag("#")(input)?;
+
+        let (input, hashtag_text) =
+            map(recognize(many1(hashtag_chars)), Span::into_fragment)(input)?;
+
+        Ok((input, Token::Hashtag(hashtag_text.into())))
+    }
+}
+
+/// Recognizes a single unit of hashtag text.
+///
+/// A unit is either one bracketed group (`(…)`, `[…]` or `「…」`) wrapping exactly one nested
+/// unit, or a single character that is not a space/tab, not the start of a line ending, and
+/// not one of the delimiter characters listed in the `one_of` set below.
+#[inline]
+fn hashtag_chars(input: Span) -> IResult<Span, Span> {
+    recognize(alt((
+        recognize(tuple((tag("("), hashtag_chars, tag(")")))),
+        recognize(tuple((tag("["), hashtag_chars, tag("]")))),
+        recognize(tuple((tag("「"), hashtag_chars, tag("」")))),
+        // NOTE(review): this branch is byte-identical to the first one and can never match on
+        // its own; it was presumably meant to use full-width parentheses (U+FF08/U+FF09), and
+        // the repeated `?` and `()` in the set below look like the same mix-up. Confirm against
+        // the intended grammar before changing the literals.
+        recognize(tuple((tag("("), hashtag_chars, tag(")")))),
+        recognize(tuple((
+            not(space1),
+            // Negative lookahead only: `not_line_ending` would consume the rest of the line
+            // here, so a "character" would swallow everything up to and including the newline.
+            not(alt((tag("\n"), tag("\r\n")))),
+            not(one_of(".,:;!?#?/[]【】()「」()<>")),
+            anychar,
+        ))),
+    )))(input)
 }
 
 #[inline]