diff --git a/magnetar_mmm_parser/src/lib.rs b/magnetar_mmm_parser/src/lib.rs
index 2f76532..4806587 100644
--- a/magnetar_mmm_parser/src/lib.rs
+++ b/magnetar_mmm_parser/src/lib.rs
@@ -7,7 +7,7 @@ use nom::character::complete::{
     satisfy, space1, tab,
 };
 use nom::combinator::{eof, fail, map, not, opt, recognize};
-use nom::error::{ErrorKind, ParseError};
+use nom::error::ErrorKind;
 use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1};
 use nom::sequence::tuple;
 use nom::{IResult, Offset, Parser, Slice};
@@ -277,6 +277,14 @@ fn alpha1_unicode(input: Span) -> IResult<Span, Span> {
     recognize(many1_count(satisfy(char::is_alphanumeric)))(input)
 }
 
+#[inline]
+fn space1_unicode(input: Span) -> IResult<Span, Span> {
+    recognize(many1_count(tuple((
+        not(line_ending),
+        satisfy(char::is_whitespace),
+    ))))(input)
+}
+
 #[inline]
 fn alphanumeric1_unicode(input: Span) -> IResult<Span, Span> {
     recognize(many1_count(satisfy(char::is_alphanumeric)))(input)
@@ -1293,7 +1301,12 @@ impl Context {
     }
 
     fn tag_hashtag<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
-        // TODO: Skip when preceded by alphanumerics
+        let (input, maybe_preceded) =
+            opt(recognize(tuple((alphanumeric1_unicode, tag("#")))))(input)?;
+
+        if let Some(preceded) = maybe_preceded {
+            return Ok((input, Token::PlainText(preceded.into_fragment().into())));
+        }
 
         let (input, _) = tag("#")(input)?;
 
@@ -1347,8 +1360,8 @@ impl Context {
                 tag(")"),
             ))),
             recognize(tuple((
-                not(space1),
-                not_line_ending,
+                not(space1_unicode),
+                not(line_ending),
                 not(one_of(".,:;!?#?/[]【】()「」()<>")),
                 anychar,
             ))),
@@ -1699,6 +1712,20 @@ text"#
             ])
         );
 
+        assert_eq!(
+            parse_full("test #hashtag tail"),
+            Token::Sequence(vec![
+                Token::PlainText("test ".into()),
+                Token::Hashtag("hashtag".into()),
+                Token::PlainText(" tail".into())
+            ])
+        );
+
+        assert_eq!(
+            parse_full("not#hashtag tail"),
+            Token::PlainText("not#hashtag tail".into())
+        );
+
         assert_eq!(
             parse_full("<https://example.com>"),
             Token::UrlNoEmbed("https://example.com".into())