MMM: Fixed hashtag parsing

This commit is contained in:
Natty 2023-10-23 23:52:02 +02:00
parent 86d5c87e9a
commit 42fa83c6e2
Signed by: natty
GPG Key ID: BF6CB659ADEE60EC
1 changed files with 31 additions and 4 deletions

View File

@ -7,7 +7,7 @@ use nom::character::complete::{
satisfy, space1, tab, satisfy, space1, tab,
}; };
use nom::combinator::{eof, fail, map, not, opt, recognize}; use nom::combinator::{eof, fail, map, not, opt, recognize};
use nom::error::{ErrorKind, ParseError}; use nom::error::ErrorKind;
use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1}; use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1};
use nom::sequence::tuple; use nom::sequence::tuple;
use nom::{IResult, Offset, Parser, Slice}; use nom::{IResult, Offset, Parser, Slice};
@ -277,6 +277,14 @@ fn alpha1_unicode(input: Span) -> IResult<Span, Span> {
recognize(many1_count(satisfy(char::is_alphanumeric)))(input) recognize(many1_count(satisfy(char::is_alphanumeric)))(input)
} }
/// Recognizes a run of one or more whitespace characters, as classified by
/// `char::is_whitespace`, without consuming anything that `line_ending`
/// would match.
///
/// On success the returned span covers exactly the consumed whitespace.
/// The parser yields an error when not even a single such character is
/// present at the start of `input`.
#[inline]
fn space1_unicode(input: Span) -> IResult<Span, Span> {
    // A single whitespace character, guarded so that the run stops right
    // before a position where a line ending begins.
    let inline_whitespace = tuple((
        not(line_ending),
        satisfy(|ch: char| ch.is_whitespace()),
    ));

    recognize(many1_count(inline_whitespace))(input)
}
#[inline] #[inline]
fn alphanumeric1_unicode(input: Span) -> IResult<Span, Span> { fn alphanumeric1_unicode(input: Span) -> IResult<Span, Span> {
recognize(many1_count(satisfy(char::is_alphanumeric)))(input) recognize(many1_count(satisfy(char::is_alphanumeric)))(input)
@ -1293,7 +1301,12 @@ impl Context {
} }
fn tag_hashtag<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> { fn tag_hashtag<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
// TODO: Skip when preceded by alphanumerics let (input, maybe_preceded) =
opt(recognize(tuple((alphanumeric1_unicode, tag("#")))))(input)?;
if let Some(preceded) = maybe_preceded {
return Ok((input, Token::PlainText(preceded.into_fragment().into())));
}
let (input, _) = tag("#")(input)?; let (input, _) = tag("#")(input)?;
@ -1347,8 +1360,8 @@ impl Context {
tag(""), tag(""),
))), ))),
recognize(tuple(( recognize(tuple((
not(space1), not(space1_unicode),
not_line_ending, not(line_ending),
not(one_of(".,:;!?#?/[]【】()「」()<>")), not(one_of(".,:;!?#?/[]【】()「」()<>")),
anychar, anychar,
))), ))),
@ -1699,6 +1712,20 @@ text</center>"#
]) ])
); );
assert_eq!(
parse_full("test #hashtag tail"),
Token::Sequence(vec![
Token::PlainText("test ".into()),
Token::Hashtag("hashtag".into()),
Token::PlainText(" tail".into())
])
);
assert_eq!(
parse_full("not#hashtag tail"),
Token::PlainText("not#hashtag tail".into())
);
assert_eq!( assert_eq!(
parse_full("<https://example.com>"), parse_full("<https://example.com>"),
Token::UrlNoEmbed("https://example.com".into()) Token::UrlNoEmbed("https://example.com".into())