Fixed URL parsing and initial flanking rules implementation

This commit is contained in:
Natty 2023-10-08 22:15:55 +02:00
parent 26bd6fe4b2
commit d0d977e6eb
Signed by: natty
GPG Key ID: BF6CB659ADEE60EC
1 changed file with 322 additions and 66 deletions


@@ -1,19 +1,20 @@
use either::Either;
use nom::branch::alt;
-use nom::bytes::complete::tag;
+use nom::bytes::complete::{tag, tag_no_case};
use nom::character::complete::{
-alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of, space1,
-tab,
+alpha1, alphanumeric1, anychar, char as one_char, char, line_ending, not_line_ending, one_of,
+satisfy, space1, tab,
};
use nom::combinator::{eof, fail, map, not, opt, recognize};
use nom::error::ErrorKind;
use nom::multi::{many0, many0_count, many1, many1_count, many_till, separated_list1};
use nom::sequence::tuple;
-use nom::{IResult, Offset, Slice};
+use nom::{Compare, IResult, Offset, Slice};
use nom_locate::LocatedSpan;
use std::borrow::Cow;
use std::collections::HashMap;
use std::convert::{identity, Infallible};
+use std::marker::PhantomData;
use unicode_segmentation::UnicodeSegmentation;
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
@@ -73,6 +74,80 @@ pub enum Token<'a> {
}
impl Token<'_> {
+fn str_content_left(&self) -> Option<&str> {
+match self {
+Token::PlainText(text) => Some(text.as_ref()),
+Token::Sequence(tokens) => tokens.first().and_then(Token::str_content_left),
+Token::Quote(inner) => inner.str_content_left(),
+Token::Small(inner) => inner.str_content_left(),
+Token::BoldItalic(inner) => inner.str_content_left(),
+Token::Bold(inner) => inner.str_content_left(),
+Token::Italic(inner) => inner.str_content_left(),
+Token::Center(inner) => inner.str_content_left(),
+Token::Strikethrough(inner) => inner.str_content_left(),
+Token::PlainTag(tag) => Some(tag.as_ref()),
+Token::UrlRaw(url) => Some(url.as_ref()),
+Token::UrlNoEmbed(url) => Some(url.as_ref()),
+Token::Link { label, .. } => label.str_content_left(),
+Token::Function { inner, .. } => inner.str_content_left(),
+Token::Mention { name, .. } => Some(name.as_ref()),
+Token::UnicodeEmoji(code) => Some(code.as_ref()),
+Token::ShortcodeEmoji(_) => None,
+Token::Hashtag(tag) => Some(tag.as_ref()),
+_ => None,
+}
+}
+fn str_content_right(&self) -> Option<&str> {
+match self {
+Token::PlainText(text) => Some(text.as_ref()),
+Token::Sequence(tokens) => tokens.last().and_then(Token::str_content_right),
+Token::Quote(inner) => inner.str_content_right(),
+Token::Small(inner) => inner.str_content_right(),
+Token::BoldItalic(inner) => inner.str_content_right(),
+Token::Bold(inner) => inner.str_content_right(),
+Token::Italic(inner) => inner.str_content_right(),
+Token::Center(inner) => inner.str_content_right(),
+Token::Strikethrough(inner) => inner.str_content_right(),
+Token::PlainTag(tag) => Some(tag.as_ref()),
+Token::UrlRaw(url) => Some(url.as_ref()),
+Token::UrlNoEmbed(url) => Some(url.as_ref()),
+Token::Link { label, .. } => label.str_content_right(),
+Token::Function { inner, .. } => inner.str_content_right(),
+Token::Mention { name, .. } => Some(name.as_ref()),
+Token::UnicodeEmoji(code) => Some(code.as_ref()),
+Token::Hashtag(tag) => Some(tag.as_ref()),
+_ => None,
+}
+}
+fn inner(&self) -> Token {
+match self {
+plain @ Token::PlainText(_) => plain.clone(),
+sequence @ Token::Sequence(_) => sequence.clone(),
+Token::Quote(inner) => inner.inner(),
+Token::Small(inner) => inner.inner(),
+Token::BoldItalic(inner) => inner.inner(),
+Token::Bold(inner) => inner.inner(),
+Token::Italic(inner) => inner.inner(),
+Token::Center(inner) => inner.inner(),
+Token::Strikethrough(inner) => inner.inner(),
+Token::PlainTag(text) => Token::PlainText(text.clone()),
+Token::InlineCode(code) => Token::PlainText(code.clone()),
+Token::InlineMath(math) => Token::PlainText(math.clone()),
+Token::UrlRaw(url) => Token::PlainText(url.clone()),
+Token::UrlNoEmbed(url) => Token::PlainText(url.clone()),
+Token::Link { label, .. } => label.inner(),
+Token::BlockCode { inner, .. } => Token::PlainText(inner.clone()),
+Token::BlockMath(math) => Token::PlainText(math.clone()),
+Token::Function { inner, .. } => inner.inner(),
+Token::Mention { name, .. } => Token::PlainText(name.clone()),
+Token::UnicodeEmoji(code) => Token::PlainText(code.clone()),
+Token::ShortcodeEmoji(shortcode) => Token::PlainText(shortcode.clone()),
+Token::Hashtag(tag) => Token::PlainText(tag.clone()),
+}
+}
fn owned(&self) -> Token<'static> {
match self {
Token::PlainText(text) => Token::PlainText(Cow::Owned(text.clone().into_owned())),
@@ -129,7 +204,7 @@ impl Token<'_> {
Token::ShortcodeEmoji(shortcode) => {
Token::ShortcodeEmoji(Cow::Owned(shortcode.clone().into_owned()))
}
-Token::Hashtag(url) => Token::Hashtag(Cow::Owned(url.clone().into_owned())),
+Token::Hashtag(tag) => Token::Hashtag(Cow::Owned(tag.clone().into_owned())),
}
}
@@ -245,6 +320,16 @@ fn collect_char_sequence<'a>(
move |chars| func(Cow::Owned(chars.collect()))
}
+#[inline]
+fn alpha1_unicode(input: Span) -> IResult<Span, Span> {
+recognize(many1_count(satisfy(char::is_alphabetic)))(input)
+}
+#[inline]
+fn alphanumeric1_unicode(input: Span) -> IResult<Span, Span> {
+recognize(many1_count(satisfy(char::is_alphanumeric)))(input)
+}
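Both helpers exist because nom's bundled alpha1/alphanumeric1 only match ASCII. A self-contained sketch of the difference (assumes nom 7 over &str input; alnum_unicode is an illustrative stand-in for alphanumeric1_unicode without the Span wrapper):

    use nom::character::complete::{alphanumeric1, satisfy};
    use nom::combinator::recognize;
    use nom::multi::many1_count;
    use nom::IResult;

    // Unicode-aware counterpart of alphanumeric1, built the same way as above.
    fn alnum_unicode(input: &str) -> IResult<&str, &str> {
        recognize(many1_count(satisfy(char::is_alphanumeric)))(input)
    }

    fn main() {
        // The ASCII-only parser stops at the first accented character.
        let ascii: IResult<&str, &str> = alphanumeric1("háčky");
        assert_eq!(ascii.unwrap().1, "h");
        // The satisfy-based parser consumes the whole word.
        assert_eq!(alnum_unicode("háčky").unwrap().1, "háčky");
    }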
fn spliced<'a>(
segments: &[Span<'a>],
func: impl Fn(Span) -> IResult<Span, Token>,
@@ -316,15 +401,16 @@ fn space(input: Span) -> IResult<Span, Token> {
Ok((input, Token::PlainText(frag.into_fragment().into())))
}
-struct Matcher<'a, 'b, T> {
+#[derive(Copy, Clone)]
+struct Matcher<'a, 'b, T: Clone> {
matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a),
collector: &'a (dyn Fn(&mut dyn Iterator<Item = T>) -> Token<'b> + 'a),
-_phantom_closure: std::marker::PhantomData<&'a ()>,
-_phantom_data: std::marker::PhantomData<&'b ()>,
-_phantom_output: std::marker::PhantomData<fn() -> T>,
+_phantom_closure: PhantomData<&'a ()>,
+_phantom_data: PhantomData<&'b ()>,
+_phantom_output: PhantomData<fn() -> T>,
}
-impl<'a, 'b, T> Matcher<'a, 'b, T> {
+impl<'a, 'b, T: Clone> Matcher<'a, 'b, T> {
fn new(
matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a),
collector: &'a (dyn Fn(&mut dyn Iterator<Item = T>) -> Token<'b> + 'a),
@@ -332,9 +418,9 @@ impl<'a, 'b, T> Matcher<'a, 'b, T> {
Self {
matcher_inner,
collector,
-_phantom_closure: std::marker::PhantomData,
-_phantom_data: std::marker::PhantomData,
-_phantom_output: std::marker::PhantomData,
+_phantom_closure: PhantomData,
+_phantom_data: PhantomData,
+_phantom_output: PhantomData,
}
}
}
@@ -345,33 +431,60 @@ impl<'a, 'b> Matcher<'a, 'b, Infallible> {
Self {
matcher_inner: &fail::<_, Infallible, _>,
collector: &|_| unreachable!(),
-_phantom_closure: std::marker::PhantomData,
-_phantom_data: std::marker::PhantomData,
-_phantom_output: std::marker::PhantomData,
+_phantom_closure: PhantomData,
+_phantom_data: PhantomData,
+_phantom_output: PhantomData,
}
}
}
-struct Context;
+#[derive(Copy, Clone, Debug)]
+enum FlankingRule {
+Lenient,
+Strict,
+DontCare,
+}
+struct FlankingDelim<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>>(
+T,
+FlankingRule,
+PhantomData<&'a ()>,
+);
+impl<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>> From<(T, FlankingRule)>
+for FlankingDelim<'a, T>
+{
+fn from((func, rule): (T, FlankingRule)) -> Self {
+FlankingDelim(func, rule, PhantomData)
+}
+}
+impl<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>> From<T> for FlankingDelim<'a, T> {
+fn from(func: T) -> Self {
+FlankingDelim(func, FlankingRule::DontCare, PhantomData)
+}
+}
+pub struct Context;
impl Context {
#[inline]
-const fn partial(
+fn partial(
&self,
func: impl for<'a> Fn(&Self, Span<'a>) -> IResult<Span<'a>, Token<'a>> + 'static,
) -> impl for<'a> Fn(Span<'a>) -> IResult<Span<'a>, Token<'a>> + '_ {
move |input| func(self, input)
}
-fn full<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+pub fn full<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
map(many1(self.partial(Self::full_single)), Token::Sequence)(input)
}
-fn inline<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+pub fn inline<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
map(many1(self.partial(Self::inline_single)), Token::Sequence)(input)
}
-fn inline_label_safe<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+pub fn inline_label_safe<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
map(
many1(self.partial(Self::inline_label_safe_single)),
Token::Sequence,
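The pair of From impls added above is what lets tag_delimited (next hunk) accept either a bare delimiter parser or a (parser, FlankingRule) tuple for the same parameter. A self-contained analogue of that conversion pattern (all names here are illustrative, not from the crate):

    #[derive(Copy, Clone, Debug)]
    enum Rule {
        Strict,
        DontCare,
    }

    struct Delim<F: Fn(&str) -> bool>(F, Rule);

    impl<F: Fn(&str) -> bool> From<(F, Rule)> for Delim<F> {
        fn from((f, r): (F, Rule)) -> Self {
            Delim(f, r)
        }
    }

    impl<F: Fn(&str) -> bool> From<F> for Delim<F> {
        fn from(f: F) -> Self {
            Delim(f, Rule::DontCare)
        }
    }

    // Accepts both spellings, like tag_delimited's opening_tag/closing_tag.
    fn takes_delim<F: Fn(&str) -> bool>(d: impl Into<Delim<F>>) -> Rule {
        let Delim(_parser, rule) = d.into();
        rule
    }

    fn main() {
        // A bare matcher defaults to Rule::DontCare...
        println!("{:?}", takes_delim(|s: &str| s.starts_with("<b>")));
        // ...while a tuple carries an explicit rule.
        println!("{:?}", takes_delim((|s: &str| s.starts_with("__"), Rule::Strict)));
    }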
@@ -606,14 +719,21 @@ impl Context {
}
#[inline]
-fn tag_delimited<'a, 'b: 'a, T, S>(
+fn tag_delimited<'a, 'b: 'a, T: Clone, S: Clone, FOpen, FClose>(
&'a self,
-opening_tag: impl Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
-closing_tag: impl Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
+opening_tag: impl Into<FlankingDelim<'b, FOpen>> + 'a,
+closing_tag: impl Into<FlankingDelim<'b, FClose>> + 'a,
escape: bool,
matcher: Matcher<'a, 'b, T>,
fallback: Matcher<'a, 'b, S>,
-) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token<'b>> + '_ {
+) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token<'b>> + '_
+where
+FOpen: Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
+FClose: Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
+{
+let FlankingDelim(opening_tag, opening_rule, ..) = opening_tag.into();
+let FlankingDelim(closing_tag, closing_rule, ..) = closing_tag.into();
move |input| {
if escape {
if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), &opening_tag))(input) {
@@ -662,10 +782,44 @@ impl Context {
));
}
-let (input, (inner, _)) = res?;
+let (input, (inner, closing)) = res?;
let mut inner = inner.into_iter().map(|(_, t)| t);
-Ok((input, (matcher.collector)(&mut inner)))
+let inner_tok = (matcher.collector)(&mut inner);
+let correct_left_flanking =
+if let FlankingRule::Lenient | FlankingRule::Strict = opening_rule {
+let text_left = inner_tok.str_content_left();
+!(text_left.is_some_and(|s| s.starts_with(char::is_whitespace))
+|| text_left.is_none())
+} else {
+true
+};
+let correct_right_flanking =
+if let FlankingRule::Lenient | FlankingRule::Strict = closing_rule {
+let text_right = inner_tok.str_content_right();
+!(text_right.is_some_and(|s| s.ends_with(char::is_whitespace))
+|| text_right.is_none())
+} else {
+true
+};
+// TODO: Unfinished flanking rules
+let correct_flanking = correct_left_flanking && correct_right_flanking;
+if !correct_flanking {
+return Ok((
+input,
+Token::Sequence(vec![
+Token::PlainText(begin.fragment_between(&post_open).into()),
+inner_tok.inner().owned(),
+Token::PlainText(closing.into_fragment().into()),
+]),
+));
+}
+Ok((input, Token::Sequence(vec![inner_tok])))
}
}
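In effect: when a lenient or strict delimiter is not properly flanked (the enclosed text is empty, or starts/ends with whitespace on the rule-bearing side), the construct degrades to its literal text instead of emphasis, which the new tests below exercise (`* italic *` stays plain). A minimal standalone sketch of the whitespace check applied above:

    fn main() {
        // Mirrors correct_left_flanking / correct_right_flanking: reject when
        // there is no inner text or it starts/ends with whitespace.
        let flanks = |inner: &str| {
            let left = !inner.is_empty() && !inner.starts_with(char::is_whitespace);
            let right = !inner.is_empty() && !inner.ends_with(char::is_whitespace);
            left && right
        };
        assert!(flanks("italic")); // "*italic*" parses as emphasis
        assert!(!flanks(" italic ")); // "* italic *" falls back to plain text
    }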
@@ -720,12 +874,12 @@ impl Context {
}
fn tag_plain<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
-let opening_tag = &tag("<small>");
-let closing_tag = &tag("</small>");
+let opening_tag = &tag("<plain>");
+let closing_tag = &tag("</plain>");
let (input, _) = opening_tag(input)?;
let (input, text) = map(
-recognize(many1(tuple((not_line_ending, not(closing_tag))))),
+recognize(many1(tuple((not(line_ending), not(closing_tag), anychar)))),
Span::into_fragment,
)(input)?;
let (input, _) = closing_tag(input)?;
@@ -735,8 +889,8 @@ impl Context {
fn tag_small<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
-tag("<small>"),
-tag("</small>"),
+tag_no_case("<small>"),
+tag_no_case("</small>"),
false,
Matcher::new(
&self.partial(Self::inline_single),
@@ -749,11 +903,10 @@ impl Context {
)(input)
}
-// TODO: CommonMark flanking rules
fn tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
-tag("***"),
-tag("***"),
+(tag("***"), FlankingRule::Lenient),
+(tag("***"), FlankingRule::Lenient),
true,
Matcher::new(
&self.partial(Self::inline_single),
@@ -766,11 +919,10 @@ impl Context {
)(input)
}
-// TODO: CommonMark flanking rules
fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
-tag("___"),
-tag("___"),
+(tag("___"), FlankingRule::Strict),
+(tag("___"), FlankingRule::Strict),
true,
Matcher::new(
&self.partial(Self::inline_single),
@@ -785,8 +937,8 @@ impl Context {
fn tag_bold<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
-tag("<b>"),
-tag("</b>"),
+tag_no_case("<b>"),
+tag_no_case("</b>"),
false,
Matcher::new(
&self.partial(Self::inline_single),
@@ -799,11 +951,10 @@ impl Context {
)(input)
}
-// TODO: CommonMark flanking rules
fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
-tag("**"),
-tag("**"),
+(tag("**"), FlankingRule::Lenient),
+(tag("**"), FlankingRule::Lenient),
true,
Matcher::new(
&self.partial(Self::inline_single),
@@ -816,11 +967,10 @@ impl Context {
)(input)
}
-// TODO: CommonMark flanking rules
fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
-tag("__"),
-tag("__"),
+(tag("__"), FlankingRule::Strict),
+(tag("__"), FlankingRule::Strict),
true,
Matcher::new(
&self.partial(Self::inline_single),
@@ -835,8 +985,8 @@ impl Context {
fn tag_italic<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
-tag("<i>"),
-tag("</i>"),
+tag_no_case("<i>"),
+tag_no_case("</i>"),
false,
Matcher::new(
&self.partial(Self::inline_single),
@@ -849,11 +999,10 @@ impl Context {
)(input)
}
-// TODO: CommonMark flanking rules
fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
-tag("*"),
-tag("*"),
+(tag("*"), FlankingRule::Lenient),
+(tag("*"), FlankingRule::Lenient),
true,
Matcher::new(
&self.partial(Self::inline_single),
@@ -866,11 +1015,10 @@ impl Context {
)(input)
}
-// TODO: CommonMark flanking rules
fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
-tag("_"),
-tag("_"),
+(tag("_"), FlankingRule::Strict),
+(tag("_"), FlankingRule::Strict),
true,
Matcher::new(
&self.partial(Self::inline_single),
@@ -885,8 +1033,8 @@ impl Context {
fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
-tag("<s>"),
-tag("</s>"),
+tag_no_case("<s>"),
+tag_no_case("</s>"),
false,
Matcher::new(
&self.partial(Self::inline_single),
@@ -899,11 +1047,10 @@ impl Context {
)(input)
}
-// TODO: CommonMark flanking rules
fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
self.tag_delimited(
-tag("~~"),
-tag("~~"),
+(tag("~~"), FlankingRule::Lenient),
+(tag("~~"), FlankingRule::Lenient),
true,
Matcher::new(
&move |input| {
@@ -1037,20 +1184,42 @@ impl Context {
}
fn shortcode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
-// TODO: Fail when preceded by alphanumerics
+if let (plain_out, Some(plain)) = map(
+opt(recognize(tuple((
+alphanumeric1_unicode,
+self.partial(Self::shortcode_emoji),
+)))),
+|o| o.map(Span::into_fragment),
+)(input)?
+{
+return Ok((plain_out, Token::PlainText(plain.into())));
+}
let (input, _) = tag(":")(input)?;
let (input, shortcode) = map(
-recognize(many1(alt((alphanumeric1, recognize(one_of("_+-")))))),
+recognize(many1(alt((
+alphanumeric1_unicode,
+recognize(one_of("_+-")),
+)))),
Span::into_fragment,
)(input)?;
let (input, _) = tag(":")(input)?;
-let (input, _) = not(alphanumeric1)(input)?;
+let (input, _) = not(alphanumeric1_unicode)(input)?;
Ok((input, Token::ShortcodeEmoji(shortcode.into())))
}
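The opt(recognize(tuple((...)))) prelude added above is a guard: if a valid shortcode is directly preceded by a word character, the prefix and the shortcode are consumed together as plain text, so bottom:blobfox: never becomes an emoji (see the parse_shortcodes test below). A self-contained sketch of the same recursion trick (assumes nom 7; ASCII-only and with illustrative names for brevity):

    use nom::bytes::complete::tag;
    use nom::character::complete::alphanumeric1;
    use nom::combinator::{not, opt, recognize};
    use nom::sequence::tuple;
    use nom::IResult;

    #[derive(Debug, PartialEq)]
    enum Tok<'a> {
        Plain(&'a str),
        Emoji(&'a str),
    }

    fn shortcode(input: &str) -> IResult<&str, Tok<'_>> {
        // Guard: a word character followed by a valid shortcode is re-consumed
        // whole as plain text, like the parser above.
        if let (rest, Some(plain)) = opt(recognize(tuple((alphanumeric1, shortcode))))(input)? {
            return Ok((rest, Tok::Plain(plain)));
        }
        let (input, _) = tag(":")(input)?;
        let (input, name) = alphanumeric1(input)?;
        let (input, _) = tag(":")(input)?;
        let (input, _) = not(alphanumeric1)(input)?;
        Ok((input, Tok::Emoji(name)))
    }

    fn main() {
        assert_eq!(shortcode(":cat:").unwrap().1, Tok::Emoji("cat"));
        assert_eq!(shortcode("abc:cat:").unwrap().1, Tok::Plain("abc:cat:"));
    }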
fn tag_mention<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
-// TODO: Escaping and skip when preceded by alphanumerics
+if let (plain_out, Some(plain)) = map(
+opt(recognize(tuple((
+alt((tag("\\"), alphanumeric1_unicode)),
+self.partial(Self::tag_mention),
+)))),
+|o| o.map(Span::into_fragment),
+)(input)?
+{
+return Ok((plain_out, Token::PlainText(plain.into())));
+}
let tags = one_of("@!");
let (input, mention_type) = map(tags, |c| match c {
@@ -1123,12 +1292,12 @@ fn protocol(input: Span) -> IResult<Span, Span> {
#[inline]
fn url_chars_base(input: Span) -> IResult<Span, Span> {
-recognize(alt((
-alpha1,
+alt((
+alphanumeric1_unicode,
recognize(tuple((tag("["), many_till(url_chars_base, tag("]"))))),
recognize(tuple((tag("("), many_till(url_chars_base, tag(")"))))),
recognize(one_of(".,_/:%#$&?!~=+-@")),
-)))(input)
+))(input)
}
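Dropping the outer recognize is sound because every alt branch already returns a Span; the recursion into "[...]"/"(...)" is what keeps balanced bracket pairs (IPv6 hosts, parenthesized path segments) inside the URL while an unmatched trailing bracket still terminates it. A standalone sketch over &str (assumes nom 7):

    use nom::branch::alt;
    use nom::bytes::complete::tag;
    use nom::character::complete::{alphanumeric1, one_of};
    use nom::combinator::recognize;
    use nom::multi::{many1, many_till};
    use nom::sequence::tuple;
    use nom::IResult;

    // Same shape as url_chars_base above, minus the Span/Unicode machinery.
    fn url_chunk(input: &str) -> IResult<&str, &str> {
        alt((
            alphanumeric1,
            recognize(tuple((tag("["), many_till(url_chunk, tag("]"))))),
            recognize(tuple((tag("("), many_till(url_chunk, tag(")"))))),
            recognize(one_of(".,_/:%#$&?!~=+-@")),
        ))(input)
    }

    fn main() {
        // The bracketed IPv6 host stays inside the URL...
        let (rest, url) = recognize(many1(url_chunk))("https://[::2f:1]/nya)").unwrap();
        assert_eq!(url, "https://[::2f:1]/nya");
        // ...while the unpaired ")" that would close a Markdown link is left over.
        assert_eq!(rest, ")");
    }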
#[inline]
@@ -1221,6 +1390,21 @@ mod test {
Token::Italic(Box::new(Token::PlainText("italic".into()))),
);
+assert_eq!(
+parse_full(r#"* italic *"#),
+Token::PlainText("* italic *".into())
+);
+assert_eq!(
+parse_full(r#"_ italic *"#),
+Token::PlainText("_ italic *".into())
+);
+assert_eq!(
+parse_full(r#"*"italic"*"#),
+Token::Italic(Box::new(Token::PlainText("\"italic\"".into())))
+);
assert_eq!(
parse_full(r#"not code `code` also not code"#),
Token::Sequence(vec![
@@ -1356,6 +1540,47 @@ text</center>"#
#[test]
fn parse_link() {
+assert_eq!(
+parse_full("IPv4 test: <https://0>"),
+Token::Sequence(vec![
+Token::PlainText("IPv4 test: ".into()),
+Token::UrlNoEmbed("https://0".into())
+])
+);
+assert_eq!(
+parse_full("IPv4 test: <https://127.0.0.1>"),
+Token::Sequence(vec![
+Token::PlainText("IPv4 test: ".into()),
+Token::UrlNoEmbed("https://127.0.0.1".into())
+])
+);
+assert_eq!(
+parse_full("IPv6 test: <https://[::2f:1]/nya>"),
+Token::Sequence(vec![
+Token::PlainText("IPv6 test: ".into()),
+Token::UrlNoEmbed("https://[::2f:1]/nya".into())
+])
+);
+assert_eq!(
+parse_full("IPv6 test: https://[::2f:1]/nya"),
+Token::Sequence(vec![
+Token::PlainText("IPv6 test: ".into()),
+Token::UrlRaw("https://[::2f:1]/nya".into())
+])
+);
+// IDNs
+assert_eq!(
+parse_full("IDN test: https://www.háčkyčárky.cz/"),
+Token::Sequence(vec![
+Token::PlainText("IDN test: ".into()),
+Token::UrlRaw("https://www.háčkyčárky.cz/".into())
+])
+);
assert_eq!(
parse_full("Link test: [label](https://example.com)"),
Token::Sequence(vec![
@@ -1440,6 +1665,11 @@ text</center>"#
}
);
+assert_eq!(
+parse_full("email@notactuallyamenmtion.org"),
+Token::PlainText("email@notactuallyamenmtion.org".into())
+);
assert_eq!(
parse_full("hgsjlkdsa @tag fgahjsdkd"),
Token::Sequence(vec![
@@ -1532,6 +1762,32 @@ text</center>"#
);
}
+#[test]
+fn parse_shortcodes() {
+assert_eq!(
+parse_full(":bottom:"),
+Token::ShortcodeEmoji("bottom".into())
+);
+assert_eq!(
+parse_full(":bottom::blobfox:"),
+Token::Sequence(vec![
+Token::ShortcodeEmoji("bottom".into()),
+Token::ShortcodeEmoji("blobfox".into())
+])
+);
+assert_eq!(
+parse_full(":bottom:blobfox"),
+Token::PlainText(":bottom:blobfox".into())
+);
+assert_eq!(
+parse_full("bottom:blobfox:"),
+Token::PlainText("bottom:blobfox:".into())
+);
+}
#[test]
fn parse_emoji() {
assert_eq!(