Implemented URL parsing

parent 24d44632e0
commit 9b26691ff4

@@ -1,10 +1,10 @@
 use nom::branch::alt;
 use nom::bytes::complete::tag;
-use nom::character::complete;
 use nom::character::complete::{
-    alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, tab,
+    alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of, space1,
+    tab,
 };
-use nom::combinator::{fail, not, opt, recognize};
+use nom::combinator::{eof, fail, not, opt, recognize};
 use nom::error::ErrorKind;
 use nom::multi::{many0, many0_count, many1, many1_count, separated_list1};
 use nom::sequence::tuple;
@@ -28,6 +28,13 @@ pub enum Token<'a> {
     PlainTag(Cow<'a, str>),
     InlineCode(Cow<'a, str>),
     InlineMath(Cow<'a, str>),
+    UrlRaw(Cow<'a, str>),
+    UrlNoEmbed(Cow<'a, str>),
+    Link {
+        label: Cow<'a, str>,
+        href: Cow<'a, str>,
+        embed: bool,
+    },
     BlockCode {
         lang: Option<Cow<'a, str>>,
         inner: Cow<'a, str>,
@@ -56,6 +63,13 @@ impl Token<'_> {
             Token::PlainTag(tag) => Token::PlainTag(Cow::Owned(tag.clone().into_owned())),
             Token::InlineCode(code) => Token::InlineCode(Cow::Owned(code.clone().into_owned())),
             Token::InlineMath(math) => Token::InlineMath(Cow::Owned(math.clone().into_owned())),
+            Token::UrlRaw(url) => Token::UrlRaw(Cow::Owned(url.clone().into_owned())),
+            Token::UrlNoEmbed(url) => Token::UrlNoEmbed(Cow::Owned(url.clone().into_owned())),
+            Token::Link { embed, label, href } => Token::Link {
+                embed: *embed,
+                label: Cow::Owned(label.clone().into_owned()),
+                href: Cow::Owned(href.clone().into_owned()),
+            },
             Token::BlockCode { inner, lang } => Token::BlockCode {
                 lang: lang.as_ref().map(|l| Cow::Owned(l.clone().into_owned())),
                 inner: Cow::Owned(inner.clone().into_owned()),
@@ -184,12 +198,8 @@ fn spliced<'a>(
 }
 
 fn space(input: Span) -> IResult<Span, Token> {
-    let start = input;
-    let (input, _) = alt((complete::char('\u{0020}'), complete::char('\u{3000}'), tab))(input)?;
-    Ok((
-        input,
-        Token::PlainText(start.fragment_between(&input).into()),
-    ))
+    let (input, frag) = recognize(alt((one_char('\u{0020}'), one_char('\u{3000}'), tab)))(input)?;
+    Ok((input, Token::PlainText(frag.into_fragment().into())))
 }
 
 struct Context;
@@ -213,6 +223,11 @@ impl Context {
         Ok((input, token))
     }
 
+    fn inline_no_link<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, token) = alt((self.partial(Self::tag_small), self.partial(Self::text)))(input)?;
+        Ok((input, token))
+    }
+
     fn tag_quote<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
         let (input, leading_spaces) = tuple((opt(line_ending), opt(line_ending)))(input)?;
 
@@ -550,4 +565,147 @@ impl Context {
             Token::PlainText(before.fragment_between(&input).into()),
         ))
     }
+
+    fn url<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, url_span) = recognize(tuple((
+            protocol,
+            url_chars(|input| not(url_chars_base)(input), false),
+        )))(input)?;
+
+        let url = url_span.into_fragment();
+        let url_bytes = url.as_bytes();
+
+        // Strip punctuation at the end of sentences that might have been consumed as a part of the URL
+        let final_url = if matches!(url_bytes.last(), Some(b'.' | b',' | b'?')) {
+            url.slice(..url.len() - 1)
+        } else {
+            url
+        };
+
+        Ok((input, Token::UrlRaw(Cow::from(final_url))))
+    }
+
+    fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, _) = tag("<")(input)?;
+        let (input, url_span) = recognize(tuple((protocol, url_chars(tag(">"), true))))(input)?;
+        let (input, _) = tag(">")(input)?;
+
+        Ok((input, Token::UrlRaw(Cow::from(url_span.into_fragment()))))
+    }
+
+    fn link<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token<'a>> {
+        let (input, no_embed) = opt(tag("?"))(input)?;
+        let (input, _) = tag("[")(input)?;
+        let (input, _) = not(tag("["))(input)?;
+        let (input, label_span) =
+            recognize(many1(tuple((not(tag("](")), not_line_ending))))(input)?;
+        let (input, _) = tag("]")(input)?;
+        let (input, _) = tag("(")(input)?;
+        let (input, url_span) = recognize(tuple((protocol, url_chars(tag("]"), true))))(input)?;
+        let (input, _) = tag(")")(input)?;
+
+        Ok((
+            input,
+            Token::Link {
+                label: label_span.into_fragment().into(),
+                href: url_span.into_fragment().into(),
+                embed: no_embed.is_none(),
+            },
+        ))
+    }
+}
+
+#[inline]
+fn protocol(input: Span) -> IResult<Span, Span> {
+    alt((tag("https://"), tag("http://")))(input)
+}
+
+#[inline]
+fn url_chars_base(input: Span) -> IResult<Span, Span> {
+    recognize(alt((alpha1, recognize(one_of(".,_/:%#$&?!~=+-()[]@")))))(input)
+}
+
+#[inline]
+fn url_chars<'a, T: 'a>(
+    terminator: impl Fn(Span<'a>) -> IResult<Span<'a>, T> + 'a,
+    spaces: bool,
+) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'a {
+    let terminating = move |input| {
+        tuple((
+            &terminator,
+            alt((
+                space1,
+                line_ending,
+                eof,
+                recognize(one_of("([<'\"")),
+                recognize(tuple((
+                    alt((alpha1, recognize(one_of("*")))),
+                    alt((space1, line_ending, eof)),
+                ))),
+            )),
+        ))(input)
+    };
+
+    let chars = tuple((
+        not(tuple((space1, eof))),
+        not(tuple((space1, tag("\"")))),
+        not(tuple((opt(space1), terminating))),
+        alt((url_chars_base, if spaces { space1 } else { fail })),
+    ));
+
+    recognize(many1_count(chars))
+}
+
+#[cfg(test)]
+mod test {
+    use crate::{url_chars, Span};
+    use nom::bytes::complete::tag;
+
+    #[test]
+    fn parse_url_chars() {
+        let test1 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security))";
+        assert_eq!(
+            "https://en.wikipedia.org/wiki/Sandbox_(computer_security)",
+            url_chars(tag(")"), true)(Span::new(test1))
+                .unwrap()
+                .1
+                .into_fragment()
+        );
+
+        let test2 = "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))";
+        assert_eq!(
+            "https://en.wikipedia.org/wiki/Sandbox_(computer_security))",
+            url_chars(tag(")"), true)(Span::new(test2))
+                .unwrap()
+                .1
+                .into_fragment()
+        );
+
+        let test3 = "https://en.wikipedia.org/wiki/(";
+        assert_eq!(
+            test3,
+            url_chars(tag(")"), true)(Span::new(test3))
+                .unwrap()
+                .1
+                .into_fragment()
+        );
+
+        let test4 = "https://cs.wikipedia.org/wiki/Among_Us ";
+        assert_eq!(
+            "https://cs.wikipedia.org/wiki/Among_Us",
+            url_chars(tag(")"), true)(Span::new(test4))
+                .unwrap()
+                .1
+                .into_fragment()
+        );
+
+        let test5 = "https://cs.wikipedia.org/wiki/Among Us )";
+        assert_eq!(
+            "https://cs.wikipedia.org/wiki/Among Us",
+            url_chars(tag(")"), true)(Span::new(test5))
+                .unwrap()
+                .1
+                .into_fragment()
+        );
+    }
 }
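
As a rough illustration of the new behavior, a follow-up in-crate test along these lines could exercise the sentence-punctuation stripping in Context::url. This is a hypothetical sketch, not part of this commit; it assumes Context, Token, and Span are reachable from a test module inside the crate, the same way url_chars already is in the test above.

// Hypothetical follow-up test, not part of this commit.
#[cfg(test)]
mod url_test {
    use crate::{Context, Span, Token};

    #[test]
    fn url_strips_trailing_sentence_punctuation() {
        // A bare URL is captured as Token::UrlRaw; the '.' after it is treated
        // as sentence punctuation rather than part of the URL.
        let (_rest, token) = Context
            .url(Span::new("https://example.com/page. More text"))
            .unwrap();
        assert!(matches!(token, Token::UrlRaw(url) if url == "https://example.com/page"));
    }
}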