magnetar/magnetar_mmm_parser/src/lib.rs

2417 lines
79 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use compact_str::{CompactString, ToCompactString};
use either::Either;
use nom::branch::alt;
use nom::bytes::complete::{tag, tag_no_case};
use nom::character::complete::{
alpha1, alphanumeric1, anychar, char as one_char, line_ending, not_line_ending, one_of,
satisfy, space1, tab,
};
use nom::combinator::{eof, fail, map, not, opt, peek, recognize};
use nom::error::ErrorKind;
use nom::multi::{many0_count, many1, many1_count, many_till, separated_list1};
use nom::sequence::tuple;
use nom::{IResult, Offset, Parser, Slice};
use nom_locate::LocatedSpan;
use quick_xml::events::{BytesText, Event};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::convert::{identity, Infallible};
use std::io::{Cursor, Write};
use std::marker::PhantomData;
use strum::IntoStaticStr;
use tracing::trace;
use unicode_segmentation::UnicodeSegmentation;
/// Kind of entity referenced by a [`Token::Mention`].
///
/// Both the serde representation and the strum `&'static str` conversion are
/// configured to use `snake_case` variant names.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Deserialize, Serialize, IntoStaticStr)]
// The alternative would be to implement a serde serializer for this one enum, but that's disgusting
#[strum(serialize_all = "snake_case")]
#[serde(rename_all = "snake_case")]
pub enum MentionType {
/// Community mention; `to_char` maps it to `'!'`.
Community,
/// User mention; `to_char` maps it to `'@'`.
User,
/// Matrix user mention; `to_char` maps it to `':'`.
MatrixUser,
}
impl MentionType {
pub fn to_char(&self) -> char {
match self {
MentionType::Community => '!',
MentionType::User => '@',
MentionType::MatrixUser => ':',
}
}
pub fn separator(&self) -> char {
match self {
MentionType::Community | MentionType::User => '@',
MentionType::MatrixUser => ':',
}
}
}
/// A node of the parsed markup tree.
///
/// The XML element names mentioned below are the ones emitted by `Token::write`.
#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)]
pub enum Token {
/// Literal text; written as an XML text node.
PlainText(CompactString),
/// Ordered list of child tokens; children are written back to back.
Sequence(Vec<Token>),
/// Quoted content; written as `<quote>`.
Quote(Box<Token>),
/// Small text; written as `<small>`.
Small(Box<Token>),
/// Bold and italic content; written as `<b><i>…</i></b>`.
BoldItalic(Box<Token>),
/// Bold content; written as `<b>`.
Bold(Box<Token>),
/// Italic content; written as `<i>`.
Italic(Box<Token>),
/// Centered content; written as `<center>`.
Center(Box<Token>),
/// Struck-through content; written as `<s>`.
Strikethrough(Box<Token>),
/// Unparsed content of a `<plain>` tag; written as an XML text node.
PlainTag(String),
/// Inline code; written as `<inline-code>`.
InlineCode(String),
/// Inline math; written as `<inline-math>`.
InlineMath(String),
/// Bare URL; written as `<a href=…>` with the URL as its text.
UrlRaw(String),
/// URL written as `<a href=… embed="false">` with the URL as its text.
UrlNoEmbed(String),
/// Labelled link; written as `<a href=… embed=…>` wrapping the label.
Link {
label: Box<Token>,
href: String,
embed: bool,
},
/// Code block with an optional language; written as `<code lang=…>`.
BlockCode {
lang: Option<String>,
inner: String,
},
/// Math block; written as `<math>`.
BlockMath(String),
/// `$[name.params inner]` function; written as `<fn name=… arg-KEY=…>`.
/// A parameter without a value is stored as `None`.
Function {
name: String,
params: HashMap<String, Option<String>>,
inner: Box<Token>,
},
/// Mention; written as an empty `<mention name=… type=… host=…>` element.
Mention {
name: String,
host: Option<String>,
mention_type: MentionType,
},
/// Unicode emoji; written as `<ue>`.
UnicodeEmoji(String),
/// Shortcode emoji with an optional host; written as `<ee host=…>`.
ShortcodeEmoji {
shortcode: String,
host: Option<String>,
},
/// Hashtag; written as `<hashtag>`.
Hashtag(String),
}
impl Token {
/// Returns the text found at the left edge of this token, if any.
///
/// Containers defer to their first (or only) child, text-bearing leaves
/// yield their own string, and code, math and shortcode-emoji tokens yield
/// `None`.
fn str_content_left(&self) -> Option<&str> {
    match self {
        Token::PlainText(text) => Some(text.as_str()),
        Token::Sequence(items) => items.first().and_then(Token::str_content_left),
        Token::Quote(child)
        | Token::Small(child)
        | Token::BoldItalic(child)
        | Token::Bold(child)
        | Token::Italic(child)
        | Token::Center(child)
        | Token::Strikethrough(child)
        | Token::Link { label: child, .. }
        | Token::Function { inner: child, .. } => child.str_content_left(),
        Token::PlainTag(s)
        | Token::UrlRaw(s)
        | Token::UrlNoEmbed(s)
        | Token::Mention { name: s, .. }
        | Token::UnicodeEmoji(s)
        | Token::Hashtag(s) => Some(s.as_str()),
        Token::InlineCode(_)
        | Token::InlineMath(_)
        | Token::BlockCode { .. }
        | Token::BlockMath(_)
        | Token::ShortcodeEmoji { .. } => None,
    }
}
/// Returns the text found at the right edge of this token, if any.
///
/// Containers defer to their last (or only) child, text-bearing leaves
/// yield their own string, and code, math and shortcode-emoji tokens yield
/// `None`.
fn str_content_right(&self) -> Option<&str> {
    match self {
        Token::PlainText(text) => Some(text.as_str()),
        Token::Sequence(items) => items.last().and_then(Token::str_content_right),
        Token::Quote(child)
        | Token::Small(child)
        | Token::BoldItalic(child)
        | Token::Bold(child)
        | Token::Italic(child)
        | Token::Center(child)
        | Token::Strikethrough(child)
        | Token::Link { label: child, .. }
        | Token::Function { inner: child, .. } => child.str_content_right(),
        Token::PlainTag(s)
        | Token::UrlRaw(s)
        | Token::UrlNoEmbed(s)
        | Token::Mention { name: s, .. }
        | Token::UnicodeEmoji(s)
        | Token::Hashtag(s) => Some(s.as_str()),
        Token::InlineCode(_)
        | Token::InlineMath(_)
        | Token::BlockCode { .. }
        | Token::BlockMath(_)
        | Token::ShortcodeEmoji { .. } => None,
    }
}
/// Strips the formatting from this token.
///
/// Plain text and sequences are returned as clones, wrappers are replaced
/// by the stripped form of their content, and every string-carrying leaf
/// becomes a `PlainText` holding that string.
fn inner(&self) -> Token {
    match self {
        Token::PlainText(_) | Token::Sequence(_) => self.clone(),
        Token::Quote(child)
        | Token::Small(child)
        | Token::BoldItalic(child)
        | Token::Bold(child)
        | Token::Italic(child)
        | Token::Center(child)
        | Token::Strikethrough(child)
        | Token::Link { label: child, .. }
        | Token::Function { inner: child, .. } => child.inner(),
        Token::PlainTag(s)
        | Token::InlineCode(s)
        | Token::InlineMath(s)
        | Token::UrlRaw(s)
        | Token::UrlNoEmbed(s)
        | Token::BlockCode { inner: s, .. }
        | Token::BlockMath(s)
        | Token::Mention { name: s, .. }
        | Token::UnicodeEmoji(s)
        | Token::ShortcodeEmoji { shortcode: s, .. }
        | Token::Hashtag(s) => Token::PlainText(s.as_str().into()),
    }
}
/// Returns a normalized copy of this token tree.
///
/// Nested sequences are flattened into their parent, adjacent `PlainText`
/// tokens are concatenated, and a sequence that ends up with exactly one
/// element is replaced by that element. Wrapper tokens are rebuilt around
/// their normalized content; all other tokens are cloned unchanged.
fn merged(&self) -> Token {
    /// Appends `item` to `acc`, concatenating it onto a trailing `PlainText`
    /// when both are plain text.
    fn push_fused(acc: &mut Vec<Token>, item: Token) {
        if let (Some(Token::PlainText(tail)), Token::PlainText(text)) = (acc.last_mut(), &item) {
            *tail += text.as_str();
            return;
        }
        acc.push(item);
    }

    match self {
        Token::Sequence(children) => {
            let mut flat = Vec::new();
            for child in children {
                // A normalized child is either a flat sequence (no nested
                // sequences left inside) or a single non-sequence token.
                match child.merged() {
                    Token::Sequence(items) => {
                        for item in items {
                            push_fused(&mut flat, item);
                        }
                    }
                    single => push_fused(&mut flat, single),
                }
            }
            if flat.len() == 1 {
                flat.remove(0)
            } else {
                Token::Sequence(flat)
            }
        }
        Token::Quote(inner) => Token::Quote(Box::new(inner.merged())),
        Token::Small(inner) => Token::Small(Box::new(inner.merged())),
        Token::BoldItalic(inner) => Token::BoldItalic(Box::new(inner.merged())),
        Token::Bold(inner) => Token::Bold(Box::new(inner.merged())),
        Token::Italic(inner) => Token::Italic(Box::new(inner.merged())),
        Token::Center(inner) => Token::Center(Box::new(inner.merged())),
        Token::Strikethrough(inner) => Token::Strikethrough(Box::new(inner.merged())),
        Token::Link { label, href, embed } => Token::Link {
            label: Box::new(label.merged()),
            href: href.clone(),
            embed: *embed,
        },
        Token::Function {
            name,
            params,
            inner,
        } => Token::Function {
            name: name.clone(),
            params: params.clone(),
            inner: Box::new(inner.merged()),
        },
        leaf => leaf.clone(),
    }
}
/// Walks the token tree in pre-order and collects every `Some` value that
/// `func` produces into `out`.
///
/// `func` is applied to this token first, then to the children of sequences
/// and to the content of wrapper tokens (including link labels).
pub fn walk_map_collect<T>(&self, func: &impl Fn(&Token) -> Option<T>, out: &mut Vec<T>) {
    out.extend(func(self));
    match self {
        Token::Sequence(children) => {
            for child in children {
                child.walk_map_collect(func, out);
            }
        }
        Token::Quote(child)
        | Token::Small(child)
        | Token::BoldItalic(child)
        | Token::Bold(child)
        | Token::Italic(child)
        | Token::Center(child)
        | Token::Strikethrough(child)
        | Token::Function { inner: child, .. }
        | Token::Link { label: child, .. } => child.walk_map_collect(func, out),
        Token::PlainText(_)
        | Token::PlainTag(_)
        | Token::InlineCode(_)
        | Token::InlineMath(_)
        | Token::UrlRaw(_)
        | Token::UrlNoEmbed(_)
        | Token::BlockCode { .. }
        | Token::BlockMath(_)
        | Token::Mention { .. }
        | Token::UnicodeEmoji(_)
        | Token::ShortcodeEmoji { .. }
        | Token::Hashtag(_) => {}
    }
}
/// Applies `func` in place to every `PlainText` reachable through sequences
/// and the `Small`, `BoldItalic`, `Bold`, `Italic`, `Center`, `Function`
/// and `Strikethrough` wrappers.
///
/// Quotes, link labels and all other tokens are left untouched.
pub fn walk_speech_transform(&mut self, func: &impl Fn(&mut CompactString)) {
    match self {
        Token::PlainText(text) => func(text),
        Token::Sequence(children) => {
            for child in children.iter_mut() {
                child.walk_speech_transform(func);
            }
        }
        Token::Small(child)
        | Token::BoldItalic(child)
        | Token::Bold(child)
        | Token::Italic(child)
        | Token::Center(child)
        | Token::Strikethrough(child)
        | Token::Function { inner: child, .. } => child.walk_speech_transform(func),
        _ => {}
    }
}
/// Serializes this token, and recursively its children, as XML events on
/// `writer`.
///
/// Plain text and `<plain>` content become text nodes, sequences write their
/// children back to back, and every other variant becomes one element
/// (`BoldItalic` becomes `<b><i>…</i></b>`). Function parameters are written
/// as `arg-KEY` attributes sorted by key, so the output does not depend on
/// `HashMap` iteration order; a parameter without a value gets an empty
/// attribute value.
///
/// # Errors
///
/// Propagates any error returned by the underlying `quick_xml` writer.
fn write<T: Write>(&self, writer: &mut quick_xml::Writer<T>) -> quick_xml::Result<()> {
    match self {
        Token::PlainText(plain) => {
            writer.write_event(Event::Text(BytesText::new(plain.as_str())))?;
        }
        Token::Sequence(sequence) => {
            sequence.iter().try_for_each(|item| item.write(writer))?;
        }
        Token::Quote(inner) => {
            writer
                .create_element("quote")
                .write_inner_content(|w| inner.write(w))?;
        }
        Token::Small(inner) => {
            writer
                .create_element("small")
                .write_inner_content(|w| inner.write(w))?;
        }
        Token::BoldItalic(inner) => {
            writer
                .create_element("b")
                .write_inner_content::<_, quick_xml::Error>(|w| {
                    w.create_element("i")
                        .write_inner_content(|w| inner.write(w))?;
                    Ok(())
                })?;
        }
        Token::Bold(inner) => {
            writer
                .create_element("b")
                .write_inner_content(|w| inner.write(w))?;
        }
        Token::Italic(inner) => {
            writer
                .create_element("i")
                .write_inner_content(|w| inner.write(w))?;
        }
        Token::Center(inner) => {
            writer
                .create_element("center")
                .write_inner_content(|w| inner.write(w))?;
        }
        Token::Strikethrough(inner) => {
            writer
                .create_element("s")
                .write_inner_content(|w| inner.write(w))?;
        }
        Token::PlainTag(plain) => {
            writer.write_event(Event::Text(BytesText::new(plain.as_str())))?;
        }
        Token::InlineCode(code) => {
            writer
                .create_element("inline-code")
                .write_text_content(BytesText::new(code))?;
        }
        Token::InlineMath(math) => {
            writer
                .create_element("inline-math")
                .write_text_content(BytesText::new(math))?;
        }
        Token::UrlRaw(url) => {
            writer
                .create_element("a")
                .with_attribute(("href", url.as_str()))
                .write_text_content(BytesText::new(url))?;
        }
        Token::UrlNoEmbed(url) => {
            writer
                .create_element("a")
                .with_attribute(("href", url.as_str()))
                .with_attribute(("embed", "false"))
                .write_text_content(BytesText::new(url))?;
        }
        Token::Link { label, href, embed } => {
            writer
                .create_element("a")
                .with_attribute(("href", href.as_str()))
                .with_attribute(("embed", if *embed { "true" } else { "false" }))
                .write_inner_content(|w| label.write(w))?;
        }
        Token::BlockCode { inner, lang } => {
            let mut ew = writer.create_element("code");
            if let Some(language) = lang {
                ew = ew.with_attribute(("lang", language.as_str()));
            }
            ew.write_text_content(BytesText::new(inner))?;
        }
        Token::BlockMath(math) => {
            writer
                .create_element("math")
                .write_text_content(BytesText::new(math))?;
        }
        Token::Function {
            inner,
            name,
            params,
        } => {
            let mut ew = writer
                .create_element("fn")
                .with_attribute(("name", name.as_str()));
            // `HashMap` iteration order is arbitrary; sort by key so the
            // same token always serializes to the same XML.
            let mut sorted_params: Vec<_> = params.iter().collect();
            sorted_params.sort_unstable_by(|a, b| a.0.cmp(b.0));
            for (k, v) in sorted_params {
                ew = ew
                    .with_attribute((format!("arg-{k}").as_str(), v.as_deref().unwrap_or("")));
            }
            ew.write_inner_content(|w| inner.write(w))?;
        }
        Token::Mention {
            name,
            host,
            mention_type,
        } => {
            let mut ew = writer
                .create_element("mention")
                .with_attribute(("name", name.as_str()))
                .with_attribute(("type", mention_type.into()));
            if let Some(host) = host {
                ew = ew.with_attribute(("host", host.as_str()));
            }
            ew.write_empty()?;
        }
        Token::UnicodeEmoji(text) => {
            writer
                .create_element("ue")
                .write_text_content(BytesText::new(text))?;
        }
        Token::ShortcodeEmoji { shortcode, host } => {
            let mut ew = writer.create_element("ee");
            if let Some(host) = host {
                ew = ew.with_attribute(("host", host.as_str()));
            }
            ew.write_text_content(BytesText::new(shortcode))?;
        }
        Token::Hashtag(tag) => {
            writer
                .create_element("hashtag")
                .write_text_content(BytesText::new(tag.as_str()))?;
        }
    }
    Ok(())
}
}
/// Renders `token` as an XML string wrapped in a root `<mmm>` element.
///
/// # Errors
///
/// Returns an error if writing fails or the produced bytes are not valid
/// UTF-8.
pub fn to_xml_string(token: &Token) -> quick_xml::Result<String> {
    let sink = Cursor::new(Vec::new());
    let mut xml = quick_xml::Writer::new(sink);
    xml.create_element("mmm")
        .write_inner_content(|w| token.write(w))?;
    let bytes = xml.into_inner().into_inner();
    let rendered = String::from_utf8(bytes)?;
    Ok(rendered)
}
/// Extra data attached to every parser span.
#[derive(Debug, Default, Copy, Clone)]
pub struct SpanMeta {
    /// Depth value carried by the span (presumably the current nesting
    /// depth; the code that updates it is outside this view).
    depth: usize,
}

impl SpanMeta {
    /// Creates span metadata with the given depth.
    fn new(depth: usize) -> Self {
        SpanMeta { depth }
    }
}
/// Input type of all parsers in this file: a located string slice carrying [`SpanMeta`].
type Span<'a> = LocatedSpan<&'a str, SpanMeta>;
/// Helpers for taking the part of a span that precedes another span.
trait SliceOffset {
    /// Returns the part of `self` from its start up to the start of `other`.
    fn up_to(&self, other: &Self) -> Self;

    /// Returns the text of `self` from its start up to the start of `other`.
    fn fragment_between<'a>(&self, other: &Self) -> &'a str
    where
        Self: 'a;
}

impl SliceOffset for Span<'_> {
    fn up_to(&self, other: &Self) -> Self {
        // Byte distance from the start of `self` to the start of `other`.
        let distance = self.offset(other);
        self.slice(..distance)
    }

    fn fragment_between<'a>(&self, other: &Self) -> &'a str
    where
        Self: 'a,
    {
        let head = self.up_to(other);
        head.into_fragment()
    }
}
/// Adapts a constructor taking `Box<Token>` (such as `Token::Bold`) into a
/// function taking a plain `Token`.
#[inline]
fn boxing_token(func: impl Fn(Box<Token>) -> Token) -> impl Fn(Token) -> Token {
    move |token| {
        let boxed = Box::new(token);
        func(boxed)
    }
}
/// Builds a collector that gathers iterator items into a `Vec`, turns the
/// vector into a token with `func`, and post-processes it with `transform`.
#[inline]
fn collect_sequence<T>(
    func: impl Fn(Vec<T>) -> Token,
    transform: impl Fn(Token) -> Token,
) -> impl Fn(&mut dyn Iterator<Item = T>) -> Token {
    move |items| {
        let gathered: Vec<T> = items.collect();
        transform(func(gathered))
    }
}
/// Builds a collector that concatenates characters into a `String` and
/// turns it into a token with `func`.
#[inline]
fn collect_char_sequence(
    func: impl Fn(String) -> Token,
) -> impl Fn(&mut dyn Iterator<Item = char>) -> Token {
    move |chars| {
        let text: String = chars.collect();
        func(text)
    }
}
/// Recognizes one or more Unicode whitespace characters, stopping before
/// any line ending.
#[inline]
fn space1_unicode(input: Span) -> IResult<Span, Span> {
    let inline_whitespace = tuple((not(line_ending), satisfy(char::is_whitespace)));
    recognize(many1_count(inline_whitespace))(input)
}
/// Recognizes one or more Unicode alphanumeric characters.
#[inline]
fn alphanumeric1_unicode(input: Span) -> IResult<Span, Span> {
    let alphanumeric = satisfy(char::is_alphanumeric);
    recognize(many1_count(alphanumeric))(input)
}
/// Runs `func` over the text of `segments` joined with `"\n"` and tries to map
/// the resulting position (remaining input or error location) back onto
/// `parent`.
///
/// Returns the token produced by `func` together with a span derived from
/// `parent`. Errors from `func` are re-created on `parent`-based spans.
///
/// NOTE(review): the position mapping below looks unreliable — see the inline
/// notes on `cum_offset_combined` and `current_seg` — confirm before relying on
/// the returned span or on error locations.
fn spliced<'a>(
segments: &[Span<'a>],
func: impl Fn(Span) -> IResult<Span, Token>,
parent: Span<'a>,
) -> IResult<Span<'a>, Token, nom::error::Error<Span<'a>>> {
// Text of all segments, separated by single newlines.
let combined = segments
.iter()
.copied()
.map(Span::into_fragment)
.collect::<Vec<_>>()
.join("\n");
// Running total of the segment lengths.
// NOTE(review): the "\n" separators inserted by `join` above are not counted,
// so these totals drift from positions in `combined` after the first segment.
let cum_offset_combined = segments
.iter()
.scan(0, |acc, &x| {
*acc += x.len();
Some(*acc)
})
.collect::<Vec<_>>();
// Picks a segment (and its running total) for a position in `combined`.
// NOTE(review): the totals never decrease, so `take_while(o > position)` either
// keeps nothing (first total <= position, giving `None`) or keeps every entry
// (giving the LAST segment). It does not locate the segment containing the
// position.
let current_seg = |input: Span| {
cum_offset_combined
.iter()
.enumerate()
.take_while(|(_, &o)| o > input.location_offset())
.map(|(i, o)| (segments[i], o))
.last()
};
type NE<E> = nom::Err<E>;
type NomError<'x> = nom::error::Error<Span<'x>>;
// The combined span inherits the metadata of the first segment (depth 0 when
// there are no segments).
let spliced_span = Span::new_extra(
&combined,
segments.first().map_or(SpanMeta::new(0), |s| s.extra),
);
let (input, inner) = match func(spliced_span) {
Ok(s) => s,
Err(e) => {
return match e {
NE::Error(e) => {
let offset_new = e.input.location_offset();
if let Some((seg_parent, offset_seg_new)) = current_seg(e.input) {
// NOTE(review): `current_seg` only returns totals greater than the
// position, so this subtraction underflows (panic when overflow checks
// are enabled, wrap-around otherwise).
let offset = offset_new - offset_seg_new;
let offset_orig = offset + seg_parent.location_offset();
Err(NE::Error(NomError::new(
Span::new_extra(
&parent.into_fragment()[offset_orig..],
seg_parent.extra,
),
e.code,
)))
} else {
// ???
Err(NE::Failure(NomError::new(parent, ErrorKind::Fail)))
}
}
// A failure of the inner parser is downgraded to a recoverable error
// located at `parent`.
NE::Failure(e) => Err(NE::Error(NomError::new(parent, e.code))),
NE::Incomplete(i) => Err(NE::Incomplete(i)),
};
}
};
// Map the remaining input back onto `parent`; when no segment is found,
// `parent` is returned unchanged.
let out = if let Some((seg_parent, offset_seg_new)) = current_seg(input) {
// NOTE(review): same underflow concern as in the error path above.
let offset = input.location_offset() - offset_seg_new;
let offset_orig = offset + seg_parent.location_offset();
parent.slice(offset_orig..)
} else {
parent
};
Ok((out, inner))
}
/// Parses exactly one space character (U+0020), ideographic space (U+3000)
/// or tab, and returns it as plain text.
fn space(input: Span) -> IResult<Span, Token> {
    let single_space = alt((one_char('\u{0020}'), one_char('\u{3000}'), tab));
    let (rest, matched) = recognize(single_space)(input)?;
    let text = matched.into_fragment();
    Ok((rest, Token::PlainText(text.into())))
}
/// Pairs a parser for one piece of delimited content with a collector that
/// turns all parsed pieces into a single [`Token`].
///
/// `'a` is the lifetime of the borrowed closures, `'b` the lifetime of the
/// parsed input, and `T` the output type of the inner parser.
#[derive(Copy, Clone)]
struct Matcher<'a, 'b, T: Clone> {
// Parser applied repeatedly to the content between the delimiters.
matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a),
// Folds the outputs of `matcher_inner` into one token.
collector: &'a (dyn Fn(&mut dyn Iterator<Item = T>) -> Token + 'a),
_phantom_closure: PhantomData<&'a ()>,
_phantom_data: PhantomData<&'b ()>,
_phantom_output: PhantomData<fn() -> T>,
}
impl<'a, 'b, T: Clone> Matcher<'a, 'b, T> {
/// Creates a matcher from a borrowed parser and a borrowed collector.
fn new(
matcher_inner: &'a (dyn Fn(Span<'b>) -> IResult<Span<'b>, T> + 'a),
collector: &'a (dyn Fn(&mut dyn Iterator<Item = T>) -> Token + 'a),
) -> Self {
Self {
matcher_inner,
collector,
_phantom_closure: PhantomData,
_phantom_data: PhantomData,
_phantom_output: PhantomData,
}
}
}
impl<'a, 'b> Matcher<'a, 'b, Infallible> {
/// Creates a matcher whose parser is nom's `fail`. Its output type is
/// `Infallible`, and the collector panics via `unreachable!()` if it is
/// ever called.
// Don't break this invariant, else a monster will come at night and eat all your socks
fn reject() -> Self {
Self {
matcher_inner: &fail::<_, Infallible, _>,
collector: &|_| unreachable!(),
_phantom_closure: PhantomData,
_phantom_data: PhantomData,
_phantom_output: PhantomData,
}
}
}
/// How strictly `tag_delimited` validates the text around a delimiter.
#[derive(Copy, Clone, Debug)]
enum FlankingRule {
/// The delimited content must have text at the edge next to the delimiter,
/// and that text must not begin (opening) or end (closing) with whitespace.
Lenient,
/// Same checks as `Lenient`; additionally an opening delimiter directly after
/// alphanumeric characters is consumed as plain text, and a closing delimiter
/// must not be directly followed by an alphanumeric character.
Strict,
/// No flanking checks are performed.
DontCare,
}
/// A delimiter parser paired with the [`FlankingRule`] to apply to it.
///
/// Can be built from a `(parser, rule)` tuple, or from a bare parser, in
/// which case the rule is `FlankingRule::DontCare`.
struct FlankingDelim<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>>(
    T,
    FlankingRule,
    PhantomData<&'a ()>,
);

impl<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>> From<(T, FlankingRule)>
    for FlankingDelim<'a, T>
{
    fn from(pair: (T, FlankingRule)) -> Self {
        let (parser, rule) = pair;
        Self(parser, rule, PhantomData)
    }
}

impl<'a, T: Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>>> From<T> for FlankingDelim<'a, T> {
    fn from(parser: T) -> Self {
        Self(parser, FlankingRule::DontCare, PhantomData)
    }
}
/// Parser configuration and entry point for all parsing functions.
pub struct Context {
    /// Depth limit (presumably the maximum nesting depth enforced by
    /// `increase_nesting`, which is outside this view).
    depth_limit: usize,
}

/// Depth limit used by [`Context::default`].
const DEFAULT_DEPTH_LIMIT: usize = 24;

impl Default for Context {
    /// Creates a context using `DEFAULT_DEPTH_LIMIT`.
    fn default() -> Self {
        Self::new(DEFAULT_DEPTH_LIMIT)
    }
}
impl Context {
pub fn new(depth_limit: usize) -> Self {
Self { depth_limit }
}
/// Parses `input` with the full grammar and returns the normalized token
/// tree.
///
/// If the parser fails, the error is traced and its `Display` rendering is
/// returned as plain text.
pub fn parse_full(&self, input: &str) -> Token {
    let span = Span::new_extra(input, SpanMeta::default());
    let err = match self.full(span) {
        Ok((_, token)) => return token.merged(),
        Err(err) => err,
    };
    trace!(input = input, "Full parser fail: {:?}", err);
    Token::PlainText(err.to_compact_string())
}
/// Parses `input` with the inline grammar and returns the normalized token
/// tree.
///
/// This uses [`Context::inline`]; it previously called the full parser,
/// which also accepted block-level constructs.
///
/// If the parser fails, the error is traced and its `Display` rendering is
/// returned as plain text.
pub fn parse_inline(&self, input: &str) -> Token {
    match self.inline(Span::new_extra(input, SpanMeta::default())) {
        Ok((_, t)) => t.merged(),
        Err(e) => {
            trace!(input = input, "Inline parser fail: {:?}", e);
            Token::PlainText(e.to_compact_string())
        }
    }
}
/// Parses `input` with the UI grammar (Unicode emoji, shortcode emoji and
/// raw text) and returns the normalized token tree.
///
/// If the parser fails, the error is traced and its `Display` rendering is
/// returned as plain text.
pub fn parse_ui(&self, input: &str) -> Token {
    match self.inline_ui(Span::new_extra(input, SpanMeta::default())) {
        Ok((_, t)) => t.merged(),
        Err(e) => {
            // The message used to say "Inline parser fail", which made UI
            // failures indistinguishable from inline ones in the trace log.
            trace!(input = input, "UI parser fail: {:?}", e);
            Token::PlainText(e.to_compact_string())
        }
    }
}
/// Parses `input` with the profile-field grammar and returns the normalized
/// token tree.
///
/// If the parser fails, the error is traced and its `Display` rendering is
/// returned as plain text.
pub fn parse_profile_fields(&self, input: &str) -> Token {
    let span = Span::new_extra(input, SpanMeta::default());
    let err = match self.inline_profile_fields(span) {
        Ok((_, token)) => return token.merged(),
        Err(err) => err,
    };
    trace!(input = input, "Profile field parser fail: {:?}", err);
    Token::PlainText(err.to_compact_string())
}
/// Binds a token-producing parser method to this context, yielding a plain
/// `Span -> IResult` parser usable with nom combinators.
#[inline]
fn partial(
    &self,
    func: impl for<'a> Fn(&Self, Span<'a>) -> IResult<Span<'a>, Token> + 'static,
) -> impl for<'a> Fn(Span<'a>) -> IResult<Span<'a>, Token> + '_ {
    move |span| func(self, span)
}
/// Binds a span-producing parser method to this context, yielding a plain
/// `Span -> IResult` parser usable with nom combinators.
#[inline]
fn partial_span(
    &self,
    func: impl for<'a> Fn(&Self, Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'static,
) -> impl for<'a> Fn(Span<'a>) -> IResult<Span<'a>, Span<'a>> + '_ {
    move |span| func(self, span)
}
/// Parses the whole input with `full_single` until end of input and wraps
/// the tokens in a `Token::Sequence`.
pub fn full<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let (rest, (tokens, _)) = many_till(self.partial(Self::full_single), eof)(input)?;
    Ok((rest, Token::Sequence(tokens)))
}
/// Parses the whole input with `inline_single` until end of input and wraps
/// the tokens in a `Token::Sequence`.
pub fn inline<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let (rest, (tokens, _)) = many_till(self.partial(Self::inline_single), eof)(input)?;
    Ok((rest, Token::Sequence(tokens)))
}
/// Parses the whole input with `inline_label_safe_single` until end of
/// input and wraps the tokens in a `Token::Sequence`.
pub fn inline_label_safe<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let (rest, (tokens, _)) =
        many_till(self.partial(Self::inline_label_safe_single), eof)(input)?;
    Ok((rest, Token::Sequence(tokens)))
}
/// Parses the whole input using only Unicode emoji, mentions, hashtags, raw
/// URLs and raw text (tried in that order) and wraps the tokens in a
/// `Token::Sequence`.
fn inline_profile_fields<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let (rest, (tokens, _)) = many_till(
        alt((
            self.partial(Self::unicode_emoji),
            self.partial(Self::tag_mention),
            self.partial(Self::tag_hashtag),
            self.partial(Self::raw_url),
            self.partial(Self::tag_raw_text),
        )),
        eof,
    )(input)?;
    Ok((rest, Token::Sequence(tokens)))
}
/// Parses the whole input using only Unicode emoji, shortcode emoji and raw
/// text (tried in that order) and wraps the tokens in a `Token::Sequence`.
fn inline_ui<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let (rest, (tokens, _)) = many_till(
        alt((
            self.partial(Self::unicode_emoji),
            self.partial(Self::shortcode_emoji),
            self.partial(Self::tag_raw_text),
        )),
        eof,
    )(input)?;
    Ok((rest, Token::Sequence(tokens)))
}
/// Tries the asterisk and underscore emphasis parsers.
///
/// The order matters: the three-character bold-italic delimiters come
/// first, then the asterisk bold/italic forms, then the underscore ones.
fn base_bold_italic<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let (rest, token) = alt((
        self.partial(Self::tag_bold_italic_asterisk),
        self.partial(Self::tag_bold_italic_underscore),
        self.partial(Self::tag_bold_asterisk),
        self.partial(Self::tag_italic_asterisk),
        self.partial(Self::tag_bold_underscore),
        self.partial(Self::tag_italic_underscore),
    ))(input)?;
    Ok((rest, token))
}
fn full_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let (input, token) = alt((
self.increase_nesting(alt((
self.partial(Self::unicode_emoji),
self.partial(Self::tag_block_center),
self.partial(Self::tag_small),
self.partial(Self::tag_plain),
self.partial(Self::tag_bold),
self.partial(Self::tag_italic),
self.partial(Self::tag_strikethrough),
self.partial(Self::url_no_embed),
self.partial(Self::base_bold_italic),
self.partial(Self::tag_block_code),
self.partial(Self::tag_inline_code),
self.partial(Self::tag_quote),
self.partial(Self::tag_block_math),
self.partial(Self::tag_inline_math),
self.partial(Self::tag_strikethrough_tilde),
self.partial(Self::tag_func),
self.partial(Self::tag_mention),
self.partial(Self::tag_hashtag),
self.partial(Self::shortcode_emoji),
self.partial(Self::link),
self.partial(Self::raw_url),
))),
self.partial(Self::tag_raw_text),
))(input)?;
Ok((input, token))
}
/// Parses one token of the inline grammar (no block-level constructs).
///
/// All structured parsers run inside `increase_nesting`; if none of them
/// matches, the input is consumed by `tag_raw_text`.
fn inline_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let (rest, token) = alt((
        self.increase_nesting(alt((
            self.partial(Self::unicode_emoji),
            self.partial(Self::tag_small),
            self.partial(Self::tag_plain),
            self.partial(Self::tag_bold),
            self.partial(Self::tag_italic),
            self.partial(Self::tag_strikethrough),
            self.partial(Self::url_no_embed),
            self.partial(Self::base_bold_italic),
            self.partial(Self::tag_inline_code),
            self.partial(Self::tag_inline_math),
            self.partial(Self::tag_strikethrough_tilde),
            self.partial(Self::tag_func),
            self.partial(Self::tag_mention),
            self.partial(Self::tag_hashtag),
            self.partial(Self::shortcode_emoji),
            self.partial(Self::link),
            self.partial(Self::raw_url),
        ))),
        self.partial(Self::tag_raw_text),
    ))(input)?;
    Ok((rest, token))
}
/// Parses one inline token while skipping every text-formatting parser
/// (bold, italic, small, strikethrough, plain, links).
///
/// Used as the fallback content parser of the delimited formatting tags.
fn inline_non_formatting_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    alt((
        self.increase_nesting(alt((
            self.partial(Self::unicode_emoji),
            self.partial(Self::url_no_embed),
            self.partial(Self::tag_inline_code),
            self.partial(Self::tag_inline_math),
            self.partial(Self::tag_func),
            self.partial(Self::tag_mention),
            self.partial(Self::tag_hashtag),
            self.partial(Self::shortcode_emoji),
            self.partial(Self::raw_url),
        ))),
        self.partial(Self::tag_raw_text),
    ))(input)
}
/// Parses one inline token using only formatting, function and emoji
/// parsers; URLs, links, mentions, hashtags, code and math are not tried.
fn inline_label_safe_single<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    alt((
        self.increase_nesting(alt((
            self.partial(Self::unicode_emoji),
            self.partial(Self::tag_small),
            self.partial(Self::tag_plain),
            self.partial(Self::tag_bold),
            self.partial(Self::tag_italic),
            self.partial(Self::tag_strikethrough),
            self.partial(Self::base_bold_italic),
            self.partial(Self::tag_strikethrough_tilde),
            self.partial(Self::tag_func),
            self.partial(Self::shortcode_emoji),
        ))),
        self.partial(Self::tag_raw_text),
    ))(input)
}
/// Parses a block quote: one or more consecutive lines starting with `>`.
///
/// Up to two line endings before and after the quote are consumed. Without
/// a leading line ending the quote must start in column 1. A single quote
/// line with no text is rejected. The text of the lines is re-parsed with
/// the full grammar through `spliced`.
fn tag_quote<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let (input, (first_break, second_break)) =
        tuple((opt(line_ending), opt(line_ending)))(input)?;
    if first_break.is_none() && second_break.is_none() && input.get_column() != 1 {
        return fail(input);
    }
    let quote_line = |input| tuple((tag(">"), opt(space), not_line_ending))(input);
    let body_start = input;
    let (input, raw_lines) = separated_list1(line_ending, quote_line)(input)?;
    // Keep only the text after the `>` marker and the optional space.
    let quote_lines: Vec<_> = raw_lines.into_iter().map(|(_, _, text)| text).collect();
    if let [only_line] = quote_lines.as_slice() {
        if only_line.fragment().is_empty() {
            return fail(input);
        }
    }
    let (_, inner) = spliced(&quote_lines, self.partial(Self::full), body_start)?;
    let (input, _) = tuple((opt(line_ending), opt(line_ending)))(input)?;
    Ok((input, Token::Quote(Box::new(inner))))
}
/// Parses a `<center>…</center>` block.
///
/// An optional line ending before the opening tag is consumed, after which
/// the tag must start in column 1. The content is parsed with
/// `inline_single` until the closing tag, which may be preceded by spaces
/// and one line ending.
fn tag_block_center<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let (rest, _) = opt(line_ending)(input)?;
    if rest.get_column() != 1 {
        return fail(rest);
    }
    let (rest, _) = tag("<center>")(rest)?;
    let (rest, _) = opt(line_ending)(rest)?;
    let (rest, (children, _)) = many_till(
        self.partial(Self::inline_single),
        tuple((opt(space1), opt(line_ending), tag("</center>"))),
    )(rest)?;
    let centered = Token::Center(Box::new(Token::Sequence(children)));
    Ok((rest, centered))
}
/// Parses a fenced code block delimited by lines of three backticks.
///
/// An optional line ending before the opening fence is consumed, after
/// which the fence must start in column 1. Text following the opening
/// fence on the same line becomes the language. The closing fence must be
/// on its own line and may only be followed by whitespace; one trailing
/// line ending is consumed.
fn tag_block_code<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let fence = &tag("```");
    let (rest, _) = opt(line_ending)(input)?;
    if rest.get_column() != 1 {
        return fail(rest);
    }
    let (rest, _) = fence(rest)?;
    let (rest, lang) = opt(map(
        recognize(many1(tuple((not(fence), not(line_ending), anychar)))),
        Span::into_fragment,
    ))(rest)?;
    let (rest, _) = line_ending(rest)?;
    // The body runs up to (but not including) the line ending that precedes
    // the closing fence.
    let (rest, code) = map(
        recognize(many1_count(tuple((
            not(tuple((line_ending, fence))),
            anychar,
        )))),
        Span::into_fragment,
    )(rest)?;
    let (rest, _) = line_ending(rest)?;
    let (rest, _) = fence(rest)?;
    // Whitespace is tolerated after the closing fence...
    let (rest, _) = opt(space1_unicode)(rest)?;
    // ...but anything other than a line ending (or end of input) is not.
    let (rest, _) = not(tuple((not(line_ending), anychar)))(rest)?;
    let (rest, _) = opt(line_ending)(rest)?;
    let block = Token::BlockCode {
        lang: lang.map(String::from),
        inner: String::from(code),
    };
    Ok((rest, block))
}
/// Parses a math block delimited by `\[` and `\]`.
///
/// An optional line ending before the opening delimiter is consumed, after
/// which the delimiter must start in column 1. Whitespace and one line
/// ending directly inside the delimiters are dropped. The closing delimiter
/// may only be followed by whitespace; one trailing line ending is
/// consumed.
fn tag_block_math<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let open = &tag("\\[");
    let close = &tag("\\]");
    let (rest, _) = opt(line_ending)(input)?;
    if rest.get_column() != 1 {
        return fail(rest);
    }
    let (rest, _) = open(rest)?;
    let (rest, _) = opt(line_ending)(rest)?;
    let (rest, _) = opt(space1_unicode)(rest)?;
    let (rest, (chars, _)) = many_till(
        anychar,
        tuple((opt(space1_unicode), opt(line_ending), close)),
    )(rest)?;
    // Whitespace is tolerated after the closing delimiter...
    let (rest, _) = opt(space1_unicode)(rest)?;
    // ...but anything other than a line ending (or end of input) is not.
    let (rest, _) = not(tuple((not(line_ending), anychar)))(rest)?;
    let (rest, _) = opt(line_ending)(rest)?;
    let math: String = chars.into_iter().collect();
    Ok((rest, Token::BlockMath(math)))
}
/// Builds a parser for content enclosed between `opening_tag` and
/// `closing_tag`.
///
/// * `opening_tag` / `closing_tag` - delimiter parsers, optionally paired with
///   a [`FlankingRule`] (a bare parser means `FlankingRule::DontCare`).
/// * `escape` - when `true`, a backslash directly before the opening delimiter
///   makes the parser return the delimiter text as plain text.
/// * `matcher` - primary content parser and collector.
/// * `fallback` - content parser and collector tried when `matcher` fails with
///   a recoverable error.
///
/// On success the returned parser yields a `Token::Sequence`: either the single
/// collected token, or the delimiters as plain text around the content when the
/// fallback was used or the flanking rules were violated. When both content
/// parsers fail, everything up to the primary error position is returned as
/// plain text.
#[inline]
fn tag_delimited<'a, 'b: 'a, T: Clone, S: Clone, FOpen, FClose>(
&'a self,
opening_tag: impl Into<FlankingDelim<'b, FOpen>> + 'a,
closing_tag: impl Into<FlankingDelim<'b, FClose>> + 'a,
escape: bool,
matcher: Matcher<'a, 'b, T>,
fallback: Matcher<'a, 'b, S>,
) -> impl Fn(Span<'b>) -> IResult<Span<'b>, Token> + '_
where
FOpen: Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
FClose: Fn(Span<'b>) -> IResult<Span<'b>, Span<'b>> + 'a,
{
let FlankingDelim(opening_tag, opening_rule, ..) = opening_tag.into();
let FlankingDelim(closing_tag, closing_rule, ..) = closing_tag.into();
move |input| {
// Escaped opening delimiter: drop the backslash and emit the delimiter as text.
if escape {
if let Ok((input_escaped, (_, mark))) = tuple((tag("\\"), &opening_tag))(input) {
return Ok((
input_escaped,
Token::PlainText(mark.fragment().to_string().into()),
));
}
}
// Strict opening: alphanumerics immediately followed by the delimiter are
// consumed together as plain text, so the delimiter does not open anything.
if let FlankingRule::Strict = opening_rule {
let (input, pre) =
opt(recognize(tuple((alphanumeric1_unicode, &opening_tag))))(input)?;
if let Some(pre_text) = pre {
return Ok((input, Token::PlainText(pre_text.into_fragment().into())));
}
}
let begin = input;
let (post_open, _) = opening_tag(input)?;
// Primary attempt: one or more content items, then the closing delimiter.
let res = tuple((
many1(tuple((not(&closing_tag), &matcher.matcher_inner))),
&closing_tag,
))(post_open);
if let Err(nom::Err::Error(nom::error::Error {
input: input_past_err,
..
})) = res
{
// Recoverable failure: retry the same shape with the fallback matcher.
let res_fallback = tuple((
many1(tuple((not(&closing_tag), &fallback.matcher_inner))),
&closing_tag,
))(post_open);
if res_fallback.is_err() {
// Neither parser matched: emit everything from the opening delimiter up to
// the primary error position as plain text.
return Ok((
input_past_err,
Token::PlainText(begin.fragment_between(&input_past_err).into()),
));
}
let (input, (inner, closing)) = res_fallback.unwrap();
let mut inner = inner.into_iter().map(|(_, t)| t);
// Fallback content keeps both delimiters as literal text.
return Ok((
input,
Token::Sequence(vec![
Token::PlainText(begin.fragment_between(&post_open).into()),
((fallback.collector)(&mut inner)),
Token::PlainText(closing.into_fragment().into()),
]),
));
}
// Failure and Incomplete results of the primary attempt are propagated here.
let (input, (inner, closing)) = res?;
let mut inner = inner.into_iter().map(|(_, t)| t);
let inner_tok = (matcher.collector)(&mut inner);
// Left flanking: the content must have text at its left edge and that text
// must not start with whitespace.
let correct_left_flanking =
if let FlankingRule::Lenient | FlankingRule::Strict = opening_rule {
let text_left = inner_tok.str_content_left();
!(text_left.is_some_and(|s| s.starts_with(char::is_whitespace))
|| text_left.is_none())
} else {
true
};
// Right flanking: the content must have text at its right edge and that text
// must not end with whitespace.
let correct_right_flanking =
if let FlankingRule::Lenient | FlankingRule::Strict = closing_rule {
let text_right = inner_tok.str_content_right();
!(text_right.is_some_and(|s| s.ends_with(char::is_whitespace))
|| text_right.is_none())
} else {
true
};
// Strict closing: the closing delimiter must not be directly followed by an
// alphanumeric character (peeked, not consumed).
let (input, alphanum) = opt(peek(alphanumeric1_unicode))(input)?;
let correct_right_outer =
alphanum.is_none() || !matches!(closing_rule, FlankingRule::Strict);
let correct_flanking =
correct_left_flanking && correct_right_flanking && correct_right_outer;
if !correct_flanking {
// Flanking violated: keep the delimiters as text and strip the formatting
// from the content.
return Ok((
input,
Token::Sequence(vec![
Token::PlainText(begin.fragment_between(&post_open).into()),
inner_tok.inner(),
Token::PlainText(closing.into_fragment().into()),
]),
));
}
Ok((input, Token::Sequence(vec![inner_tok])))
}
}
fn tag_func<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let (input, _) = tag("$[")(input)?;
let func_ident = |input| {
recognize(tuple((
many1_count(alt((alpha1, tag("_")))),
many0_count(alt((alphanumeric1, tag("_")))),
)))(input)
};
let arg_value = recognize(many1_count(alt((
alphanumeric1,
tag("."),
tag("-"),
tag("_"),
))));
let (input, func_name) = map(func_ident, Span::into_fragment)(input)?;
let arg = tuple((func_ident, opt(tuple((tag("="), arg_value)))));
let (input, args) =
opt(tuple((one_char('.'), separated_list1(one_char(','), arg))))(input)?;
let args_out = args.map_or_else(HashMap::new, |(_, items)| {
items
.into_iter()
.map(|(k, v)| {
(
k.into_fragment().to_string(),
v.map(|(_, val)| val.into_fragment().to_string()),
)
})
.collect::<HashMap<_, _>>()
});
let (input, _) = opt(space)(input)?;
let (input, (inner, _)) = many_till(self.partial(Self::inline_single), tag("]"))(input)?;
Ok((
input,
Token::Function {
name: func_name.to_string(),
params: args_out,
inner: Box::new(Token::Sequence(inner)),
},
))
}
/// Parses `<plain>…</plain>` and returns the enclosed text verbatim.
///
/// The content must be non-empty and must not contain a line ending.
fn tag_plain<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let close = &tag("</plain>");
    let (rest, _) = tag("<plain>")(input)?;
    let (rest, text) = map(
        recognize(many1(tuple((not(line_ending), not(close), anychar)))),
        Span::into_fragment,
    )(rest)?;
    let (rest, _) = close(rest)?;
    Ok((rest, Token::PlainTag(String::from(text))))
}
/// Parses `<small>…</small>` (tags matched case-insensitively) into `Token::Small`.
///
/// No escaping and no flanking rules apply. If the content cannot be parsed
/// with `inline_single`, `tag_delimited` retries with
/// `inline_non_formatting_single` and keeps the tags as literal text.
fn tag_small<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
self.tag_delimited(
tag_no_case("<small>"),
tag_no_case("</small>"),
false,
Matcher::new(
&self.partial(Self::inline_single),
&collect_sequence(Token::Sequence, boxing_token(Token::Small)),
),
Matcher::new(
&self.partial(Self::inline_non_formatting_single),
&collect_sequence(Token::Sequence, identity),
),
)(input)
}
/// Parses `***…***` into `Token::BoldItalic`.
///
/// Both delimiters use `FlankingRule::Lenient`, and a backslash before the
/// opening `***` escapes it. If the content cannot be parsed with
/// `inline_single`, `tag_delimited` retries with
/// `inline_non_formatting_single` and keeps the delimiters as literal text.
fn tag_bold_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
self.tag_delimited(
(tag("***"), FlankingRule::Lenient),
(tag("***"), FlankingRule::Lenient),
true,
Matcher::new(
&self.partial(Self::inline_single),
&collect_sequence(Token::Sequence, boxing_token(Token::BoldItalic)),
),
Matcher::new(
&self.partial(Self::inline_non_formatting_single),
&collect_sequence(Token::Sequence, identity),
),
)(input)
}
/// Parses `___…___` into `Token::BoldItalic`.
///
/// Both delimiters use `FlankingRule::Strict`, and a backslash before the
/// opening `___` escapes it. If the content cannot be parsed with
/// `inline_single`, `tag_delimited` retries with
/// `inline_non_formatting_single` and keeps the delimiters as literal text.
fn tag_bold_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
self.tag_delimited(
(tag("___"), FlankingRule::Strict),
(tag("___"), FlankingRule::Strict),
true,
Matcher::new(
&self.partial(Self::inline_single),
&collect_sequence(Token::Sequence, boxing_token(Token::BoldItalic)),
),
Matcher::new(
&self.partial(Self::inline_non_formatting_single),
&collect_sequence(Token::Sequence, identity),
),
)(input)
}
/// Parses `<b>…</b>` (tags matched case-insensitively) into `Token::Bold`.
///
/// No escaping and no flanking rules apply. If the content cannot be parsed
/// with `inline_single`, `tag_delimited` retries with
/// `inline_non_formatting_single` and keeps the tags as literal text.
fn tag_bold<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
self.tag_delimited(
tag_no_case("<b>"),
tag_no_case("</b>"),
false,
Matcher::new(
&self.partial(Self::inline_single),
&collect_sequence(Token::Sequence, boxing_token(Token::Bold)),
),
Matcher::new(
&self.partial(Self::inline_non_formatting_single),
&collect_sequence(Token::Sequence, identity),
),
)(input)
}
/// Parses `**…**` into `Token::Bold`.
///
/// Both delimiters use `FlankingRule::Lenient`, and a backslash before the
/// opening `**` escapes it. If the content cannot be parsed with
/// `inline_single`, `tag_delimited` retries with
/// `inline_non_formatting_single` and keeps the delimiters as literal text.
fn tag_bold_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
self.tag_delimited(
(tag("**"), FlankingRule::Lenient),
(tag("**"), FlankingRule::Lenient),
true,
Matcher::new(
&self.partial(Self::inline_single),
&collect_sequence(Token::Sequence, boxing_token(Token::Bold)),
),
Matcher::new(
&self.partial(Self::inline_non_formatting_single),
&collect_sequence(Token::Sequence, identity),
),
)(input)
}
/// Parses a `__…__` span into `Token::Bold`.
///
/// Both delimiters use `FlankingRule::Strict`, and `true` is passed as the third
/// argument of `tag_delimited`. Contents are parsed with `inline_single` and boxed into
/// `Token::Bold`; the second matcher parses with `inline_non_formatting_single` and
/// leaves the sequence unwrapped (presumably the fallback path — confirm in
/// `tag_delimited`).
fn tag_bold_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let styled_item = self.partial(Self::inline_single);
    let styled_collect = collect_sequence(Token::Sequence, boxing_token(Token::Bold));
    let plain_item = self.partial(Self::inline_non_formatting_single);
    let plain_collect = collect_sequence(Token::Sequence, identity);

    // Bind the result first: the parser returned by `tag_delimited` borrows the
    // locals above and has to be dropped before they are.
    let parsed = self.tag_delimited(
        (tag("__"), FlankingRule::Strict),
        (tag("__"), FlankingRule::Strict),
        true,
        Matcher::new(&styled_item, &styled_collect),
        Matcher::new(&plain_item, &plain_collect),
    )(input);
    parsed
}
/// Parses an `<i>…</i>` span; both tags are matched case-insensitively.
///
/// Contents are parsed with `inline_single` and the collected sequence is boxed into
/// `Token::Italic`. The second matcher parses with `inline_non_formatting_single` and
/// leaves the sequence unwrapped (presumably the fallback path — confirm in
/// `tag_delimited`).
fn tag_italic<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let styled_item = self.partial(Self::inline_single);
    let styled_collect = collect_sequence(Token::Sequence, boxing_token(Token::Italic));
    let plain_item = self.partial(Self::inline_non_formatting_single);
    let plain_collect = collect_sequence(Token::Sequence, identity);

    // Bind the result first: the parser returned by `tag_delimited` borrows the
    // locals above and has to be dropped before they are.
    let parsed = self.tag_delimited(
        tag_no_case("<i>"),
        tag_no_case("</i>"),
        false,
        Matcher::new(&styled_item, &styled_collect),
        Matcher::new(&plain_item, &plain_collect),
    )(input);
    parsed
}
/// Parses a `*…*` span into `Token::Italic`.
///
/// Both delimiters use `FlankingRule::Lenient`, and `true` is passed as the third
/// argument of `tag_delimited`. Contents are parsed with `inline_single` and boxed into
/// `Token::Italic`; the second matcher parses with `inline_non_formatting_single` and
/// leaves the sequence unwrapped (presumably the fallback path — confirm in
/// `tag_delimited`).
fn tag_italic_asterisk<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let styled_item = self.partial(Self::inline_single);
    let styled_collect = collect_sequence(Token::Sequence, boxing_token(Token::Italic));
    let plain_item = self.partial(Self::inline_non_formatting_single);
    let plain_collect = collect_sequence(Token::Sequence, identity);

    // Bind the result first: the parser returned by `tag_delimited` borrows the
    // locals above and has to be dropped before they are.
    let parsed = self.tag_delimited(
        (tag("*"), FlankingRule::Lenient),
        (tag("*"), FlankingRule::Lenient),
        true,
        Matcher::new(&styled_item, &styled_collect),
        Matcher::new(&plain_item, &plain_collect),
    )(input);
    parsed
}
/// Parses a `_…_` span into `Token::Italic`.
///
/// Both delimiters use `FlankingRule::Strict`, and `true` is passed as the third
/// argument of `tag_delimited`. Contents are parsed with `inline_single` and boxed into
/// `Token::Italic`; the second matcher parses with `inline_non_formatting_single` and
/// leaves the sequence unwrapped (presumably the fallback path — confirm in
/// `tag_delimited`).
fn tag_italic_underscore<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let styled_item = self.partial(Self::inline_single);
    let styled_collect = collect_sequence(Token::Sequence, boxing_token(Token::Italic));
    let plain_item = self.partial(Self::inline_non_formatting_single);
    let plain_collect = collect_sequence(Token::Sequence, identity);

    // Bind the result first: the parser returned by `tag_delimited` borrows the
    // locals above and has to be dropped before they are.
    let parsed = self.tag_delimited(
        (tag("_"), FlankingRule::Strict),
        (tag("_"), FlankingRule::Strict),
        true,
        Matcher::new(&styled_item, &styled_collect),
        Matcher::new(&plain_item, &plain_collect),
    )(input);
    parsed
}
/// Parses an `<s>…</s>` span; both tags are matched case-insensitively.
///
/// Contents are parsed with `inline_single` and the collected sequence is boxed into
/// `Token::Strikethrough`. The second matcher parses with
/// `inline_non_formatting_single` and leaves the sequence unwrapped (presumably the
/// fallback path — confirm in `tag_delimited`).
fn tag_strikethrough<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let styled_item = self.partial(Self::inline_single);
    let styled_collect = collect_sequence(Token::Sequence, boxing_token(Token::Strikethrough));
    let plain_item = self.partial(Self::inline_non_formatting_single);
    let plain_collect = collect_sequence(Token::Sequence, identity);

    // Bind the result first: the parser returned by `tag_delimited` borrows the
    // locals above and has to be dropped before they are.
    let parsed = self.tag_delimited(
        tag_no_case("<s>"),
        tag_no_case("</s>"),
        false,
        Matcher::new(&styled_item, &styled_collect),
        Matcher::new(&plain_item, &plain_collect),
    )(input);
    parsed
}
/// Parses a `~~…~~` span into `Token::Strikethrough`.
///
/// Both delimiters use `FlankingRule::Lenient`, and `true` is passed as the third
/// argument of `tag_delimited`. Unlike the `<s>` form, both matchers refuse to step
/// over a line ending, so the span cannot cross lines. The second matcher parses with
/// `inline_non_formatting_single` and leaves the sequence unwrapped (presumably the
/// fallback path — confirm in `tag_delimited`).
fn tag_strikethrough_tilde<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let styled_collect = collect_sequence(Token::Sequence, boxing_token(Token::Strikethrough));
    let plain_collect = collect_sequence(Token::Sequence, identity);

    // Bind the result first: the parser returned by `tag_delimited` borrows the
    // locals above and has to be dropped before they are.
    let parsed = self.tag_delimited(
        (tag("~~"), FlankingRule::Lenient),
        (tag("~~"), FlankingRule::Lenient),
        true,
        Matcher::new(
            &move |input| {
                // Fail at a line ending without consuming anything.
                let (rest, ()) = not(line_ending)(input)?;
                self.partial(Self::inline_single)(rest)
            },
            &styled_collect,
        ),
        Matcher::new(
            &move |input| {
                // Fail at a line ending without consuming anything.
                let (rest, ()) = not(line_ending)(input)?;
                self.partial(Self::inline_non_formatting_single)(rest)
            },
            &plain_collect,
        ),
    )(input);
    parsed
}
fn tag_inline_code<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
self.tag_delimited(
tag("`"),
|input| alt((tag("`"), tag("´")))(input),
true,
Matcher::new(
&move |input| {
map(
tuple((not(alt((tag("`"), tag("´"), line_ending))), anychar)),
|(_, captured)| captured,
)(input)
},
&collect_char_sequence(Token::InlineCode),
),
Matcher::reject(),
)(input)
}
fn tag_inline_math<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
self.tag_delimited(
tag("\\("),
tag("\\)"),
false,
Matcher::new(
&move |input| {
map(tuple((not(line_ending), anychar)), |(_, captured)| captured)(input)
},
&collect_char_sequence(Token::InlineMath),
),
Matcher::reject(),
)(input)
}
fn tag_raw_text<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
let (input, text) = anychar(input)?;
Ok((input, Token::PlainText(text.to_compact_string())))
}
/// Parses a bare `http://` / `https://` URL into `Token::UrlRaw`.
///
/// The URL runs for as long as `url_chars_base` keeps matching (whitespace is not
/// allowed). A single trailing `.`, `,` or `?` is treated as sentence punctuation: it
/// is excluded from the URL and left in the remaining input, so that it is still
/// available to whatever parser runs next instead of being silently dropped.
fn raw_url<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    // Only the recognized span is needed here; the remaining input is re-derived
    // below from the length of the URL that is actually kept.
    let (_, url_span) = recognize(tuple((
        self.partial_span(Self::protocol),
        self.url_chars(
            |input| recognize(not(self.partial_span(Self::url_chars_base)))(input),
            false,
        ),
    )))(input)?;
    let url = url_span.into_fragment();

    // Strip punctuation at the end of sentences that might have been consumed as a part of the URL
    let final_url = url
        .strip_suffix(|c: char| matches!(c, '.' | ',' | '?'))
        .unwrap_or(url);

    // `final_url` is a prefix of `input`, so slicing by its byte length resumes
    // parsing right after the URL, at the stripped punctuation if there was any.
    Ok((
        input.slice(final_url.len()..),
        Token::UrlRaw(final_url.to_string()),
    ))
}
/// Parses `<http(s)://…>` into `Token::UrlNoEmbed`.
///
/// The URL must start with a protocol accepted by `protocol`; whitespace inside the
/// URL is permitted (`url_chars` is called with `spaces = true`) and `>` terminates it.
/// The angle brackets are consumed but are not part of the token.
fn url_no_embed<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let (rest, (_open, href, _close)) = tuple((
        tag("<"),
        recognize(tuple((
            self.partial_span(Self::protocol),
            self.url_chars(tag(">"), true),
        ))),
        tag(">"),
    ))(input)?;

    Ok((rest, Token::UrlNoEmbed(href.into_fragment().to_string())))
}
/// Parses a labelled link, `[label](url)` or `?[label](url)`, into `Token::Link`.
///
/// A leading `?` yields `embed: false`. The label may not start with a second `[` and
/// is parsed with `inline_label_safe_single` up to the `](` separator. The URL must
/// start with a protocol accepted by `protocol`, may contain whitespace, and ends at
/// the closing `)`.
fn link<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    // Optional `?`, then `[` which must not be followed by another `[`.
    let (input, (no_embed, _open, ())) =
        tuple((opt(tag("?")), tag("["), not(tag("["))))(input)?;

    let (input, (label_tokens, _separator)) =
        many_till(self.partial(Self::inline_label_safe_single), tag("]("))(input)?;

    let (input, (href, _close)) = tuple((
        recognize(tuple((
            self.partial_span(Self::protocol),
            self.url_chars(tag(")"), true),
        ))),
        tag(")"),
    ))(input)?;

    let token = Token::Link {
        label: Box::new(Token::Sequence(label_tokens)),
        href: href.into_fragment().into(),
        embed: no_embed.is_none(),
    };
    Ok((input, token))
}
/// Parses a single Unicode emoji at the start of the input into `Token::UnicodeEmoji`.
///
/// The candidate is the first extended grapheme cluster with any trailing ZWNJ
/// (U+200C) / ZWJ (U+200D) characters removed; it is accepted only if `emojis::get`
/// knows it. Only the bytes of the trimmed candidate are consumed.
fn unicode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let Some(cluster) = input.fragment().graphemes(true).next() else {
        return fail(input);
    };
    let candidate = cluster.trim_end_matches(|c: char| matches!(c, '\u{200c}' | '\u{200d}'));

    match emojis::get(candidate) {
        Some(_) => Ok((
            input.slice(candidate.len()..),
            Token::UnicodeEmoji(candidate.into()),
        )),
        None => fail(input),
    }
}
/// Parses a custom emoji shortcode, `:name:` or `:name@host:`, into
/// `Token::ShortcodeEmoji`.
///
/// If the shortcode directly follows alphanumeric characters, the whole run (word plus
/// shortcode) is returned as `Token::PlainText` instead. The closing `:` must not be
/// followed by an alphanumeric character.
fn shortcode_emoji<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    // Word characters glued to the front of a shortcode turn everything into text.
    let (after_word, glued) = opt(recognize(tuple((
        alphanumeric1_unicode,
        self.partial(Self::shortcode_emoji),
    ))))(input)?;
    if let Some(text) = glued {
        return Ok((after_word, Token::PlainText(text.into_fragment().into())));
    }

    let (input, _) = tag(":")(input)?;
    let (input, shortcode) = recognize(many1(alt((
        alphanumeric1_unicode,
        recognize(one_of("_+-")),
    ))))(input)?;
    // Optional `@host` part.
    let (input, host) = opt(tuple((
        tag("@"),
        recognize(many1(alt((alphanumeric1, recognize(one_of("-.")))))),
    )))(input)?;
    let (input, _) = tag(":")(input)?;
    let (input, _) = not(alphanumeric1_unicode)(input)?;

    let token = Token::ShortcodeEmoji {
        shortcode: shortcode.into_fragment().into(),
        host: host.map(|(_at, domain)| domain.into_fragment().to_string()),
    };
    Ok((input, token))
}
/// Parses a user (`@name`) or community (`!name`) mention with an optional host into
/// `Token::Mention`.
///
/// * A mention glued to a preceding backslash or alphanumeric run (for example an
///   e-mail address) is returned as `Token::PlainText` covering the whole run.
/// * The host is introduced by `@`; user mentions additionally accept `:`, which
///   promotes the mention to `MentionType::MatrixUser`.
/// * Trailing `.`, `-` and `_` are trimmed from the host and left in the remaining
///   input. If nothing is left after trimming, the mention has no host: the separator
///   is not consumed and no Matrix promotion takes place.
fn tag_mention<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let (after_glued, glued) = opt(recognize(tuple((
        alt((tag("\\"), alphanumeric1_unicode)),
        self.partial(Self::tag_mention),
    ))))(input)?;
    if let Some(text) = glued {
        return Ok((after_glued, Token::PlainText(text.into_fragment().into())));
    }

    let (input, sigil) = one_of("@!")(input)?;
    let mention_type = match sigil {
        '@' => MentionType::User,
        '!' => MentionType::Community,
        _ => unreachable!("`one_of(\"@!\")` only yields `@` or `!`"),
    };

    let (input, name) = map(
        recognize(many1(alt((alphanumeric1, recognize(one_of("-_")))))),
        Span::into_fragment,
    )(input)?;

    // The host is parsed speculatively; how much of it is actually consumed is decided
    // below, after trailing punctuation has been trimmed.
    let before = input;
    let separators = if matches!(mention_type, MentionType::User) {
        "@:"
    } else {
        "@"
    };
    let (_, host_opt) = opt(tuple((
        one_of(separators),
        map(
            recognize(many1(alt((alphanumeric1, recognize(one_of("-_.")))))),
            Span::into_fragment,
        ),
    )))(input)?;

    // Trailing punctuation belongs to the surrounding text. A host that consists of
    // nothing but such punctuation is not a host at all.
    let host_opt = host_opt
        .map(|(separator, raw)| {
            let trimmed = raw.trim_end_matches(|c: char| matches!(c, '.' | '-' | '_'));
            (separator, trimmed)
        })
        .filter(|(_, host)| !host.is_empty());

    // Promote tags with a colon separator to Matrix handles
    let mention_type = if matches!(host_opt, Some((':', _))) {
        MentionType::MatrixUser
    } else {
        mention_type
    };

    let host = host_opt.map(|(_, host)| host);
    // Consume the one-byte separator plus the kept part of the host.
    let input = host.map_or(before, |h| before.slice(h.len() + 1..));

    Ok((
        input,
        Token::Mention {
            mention_type,
            name: name.into(),
            host: host.map(|h| h.into()),
        },
    ))
}
/// Parses `#hashtag` into `Token::Hashtag`.
///
/// A `#` that directly follows alphanumeric characters is not a hashtag: in that case
/// the alphanumeric run together with the `#` is returned as `Token::PlainText`.
/// Otherwise the tag body is one or more `hashtag_chars` units after the `#`.
fn tag_hashtag<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Token> {
    let word_then_hash = recognize(tuple((alphanumeric1_unicode, tag("#"))));

    match opt(word_then_hash)(input)? {
        (rest, Some(glued)) => Ok((rest, Token::PlainText(glued.into_fragment().into()))),
        (rest, None) => {
            let (rest, _) = tag("#")(rest)?;
            let (rest, body) =
                recognize(many1(self.partial_span(Self::hashtag_chars)))(rest)?;
            Ok((rest, Token::Hashtag(body.into_fragment().into())))
        }
    }
}
/// Wraps `func` so that it runs one nesting level deeper.
///
/// The returned parser fails immediately when the input's `extra.depth` has already
/// reached `self.depth_limit`. Otherwise it increments the depth, runs `func`, and on
/// success decrements the depth again on the remaining input. On failure the error
/// from `func` is returned unchanged.
#[inline]
fn increase_nesting<'a, 'b, O, F>(
    &'b self,
    mut func: F,
) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, O> + 'b
where
    F: Parser<Span<'a>, O, nom::error::Error<Span<'a>>> + 'b,
{
    move |mut input| {
        if input.extra.depth >= self.depth_limit {
            return fail(input);
        }

        input.extra.depth += 1;
        let (mut rest, output) = func.parse(input)?;
        rest.extra.depth -= 1;
        Ok((rest, output))
    }
}
/// Recognizes one unit of hashtag text.
///
/// A unit is either
/// * a bracket pair — `(…)`, `[…]`, `「…」` or full-width `(…)` — wrapping exactly one
///   nested unit (parsed one nesting level deeper via `increase_nesting`), or
/// * a single character that is not whitespace, not a line ending and not one of the
///   excluded punctuation/bracket characters.
///
/// The two CJK bracket pairs must use their real delimiter characters: an empty
/// delimiter (`tag("")`) matches everywhere and would make those alternatives recurse
/// on every character until the depth limit is hit.
#[inline]
fn hashtag_chars<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
    recognize(alt((
        recognize(tuple((
            tag("("),
            self.increase_nesting(self.partial_span(Self::hashtag_chars)),
            tag(")"),
        ))),
        recognize(tuple((
            tag("["),
            self.increase_nesting(self.partial_span(Self::hashtag_chars)),
            tag("]"),
        ))),
        // Corner brackets (U+300C / U+300D).
        recognize(tuple((
            tag("「"),
            self.increase_nesting(self.partial_span(Self::hashtag_chars)),
            tag("」"),
        ))),
        // Full-width parentheses (U+FF08 / U+FF09).
        recognize(tuple((
            tag("("),
            self.increase_nesting(self.partial_span(Self::hashtag_chars)),
            tag(")"),
        ))),
        recognize(tuple((
            not(space1_unicode),
            not(line_ending),
            // ASCII punctuation and brackets plus their full-width / CJK counterparts.
            not(one_of(".,:;!?#?/[]【】()「」()<>")),
            anychar,
        ))),
    )))(input)
}
/// Recognizes a URL scheme prefix: exactly `https://` or `http://` (case-sensitive).
#[inline]
fn protocol<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
    let secure = tag("https://");
    let insecure = tag("http://");
    alt((secure, insecure))(input)
}
/// Recognizes one building block of a URL.
///
/// Alternatives, tried in order:
/// 1. a run matched by `alphanumeric1_unicode`,
/// 2. a `[`…`]` group whose contents are further building blocks,
/// 3. a `(`…`)` group whose contents are further building blocks,
/// 4. a single character out of `.,_/:%#$&?!~=+-@`.
///
/// Group contents are parsed one nesting level deeper via `increase_nesting`.
#[inline]
fn url_chars_base<'a>(&self, input: Span<'a>) -> IResult<Span<'a>, Span<'a>> {
    let square_group = recognize(tuple((
        tag("["),
        many_till(
            self.increase_nesting(self.partial_span(Self::url_chars_base)),
            tag("]"),
        ),
    )));
    let round_group = recognize(tuple((
        tag("("),
        many_till(
            self.increase_nesting(self.partial_span(Self::url_chars_base)),
            tag(")"),
        ),
    )));
    let punctuation = recognize(one_of(".,_/:%#$&?!~=+-@"));

    alt((
        alphanumeric1_unicode,
        square_group,
        round_group,
        punctuation,
    ))(input)
}
/// Builds a parser that recognizes the body of a URL: one or more URL building blocks
/// (see `url_chars_base`), optionally interleaved with whitespace.
///
/// * `terminator` — parser marking the end of the URL; the URL stops right before the
///   terminator, including any whitespace directly preceding it.
/// * `spaces` — when `true`, runs of spaces/tabs (`space1`) are accepted inside the
///   URL; when `false`, the whitespace alternative always fails.
///
/// The returned parser yields the recognized span and fails if not even one item
/// could be consumed.
#[inline]
fn url_chars<'a, 'b, F>(
&'b self,
mut terminator: F,
spaces: bool,
) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, Span<'a>> + 'b
where
F: Parser<Span<'a>, Span<'a>, nom::error::Error<Span<'a>>> + 'b,
{
move |input| {
recognize(many1_count(tuple((
// Stop before whitespace that runs up to the end of the input.
not(tuple((space1, eof))),
// Stop before whitespace that is followed by a double quote.
not(tuple((space1, tag("\"")))),
// Stop before the terminator, together with whitespace directly in front of it.
not(tuple((opt(space1), |input| terminator.parse(input)))),
// Otherwise consume one building block, or whitespace if `spaces` allows it.
alt((
|input| self.url_chars_base(input),
if spaces { space1 } else { fail },
)),
))))(input)
}
}
}
#[cfg(test)]
mod test {
use crate::{to_xml_string, Context, Span, SpanMeta, Token, DEFAULT_DEPTH_LIMIT};
use nom::bytes::complete::tag;
use std::collections::HashMap;
/// Test helper: parses `string` with a default `Context`, unwraps the result and
/// returns the token after `merged()` has been applied.
fn parse_full(string: &str) -> Token {
    let span = Span::new_extra(string, SpanMeta::default());
    let (_rest, token) = Context::default().full(span).unwrap();
    token.merged()
}
/// Empty input parses to an empty sequence.
#[test]
fn parse_empty() {
    assert_eq!(parse_full(""), Token::Sequence(vec![]));
}
/// `url_chars` with `)` as terminator: balanced parentheses stay inside the URL,
/// trailing whitespace is dropped, and inner whitespace is kept only when allowed.
#[test]
fn parse_url_chars() {
    let ctx = Context::default();

    // (input, allow spaces, expected recognized prefix)
    let cases = [
        (
            "https://en.wikipedia.org/wiki/Sandbox_(computer_security))",
            true,
            "https://en.wikipedia.org/wiki/Sandbox_(computer_security)",
        ),
        (
            "https://en.wikipedia.org/wiki/Sandbox_(computer_security)))",
            true,
            "https://en.wikipedia.org/wiki/Sandbox_(computer_security)",
        ),
        (
            "https://cs.wikipedia.org/wiki/Among_Us ",
            true,
            "https://cs.wikipedia.org/wiki/Among_Us",
        ),
        (
            "https://cs.wikipedia.org/wiki/Among Us )",
            true,
            "https://cs.wikipedia.org/wiki/Among Us",
        ),
        (
            "https://en.wikipedia.org/wiki/Among Us )",
            false,
            "https://en.wikipedia.org/wiki/Among",
        ),
    ];

    for (input, spaces, expected) in cases {
        let (_rest, recognized) = ctx.url_chars(tag(")"), spaces)(Span::new_extra(
            input,
            SpanMeta::default(),
        ))
        .unwrap();
        assert_eq!(recognized.into_fragment(), expected);
    }
}
/// Basic inline formatting: strikethrough, bold, italic, inline code, and the cases
/// where delimiters must be left as plain text.
#[test]
fn parse_formatting() {
    let plain = |text: &str| Token::PlainText(text.into());
    let boxed_plain = |text: &str| Box::new(Token::PlainText(text.into()));
    let code = |text: &str| Token::InlineCode(text.into());

    assert_eq!(
        parse_full(r#"~~stikethrough~~"#),
        Token::Strikethrough(boxed_plain("stikethrough")),
    );
    assert_eq!(parse_full(r#"**bold**"#), Token::Bold(boxed_plain("bold")));
    assert_eq!(parse_full(r#"*italic*"#), Token::Italic(boxed_plain("italic")));

    // Delimiters next to whitespace do not open or close a span.
    assert_eq!(parse_full(r#"* italic *"#), plain("* italic *"));
    assert_eq!(parse_full("snake_case_variable"), plain("snake_case_variable"));
    assert_eq!(
        parse_full("intra*word*italic"),
        Token::Sequence(vec![
            plain("intra"),
            Token::Italic(boxed_plain("word")),
            plain("italic"),
        ])
    );
    assert_eq!(parse_full(r#"_ italic *"#), plain("_ italic *"));
    assert_eq!(
        parse_full(r#"*"italic"*"#),
        Token::Italic(boxed_plain("\"italic\""))
    );

    // Inline code.
    assert_eq!(
        parse_full(r#"not code `code` also not code"#),
        Token::Sequence(vec![
            plain("not code "),
            code("code"),
            plain(" also not code"),
        ]),
    );
    assert_eq!(
        parse_full(r#"not code `code` also `not code"#),
        Token::Sequence(vec![
            plain("not code "),
            code("code"),
            plain(" also `not code"),
        ]),
    );
    assert_eq!(
        parse_full(r#"not code `*not bold*` also not code"#),
        Token::Sequence(vec![
            plain("not code "),
            code("*not bold*"),
            plain(" also not code"),
        ]),
    );

    // Combined and nested formatting.
    assert_eq!(
        parse_full(r#"***bold italic***"#),
        Token::BoldItalic(boxed_plain("bold italic"))
    );
    assert_eq!(
        parse_full(r#"<b><i>bold italic</i></b>"#),
        Token::Bold(Box::new(Token::Italic(boxed_plain("bold italic"))))
    );

    // Unterminated spans across a line break stay plain text.
    assert_eq!(parse_full("~~*hello\nworld*"), plain("~~*hello\nworld*"));
}
/// Math, code blocks, centered text, quotes, functions and improperly nested tags.
#[test]
fn parse_complex() {
    let plain = |text: &str| Token::PlainText(text.into());
    let emoji = |text: &str| Token::UnicodeEmoji(text.into());
    let user = |name: &str| Token::Mention {
        mention_type: crate::MentionType::User,
        name: name.into(),
        host: None,
    };

    // Inline math and inline code must stay on one line.
    assert_eq!(
        parse_full(r"\( nya^3 \)"),
        Token::InlineMath(" nya^3 ".to_string())
    );
    assert_eq!(
        parse_full("\\( nya^3 \n \\)"),
        plain("\\( nya^3 \n \\)")
    );
    assert_eq!(
        parse_full(r"`AbstractProxyFactoryBean`"),
        Token::InlineCode("AbstractProxyFactoryBean".to_string())
    );
    assert_eq!(
        parse_full("`let x = \n 5;`"),
        plain("`let x = \n 5;`")
    );

    // Block code and block math.
    assert_eq!(
        parse_full(
            r#"
```js
var x = undefined;
```"#
        ),
        Token::BlockCode {
            lang: Some("js".to_string()),
            inner: "var x = undefined;".to_string(),
        }
    );
    assert_eq!(
        parse_full(
            r"
\[
a^2 + b^2 = c^2
\]"
        ),
        Token::BlockMath("a^2 + b^2 = c^2".to_string())
    );
    assert_eq!(
        parse_full(r"\[ x^2 + y^2 = z^2 \]"),
        Token::BlockMath("x^2 + y^2 = z^2".to_string())
    );

    // Centered text, also inside a quote.
    assert_eq!(
        parse_full(
            r#"<center>centered
🦋🏳️‍⚧️
text</center>"#
        ),
        Token::Center(Box::new(Token::Sequence(vec![
            plain("centered\n"),
            emoji("🦋"),
            emoji("🏳️‍⚧️"),
            plain("\ntext"),
        ])))
    );
    assert_eq!(
        parse_full(
            r#"> <center>centered
> 👩🏽‍🤝‍👩🏼
> text</center>"#
        ),
        Token::Quote(Box::new(Token::Center(Box::new(Token::Sequence(vec![
            plain("centered\n"),
            emoji("👩🏽‍🤝‍👩🏼"),
            plain("\ntext"),
        ]))))),
    );

    // Nested functions with parameters.
    assert_eq!(
        parse_full(r#"$[x2 $[sparkle 🥺]💜$[spin.y,speed=5s ❤️]🦊]"#),
        Token::Function {
            name: "x2".into(),
            params: HashMap::new(),
            inner: Box::new(Token::Sequence(vec![
                Token::Function {
                    name: "sparkle".into(),
                    params: HashMap::new(),
                    inner: Box::new(emoji("🥺")),
                },
                emoji("💜"),
                Token::Function {
                    name: "spin".into(),
                    params: HashMap::from([
                        ("y".into(), None),
                        ("speed".into(), Some("5s".into())),
                    ]),
                    inner: Box::new(emoji("❤️")),
                },
                emoji("🦊"),
            ]))
        },
    );

    // Improperly nested tags are left as text; mentions inside are still parsed.
    assert_eq!(
        parse_full(r#"<b>bold @tag1 <i> @tag2 </b>italic</i>"#),
        Token::Sequence(vec![
            plain("<b>bold "),
            user("tag1"),
            plain(" <i> "),
            user("tag2"),
            plain(" </b>italic</i>"),
        ]),
    );

    // Multi-line quote with formatting and a nested quote.
    assert_eq!(
        parse_full(
            r#"
> test
> <i>
> italic
> </i>
>> Nested quote
"#
        ),
        Token::Quote(Box::new(Token::Sequence(vec![
            plain("test\n"),
            Token::Italic(Box::new(plain("\nitalic\n"))),
            Token::Quote(Box::new(plain("Nested quote"))),
        ]))),
    );
}
/// Raw URLs, no-embed URLs, labelled links and hashtags.
#[test]
fn parse_link() {
    let plain = |text: &str| Token::PlainText(text.into());
    let no_embed = |url: &str| Token::UrlNoEmbed(url.into());
    let raw = |url: &str| Token::UrlRaw(url.into());
    let link = |label: &str, href: &str, embed: bool| Token::Link {
        label: Box::new(Token::PlainText(label.into())),
        href: href.into(),
        embed,
    };

    assert_eq!(
        parse_full("IPv4 test: <https://0>"),
        Token::Sequence(vec![plain("IPv4 test: "), no_embed("https://0")])
    );
    assert_eq!(
        parse_full("IPv4 test: <https://127.0.0.1>"),
        Token::Sequence(vec![plain("IPv4 test: "), no_embed("https://127.0.0.1")])
    );
    assert_eq!(
        parse_full("IPv6 test: <https://[::2f:1]/nya>"),
        Token::Sequence(vec![
            plain("IPv6 test: "),
            no_embed("https://[::2f:1]/nya"),
        ])
    );
    assert_eq!(
        parse_full("IPv6 test: https://[::2f:1]/nya"),
        Token::Sequence(vec![plain("IPv6 test: "), raw("https://[::2f:1]/nya")])
    );
    // IDNs
    assert_eq!(
        parse_full("IDN test: https://www.háčkyčárky.cz/"),
        Token::Sequence(vec![
            plain("IDN test: "),
            raw("https://www.háčkyčárky.cz/"),
        ])
    );
    assert_eq!(
        parse_full("Link test: [label](https://example.com)"),
        Token::Sequence(vec![
            plain("Link test: "),
            link("label", "https://example.com", true),
        ])
    );
    assert_eq!(
        parse_full("test #hashtag tail"),
        Token::Sequence(vec![
            plain("test "),
            Token::Hashtag("hashtag".into()),
            plain(" tail"),
        ])
    );
    assert_eq!(parse_full("not#hashtag tail"), plain("not#hashtag tail"));
    assert_eq!(
        parse_full("<https://example.com>"),
        no_embed("https://example.com")
    );
    // Adjacent links okay
    assert_eq!(
        parse_full("<https://example.com/><https://awawa.gay/>"),
        Token::Sequence(vec![
            no_embed("https://example.com/"),
            no_embed("https://awawa.gay/"),
        ])
    );
    assert_eq!(
        parse_full("Link test: ?[label](https://awawa.gay)"),
        Token::Sequence(vec![
            plain("Link test: "),
            link("label", "https://awawa.gay", false),
        ])
    );
    assert_eq!(
        parse_full("Link test: ?[label](https://awawa.gay)test"),
        Token::Sequence(vec![
            plain("Link test: "),
            link("label", "https://awawa.gay", false),
            plain("test"),
        ])
    );
    assert_eq!(
        parse_full("Link test: (?[label](https://awawa.gay))"),
        Token::Sequence(vec![
            plain("Link test: ("),
            link("label", "https://awawa.gay", false),
            plain(")"),
        ])
    );
    // Missing closing bracket
    assert_eq!(
        parse_full("Link test: ?[label](https://awawa.gay"),
        Token::Sequence(vec![
            plain("Link test: ?[label]("),
            raw("https://awawa.gay"),
        ])
    );
}
/// With `DEFAULT_DEPTH_LIMIT` nested `<b>` tags, the innermost formatting is no longer
/// parsed and stays plain text.
#[test]
fn limit_nesting() {
    let innermost = " <s><i>test</i></s> ";

    let expected = (0..DEFAULT_DEPTH_LIMIT).fold(
        Token::PlainText(innermost.into()),
        |wrapped, _| Token::Bold(Box::new(wrapped)),
    );
    let input = format!(
        "{}{}{}",
        "<b>".repeat(DEFAULT_DEPTH_LIMIT),
        innermost,
        "</b>".repeat(DEFAULT_DEPTH_LIMIT)
    );

    assert_eq!(parse_full(&input), expected);
}
/// User, community and Matrix mentions, with and without hosts and trailing
/// punctuation.
#[test]
fn parse_mention() {
    let user = crate::MentionType::User;
    let plain = |text: &str| Token::PlainText(text.into());
    let mention = |mention_type: crate::MentionType, name: &str, host: Option<&str>| {
        Token::Mention {
            mention_type,
            name: name.into(),
            host: host.map(Into::into),
        }
    };

    assert_eq!(parse_full("@tag"), mention(user, "tag", None));
    assert_eq!(
        parse_full("email@notactuallyamenmtion.org"),
        plain("email@notactuallyamenmtion.org")
    );
    assert_eq!(
        parse_full("hgsjlkdsa @tag fgahjsdkd"),
        Token::Sequence(vec![
            plain("hgsjlkdsa "),
            mention(user, "tag", None),
            plain(" fgahjsdkd"),
        ])
    );
    assert_eq!(
        parse_full("hgsjlkdsa @tag@ fgahjsdkd"),
        Token::Sequence(vec![
            plain("hgsjlkdsa "),
            mention(user, "tag", None),
            plain("@ fgahjsdkd"),
        ])
    );
    assert_eq!(
        parse_full("aaaa @tag@domain bbbbb"),
        Token::Sequence(vec![
            plain("aaaa "),
            mention(user, "tag", Some("domain")),
            plain(" bbbbb"),
        ])
    );
    assert_eq!(
        parse_full("test @tag@domain, test"),
        Token::Sequence(vec![
            plain("test "),
            mention(user, "tag", Some("domain")),
            plain(", test"),
        ])
    );
    assert_eq!(
        parse_full("test @tag@domain.gay. test"),
        Token::Sequence(vec![
            plain("test "),
            mention(user, "tag", Some("domain.gay")),
            plain(". test"),
        ])
    );
    assert_eq!(
        parse_full("test @tag@domain? test"),
        Token::Sequence(vec![
            plain("test "),
            mention(user, "tag", Some("domain")),
            plain("? test"),
        ])
    );
    assert_eq!(
        parse_full("test !tag@domain.com test"),
        Token::Sequence(vec![
            plain("test "),
            mention(crate::MentionType::Community, "tag", Some("domain.com")),
            plain(" test"),
        ])
    );
    assert_eq!(
        parse_full("@tag:domain.com"),
        mention(crate::MentionType::MatrixUser, "tag", Some("domain.com")),
    );
}
/// Shortcode emoji: standalone, adjacent, with host, and glued to text.
#[test]
fn parse_shortcodes() {
    let shortcode = |name: &str, host: Option<&str>| Token::ShortcodeEmoji {
        shortcode: name.into(),
        host: host.map(Into::into),
    };

    assert_eq!(parse_full(":bottom:"), shortcode("bottom", None));
    assert_eq!(
        parse_full(":bottom::blobfox:"),
        Token::Sequence(vec![shortcode("bottom", None), shortcode("blobfox", None)])
    );
    assert_eq!(
        parse_full(":bottom@magnetar.social:"),
        shortcode("bottom", Some("magnetar.social"))
    );

    // Shortcodes touching alphanumeric text on either side are plain text.
    assert_eq!(
        parse_full(":bottom:blobfox"),
        Token::PlainText(":bottom:blobfox".into())
    );
    assert_eq!(
        parse_full("bottom:blobfox:"),
        Token::PlainText("bottom:blobfox:".into())
    );
}
/// Unicode emoji, including ZWJ sequences and stray ZWJ / ZWNJ characters.
#[test]
fn parse_emoji() {
    let emoji = |text: &str| Token::UnicodeEmoji(text.into());
    let plain = |text: &str| Token::PlainText(text.into());

    assert_eq!(
        parse_full("🥺💜❤️🦊"),
        Token::Sequence(vec![emoji("🥺"), emoji("💜"), emoji("❤️"), emoji("🦊")])
    );

    // Trans flag, ZWJ
    assert_eq!(
        parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}"),
        emoji("\u{1f3f3}\u{0fe0f}\u{0200d}\u{026a7}\u{0fe0f}")
    );
    assert_eq!(
        parse_full("\u{0200d}\u{1f3f3}\u{0fe0f}"),
        Token::Sequence(vec![
            plain("\u{0200d}"),           // ZWJ
            emoji("\u{1f3f3}\u{0fe0f}"), // White flag
        ])
    );

    // Trans flag, ZWNJ
    assert_eq!(
        parse_full("\u{1f3f3}\u{0fe0f}\u{0200c}\u{026a7}\u{0fe0f}"),
        Token::Sequence(vec![
            emoji("\u{1f3f3}\u{0fe0f}"), // White flag
            plain("\u{0200c}"),           // ZWNJ
            emoji("\u{026a7}\u{0fe0f}"), // Trans symbol
        ])
    );
    assert_eq!(
        parse_full("\u{1f3f3}\u{0fe0f}\u{0200d}\u{0200d}\u{0200d}"),
        Token::Sequence(vec![
            emoji("\u{1f3f3}\u{0fe0f}"),             // White flag
            plain("\u{0200d}\u{0200d}\u{0200d}"),     // ZWJ
        ])
    );
}
/// Round trip from MMM text to its XML serialization.
#[test]
fn xml_serialization() {
    let xml = |source: &str| to_xml_string(&parse_full(source)).unwrap();

    assert_eq!(
        xml("***nyaaa***"),
        r#"<mmm><b><i>nyaaa</i></b></mmm>"#
    );
    assert_eq!(
        xml("@natty $[spin.speed=0.5s 🥺]:cat_attack: <plain>test</plain>"),
        r#"<mmm><mention name="natty" type="user"/> <fn name="spin" arg-speed="0.5s"><ue>🥺</ue></fn><ee>cat_attack</ee> test</mmm>"#
    );
    assert_eq!(
        xml(
            "Ring Galaxy AM 0644 741 from Hubble\nCredits: AURA, STScI, J. Higdon, Cornell, ESA, #NASA\n#nature #space #astrophotography"
        ),
        r#"<mmm>Ring Galaxy AM 0644 741 from Hubble
Credits: AURA, STScI, J. Higdon, Cornell, ESA, <hashtag>NASA</hashtag>
<hashtag>nature</hashtag> <hashtag>space</hashtag> <hashtag>astrophotography</hashtag></mmm>"#
    );
    assert_eq!(
        xml(
            r#"
```js
var x = undefined;
``` "#
        ),
        "<mmm><code lang=\"js\">var x = undefined;</code></mmm>"
    );
}
}