| ` followed by
another ` `) and `
|
|?h\d>|$)",
re.MULTILINE,
)
extra_newlines_regex = re.compile(r"\n(\n*)")
def __init__(self) -> None:
self._sanitizer = Sanitizer(self.sanitize_settings())
self._inline_sanitizer = Sanitizer(self.sanitize_settings(inline=True))
# The whitespace remover doesn't take into account
sanitizer.normalize_overall_whitespace = lambda html, *args, **kw: html
sanitizer.normalize_whitespace_in_text_or_tail = \
lambda el, *args, **kw: el
# hard_wrap: convert all \n to
without required two spaces
# escape: escape HTML characters in the input string, e.g. tags
self._markdown_to_html = mistune.Markdown(
hard_wrap=True, escape=True, inline=MarkdownInlineLexer,
)
self._markdown_to_html.block.default_rules = [
rule for rule in self._markdown_to_html.block.default_rules
if rule != "block_quote"
]
def from_markdown(self, text: str, outgoing: bool = False) -> str:
"""Return filtered HTML from Markdown text."""
return self.filter(self._markdown_to_html(text), outgoing)
def from_markdown_inline(self, text: str, outgoing: bool = False) -> str:
"""Return single-line filtered HTML from Markdown text."""
return self.filter_inline(self._markdown_to_html(text), outgoing)
def filter_inline(self, html: str, outgoing: bool = False) -> str:
"""Filter and return HTML with block tags stripped or substituted."""
html = self._inline_sanitizer.sanitize(html)
if outgoing:
return html
# Client-side modifications
return self.inline_quote_regex.sub(
r'\1\2', html,
)
def filter(self, html: str, outgoing: bool = False) -> str:
"""Filter and return HTML."""
html = self._sanitizer.sanitize(html).rstrip("\n")
if outgoing:
return html
return self.quote_regex.sub(r'\1\2\3', html)
def sanitize_settings(self, inline: bool = False) -> dict:
"""Return an html_sanitizer configuration."""
# https://matrix.org/docs/spec/client_server/latest#m-room-message-msgtypes
# TODO: mx-reply and the new hidden thing
inline_tags = self.inline_tags
all_tags = inline_tags | self.block_tags
inlines_attributes = {
"font": {"color"},
"a": {"href"},
"code": {"class"},
}
attributes = {**inlines_attributes, **{
"ol": {"start"},
"hr": {"width"},
}}
return {
"tags": inline_tags if inline else all_tags,
"attributes": inlines_attributes if inline else attributes,
"empty": {} if inline else {"hr", "br"},
"separate": {"a"} if inline else {
"a", "p", "li", "table", "tr", "th", "td", "br", "hr",
},
"whitespace": {},
"keep_typographic_whitespace": True,
"add_nofollow": False,
"autolink": {
"link_regexes": self.link_regexes,
"avoid_hosts": [],
},
"sanitize_href": lambda href: href,
"element_preprocessors": [
sanitizer.bold_span_to_strong,
sanitizer.italic_span_to_em,
sanitizer.tag_replacer("strong", "b"),
sanitizer.tag_replacer("em", "i"),
sanitizer.tag_replacer("strike", "s"),
sanitizer.tag_replacer("del", "s"),
sanitizer.tag_replacer("form", "p"),
sanitizer.tag_replacer("div", "p"),
sanitizer.tag_replacer("caption", "p"),
sanitizer.target_blank_noopener,
self._process_span_font,
self._img_to_a,
self._remove_extra_newlines,
self._newlines_to_return_symbol if inline else lambda el: el,
],
"element_postprocessors": [],
"is_mergeable": lambda e1, e2: e1.attrib == e2.attrib,
}
@staticmethod
def _process_span_font(el: HtmlElement) -> HtmlElement:
"""Convert HTML ``."""
if el.tag not in ("span", "font"):
return el
color = el.attrib.pop("data-mx-color", None)
if color:
el.tag = "font"
el.attrib["color"] = color
return el
@staticmethod
def _img_to_a(el: HtmlElement) -> HtmlElement:
"""Linkify images by wrapping `` tags in ``."""
if el.tag == "img":
el.tag = "a"
el.attrib["href"] = el.attrib.pop("src", "")
el.text = el.attrib.pop("alt", None) or el.attrib["href"]
return el
def _remove_extra_newlines(self, el: HtmlElement) -> HtmlElement:
"""Remove excess `\\n` characters from non-`` HTML elements.
This is done to avoid additional blank lines when the CSS directive
`white-space: pre` is used.
"""
pre_parent = any(parent.tag == "pre" for parent in el.iterancestors())
if el.tag != "pre" and not pre_parent:
if el.text:
el.text = self.extra_newlines_regex.sub(r"\1", el.text)
if el.tail:
el.tail = self.extra_newlines_regex.sub(r"\1", el.tail)
return el
def _newlines_to_return_symbol(self, el: HtmlElement) -> HtmlElement:
"""Turn newlines into unicode return symbols (⏎, U+23CE).
The symbol is added to blocks with siblings (e.g. a `
` tags.
The `
` themselves will be removed by the inline sanitizer.
"""
is_block_with_siblings = (el.tag in self.block_tags and
next(el.itersiblings(), None) is not None)
if el.tag == "br" or is_block_with_siblings:
el.tail = f" ⏎ {el.tail or ''}"
# Replace left \n in text/tail of content by the return symbol.
if el.text:
el.text = re.sub(r"\n", r" ⏎ ", el.text)
if el.tail:
el.tail = re.sub(r"\n", r" ⏎ ", el.tail)
return el
HTML_PROCESSOR = HTMLProcessor()