moment/src/backend/html_markdown.py

527 lines
17 KiB
Python
Raw Normal View History

2019-12-19 22:46:16 +11:00
# SPDX-License-Identifier: LGPL-3.0-or-later
"""HTML and Markdown processing tools."""
import re
from typing import Dict, List, Optional, Tuple
from urllib.parse import unquote
import html_sanitizer.sanitizer as sanitizer
2020-03-23 14:55:48 +11:00
import lxml.html # nosec
import mistune
from html_sanitizer.sanitizer import Sanitizer
from lxml.html import HtmlElement, etree # nosec
import nio
from .svg_colors import SVG_COLORS
class MarkdownInlineGrammar(mistune.InlineGrammar):
"""Markdown inline elements syntax modifications for the Mistune parser.
Modifications:
- Disable underscores for bold/italics (e.g. `__bold__`)
- Add syntax for coloring text: `<color>(text)`,
e.g. `<red>(Lorem ipsum)` or `<#000040>(sit dolor amet...)`
"""
escape = re.compile(r"^\\([\\`*{}\[\]()#+\-.!_<>~|])") # Add <
emphasis = re.compile(r"^\*((?:\*\*|[^\*])+?)\*(?!\*)")
double_emphasis = re.compile(r"^\*{2}([\s\S]+?)\*{2}(?!\*)")
# test string: r"<b>(x) <r>(x) \<a>b>(x) <a\>b>(x) <b>(\(z) <c>(foo\)xyz)"
color = re.compile(
r"^<(.+?)>" # capture the color in `<color>`
r"\((.+?)" # capture text in `(text`
r"(?<!\\)(?:\\\\)*" # ignore the next `)` if it's \escaped
r"\)", # finish on a `)`
)
class MarkdownInlineLexer(mistune.InlineLexer):
"""Apply the changes from `MarkdownInlineGrammar` for Mistune."""
grammar_class = MarkdownInlineGrammar
default_rules = [
"escape", "color", "autolink", "url", # Add color
"footnote", "link", "reflink", "nolink",
"double_emphasis", "emphasis", "code",
"linebreak", "strikethrough", "text",
]
inline_html_rules = [
"escape", "color", "autolink", "url", "link", "reflink", # Add color
"nolink", "double_emphasis", "emphasis", "code",
"linebreak", "strikethrough", "text",
]
def output_double_emphasis(self, m):
return self.renderer.double_emphasis(self.output(m.group(1)))
def output_emphasis(self, m):
return self.renderer.emphasis(self.output(m.group(1)))
def output_color(self, m):
color = m.group(1)
text = self.output(m.group(2))
return self.renderer.color(color, text)
class MarkdownRenderer(mistune.Renderer):
def color(self, color: str, text: str):
"""Render given text with a color using `<span data-mx-color=#hex>`."""
# This may be a color name, try to get a #hex code for it.
color = SVG_COLORS.get(re.sub(r"\s", "", color.lower()), color)
return f'<span data-mx-color="{color}">{text}</span>'
class HTMLProcessor:
"""Provide HTML filtering and conversion from Markdown.
Filtering sanitizes HTML and ensures it complies both with the Matrix
specification:
https://matrix.org/docs/spec/client_server/latest#m-room-message-msgtypes
and the supported Qt HTML subset for usage in QML:
https://doc.qt.io/qt-5/richtext-html-subset.html
Some methods take an `outgoing` argument, specifying if the HTML is
intended to be sent to matrix servers or used locally in our application.
For local usage, extra transformations are applied:
- Wrap text lines starting with a `>` in `<span>` with a `quote` class.
This allows them to be styled appropriately from QML.
Some methods take an `inline` argument, which return text appropriate
for UI elements restricted to display a single line, e.g. the room
last message subtitles in QML or notifications.
In inline filtered HTML, block tags are stripped or substituted and
newlines are turned into symbols (U+23CE).
"""
inline_tags = {
"span", "font", "a", "sup", "sub", "b", "i", "s", "u", "code",
}
block_tags = {
"h1", "h2", "h3", "h4", "h5", "h6","blockquote",
"p", "ul", "ol", "li", "hr", "br", "img",
"table", "thead", "tbody", "tr", "th", "td", "pre",
2020-05-22 16:11:21 +10:00
"mx-reply",
}
opaque_id = r"[a-zA-Z\d._-]+?"
user_id_localpart = r"[\x21-\x39\x3B-\x7E]+?"
user_id_regex = re.compile(
rf"(?P<body>@{user_id_localpart}:(?P<host>[a-zA-Z\d.:-]*[a-zA-Z\d]))",
)
room_id_regex = re.compile(
rf"(?P<body>!{opaque_id}:(?P<host>[a-zA-Z\d.:-]*[a-zA-Z\d]))",
)
room_alias_regex = re.compile(
r"(?=^|\W)(?P<body>#\S+?:(?P<host>[a-zA-Z\d.:-]*[a-zA-Z\d]))",
)
link_regexes = [re.compile(r, re.IGNORECASE)
if isinstance(r, str) else r for r in [
# Normal :// URLs
(r"(?P<body>[a-zA-Z\d]+://(?P<host>[a-z\d._-]+(?:\:\d+)?)"
r"(?:/[/\-_.,a-z\d#%&?;=~]*)?(?:\([/\-_.,a-z\d#%&?;=~]*\))?)"),
# mailto: and tel:
r"mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9.:-]*[a-z\d]))",
r"tel:(?P<body>[0-9+-]+)(?P<host>)",
# magnet:
r"(?P<body>magnet:\?xt=urn:[a-z0-9]+:.+)(?P<host>)",
user_id_regex, room_id_regex, room_alias_regex,
]]
link_is_matrix_to_regex = re.compile(
r"https?://matrix.to/#/\S+", re.IGNORECASE,
)
link_is_user_id_regex = re.compile(
r"https?://matrix.to/#/@\S+", re.IGNORECASE,
)
link_is_room_id_regex = re.compile(
r"https?://matrix.to/#/!\S+", re.IGNORECASE,
)
link_is_room_alias_regex = re.compile(
r"https?://matrix.to/#/#\S+", re.IGNORECASE,
)
2020-03-24 08:17:15 +11:00
link_is_message_id_regex = re.compile(
r"https?://matrix.to/#/[!#]\S+/\$\S+", re.IGNORECASE,
2020-03-24 08:17:15 +11:00
)
inline_quote_regex = re.compile(r"(^|⏎)(\s*&gt;[^⏎\n]*)", re.MULTILINE)
quote_regex = re.compile(
2020-03-13 18:55:04 +11:00
r"(^|<span/?>|<p/?>|<br/?>|<h\d/?>)"
r"(\s*&gt;.*?)"
r"(<span/?>|</?p>|<br/?>|</?h\d>|$)",
re.MULTILINE,
)
extra_newlines_regex = re.compile(r"\n(\n*)")
def __init__(self) -> None:
# The whitespace remover doesn't take <pre> into account
sanitizer.normalize_overall_whitespace = lambda html, *args, **kw: html
sanitizer.normalize_whitespace_in_text_or_tail = \
lambda el, *args, **kw: el
# hard_wrap: convert all \n to <br> without required two spaces
# escape: escape HTML characters in the input string, e.g. tags
self._markdown_to_html = mistune.Markdown(
hard_wrap=True,
escape=True,
inline=MarkdownInlineLexer,
renderer=MarkdownRenderer(),
)
self._markdown_to_html.block.default_rules = [
rule for rule in self._markdown_to_html.block.default_rules
if rule != "block_quote"
]
def mentions_in_html(self, html: str) -> List[Tuple[str, str]]:
"""Return list of (text, href) tuples for all mention links in html."""
2020-03-23 14:55:48 +11:00
if not html.strip():
return []
return [
(a_tag.text, href)
for a_tag, _, href, _ in lxml.html.iterlinks(html)
2020-03-24 07:29:32 +11:00
if a_tag.text and
self.link_is_matrix_to_regex.match(unquote(href.strip()))
]
2020-03-23 14:55:48 +11:00
def from_markdown(
2020-03-23 07:21:29 +11:00
self,
2020-08-21 15:17:29 +10:00
text: str,
inline: bool = False,
outgoing: bool = False,
display_name_mentions: Optional[Dict[str, str]] = None,
) -> str:
"""Return filtered HTML from Markdown text."""
return self.filter(
self._markdown_to_html(text),
inline,
outgoing,
2020-08-21 15:17:29 +10:00
display_name_mentions,
)
2020-03-23 07:21:29 +11:00
def filter(
self,
2020-08-21 15:17:29 +10:00
html: str,
inline: bool = False,
outgoing: bool = False,
display_name_mentions: Optional[Dict[str, str]] = None,
) -> str:
"""Filter and return HTML."""
2020-08-21 15:17:29 +10:00
mentions = display_name_mentions
sanit = Sanitizer(self.sanitize_settings(inline, outgoing, mentions))
html = sanit.sanitize(html).rstrip("\n")
if not html.strip():
return html
tree = etree.fromstring(
html, parser=etree.HTMLParser(encoding="utf-8"),
)
for a_tag in tree.iterdescendants("a"):
2020-08-21 15:17:29 +10:00
self._mentions_to_matrix_to_links(a_tag, mentions, outgoing)
if not outgoing:
self._matrix_to_links_add_classes(a_tag)
html = etree.tostring(tree, encoding="utf-8", method="html").decode()
html = sanit.sanitize(html).rstrip("\n")
if outgoing:
return html
2019-07-22 07:41:43 +10:00
# Client-side modifications
2020-03-13 18:55:04 +11:00
html = self.quote_regex.sub(r'\1<span class="quote">\2</span>\3', html)
if not inline:
return html
return self.inline_quote_regex.sub(
r'\1<span class="quote">\2</span>', html,
)
def sanitize_settings(
2020-08-21 15:17:29 +10:00
self,
inline: bool = False,
outgoing: bool = False,
display_name_mentions: Optional[Dict[str, str]] = None,
) -> dict:
"""Return an html_sanitizer configuration."""
Big performance refactoring & various improvements Instead of passing all sorts of events for the JS to handle and manually add to different data models, we now handle everything we can in Python. For any change, the python models send a sync event with their contents (no more than 4 times per second) to JS, and the QSyncable library's JsonListModel takes care of converting it to a QML ListModel and sending the appropriate signals. The SortFilterProxyModel library is not used anymore, the only case where we need to filter/sort something now is when the user interacts with the "Filter rooms" or "Filter members" fields. These cases are handled by a simple JS function. We now keep separated room and timeline models for different accounts, the previous approach of sharing all the data we could between accounts created a lot of complications (local echoes, decrypted messages replacing others, etc). The users's own account profile changes are now hidden in the timeline. On startup, if all events for a room were only own profile changes, more events will be loaded. Any kind of image format supported by Qt is now handled by the pyotherside image provider, instead of just PNG/JPG. SVGs which previously caused errors are supported as well. The typing members bar paddings/margins are fixed. The behavior of the avatar/"upload a profile picture" overlay is fixed. Config files read from disk are now cached (TODO: make them reloadable again). Pylint is not used anymore because of all its annoying false warnings and lack of understanding for dataclasses, it is replaced by flake8 with a custom config and various plugins. Debug mode is now considered on if the program was compiled with the right option, instead of taking an argument from CLI. When on, C++ will set a flag in the Window QML component. The loading screen is now unloaded after the UI is ready, where previously it just stayed in the background invisible and wasted CPU. The overall refactoring and improvements make us now able to handle rooms with thousand of members and no lazy-loading, where previously everything would freeze and simply scrolling up to load past events in any room would block the UI for a few seconds.
2019-08-11 22:01:22 +10:00
# https://matrix.org/docs/spec/client_server/latest#m-room-message-msgtypes
inline_tags = self.inline_tags
all_tags = inline_tags | self.block_tags
inlines_attributes = {
Big performance refactoring & various improvements Instead of passing all sorts of events for the JS to handle and manually add to different data models, we now handle everything we can in Python. For any change, the python models send a sync event with their contents (no more than 4 times per second) to JS, and the QSyncable library's JsonListModel takes care of converting it to a QML ListModel and sending the appropriate signals. The SortFilterProxyModel library is not used anymore, the only case where we need to filter/sort something now is when the user interacts with the "Filter rooms" or "Filter members" fields. These cases are handled by a simple JS function. We now keep separated room and timeline models for different accounts, the previous approach of sharing all the data we could between accounts created a lot of complications (local echoes, decrypted messages replacing others, etc). The users's own account profile changes are now hidden in the timeline. On startup, if all events for a room were only own profile changes, more events will be loaded. Any kind of image format supported by Qt is now handled by the pyotherside image provider, instead of just PNG/JPG. SVGs which previously caused errors are supported as well. The typing members bar paddings/margins are fixed. The behavior of the avatar/"upload a profile picture" overlay is fixed. Config files read from disk are now cached (TODO: make them reloadable again). Pylint is not used anymore because of all its annoying false warnings and lack of understanding for dataclasses, it is replaced by flake8 with a custom config and various plugins. Debug mode is now considered on if the program was compiled with the right option, instead of taking an argument from CLI. When on, C++ will set a flag in the Window QML component. The loading screen is now unloaded after the UI is ready, where previously it just stayed in the background invisible and wasted CPU. The overall refactoring and improvements make us now able to handle rooms with thousand of members and no lazy-loading, where previously everything would freeze and simply scrolling up to load past events in any room would block the UI for a few seconds.
2019-08-11 22:01:22 +10:00
"font": {"color"},
"a": {"href", "class", "data-mention"},
"code": {"class"},
}
attributes = {**inlines_attributes, **{
"ol": {"start"},
2019-07-22 08:17:51 +10:00
"hr": {"width"},
"span": {"data-mx-color"},
"img": {
"data-mx-emote", "src", "alt", "title", "width", "height",
},
}}
2020-08-21 15:17:29 +10:00
username_link_regexes = [re.compile(r) for r in [
rf"(?<!\w)(?P<body>{re.escape(name or user_id)})(?!\w)(?P<host>)"
for user_id, name in (display_name_mentions or {}).items()
2020-08-21 15:17:29 +10:00
]]
return {
"tags": inline_tags if inline else all_tags,
"attributes": inlines_attributes if inline else attributes,
"empty": {} if inline else {"hr", "br", "img"},
"separate": {"a"} if inline else {
"a", "p", "li", "table", "tr", "th", "td", "br", "hr", "img",
},
"whitespace": {},
Big performance refactoring & various improvements Instead of passing all sorts of events for the JS to handle and manually add to different data models, we now handle everything we can in Python. For any change, the python models send a sync event with their contents (no more than 4 times per second) to JS, and the QSyncable library's JsonListModel takes care of converting it to a QML ListModel and sending the appropriate signals. The SortFilterProxyModel library is not used anymore, the only case where we need to filter/sort something now is when the user interacts with the "Filter rooms" or "Filter members" fields. These cases are handled by a simple JS function. We now keep separated room and timeline models for different accounts, the previous approach of sharing all the data we could between accounts created a lot of complications (local echoes, decrypted messages replacing others, etc). The users's own account profile changes are now hidden in the timeline. On startup, if all events for a room were only own profile changes, more events will be loaded. Any kind of image format supported by Qt is now handled by the pyotherside image provider, instead of just PNG/JPG. SVGs which previously caused errors are supported as well. The typing members bar paddings/margins are fixed. The behavior of the avatar/"upload a profile picture" overlay is fixed. Config files read from disk are now cached (TODO: make them reloadable again). Pylint is not used anymore because of all its annoying false warnings and lack of understanding for dataclasses, it is replaced by flake8 with a custom config and various plugins. Debug mode is now considered on if the program was compiled with the right option, instead of taking an argument from CLI. When on, C++ will set a flag in the Window QML component. The loading screen is now unloaded after the UI is ready, where previously it just stayed in the background invisible and wasted CPU. The overall refactoring and improvements make us now able to handle rooms with thousand of members and no lazy-loading, where previously everything would freeze and simply scrolling up to load past events in any room would block the UI for a few seconds.
2019-08-11 22:01:22 +10:00
"keep_typographic_whitespace": True,
"add_nofollow": False,
"autolink": {
"link_regexes":
self.link_regexes + username_link_regexes, # type: ignore
"avoid_hosts": [],
},
"sanitize_href": lambda href: href,
"element_preprocessors": [
sanitizer.bold_span_to_strong,
sanitizer.italic_span_to_em,
sanitizer.tag_replacer("strong", "b"),
sanitizer.tag_replacer("em", "i"),
sanitizer.tag_replacer("strike", "s"),
sanitizer.tag_replacer("del", "s"),
sanitizer.tag_replacer("form", "p"),
sanitizer.tag_replacer("div", "p"),
sanitizer.tag_replacer("caption", "p"),
sanitizer.target_blank_noopener,
self._span_color_to_font if not outgoing else lambda el: el,
self._img_to_a,
self._remove_extra_newlines,
self._newlines_to_return_symbol if inline else lambda el: el,
],
"element_postprocessors": [
self._font_color_to_span if outgoing else lambda el: el,
],
"is_mergeable": lambda e1, e2: e1.attrib == e2.attrib,
}
Big performance refactoring & various improvements Instead of passing all sorts of events for the JS to handle and manually add to different data models, we now handle everything we can in Python. For any change, the python models send a sync event with their contents (no more than 4 times per second) to JS, and the QSyncable library's JsonListModel takes care of converting it to a QML ListModel and sending the appropriate signals. The SortFilterProxyModel library is not used anymore, the only case where we need to filter/sort something now is when the user interacts with the "Filter rooms" or "Filter members" fields. These cases are handled by a simple JS function. We now keep separated room and timeline models for different accounts, the previous approach of sharing all the data we could between accounts created a lot of complications (local echoes, decrypted messages replacing others, etc). The users's own account profile changes are now hidden in the timeline. On startup, if all events for a room were only own profile changes, more events will be loaded. Any kind of image format supported by Qt is now handled by the pyotherside image provider, instead of just PNG/JPG. SVGs which previously caused errors are supported as well. The typing members bar paddings/margins are fixed. The behavior of the avatar/"upload a profile picture" overlay is fixed. Config files read from disk are now cached (TODO: make them reloadable again). Pylint is not used anymore because of all its annoying false warnings and lack of understanding for dataclasses, it is replaced by flake8 with a custom config and various plugins. Debug mode is now considered on if the program was compiled with the right option, instead of taking an argument from CLI. When on, C++ will set a flag in the Window QML component. The loading screen is now unloaded after the UI is ready, where previously it just stayed in the background invisible and wasted CPU. The overall refactoring and improvements make us now able to handle rooms with thousand of members and no lazy-loading, where previously everything would freeze and simply scrolling up to load past events in any room would block the UI for a few seconds.
2019-08-11 22:01:22 +10:00
@staticmethod
def _span_color_to_font(el: HtmlElement) -> HtmlElement:
"""Convert HTML `<span data-mx-color=...` to `<font color=...>`."""
Big performance refactoring & various improvements Instead of passing all sorts of events for the JS to handle and manually add to different data models, we now handle everything we can in Python. For any change, the python models send a sync event with their contents (no more than 4 times per second) to JS, and the QSyncable library's JsonListModel takes care of converting it to a QML ListModel and sending the appropriate signals. The SortFilterProxyModel library is not used anymore, the only case where we need to filter/sort something now is when the user interacts with the "Filter rooms" or "Filter members" fields. These cases are handled by a simple JS function. We now keep separated room and timeline models for different accounts, the previous approach of sharing all the data we could between accounts created a lot of complications (local echoes, decrypted messages replacing others, etc). The users's own account profile changes are now hidden in the timeline. On startup, if all events for a room were only own profile changes, more events will be loaded. Any kind of image format supported by Qt is now handled by the pyotherside image provider, instead of just PNG/JPG. SVGs which previously caused errors are supported as well. The typing members bar paddings/margins are fixed. The behavior of the avatar/"upload a profile picture" overlay is fixed. Config files read from disk are now cached (TODO: make them reloadable again). Pylint is not used anymore because of all its annoying false warnings and lack of understanding for dataclasses, it is replaced by flake8 with a custom config and various plugins. Debug mode is now considered on if the program was compiled with the right option, instead of taking an argument from CLI. When on, C++ will set a flag in the Window QML component. The loading screen is now unloaded after the UI is ready, where previously it just stayed in the background invisible and wasted CPU. The overall refactoring and improvements make us now able to handle rooms with thousand of members and no lazy-loading, where previously everything would freeze and simply scrolling up to load past events in any room would block the UI for a few seconds.
2019-08-11 22:01:22 +10:00
if el.tag not in ("span", "font"):
return el
Big performance refactoring & various improvements Instead of passing all sorts of events for the JS to handle and manually add to different data models, we now handle everything we can in Python. For any change, the python models send a sync event with their contents (no more than 4 times per second) to JS, and the QSyncable library's JsonListModel takes care of converting it to a QML ListModel and sending the appropriate signals. The SortFilterProxyModel library is not used anymore, the only case where we need to filter/sort something now is when the user interacts with the "Filter rooms" or "Filter members" fields. These cases are handled by a simple JS function. We now keep separated room and timeline models for different accounts, the previous approach of sharing all the data we could between accounts created a lot of complications (local echoes, decrypted messages replacing others, etc). The users's own account profile changes are now hidden in the timeline. On startup, if all events for a room were only own profile changes, more events will be loaded. Any kind of image format supported by Qt is now handled by the pyotherside image provider, instead of just PNG/JPG. SVGs which previously caused errors are supported as well. The typing members bar paddings/margins are fixed. The behavior of the avatar/"upload a profile picture" overlay is fixed. Config files read from disk are now cached (TODO: make them reloadable again). Pylint is not used anymore because of all its annoying false warnings and lack of understanding for dataclasses, it is replaced by flake8 with a custom config and various plugins. Debug mode is now considered on if the program was compiled with the right option, instead of taking an argument from CLI. When on, C++ will set a flag in the Window QML component. The loading screen is now unloaded after the UI is ready, where previously it just stayed in the background invisible and wasted CPU. The overall refactoring and improvements make us now able to handle rooms with thousand of members and no lazy-loading, where previously everything would freeze and simply scrolling up to load past events in any room would block the UI for a few seconds.
2019-08-11 22:01:22 +10:00
color = el.attrib.pop("data-mx-color", None)
if color:
el.tag = "font"
el.attrib["color"] = color
return el
@staticmethod
def _font_color_to_span(el: HtmlElement) -> HtmlElement:
"""Convert HTML `<font color=...>` to `<span data-mx-color=...`."""
if el.tag not in ("span", "font"):
return el
color = el.attrib.pop("color", None)
if color:
el.tag = "span"
el.attrib["data-mx-color"] = color
return el
2019-07-08 13:52:41 +10:00
@staticmethod
def _img_to_a(el: HtmlElement) -> HtmlElement:
"""Linkify images by wrapping `<img>` tags in `<a>`."""
if el.tag != "img":
return el
src = el.attrib.get("src", "")
width = el.attrib.get("width")
height = el.attrib.get("height")
is_emote = "data-mx-emote" in el.attrib
if src.startswith("mxc://"):
el.attrib["src"] = nio.Api.mxc_to_http(src)
if is_emote and not width and not height:
el.attrib["width"] = 32
el.attrib["height"] = 32
elif is_emote and width and not height:
el.attrib["height"] = width
elif is_emote and height and not width:
el.attrib["width"] = height
elif not is_emote and (not width or not height):
el.tag = "a"
el.attrib["href"] = el.attrib.pop("src", "")
el.text = el.attrib.pop("alt", None) or el.attrib["href"]
return el
def _remove_extra_newlines(self, el: HtmlElement) -> HtmlElement:
r"""Remove excess `\n` characters from HTML elements.
This is done to avoid additional blank lines when the CSS directive
`white-space: pre` is used.
Text inside `<pre>` tags is ignored, except for the final newlines.
"""
pre_parent = any(parent.tag == "pre" for parent in el.iterancestors())
if el.tag != "pre" and not pre_parent:
if el.text:
el.text = self.extra_newlines_regex.sub(r"\1", el.text)
if el.tail:
el.tail = self.extra_newlines_regex.sub(r"\1", el.tail)
else:
if el.text and el.text.endswith("\n"):
el.text = el.text[:-1]
if el.tail and el.tail.endswith("\n"):
el.tail = el.tail[:-1]
return el
def _newlines_to_return_symbol(self, el: HtmlElement) -> HtmlElement:
"""Turn newlines into unicode return symbols (⏎, U+23CE).
The symbol is added to blocks with siblings (e.g. a `<p>` followed by
another `<p>`) and `<br>` tags.
The `<br>` themselves will be removed by the inline sanitizer.
"""
is_block_with_siblings = (el.tag in self.block_tags and
next(el.itersiblings(), None) is not None)
if el.tag == "br" or is_block_with_siblings:
el.tail = f"{el.tail or ''}"
# Replace left \n in text/tail of <pre> content by the return symbol.
if el.text:
el.text = re.sub(r"\n", r"", el.text)
if el.tail:
el.tail = re.sub(r"\n", r"", el.tail)
return el
def _mentions_to_matrix_to_links(
2020-08-21 15:17:29 +10:00
self,
el: HtmlElement,
display_name_mentions: Optional[Dict[str, str]] = None,
outgoing: bool = False,
) -> HtmlElement:
2020-08-21 15:17:29 +10:00
"""Turn user ID, usernames and room ID/aliases into matrix.to URL.
2020-08-21 15:17:29 +10:00
After the HTML sanitizer autolinks these, the links's hrefs are the
link text, e.g. `<a href="@foo:bar.com">@foo:bar.com</a>`.
We turn them into proper matrix.to URL in this function.
"""
if el.tag != "a" or not el.attrib.get("href"):
return el
id_regexes = (
self.user_id_regex, self.room_id_regex, self.room_alias_regex,
)
for regex in id_regexes:
2020-03-24 08:17:15 +11:00
if regex.match(unquote(el.attrib["href"])):
el.attrib["href"] = f"https://matrix.to/#/{el.attrib['href']}"
return el
for user_id, name in (display_name_mentions or {}).items():
if unquote(el.attrib["href"]) == (name or user_id):
el.attrib["href"] = f"https://matrix.to/#/{user_id}"
return el
return el
def _matrix_to_links_add_classes(self, el: HtmlElement) -> HtmlElement:
"""Add special CSS classes to matrix.to mention links."""
href = unquote(el.attrib.get("href", ""))
2020-03-24 07:29:32 +11:00
if not href or not el.text:
return el
# This must be first, or link will be mistaken by room ID/alias regex
if self.link_is_message_id_regex.match(href):
el.attrib["class"] = "mention message-id-mention"
el.attrib["data-mention"] = el.text.strip()
elif self.link_is_user_id_regex.match(href):
if el.text.strip().startswith("@"):
el.attrib["class"] = "mention user-id-mention"
else:
el.attrib["class"] = "mention username-mention"
el.attrib["data-mention"] = el.text.strip()
elif self.link_is_room_id_regex.match(href):
el.attrib["class"] = "mention room-id-mention"
el.attrib["data-mention"] = el.text.strip()
elif self.link_is_room_alias_regex.match(href):
el.attrib["class"] = "mention room-alias-mention"
el.attrib["data-mention"] = el.text.strip()
return el
HTML_PROCESSOR = HTMLProcessor()