2020-09-24 09:57:54 +10:00
|
|
|
# Copyright Mirage authors & contributors <https://github.com/mirukana/mirage>
|
2019-12-19 22:46:16 +11:00
|
|
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
|
|
|
|
2019-12-19 00:33:22 +11:00
|
|
|
"""HTML and Markdown processing tools."""
|
|
|
|
|
2019-07-03 03:59:52 +10:00
|
|
|
import re
|
2020-09-17 00:53:29 +10:00
|
|
|
from typing import Dict, List, Optional, Tuple
|
2020-03-23 14:00:30 +11:00
|
|
|
from urllib.parse import unquote
|
2019-07-03 03:59:52 +10:00
|
|
|
|
|
|
|
import html_sanitizer.sanitizer as sanitizer
|
2020-03-23 14:55:48 +11:00
|
|
|
import lxml.html # nosec
|
2019-10-24 22:27:13 +11:00
|
|
|
import mistune
|
2019-07-04 14:24:21 +10:00
|
|
|
from html_sanitizer.sanitizer import Sanitizer
|
2020-03-23 14:00:30 +11:00
|
|
|
from lxml.html import HtmlElement, etree # nosec
|
2019-07-03 03:59:52 +10:00
|
|
|
|
2020-06-27 22:35:11 +10:00
|
|
|
import nio
|
|
|
|
|
2019-12-21 06:02:54 +11:00
|
|
|
from .svg_colors import SVG_COLORS
|
|
|
|
|
2019-07-03 03:59:52 +10:00
|
|
|
|
2019-09-12 07:19:24 +10:00
|
|
|
class MarkdownInlineGrammar(mistune.InlineGrammar):
    """Inline-level Markdown syntax tweaks for the Mistune parser.

    Compared to the grammar this class inherits from:

    - Underscores are not accepted for bold/italics (e.g. `__bold__`),
      the emphasis patterns below only recognize asterisks.

    - A coloring syntax is available: `<color>(text)`,
      e.g. `<red>(Lorem ipsum)` or `<#000040>(sit dolor amet...)`
    """

    # Backslash-escapable characters; `<` is included so that the start of
    # a `<color>(text)` sequence can be escaped.
    escape = re.compile(r"^\\([\\`*{}\[\]()#+\-.!_<>~|])")

    # Asterisk-only bold (`**text**`) and italics (`*text*`).
    double_emphasis = re.compile(r"^\*{2}([\s\S]+?)\*{2}(?!\*)")
    emphasis = re.compile(r"^\*((?:\*\*|[^\*])+?)\*(?!\*)")

    # test string: r"<b>(x) <r>(x) \<a>b>(x) <a\>b>(x) <b>(\(z) <c>(foo\)xyz)"
    color = re.compile(
        r"^<(.+?)>"             # group 1: the color between `<` and `>`
        r"\((.+?)"              # group 2: the text following the `(`
        r"(?<!\\)(?:\\\\)*"     # a `)` preceded by an odd backslash count
                                # is escaped and doesn't end the text
        r"\)",                  # the unescaped closing `)`
    )
|
|
|
|
|
2019-09-12 07:19:24 +10:00
|
|
|
|
|
|
|
class MarkdownInlineLexer(mistune.InlineLexer):
    """Mistune inline lexer that uses the `MarkdownInlineGrammar` rules."""

    grammar_class = MarkdownInlineGrammar

    # Rule names tried on normal inline text, "color" is one of them.
    default_rules = [
        "escape",
        "color",
        "autolink",
        "url",
        "footnote",
        "link",
        "reflink",
        "nolink",
        "double_emphasis",
        "emphasis",
        "code",
        "linebreak",
        "strikethrough",
        "text",
    ]

    # Same list as above without "footnote".
    inline_html_rules = [
        "escape",
        "color",
        "autolink",
        "url",
        "link",
        "reflink",
        "nolink",
        "double_emphasis",
        "emphasis",
        "code",
        "linebreak",
        "strikethrough",
        "text",
    ]

    def output_double_emphasis(self, m):
        """Render the first group of a `double_emphasis` match as bold."""
        inner = self.output(m.group(1))
        return self.renderer.double_emphasis(inner)

    def output_emphasis(self, m):
        """Render the first group of an `emphasis` match as italics."""
        inner = self.output(m.group(1))
        return self.renderer.emphasis(inner)

    def output_color(self, m):
        """Render a `color` match: group 1 is the color, group 2 the text."""
        color, raw_text = m.group(1), m.group(2)
        return self.renderer.color(color, self.output(raw_text))
|
|
|
|
|
|
|
|
|
|
|
|
class MarkdownRenderer(mistune.Renderer):
    """Mistune renderer extended with a `color()` inline element."""

    def color(self, color: str, text: str) -> str:
        """Render given text with a color using `<span data-mx-color=#hex>`.

        `color` is the raw string typed between `<` and `>`: either a name
        looked up in `SVG_COLORS`, or any other value which is kept as-is.
        `text` is inserted unchanged as the content of the `<span>`.
        """

        # Function-scope import, nothing else in this class needs it.
        from html import escape

        # This may be a color name, try to get a #hex code for it.
        color = SVG_COLORS.get(re.sub(r"\s", "", color.lower()), color)

        # The color comes straight from user input and lands inside a quoted
        # attribute: escape it so a `"` can't terminate the attribute early
        # and inject extra markup.
        safe_color = escape(color, quote=True)

        return f'<span data-mx-color="{safe_color}">{text}</span>'
|
|
|
|
|
|
|
|
|
2019-12-19 00:33:22 +11:00
|
|
|
class HTMLProcessor:
    """Provide HTML filtering and conversion from Markdown.

    Filtering sanitizes HTML and ensures it complies both with the Matrix
    specification:
    https://matrix.org/docs/spec/client_server/latest#m-room-message-msgtypes
    and the supported Qt HTML subset for usage in QML:
    https://doc.qt.io/qt-5/richtext-html-subset.html

    Some methods take an `outgoing` argument, specifying if the HTML is
    intended to be sent to matrix servers or used locally in our application.

    For local usage, extra transformations are applied:

    - Wrap text lines starting with a `>` in `<span>` with a `quote` class.
      This allows them to be styled appropriately from QML.

    Some methods take an `inline` argument, which return text appropriate
    for UI elements restricted to display a single line, e.g. the room
    last message subtitles in QML or notifications.
    In inline filtered HTML, block tags are stripped or substituted and
    newlines are turned into ⏎ symbols (U+23CE).
    """

    # Tags kept by the sanitizer in both inline and normal mode.
    inline_tags = {
        "span", "font", "a", "sup", "sub", "b", "i", "s", "u", "code",
    }

    # Tags additionally kept when not in inline mode.
    block_tags = {
        "h1", "h2", "h3", "h4", "h5", "h6", "blockquote",
        "p", "ul", "ol", "li", "hr", "br", "img",
        "table", "thead", "tbody", "tr", "th", "td", "pre",
        "mx-reply",
    }

    # Regex fragments interpolated into the ID patterns below.
    opaque_id = r"[a-zA-Z\d._-]+?"
    # ASCII 0x21-0x7E except 0x3A (`:`), which separates localpart and host.
    user_id_localpart = r"[\x21-\x39\x3B-\x7E]+?"

    # Each pattern exposes a `body` group (the full ID) and a `host` group.
    user_id_regex = re.compile(
        rf"(?P<body>@{user_id_localpart}:(?P<host>[a-zA-Z\d.:-]*[a-zA-Z\d]))",
    )
    room_id_regex = re.compile(
        rf"(?P<body>!{opaque_id}:(?P<host>[a-zA-Z\d.:-]*[a-zA-Z\d]))",
    )
    # NOTE(review): `(?=^|\W)` is a lookahead placed right before the
    # non-word `#`, so it looks like it can never reject a match; confirm
    # whether a word-boundary check was intended. Left unchanged.
    room_alias_regex = re.compile(
        r"(?=^|\W)(?P<body>#\S+?:(?P<host>[a-zA-Z\d.:-]*[a-zA-Z\d]))",
    )

    # Patterns of text to turn into links. Plain strings are compiled
    # case-insensitively, already compiled patterns are used as they are.
    link_regexes = [re.compile(r, re.IGNORECASE)
                    if isinstance(r, str) else r for r in [
        # Normal :// URLs
        (r"(?P<body>[a-zA-Z\d]+://(?P<host>[a-z\d._-]+(?:\:\d+)?)"
         r"(?:/[/\-_.,a-z\d#%&?;=~]*)?(?:\([/\-_.,a-z\d#%&?;=~]*\))?)"),

        # mailto: and tel:
        r"mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9.:-]*[a-z\d]))",
        r"tel:(?P<body>[0-9+-]+)(?P<host>)",

        # magnet:
        r"(?P<body>magnet:\?xt=urn:[a-z0-9]+:.+)(?P<host>)",

        user_id_regex, room_id_regex, room_alias_regex,
    ]]

    # Classification of matrix.to links by the first character after `#/`.
    # The dot of `matrix.to` is escaped so that only that exact host matches
    # (an unescaped `.` would also accept e.g. `matrixxto` or `matrix-to`).
    link_is_matrix_to_regex = re.compile(
        r"https?://matrix\.to/#/\S+", re.IGNORECASE,
    )
    link_is_user_id_regex = re.compile(
        r"https?://matrix\.to/#/@\S+", re.IGNORECASE,
    )
    link_is_room_id_regex = re.compile(
        r"https?://matrix\.to/#/!\S+", re.IGNORECASE,
    )
    link_is_room_alias_regex = re.compile(
        r"https?://matrix\.to/#/#\S+", re.IGNORECASE,
    )
    link_is_message_id_regex = re.compile(
        r"https?://matrix\.to/#/[!#]\S+/\$\S+", re.IGNORECASE,
    )

    # A `>` quote at the start of a line or right after a ⏎ symbol,
    # running until the next ⏎ or newline.
    inline_quote_regex = re.compile(r"(^|⏎)(\s*>[^⏎\n]*)", re.MULTILINE)

    # A `>` quote between a line start or one of the listed tags (group 1)
    # and a line end or one of the listed tags (group 3).
    quote_regex = re.compile(
        r"(^|<span/?>|<p/?>|<br/?>|<h\d/?>)"
        r"(\s*>.*?)"
        r"(<span/?>|</?p>|<br/?>|</?h\d>|$)",
        re.MULTILINE,
    )

    # A run of newlines: the first one is matched outside of group 1,
    # any following ones are captured by it.
    extra_newlines_regex = re.compile(r"\n(\n*)")
|
2019-09-11 11:22:42 +10:00
|
|
|
|
2019-07-03 03:59:52 +10:00
|
|
|
|
2020-03-23 11:58:05 +11:00
|
|
|
def __init__(self) -> None:
    """Disable the sanitizer's whitespace handling and set up Mistune."""

    # The whitespace remover doesn't take <pre> into account, replace both
    # of the sanitizer module's normalizers by functions returning their
    # first argument untouched.
    def keep_html(html, *args, **kw):
        return html

    def keep_element(el, *args, **kw):
        return el

    sanitizer.normalize_overall_whitespace = keep_html
    sanitizer.normalize_whitespace_in_text_or_tail = keep_element

    # hard_wrap: convert all \n to <br> without required two spaces
    # escape: escape HTML characters in the input string, e.g. tags
    markdown = mistune.Markdown(
        hard_wrap=True,
        escape=True,
        inline=MarkdownInlineLexer,
        renderer=MarkdownRenderer(),
    )

    # Drop the "block_quote" rule from the block-level rules.
    markdown.block.default_rules = [
        name for name in markdown.block.default_rules
        if name != "block_quote"
    ]

    self._markdown_to_html = markdown
|
2019-07-03 03:59:52 +10:00
|
|
|
|
|
|
|
|
2020-03-24 06:39:14 +11:00
|
|
|
def mentions_in_html(self, html: str) -> List[Tuple[str, str]]:
    """Collect the matrix.to links found in an HTML string.

    The result is a list of `(text, href)` tuples, one for every link whose
    element has text and whose stripped, URL-unquoted `href` matches
    `link_is_matrix_to_regex`. The `href` is returned as found in the HTML.
    An empty list is returned for empty or whitespace-only HTML.
    """

    if not html.strip():
        return []

    is_matrix_to = self.link_is_matrix_to_regex.match
    mentions = []

    for element, _attribute, href, _position in lxml.html.iterlinks(html):
        label = element.text

        if not label:
            continue

        if is_matrix_to(unquote(href.strip())):
            mentions.append((label, href))

    return mentions
|
|
|
|
|
2020-03-23 14:55:48 +11:00
|
|
|
|
2019-12-21 06:36:01 +11:00
|
|
|
def from_markdown(
    self,
    text: str,
    inline: bool = False,
    outgoing: bool = False,
    display_name_mentions: Optional[Dict[str, str]] = None,
) -> str:
    """Convert Markdown text to HTML and return it filtered.

    The `inline`, `outgoing` and `display_name_mentions` arguments are
    passed unchanged to `filter()`.
    """

    raw_html = self._markdown_to_html(text)
    return self.filter(raw_html, inline, outgoing, display_name_mentions)
|
2020-03-23 07:21:29 +11:00
|
|
|
|
|
|
|
|
2019-12-21 06:36:01 +11:00
|
|
|
def filter(
    self,
    html: str,
    inline: bool = False,
    outgoing: bool = False,
    display_name_mentions: Optional[Dict[str, str]] = None,
) -> str:
    """Sanitize HTML, process its links and return the result.

    The HTML is sanitized with the settings from `sanitize_settings()`,
    every `<a>` element is passed to `_mentions_to_matrix_to_links()` and,
    unless `outgoing` is set, to `_matrix_to_links_add_classes()`, then the
    result is sanitized a second time.

    For non-outgoing HTML, `>` quotes are wrapped in
    `<span class="quote">`, with an extra pass for `inline` HTML.
    """

    settings = self.sanitize_settings(inline, outgoing, display_name_mentions)
    cleaner = Sanitizer(settings)
    cleaned = cleaner.sanitize(html).rstrip("\n")

    # Nothing left to process
    if not cleaned.strip():
        return cleaned

    parser = etree.HTMLParser(encoding="utf-8")
    root = etree.fromstring(cleaned, parser=parser)

    for link in root.iterdescendants("a"):
        self._mentions_to_matrix_to_links(
            link, display_name_mentions, outgoing,
        )

        if not outgoing:
            self._matrix_to_links_add_classes(link)

    serialized = etree.tostring(root, encoding="utf-8", method="html")
    cleaned = cleaner.sanitize(serialized.decode()).rstrip("\n")

    if outgoing:
        return cleaned

    # Client-side modifications

    quoted = self.quote_regex.sub(
        r'\1<span class="quote">\2</span>\3', cleaned,
    )

    if inline:
        quoted = self.inline_quote_regex.sub(
            r'\1<span class="quote">\2</span>', quoted,
        )

    return quoted
|
|
|
|
|
2019-07-03 03:59:52 +10:00
|
|
|
|
2019-12-21 06:36:01 +11:00
|
|
|
def sanitize_settings(
    self,
    inline: bool = False,
    outgoing: bool = False,
    display_name_mentions: Optional[Dict[str, str]] = None,
) -> dict:
    """Build the settings dict passed to html_sanitizer's `Sanitizer`.

    `inline` restricts tags and attributes to the inline ones and adds the
    inline-only element preprocessors.
    `outgoing` selects in which direction colors are converted between
    `<span data-mx-color>` and `<font color>`.
    `display_name_mentions` is a `{user_id: display_name}` dict, every name
    (or the user ID if the name is empty) becomes text to autolink.
    """

    # https://matrix.org/docs/spec/client_server/latest#m-room-message-msgtypes

    def untouched(el):
        # Stand-in for processors that are disabled by the current flags
        return el

    inline_attributes = {
        "font": {"color"},
        "a": {"href", "class", "data-mention"},
        "code": {"class"},
    }

    block_attributes = dict(inline_attributes)
    block_attributes.update({
        "ol": {"start"},
        "hr": {"width"},
        "span": {"data-mx-color"},
        "img": {
            "data-mx-emote", "src", "alt", "title", "width", "height",
        },
    })

    # One pattern per mentionable user, matching only when the name isn't
    # directly surrounded by other word characters.
    username_link_regexes = []

    for user_id, name in (display_name_mentions or {}).items():
        target = re.escape(name or user_id)
        username_link_regexes.append(
            re.compile(rf"(?<!\w)(?P<body>{target})(?!\w)(?P<host>)"),
        )

    if inline:
        tags = self.inline_tags
        attributes = inline_attributes
        empty = {}
        separate = {"a"}
    else:
        tags = self.inline_tags | self.block_tags
        attributes = block_attributes
        empty = {"hr", "br", "img"}
        separate = {
            "a", "p", "li", "table", "tr", "th", "td", "br", "hr", "img",
        }

    preprocessors = [
        sanitizer.bold_span_to_strong,
        sanitizer.italic_span_to_em,
        sanitizer.tag_replacer("strong", "b"),
        sanitizer.tag_replacer("em", "i"),
        sanitizer.tag_replacer("strike", "s"),
        sanitizer.tag_replacer("del", "s"),
        sanitizer.tag_replacer("form", "p"),
        sanitizer.tag_replacer("div", "p"),
        sanitizer.tag_replacer("caption", "p"),
        sanitizer.target_blank_noopener,

        untouched if outgoing else self._span_color_to_font,

        self._img_to_a,
        self._remove_extra_newlines,
        self._newlines_to_return_symbol if inline else untouched,
        self._reply_to_inline if inline else untouched,
    ]

    postprocessors = [
        self._font_color_to_span if outgoing else untouched,
    ]

    return {
        "tags": tags,
        "attributes": attributes,
        "empty": empty,
        "separate": separate,
        "whitespace": {},
        "keep_typographic_whitespace": True,
        "add_nofollow": False,
        "autolink": {
            "link_regexes":
                self.link_regexes + username_link_regexes,  # type: ignore
            "avoid_hosts": [],
        },
        "sanitize_href": lambda href: href,
        "element_preprocessors": preprocessors,
        "element_postprocessors": postprocessors,
        "is_mergeable": lambda e1, e2: e1.attrib == e2.attrib,
    }
|
|
|
|
|
|
|
|
|
Big performance refactoring & various improvements
Instead of passing all sorts of events for the JS to handle and manually
add to different data models, we now handle everything we can in Python.
For any change, the python models send a sync event with their
contents (no more than 4 times per second) to JS, and the QSyncable
library's JsonListModel takes care of converting it to a QML ListModel
and sending the appropriate signals.
The SortFilterProxyModel library is not used anymore, the only case
where we need to filter/sort something now is when the user interacts
with the "Filter rooms" or "Filter members" fields. These cases are
handled by a simple JS function.
We now keep separated room and timeline models for different accounts,
the previous approach of sharing all the data we could between accounts
created a lot of complications (local echoes, decrypted messages
replacing others, etc).
The user's own account profile changes are now hidden in the timeline.
On startup, if all events for a room were only own profile changes, more
events will be loaded.
Any kind of image format supported by Qt is now handled by the
pyotherside image provider, instead of just PNG/JPG.
SVGs which previously caused errors are supported as well.
The typing members bar paddings/margins are fixed.
The behavior of the avatar/"upload a profile picture" overlay is fixed.
Config files read from disk are now cached (TODO: make them reloadable
again).
Pylint is not used anymore because of all its annoying false warnings
and lack of understanding for dataclasses, it is replaced by flake8 with
a custom config and various plugins.
Debug mode is now considered on if the program was compiled with
the right option, instead of taking an argument from CLI.
When on, C++ will set a flag in the Window QML component.
The loading screen is now unloaded after the UI is ready, where
previously it just stayed in the background invisible and wasted CPU.
The overall refactoring and improvements make us now able to handle
rooms with thousands of members and no lazy-loading, where previously
everything would freeze and simply scrolling up to load past events
in any room would block the UI for a few seconds.
2019-08-11 22:01:22 +10:00
|
|
|
@staticmethod
def _span_color_to_font(el: HtmlElement) -> HtmlElement:
    """Turn a `<span data-mx-color=...>` element into `<font color=...>`.

    Only `<span>` and `<font>` elements are touched. Their `data-mx-color`
    attribute is always removed, and if it had a non-empty value, the tag
    becomes `font` with that value as `color` attribute.
    The element is modified in place and returned.
    """

    if el.tag in ("span", "font"):
        mx_color = el.attrib.pop("data-mx-color", None)

        if mx_color:
            el.tag = "font"
            el.attrib["color"] = mx_color

    return el
|
|
|
|
|
|
|
|
|
2019-12-21 06:36:01 +11:00
|
|
|
@staticmethod
def _font_color_to_span(el: HtmlElement) -> HtmlElement:
    """Turn a `<font color=...>` element into `<span data-mx-color=...>`.

    Only `<span>` and `<font>` elements are touched. Their `color`
    attribute is always removed, and if it had a non-empty value, the tag
    becomes `span` with that value as `data-mx-color` attribute.
    The element is modified in place and returned.
    """

    if el.tag in ("span", "font"):
        font_color = el.attrib.pop("color", None)

        if font_color:
            el.tag = "span"
            el.attrib["data-mx-color"] = font_color

    return el
|
|
|
|
|
|
|
|
|
2019-07-08 13:52:41 +10:00
|
|
|
@staticmethod
def _img_to_a(el: HtmlElement) -> HtmlElement:
    """Prepare `<img>` elements: fix emote sizes, linkify unsized images.

    Non-`<img>` elements are returned untouched. For images:

    - A `src` starting with `mxc://` is replaced by the result of
      `nio.Api.mxc_to_http()`.

    - Emotes (images with a `data-mx-emote` attribute) missing one
      dimension get it copied from the other one, and emotes missing both
      get a 32x32 size.

    - Other images lacking a width or height are turned into an `<a>`
      element: `src` becomes the `href`, and the link text is the `alt`
      attribute if not empty, else the `href` itself.

    The element is modified in place and returned.
    """

    if el.tag != "img":
        return el

    src = el.attrib.get("src", "")
    width = el.attrib.get("width")
    height = el.attrib.get("height")
    is_emote = "data-mx-emote" in el.attrib

    if src.startswith("mxc://"):
        el.attrib["src"] = nio.Api.mxc_to_http(src)

    if is_emote and not width and not height:
        # lxml only accepts str/bytes attribute values, assigning the
        # integer 32 raises a TypeError.
        el.attrib["width"] = "32"
        el.attrib["height"] = "32"

    elif is_emote and width and not height:
        el.attrib["height"] = width

    elif is_emote and height and not width:
        el.attrib["width"] = height

    elif not is_emote and (not width or not height):
        el.tag = "a"
        # Popped after the mxc:// conversion above, so this is the final URL
        el.attrib["href"] = el.attrib.pop("src", "")
        el.text = el.attrib.pop("alt", None) or el.attrib["href"]

    return el
|
|
|
|
|
|
|
|
|
2019-10-24 23:06:50 +11:00
|
|
|
def _remove_extra_newlines(self, el: HtmlElement) -> HtmlElement:
    r"""Strip superfluous `\n` characters from an element's text and tail.

    This prevents additional blank lines from showing up when the content
    is displayed with the CSS directive `white-space: pre`.

    For a `<pre>` element, or an element with a `<pre>` ancestor, only one
    trailing newline is dropped from the text and from the tail.
    Everywhere else, each match of `self.extra_newlines_regex` is replaced
    by its first captured group.
    """

    inside_pre = el.tag == "pre" or any(
        ancestor.tag == "pre" for ancestor in el.iterancestors()
    )

    for field in ("text", "tail"):
        content = getattr(el, field)

        if not content:
            continue

        if inside_pre:
            if content.endswith("\n"):
                setattr(el, field, content[:-1])
        else:
            setattr(
                el, field, self.extra_newlines_regex.sub(r"\1", content),
            )

    return el
|
|
|
|
|
|
|
|
|
|
|
|
def _newlines_to_return_symbol(self, el: HtmlElement) -> HtmlElement:
    """Replace line breaks by the unicode return symbol (⏎, U+23CE).

    The symbol is prepended to the tail of `<br>` tags, and of elements
    listed in `self.block_tags` that have a following sibling (e.g. a `<p>`
    followed by another `<p>`).
    Any newline still present in the element's text or tail is then
    replaced by the symbol as well.
    The `<br>` themselves will be removed by the inline sanitizer.
    """

    followed_block = False

    if el.tag in self.block_tags:
        followed_block = next(el.itersiblings(), None) is not None

    if followed_block or el.tag == "br":
        el.tail = " ⏎ " + (el.tail or "")

    # Newlines remaining at this point (such as those of <pre> content)
    # become return symbols too.
    for field in ("text", "tail"):
        content = getattr(el, field)

        if content:
            setattr(el, field, re.sub(r"\n", r" ⏎ ", content))

    return el
|
|
|
|
|
|
|
|
|
2020-09-17 10:58:02 +10:00
|
|
|
def _reply_to_inline(self, el: HtmlElement) -> HtmlElement:
    """Separate an `<mx-reply>` element from what follows it inline.

    Two return symbols (⏎⏎) are prepended to the tail of `<mx-reply>`
    elements; elements with any other tag are returned untouched.
    """

    if el.tag == "mx-reply":
        el.tail = f" ⏎⏎ {el.tail or ''}"

    return el
|
|
|
|
|
|
|
|
|
2020-03-24 06:02:31 +11:00
|
|
|
def _mentions_to_matrix_to_links(
    self,
    el: HtmlElement,
    display_name_mentions: Optional[Dict[str, str]] = None,
    outgoing: bool = False,
) -> HtmlElement:
    """Point links to user IDs, usernames and room IDs/aliases at matrix.to.

    Once the HTML sanitizer has autolinked these, the href of each link is
    the link text itself, e.g. `<a href="@foo:bar.com">@foo:bar.com</a>`.
    This function rewrites such hrefs into proper matrix.to URLs.

    Arguments:
        el: element to process. Anything other than an `<a>` with a
            non-empty `href` is returned unchanged.
        display_name_mentions: mapping of user ID to display name. A link
            whose unquoted href equals the display name (or the user ID
            when the name is empty) is pointed at that user ID.
        outgoing: accepted for interface compatibility; not read here.
    """

    if el.tag != "a":
        return el

    raw_href = el.attrib.get("href")

    if not raw_href:
        return el

    target = unquote(raw_href)

    id_patterns = (
        self.user_id_regex, self.room_id_regex, self.room_alias_regex,
    )

    if any(pattern.match(target) for pattern in id_patterns):
        # The href as originally written (not the unquoted form) is kept
        # in the resulting URL.
        el.attrib["href"] = f"https://matrix.to/#/{raw_href}"
        return el

    for user_id, name in (display_name_mentions or {}).items():
        if target == (name or user_id):
            el.attrib["href"] = f"https://matrix.to/#/{user_id}"
            break

    return el
|
|
|
|
|
|
|
|
|
|
|
|
def _matrix_to_links_add_classes(self, el: HtmlElement) -> HtmlElement:
    """Tag matrix.to mention links with special CSS classes.

    For an element having both a non-empty `href` and text, the unquoted
    href is tested against the `self.link_is_*_regex` patterns. On the
    first match, the element's `class` is set to the matching mention
    kind, and `data-mention` to the stripped link text. Elements matching
    no pattern are returned unchanged.
    """

    href = unquote(el.attrib.get("href", ""))

    if not (href and el.text):
        return el

    label = el.text.strip()

    # Message IDs must be tested first, or such links will be mistaken by
    # the room ID/alias regex.
    if self.link_is_message_id_regex.match(href):
        css_class = "mention message-id-mention"

    elif self.link_is_user_id_regex.match(href):
        # Same link target, but styled differently depending on whether
        # the visible text is a user ID or something else.
        if label.startswith("@"):
            css_class = "mention user-id-mention"
        else:
            css_class = "mention username-mention"

    elif self.link_is_room_id_regex.match(href):
        css_class = "mention room-id-mention"

    elif self.link_is_room_alias_regex.match(href):
        css_class = "mention room-alias-mention"

    else:
        return el

    el.attrib["class"] = css_class
    el.attrib["data-mention"] = label
    return el
|
|
|
|
|
|
|
|
|
2019-12-19 00:33:22 +11:00
|
|
|
# Module-level `HTMLProcessor` instance, created once when the module loads.
HTML_PROCESSOR = HTMLProcessor()
|