Improve HTMLProcessor performance (mentions)

- Try to autolink usernames only for outgoing messages
- Improve and add some missing docstrings

Fixes the very slow loading/python "freezing" for rooms with a huge
number of members.
This commit is contained in:
miruka 2020-05-20 00:59:30 -04:00
parent c9d5949847
commit bc5549195b
2 changed files with 33 additions and 15 deletions

View File

@ -1,8 +1,7 @@
# TODO # TODO
- shorten indicator numbers with hundreds - Defer retrieving profiles for events from members not anymore in the room
- fix python getting stuck when loading large room
- fix lag when clicking accounts in the AccountBar with a very long room list - fix lag when clicking accounts in the AccountBar with a very long room list
- fix: on startup, if a room's last event is a membership change, - fix: on startup, if a room's last event is a membership change,

View File

@ -84,8 +84,11 @@ class MarkdownRenderer(mistune.Renderer):
class HTMLProcessor: class HTMLProcessor:
"""Provide HTML filtering and conversion from Markdown. """Provide HTML filtering and conversion from Markdown.
Filtering sanitizes HTML and ensures it complies with the supported Qt Filtering sanitizes HTML and ensures it complies both with the Matrix
subset for usage in QML: https://doc.qt.io/qt-5/richtext-html-subset.html specification:
https://matrix.org/docs/spec/client_server/latest#m-room-message-msgtypes
and the supported Qt HTML subset for usage in QML:
https://doc.qt.io/qt-5/richtext-html-subset.html
Some methods take an `outgoing` argument, specifying if the HTML is Some methods take an `outgoing` argument, specifying if the HTML is
intended to be sent to matrix servers or used locally in our application. intended to be sent to matrix servers or used locally in our application.
@ -197,6 +200,8 @@ class HTMLProcessor:
def mentions_in_html(self, html: str) -> List[Tuple[str, str]]: def mentions_in_html(self, html: str) -> List[Tuple[str, str]]:
"""Return list of (text, href) tuples for all mention links in html."""
if not html.strip(): if not html.strip():
return [] return []
@ -209,6 +214,8 @@ class HTMLProcessor:
def user_id_link_in_html(self, html: str, user_id: str) -> bool: def user_id_link_in_html(self, html: str, user_id: str) -> bool:
"""Return whether html contains a mention link for user_id."""
regex = re.compile(rf"https?://matrix.to/#/{user_id}", re.IGNORECASE) regex = re.compile(rf"https?://matrix.to/#/{user_id}", re.IGNORECASE)
for _, href in self.mentions_in_html(html): for _, href in self.mentions_in_html(html):
@ -255,7 +262,7 @@ class HTMLProcessor:
) )
for a_tag in tree.iterdescendants("a"): for a_tag in tree.iterdescendants("a"):
self._mentions_to_matrix_to_links(a_tag, room_id) self._mentions_to_matrix_to_links(a_tag, room_id, outgoing)
if not outgoing: if not outgoing:
self._matrix_to_links_add_classes(a_tag) self._matrix_to_links_add_classes(a_tag)
@ -301,10 +308,13 @@ class HTMLProcessor:
"span": {"data-mx-color"}, "span": {"data-mx-color"},
}} }}
username_link_regexes = [re.compile(r) for r in [ username_link_regexes = []
rf"(?<!\w)(?P<body>{re.escape(username)})(?!\w)(?P<host>)"
for username in self.rooms_user_id_names[room_id].values() if outgoing:
]] username_link_regexes = [re.compile(r) for r in [
rf"(?<!\w)(?P<body>{re.escape(username)})(?!\w)(?P<host>)"
for username in self.rooms_user_id_names[room_id].values()
]]
return { return {
"tags": inline_tags if inline else all_tags, "tags": inline_tags if inline else all_tags,
@ -390,7 +400,7 @@ class HTMLProcessor:
def _remove_extra_newlines(self, el: HtmlElement) -> HtmlElement: def _remove_extra_newlines(self, el: HtmlElement) -> HtmlElement:
"""Remove excess `\\n` characters from HTML elements. r"""Remove excess `\n` characters from HTML elements.
This is done to avoid additional blank lines when the CSS directive This is done to avoid additional blank lines when the CSS directive
`white-space: pre` is used. `white-space: pre` is used.
@ -440,9 +450,14 @@ class HTMLProcessor:
def _mentions_to_matrix_to_links( def _mentions_to_matrix_to_links(
self, el: HtmlElement, room_id: str = "", self, el: HtmlElement, room_id: str = "", outgoing: bool = False,
) -> HtmlElement: ) -> HtmlElement:
"""Turn user ID/names and room ID/aliases into matrix.to URL.""" """Turn user ID/names and room ID/aliases into matrix.to URL.
After the HTML sanitizer autolinks these, the links' hrefs will be the
link text, e.g. `<a href="@foo:bar.com">@foo:bar.com</a>`.
We turn them into proper matrix.to URLs in this function.
"""
if el.tag != "a" or not el.attrib.get("href"): if el.tag != "a" or not el.attrib.get("href"):
return el return el
@ -454,19 +469,23 @@ class HTMLProcessor:
for regex in id_regexes: for regex in id_regexes:
if regex.match(unquote(el.attrib["href"])): if regex.match(unquote(el.attrib["href"])):
el.attrib["href"] = f"https://matrix.to/#/{el.attrib['href']}" el.attrib["href"] = f"https://matrix.to/#/{el.attrib['href']}"
return el
if room_id not in self.rooms_user_id_names: if not outgoing or room_id not in self.rooms_user_id_names:
return el return el
for user_id, username in self.rooms_user_id_names[room_id].items(): for user_id, username in self.rooms_user_id_names[room_id].items():
if unquote(el.attrib["href"]) == username: if unquote(el.attrib["href"]) == username:
el.attrib["href"] = f"https://matrix.to/#/{user_id}" el.attrib["href"] = f"https://matrix.to/#/{user_id}"
return el
return el return el
def _matrix_to_links_add_classes(self, el: HtmlElement) -> HtmlElement: def _matrix_to_links_add_classes(self, el: HtmlElement) -> HtmlElement:
"""Add special CSS classes to matrix.to mention links."""
href = unquote(el.attrib.get("href", "")) href = unquote(el.attrib.get("href", ""))
if not href or not el.text: if not href or not el.text: