Improve HTMLProcessor performance (mentions)

- Try to autolink usernames only for outgoing messages
- Improve and add some missing docstrings

Fixes the very slow loading/Python "freezing" for rooms with a huge
number of members.
This commit is contained in:
miruka 2020-05-20 00:59:30 -04:00
parent c9d5949847
commit bc5549195b
2 changed files with 33 additions and 15 deletions

View File

@ -1,8 +1,7 @@
# TODO
- shorten indicator numbers with hundreds
- Defer retrieving profiles for events from members not anymore in the room
- fix python getting stuck when loading large room
- fix lag when clicking accounts in the AccountBar with a very long room list
- fix: on startup, if a room's last event is a membership change,

View File

@ -84,8 +84,11 @@ class MarkdownRenderer(mistune.Renderer):
class HTMLProcessor:
"""Provide HTML filtering and conversion from Markdown.
Filtering sanitizes HTML and ensures it complies with the supported Qt
subset for usage in QML: https://doc.qt.io/qt-5/richtext-html-subset.html
Filtering sanitizes HTML and ensures it complies both with the Matrix
specification:
https://matrix.org/docs/spec/client_server/latest#m-room-message-msgtypes
and the supported Qt HTML subset for usage in QML:
https://doc.qt.io/qt-5/richtext-html-subset.html
Some methods take an `outgoing` argument, specifying if the HTML is
intended to be sent to matrix servers or used locally in our application.
@ -197,6 +200,8 @@ class HTMLProcessor:
def mentions_in_html(self, html: str) -> List[Tuple[str, str]]:
"""Return list of (text, href) tuples for all mention links in html."""
if not html.strip():
return []
@ -209,6 +214,8 @@ class HTMLProcessor:
def user_id_link_in_html(self, html: str, user_id: str) -> bool:
"""Return whether html contains a mention link for user_id."""
regex = re.compile(rf"https?://matrix.to/#/{user_id}", re.IGNORECASE)
for _, href in self.mentions_in_html(html):
@ -255,7 +262,7 @@ class HTMLProcessor:
)
for a_tag in tree.iterdescendants("a"):
self._mentions_to_matrix_to_links(a_tag, room_id)
self._mentions_to_matrix_to_links(a_tag, room_id, outgoing)
if not outgoing:
self._matrix_to_links_add_classes(a_tag)
@ -301,6 +308,9 @@ class HTMLProcessor:
"span": {"data-mx-color"},
}}
username_link_regexes = []
if outgoing:
username_link_regexes = [re.compile(r) for r in [
rf"(?<!\w)(?P<body>{re.escape(username)})(?!\w)(?P<host>)"
for username in self.rooms_user_id_names[room_id].values()
@ -390,7 +400,7 @@ class HTMLProcessor:
def _remove_extra_newlines(self, el: HtmlElement) -> HtmlElement:
"""Remove excess `\\n` characters from HTML elements.
r"""Remove excess `\n` characters from HTML elements.
This is done to avoid additional blank lines when the CSS directive
`white-space: pre` is used.
@ -440,9 +450,14 @@ class HTMLProcessor:
def _mentions_to_matrix_to_links(
self, el: HtmlElement, room_id: str = "",
self, el: HtmlElement, room_id: str = "", outgoing: bool = False,
) -> HtmlElement:
"""Turn user ID/names and room ID/aliases into matrix.to URL."""
"""Turn user ID/names and room ID/aliases into matrix.to URL.
After the HTML sanitizer autolinks these, the links' hrefs will be the
link text, e.g. `<a href="@foo:bar.com">@foo:bar.com</a>`.
We turn them into proper matrix.to URLs in this function.
"""
if el.tag != "a" or not el.attrib.get("href"):
return el
@ -455,18 +470,22 @@ class HTMLProcessor:
for regex in id_regexes:
if regex.match(unquote(el.attrib["href"])):
el.attrib["href"] = f"https://matrix.to/#/{el.attrib['href']}"
return el
if room_id not in self.rooms_user_id_names:
if not outgoing or room_id not in self.rooms_user_id_names:
return el
for user_id, username in self.rooms_user_id_names[room_id].items():
if unquote(el.attrib["href"]) == username:
el.attrib["href"] = f"https://matrix.to/#/{user_id}"
return el
return el
def _matrix_to_links_add_classes(self, el: HtmlElement) -> HtmlElement:
"""Add special CSS classes to matrix.to mention links."""
href = unquote(el.attrib.get("href", ""))
if not href or not el.text: