Improve HTMLProcessor performance (mentions)

- Try to autolink usernames only for outgoing messages
- Improve and add some missing docstrings

Fixes the very slow loading/Python "freezing" for rooms with a huge number of members.
2020-05-20 00:59:30 -04:00 · 2020-05-20 00:59:30 -04:00 · bc5549195b
commit bc5549195b
parent c9d5949847
2 changed files with 33 additions and 15 deletions
--- a/TODO.md
+++ b/TODO.md
@ -1,8 +1,7 @@
 # TODO
- shorten indicator numbers with hundreds 
+- Defer retrieving profiles for events from members no longer in the room
 - fix python getting stuck when loading large room
 - fix lag when clicking accounts in the AccountBar with a very long room list
 - fix: on startup, if a room's last event is a membership change,
--- a/src/backend/html_markdown.py
+++ b/src/backend/html_markdown.py
@ -84,8 +84,11 @@ class MarkdownRenderer(mistune.Renderer):
 class HTMLProcessor:
    """Provide HTML filtering and conversion from Markdown.
-    Filtering sanitizes HTML and ensures it complies with the supported Qt
+    Filtering sanitizes HTML and ensures it complies both with the Matrix
-    subset for usage in QML: https://doc.qt.io/qt-5/richtext-html-subset.html
+    specification:
    https://matrix.org/docs/spec/client_server/latest#m-room-message-msgtypes
    and the supported Qt HTML subset for usage in QML:
    https://doc.qt.io/qt-5/richtext-html-subset.html
    Some methods take an `outgoing` argument, specifying if the HTML is
    intended to be sent to matrix servers or used locally in our application.
@ -197,6 +200,8 @@ class HTMLProcessor:
    def mentions_in_html(self, html: str) -> List[Tuple[str, str]]:
        """Return list of (text, href) tuples for all mention links in html."""
        if not html.strip():
            return []
@ -209,6 +214,8 @@ class HTMLProcessor:
    def user_id_link_in_html(self, html: str, user_id: str) -> bool:
        """Return whether html contains a mention link for user_id."""
        regex = re.compile(rf"https?://matrix.to/#/{user_id}", re.IGNORECASE)
        for _, href in self.mentions_in_html(html):
@ -255,7 +262,7 @@ class HTMLProcessor:
        )
        for a_tag in tree.iterdescendants("a"):
-            self._mentions_to_matrix_to_links(a_tag, room_id)
+            self._mentions_to_matrix_to_links(a_tag, room_id, outgoing)
            if not outgoing:
                self._matrix_to_links_add_classes(a_tag)
@ -301,6 +308,9 @@ class HTMLProcessor:
            "span": {"data-mx-color"},
        }}
        username_link_regexes = []
        if outgoing:
            username_link_regexes = [re.compile(r) for r in [
                rf"(?<!\w)(?P<body>{re.escape(username)})(?!\w)(?P<host>)"
                for username in self.rooms_user_id_names[room_id].values()
@ -390,7 +400,7 @@ class HTMLProcessor:
    def _remove_extra_newlines(self, el: HtmlElement) -> HtmlElement:
-        """Remove excess `\\n` characters from HTML elements.
+        r"""Remove excess `\n` characters from HTML elements.
        This is done to avoid additional blank lines when the CSS directive
        `white-space: pre` is used.
@ -440,9 +450,14 @@ class HTMLProcessor:
    def _mentions_to_matrix_to_links(
-        self, el: HtmlElement, room_id: str = "",
+        self, el: HtmlElement, room_id: str = "", outgoing: bool = False,
    ) -> HtmlElement:
-        """Turn user ID/names and room ID/aliases into matrix.to URL."""
+        """Turn user ID/names and room ID/aliases into matrix.to URL.
        After the HTML sanitizer autolinks these, the links' hrefs will be the
        link text, e.g. `<a href="@foo:bar.com">@foo:bar.com</a>`.
        We turn them into proper matrix.to URLs in this function.
        """
        if el.tag != "a" or not el.attrib.get("href"):
            return el
@ -455,18 +470,22 @@ class HTMLProcessor:
        for regex in id_regexes:
            if regex.match(unquote(el.attrib["href"])):
                el.attrib["href"] = f"https://matrix.to/#/{el.attrib['href']}"
                return el
-        if room_id not in self.rooms_user_id_names:
+        if not outgoing or room_id not in self.rooms_user_id_names:
            return el
        for user_id, username in self.rooms_user_id_names[room_id].items():
            if unquote(el.attrib["href"]) == username:
                el.attrib["href"] = f"https://matrix.to/#/{user_id}"
                return el
        return el
    def _matrix_to_links_add_classes(self, el: HtmlElement) -> HtmlElement:
        "Add special CSS classes to matrix.to mention links."""
        href = unquote(el.attrib.get("href", ""))
        if not href or not el.text: