From 8d5bc45cebb87c4de51ded616b00a771b86b28b3 Mon Sep 17 00:00:00 2001 From: miruka Date: Sun, 22 Mar 2020 20:58:05 -0400 Subject: [PATCH] Safer linkifying for user display names --- src/backend/html_markdown.py | 113 ++++++++++++++++++++--------------- src/backend/matrix_client.py | 27 +++++---- src/backend/nio_callbacks.py | 8 ++- 3 files changed, 87 insertions(+), 61 deletions(-) diff --git a/src/backend/html_markdown.py b/src/backend/html_markdown.py index 8682fef0..30afda0a 100644 --- a/src/backend/html_markdown.py +++ b/src/backend/html_markdown.py @@ -3,8 +3,7 @@ """HTML and Markdown processing tools.""" import re -from typing import Dict, Optional -from urllib.parse import quote +from typing import DefaultDict, Dict import html_sanitizer.sanitizer as sanitizer import mistune @@ -111,7 +110,18 @@ class HTMLProcessor: "table", "thead", "tbody", "tr", "th", "td", "pre", } - link_regexes = [re.compile(r, re.IGNORECASE) for r in [ + user_id_regex = re.compile( + r"(?=^|\W)(?P@.+?:(?P[a-zA-Z\d.-:]*[a-zA-Z\d]))", + ) + room_id_regex = re.compile( + r"(?=^|\W)(?P!.+?:(?P[a-zA-Z\d.-:]*[a-zA-Z\d]))", + ) + room_alias_regex = re.compile( + r"(?=^|\W)(?P#.+?:(?P[a-zA-Z\d.-:]*[a-zA-Z\d]))", + ) + + link_regexes = [re.compile(r, re.IGNORECASE) + if isinstance(r, str) else r for r in [ # Normal :// URLs (r"(?P[a-zA-Z\d]+://(?P[a-z\d._-]+(?:\:\d+)?)" r"(?:/[/\-_.,a-z\d#%&?;=~]*)?(?:\([/\-_.,a-z\d#%&?;=~]*\))?)"), @@ -123,10 +133,7 @@ class HTMLProcessor: # magnet: r"(?Pmagnet:\?xt=urn:[a-z0-9]+:.+)(?P)", - # User ID, room ID, room alias - r"(?=^|\W)(?P@.+?:(?P[a-zA-Z\d.-:]*[a-zA-Z\d]))", - r"(?=^|\W)(?P#.+?:(?P[a-zA-Z\d.-:]*[a-zA-Z\d]))", - r"(?=^|\W)(?P!.+?:(?P[a-zA-Z\d.-:]*[a-zA-Z\d]))", + user_id_regex, room_id_regex, room_alias_regex, ]] inline_quote_regex = re.compile(r"(^|⏎)(\s*>[^⏎\n]*)", re.MULTILINE) @@ -140,19 +147,11 @@ class HTMLProcessor: extra_newlines_regex = re.compile(r"\n(\n*)") + # {room_id: {user_id: username}} + rooms_user_id_names: DefaultDict[str, 
Dict[str, str]] = DefaultDict(dict) + def __init__(self) -> None: - self._sanitizers = { - (False, False): Sanitizer(self.sanitize_settings(False, False)), - (True, False): Sanitizer(self.sanitize_settings(True, False)), - (False, True): Sanitizer(self.sanitize_settings(False, True)), - (True, True): Sanitizer(self.sanitize_settings(True, True)), - } - - self._inline_sanitizer = Sanitizer(self.sanitize_settings(inline=True)) - self._inline_outgoing_sanitizer = \ - Sanitizer(self.sanitize_settings(inline=True)) - # The whitespace remover doesn't take
 into account
         sanitizer.normalize_overall_whitespace = lambda html, *args, **kw: html
         sanitizer.normalize_whitespace_in_text_or_tail = \
@@ -175,39 +174,32 @@ class HTMLProcessor:
 
     def from_markdown(
         self,
-        text:              str,
-        inline:            bool                     = False,
-        outgoing:          bool                     = False,
-        mentionable_users: Optional[Dict[str, str]] = None,  # {id: name}
+        text:     str,
+        inline:   bool = False,
+        outgoing: bool = False,
+        room_id:  str  = "",
     ) -> str:
         """Return filtered HTML from Markdown text."""
 
-        text = self.markdown_linkify_users_rooms(text, mentionable_users)
-        return self.filter(self._markdown_to_html(text), inline, outgoing)
-
-
-    def markdown_linkify_users_rooms(
-        self, text: str, usernames: Optional[Dict[str, str]] = None,
-    ) -> str:
-        """Turn usernames, user ID, room alias, room ID into matrix.to URL."""
-
-        for user_id, username in (usernames or {}).items():
-            text = re.sub(
-                rf"(? str:
         """Filter and return HTML."""
 
-        html = self._sanitizers[inline, outgoing].sanitize(html).rstrip("\n")
+        settings = self.sanitize_settings(inline, outgoing, room_id)
+        html     = Sanitizer(settings).sanitize(html).rstrip("\n")
 
         if outgoing:
             return html
@@ -226,7 +218,7 @@ class HTMLProcessor:
 
 
     def sanitize_settings(
-        self, inline: bool = False, outgoing: bool = False,
+        self, inline: bool = False, outgoing: bool = False, room_id: str = "",
     ) -> dict:
         """Return an html_sanitizer configuration."""
 
@@ -247,6 +239,11 @@ class HTMLProcessor:
             "span": {"data-mx-color"},
         }}
 
+        username_link_regexes = [re.compile(r, re.IGNORECASE) for r in [
+            rf"(?<!\w)(?P<body>{re.escape(username)})(?!\w)(?P<host>)"
+            for username in self.rooms_user_id_names[room_id].values()
+        ]]
+
         return {
             "tags": inline_tags if inline else all_tags,
             "attributes": inlines_attributes if inline else attributes,
@@ -258,7 +255,8 @@ class HTMLProcessor:
             "keep_typographic_whitespace": True,
             "add_nofollow": False,
             "autolink": {
-                "link_regexes": self.link_regexes,
+                "link_regexes":
+                    self.link_regexes + username_link_regexes,  # type: ignore
                 "avoid_hosts": [],
             },
             "sanitize_href": lambda href: href,
@@ -280,7 +278,7 @@ class HTMLProcessor:
                 self._remove_extra_newlines,
                 self._newlines_to_return_symbol if inline else lambda el: el,
 
-                self._matrix_toify_user_room_links,
+                lambda el: self._matrix_toify(el, room_id),
             ],
             "element_postprocessors": [
                 self._font_color_to_span if outgoing else lambda el: el,
@@ -374,12 +372,31 @@ class HTMLProcessor:
         return el
 
 
-    @staticmethod
-    def _matrix_toify_user_room_links(el: HtmlElement) -> HtmlElement:
+    def _matrix_toify(self, el: HtmlElement, room_id: str = "") -> HtmlElement:
+        """Turn userID, usernames, roomID, room aliases into matrix.to URL."""
+
         if el.tag != "a" or not el.attrib.get("href"):
+            # Only <a> elements that carry an href can be rewritten
             return el
 
-        el.attrib["href"] = "https://matrix.to/#/%s" % el.attrib["href"]
+        id_regexes = (
+            self.user_id_regex, self.room_id_regex, self.room_alias_regex,
+        )
+
+        for regex in id_regexes:
+            if regex.match(el.attrib["href"]):
+                el.attrib["href"] = f"https://matrix.to/#/{el.attrib['href']}"
+
+        if room_id not in self.rooms_user_id_names:
+            # No display names are recorded for this room
+            return el
+
+        for user_id, username in self.rooms_user_id_names[room_id].items():
+            # A link whose href is a display name targets that user's ID
+            if el.attrib["href"] == username:
+                el.attrib["href"] = f"https://matrix.to/#/{user_id}"
+
+        # hrefs that matched nothing above are left as they were
         return el
 
 
diff --git a/src/backend/matrix_client.py b/src/backend/matrix_client.py
index a1836ac5..09dcd024 100644
--- a/src/backend/matrix_client.py
+++ b/src/backend/matrix_client.py
@@ -308,14 +308,7 @@ class MatrixClient(nio.AsyncClient):
     async def send_text(self, room_id: str, text: str) -> None:
         """Send a markdown `m.text` or `m.notice` (with `/me`) message ."""
 
-        from_md = partial(
-            HTML.from_markdown,
-            mentionable_users={
-                user_id: member.display_name or user_id
-                for user_id, member in
-                self.models[self.user_id, room_id, "members"].items()
-            },
-        )
+        from_md = partial(HTML.from_markdown, room_id=room_id)
 
         escape = False
         if text.startswith("//") or text.startswith(r"\/"):
@@ -626,7 +619,9 @@ class MatrixClient(nio.AsyncClient):
         content = event_fields.get("content", "").strip()
 
         if content and "inline_content" not in event_fields:
-            event_fields["inline_content"] = HTML.filter(content, inline=True)
+            event_fields["inline_content"] = HTML.filter(
+                content, inline=True, room_id=room_id,
+            )
 
         event = Event(
             id            = f"echo-{transaction_id}",
@@ -1088,7 +1083,9 @@ class MatrixClient(nio.AsyncClient):
             display_name   = room.display_name or "",
             avatar_url     = room.gen_avatar_url or "",
             plain_topic    = room.topic or "",
-            topic          = HTML.filter(room.topic or "", inline=True),
+            topic          = HTML.filter(
+                room.topic or "", inline=True, room_id=room.room_id,
+            ),
             inviter_id     = inviter,
             inviter_name   = room.user_name(inviter) if inviter else "",
             inviter_avatar =
@@ -1123,6 +1120,7 @@ class MatrixClient(nio.AsyncClient):
 
         for user_id in left_the_room:
             del self.models[self.user_id, room.room_id, "members"][user_id]
+            HTML.rooms_user_id_names[room.room_id].pop(user_id, None)
 
         # Add the room members to the added room
         new_dict = {
@@ -1138,6 +1136,11 @@ class MatrixClient(nio.AsyncClient):
         }
         self.models[self.user_id, room.room_id, "members"].update(new_dict)
 
+        for user_id, member in room.users.items():
+            if member.display_name:
+                HTML.rooms_user_id_names[room.room_id][user_id] = \
+                    member.display_name
+
 
     async def get_member_name_avatar(
         self, room_id: str, user_id: str,
@@ -1182,7 +1185,9 @@ class MatrixClient(nio.AsyncClient):
         content = fields.get("content", "").strip()
 
         if content and "inline_content" not in fields:
-            fields["inline_content"] = HTML.filter(content, inline=True)
+            fields["inline_content"] = HTML.filter(
+                content, inline=True, room_id=room.room_id,
+            )
 
         # Create Event ModelItem
 
diff --git a/src/backend/nio_callbacks.py b/src/backend/nio_callbacks.py
index b49312ea..ade38076 100644
--- a/src/backend/nio_callbacks.py
+++ b/src/backend/nio_callbacks.py
@@ -97,6 +97,8 @@ class NioCallbacks:
             ev.formatted_body
             if ev.format == "org.matrix.custom.html" else
             utils.plain2html(ev.body),
+
+            room_id = room.room_id,
         )
         await self.client.register_nio_event(room, ev, content=co)
 
@@ -337,8 +339,10 @@ class NioCallbacks:
 
     async def onRoomTopicEvent(self, room, ev) -> None:
         if ev.topic:
-            topic = HTML_PROCESSOR.filter(ev.topic, inline=True)
-            co    = f"%1 changed the room's topic to \"{topic}\""
+            topic = HTML_PROCESSOR.filter(
+                ev.topic, inline=True, room_id=room.room_id,
+            )
+            co = f"%1 changed the room's topic to \"{topic}\""
         else:
             co = "%1 removed the room's topic"