Safer linkifying for user ID, room ID, room alias

Don't replace anything in already existing links or <code>.
This commit is contained in:
miruka 2020-03-22 19:39:58 -04:00
parent 2fbfac5e0d
commit 8fd7ce4e14

View File

@ -112,11 +112,21 @@ class HTMLProcessor:
} }
# Patterns recognised as links, compiled case-insensitively. Every pattern
# defines the named groups `body` and `host` (`host` is deliberately empty
# for schemes that have none, such as tel: and magnet:).
link_regexes = [re.compile(r, re.IGNORECASE) for r in [
    # Normal :// URLs
    (r"(?P<body>[a-zA-Z\d]+://(?P<host>[a-z\d._-]+(?:\:\d+)?)"
     r"(?:/[/\-_.,a-z\d#%&?;=~]*)?(?:\([/\-_.,a-z\d#%&?;=~]*\))?)"),

    # mailto: and tel:
    r"mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z](?:\:\d+)?))",
    r"tel:(?P<body>[0-9+-]+)(?P<host>)",

    # magnet:
    r"(?P<body>magnet:\?xt=urn:[a-z0-9]+:.+)(?P<host>)",

    # User ID, room ID, room alias
    #
    # `(?<!\w)` requires that the sigil is not glued to a preceding word
    # character, so "bob@mail:example.org" is not taken for a user ID.
    # (A lookahead such as `(?=^|\W)` placed before the sigil is a no-op,
    # since the sigil itself is a non-word character.)
    #
    # In the host character class the hyphen is placed last so that it is
    # a literal "-": written as `.-:` it would be a range from "." to ":",
    # which admits "/" and rejects "-" (truncating "my-server.org").
    r"(?<!\w)(?P<body>@.+?:(?P<host>[a-zA-Z\d.:-]*[a-zA-Z\d]))",
    r"(?<!\w)(?P<body>#.+?:(?P<host>[a-zA-Z\d.:-]*[a-zA-Z\d]))",
    r"(?<!\w)(?P<body>!.+?:(?P<host>[a-zA-Z\d.:-]*[a-zA-Z\d]))",
]]
inline_quote_regex = re.compile(r"(^|⏎)(\s*&gt;[^⏎\n]*)", re.MULTILINE) inline_quote_regex = re.compile(r"(^|⏎)(\s*&gt;[^⏎\n]*)", re.MULTILINE)
@ -130,16 +140,6 @@ class HTMLProcessor:
extra_newlines_regex = re.compile(r"\n(\n*)") extra_newlines_regex = re.compile(r"\n(\n*)")
user_id_mention_regex = re.compile(
r"(?=^|\W)@.+?:[a-zA-Z\d.-:]*[a-zA-Z\d]",
)
room_id_mention_regex = re.compile(
r"(?=^|\W)!.+?:[a-zA-Z\d.-:]*[a-zA-Z\d]",
)
room_alias_mention_regex = re.compile(
r"(?=^|\W)#.+?:[a-zA-Z\d.-:]*[a-zA-Z\d]",
)
def __init__(self) -> None: def __init__(self) -> None:
self._sanitizers = { self._sanitizers = {
@ -191,13 +191,6 @@ class HTMLProcessor:
) -> str: ) -> str:
"""Turn usernames, user ID, room alias, room ID into matrix.to URL.""" """Turn usernames, user ID, room alias, room ID into matrix.to URL."""
def repl_func(m) -> str:
return rf"[{m.group(0)}](https://matrix.to/#/{quote(m.group(0))})"
text = self.user_id_mention_regex.sub(repl_func, text)
text = self.room_id_mention_regex.sub(repl_func, text)
text = self.room_alias_mention_regex.sub(repl_func, text)
for user_id, username in (usernames or {}).items(): for user_id, username in (usernames or {}).items():
text = re.sub( text = re.sub(
rf"(?<!\w)({re.escape(username)})(?!\w)", rf"(?<!\w)({re.escape(username)})(?!\w)",
@ -286,6 +279,8 @@ class HTMLProcessor:
self._img_to_a, self._img_to_a,
self._remove_extra_newlines, self._remove_extra_newlines,
self._newlines_to_return_symbol if inline else lambda el: el, self._newlines_to_return_symbol if inline else lambda el: el,
self._matrix_toify_user_room_links,
], ],
"element_postprocessors": [ "element_postprocessors": [
self._font_color_to_span if outgoing else lambda el: el, self._font_color_to_span if outgoing else lambda el: el,
@ -379,4 +374,13 @@ class HTMLProcessor:
return el return el
@staticmethod
def _matrix_toify_user_room_links(el: HtmlElement) -> HtmlElement:
    """Point links to Matrix identifiers at their matrix.to URL.

    Only `<a>` elements whose `href` looks like a bare Matrix identifier
    (starts with the `@`, `#` or `!` sigil and contains a `:` separating
    the server name) are rewritten to `https://matrix.to/#/<identifier>`.

    Every other element is returned untouched, including anchors with
    ordinary `https://`, `mailto:`, `tel:` or `magnet:` targets and
    anchors that already point at matrix.to. Prefixing those would
    produce broken URLs such as
    `https://matrix.to/#/https://example.org`.

    The element is modified in place and also returned.
    """

    if el.tag != "a":
        return el

    href = el.attrib.get("href")

    if not href:
        return el

    # The ":" requirement keeps plain in-page anchors like "#section"
    # from being mistaken for room aliases.
    if href.startswith(("@", "#", "!")) and ":" in href:
        el.attrib["href"] = f"https://matrix.to/#/{href}"

    return el
HTML_PROCESSOR = HTMLProcessor() HTML_PROCESSOR = HTMLProcessor()