diff --git a/src/backend/html_markdown.py b/src/backend/html_markdown.py index bcb6cbaf..f06c7c51 100644 --- a/src/backend/html_markdown.py +++ b/src/backend/html_markdown.py @@ -112,17 +112,20 @@ class HTMLProcessor: "table", "thead", "tbody", "tr", "th", "td", "pre", } + opaque_id = r"[a-zA-Z\d._-]+?" + user_id_localpart = r"[\x21-\x39\x3D-\x7E]+?" + user_id_regex = re.compile( - r"(?=^|\W)(?P@.+?:(?P[a-zA-Z\d.-:]*[a-zA-Z\d]))", + rf"(?P@{user_id_localpart}:(?P[a-zA-Z\d.-:]*[a-zA-Z\d]))", ) room_id_regex = re.compile( - r"(?=^|\W)(?P!.+?:(?P[a-zA-Z\d.-:]*[a-zA-Z\d]))", + rf"(?P!{opaque_id}:(?P[a-zA-Z\d.-:]*[a-zA-Z\d]))", ) room_alias_regex = re.compile( - r"(?=^|\W)(?P#.+?:(?P[a-zA-Z\d.-:]*[a-zA-Z\d]))", + r"(?=^|\W)(?P#\S+?:(?P[a-zA-Z\d.-:]*[a-zA-Z\d]))", ) message_id_regex = re.compile( - r"(?=^|\W)(?P\$.+?:(?P[a-zA-Z\d.-:]*[a-zA-Z\d]))", + rf"(?P\${opaque_id}:(?P[a-zA-Z\d.-:]*[a-zA-Z\d]))", ) link_regexes = [re.compile(r, re.IGNORECASE) @@ -142,19 +145,19 @@ class HTMLProcessor: ]] link_is_matrix_to_regex = re.compile( - r"https?://matrix.to/#/.+", re.IGNORECASE, + r"https?://matrix.to/#/\S+", re.IGNORECASE, ) link_is_user_id_regex = re.compile( - r"https?://matrix.to/#/@.+", re.IGNORECASE, + r"https?://matrix.to/#/@\S+", re.IGNORECASE, ) link_is_room_id_regex = re.compile( - r"https?://matrix.to/#/!.+", re.IGNORECASE, + r"https?://matrix.to/#/!\S+", re.IGNORECASE, ) link_is_room_alias_regex = re.compile( - r"https?://matrix.to/#/#.+", re.IGNORECASE, + r"https?://matrix.to/#/#\S+", re.IGNORECASE, ) link_is_message_id_regex = re.compile( - r"https?://matrix.to/#/\$.+", re.IGNORECASE, + r"https?://matrix.to/#/\$\S+", re.IGNORECASE, ) inline_quote_regex = re.compile(r"(^|⏎)(\s*>[^⏎\n]*)", re.MULTILINE)