From a7bf1fca44265a3c53a03fa89477734850eee7a0 Mon Sep 17 00:00:00 2001
From: miruka
Date: Wed, 17 Apr 2019 10:49:54 -0400
Subject: [PATCH] Sanitize HTML displayed as message content

---
 TODO.md                                       |  4 +
 harmonyqml/backend/backend.py                 | 12 +--
 harmonyqml/backend/html_filter.py             | 82 +++++++++++++++++++
 harmonyqml/components/chat/MessageContent.qml |  4 +-
 4 files changed, 96 insertions(+), 6 deletions(-)
 create mode 100644 harmonyqml/backend/html_filter.py

diff --git a/TODO.md b/TODO.md
index 3bfc5c00..d7c4c05d 100644
--- a/TODO.md
+++ b/TODO.md
@@ -22,3 +22,7 @@
 - Migrate more JS functions to their own files
 
 - Accept room\_id arg for getUser
+
+- Set Qt parents for all QObject
+
+- `<pre>` scrollbar on overflow
diff --git a/harmonyqml/backend/backend.py b/harmonyqml/backend/backend.py
index 407bdf94..490b434f 100644
--- a/harmonyqml/backend/backend.py
+++ b/harmonyqml/backend/backend.py
@@ -9,6 +9,7 @@ from PyQt5.QtCore import QObject, pyqtProperty, pyqtSlot
 from .client_manager import ClientManager
 from .model.items import User
 from .model.qml_models import QMLModels
+from .html_filter import HtmlFilter
 
 
 class Backend(QObject):
@@ -16,14 +17,12 @@ class Backend(QObject):
         super().__init__()
         self._client_manager: ClientManager = ClientManager()
         self._models:         QMLModels     = QMLModels()
+        self._html_filter:    HtmlFilter    = HtmlFilter()
+        # a = self._client_manager; m = self._models
 
         from .signal_manager import SignalManager
         self._signal_manager: SignalManager = SignalManager(self)
 
-        # a = self._client_manager; m = self._models
-        # from PyQt5.QtCore import pyqtRemoveInputHook as PRI
-        # import pdb; PRI(); pdb.set_trace()
-
         self.clientManager.configLoad()
 
 
@@ -31,11 +30,14 @@ class Backend(QObject):
     def clientManager(self):
         return self._client_manager
 
-
     @pyqtProperty("QVariant", constant=True)
     def models(self):
         return self._models
 
+    @pyqtProperty("QVariant", constant=True)
+    def htmlFilter(self):
+        return self._html_filter
+
 
     @pyqtSlot(str, result="QVariantMap")
     def getUser(self, user_id: str) -> Dict[str, str]:
diff --git a/harmonyqml/backend/html_filter.py b/harmonyqml/backend/html_filter.py
new file mode 100644
index 00000000..11102b1a
--- /dev/null
+++ b/harmonyqml/backend/html_filter.py
@@ -0,0 +1,82 @@
+# Copyright 2019 miruka
+# This file is part of harmonyqml, licensed under GPLv3.
+
+import html_sanitizer.sanitizer as sanitizer
+from lxml.html import HtmlElement
+from PyQt5.QtCore import QObject, pyqtProperty, pyqtSlot
+
+
+class HtmlFilter(QObject):
+    """Filter HTML through html_sanitizer using Matrix-oriented settings."""
+
+    _FONT_ATTRS = frozenset({"data-mx-bg-color", "data-mx-color"})
+
+    def __init__(self) -> None:
+        super().__init__()
+        self._sanitizer = sanitizer.Sanitizer(self.sanitizer_settings)
+
+        # The whitespace remover ignores <pre>; stub it, tolerating extra args
+        sanitizer.normalize_overall_whitespace = lambda html, *a, **k: html
+        sanitizer.normalize_whitespace_in_text_or_tail = lambda el, *a, **k: el
+
+        # Prevent custom attributes from being removed
+        sanitizer.lxml.html.clean.Cleaner.safe_attrs |= self._FONT_ATTRS
+
+    @pyqtSlot(str, result=str)
+    def sanitize(self, html: str) -> str:
+        """Return html as cleaned by the configured sanitizer."""
+        return self._sanitizer.sanitize(html)
+
+    @pyqtProperty("QVariant", constant=True)
+    def sanitizer_settings(self) -> dict:
+        """Return a newly built html_sanitizer settings dict."""
+        # https://matrix.org/docs/spec/client_server/latest.html#m-room-message-msgtypes
+        return {
+            "tags": {
+                # TODO: mx-reply, audio, video
+                "font", "h1", "h2", "h3", "h4", "h5", "h6",
+                "blockquote", "p", "a", "ul", "ol", "sup", "sub", "li",
+                "b", "i", "s", "u", "code", "hr", "br",
+                "table", "thead", "tbody", "tr", "th", "td",
+                "pre", "img",
+            },
+            "attributes": {
+                # TODO: translate font attrs to qt html subset
+                "font": set(self._FONT_ATTRS),
+                "a":    {"href"},
+                "img":  {"width", "height", "alt", "title", "src"},
+                "ol":   {"start"},
+                "code": {"class"},
+            },
+            "empty": {"hr", "br", "img"},
+            "separate": {
+                "a", "p", "li", "table", "tr", "th", "td", "br", "hr"
+            },
+            "whitespace": {},
+            "add_nofollow": False,
+            "autolink": True,
+            "sanitize_href": sanitizer.sanitize_href,
+            "element_preprocessors": [
+                sanitizer.bold_span_to_strong,
+                sanitizer.italic_span_to_em,
+                sanitizer.tag_replacer("strong", "b"),
+                sanitizer.tag_replacer("em", "i"),
+                sanitizer.tag_replacer("strike", "s"),
+                sanitizer.tag_replacer("del", "s"),
+                sanitizer.tag_replacer("span", "font"),
+                self._remove_empty_font,
+                sanitizer.tag_replacer("form", "p"),
+                sanitizer.tag_replacer("div", "p"),
+                sanitizer.tag_replacer("caption", "p"),
+                sanitizer.target_blank_noopener,
+            ],
+            "element_postprocessors": [],
+            "is_mergeable": lambda e1, e2: e1.attrib == e2.attrib,
+        }
+
+    def _remove_empty_font(self, el: HtmlElement) -> HtmlElement:
+        """Clear <font> elements that carry no custom font attribute."""
+        # NOTE(review): clear() also drops text, children and tail; intended?
+        if el.tag == "font" and self._FONT_ATTRS.isdisjoint(el.keys()):
+            el.clear()
+        return el
diff --git a/harmonyqml/components/chat/MessageContent.qml b/harmonyqml/components/chat/MessageContent.qml
index 5d7f5680..f13c18ef 100644
--- a/harmonyqml/components/chat/MessageContent.qml
+++ b/harmonyqml/components/chat/MessageContent.qml
@@ -38,7 +38,9 @@ Row {
                   //"" +
             //      (isOwn ? "  " + content : "")
 
-            text: (dict.formatted_body || dict.body) +
+            text: (dict.formatted_body ?
+                   Backend.htmlFilter.sanitize(dict.formatted_body) :
+                   dict.body) +
                   "  " +
                   Qt.formatDateTime(date_time, "hh:mm:ss") +
                   ""