Rename and document html filtering stuff
This commit is contained in:
		
							
								
								
									
										4
									
								
								TODO.md
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								TODO.md
									
									
									
									
									
								
							| @@ -90,6 +90,7 @@ | |||||||
| - Way to open context menus without a right mouse button | - Way to open context menus without a right mouse button | ||||||
| - `smartVerticalFlick()` gradual acceleration | - `smartVerticalFlick()` gradual acceleration | ||||||
| - Make banner buttons look better | - Make banner buttons look better | ||||||
|  | - Way to color HTML from the composer | ||||||
|  |  | ||||||
| - Choose a better default easing type for animations | - Choose a better default easing type for animations | ||||||
| - Make HListView scrollbars visible | - Make HListView scrollbars visible | ||||||
| @@ -206,8 +207,9 @@ | |||||||
| - Turn all the Error and Response classes into exceptions and normal returns | - Turn all the Error and Response classes into exceptions and normal returns | ||||||
|   once `HttpClient` is deprecated |   once `HttpClient` is deprecated | ||||||
|  |  | ||||||
| ## Distribution | ## Distribution & dependencies | ||||||
|  |  | ||||||
|  | - Mistune v2.0 | ||||||
| - Include python dependencies in binary with rcc? | - Include python dependencies in binary with rcc? | ||||||
| - Improve the README.md | - Improve the README.md | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,3 +1,5 @@ | |||||||
|  | """HTML and Markdown processing tools.""" | ||||||
|  | 
 | ||||||
| import re | import re | ||||||
| 
 | 
 | ||||||
| import html_sanitizer.sanitizer as sanitizer | import html_sanitizer.sanitizer as sanitizer | ||||||
| @@ -7,12 +9,20 @@ from lxml.html import HtmlElement  # nosec | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class MarkdownInlineGrammar(mistune.InlineGrammar): | class MarkdownInlineGrammar(mistune.InlineGrammar): | ||||||
|     # Enable *word* but not _word_ syntaxes (TODO: config option for that) |     """Markdown inline elements syntax modifications for the Mistune parser. | ||||||
|  | 
 | ||||||
|  |     Modifications: | ||||||
|  | 
 | ||||||
|  |     - Disable underscores for bold/italics (e.g. `__bold__`) | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|     emphasis        = re.compile(r"^\*((?:\*\*|[^\*])+?)\*(?!\*)") |     emphasis        = re.compile(r"^\*((?:\*\*|[^\*])+?)\*(?!\*)") | ||||||
|     double_emphasis = re.compile(r"^\*{2}([\s\S]+?)\*{2}(?!\*)") |     double_emphasis = re.compile(r"^\*{2}([\s\S]+?)\*{2}(?!\*)") | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class MarkdownInlineLexer(mistune.InlineLexer): | class MarkdownInlineLexer(mistune.InlineLexer): | ||||||
|  |     """Apply the changes from `MarkdownInlineGrammar` for Mistune.""" | ||||||
|  | 
 | ||||||
|     grammar_class = MarkdownInlineGrammar |     grammar_class = MarkdownInlineGrammar | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @@ -24,7 +34,27 @@ class MarkdownInlineLexer(mistune.InlineLexer): | |||||||
|         return self.renderer.emphasis(self.output(m.group(1))) |         return self.renderer.emphasis(self.output(m.group(1))) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class HtmlFilter: | class HTMLProcessor: | ||||||
|  |     """Provide HTML filtering and conversion from Markdown. | ||||||
|  | 
 | ||||||
|  |     Filtering sanitizes HTML and ensures it complies with the supported Qt | ||||||
|  |     subset for usage in QML: https://doc.qt.io/qt-5/richtext-html-subset.html | ||||||
|  | 
 | ||||||
|  |     Some methods take an `outgoing` argument, specifying if the HTML is | ||||||
|  |     intended to be sent to matrix servers or used locally in our application. | ||||||
|  | 
 | ||||||
|  |     For local usage, extra transformations are applied: | ||||||
|  | 
 | ||||||
|  |     - Wrap text lines starting with a `>` in `<span>` with a `quote` class. | ||||||
|  |       This allows them to be styled appropriately from QML. | ||||||
|  | 
 | ||||||
|  |     Some methods have `inline` counterparts, which return text appropriate | ||||||
|  |     for UI elements restricted to displaying a single line, e.g. the room | ||||||
|  |     last message subtitles in QML or notifications. | ||||||
|  |     In inline filtered HTML, block tags are stripped or substituted and | ||||||
|  |     newlines are turned into ⏎ symbols (U+23CE). | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|     inline_tags = {"font", "a", "sup", "sub", "b", "i", "s", "u", "code"} |     inline_tags = {"font", "a", "sup", "sub", "b", "i", "s", "u", "code"} | ||||||
| 
 | 
 | ||||||
|     block_tags = { |     block_tags = { | ||||||
| @@ -73,14 +103,20 @@ class HtmlFilter: | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     def from_markdown(self, text: str, outgoing: bool = False) -> str: |     def from_markdown(self, text: str, outgoing: bool = False) -> str: | ||||||
|  |         """Return filtered HTML from Markdown text.""" | ||||||
|  | 
 | ||||||
|         return self.filter(self._markdown_to_html(text), outgoing) |         return self.filter(self._markdown_to_html(text), outgoing) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     def from_markdown_inline(self, text: str, outgoing: bool = False) -> str: |     def from_markdown_inline(self, text: str, outgoing: bool = False) -> str: | ||||||
|  |         """Return single-line filtered HTML from Markdown text.""" | ||||||
|  | 
 | ||||||
|         return self.filter_inline(self._markdown_to_html(text), outgoing) |         return self.filter_inline(self._markdown_to_html(text), outgoing) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     def filter_inline(self, html: str, outgoing: bool = False) -> str: |     def filter_inline(self, html: str, outgoing: bool = False) -> str: | ||||||
|  |         """Filter and return HTML with block tags stripped or substituted.""" | ||||||
|  | 
 | ||||||
|         html = self._inline_sanitizer.sanitize(html) |         html = self._inline_sanitizer.sanitize(html) | ||||||
| 
 | 
 | ||||||
|         if outgoing: |         if outgoing: | ||||||
| @@ -93,6 +129,8 @@ class HtmlFilter: | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     def filter(self, html: str, outgoing: bool = False) -> str: |     def filter(self, html: str, outgoing: bool = False) -> str: | ||||||
|  |         """Filter and return HTML.""" | ||||||
|  | 
 | ||||||
|         html = self._sanitizer.sanitize(html).rstrip("\n") |         html = self._sanitizer.sanitize(html).rstrip("\n") | ||||||
| 
 | 
 | ||||||
|         if outgoing: |         if outgoing: | ||||||
| @@ -102,6 +140,8 @@ class HtmlFilter: | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     def sanitize_settings(self, inline: bool = False) -> dict: |     def sanitize_settings(self, inline: bool = False) -> dict: | ||||||
|  |         """Return an html_sanitizer configuration.""" | ||||||
|  | 
 | ||||||
|         # https://matrix.org/docs/spec/client_server/latest#m-room-message-msgtypes |         # https://matrix.org/docs/spec/client_server/latest#m-room-message-msgtypes | ||||||
|         # TODO: mx-reply and the new hidden thing |         # TODO: mx-reply and the new hidden thing | ||||||
| 
 | 
 | ||||||
| @@ -156,6 +196,8 @@ class HtmlFilter: | |||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def _process_span_font(el: HtmlElement) -> HtmlElement: |     def _process_span_font(el: HtmlElement) -> HtmlElement: | ||||||
|  |         """Convert HTML `<span data-mx-color=...>` to `<font color=...>`.""" | ||||||
|  | 
 | ||||||
|         if el.tag not in ("span", "font"): |         if el.tag not in ("span", "font"): | ||||||
|             return el |             return el | ||||||
| 
 | 
 | ||||||
| @@ -169,6 +211,8 @@ class HtmlFilter: | |||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def _img_to_a(el: HtmlElement) -> HtmlElement: |     def _img_to_a(el: HtmlElement) -> HtmlElement: | ||||||
|  |         """Linkify images by converting `<img>` tags into `<a>` tags.""" | ||||||
|  | 
 | ||||||
|         if el.tag == "img": |         if el.tag == "img": | ||||||
|             el.tag            = "a" |             el.tag            = "a" | ||||||
|             el.attrib["href"] = el.attrib.pop("src", "") |             el.attrib["href"] = el.attrib.pop("src", "") | ||||||
| @@ -178,8 +222,11 @@ class HtmlFilter: | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     def _remove_extra_newlines(self, el: HtmlElement) -> HtmlElement: |     def _remove_extra_newlines(self, el: HtmlElement) -> HtmlElement: | ||||||
|         # Remove excess \n characters to avoid additional blank lines with |         """Remove excess `\\n` characters from non-`<pre>` HTML elements. | ||||||
|         # HTML/CSS using `white-space: pre`, except in <pre> content. | 
 | ||||||
|  |         This is done to avoid additional blank lines when the CSS directive | ||||||
|  |         `white-space: pre` is used. | ||||||
|  |         """ | ||||||
| 
 | 
 | ||||||
|         pre_parent = any(parent.tag == "pre" for parent in el.iterancestors()) |         pre_parent = any(parent.tag == "pre" for parent in el.iterancestors()) | ||||||
| 
 | 
 | ||||||
| @@ -193,9 +240,12 @@ class HtmlFilter: | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     def _newlines_to_return_symbol(self, el: HtmlElement) -> HtmlElement: |     def _newlines_to_return_symbol(self, el: HtmlElement) -> HtmlElement: | ||||||
|         # Add a return unicode symbol (U+23CE) to blocks with siblings |         """Turn newlines into unicode return symbols (⏎, U+23CE). | ||||||
|         # (e.g. a <p> followed by another <p>) or <br>. | 
 | ||||||
|         # The <br> themselves will be removed by the inline sanitizer. |         The symbol is added to blocks with siblings (e.g. a `<p>` followed by | ||||||
|  |         another `<p>`) and `<br>` tags. | ||||||
|  |         The `<br>` themselves will be removed by the inline sanitizer. | ||||||
|  |         """ | ||||||
| 
 | 
 | ||||||
|         is_block_with_siblings = (el.tag in self.block_tags and |         is_block_with_siblings = (el.tag in self.block_tags and | ||||||
|                                   next(el.itersiblings(), None) is not None) |                                   next(el.itersiblings(), None) is not None) | ||||||
| @@ -214,4 +264,4 @@ class HtmlFilter: | |||||||
|         return el |         return el | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| HTML_FILTER = HtmlFilter() | HTML_PROCESSOR = HTMLProcessor() | ||||||
| @@ -30,7 +30,7 @@ from .errors import ( | |||||||
|     BadMimeType, InvalidUserId, InvalidUserInContext, MatrixError, |     BadMimeType, InvalidUserId, InvalidUserInContext, MatrixError, | ||||||
|     UneededThumbnail, UserNotFound, |     UneededThumbnail, UserNotFound, | ||||||
| ) | ) | ||||||
| from .html_filter import HTML_FILTER | from .html_markdown import HTML_PROCESSOR as HTML | ||||||
| from .models.items import ( | from .models.items import ( | ||||||
|     Account, Event, Member, Room, TypeSpecifier, Upload, UploadStatus, |     Account, Event, Member, Room, TypeSpecifier, Upload, UploadStatus, | ||||||
| ) | ) | ||||||
| @@ -205,13 +205,13 @@ class MatrixClient(nio.AsyncClient): | |||||||
|             event_type = nio.RoomMessageEmote |             event_type = nio.RoomMessageEmote | ||||||
|             text       = text[len("/me "): ] |             text       = text[len("/me "): ] | ||||||
|             content    = {"body": text, "msgtype": "m.emote"} |             content    = {"body": text, "msgtype": "m.emote"} | ||||||
|             to_html    = HTML_FILTER.from_markdown_inline(text, outgoing=True) |             to_html    = HTML.from_markdown_inline(text, outgoing=True) | ||||||
|             echo_body  = HTML_FILTER.from_markdown_inline(text) |             echo_body  = HTML.from_markdown_inline(text) | ||||||
|         else: |         else: | ||||||
|             event_type = nio.RoomMessageText |             event_type = nio.RoomMessageText | ||||||
|             content    = {"body": text, "msgtype": "m.text"} |             content    = {"body": text, "msgtype": "m.text"} | ||||||
|             to_html    = HTML_FILTER.from_markdown(text, outgoing=True) |             to_html    = HTML.from_markdown(text, outgoing=True) | ||||||
|             echo_body  = HTML_FILTER.from_markdown(text) |             echo_body  = HTML.from_markdown(text) | ||||||
|  |  | ||||||
|         if to_html not in (html.escape(text), f"<p>{html.escape(text)}</p>"): |         if to_html not in (html.escape(text), f"<p>{html.escape(text)}</p>"): | ||||||
|             content["format"]         = "org.matrix.custom.html" |             content["format"]         = "org.matrix.custom.html" | ||||||
| @@ -832,7 +832,7 @@ class MatrixClient(nio.AsyncClient): | |||||||
|             display_name   = room.display_name or "", |             display_name   = room.display_name or "", | ||||||
|             avatar_url     = room.gen_avatar_url or "", |             avatar_url     = room.gen_avatar_url or "", | ||||||
|             plain_topic    = room.topic or "", |             plain_topic    = room.topic or "", | ||||||
|             topic          = HTML_FILTER.filter_inline(room.topic or ""), |             topic          = HTML.filter_inline(room.topic or ""), | ||||||
|             inviter_id     = inviter, |             inviter_id     = inviter, | ||||||
|             inviter_name   = room.user_name(inviter) if inviter else "", |             inviter_name   = room.user_name(inviter) if inviter else "", | ||||||
|             inviter_avatar = |             inviter_avatar = | ||||||
|   | |||||||
| @@ -10,7 +10,7 @@ import lxml  # nosec | |||||||
|  |  | ||||||
| import nio | import nio | ||||||
|  |  | ||||||
| from ..html_filter import HTML_FILTER | from ..html_markdown import HTML_PROCESSOR | ||||||
| from ..utils import AutoStrEnum, auto | from ..utils import AutoStrEnum, auto | ||||||
| from .model_item import ModelItem | from .model_item import ModelItem | ||||||
|  |  | ||||||
| @@ -200,7 +200,7 @@ class Event(ModelItem): | |||||||
|  |  | ||||||
|     def __post_init__(self) -> None: |     def __post_init__(self) -> None: | ||||||
|         if not self.inline_content: |         if not self.inline_content: | ||||||
|             self.inline_content = HTML_FILTER.filter_inline(self.content) |             self.inline_content = HTML_PROCESSOR.filter_inline(self.content) | ||||||
|  |  | ||||||
|  |  | ||||||
|     def __lt__(self, other: "Event") -> bool: |     def __lt__(self, other: "Event") -> bool: | ||||||
|   | |||||||
| @@ -10,7 +10,7 @@ from urllib.parse import quote | |||||||
| import nio | import nio | ||||||
|  |  | ||||||
| from . import utils | from . import utils | ||||||
| from .html_filter import HTML_FILTER | from .html_markdown import HTML_PROCESSOR | ||||||
| from .matrix_client import MatrixClient | from .matrix_client import MatrixClient | ||||||
| from .models.items import Account, Room, TypeSpecifier | from .models.items import Account, Room, TypeSpecifier | ||||||
|  |  | ||||||
| @@ -80,7 +80,7 @@ class NioCallbacks: | |||||||
|     # Content: %1 is the sender, %2 the target (ev.state_key). |     # Content: %1 is the sender, %2 the target (ev.state_key). | ||||||
|  |  | ||||||
|     async def onRoomMessageText(self, room, ev) -> None: |     async def onRoomMessageText(self, room, ev) -> None: | ||||||
|         co = HTML_FILTER.filter( |         co = HTML_PROCESSOR.filter( | ||||||
|             ev.formatted_body |             ev.formatted_body | ||||||
|             if ev.format == "org.matrix.custom.html" else |             if ev.format == "org.matrix.custom.html" else | ||||||
|             utils.plain2html(ev.body), |             utils.plain2html(ev.body), | ||||||
| @@ -315,7 +315,7 @@ class NioCallbacks: | |||||||
|  |  | ||||||
|     async def onRoomTopicEvent(self, room, ev) -> None: |     async def onRoomTopicEvent(self, room, ev) -> None: | ||||||
|         if ev.topic: |         if ev.topic: | ||||||
|             topic = HTML_FILTER.filter_inline(ev.topic) |             topic = HTML_PROCESSOR.filter_inline(ev.topic) | ||||||
|             co    = f"%1 changed the room's topic to \"{topic}\"" |             co    = f"%1 changed the room's topic to \"{topic}\"" | ||||||
|         else: |         else: | ||||||
|             co = "%1 removed the room's topic" |             co = "%1 removed the room's topic" | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	