Rename and document html filtering stuff

This commit is contained in:
miruka 2019-12-18 09:33:22 -04:00
parent cfcc190473
commit d7045644f1
5 changed files with 72 additions and 20 deletions

View File

@ -90,6 +90,7 @@
- Way to open context menus without a right mouse button - Way to open context menus without a right mouse button
- `smartVerticalFlick()` gradual acceleration - `smartVerticalFlick()` gradual acceleration
- Make banner buttons look better - Make banner buttons look better
- Way to color HTML from the composer
- Choose a better default easing type for animations - Choose a better default easing type for animations
- Make HListView scrollbars visible - Make HListView scrollbars visible
@ -206,8 +207,9 @@
- Turn all the Error and Response classes into exceptions and normal returns - Turn all the Error and Response classes into exceptions and normal returns
once `HttpClient` is deprecated once `HttpClient` is deprecated
## Distribution ## Distribution & dependencies
- Mistune v2.0
- Include python dependencies in binary with rcc? - Include python dependencies in binary with rcc?
- Improve the README.md - Improve the README.md

View File

@ -1,3 +1,5 @@
"""HTML and Markdown processing tools."""
import re import re
import html_sanitizer.sanitizer as sanitizer import html_sanitizer.sanitizer as sanitizer
@ -7,12 +9,20 @@ from lxml.html import HtmlElement # nosec
class MarkdownInlineGrammar(mistune.InlineGrammar): class MarkdownInlineGrammar(mistune.InlineGrammar):
# Enable *word* but not _word_ syntaxes (TODO: config option for that) """Markdown inline elements syntax modifications for the Mistune parser.
Modifications:
- Disable underscores for bold/italics (e.g. `__bold__`)
"""
emphasis = re.compile(r"^\*((?:\*\*|[^\*])+?)\*(?!\*)") emphasis = re.compile(r"^\*((?:\*\*|[^\*])+?)\*(?!\*)")
double_emphasis = re.compile(r"^\*{2}([\s\S]+?)\*{2}(?!\*)") double_emphasis = re.compile(r"^\*{2}([\s\S]+?)\*{2}(?!\*)")
class MarkdownInlineLexer(mistune.InlineLexer): class MarkdownInlineLexer(mistune.InlineLexer):
"""Apply the changes from `MarkdownInlineGrammar` for Mistune."""
grammar_class = MarkdownInlineGrammar grammar_class = MarkdownInlineGrammar
@ -24,7 +34,27 @@ class MarkdownInlineLexer(mistune.InlineLexer):
return self.renderer.emphasis(self.output(m.group(1))) return self.renderer.emphasis(self.output(m.group(1)))
class HtmlFilter: class HTMLProcessor:
"""Provide HTML filtering and conversion from Markdown.
Filtering sanitizes HTML and ensures it complies with the supported Qt
subset for usage in QML: https://doc.qt.io/qt-5/richtext-html-subset.html
Some methods take an `outgoing` argument, specifying if the HTML is
intended to be sent to matrix servers or used locally in our application.
For local usage, extra transformations are applied:
- Wrap text lines starting with a `>` in `<span>` with a `quote` class.
This allows them to be styled appropriately from QML.
Some methods have `inline` counterparts, which return text appropriate
for UI elements restricted to display a single line, e.g. the room
last message subtitles in QML or notifications.
In inline filtered HTML, block tags are stripped or substituted and
newlines are turned into symbols (U+23CE).
"""
inline_tags = {"font", "a", "sup", "sub", "b", "i", "s", "u", "code"} inline_tags = {"font", "a", "sup", "sub", "b", "i", "s", "u", "code"}
block_tags = { block_tags = {
@ -73,14 +103,20 @@ class HtmlFilter:
def from_markdown(self, text: str, outgoing: bool = False) -> str: def from_markdown(self, text: str, outgoing: bool = False) -> str:
"""Return filtered HTML from Markdown text."""
return self.filter(self._markdown_to_html(text), outgoing) return self.filter(self._markdown_to_html(text), outgoing)
def from_markdown_inline(self, text: str, outgoing: bool = False) -> str: def from_markdown_inline(self, text: str, outgoing: bool = False) -> str:
"""Return single-line filtered HTML from Markdown text."""
return self.filter_inline(self._markdown_to_html(text), outgoing) return self.filter_inline(self._markdown_to_html(text), outgoing)
def filter_inline(self, html: str, outgoing: bool = False) -> str: def filter_inline(self, html: str, outgoing: bool = False) -> str:
"""Filter and return HTML with block tags stripped or substituted."""
html = self._inline_sanitizer.sanitize(html) html = self._inline_sanitizer.sanitize(html)
if outgoing: if outgoing:
@ -93,6 +129,8 @@ class HtmlFilter:
def filter(self, html: str, outgoing: bool = False) -> str: def filter(self, html: str, outgoing: bool = False) -> str:
"""Filter and return HTML."""
html = self._sanitizer.sanitize(html).rstrip("\n") html = self._sanitizer.sanitize(html).rstrip("\n")
if outgoing: if outgoing:
@ -102,6 +140,8 @@ class HtmlFilter:
def sanitize_settings(self, inline: bool = False) -> dict: def sanitize_settings(self, inline: bool = False) -> dict:
"""Return an html_sanitizer configuration."""
# https://matrix.org/docs/spec/client_server/latest#m-room-message-msgtypes # https://matrix.org/docs/spec/client_server/latest#m-room-message-msgtypes
# TODO: mx-reply and the new hidden thing # TODO: mx-reply and the new hidden thing
@ -156,6 +196,8 @@ class HtmlFilter:
@staticmethod @staticmethod
def _process_span_font(el: HtmlElement) -> HtmlElement: def _process_span_font(el: HtmlElement) -> HtmlElement:
"""Convert HTML `<span data-mx-color=...>` to `<font color=...>`."""
if el.tag not in ("span", "font"): if el.tag not in ("span", "font"):
return el return el
@ -169,6 +211,8 @@ class HtmlFilter:
@staticmethod @staticmethod
def _img_to_a(el: HtmlElement) -> HtmlElement: def _img_to_a(el: HtmlElement) -> HtmlElement:
"""Linkify images by turning `<img>` tags into `<a>` links to their source."""
if el.tag == "img": if el.tag == "img":
el.tag = "a" el.tag = "a"
el.attrib["href"] = el.attrib.pop("src", "") el.attrib["href"] = el.attrib.pop("src", "")
@ -178,8 +222,11 @@ class HtmlFilter:
def _remove_extra_newlines(self, el: HtmlElement) -> HtmlElement: def _remove_extra_newlines(self, el: HtmlElement) -> HtmlElement:
# Remove excess \n characters to avoid additional blank lines with """Remove excess `\\n` characters from non-`<pre>` HTML elements.
# HTML/CSS using `white-space: pre`, except in <pre> content.
This is done to avoid additional blank lines when the CSS directive
`white-space: pre` is used.
"""
pre_parent = any(parent.tag == "pre" for parent in el.iterancestors()) pre_parent = any(parent.tag == "pre" for parent in el.iterancestors())
@ -193,9 +240,12 @@ class HtmlFilter:
def _newlines_to_return_symbol(self, el: HtmlElement) -> HtmlElement: def _newlines_to_return_symbol(self, el: HtmlElement) -> HtmlElement:
# Add a return unicode symbol (U+23CE) to blocks with siblings """Turn newlines into unicode return symbols (⏎, U+23CE).
# (e.g. a <p> followed by another <p>) or <br>.
# The <br> themselves will be removed by the inline sanitizer. The symbol is added to blocks with siblings (e.g. a `<p>` followed by
another `<p>`) and `<br>` tags.
The `<br>` themselves will be removed by the inline sanitizer.
"""
is_block_with_siblings = (el.tag in self.block_tags and is_block_with_siblings = (el.tag in self.block_tags and
next(el.itersiblings(), None) is not None) next(el.itersiblings(), None) is not None)
@ -214,4 +264,4 @@ class HtmlFilter:
return el return el
HTML_FILTER = HtmlFilter() HTML_PROCESSOR = HTMLProcessor()

View File

@ -30,7 +30,7 @@ from .errors import (
BadMimeType, InvalidUserId, InvalidUserInContext, MatrixError, BadMimeType, InvalidUserId, InvalidUserInContext, MatrixError,
UneededThumbnail, UserNotFound, UneededThumbnail, UserNotFound,
) )
from .html_filter import HTML_FILTER from .html_markdown import HTML_PROCESSOR as HTML
from .models.items import ( from .models.items import (
Account, Event, Member, Room, TypeSpecifier, Upload, UploadStatus, Account, Event, Member, Room, TypeSpecifier, Upload, UploadStatus,
) )
@ -205,13 +205,13 @@ class MatrixClient(nio.AsyncClient):
event_type = nio.RoomMessageEmote event_type = nio.RoomMessageEmote
text = text[len("/me "): ] text = text[len("/me "): ]
content = {"body": text, "msgtype": "m.emote"} content = {"body": text, "msgtype": "m.emote"}
to_html = HTML_FILTER.from_markdown_inline(text, outgoing=True) to_html = HTML.from_markdown_inline(text, outgoing=True)
echo_body = HTML_FILTER.from_markdown_inline(text) echo_body = HTML.from_markdown_inline(text)
else: else:
event_type = nio.RoomMessageText event_type = nio.RoomMessageText
content = {"body": text, "msgtype": "m.text"} content = {"body": text, "msgtype": "m.text"}
to_html = HTML_FILTER.from_markdown(text, outgoing=True) to_html = HTML.from_markdown(text, outgoing=True)
echo_body = HTML_FILTER.from_markdown(text) echo_body = HTML.from_markdown(text)
if to_html not in (html.escape(text), f"<p>{html.escape(text)}</p>"): if to_html not in (html.escape(text), f"<p>{html.escape(text)}</p>"):
content["format"] = "org.matrix.custom.html" content["format"] = "org.matrix.custom.html"
@ -832,7 +832,7 @@ class MatrixClient(nio.AsyncClient):
display_name = room.display_name or "", display_name = room.display_name or "",
avatar_url = room.gen_avatar_url or "", avatar_url = room.gen_avatar_url or "",
plain_topic = room.topic or "", plain_topic = room.topic or "",
topic = HTML_FILTER.filter_inline(room.topic or ""), topic = HTML.filter_inline(room.topic or ""),
inviter_id = inviter, inviter_id = inviter,
inviter_name = room.user_name(inviter) if inviter else "", inviter_name = room.user_name(inviter) if inviter else "",
inviter_avatar = inviter_avatar =

View File

@ -10,7 +10,7 @@ import lxml # nosec
import nio import nio
from ..html_filter import HTML_FILTER from ..html_markdown import HTML_PROCESSOR
from ..utils import AutoStrEnum, auto from ..utils import AutoStrEnum, auto
from .model_item import ModelItem from .model_item import ModelItem
@ -200,7 +200,7 @@ class Event(ModelItem):
def __post_init__(self) -> None: def __post_init__(self) -> None:
if not self.inline_content: if not self.inline_content:
self.inline_content = HTML_FILTER.filter_inline(self.content) self.inline_content = HTML_PROCESSOR.filter_inline(self.content)
def __lt__(self, other: "Event") -> bool: def __lt__(self, other: "Event") -> bool:

View File

@ -10,7 +10,7 @@ from urllib.parse import quote
import nio import nio
from . import utils from . import utils
from .html_filter import HTML_FILTER from .html_markdown import HTML_PROCESSOR
from .matrix_client import MatrixClient from .matrix_client import MatrixClient
from .models.items import Account, Room, TypeSpecifier from .models.items import Account, Room, TypeSpecifier
@ -80,7 +80,7 @@ class NioCallbacks:
# Content: %1 is the sender, %2 the target (ev.state_key). # Content: %1 is the sender, %2 the target (ev.state_key).
async def onRoomMessageText(self, room, ev) -> None: async def onRoomMessageText(self, room, ev) -> None:
co = HTML_FILTER.filter( co = HTML_PROCESSOR.filter(
ev.formatted_body ev.formatted_body
if ev.format == "org.matrix.custom.html" else if ev.format == "org.matrix.custom.html" else
utils.plain2html(ev.body), utils.plain2html(ev.body),
@ -315,7 +315,7 @@ class NioCallbacks:
async def onRoomTopicEvent(self, room, ev) -> None: async def onRoomTopicEvent(self, room, ev) -> None:
if ev.topic: if ev.topic:
topic = HTML_FILTER.filter_inline(ev.topic) topic = HTML_PROCESSOR.filter_inline(ev.topic)
co = f"%1 changed the room's topic to \"{topic}\"" co = f"%1 changed the room's topic to \"{topic}\""
else: else:
co = "%1 removed the room's topic" co = "%1 removed the room's topic"