Rename and document html filtering stuff
This commit is contained in:
parent
cfcc190473
commit
d7045644f1
4
TODO.md
4
TODO.md
@ -90,6 +90,7 @@
|
|||||||
- Way to open context menus without a right mouse button
|
- Way to open context menus without a right mouse button
|
||||||
- `smartVerticalFlick()` gradual acceleration
|
- `smartVerticalFlick()` gradual acceleration
|
||||||
- Make banner buttons look better
|
- Make banner buttons look better
|
||||||
|
- Way to color HTML from the composer
|
||||||
|
|
||||||
- Choose a better default easing type for animations
|
- Choose a better default easing type for animations
|
||||||
- Make HListView scrollbars visible
|
- Make HListView scrollbars visible
|
||||||
@ -206,8 +207,9 @@
|
|||||||
- Turn all the Error and Response classes into exceptions and normal returns
|
- Turn all the Error and Response classes into exceptions and normal returns
|
||||||
once `HttpClient` is deprecated
|
once `HttpClient` is deprecated
|
||||||
|
|
||||||
## Distribution
|
## Distribution & dependencies
|
||||||
|
|
||||||
|
- Mistune v2.0
|
||||||
- Include python dependencies in binary with rcc?
|
- Include python dependencies in binary with rcc?
|
||||||
- Improve the README.md
|
- Improve the README.md
|
||||||
|
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
"""HTML and Markdown processing tools."""
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import html_sanitizer.sanitizer as sanitizer
|
import html_sanitizer.sanitizer as sanitizer
|
||||||
@ -7,12 +9,20 @@ from lxml.html import HtmlElement # nosec
|
|||||||
|
|
||||||
|
|
||||||
class MarkdownInlineGrammar(mistune.InlineGrammar):
|
class MarkdownInlineGrammar(mistune.InlineGrammar):
|
||||||
# Enable *word* but not _word_ syntaxes (TODO: config option for that)
|
"""Markdown inline elements syntax modifications for the Mistune parser.
|
||||||
|
|
||||||
|
Modifications:
|
||||||
|
|
||||||
|
- Disable underscores for bold/italics (e.g. `__bold__`)
|
||||||
|
"""
|
||||||
|
|
||||||
emphasis = re.compile(r"^\*((?:\*\*|[^\*])+?)\*(?!\*)")
|
emphasis = re.compile(r"^\*((?:\*\*|[^\*])+?)\*(?!\*)")
|
||||||
double_emphasis = re.compile(r"^\*{2}([\s\S]+?)\*{2}(?!\*)")
|
double_emphasis = re.compile(r"^\*{2}([\s\S]+?)\*{2}(?!\*)")
|
||||||
|
|
||||||
|
|
||||||
class MarkdownInlineLexer(mistune.InlineLexer):
|
class MarkdownInlineLexer(mistune.InlineLexer):
|
||||||
|
"""Apply the changes from `MarkdownInlineGrammar` for Mistune."""
|
||||||
|
|
||||||
grammar_class = MarkdownInlineGrammar
|
grammar_class = MarkdownInlineGrammar
|
||||||
|
|
||||||
|
|
||||||
@ -24,7 +34,27 @@ class MarkdownInlineLexer(mistune.InlineLexer):
|
|||||||
return self.renderer.emphasis(self.output(m.group(1)))
|
return self.renderer.emphasis(self.output(m.group(1)))
|
||||||
|
|
||||||
|
|
||||||
class HtmlFilter:
|
class HTMLProcessor:
|
||||||
|
"""Provide HTML filtering and conversion from Markdown.
|
||||||
|
|
||||||
|
Filtering sanitizes HTML and ensures it complies with the supported Qt
|
||||||
|
subset for usage in QML: https://doc.qt.io/qt-5/richtext-html-subset.html
|
||||||
|
|
||||||
|
Some methods take an `outgoing` argument, specifying if the HTML is
|
||||||
|
intended to be sent to Matrix servers or used locally in our application.
|
||||||
|
|
||||||
|
For local usage, extra transformations are applied:
|
||||||
|
|
||||||
|
- Wrap text lines starting with a `>` in `<span>` with a `quote` class.
|
||||||
|
This allows them to be styled appropriately from QML.
|
||||||
|
|
||||||
|
Some methods have `inline` counterparts, which return text appropriate
|
||||||
|
for UI elements restricted to a single line, e.g. each room's
|
||||||
|
last-message subtitle in QML, or notifications.
|
||||||
|
In inline filtered HTML, block tags are stripped or substituted and
|
||||||
|
newlines are turned into ⏎ symbols (U+23CE).
|
||||||
|
"""
|
||||||
|
|
||||||
inline_tags = {"font", "a", "sup", "sub", "b", "i", "s", "u", "code"}
|
inline_tags = {"font", "a", "sup", "sub", "b", "i", "s", "u", "code"}
|
||||||
|
|
||||||
block_tags = {
|
block_tags = {
|
||||||
@ -73,14 +103,20 @@ class HtmlFilter:
|
|||||||
|
|
||||||
|
|
||||||
def from_markdown(self, text: str, outgoing: bool = False) -> str:
|
def from_markdown(self, text: str, outgoing: bool = False) -> str:
|
||||||
|
"""Return filtered HTML from Markdown text."""
|
||||||
|
|
||||||
return self.filter(self._markdown_to_html(text), outgoing)
|
return self.filter(self._markdown_to_html(text), outgoing)
|
||||||
|
|
||||||
|
|
||||||
def from_markdown_inline(self, text: str, outgoing: bool = False) -> str:
|
def from_markdown_inline(self, text: str, outgoing: bool = False) -> str:
|
||||||
|
"""Return single-line filtered HTML from Markdown text."""
|
||||||
|
|
||||||
return self.filter_inline(self._markdown_to_html(text), outgoing)
|
return self.filter_inline(self._markdown_to_html(text), outgoing)
|
||||||
|
|
||||||
|
|
||||||
def filter_inline(self, html: str, outgoing: bool = False) -> str:
|
def filter_inline(self, html: str, outgoing: bool = False) -> str:
|
||||||
|
"""Filter and return HTML with block tags stripped or substituted."""
|
||||||
|
|
||||||
html = self._inline_sanitizer.sanitize(html)
|
html = self._inline_sanitizer.sanitize(html)
|
||||||
|
|
||||||
if outgoing:
|
if outgoing:
|
||||||
@ -93,6 +129,8 @@ class HtmlFilter:
|
|||||||
|
|
||||||
|
|
||||||
def filter(self, html: str, outgoing: bool = False) -> str:
|
def filter(self, html: str, outgoing: bool = False) -> str:
|
||||||
|
"""Filter and return HTML."""
|
||||||
|
|
||||||
html = self._sanitizer.sanitize(html).rstrip("\n")
|
html = self._sanitizer.sanitize(html).rstrip("\n")
|
||||||
|
|
||||||
if outgoing:
|
if outgoing:
|
||||||
@ -102,6 +140,8 @@ class HtmlFilter:
|
|||||||
|
|
||||||
|
|
||||||
def sanitize_settings(self, inline: bool = False) -> dict:
|
def sanitize_settings(self, inline: bool = False) -> dict:
|
||||||
|
"""Return an html_sanitizer configuration."""
|
||||||
|
|
||||||
# https://matrix.org/docs/spec/client_server/latest#m-room-message-msgtypes
|
# https://matrix.org/docs/spec/client_server/latest#m-room-message-msgtypes
|
||||||
# TODO: mx-reply and the new hidden thing
|
# TODO: mx-reply and the new hidden thing
|
||||||
|
|
||||||
@ -156,6 +196,8 @@ class HtmlFilter:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _process_span_font(el: HtmlElement) -> HtmlElement:
|
def _process_span_font(el: HtmlElement) -> HtmlElement:
|
||||||
|
"""Convert HTML `<span data-mx-color=...>` to `<font color=...>`."""
|
||||||
|
|
||||||
if el.tag not in ("span", "font"):
|
if el.tag not in ("span", "font"):
|
||||||
return el
|
return el
|
||||||
|
|
||||||
@ -169,6 +211,8 @@ class HtmlFilter:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _img_to_a(el: HtmlElement) -> HtmlElement:
|
def _img_to_a(el: HtmlElement) -> HtmlElement:
|
||||||
|
"""Linkify images by wrapping `<img>` tags in `<a>`."""
|
||||||
|
|
||||||
if el.tag == "img":
|
if el.tag == "img":
|
||||||
el.tag = "a"
|
el.tag = "a"
|
||||||
el.attrib["href"] = el.attrib.pop("src", "")
|
el.attrib["href"] = el.attrib.pop("src", "")
|
||||||
@ -178,8 +222,11 @@ class HtmlFilter:
|
|||||||
|
|
||||||
|
|
||||||
def _remove_extra_newlines(self, el: HtmlElement) -> HtmlElement:
|
def _remove_extra_newlines(self, el: HtmlElement) -> HtmlElement:
|
||||||
# Remove excess \n characters to avoid additional blank lines with
|
"""Remove excess `\\n` characters from non-`<pre>` HTML elements.
|
||||||
# HTML/CSS using `white-space: pre`, except in <pre> content.
|
|
||||||
|
This is done to avoid additional blank lines when the CSS directive
|
||||||
|
`white-space: pre` is used.
|
||||||
|
"""
|
||||||
|
|
||||||
pre_parent = any(parent.tag == "pre" for parent in el.iterancestors())
|
pre_parent = any(parent.tag == "pre" for parent in el.iterancestors())
|
||||||
|
|
||||||
@ -193,9 +240,12 @@ class HtmlFilter:
|
|||||||
|
|
||||||
|
|
||||||
def _newlines_to_return_symbol(self, el: HtmlElement) -> HtmlElement:
|
def _newlines_to_return_symbol(self, el: HtmlElement) -> HtmlElement:
|
||||||
# Add a return unicode symbol (U+23CE) to blocks with siblings
|
"""Turn newlines into Unicode return symbols (⏎, U+23CE).
|
||||||
# (e.g. a <p> followed by another <p>) or <br>.
|
|
||||||
# The <br> themselves will be removed by the inline sanitizer.
|
The symbol is added to blocks with siblings (e.g. a `<p>` followed by
|
||||||
|
another `<p>`) and `<br>` tags.
|
||||||
|
The `<br>` themselves will be removed by the inline sanitizer.
|
||||||
|
"""
|
||||||
|
|
||||||
is_block_with_siblings = (el.tag in self.block_tags and
|
is_block_with_siblings = (el.tag in self.block_tags and
|
||||||
next(el.itersiblings(), None) is not None)
|
next(el.itersiblings(), None) is not None)
|
||||||
@ -214,4 +264,4 @@ class HtmlFilter:
|
|||||||
return el
|
return el
|
||||||
|
|
||||||
|
|
||||||
HTML_FILTER = HtmlFilter()
|
HTML_PROCESSOR = HTMLProcessor()
|
@ -30,7 +30,7 @@ from .errors import (
|
|||||||
BadMimeType, InvalidUserId, InvalidUserInContext, MatrixError,
|
BadMimeType, InvalidUserId, InvalidUserInContext, MatrixError,
|
||||||
UneededThumbnail, UserNotFound,
|
UneededThumbnail, UserNotFound,
|
||||||
)
|
)
|
||||||
from .html_filter import HTML_FILTER
|
from .html_markdown import HTML_PROCESSOR as HTML
|
||||||
from .models.items import (
|
from .models.items import (
|
||||||
Account, Event, Member, Room, TypeSpecifier, Upload, UploadStatus,
|
Account, Event, Member, Room, TypeSpecifier, Upload, UploadStatus,
|
||||||
)
|
)
|
||||||
@ -205,13 +205,13 @@ class MatrixClient(nio.AsyncClient):
|
|||||||
event_type = nio.RoomMessageEmote
|
event_type = nio.RoomMessageEmote
|
||||||
text = text[len("/me "): ]
|
text = text[len("/me "): ]
|
||||||
content = {"body": text, "msgtype": "m.emote"}
|
content = {"body": text, "msgtype": "m.emote"}
|
||||||
to_html = HTML_FILTER.from_markdown_inline(text, outgoing=True)
|
to_html = HTML.from_markdown_inline(text, outgoing=True)
|
||||||
echo_body = HTML_FILTER.from_markdown_inline(text)
|
echo_body = HTML.from_markdown_inline(text)
|
||||||
else:
|
else:
|
||||||
event_type = nio.RoomMessageText
|
event_type = nio.RoomMessageText
|
||||||
content = {"body": text, "msgtype": "m.text"}
|
content = {"body": text, "msgtype": "m.text"}
|
||||||
to_html = HTML_FILTER.from_markdown(text, outgoing=True)
|
to_html = HTML.from_markdown(text, outgoing=True)
|
||||||
echo_body = HTML_FILTER.from_markdown(text)
|
echo_body = HTML.from_markdown(text)
|
||||||
|
|
||||||
if to_html not in (html.escape(text), f"<p>{html.escape(text)}</p>"):
|
if to_html not in (html.escape(text), f"<p>{html.escape(text)}</p>"):
|
||||||
content["format"] = "org.matrix.custom.html"
|
content["format"] = "org.matrix.custom.html"
|
||||||
@ -832,7 +832,7 @@ class MatrixClient(nio.AsyncClient):
|
|||||||
display_name = room.display_name or "",
|
display_name = room.display_name or "",
|
||||||
avatar_url = room.gen_avatar_url or "",
|
avatar_url = room.gen_avatar_url or "",
|
||||||
plain_topic = room.topic or "",
|
plain_topic = room.topic or "",
|
||||||
topic = HTML_FILTER.filter_inline(room.topic or ""),
|
topic = HTML.filter_inline(room.topic or ""),
|
||||||
inviter_id = inviter,
|
inviter_id = inviter,
|
||||||
inviter_name = room.user_name(inviter) if inviter else "",
|
inviter_name = room.user_name(inviter) if inviter else "",
|
||||||
inviter_avatar =
|
inviter_avatar =
|
||||||
|
@ -10,7 +10,7 @@ import lxml # nosec
|
|||||||
|
|
||||||
import nio
|
import nio
|
||||||
|
|
||||||
from ..html_filter import HTML_FILTER
|
from ..html_markdown import HTML_PROCESSOR
|
||||||
from ..utils import AutoStrEnum, auto
|
from ..utils import AutoStrEnum, auto
|
||||||
from .model_item import ModelItem
|
from .model_item import ModelItem
|
||||||
|
|
||||||
@ -200,7 +200,7 @@ class Event(ModelItem):
|
|||||||
|
|
||||||
def __post_init__(self) -> None:
|
def __post_init__(self) -> None:
|
||||||
if not self.inline_content:
|
if not self.inline_content:
|
||||||
self.inline_content = HTML_FILTER.filter_inline(self.content)
|
self.inline_content = HTML_PROCESSOR.filter_inline(self.content)
|
||||||
|
|
||||||
|
|
||||||
def __lt__(self, other: "Event") -> bool:
|
def __lt__(self, other: "Event") -> bool:
|
||||||
|
@ -10,7 +10,7 @@ from urllib.parse import quote
|
|||||||
import nio
|
import nio
|
||||||
|
|
||||||
from . import utils
|
from . import utils
|
||||||
from .html_filter import HTML_FILTER
|
from .html_markdown import HTML_PROCESSOR
|
||||||
from .matrix_client import MatrixClient
|
from .matrix_client import MatrixClient
|
||||||
from .models.items import Account, Room, TypeSpecifier
|
from .models.items import Account, Room, TypeSpecifier
|
||||||
|
|
||||||
@ -80,7 +80,7 @@ class NioCallbacks:
|
|||||||
# Content: %1 is the sender, %2 the target (ev.state_key).
|
# Content: %1 is the sender, %2 the target (ev.state_key).
|
||||||
|
|
||||||
async def onRoomMessageText(self, room, ev) -> None:
|
async def onRoomMessageText(self, room, ev) -> None:
|
||||||
co = HTML_FILTER.filter(
|
co = HTML_PROCESSOR.filter(
|
||||||
ev.formatted_body
|
ev.formatted_body
|
||||||
if ev.format == "org.matrix.custom.html" else
|
if ev.format == "org.matrix.custom.html" else
|
||||||
utils.plain2html(ev.body),
|
utils.plain2html(ev.body),
|
||||||
@ -315,7 +315,7 @@ class NioCallbacks:
|
|||||||
|
|
||||||
async def onRoomTopicEvent(self, room, ev) -> None:
|
async def onRoomTopicEvent(self, room, ev) -> None:
|
||||||
if ev.topic:
|
if ev.topic:
|
||||||
topic = HTML_FILTER.filter_inline(ev.topic)
|
topic = HTML_PROCESSOR.filter_inline(ev.topic)
|
||||||
co = f"%1 changed the room's topic to \"{topic}\""
|
co = f"%1 changed the room's topic to \"{topic}\""
|
||||||
else:
|
else:
|
||||||
co = "%1 removed the room's topic"
|
co = "%1 removed the room's topic"
|
||||||
|
Loading…
Reference in New Issue
Block a user