fix(matrix): sanitize formatted html with nh3

This commit is contained in:
Alexander Minges
2026-02-10 16:18:47 +01:00
parent 9b14869cb1
commit 6be7368a38
3 changed files with 137 additions and 6 deletions

View File

@@ -2,6 +2,7 @@ import asyncio
import logging
from typing import Any
import nh3
from loguru import logger
from mistune import create_markdown
from nio import (
@@ -26,21 +27,106 @@ MATRIX_HTML_FORMAT = "org.matrix.custom.html"
# Keep plugin output aligned with Matrix recommended HTML tags:
# https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
# - table/strikethrough/task_lists are already used in replies.
# - table/strikethrough are already used in replies.
# - url, superscript, and subscript map to common tags (<a>, <sup>, <sub>)
# that Matrix clients (e.g. Element/FluffyChat) can render consistently.
# We intentionally avoid plugins that emit less-portable tags to keep output
# predictable across clients.
MATRIX_MARKDOWN = create_markdown(
escape=True,
plugins=["table", "strikethrough", "task_lists", "url", "superscript", "subscript"],
plugins=["table", "strikethrough", "url", "superscript", "subscript"],
)
# Sanitizer policy rationale:
# - Baseline follows Matrix formatted message guidance:
# https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
# - We intentionally use a tighter subset than the full spec to keep behavior
# predictable across clients and reduce risk from LLM-generated content.
# - URLs are restricted to common safe schemes for links, and image sources are
# additionally constrained to mxc:// for Matrix-native media handling.
# - Spec items intentionally NOT enabled yet:
# - href schemes ftp/magnet (we keep link schemes smaller for now).
# - a[target] (clients already control link-opening behavior).
# - span[data-mx-bg-color|data-mx-color|data-mx-spoiler|data-mx-maths]
# - div[data-mx-maths]
# These can be added later when we explicitly support those Matrix features.
# Tag allow-list for sanitized Matrix HTML, grouped by role for readability.
# The grouping is cosmetic: this is a plain set, so order carries no meaning.
MATRIX_ALLOWED_HTML_TAGS = {
    # Paragraphs, quotes, preformatted text and breaks.
    "p",
    "blockquote",
    "pre",
    "hr",
    "br",
    # Headings.
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    # Inline formatting and links.
    "a",
    "strong",
    "em",
    "del",
    "code",
    "sup",
    "sub",
    # Lists.
    "ul",
    "ol",
    "li",
    # Tables.
    "table",
    "caption",
    "thead",
    "tbody",
    "tr",
    "th",
    "td",
    # Images.
    "img",
}
# Attributes listed per tag for the sanitizer. Values of a[href], img[src] and
# code[class] are narrowed further by _filter_matrix_html_attribute.
MATRIX_ALLOWED_HTML_ATTRIBUTES: dict[str, set[str]] = {
    "a": {"href"},
    "code": {"class"},
    "ol": {"start"},
    "img": {
        "alt",
        "height",
        "src",
        "title",
        "width",
    },
}

# URL schemes listed for the sanitizer. "mxc" is present for image sources;
# the attribute filter does not accept it for link targets.
MATRIX_ALLOWED_URL_SCHEMES = {"http", "https", "mailto", "matrix", "mxc"}
def _filter_matrix_html_attribute(tag: str, attr: str, value: str) -> str | None:
"""Filter attribute values to a safe Matrix-compatible subset."""
if tag == "a" and attr == "href":
lower_value = value.lower()
if lower_value.startswith(("https://", "http://", "matrix:", "mailto:")):
return value
return None
if tag == "img" and attr == "src":
return value if value.lower().startswith("mxc://") else None
if tag == "code" and attr == "class":
classes = [
cls
for cls in value.split()
if cls.startswith("language-") and not cls.startswith("language-_")
]
return " ".join(classes) if classes else None
return value
# Sanitizer instance configured from the allow-list constants and the
# attribute filter defined in this module.
MATRIX_HTML_CLEANER = nh3.Cleaner(
    # Allow-lists.
    tags=MATRIX_ALLOWED_HTML_TAGS,
    attributes=MATRIX_ALLOWED_HTML_ATTRIBUTES,
    url_schemes=MATRIX_ALLOWED_URL_SCHEMES,
    # Value-level checks for a[href], img[src] and code[class].
    attribute_filter=_filter_matrix_html_attribute,
    # Output hygiene: rel value for links, and HTML comments stripped.
    link_rel="noopener noreferrer",
    strip_comments=True,
)
def _render_markdown_html(text: str) -> str | None:
"""Render markdown to HTML for Matrix formatted messages."""
try:
formatted = MATRIX_MARKDOWN(text).strip()
rendered = MATRIX_MARKDOWN(text)
formatted = MATRIX_HTML_CLEANER.clean(rendered).strip()
except Exception as e:
logger.debug(
"Matrix markdown rendering failed ({}): {}",