fix(matrix): sanitize formatted html with nh3

This commit is contained in:
Alexander Minges
2026-02-10 16:18:47 +01:00
parent 9b14869cb1
commit 6be7368a38
3 changed files with 137 additions and 6 deletions

View File

@@ -2,6 +2,7 @@ import asyncio
import logging import logging
from typing import Any from typing import Any
import nh3
from loguru import logger from loguru import logger
from mistune import create_markdown from mistune import create_markdown
from nio import ( from nio import (
@@ -26,21 +27,106 @@ MATRIX_HTML_FORMAT = "org.matrix.custom.html"
# Keep plugin output aligned with Matrix recommended HTML tags: # Keep plugin output aligned with Matrix recommended HTML tags:
# https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes # https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
# - table/strikethrough/task_lists are already used in replies. # - table/strikethrough are already used in replies.
# - url, superscript, and subscript map to common tags (<a>, <sup>, <sub>) # - url, superscript, and subscript map to common tags (<a>, <sup>, <sub>)
# that Matrix clients (e.g. Element/FluffyChat) can render consistently. # that Matrix clients (e.g. Element/FluffyChat) can render consistently.
# We intentionally avoid plugins that emit less-portable tags to keep output # We intentionally avoid plugins that emit less-portable tags to keep output
# predictable across clients. # predictable across clients.
MATRIX_MARKDOWN = create_markdown( MATRIX_MARKDOWN = create_markdown(
escape=True, escape=True,
plugins=["table", "strikethrough", "task_lists", "url", "superscript", "subscript"], plugins=["table", "strikethrough", "url", "superscript", "subscript"],
)
# Sanitizer policy rationale:
# - Baseline follows Matrix formatted message guidance:
# https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
# - We intentionally use a tighter subset than the full spec to keep behavior
# predictable across clients and reduce risk from LLM-generated content.
# - URLs are restricted to common safe schemes for links, and image sources are
# additionally constrained to mxc:// for Matrix-native media handling.
# - Spec items intentionally NOT enabled yet:
# - href schemes ftp/magnet (we keep link schemes smaller for now).
# - a[target] (clients already control link-opening behavior).
# - span[data-mx-bg-color|data-mx-color|data-mx-spoiler|data-mx-maths]
# - div[data-mx-maths]
# These can be added later when we explicitly support those Matrix features.
MATRIX_ALLOWED_HTML_TAGS = {
"p",
"a",
"strong",
"em",
"del",
"code",
"pre",
"blockquote",
"ul",
"ol",
"li",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"hr",
"br",
"table",
"thead",
"tbody",
"tr",
"th",
"td",
"caption",
"sup",
"sub",
"img",
}
MATRIX_ALLOWED_HTML_ATTRIBUTES: dict[str, set[str]] = {
"a": {"href"},
"code": {"class"},
"ol": {"start"},
"img": {"src", "alt", "title", "width", "height"},
}
MATRIX_ALLOWED_URL_SCHEMES = {"https", "http", "matrix", "mailto", "mxc"}
def _filter_matrix_html_attribute(tag: str, attr: str, value: str) -> str | None:
"""Filter attribute values to a safe Matrix-compatible subset."""
if tag == "a" and attr == "href":
lower_value = value.lower()
if lower_value.startswith(("https://", "http://", "matrix:", "mailto:")):
return value
return None
if tag == "img" and attr == "src":
return value if value.lower().startswith("mxc://") else None
if tag == "code" and attr == "class":
classes = [
cls
for cls in value.split()
if cls.startswith("language-") and not cls.startswith("language-_")
]
return " ".join(classes) if classes else None
return value
MATRIX_HTML_CLEANER = nh3.Cleaner(
tags=MATRIX_ALLOWED_HTML_TAGS,
attributes=MATRIX_ALLOWED_HTML_ATTRIBUTES,
attribute_filter=_filter_matrix_html_attribute,
url_schemes=MATRIX_ALLOWED_URL_SCHEMES,
strip_comments=True,
link_rel="noopener noreferrer",
) )
def _render_markdown_html(text: str) -> str | None: def _render_markdown_html(text: str) -> str | None:
"""Render markdown to HTML for Matrix formatted messages.""" """Render markdown to HTML for Matrix formatted messages."""
try: try:
formatted = MATRIX_MARKDOWN(text).strip() rendered = MATRIX_MARKDOWN(text)
formatted = MATRIX_HTML_CLEANER.clean(rendered).strip()
except Exception as e: except Exception as e:
logger.debug( logger.debug(
"Matrix markdown rendering failed ({}): {}", "Matrix markdown rendering failed ({}): {}",

View File

@@ -43,7 +43,8 @@ dependencies = [
"mcp>=1.26.0,<2.0.0", "mcp>=1.26.0,<2.0.0",
"json-repair>=0.57.0,<1.0.0", "json-repair>=0.57.0,<1.0.0",
"matrix-nio[e2e]>=0.25.2", "matrix-nio[e2e]>=0.25.2",
"mistune>=3.0.0", "mistune>=3.0.0,<4.0.0",
"nh3>=0.2.17,<1.0.0",
] ]
[project.optional-dependencies] [project.optional-dependencies]

View File

@@ -421,7 +421,7 @@ async def test_send_adds_formatted_body_for_markdown() -> None:
assert content["format"] == MATRIX_HTML_FORMAT assert content["format"] == MATRIX_HTML_FORMAT
assert "<h1>Headline</h1>" in str(content["formatted_body"]) assert "<h1>Headline</h1>" in str(content["formatted_body"])
assert "<table>" in str(content["formatted_body"]) assert "<table>" in str(content["formatted_body"])
assert "task-list-item-checkbox" in str(content["formatted_body"]) assert "<li>[x] done</li>" in str(content["formatted_body"])
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -439,11 +439,55 @@ async def test_send_adds_formatted_body_for_inline_url_superscript_subscript() -
assert content["msgtype"] == "m.text" assert content["msgtype"] == "m.text"
assert content["body"] == markdown_text assert content["body"] == markdown_text
assert content["format"] == MATRIX_HTML_FORMAT assert content["format"] == MATRIX_HTML_FORMAT
assert '<a href="https://example.com">' in str(content["formatted_body"]) assert '<a href="https://example.com" rel="noopener noreferrer">' in str(
content["formatted_body"]
)
assert "<sup>2</sup>" in str(content["formatted_body"]) assert "<sup>2</sup>" in str(content["formatted_body"])
assert "<sub>2</sub>" in str(content["formatted_body"]) assert "<sub>2</sub>" in str(content["formatted_body"])
@pytest.mark.asyncio
async def test_send_sanitizes_disallowed_link_scheme() -> None:
channel = MatrixChannel(_make_config(), MessageBus())
client = _FakeAsyncClient("", "", "", None)
channel.client = client
markdown_text = "[click](javascript:alert(1))"
await channel.send(
OutboundMessage(channel="matrix", chat_id="!room:matrix.org", content=markdown_text)
)
formatted_body = str(client.room_send_calls[0]["content"]["formatted_body"])
assert "javascript:" not in formatted_body
assert "<a" in formatted_body
assert "href=" not in formatted_body
def test_matrix_html_cleaner_strips_event_handlers_and_script_tags() -> None:
dirty_html = '<a href="https://example.com" onclick="evil()">x</a><script>alert(1)</script>'
cleaned_html = matrix_module.MATRIX_HTML_CLEANER.clean(dirty_html)
assert "<script" not in cleaned_html
assert "onclick=" not in cleaned_html
assert '<a href="https://example.com"' in cleaned_html
@pytest.mark.asyncio
async def test_send_keeps_only_mxc_image_sources() -> None:
channel = MatrixChannel(_make_config(), MessageBus())
client = _FakeAsyncClient("", "", "", None)
channel.client = client
markdown_text = "![ok](mxc://example.org/mediaid) ![no](https://example.com/a.png)"
await channel.send(
OutboundMessage(channel="matrix", chat_id="!room:matrix.org", content=markdown_text)
)
formatted_body = str(client.room_send_calls[0]["content"]["formatted_body"])
assert 'src="mxc://example.org/mediaid"' in formatted_body
assert 'src="https://example.com/a.png"' not in formatted_body
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_send_falls_back_to_plaintext_when_markdown_render_fails(monkeypatch) -> None: async def test_send_falls_back_to_plaintext_when_markdown_render_fails(monkeypatch) -> None:
channel = MatrixChannel(_make_config(), MessageBus()) channel = MatrixChannel(_make_config(), MessageBus())