fix(matrix): sanitize formatted html with nh3
This commit is contained in:
@@ -2,6 +2,7 @@ import asyncio
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
import nh3
|
||||
from loguru import logger
|
||||
from mistune import create_markdown
|
||||
from nio import (
|
||||
@@ -26,21 +27,106 @@ MATRIX_HTML_FORMAT = "org.matrix.custom.html"
|
||||
|
||||
# Keep plugin output aligned with Matrix recommended HTML tags:
|
||||
# https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
|
||||
# - table/strikethrough/task_lists are already used in replies.
|
||||
# - table/strikethrough are already used in replies.
|
||||
# - url, superscript, and subscript map to common tags (<a>, <sup>, <sub>)
|
||||
# that Matrix clients (e.g. Element/FluffyChat) can render consistently.
|
||||
# We intentionally avoid plugins that emit less-portable tags to keep output
|
||||
# predictable across clients.
|
||||
MATRIX_MARKDOWN = create_markdown(
|
||||
escape=True,
|
||||
plugins=["table", "strikethrough", "task_lists", "url", "superscript", "subscript"],
|
||||
plugins=["table", "strikethrough", "url", "superscript", "subscript"],
|
||||
)
|
||||
|
||||
# Sanitizer policy rationale:
|
||||
# - Baseline follows Matrix formatted message guidance:
|
||||
# https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
|
||||
# - We intentionally use a tighter subset than the full spec to keep behavior
|
||||
# predictable across clients and reduce risk from LLM-generated content.
|
||||
# - URLs are restricted to common safe schemes for links, and image sources are
|
||||
# additionally constrained to mxc:// for Matrix-native media handling.
|
||||
# - Spec items intentionally NOT enabled yet:
|
||||
# - href schemes ftp/magnet (we keep link schemes smaller for now).
|
||||
# - a[target] (clients already control link-opening behavior).
|
||||
# - span[data-mx-bg-color|data-mx-color|data-mx-spoiler|data-mx-maths]
|
||||
# - div[data-mx-maths]
|
||||
# These can be added later when we explicitly support those Matrix features.
|
||||
MATRIX_ALLOWED_HTML_TAGS = {
|
||||
"p",
|
||||
"a",
|
||||
"strong",
|
||||
"em",
|
||||
"del",
|
||||
"code",
|
||||
"pre",
|
||||
"blockquote",
|
||||
"ul",
|
||||
"ol",
|
||||
"li",
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
"h4",
|
||||
"h5",
|
||||
"h6",
|
||||
"hr",
|
||||
"br",
|
||||
"table",
|
||||
"thead",
|
||||
"tbody",
|
||||
"tr",
|
||||
"th",
|
||||
"td",
|
||||
"caption",
|
||||
"sup",
|
||||
"sub",
|
||||
"img",
|
||||
}
|
||||
MATRIX_ALLOWED_HTML_ATTRIBUTES: dict[str, set[str]] = {
|
||||
"a": {"href"},
|
||||
"code": {"class"},
|
||||
"ol": {"start"},
|
||||
"img": {"src", "alt", "title", "width", "height"},
|
||||
}
|
||||
MATRIX_ALLOWED_URL_SCHEMES = {"https", "http", "matrix", "mailto", "mxc"}
|
||||
|
||||
|
||||
def _filter_matrix_html_attribute(tag: str, attr: str, value: str) -> str | None:
|
||||
"""Filter attribute values to a safe Matrix-compatible subset."""
|
||||
if tag == "a" and attr == "href":
|
||||
lower_value = value.lower()
|
||||
if lower_value.startswith(("https://", "http://", "matrix:", "mailto:")):
|
||||
return value
|
||||
return None
|
||||
|
||||
if tag == "img" and attr == "src":
|
||||
return value if value.lower().startswith("mxc://") else None
|
||||
|
||||
if tag == "code" and attr == "class":
|
||||
classes = [
|
||||
cls
|
||||
for cls in value.split()
|
||||
if cls.startswith("language-") and not cls.startswith("language-_")
|
||||
]
|
||||
return " ".join(classes) if classes else None
|
||||
|
||||
return value
|
||||
|
||||
|
||||
MATRIX_HTML_CLEANER = nh3.Cleaner(
|
||||
tags=MATRIX_ALLOWED_HTML_TAGS,
|
||||
attributes=MATRIX_ALLOWED_HTML_ATTRIBUTES,
|
||||
attribute_filter=_filter_matrix_html_attribute,
|
||||
url_schemes=MATRIX_ALLOWED_URL_SCHEMES,
|
||||
strip_comments=True,
|
||||
link_rel="noopener noreferrer",
|
||||
)
|
||||
|
||||
|
||||
def _render_markdown_html(text: str) -> str | None:
|
||||
"""Render markdown to HTML for Matrix formatted messages."""
|
||||
try:
|
||||
formatted = MATRIX_MARKDOWN(text).strip()
|
||||
rendered = MATRIX_MARKDOWN(text)
|
||||
formatted = MATRIX_HTML_CLEANER.clean(rendered).strip()
|
||||
except Exception as e:
|
||||
logger.debug(
|
||||
"Matrix markdown rendering failed ({}): {}",
|
||||
|
||||
@@ -43,7 +43,8 @@ dependencies = [
|
||||
"mcp>=1.26.0,<2.0.0",
|
||||
"json-repair>=0.57.0,<1.0.0",
|
||||
"matrix-nio[e2e]>=0.25.2",
|
||||
"mistune>=3.0.0",
|
||||
"mistune>=3.0.0,<4.0.0",
|
||||
"nh3>=0.2.17,<1.0.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
@@ -421,7 +421,7 @@ async def test_send_adds_formatted_body_for_markdown() -> None:
|
||||
assert content["format"] == MATRIX_HTML_FORMAT
|
||||
assert "<h1>Headline</h1>" in str(content["formatted_body"])
|
||||
assert "<table>" in str(content["formatted_body"])
|
||||
assert "task-list-item-checkbox" in str(content["formatted_body"])
|
||||
assert "<li>[x] done</li>" in str(content["formatted_body"])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -439,11 +439,55 @@ async def test_send_adds_formatted_body_for_inline_url_superscript_subscript() -
|
||||
assert content["msgtype"] == "m.text"
|
||||
assert content["body"] == markdown_text
|
||||
assert content["format"] == MATRIX_HTML_FORMAT
|
||||
assert '<a href="https://example.com">' in str(content["formatted_body"])
|
||||
assert '<a href="https://example.com" rel="noopener noreferrer">' in str(
|
||||
content["formatted_body"]
|
||||
)
|
||||
assert "<sup>2</sup>" in str(content["formatted_body"])
|
||||
assert "<sub>2</sub>" in str(content["formatted_body"])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_send_sanitizes_disallowed_link_scheme() -> None:
|
||||
channel = MatrixChannel(_make_config(), MessageBus())
|
||||
client = _FakeAsyncClient("", "", "", None)
|
||||
channel.client = client
|
||||
|
||||
markdown_text = "[click](javascript:alert(1))"
|
||||
await channel.send(
|
||||
OutboundMessage(channel="matrix", chat_id="!room:matrix.org", content=markdown_text)
|
||||
)
|
||||
|
||||
formatted_body = str(client.room_send_calls[0]["content"]["formatted_body"])
|
||||
assert "javascript:" not in formatted_body
|
||||
assert "<a" in formatted_body
|
||||
assert "href=" not in formatted_body
|
||||
|
||||
|
||||
def test_matrix_html_cleaner_strips_event_handlers_and_script_tags() -> None:
|
||||
dirty_html = '<a href="https://example.com" onclick="evil()">x</a><script>alert(1)</script>'
|
||||
cleaned_html = matrix_module.MATRIX_HTML_CLEANER.clean(dirty_html)
|
||||
|
||||
assert "<script" not in cleaned_html
|
||||
assert "onclick=" not in cleaned_html
|
||||
assert '<a href="https://example.com"' in cleaned_html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_send_keeps_only_mxc_image_sources() -> None:
|
||||
channel = MatrixChannel(_make_config(), MessageBus())
|
||||
client = _FakeAsyncClient("", "", "", None)
|
||||
channel.client = client
|
||||
|
||||
markdown_text = " "
|
||||
await channel.send(
|
||||
OutboundMessage(channel="matrix", chat_id="!room:matrix.org", content=markdown_text)
|
||||
)
|
||||
|
||||
formatted_body = str(client.room_send_calls[0]["content"]["formatted_body"])
|
||||
assert 'src="mxc://example.org/mediaid"' in formatted_body
|
||||
assert 'src="https://example.com/a.png"' not in formatted_body
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_send_falls_back_to_plaintext_when_markdown_render_fails(monkeypatch) -> None:
|
||||
channel = MatrixChannel(_make_config(), MessageBus())
|
||||
|
||||
Reference in New Issue
Block a user