fix(matrix): sanitize formatted html with nh3
This commit is contained in:
@@ -2,6 +2,7 @@ import asyncio
|
|||||||
import logging
|
import logging
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
import nh3
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from mistune import create_markdown
|
from mistune import create_markdown
|
||||||
from nio import (
|
from nio import (
|
||||||
@@ -26,21 +27,106 @@ MATRIX_HTML_FORMAT = "org.matrix.custom.html"
|
|||||||
|
|
||||||
# Keep plugin output aligned with Matrix recommended HTML tags:
|
# Keep plugin output aligned with Matrix recommended HTML tags:
|
||||||
# https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
|
# https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
|
||||||
# - table/strikethrough/task_lists are already used in replies.
|
# - table/strikethrough are already used in replies.
|
||||||
# - url, superscript, and subscript map to common tags (<a>, <sup>, <sub>)
|
# - url, superscript, and subscript map to common tags (<a>, <sup>, <sub>)
|
||||||
# that Matrix clients (e.g. Element/FluffyChat) can render consistently.
|
# that Matrix clients (e.g. Element/FluffyChat) can render consistently.
|
||||||
# We intentionally avoid plugins that emit less-portable tags to keep output
|
# We intentionally avoid plugins that emit less-portable tags to keep output
|
||||||
# predictable across clients.
|
# predictable across clients.
|
||||||
MATRIX_MARKDOWN = create_markdown(
|
MATRIX_MARKDOWN = create_markdown(
|
||||||
escape=True,
|
escape=True,
|
||||||
plugins=["table", "strikethrough", "task_lists", "url", "superscript", "subscript"],
|
plugins=["table", "strikethrough", "url", "superscript", "subscript"],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Sanitizer policy rationale:
|
||||||
|
# - Baseline follows Matrix formatted message guidance:
|
||||||
|
# https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
|
||||||
|
# - We intentionally use a tighter subset than the full spec to keep behavior
|
||||||
|
# predictable across clients and reduce risk from LLM-generated content.
|
||||||
|
# - URLs are restricted to common safe schemes for links, and image sources are
|
||||||
|
# additionally constrained to mxc:// for Matrix-native media handling.
|
||||||
|
# - Spec items intentionally NOT enabled yet:
|
||||||
|
# - href schemes ftp/magnet (we keep link schemes smaller for now).
|
||||||
|
# - a[target] (clients already control link-opening behavior).
|
||||||
|
# - span[data-mx-bg-color|data-mx-color|data-mx-spoiler|data-mx-maths]
|
||||||
|
# - div[data-mx-maths]
|
||||||
|
# These can be added later when we explicitly support those Matrix features.
|
||||||
|
MATRIX_ALLOWED_HTML_TAGS = {
|
||||||
|
"p",
|
||||||
|
"a",
|
||||||
|
"strong",
|
||||||
|
"em",
|
||||||
|
"del",
|
||||||
|
"code",
|
||||||
|
"pre",
|
||||||
|
"blockquote",
|
||||||
|
"ul",
|
||||||
|
"ol",
|
||||||
|
"li",
|
||||||
|
"h1",
|
||||||
|
"h2",
|
||||||
|
"h3",
|
||||||
|
"h4",
|
||||||
|
"h5",
|
||||||
|
"h6",
|
||||||
|
"hr",
|
||||||
|
"br",
|
||||||
|
"table",
|
||||||
|
"thead",
|
||||||
|
"tbody",
|
||||||
|
"tr",
|
||||||
|
"th",
|
||||||
|
"td",
|
||||||
|
"caption",
|
||||||
|
"sup",
|
||||||
|
"sub",
|
||||||
|
"img",
|
||||||
|
}
|
||||||
|
MATRIX_ALLOWED_HTML_ATTRIBUTES: dict[str, set[str]] = {
|
||||||
|
"a": {"href"},
|
||||||
|
"code": {"class"},
|
||||||
|
"ol": {"start"},
|
||||||
|
"img": {"src", "alt", "title", "width", "height"},
|
||||||
|
}
|
||||||
|
MATRIX_ALLOWED_URL_SCHEMES = {"https", "http", "matrix", "mailto", "mxc"}
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_matrix_html_attribute(tag: str, attr: str, value: str) -> str | None:
|
||||||
|
"""Filter attribute values to a safe Matrix-compatible subset."""
|
||||||
|
if tag == "a" and attr == "href":
|
||||||
|
lower_value = value.lower()
|
||||||
|
if lower_value.startswith(("https://", "http://", "matrix:", "mailto:")):
|
||||||
|
return value
|
||||||
|
return None
|
||||||
|
|
||||||
|
if tag == "img" and attr == "src":
|
||||||
|
return value if value.lower().startswith("mxc://") else None
|
||||||
|
|
||||||
|
if tag == "code" and attr == "class":
|
||||||
|
classes = [
|
||||||
|
cls
|
||||||
|
for cls in value.split()
|
||||||
|
if cls.startswith("language-") and not cls.startswith("language-_")
|
||||||
|
]
|
||||||
|
return " ".join(classes) if classes else None
|
||||||
|
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
MATRIX_HTML_CLEANER = nh3.Cleaner(
|
||||||
|
tags=MATRIX_ALLOWED_HTML_TAGS,
|
||||||
|
attributes=MATRIX_ALLOWED_HTML_ATTRIBUTES,
|
||||||
|
attribute_filter=_filter_matrix_html_attribute,
|
||||||
|
url_schemes=MATRIX_ALLOWED_URL_SCHEMES,
|
||||||
|
strip_comments=True,
|
||||||
|
link_rel="noopener noreferrer",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _render_markdown_html(text: str) -> str | None:
|
def _render_markdown_html(text: str) -> str | None:
|
||||||
"""Render markdown to HTML for Matrix formatted messages."""
|
"""Render markdown to HTML for Matrix formatted messages."""
|
||||||
try:
|
try:
|
||||||
formatted = MATRIX_MARKDOWN(text).strip()
|
rendered = MATRIX_MARKDOWN(text)
|
||||||
|
formatted = MATRIX_HTML_CLEANER.clean(rendered).strip()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Matrix markdown rendering failed ({}): {}",
|
"Matrix markdown rendering failed ({}): {}",
|
||||||
|
|||||||
@@ -43,7 +43,8 @@ dependencies = [
|
|||||||
"mcp>=1.26.0,<2.0.0",
|
"mcp>=1.26.0,<2.0.0",
|
||||||
"json-repair>=0.57.0,<1.0.0",
|
"json-repair>=0.57.0,<1.0.0",
|
||||||
"matrix-nio[e2e]>=0.25.2",
|
"matrix-nio[e2e]>=0.25.2",
|
||||||
"mistune>=3.0.0",
|
"mistune>=3.0.0,<4.0.0",
|
||||||
|
"nh3>=0.2.17,<1.0.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
|||||||
@@ -421,7 +421,7 @@ async def test_send_adds_formatted_body_for_markdown() -> None:
|
|||||||
assert content["format"] == MATRIX_HTML_FORMAT
|
assert content["format"] == MATRIX_HTML_FORMAT
|
||||||
assert "<h1>Headline</h1>" in str(content["formatted_body"])
|
assert "<h1>Headline</h1>" in str(content["formatted_body"])
|
||||||
assert "<table>" in str(content["formatted_body"])
|
assert "<table>" in str(content["formatted_body"])
|
||||||
assert "task-list-item-checkbox" in str(content["formatted_body"])
|
assert "<li>[x] done</li>" in str(content["formatted_body"])
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@@ -439,11 +439,55 @@ async def test_send_adds_formatted_body_for_inline_url_superscript_subscript() -
|
|||||||
assert content["msgtype"] == "m.text"
|
assert content["msgtype"] == "m.text"
|
||||||
assert content["body"] == markdown_text
|
assert content["body"] == markdown_text
|
||||||
assert content["format"] == MATRIX_HTML_FORMAT
|
assert content["format"] == MATRIX_HTML_FORMAT
|
||||||
assert '<a href="https://example.com">' in str(content["formatted_body"])
|
assert '<a href="https://example.com" rel="noopener noreferrer">' in str(
|
||||||
|
content["formatted_body"]
|
||||||
|
)
|
||||||
assert "<sup>2</sup>" in str(content["formatted_body"])
|
assert "<sup>2</sup>" in str(content["formatted_body"])
|
||||||
assert "<sub>2</sub>" in str(content["formatted_body"])
|
assert "<sub>2</sub>" in str(content["formatted_body"])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_send_sanitizes_disallowed_link_scheme() -> None:
|
||||||
|
channel = MatrixChannel(_make_config(), MessageBus())
|
||||||
|
client = _FakeAsyncClient("", "", "", None)
|
||||||
|
channel.client = client
|
||||||
|
|
||||||
|
markdown_text = "[click](javascript:alert(1))"
|
||||||
|
await channel.send(
|
||||||
|
OutboundMessage(channel="matrix", chat_id="!room:matrix.org", content=markdown_text)
|
||||||
|
)
|
||||||
|
|
||||||
|
formatted_body = str(client.room_send_calls[0]["content"]["formatted_body"])
|
||||||
|
assert "javascript:" not in formatted_body
|
||||||
|
assert "<a" in formatted_body
|
||||||
|
assert "href=" not in formatted_body
|
||||||
|
|
||||||
|
|
||||||
|
def test_matrix_html_cleaner_strips_event_handlers_and_script_tags() -> None:
|
||||||
|
dirty_html = '<a href="https://example.com" onclick="evil()">x</a><script>alert(1)</script>'
|
||||||
|
cleaned_html = matrix_module.MATRIX_HTML_CLEANER.clean(dirty_html)
|
||||||
|
|
||||||
|
assert "<script" not in cleaned_html
|
||||||
|
assert "onclick=" not in cleaned_html
|
||||||
|
assert '<a href="https://example.com"' in cleaned_html
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_send_keeps_only_mxc_image_sources() -> None:
|
||||||
|
channel = MatrixChannel(_make_config(), MessageBus())
|
||||||
|
client = _FakeAsyncClient("", "", "", None)
|
||||||
|
channel.client = client
|
||||||
|
|
||||||
|
markdown_text = " "
|
||||||
|
await channel.send(
|
||||||
|
OutboundMessage(channel="matrix", chat_id="!room:matrix.org", content=markdown_text)
|
||||||
|
)
|
||||||
|
|
||||||
|
formatted_body = str(client.room_send_calls[0]["content"]["formatted_body"])
|
||||||
|
assert 'src="mxc://example.org/mediaid"' in formatted_body
|
||||||
|
assert 'src="https://example.com/a.png"' not in formatted_body
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_send_falls_back_to_plaintext_when_markdown_render_fails(monkeypatch) -> None:
|
async def test_send_falls_back_to_plaintext_when_markdown_render_fails(monkeypatch) -> None:
|
||||||
channel = MatrixChannel(_make_config(), MessageBus())
|
channel = MatrixChannel(_make_config(), MessageBus())
|
||||||
|
|||||||
Reference in New Issue
Block a user