diff --git a/nanobot/channels/matrix.py b/nanobot/channels/matrix.py index 61113ac..8240b51 100644 --- a/nanobot/channels/matrix.py +++ b/nanobot/channels/matrix.py @@ -2,6 +2,7 @@ import asyncio import logging from typing import Any +import nh3 from loguru import logger from mistune import create_markdown from nio import ( @@ -26,21 +27,106 @@ MATRIX_HTML_FORMAT = "org.matrix.custom.html" # Keep plugin output aligned with Matrix recommended HTML tags: # https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes -# - table/strikethrough/task_lists are already used in replies. +# - table/strikethrough are already used in replies. # - url, superscript, and subscript map to common tags (<a>, <sup>, <sub>) # that Matrix clients (e.g. Element/FluffyChat) can render consistently. # We intentionally avoid plugins that emit less-portable tags to keep output # predictable across clients. MATRIX_MARKDOWN = create_markdown( escape=True, - plugins=["table", "strikethrough", "task_lists", "url", "superscript", "subscript"], + plugins=["table", "strikethrough", "url", "superscript", "subscript"], +) + +# Sanitizer policy rationale: +# - Baseline follows Matrix formatted message guidance: +# https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes +# - We intentionally use a tighter subset than the full spec to keep behavior +# predictable across clients and reduce risk from LLM-generated content. +# - URLs are restricted to common safe schemes for links, and image sources are +# additionally constrained to mxc:// for Matrix-native media handling. +# - Spec items intentionally NOT enabled yet: +# - href schemes ftp/magnet (we keep link schemes smaller for now). +# - a[target] (clients already control link-opening behavior). +# - span[data-mx-bg-color|data-mx-color|data-mx-spoiler|data-mx-maths] +# - div[data-mx-maths] +# These can be added later when we explicitly support those Matrix features. 
+MATRIX_ALLOWED_HTML_TAGS = { + "p", + "a", + "strong", + "em", + "del", + "code", + "pre", + "blockquote", + "ul", + "ol", + "li", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "hr", + "br", + "table", + "thead", + "tbody", + "tr", + "th", + "td", + "caption", + "sup", + "sub", + "img", +} +MATRIX_ALLOWED_HTML_ATTRIBUTES: dict[str, set[str]] = { + "a": {"href"}, + "code": {"class"}, + "ol": {"start"}, + "img": {"src", "alt", "title", "width", "height"}, +} +MATRIX_ALLOWED_URL_SCHEMES = {"https", "http", "matrix", "mailto", "mxc"} + + +def _filter_matrix_html_attribute(tag: str, attr: str, value: str) -> str | None: + """Filter attribute values to a safe Matrix-compatible subset.""" + if tag == "a" and attr == "href": + lower_value = value.lower() + if lower_value.startswith(("https://", "http://", "matrix:", "mailto:")): + return value + return None + + if tag == "img" and attr == "src": + return value if value.lower().startswith("mxc://") else None + + if tag == "code" and attr == "class": + classes = [ + cls + for cls in value.split() + if cls.startswith("language-") and not cls.startswith("language-_") + ] + return " ".join(classes) if classes else None + + return value + + +MATRIX_HTML_CLEANER = nh3.Cleaner( + tags=MATRIX_ALLOWED_HTML_TAGS, + attributes=MATRIX_ALLOWED_HTML_ATTRIBUTES, + attribute_filter=_filter_matrix_html_attribute, + url_schemes=MATRIX_ALLOWED_URL_SCHEMES, + strip_comments=True, + link_rel="noopener noreferrer", ) def _render_markdown_html(text: str) -> str | None: """Render markdown to HTML for Matrix formatted messages.""" try: - formatted = MATRIX_MARKDOWN(text).strip() + rendered = MATRIX_MARKDOWN(text) + formatted = MATRIX_HTML_CLEANER.clean(rendered).strip() except Exception as e: logger.debug( "Matrix markdown rendering failed ({}): {}", diff --git a/pyproject.toml b/pyproject.toml index 82b37a3..12a1ee8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,8 @@ dependencies = [ "mcp>=1.26.0,<2.0.0", 
"json-repair>=0.57.0,<1.0.0", "matrix-nio[e2e]>=0.25.2", - "mistune>=3.0.0", + "mistune>=3.0.0,<4.0.0", + "nh3>=0.2.17,<1.0.0", ] [project.optional-dependencies] diff --git a/tests/test_matrix_channel.py b/tests/test_matrix_channel.py index 2e3dad2..616b0bc 100644 --- a/tests/test_matrix_channel.py +++ b/tests/test_matrix_channel.py @@ -421,7 +421,7 @@ async def test_send_adds_formatted_body_for_markdown() -> None: assert content["format"] == MATRIX_HTML_FORMAT assert "

Headline

" in str(content["formatted_body"]) assert "" in str(content["formatted_body"]) - assert "task-list-item-checkbox" in str(content["formatted_body"]) + assert "
  • [x] done
  • " in str(content["formatted_body"]) @pytest.mark.asyncio @@ -439,11 +439,55 @@ async def test_send_adds_formatted_body_for_inline_url_superscript_subscript() - assert content["msgtype"] == "m.text" assert content["body"] == markdown_text assert content["format"] == MATRIX_HTML_FORMAT - assert '' in str(content["formatted_body"]) + assert '' in str( + content["formatted_body"] + ) assert "2" in str(content["formatted_body"]) assert "2" in str(content["formatted_body"]) +@pytest.mark.asyncio +async def test_send_sanitizes_disallowed_link_scheme() -> None: + channel = MatrixChannel(_make_config(), MessageBus()) + client = _FakeAsyncClient("", "", "", None) + channel.client = client + + markdown_text = "[click](javascript:alert(1))" + await channel.send( + OutboundMessage(channel="matrix", chat_id="!room:matrix.org", content=markdown_text) + ) + + formatted_body = str(client.room_send_calls[0]["content"]["formatted_body"]) + assert "javascript:" not in formatted_body + assert "x' + cleaned_html = matrix_module.MATRIX_HTML_CLEANER.clean(dirty_html) + + assert " None: + channel = MatrixChannel(_make_config(), MessageBus()) + client = _FakeAsyncClient("", "", "", None) + channel.client = client + + markdown_text = "![ok](mxc://example.org/mediaid) ![no](https://example.com/a.png)" + await channel.send( + OutboundMessage(channel="matrix", chat_id="!room:matrix.org", content=markdown_text) + ) + + formatted_body = str(client.room_send_calls[0]["content"]["formatted_body"]) + assert 'src="mxc://example.org/mediaid"' in formatted_body + assert 'src="https://example.com/a.png"' not in formatted_body + + @pytest.mark.asyncio async def test_send_falls_back_to_plaintext_when_markdown_render_fails(monkeypatch) -> None: channel = MatrixChannel(_make_config(), MessageBus())