diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index b42c7b1f6b..dc75abd4fc 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -41,7 +41,15 @@ def extract_content_from_html(html: str) -> str: content = markdownify.markdownify( ret["content"], heading_style=markdownify.ATX, - ) + ).lstrip() + title = ret.get("title") + if title: + title_markdown = f"# {title.strip()}" + # Readability often omits the document title when the article body does + # not repeat it. Include it once so callers can identify fetched pages, + # while avoiding duplicate headings when the title is already present. + if title_markdown.casefold() not in content[: len(title_markdown) + 32].casefold(): + content = f"{title_markdown}\n\n{content}" return content diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py index 96c1cb38c7..f6dd0a880a 100644 --- a/src/fetch/tests/test_server.py +++ b/src/fetch/tests/test_server.py @@ -67,6 +67,43 @@ def test_simple_html(self): # readabilipy may extract different parts depending on the content assert "test paragraph" in result + def test_html_includes_document_title_when_omitted_from_article(self): + """Test that simplified markdown preserves the page title.""" + html = """ + + What’s new in 2.1.0 (Aug 30, 2023) + +
+

These are the release notes.

+
+ + + """ + + result = extract_content_from_html(html) + + assert result.startswith("# What’s new in 2.1.0 (Aug 30, 2023)") + assert "These are the release notes." in result + + def test_html_does_not_duplicate_existing_title_heading(self): + """Test that a page title already present as the first heading is not duplicated.""" + html = """ + + Existing Title + +
+

Existing Title

+

Article body.

+
+ + + """ + + result = extract_content_from_html(html) + + assert result.count("# Existing Title") == 1 + assert "Article body." in result + def test_html_with_links(self): """Test that links are converted to markdown.""" html = """