diff --git a/src/fetch/README.md b/src/fetch/README.md
index 2c3e048927..ebd52bb17c 100644
--- a/src/fetch/README.md
+++ b/src/fetch/README.md
@@ -26,7 +26,8 @@ The fetch tool will truncate the response, but by using the `start_index` argume
## Installation
-Optionally: Install node.js, this will cause the fetch server to use a different HTML simplifier that is more robust.
+By default, the fetch server uses a Python-only HTML simplifier and does not require Node.js. To use Mozilla Readability instead, install Node.js and pass
+`--use-readability-js`; this routes simplification through readabilipy's Node.js backend, which may handle some pages more robustly.
### Using uv (recommended)
@@ -170,6 +171,11 @@ This can be customized by adding the argument `--user-agent=YourUserAgent` to th
The server can be configured to use a proxy by using the `--proxy-url` argument.
+### Customization - Mozilla Readability via Node.js
+
+By default, HTML simplification uses readabilipy's Python-only backend. To opt into Mozilla Readability through readabilipy's Node.js
+backend, install Node.js and add `--use-readability-js` to the `args` list in the configuration.
+
## Windows Configuration
If you're experiencing timeout issues on Windows, you may need to set the `PYTHONIOENCODING` environment variable to ensure proper character encoding:
diff --git a/src/fetch/src/mcp_server_fetch/__init__.py b/src/fetch/src/mcp_server_fetch/__init__.py
index 09744ce319..6891f4be36 100644
--- a/src/fetch/src/mcp_server_fetch/__init__.py
+++ b/src/fetch/src/mcp_server_fetch/__init__.py
@@ -16,9 +16,24 @@ def main():
help="Ignore robots.txt restrictions",
)
parser.add_argument("--proxy-url", type=str, help="Proxy URL to use for requests")
+ parser.add_argument(
+ "--use-readability-js",
+ action="store_true",
+ help=(
+ "Use Mozilla Readability through readabilipy's Node.js backend. "
+ "By default, the server uses the Python-only backend."
+ ),
+ )
args = parser.parse_args()
- asyncio.run(serve(args.user_agent, args.ignore_robots_txt, args.proxy_url))
+ asyncio.run(
+ serve(
+ args.user_agent,
+ args.ignore_robots_txt,
+ args.proxy_url,
+ args.use_readability_js,
+ )
+ )
if __name__ == "__main__":
diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py
index b42c7b1f6b..3a815f6970 100644
--- a/src/fetch/src/mcp_server_fetch/server.py
+++ b/src/fetch/src/mcp_server_fetch/server.py
@@ -24,17 +24,18 @@
DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"
-def extract_content_from_html(html: str) -> str:
+def extract_content_from_html(html: str, *, use_readability_js: bool = False) -> str:
"""Extract and convert HTML content to Markdown format.
Args:
html: Raw HTML content to process
+ use_readability_js: Whether to use Mozilla Readability via Node.js
Returns:
Simplified markdown version of the content
"""
ret = readabilipy.simple_json.simple_json_from_html_string(
- html, use_readability=True
+ html, use_readability=use_readability_js
)
if not ret["content"]:
return "Page failed to be simplified from HTML"
@@ -42,6 +43,8 @@ def extract_content_from_html(html: str) -> str:
ret["content"],
heading_style=markdownify.ATX,
)
+ if not content.strip():
+ return "Page failed to be simplified from HTML"
return content
@@ -63,7 +66,9 @@ def get_robots_txt_url(url: str) -> str:
return robots_url
-async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url: str | None = None) -> None:
+async def check_may_autonomously_fetch_url(
+ url: str, user_agent: str, proxy_url: str | None = None
+) -> None:
"""
Check if the URL can be fetched by the user agent according to the robots.txt file.
Raises a McpError if not.
@@ -80,15 +85,19 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url:
headers={"User-Agent": user_agent},
)
except HTTPError:
- raise McpError(ErrorData(
- code=INTERNAL_ERROR,
- message=f"Failed to fetch robots.txt {robot_txt_url} due to a connection issue",
- ))
+ raise McpError(
+ ErrorData(
+ code=INTERNAL_ERROR,
+ message=f"Failed to fetch robots.txt {robot_txt_url} due to a connection issue",
+ )
+ )
if response.status_code in (401, 403):
- raise McpError(ErrorData(
- code=INTERNAL_ERROR,
- message=f"When fetching robots.txt ({robot_txt_url}), received status {response.status_code} so assuming that autonomous fetching is not allowed, the user can try manually fetching by using the fetch prompt",
- ))
+ raise McpError(
+ ErrorData(
+ code=INTERNAL_ERROR,
+ message=f"When fetching robots.txt ({robot_txt_url}), received status {response.status_code} so assuming that autonomous fetching is not allowed, the user can try manually fetching by using the fetch prompt",
+ )
+ )
elif 400 <= response.status_code < 500:
return
robot_txt = response.text
@@ -97,19 +106,25 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url:
)
robot_parser = Protego.parse(processed_robot_txt)
if not robot_parser.can_fetch(str(url), user_agent):
- raise McpError(ErrorData(
- code=INTERNAL_ERROR,
- message=f"The sites robots.txt ({robot_txt_url}), specifies that autonomous fetching of this page is not allowed, "
- f"{user_agent}\n"
- f"{url}"
- f"\n{robot_txt}\n\n"
- f"The assistant must let the user know that it failed to view the page. The assistant may provide further guidance based on the above information.\n"
- f"The assistant can tell the user that they can try manually fetching the page by using the fetch prompt within their UI.",
- ))
+ raise McpError(
+ ErrorData(
+ code=INTERNAL_ERROR,
+ message=f"The sites robots.txt ({robot_txt_url}), specifies that autonomous fetching of this page is not allowed, "
+ f"{user_agent}\n"
+ f"{url}"
+ f"\n{robot_txt}\n\n"
+ f"The assistant must let the user know that it failed to view the page. The assistant may provide further guidance based on the above information.\n"
+ f"The assistant can tell the user that they can try manually fetching the page by using the fetch prompt within their UI.",
+ )
+ )
async def fetch_url(
- url: str, user_agent: str, force_raw: bool = False, proxy_url: str | None = None
+ url: str,
+ user_agent: str,
+ force_raw: bool = False,
+ proxy_url: str | None = None,
+ use_readability_js: bool = False,
) -> Tuple[str, str]:
"""
Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.
@@ -125,12 +140,16 @@ async def fetch_url(
timeout=30,
)
except HTTPError as e:
- raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Failed to fetch {url}: {e!r}"))
+ raise McpError(
+ ErrorData(code=INTERNAL_ERROR, message=f"Failed to fetch {url}: {e!r}")
+ )
if response.status_code >= 400:
- raise McpError(ErrorData(
- code=INTERNAL_ERROR,
- message=f"Failed to fetch {url} - status code {response.status_code}",
- ))
+ raise McpError(
+ ErrorData(
+ code=INTERNAL_ERROR,
+ message=f"Failed to fetch {url} - status code {response.status_code}",
+ )
+ )
page_raw = response.text
@@ -140,7 +159,9 @@ async def fetch_url(
)
if is_page_html and not force_raw:
- return extract_content_from_html(page_raw), ""
+ return extract_content_from_html(
+ page_raw, use_readability_js=use_readability_js
+ ), ""
return (
page_raw,
@@ -182,6 +203,7 @@ async def serve(
custom_user_agent: str | None = None,
ignore_robots_txt: bool = False,
proxy_url: str | None = None,
+ use_readability_js: bool = False,
) -> None:
"""Run the fetch MCP server.
@@ -189,6 +211,7 @@ async def serve(
custom_user_agent: Optional custom User-Agent string to use for requests
ignore_robots_txt: Whether to ignore robots.txt restrictions
proxy_url: Optional proxy URL to use for requests
+ use_readability_js: Whether to use Mozilla Readability via Node.js
"""
server = Server("mcp-fetch")
user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS
@@ -232,22 +255,32 @@ async def call_tool(name, arguments: dict) -> list[TextContent]:
raise McpError(ErrorData(code=INVALID_PARAMS, message="URL is required"))
if not ignore_robots_txt:
- await check_may_autonomously_fetch_url(url, user_agent_autonomous, proxy_url)
+ await check_may_autonomously_fetch_url(
+ url, user_agent_autonomous, proxy_url
+ )
content, prefix = await fetch_url(
- url, user_agent_autonomous, force_raw=args.raw, proxy_url=proxy_url
+ url,
+ user_agent_autonomous,
+ force_raw=args.raw,
+ proxy_url=proxy_url,
+ use_readability_js=use_readability_js,
)
original_length = len(content)
if args.start_index >= original_length:
content = "No more content available."
else:
- truncated_content = content[args.start_index : args.start_index + args.max_length]
+ truncated_content = content[
+ args.start_index : args.start_index + args.max_length
+ ]
if not truncated_content:
content = "No more content available."
else:
content = truncated_content
actual_content_length = len(truncated_content)
- remaining_content = original_length - (args.start_index + actual_content_length)
+ remaining_content = original_length - (
+ args.start_index + actual_content_length
+ )
# Only add the prompt to continue fetching if there is still remaining content
if actual_content_length == args.max_length and remaining_content > 0:
next_start = args.start_index + actual_content_length
@@ -262,7 +295,12 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
url = arguments["url"]
try:
- content, prefix = await fetch_url(url, user_agent_manual, proxy_url=proxy_url)
+ content, prefix = await fetch_url(
+ url,
+ user_agent_manual,
+ proxy_url=proxy_url,
+ use_readability_js=use_readability_js,
+ )
# TODO: after SDK bug is addressed, don't catch the exception
except McpError as e:
return GetPromptResult(
diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py
index 96c1cb38c7..d3e2b6d193 100644
--- a/src/fetch/tests/test_server.py
+++ b/src/fetch/tests/test_server.py
@@ -87,6 +87,32 @@ def test_empty_content_returns_error(self):
result = extract_content_from_html(html)
assert "" in result
+ def test_uses_python_readability_backend_by_default(self):
+ """Test that HTML extraction does not require Node.js by default."""
+ html = "<p>Hello World</p>"
+
+ with patch(
+ "readabilipy.simple_json.simple_json_from_html_string",
+ return_value={"content": "<p>Hello World</p>"},
+ ) as mock_extract:
+ result = extract_content_from_html(html)
+
+ mock_extract.assert_called_once_with(html, use_readability=False)
+ assert "Hello World" in result
+
+ def test_can_opt_into_readability_js_backend(self):
+ """Test that operators can explicitly opt into the Node.js backend."""
+ html = "<p>Hello World</p>"
+
+ with patch(
+ "readabilipy.simple_json.simple_json_from_html_string",
+ return_value={"content": "<p>Hello World</p>"},
+ ) as mock_extract:
+ result = extract_content_from_html(html, use_readability_js=True)
+
+ mock_extract.assert_called_once_with(html, use_readability=True)
+ assert "Hello World" in result
+
class TestCheckMayAutonomouslyFetchUrl:
"""Tests for check_may_autonomously_fetch_url function."""
@@ -100,13 +126,14 @@ async def test_allows_when_robots_txt_404(self):
with patch("httpx.AsyncClient") as mock_client_class:
mock_client = AsyncMock()
mock_client.get = AsyncMock(return_value=mock_response)
- mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client_class.return_value.__aenter__ = AsyncMock(
+ return_value=mock_client
+ )
mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
# Should not raise
await check_may_autonomously_fetch_url(
- "https://example.com/page",
- DEFAULT_USER_AGENT_AUTONOMOUS
+ "https://example.com/page", DEFAULT_USER_AGENT_AUTONOMOUS
)
@pytest.mark.asyncio
@@ -118,13 +145,14 @@ async def test_blocks_when_robots_txt_401(self):
with patch("httpx.AsyncClient") as mock_client_class:
mock_client = AsyncMock()
mock_client.get = AsyncMock(return_value=mock_response)
- mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client_class.return_value.__aenter__ = AsyncMock(
+ return_value=mock_client
+ )
mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
with pytest.raises(McpError):
await check_may_autonomously_fetch_url(
- "https://example.com/page",
- DEFAULT_USER_AGENT_AUTONOMOUS
+ "https://example.com/page", DEFAULT_USER_AGENT_AUTONOMOUS
)
@pytest.mark.asyncio
@@ -136,13 +164,14 @@ async def test_blocks_when_robots_txt_403(self):
with patch("httpx.AsyncClient") as mock_client_class:
mock_client = AsyncMock()
mock_client.get = AsyncMock(return_value=mock_response)
- mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client_class.return_value.__aenter__ = AsyncMock(
+ return_value=mock_client
+ )
mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
with pytest.raises(McpError):
await check_may_autonomously_fetch_url(
- "https://example.com/page",
- DEFAULT_USER_AGENT_AUTONOMOUS
+ "https://example.com/page", DEFAULT_USER_AGENT_AUTONOMOUS
)
@pytest.mark.asyncio
@@ -155,13 +184,14 @@ async def test_allows_when_robots_txt_allows_all(self):
with patch("httpx.AsyncClient") as mock_client_class:
mock_client = AsyncMock()
mock_client.get = AsyncMock(return_value=mock_response)
- mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client_class.return_value.__aenter__ = AsyncMock(
+ return_value=mock_client
+ )
mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
# Should not raise
await check_may_autonomously_fetch_url(
- "https://example.com/page",
- DEFAULT_USER_AGENT_AUTONOMOUS
+ "https://example.com/page", DEFAULT_USER_AGENT_AUTONOMOUS
)
@pytest.mark.asyncio
@@ -174,13 +204,14 @@ async def test_blocks_when_robots_txt_disallows_all(self):
with patch("httpx.AsyncClient") as mock_client_class:
mock_client = AsyncMock()
mock_client.get = AsyncMock(return_value=mock_response)
- mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client_class.return_value.__aenter__ = AsyncMock(
+ return_value=mock_client
+ )
mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
with pytest.raises(McpError):
await check_may_autonomously_fetch_url(
- "https://example.com/page",
- DEFAULT_USER_AGENT_AUTONOMOUS
+ "https://example.com/page", DEFAULT_USER_AGENT_AUTONOMOUS
)
@@ -207,12 +238,13 @@ async def test_fetch_html_page(self):
with patch("httpx.AsyncClient") as mock_client_class:
mock_client = AsyncMock()
mock_client.get = AsyncMock(return_value=mock_response)
- mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client_class.return_value.__aenter__ = AsyncMock(
+ return_value=mock_client
+ )
mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
content, prefix = await fetch_url(
- "https://example.com/page",
- DEFAULT_USER_AGENT_AUTONOMOUS
+ "https://example.com/page", DEFAULT_USER_AGENT_AUTONOMOUS
)
# HTML is processed, so we check it returns something
@@ -231,13 +263,15 @@ async def test_fetch_html_page_raw(self):
with patch("httpx.AsyncClient") as mock_client_class:
mock_client = AsyncMock()
mock_client.get = AsyncMock(return_value=mock_response)
- mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client_class.return_value.__aenter__ = AsyncMock(
+ return_value=mock_client
+ )
mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
content, prefix = await fetch_url(
"https://example.com/page",
DEFAULT_USER_AGENT_AUTONOMOUS,
- force_raw=True
+ force_raw=True,
)
assert content == html_content
@@ -255,12 +289,13 @@ async def test_fetch_json_returns_raw(self):
with patch("httpx.AsyncClient") as mock_client_class:
mock_client = AsyncMock()
mock_client.get = AsyncMock(return_value=mock_response)
- mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client_class.return_value.__aenter__ = AsyncMock(
+ return_value=mock_client
+ )
mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
content, prefix = await fetch_url(
- "https://api.example.com/data",
- DEFAULT_USER_AGENT_AUTONOMOUS
+ "https://api.example.com/data", DEFAULT_USER_AGENT_AUTONOMOUS
)
assert content == json_content
@@ -275,13 +310,14 @@ async def test_fetch_404_raises_error(self):
with patch("httpx.AsyncClient") as mock_client_class:
mock_client = AsyncMock()
mock_client.get = AsyncMock(return_value=mock_response)
- mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client_class.return_value.__aenter__ = AsyncMock(
+ return_value=mock_client
+ )
mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
with pytest.raises(McpError):
await fetch_url(
- "https://example.com/notfound",
- DEFAULT_USER_AGENT_AUTONOMOUS
+ "https://example.com/notfound", DEFAULT_USER_AGENT_AUTONOMOUS
)
@pytest.mark.asyncio
@@ -293,13 +329,14 @@ async def test_fetch_500_raises_error(self):
with patch("httpx.AsyncClient") as mock_client_class:
mock_client = AsyncMock()
mock_client.get = AsyncMock(return_value=mock_response)
- mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client_class.return_value.__aenter__ = AsyncMock(
+ return_value=mock_client
+ )
mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
with pytest.raises(McpError):
await fetch_url(
- "https://example.com/error",
- DEFAULT_USER_AGENT_AUTONOMOUS
+ "https://example.com/error", DEFAULT_USER_AGENT_AUTONOMOUS
)
@pytest.mark.asyncio
@@ -313,14 +350,18 @@ async def test_fetch_with_proxy(self):
with patch("httpx.AsyncClient") as mock_client_class:
mock_client = AsyncMock()
mock_client.get = AsyncMock(return_value=mock_response)
- mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client_class.return_value.__aenter__ = AsyncMock(
+ return_value=mock_client
+ )
mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
await fetch_url(
"https://example.com/data",
DEFAULT_USER_AGENT_AUTONOMOUS,
- proxy_url="http://proxy.example.com:8080"
+ proxy_url="http://proxy.example.com:8080",
)
# Verify AsyncClient was called with proxy
- mock_client_class.assert_called_once_with(proxy="http://proxy.example.com:8080")
+ mock_client_class.assert_called_once_with(
+ proxy="http://proxy.example.com:8080"
+ )