From 9f8f419b8ad0243855506ff4e68f1d648cea49fa Mon Sep 17 00:00:00 2001 From: piexian <64474352+piexian@users.noreply.github.com> Date: Sun, 5 Apr 2026 00:12:03 +0800 Subject: [PATCH 01/10] =?UTF-8?q?feat(websearch):=20=E6=96=B0=E5=A2=9E=20E?= =?UTF-8?q?xa=20=E6=90=9C=E7=B4=A2=E6=8F=90=E4=BE=9B=E5=95=86=EF=BC=8C?= =?UTF-8?q?=E6=94=AF=E6=8C=81=20Tavily/Exa=20API=20Base=20URL=20=E5=8F=AF?= =?UTF-8?q?=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 Exa 搜索提供商,包含三个工具: - web_search_exa:语义搜索,支持 5 种搜索类型和 6 个垂直领域 - exa_extract_web_page:通过 /contents 端点提取网页全文 - exa_find_similar:通过 /findSimilar 端点查找语义相似网页 - Tavily 和 Exa 的 API Base URL 可在 WebUI 中配置,方便代理/自建实例 - 所有联网搜索工具统一添加可配置 timeout 参数(最小 30s) - MessageList.vue 引用解析支持 Exa/BoCha/findSimilar - 更新配置元数据、i18n、路由及 hooks - 更新中英文用户文档,补充 Tavily/BoCha/百度AI搜索的工具参数说明 --- astrbot/builtin_stars/web_searcher/main.py | 402 +++++++++++++++++- astrbot/core/astr_agent_hooks.py | 8 +- astrbot/core/config/default.py | 39 +- astrbot/core/knowledge_base/kb_helper.py | 8 +- .../core/knowledge_base/parsers/url_parser.py | 15 +- astrbot/dashboard/routes/chat.py | 7 +- astrbot/dashboard/routes/live_chat.py | 7 +- dashboard/src/components/chat/MessageList.vue | 5 +- .../en-US/features/config-metadata.json | 12 + .../ru-RU/features/config-metadata.json | 12 + .../zh-CN/features/config-metadata.json | 12 + docs/en/use/websearch.md | 122 +++++- docs/zh/use/websearch.md | 145 ++++++- 13 files changed, 766 insertions(+), 28 deletions(-) diff --git a/astrbot/builtin_stars/web_searcher/main.py b/astrbot/builtin_stars/web_searcher/main.py index cca1b43fb4..14b3e0d90c 100644 --- a/astrbot/builtin_stars/web_searcher/main.py +++ b/astrbot/builtin_stars/web_searcher/main.py @@ -24,6 +24,9 @@ class Main(star.Star): "web_search_tavily", "tavily_extract_web_page", "web_search_bocha", + "web_search_exa", + "exa_extract_web_page", + "exa_find_similar", ] def __init__(self, context: star.Context) -> None: @@ -34,6 +37,9 @@ def __init__(self, context: star.Context) -> None: self.bocha_key_index = 0 self.bocha_key_lock = asyncio.Lock() + self.exa_key_index = 0 + self.exa_key_lock = asyncio.Lock() + # 将 str 类型的 key 迁移至 list[str],并保存 cfg = self.context.get_config() provider_settings = cfg.get("provider_settings") @@ -57,6 +63,14 @@ def __init__(self, context: star.Context) -> None: provider_settings["websearch_bocha_key"] = [] cfg.save_config() + exa_key = provider_settings.get("websearch_exa_key") + if isinstance(exa_key, str): + if exa_key: + provider_settings["websearch_exa_key"] = [exa_key] + else: + provider_settings["websearch_exa_key"] = [] + cfg.save_config() + self.bing_search = Bing() self.sogo_search = Sogo() self.baidu_initialized = False @@ -65,12 +79,16 @@ async def _tidy_text(self, text: str) -> str: """清理文本,去除空格、换行符等""" return text.strip().replace("\n", " ").replace("\r", " ").replace(" ", " ") - async def _get_from_url(self, url: str) -> str: + async def _get_from_url(self, url: str, timeout: int = 30) -> str: """获取网页内容""" + if timeout < 30: + timeout = 30 header = HEADERS header.update({"User-Agent": random.choice(USER_AGENTS)}) async with aiohttp.ClientSession(trust_env=True) as session: - async with session.get(url, headers=header) as response: + async with session.get( + url, headers=header, timeout=aiohttp.ClientTimeout(total=timeout) + ) as response: html = await response.text(encoding="utf-8") doc = Document(html) ret = doc.summary(html_partial=True) @@ -138,10 +156,18 @@ async def _web_search_tavily( self, cfg: AstrBotConfig, payload: dict, + timeout: int = 30, ) -> list[SearchResult]: """使用 Tavily 搜索引擎进行搜索""" tavily_key = await self._get_tavily_key(cfg) - url = "https://api.tavily.com/search" + base_url = ( + cfg.get("provider_settings", {}) + .get("websearch_tavily_base_url", "https://api.tavily.com") + .rstrip("/") + ) + if timeout < 30: + timeout = 30 + url = f"{base_url}/search" header = { "Authorization": f"Bearer {tavily_key}", "Content-Type": "application/json", @@ -151,6 +177,7 @@ async def _web_search_tavily( url, json=payload, headers=header, + timeout=aiohttp.ClientTimeout(total=timeout), ) as response: if response.status != 200: reason = await response.text() @@ -169,10 +196,19 @@ async def _web_search_tavily( results.append(result) return results - async def _extract_tavily(self, cfg: AstrBotConfig, payload: dict) -> list[dict]: + async def _extract_tavily( + self, cfg: AstrBotConfig, payload: dict, timeout: int = 30 + ) -> list[dict]: """使用 Tavily 提取网页内容""" tavily_key = await self._get_tavily_key(cfg) - url = "https://api.tavily.com/extract" + base_url = ( + cfg.get("provider_settings", {}) + .get("websearch_tavily_base_url", "https://api.tavily.com") + .rstrip("/") + ) + if timeout < 30: + timeout = 30 + url = f"{base_url}/extract" header = { "Authorization": f"Bearer {tavily_key}", "Content-Type": "application/json", @@ -182,6 +218,7 @@ async def _extract_tavily(self, cfg: AstrBotConfig, payload: dict) -> list[dict] url, json=payload, headers=header, + timeout=aiohttp.ClientTimeout(total=timeout), ) as response: if response.status != 200: reason = await response.text() @@ -261,14 +298,19 @@ async def ensure_baidu_ai_search_mcp(self, umo: str | None = None) -> None: logger.info("Successfully initialized Baidu AI Search MCP server.") @llm_tool(name="fetch_url") - async def fetch_website_content(self, event: AstrMessageEvent, url: str) -> str: + async def fetch_website_content( + self, event: AstrMessageEvent, url: str, timeout: int = 30 + ) -> str: """Fetch the content of a website with the given web url Args: url(string): The url of the website to fetch content from + timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. """ - resp = await self._get_from_url(url) + if timeout < 30: + timeout = 30 + resp = await self._get_from_url(url, timeout=timeout) return resp @llm_tool("web_search_tavily") @@ -283,6 +325,7 @@ async def search_from_tavily( time_range: str = "", start_date: str = "", end_date: str = "", + timeout: int = 30, ) -> str: """A web search tool that uses Tavily to search the web for relevant content. Ideal for gathering current information, news, and detailed web content analysis. @@ -296,8 +339,11 @@ async def search_from_tavily( time_range(string): Optional. The time range back from the current date to include in the search results. This feature is available for both 'general' and 'news' search topics. Must be one of 'day', 'week', 'month', 'year'. start_date(string): Optional. The start date for the search results in the format 'YYYY-MM-DD'. end_date(string): Optional. The end date for the search results in the format 'YYYY-MM-DD'. + timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. """ + if timeout < 30: + timeout = 30 logger.info(f"web_searcher - search_from_tavily: {query}") cfg = self.context.get_config(umo=event.unified_msg_origin) # websearch_link = cfg["provider_settings"].get("web_search_link", False) @@ -324,7 +370,7 @@ async def search_from_tavily( if end_date: payload["end_date"] = end_date - results = await self._web_search_tavily(cfg, payload) + results = await self._web_search_tavily(cfg, payload, timeout=timeout) if not results: return "Error: Tavily web searcher does not return any results." @@ -353,14 +399,18 @@ async def tavily_extract_web_page( event: AstrMessageEvent, url: str = "", extract_depth: str = "basic", + timeout: int = 30, ) -> str: """Extract the content of a web page using Tavily. Args: url(string): Required. An URl to extract content from. extract_depth(string): Optional. The depth of the extraction, must be one of 'basic', 'advanced'. Default is "basic". + timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. """ + if timeout < 30: + timeout = 30 cfg = self.context.get_config(umo=event.unified_msg_origin) if not cfg.get("provider_settings", {}).get("websearch_tavily_key", []): raise ValueError("Error: Tavily API key is not configured in AstrBot.") @@ -373,7 +423,7 @@ async def tavily_extract_web_page( "urls": [url], "extract_depth": extract_depth, } - results = await self._extract_tavily(cfg, payload) + results = await self._extract_tavily(cfg, payload, timeout=timeout) ret_ls = [] for result in results: ret_ls.append(f"URL: {result.get('url', 'No URL')}") @@ -398,9 +448,12 @@ async def _web_search_bocha( self, cfg: AstrBotConfig, payload: dict, + timeout: int = 30, ) -> list[SearchResult]: """使用 BoCha 搜索引擎进行搜索""" bocha_key = await self._get_bocha_key(cfg) + if timeout < 30: + timeout = 30 url = "https://api.bochaai.com/v1/web-search" header = { "Authorization": f"Bearer {bocha_key}", @@ -411,6 +464,7 @@ async def _web_search_bocha( url, json=payload, headers=header, + timeout=aiohttp.ClientTimeout(total=timeout), ) as response: if response.status != 200: reason = await response.text() @@ -440,6 +494,7 @@ async def search_from_bocha( include: str = "", exclude: str = "", count: int = 10, + timeout: int = 30, ) -> str: """ A web search tool based on Bocha Search API, used to retrieve web pages @@ -487,7 +542,11 @@ async def search_from_bocha( - Default: 10 The actual number of returned results may be less than the specified count. + + timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. """ + if timeout < 30: + timeout = 30 logger.info(f"web_searcher - search_from_bocha: {query}") cfg = self.context.get_config(umo=event.unified_msg_origin) # websearch_link = cfg["provider_settings"].get("web_search_link", False) @@ -515,7 +574,7 @@ async def search_from_bocha( if exclude: payload["exclude"] = exclude - results = await self._web_search_bocha(cfg, payload) + results = await self._web_search_bocha(cfg, payload, timeout=timeout) if not results: return "Error: BoCha web searcher does not return any results." @@ -537,6 +596,301 @@ async def search_from_bocha( ret = json.dumps({"results": ret_ls}, ensure_ascii=False) return ret + async def _get_exa_key(self, cfg: AstrBotConfig) -> str: + """并发安全的从列表中获取并轮换 Exa API 密钥。""" + exa_keys = cfg.get("provider_settings", {}).get("websearch_exa_key", []) + if not exa_keys: + raise ValueError("错误:Exa API 密钥未在 AstrBot 中配置。") + + async with self.exa_key_lock: + key = exa_keys[self.exa_key_index] + self.exa_key_index = (self.exa_key_index + 1) % len(exa_keys) + return key + + async def _web_search_exa( + self, + cfg: AstrBotConfig, + payload: dict, + timeout: int = 30, + ) -> list[SearchResult]: + """使用 Exa 搜索引擎进行搜索""" + exa_key = await self._get_exa_key(cfg) + base_url = ( + cfg.get("provider_settings", {}) + .get("websearch_exa_base_url", "https://api.exa.ai") + .rstrip("/") + ) + url = f"{base_url}/search" + header = { + "x-api-key": exa_key, + "Content-Type": "application/json", + } + async with aiohttp.ClientSession(trust_env=True) as session: + async with session.post( + url, + json=payload, + headers=header, + timeout=aiohttp.ClientTimeout(total=timeout), + ) as response: + if response.status != 200: + reason = await response.text() + raise Exception( + f"Exa web search failed: {reason}, status: {response.status}", + ) + data = await response.json() + results = [] + for item in data.get("results", []): + result = SearchResult( + title=item.get("title", ""), + url=item.get("url", ""), + snippet=(item.get("text") or "")[:500], + ) + results.append(result) + return results + + @llm_tool("web_search_exa") + async def search_from_exa( + self, + event: AstrMessageEvent, + query: str, + max_results: int = 10, + search_type: str = "auto", + category: str = "", + timeout: int = 30, + ) -> str: + """A web search tool that uses Exa to search the web for relevant content. + Ideal for gathering current information with semantic search capabilities. + Supports vertical search categories: company, people, research paper, news, personal site, financial report. + + Args: + query(string): Required. Search query. + max_results(number): Optional. The maximum number of results to return. Default is 10. Range is 1-100. + search_type(string): Optional. The type of search, must be one of 'auto', 'neural', 'fast', 'instant', 'deep'. Default is "auto". + category(string): Optional. The category of search. Supported values: 'company'(50M+ company pages), 'people'(1B+ people profiles), 'research paper'(100M+ papers), 'news', 'personal site', 'financial report'. Default is empty (general web search). + timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. + + """ + if timeout < 30: + timeout = 30 + logger.info(f"web_searcher - search_from_exa: {query}") + cfg = self.context.get_config(umo=event.unified_msg_origin) + if not cfg.get("provider_settings", {}).get("websearch_exa_key", []): + raise ValueError("Error: Exa API key is not configured in AstrBot.") + + valid_types = ("auto", "neural", "fast", "instant", "deep") + if search_type not in valid_types: + search_type = "auto" + + max_results = max(1, min(max_results, 100)) + + payload = { + "query": query, + "numResults": max_results, + "type": search_type, + "contents": {"text": {"maxCharacters": 500}}, + } + + valid_categories = ( + "company", + "people", + "research paper", + "news", + "personal site", + "financial report", + ) + if category in valid_categories: + payload["category"] = category + + results = await self._web_search_exa(cfg, payload, timeout=timeout) + if not results: + return "Error: Exa web searcher does not return any results." + + ret_ls = [] + ref_uuid = str(uuid.uuid4())[:4] + for idx, result in enumerate(results, 1): + index = f"{ref_uuid}.{idx}" + ret_ls.append( + { + "title": result.title, + "url": result.url, + "snippet": result.snippet, + "index": index, + } + ) + ret = json.dumps({"results": ret_ls}, ensure_ascii=False) + return ret + + async def _extract_exa( + self, cfg: AstrBotConfig, payload: dict, timeout: int = 30 + ) -> list[dict]: + """使用 Exa 提取网页内容""" + exa_key = await self._get_exa_key(cfg) + base_url = ( + cfg.get("provider_settings", {}) + .get("websearch_exa_base_url", "https://api.exa.ai") + .rstrip("/") + ) + if timeout < 30: + timeout = 30 + url = f"{base_url}/contents" + header = { + "x-api-key": exa_key, + "Content-Type": "application/json", + } + async with aiohttp.ClientSession(trust_env=True) as session: + async with session.post( + url, + json=payload, + headers=header, + timeout=aiohttp.ClientTimeout(total=timeout), + ) as response: + if response.status != 200: + reason = await response.text() + raise Exception( + f"Exa content extraction failed: {reason}, status: {response.status}", + ) + data = await response.json() + results: list[dict] = data.get("results", []) + if not results: + raise ValueError( + "Error: Exa content extraction does not return any results.", + ) + return results + + @llm_tool("exa_extract_web_page") + async def exa_extract_web_page( + self, + event: AstrMessageEvent, + url: str = "", + timeout: int = 30, + ) -> str: + """Extract the content of a web page using Exa. + Use this tool when the user wants to extract or summarize content from a specific URL. + + Args: + url(string): Required. A URL to extract content from. + timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. + + """ + if timeout < 30: + timeout = 30 + cfg = self.context.get_config(umo=event.unified_msg_origin) + if not cfg.get("provider_settings", {}).get("websearch_exa_key", []): + raise ValueError("Error: Exa API key is not configured in AstrBot.") + + if not url: + raise ValueError("Error: url must be a non-empty string.") + + payload = { + "urls": [url], + "text": True, + } + + results = await self._extract_exa(cfg, payload, timeout=timeout) + ret_ls = [] + for result in results: + ret_ls.append(f"URL: {result.get('url', 'No URL')}") + text = await self._tidy_text(result.get("text", "No content")) + ret_ls.append(f"Content: {text}") + ret = "\n".join(ret_ls) + if not ret: + return "Error: Exa content extraction does not return any results." + return ret + + async def _find_similar_exa( + self, cfg: AstrBotConfig, payload: dict, timeout: int = 30 + ) -> list[SearchResult]: + """使用 Exa 查找相似链接""" + exa_key = await self._get_exa_key(cfg) + base_url = ( + cfg.get("provider_settings", {}) + .get("websearch_exa_base_url", "https://api.exa.ai") + .rstrip("/") + ) + if timeout < 30: + timeout = 30 + url = f"{base_url}/findSimilar" + header = { + "x-api-key": exa_key, + "Content-Type": "application/json", + } + async with aiohttp.ClientSession(trust_env=True) as session: + async with session.post( + url, + json=payload, + headers=header, + timeout=aiohttp.ClientTimeout(total=timeout), + ) as response: + if response.status != 200: + reason = await response.text() + raise Exception( + f"Exa find similar failed: {reason}, status: {response.status}", + ) + data = await response.json() + results = [] + for item in data.get("results", []): + result = SearchResult( + title=item.get("title", ""), + url=item.get("url", ""), + snippet=(item.get("text") or "")[:500], + ) + results.append(result) + return results + + @llm_tool("exa_find_similar") + async def find_similar_links( + self, + event: AstrMessageEvent, + url: str, + max_results: int = 10, + timeout: int = 30, + ) -> str: + """Find web pages that are semantically similar to a given URL. + Use this tool when the user wants to discover content related to a specific webpage they have found interesting. + + Args: + url(string): Required. The URL of the webpage to find similar content for. + max_results(number): Optional. The maximum number of similar results to return. Default is 10. Range is 1-100. + timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. + + """ + if timeout < 30: + timeout = 30 + logger.info(f"web_searcher - find_similar_links: {url}") + cfg = self.context.get_config(umo=event.unified_msg_origin) + if not cfg.get("provider_settings", {}).get("websearch_exa_key", []): + raise ValueError("Error: Exa API key is not configured in AstrBot.") + + if not url: + raise ValueError("Error: url must be a non-empty string.") + + num = max(1, min(max_results, 100)) + + payload = { + "url": url, + "numResults": num, + "contents": {"text": {"maxCharacters": 500}}, + } + + results = await self._find_similar_exa(cfg, payload, timeout=timeout) + if not results: + return "Error: Exa find similar does not return any results." + + ret_ls = [] + ref_uuid = str(uuid.uuid4())[:4] + for idx, result in enumerate(results, 1): + index = f"{ref_uuid}.{idx}" + ret_ls.append( + { + "title": result.title, + "url": result.url, + "snippet": result.snippet, + "index": index, + } + ) + ret = json.dumps({"results": ret_ls}, ensure_ascii=False) + return ret + @filter.on_llm_request(priority=-10000) async def edit_web_search_tools( self, @@ -575,6 +929,9 @@ async def edit_web_search_tools( tool_set.remove_tool("tavily_extract_web_page") tool_set.remove_tool("AIsearch") tool_set.remove_tool("web_search_bocha") + tool_set.remove_tool("web_search_exa") + tool_set.remove_tool("exa_extract_web_page") + tool_set.remove_tool("exa_find_similar") elif provider == "tavily": web_search_tavily = func_tool_mgr.get_func("web_search_tavily") tavily_extract_web_page = func_tool_mgr.get_func("tavily_extract_web_page") @@ -586,6 +943,9 @@ async def edit_web_search_tools( tool_set.remove_tool("fetch_url") tool_set.remove_tool("AIsearch") tool_set.remove_tool("web_search_bocha") + tool_set.remove_tool("web_search_exa") + tool_set.remove_tool("exa_extract_web_page") + tool_set.remove_tool("exa_find_similar") elif provider == "baidu_ai_search": try: await self.ensure_baidu_ai_search_mcp(event.unified_msg_origin) @@ -597,6 +957,9 @@ async def edit_web_search_tools( tool_set.remove_tool("web_search_tavily") tool_set.remove_tool("tavily_extract_web_page") tool_set.remove_tool("web_search_bocha") + tool_set.remove_tool("web_search_exa") + tool_set.remove_tool("exa_extract_web_page") + tool_set.remove_tool("exa_find_similar") except Exception as e: logger.error(f"Cannot Initialize Baidu AI Search MCP Server: {e}") elif provider == "bocha": @@ -608,3 +971,22 @@ async def edit_web_search_tools( tool_set.remove_tool("AIsearch") tool_set.remove_tool("web_search_tavily") tool_set.remove_tool("tavily_extract_web_page") + tool_set.remove_tool("web_search_exa") + tool_set.remove_tool("exa_extract_web_page") + tool_set.remove_tool("exa_find_similar") + elif provider == "exa": + web_search_exa = func_tool_mgr.get_func("web_search_exa") + exa_extract_web_page = func_tool_mgr.get_func("exa_extract_web_page") + exa_find_similar = func_tool_mgr.get_func("exa_find_similar") + if web_search_exa and web_search_exa.active: + tool_set.add_tool(web_search_exa) + if exa_extract_web_page and exa_extract_web_page.active: + tool_set.add_tool(exa_extract_web_page) + if exa_find_similar and exa_find_similar.active: + tool_set.add_tool(exa_find_similar) + tool_set.remove_tool("web_search") + tool_set.remove_tool("fetch_url") + tool_set.remove_tool("AIsearch") + tool_set.remove_tool("web_search_tavily") + tool_set.remove_tool("tavily_extract_web_page") + tool_set.remove_tool("web_search_bocha") diff --git a/astrbot/core/astr_agent_hooks.py b/astrbot/core/astr_agent_hooks.py index 09bf32deb4..86f8a6c5b2 100644 --- a/astrbot/core/astr_agent_hooks.py +++ b/astrbot/core/astr_agent_hooks.py @@ -59,7 +59,13 @@ async def on_tool_end( platform_name = run_context.context.event.get_platform_name() if ( platform_name == "webchat" - and tool.name in ["web_search_tavily", "web_search_bocha"] + and tool.name + in [ + "web_search_tavily", + "web_search_bocha", + "web_search_exa", + "exa_find_similar", + ] and len(run_context.messages) > 0 and tool_result and len(tool_result.content) diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py index 45412bdccb..e34ebc408b 100644 --- a/astrbot/core/config/default.py +++ b/astrbot/core/config/default.py @@ -108,8 +108,11 @@ "web_search": False, "websearch_provider": "default", "websearch_tavily_key": [], + "websearch_tavily_base_url": "https://api.tavily.com", "websearch_bocha_key": [], "websearch_baidu_app_builder_key": "", + "websearch_exa_key": [], + "websearch_exa_base_url": "https://api.exa.ai", "web_search_link": False, "display_reasoning_text": False, "identifier": False, @@ -3084,7 +3087,13 @@ class ChatProviderTemplate(TypedDict): "provider_settings.websearch_provider": { "description": "网页搜索提供商", "type": "string", - "options": ["default", "tavily", "baidu_ai_search", "bocha"], + "options": [ + "default", + "tavily", + "baidu_ai_search", + "bocha", + "exa", + ], "condition": { "provider_settings.web_search": True, }, @@ -3117,6 +3126,34 @@ class ChatProviderTemplate(TypedDict): "provider_settings.websearch_provider": "baidu_ai_search", }, }, + "provider_settings.websearch_tavily_base_url": { + "description": "Tavily API Base URL", + "type": "string", + "hint": "默认为 https://api.tavily.com,可改为代理地址。", + "condition": { + "provider_settings.websearch_provider": "tavily", + "provider_settings.web_search": True, + }, + }, + "provider_settings.websearch_exa_key": { + "description": "Exa API Key", + "type": "list", + "items": {"type": "string"}, + "hint": "可添加多个 Key 进行轮询。", + "condition": { + "provider_settings.websearch_provider": "exa", + "provider_settings.web_search": True, + }, + }, + "provider_settings.websearch_exa_base_url": { + "description": "Exa API Base URL", + "type": "string", + "hint": "默认为 https://api.exa.ai,可改为代理地址。", + "condition": { + "provider_settings.websearch_provider": "exa", + "provider_settings.web_search": True, + }, + }, "provider_settings.web_search_link": { "description": "显示来源引用", "type": "bool", diff --git a/astrbot/core/knowledge_base/kb_helper.py b/astrbot/core/knowledge_base/kb_helper.py index cb74cb2ba8..912ebbbe80 100644 --- a/astrbot/core/knowledge_base/kb_helper.py +++ b/astrbot/core/knowledge_base/kb_helper.py @@ -518,12 +518,18 @@ async def upload_from_url( "Error: Tavily API key is not configured in provider_settings." ) + tavily_base_url = config.get("provider_settings", {}).get( + "websearch_tavily_base_url", "https://api.tavily.com" + ) + # 阶段1: 从 URL 提取内容 if progress_callback: await progress_callback("extracting", 0, 100) try: - text_content = await extract_text_from_url(url, tavily_keys) + text_content = await extract_text_from_url( + url, tavily_keys, tavily_base_url + ) except Exception as e: logger.error(f"Failed to extract content from URL {url}: {e}") raise OSError(f"Failed to extract content from URL {url}: {e}") from e diff --git a/astrbot/core/knowledge_base/parsers/url_parser.py b/astrbot/core/knowledge_base/parsers/url_parser.py index 2867164a96..d0c41cafa3 100644 --- a/astrbot/core/knowledge_base/parsers/url_parser.py +++ b/astrbot/core/knowledge_base/parsers/url_parser.py @@ -6,12 +6,15 @@ class URLExtractor: """URL 内容提取器,封装了 Tavily API 调用和密钥管理""" - def __init__(self, tavily_keys: list[str]) -> None: + def __init__( + self, tavily_keys: list[str], tavily_base_url: str = "https://api.tavily.com" + ) -> None: """ 初始化 URL 提取器 Args: tavily_keys: Tavily API 密钥列表 + tavily_base_url: Tavily API 基础 URL """ if not tavily_keys: raise ValueError("Error: Tavily API keys are not configured.") @@ -19,6 +22,7 @@ def __init__(self, tavily_keys: list[str]) -> None: self.tavily_keys = tavily_keys self.tavily_key_index = 0 self.tavily_key_lock = asyncio.Lock() + self.tavily_base_url = tavily_base_url.rstrip("/") async def _get_tavily_key(self) -> str: """并发安全的从列表中获取并轮换Tavily API密钥。""" @@ -47,7 +51,7 @@ async def extract_text_from_url(self, url: str) -> str: raise ValueError("Error: url must be a non-empty string.") tavily_key = await self._get_tavily_key() - api_url = "https://api.tavily.com/extract" + api_url = f"{self.tavily_base_url}/extract" headers = { "Authorization": f"Bearer {tavily_key}", "Content-Type": "application/json", @@ -88,16 +92,19 @@ async def extract_text_from_url(self, url: str) -> str: # 为了向后兼容,提供一个简单的函数接口 -async def extract_text_from_url(url: str, tavily_keys: list[str]) -> str: +async def extract_text_from_url( + url: str, tavily_keys: list[str], tavily_base_url: str = "https://api.tavily.com" +) -> str: """ 简单的函数接口,用于从 URL 提取文本内容 Args: url: 要提取内容的网页 URL tavily_keys: Tavily API 密钥列表 + tavily_base_url: Tavily API 基础 URL Returns: 提取的文本内容 """ - extractor = URLExtractor(tavily_keys) + extractor = URLExtractor(tavily_keys, tavily_base_url) return await extractor.extract_text_from_url(url) diff --git a/astrbot/dashboard/routes/chat.py b/astrbot/dashboard/routes/chat.py index a4173ed843..4c4fd0ce84 100644 --- a/astrbot/dashboard/routes/chat.py +++ b/astrbot/dashboard/routes/chat.py @@ -224,7 +224,12 @@ def _extract_web_search_refs( Returns: 包含 used 列表的字典,记录被引用的搜索结果 """ - supported = ["web_search_tavily", "web_search_bocha"] + supported = [ + "web_search_tavily", + "web_search_bocha", + "web_search_exa", + "exa_find_similar", + ] # 从 accumulated_parts 中找到所有 web_search_tavily 的工具调用结果 web_search_results = {} tool_call_parts = [ diff --git a/astrbot/dashboard/routes/live_chat.py b/astrbot/dashboard/routes/live_chat.py index 8d0af938d0..25310cf61a 100644 --- a/astrbot/dashboard/routes/live_chat.py +++ b/astrbot/dashboard/routes/live_chat.py @@ -198,7 +198,12 @@ def _extract_web_search_refs( self, accumulated_text: str, accumulated_parts: list ) -> dict: """从消息中提取 web_search 引用。""" - supported = ["web_search_tavily", "web_search_bocha"] + supported = [ + "web_search_tavily", + "web_search_bocha", + "web_search_exa", + "exa_find_similar", + ] web_search_results = {} tool_call_parts = [ p diff --git a/dashboard/src/components/chat/MessageList.vue b/dashboard/src/components/chat/MessageList.vue index ca86331a86..63c8fabb2c 100644 --- a/dashboard/src/components/chat/MessageList.vue +++ b/dashboard/src/components/chat/MessageList.vue @@ -302,8 +302,9 @@ export default { } part.tool_calls.forEach(toolCall => { - // 检查是否是 web_search_tavily 工具调用 - if (toolCall.name !== 'web_search_tavily' || !toolCall.result) { + // 检查是否是网页搜索工具调用 + const supportedTools = ['web_search_tavily', 'web_search_bocha', 'web_search_exa', 'exa_find_similar']; + if (!supportedTools.includes(toolCall.name) || !toolCall.result) { return; } diff --git a/dashboard/src/i18n/locales/en-US/features/config-metadata.json b/dashboard/src/i18n/locales/en-US/features/config-metadata.json index 9ae8672826..d28c019f70 100644 --- a/dashboard/src/i18n/locales/en-US/features/config-metadata.json +++ b/dashboard/src/i18n/locales/en-US/features/config-metadata.json @@ -117,6 +117,10 @@ "description": "Tavily API Key", "hint": "Multiple keys can be added for rotation." }, + "websearch_tavily_base_url": { + "description": "Tavily API Base URL", + "hint": "Default: https://api.tavily.com. Change to use a proxy or self-hosted instance." + }, "websearch_bocha_key": { "description": "BoCha API Key", "hint": "Multiple keys can be added for rotation." @@ -125,6 +129,14 @@ "description": "Baidu Qianfan Smart Cloud APP Builder API Key", "hint": "Reference: [https://console.bce.baidu.com/iam/#/iam/apikey/list](https://console.bce.baidu.com/iam/#/iam/apikey/list)" }, + "websearch_exa_key": { + "description": "Exa API Key", + "hint": "Multiple keys can be added for rotation." + }, + "websearch_exa_base_url": { + "description": "Exa API Base URL", + "hint": "Default: https://api.exa.ai. Change to use a proxy or self-hosted instance." + }, "web_search_link": { "description": "Display Source Citations" } diff --git a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json index 0aa5c791ac..bdc153a763 100644 --- a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json +++ b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json @@ -117,6 +117,10 @@ "description": "API-ключ Tavily", "hint": "Можно добавить несколько ключей для ротации." }, + "websearch_tavily_base_url": { + "description": "Базовый URL API Tavily", + "hint": "По умолчанию: https://api.tavily.com. Можно изменить на прокси-адрес." + }, "websearch_bocha_key": { "description": "API-ключ BoCha", "hint": "Можно добавить несколько ключей для ротации." @@ -125,6 +129,14 @@ "description": "API-ключ Baidu Qianfan APP Builder", "hint": "Ссылка: [https://console.bce.baidu.com/iam/#/iam/apikey/list](https://console.bce.baidu.com/iam/#/iam/apikey/list)" }, + "websearch_exa_key": { + "description": "API-ключ Exa", + "hint": "Можно добавить несколько ключей для ротации." + }, + "websearch_exa_base_url": { + "description": "Базовый URL API Exa", + "hint": "По умолчанию: https://api.exa.ai. Можно изменить на прокси-адрес." + }, "web_search_link": { "description": "Показывать ссылки на источники" } diff --git a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json index c04138402e..5d3da77ab1 100644 --- a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json +++ b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json @@ -119,6 +119,10 @@ "description": "Tavily API Key", "hint": "可添加多个 Key 进行轮询。" }, + "websearch_tavily_base_url": { + "description": "Tavily API Base URL", + "hint": "默认为 https://api.tavily.com,可改为代理地址。" + }, "websearch_bocha_key": { "description": "BoCha API Key", "hint": "可添加多个 Key 进行轮询。" @@ -127,6 +131,14 @@ "description": "百度千帆智能云 APP Builder API Key", "hint": "参考:[https://console.bce.baidu.com/iam/#/iam/apikey/list](https://console.bce.baidu.com/iam/#/iam/apikey/list)" }, + "websearch_exa_key": { + "description": "Exa API Key", + "hint": "可添加多个 Key 进行轮询。" + }, + "websearch_exa_base_url": { + "description": "Exa API Base URL", + "hint": "默认为 https://api.exa.ai,可改为代理地址。" + }, "web_search_link": { "description": "显示来源引用" } diff --git a/docs/en/use/websearch.md b/docs/en/use/websearch.md index 82e77bb937..f226b4357c 100644 --- a/docs/en/use/websearch.md +++ b/docs/en/use/websearch.md @@ -14,18 +14,28 @@ When using a large language model that supports function calling with the web se And other prompts with search intent to trigger the model to invoke the search tool. -AstrBot supports 3 types of web search source integration: `default`, `Tavily`, and `Baidu AI Search`. +AstrBot supports 5 types of web search source integration: `default`, `Tavily`, `Baidu AI Search`, `BoCha`, and `Exa`. -The former uses AstrBot's built-in web search requester to query Google, Bing, and Sogou search engines, performing best in network environments with Google access. **We recommend using Tavily**. +The former uses AstrBot's built-in web search requester to query Google, Bing, and Sogou search engines, performing best in network environments with Google access. **We recommend using Tavily or Exa**. ![image](https://files.astrbot.app/docs/source/images/websearch/image.png) -Go to `Configuration`, scroll down to find Web Search, where you can select `default` (default, not recommended) or `Tavily`. +Go to `Configuration`, scroll down to find Web Search, where you can select `default` (default, not recommended) or `Tavily` or `Exa`. ### default (Not Recommended) If your device is in China and you have a proxy, you can enable the proxy and enter the HTTP proxy address in `Admin Panel - Other Configuration - HTTP Proxy` to apply the proxy. +The default provider exposes two tools: + +- **`web_search`** — Searches the web via Bing and Sogou engines. +- **`fetch_url`** — Extracts the full text content from any given URL. Useful for reading and summarizing web pages when search result snippets are not sufficient. Parameters: + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `url` | string | Yes | — | The URL of the web page to fetch content from | +| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | + ### Tavily Go to [Tavily](https://app.tavily.com/home) to get an API Key, then fill it in the corresponding configuration item. @@ -33,3 +43,109 @@ Go to [Tavily](https://app.tavily.com/home) to get an API Key, then fill it in t If you use Tavily as your web search source, you will get a better experience optimization on AstrBot ChatUI, including citation source display and more: ![](https://files.astrbot.app/docs/source/images/websearch/image1.png) + +To use a proxy or self-hosted instance, modify the `Tavily API Base URL` configuration item. + +The Tavily provider exposes two tools: + +#### 1. Search (`web_search_tavily`) + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `query` | string | Yes | — | Search query | +| `max_results` | number | No | 7 | Maximum number of results to return. Range: 5–20 | +| `search_depth` | string | No | `basic` | Search depth. Must be `basic` or `advanced` | +| `topic` | string | No | `general` | Search topic. Must be `general` or `news` | +| `days` | number | No | 3 | Number of days back from today to include. Only available when `topic` is `news` | +| `time_range` | string | No | — | Time range for results. Must be one of `day`, `week`, `month`, `year`. Available for both `general` and `news` topics | +| `start_date` | string | No | — | Start date for results in `YYYY-MM-DD` format | +| `end_date` | string | No | — | End date for results in `YYYY-MM-DD` format | +| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | + +#### 2. Extract Web Page (`tavily_extract_web_page`) + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `url` | string | Yes | — | The URL to extract content from | +| `extract_depth` | string | No | `basic` | Extraction depth. Must be `basic` or `advanced` | +| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | + +### Baidu AI Search + +Go to the [BCE Console](https://console.bce.baidu.com/iam/#/iam/apikey/list) to get an API Key, then fill it in the `websearch_baidu_app_builder_key` configuration item. + +Baidu AI Search uses the MCP (Model Context Protocol) to communicate with Baidu's AI Search service. The tool is registered as `AIsearch` internally but commonly referred to as `baidu_ai_search`. Since it operates via MCP, no tool parameters are exposed directly — the model interacts with the service through the MCP protocol. + +### BoCha + +Go to [BoCha](https://www.bocha.ai) to get an API Key, then fill it in the corresponding configuration item. + +The BoCha provider exposes one tool: + +#### Search (`web_search_bocha`) + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `query` | string | Yes | — | Search query | +| `freshness` | string | No | `noLimit` | Time range filter. Supported values: `noLimit`, `oneDay`, `oneWeek`, `oneMonth`, `oneYear`, `YYYY-MM-DD..YYYY-MM-DD` (date range), or `YYYY-MM-DD` (exact date). Using `noLimit` is recommended as the search algorithm will automatically optimize time relevance | +| `summary` | boolean | No | `false` | Whether to include a text summary for each result | +| `include` | string | No | — | Domains to include. Multiple domains separated by `\|` or `,` (max 100 domains). Example: `qq.com\|m.163.com` | +| `exclude` | string | No | — | Domains to exclude. Same format as `include` | +| `count` | number | No | 10 | Number of results to return. Range: 1–50. Actual results may be fewer | +| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | + +### Exa + +Go to [Exa](https://dashboard.exa.ai) to get an API Key, then fill it in the corresponding configuration item. + +Exa provides semantic search capabilities powered by neural embeddings, offering three integrated tools for the model to use: + +#### 1. Search (`web_search_exa`) + +The core search tool supports 5 search types: + +- `auto` — Automatically selects the best search mode based on the query (default) +- `neural` — Semantic search using embeddings, ideal for conceptual or natural language queries +- `fast` — Fast keyword-based search for quick results +- `instant` — Near-instant results for simple factual queries +- `deep` — Deep search with thorough result exploration + +Additionally, Exa supports 6 vertical categories for domain-specific searches: + +| Category | Coverage | +|---|---| +| `company` | 50M+ company pages | +| `people` | 1B+ profiles | +| `research paper` | 100M+ academic papers | +| `news` | News articles and reports | +| `personal site` | Personal websites and blogs | +| `financial report` | Financial filings and data | + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `query` | string | Yes | — | Search query | +| `max_results` | number | No | 10 | Maximum number of results to return. Range: 1–100 | +| `search_type` | string | No | `auto` | Search type. Must be one of `auto`, `neural`, `fast`, `instant`, `deep` | +| `category` | string | No | — | Vertical search category. Supported values: `company`, `people`, `research paper`, `news`, `personal site`, `financial report`. Leave empty for general web search | +| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | + +#### 2. Content Extraction (`exa_extract_web_page`) + +Extracts full text content from any given URL. The model can use this to read and summarize web pages, articles, or documents when the search result snippet is not sufficient. + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `url` | string | Yes | — | The URL to extract content from | +| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | + +#### 3. Find Similar (`exa_find_similar`) + +Finds semantically similar webpages to a given URL. This is a unique Exa feature that allows discovering related content based on neural embeddings rather than keyword matching. + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `url` | string | Yes | — | The URL of the webpage to find similar content for | +| `max_results` | number | No | 10 | Maximum number of similar results to return. Range: 1–100 | +| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | + +To use a proxy or self-hosted instance, modify the `Exa API Base URL` configuration. diff --git a/docs/zh/use/websearch.md b/docs/zh/use/websearch.md index 93200c44bf..82a448c4df 100644 --- a/docs/zh/use/websearch.md +++ b/docs/zh/use/websearch.md @@ -13,22 +13,159 @@ AstrBot 内置的网页搜索功能依赖大模型提供 `函数调用` 能力 等等带有搜索意味的提示让大模型触发调用搜索工具。 -AstrBot 支持 3 种网页搜索源接入方式:`默认`、`Tavily`、`百度 AI 搜索`。 +AstrBot 支持 5 种网页搜索源接入方式:`默认`、`Tavily`、`百度 AI 搜索`、`BoCha`、`Exa`。 -前者使用 AstrBot 内置的网页搜索请求器请求 Google、Bing、搜狗搜索引擎,在能够使用 Google 的网络环境下表现最佳。**我们推荐使用 Tavily**。 +前者使用 AstrBot 内置的网页搜索请求器请求 Google、Bing、搜狗搜索引擎,在能够使用 Google 的网络环境下表现最佳。**我们推荐使用 Tavily 或 Exa**。 ![image](https://files.astrbot.app/docs/source/images/websearch/image.png) -进入 `配置`,下拉找到网页搜索,您可选择 `default`(默认,不推荐) 或 `Tavily`。 +进入 `配置`,下拉找到网页搜索,您可选择 `default`(默认,不推荐) 或 `Tavily` 或 `Exa`。 ### default(不推荐) 如果您的设备在国内并且有代理,可以开启代理并在 `管理面板-其他配置-HTTP代理` 填入 HTTP 代理地址以应用代理。 +启用默认搜索后,大模型将获得以下工具: + +#### 网页搜索(web_search) + +使用 Google、Bing、搜狗等搜索引擎进行搜索。 + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `query` | string | 是 | 搜索关键词 | +| `max_results` | number | 否 | 返回的最大搜索结果数量,默认为 5 | + +#### 网页内容提取(fetch_url) + +提取任意 URL 的网页全文内容,可用于让大模型阅读和总结指定网页。 + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `url` | string | 是 | 要提取内容的网页 URL | +| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | + ### Tavily 前往 [Tavily](https://app.tavily.com/home) 得到 API Key,然后填写在相应的配置项。 如果您使用 Tavily 作为网页搜索源,在 AstrBot ChatUI 上将会获得更好的体验优化,包括引用来源展示等: -![](https://files.astrbot.app/docs/source/images/websearch/image1.png) \ No newline at end of file +![](https://files.astrbot.app/docs/source/images/websearch/image1.png) + +如需使用代理或自建实例,可修改 `Tavily API Base URL` 配置项。 + +启用 Tavily 后,大模型将获得以下工具: + +#### 搜索(web_search_tavily) + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `query` | string | 是 | 搜索关键词 | +| `max_results` | number | 否 | 返回的最大结果数量,范围 5-20,默认 7 | +| `search_depth` | string | 否 | 搜索深度,可选 `basic`(默认)或 `advanced` | +| `topic` | string | 否 | 搜索主题,可选 `general`(默认)或 `news` | +| `days` | number | 否 | 从当前日期往前包含的天数,仅在 `topic` 为 `news` 时生效 | +| `time_range` | string | 否 | 时间范围,可选 `day`、`week`、`month`、`year`,对 `general` 和 `news` 均生效 | +| `start_date` | string | 否 | 起始日期,格式 `YYYY-MM-DD` | +| `end_date` | string | 否 | 结束日期,格式 `YYYY-MM-DD` | +| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | + +#### 网页内容提取(tavily_extract_web_page) + +提取任意 URL 的网页全文内容。 + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `url` | string | 是 | 要提取内容的网页 URL | +| `extract_depth` | string | 否 | 提取深度,可选 `basic`(默认)或 `advanced` | +| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | + +### Exa + +前往 [Exa](https://dashboard.exa.ai) 获取 API Key,然后填写在相应的配置项。 + +Exa 提供基于语义理解的搜索能力,相比传统关键词搜索能够更精准地理解搜索意图。启用 Exa 后,大模型将获得以下三个工具: + +#### 搜索(web_search_exa) + +Exa 的核心搜索工具,支持以下搜索类型: + +- `auto`:自动模式,由 Exa 根据查询内容智能选择最佳搜索方式(推荐) +- `neural`:语义搜索,基于嵌入向量匹配,适合模糊或描述性的查询 +- `fast`:快速搜索,优先返回速度,适合简单关键词查询 +- `instant`:即时搜索,适合需要快速获取摘要的场景 +- `deep`:深度搜索,更全面地检索相关结果,适合复杂研究类查询 + +此外,搜索支持按垂直领域筛选结果: + +| 类别 | 说明 | +|------|------| +| `company` | 5000 万+ 公司主页 | +| `people` | 10 亿+ 个人主页/档案 | +| `research paper` | 1 亿+ 研究论文 | +| `news` | 新闻资讯 | +| `personal site` | 个人网站/博客 | +| `financial report` | 财务报告 | + +**工具参数:** + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `query` | string | 是 | 搜索关键词 | +| `max_results` | number | 否 | 返回的最大结果数量,范围 1-100,默认 10 | +| `search_type` | string | 否 | 搜索类型,可选 `auto`(默认)、`neural`、`fast`、`instant`、`deep` | +| `category` | string | 否 | 垂直领域筛选,默认为空(通用搜索) | +| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | + +#### 内容提取(exa_extract_web_page) + +提取任意 URL 的网页全文内容,可用于让大模型阅读和总结指定网页。您可以直接对大模型说: + +- `帮我总结一下这个链接:https://example.com` +- `读取这个页面的内容:https://example.com` + +**工具参数:** + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `url` | string | 是 | 要提取内容的网页 URL | +| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | + +#### 相似链接(exa_find_similar) + +Exa 独有的功能,根据给定的 URL 查找语义相似的网页。适合用于扩展阅读、查找同类资源或发现相关内容。 + +**工具参数:** + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `url` | string | 是 | 用于查找相似内容的网页 URL | +| `max_results` | number | 否 | 返回的最大结果数量,范围 1-100,默认 10 | +| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | + +如需使用代理或自建实例,可修改 `Exa API Base URL` 配置项。 + +### 百度 AI 搜索 + +前往 [百度智能云控制台](https://console.bce.baidu.com/iam/#/iam/apikey/list) 获取 APP Builder API Key,然后填写在相应的配置项。 + +百度 AI 搜索通过 MCP 协议接入,启用后大模型将自动获得 `baidu_ai_search` 工具,无需额外配置工具参数。 + +### BoCha + +前往 [BoCha](https://www.bocha.ai) 获取 API Key,然后填写在相应的配置项。 + +启用 BoCha 后,大模型将获得以下工具: + +#### 搜索(web_search_bocha) + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `query` | string | 是 | 搜索关键词 | +| `freshness` | string | 否 | 时间范围筛选。可选 `noLimit`(默认,推荐)、`oneDay`、`oneWeek`、`oneMonth`、`oneYear`,或指定日期 `YYYY-MM-DD`、日期范围 `YYYY-MM-DD..YYYY-MM-DD`。建议使用 `noLimit`,搜索算法会自动优化时间相关性,手动限制可能导致无结果 | +| `summary` | boolean | 否 | 是否为每个搜索结果包含文本摘要,默认 `false` | +| `include` | string | 否 | 限定搜索域名,多个域名用 `\|` 或 `,` 分隔,最多 100 个。示例:`qq.com` 或 `qq.com\|m.163.com` | +| `exclude` | string | 否 | 排除搜索域名,多个域名用 `\|` 或 `,` 分隔,最多 100 个。示例:`qq.com` 或 `qq.com\|m.163.com` | +| `count` | number | 否 | 返回的搜索结果数量,范围 1-50,默认 10(实际返回数量可能少于指定值) | +| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | \ No newline at end of file From f0edbb9623a580e8c0d3b5b2a7e4c35cfc33d121 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B0=95=E6=B0=99?= <2014440212@qq.com> Date: Sun, 5 Apr 2026 00:36:12 +0800 Subject: [PATCH 02/10] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docs/en/use/websearch.md | 2 +- docs/zh/use/websearch.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/use/websearch.md b/docs/en/use/websearch.md index f226b4357c..b030c48756 100644 --- a/docs/en/use/websearch.md +++ b/docs/en/use/websearch.md @@ -20,7 +20,7 @@ The former uses AstrBot's built-in web search requester to query Google, Bing, a ![image](https://files.astrbot.app/docs/source/images/websearch/image.png) -Go to `Configuration`, scroll down to find Web Search, where you can select `default` (default, not recommended) or `Tavily` or `Exa`. +Go to `Configuration`, scroll down to find Web Search, where you can select `default` (default, not recommended), `Tavily`, `Baidu AI Search`, `BoCha`, or `Exa`. ### default (Not Recommended) diff --git a/docs/zh/use/websearch.md b/docs/zh/use/websearch.md index 82a448c4df..96a569aa70 100644 --- a/docs/zh/use/websearch.md +++ b/docs/zh/use/websearch.md @@ -19,7 +19,7 @@ AstrBot 支持 5 种网页搜索源接入方式:`默认`、`Tavily`、`百度 ![image](https://files.astrbot.app/docs/source/images/websearch/image.png) -进入 `配置`,下拉找到网页搜索,您可选择 `default`(默认,不推荐) 或 `Tavily` 或 `Exa`。 +进入 `配置`,下拉找到网页搜索,您可选择 `default`(默认,不推荐)、`Tavily`、`百度 AI 搜索`、`BoCha` 或 `Exa`。 ### default(不推荐) From 479c58e6051b26faf98433477c3231235e924b7b Mon Sep 17 00:00:00 2001 From: piexian <64474352+piexian@users.noreply.github.com> Date: Sun, 5 Apr 2026 02:05:10 +0800 Subject: [PATCH 03/10] =?UTF-8?q?fix(websearch):=20=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=E5=85=A8=E5=B1=80=20HEADERS=20=E6=B1=A1=E6=9F=93=E3=80=81?= =?UTF-8?q?=E5=AF=86=E9=92=A5=E7=B4=A2=E5=BC=95=E8=B6=8A=E7=95=8C=E7=AD=89?= =?UTF-8?q?=E9=97=AE=E9=A2=98=EF=BC=8C=E6=8F=90=E5=8F=96=E5=85=B1=E4=BA=AB?= =?UTF-8?q?=E5=B7=A5=E5=85=B7=E5=87=BD=E6=95=B0=E6=B6=88=E9=99=A4=E9=87=8D?= =?UTF-8?q?=E5=A4=8D=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- astrbot/builtin_stars/web_searcher/main.py | 216 ++++++------------ astrbot/core/astr_agent_hooks.py | 9 +- .../core/knowledge_base/parsers/url_parser.py | 8 +- astrbot/core/utils/web_search_utils.py | 64 ++++++ astrbot/dashboard/routes/chat.py | 36 +-- astrbot/dashboard/routes/live_chat.py | 33 +-- dashboard/src/components/chat/MessageList.vue | 10 +- docs/en/use/websearch.md | 4 +- docs/zh/use/websearch.md | 8 +- 9 files changed, 168 insertions(+), 220 deletions(-) create mode 100644 astrbot/core/utils/web_search_utils.py diff --git a/astrbot/builtin_stars/web_searcher/main.py b/astrbot/builtin_stars/web_searcher/main.py index 14b3e0d90c..e0044969b6 100644 --- a/astrbot/builtin_stars/web_searcher/main.py +++ b/astrbot/builtin_stars/web_searcher/main.py @@ -11,11 +11,14 @@ from astrbot.api.event import AstrMessageEvent, filter from astrbot.api.provider import ProviderRequest from astrbot.core.provider.func_tool_manager import FunctionToolManager +from astrbot.core.utils.web_search_utils import normalize_web_search_base_url from .engines import HEADERS, USER_AGENTS, SearchResult from .engines.bing import Bing from .engines.sogo import Sogo +MIN_WEB_SEARCH_TIMEOUT = 30 + class Main(star.Star): TOOLS = [ @@ -28,6 +31,14 @@ class Main(star.Star): "exa_extract_web_page", "exa_find_similar", ] + MANAGED_TOOLS = TOOLS + ["AIsearch"] + PROVIDER_TOOLS = { + "default": ("web_search", "fetch_url"), + "tavily": ("web_search_tavily", "tavily_extract_web_page"), + "baidu_ai_search": ("AIsearch",), + "bocha": ("web_search_bocha",), + "exa": ("web_search_exa", "exa_extract_web_page", "exa_find_similar"), + } def __init__(self, context: star.Context) -> None: self.context = context @@ -79,15 +90,40 @@ async def _tidy_text(self, text: str) -> str: """清理文本,去除空格、换行符等""" return text.strip().replace("\n", " ").replace("\r", " ").replace(" ", " ") + def _normalize_timeout(self, timeout: int) -> aiohttp.ClientTimeout: + return aiohttp.ClientTimeout(total=max(timeout, MIN_WEB_SEARCH_TIMEOUT)) + + def _get_tavily_base_url(self, cfg: AstrBotConfig) -> str: + return normalize_web_search_base_url( + cfg.get("provider_settings", {}).get("websearch_tavily_base_url"), + default="https://api.tavily.com", + provider_name="Tavily", + ) + + def _get_exa_base_url(self, cfg: AstrBotConfig) -> str: + return normalize_web_search_base_url( + cfg.get("provider_settings", {}).get("websearch_exa_base_url"), + default="https://api.exa.ai", + provider_name="Exa", + ) + + def _add_active_tools( + self, tool_set, func_tool_mgr, tool_names: tuple[str, ...] + ) -> None: + for tool_name in tool_names: + tool = func_tool_mgr.get_func(tool_name) + if tool and tool.active: + tool_set.add_tool(tool) + async def _get_from_url(self, url: str, timeout: int = 30) -> str: """获取网页内容""" - if timeout < 30: - timeout = 30 - header = HEADERS - header.update({"User-Agent": random.choice(USER_AGENTS)}) + header = HEADERS.copy() + header["User-Agent"] = random.choice(USER_AGENTS) async with aiohttp.ClientSession(trust_env=True) as session: async with session.get( - url, headers=header, timeout=aiohttp.ClientTimeout(total=timeout) + url, + headers=header, + timeout=self._normalize_timeout(timeout), ) as response: html = await response.text(encoding="utf-8") doc = Document(html) @@ -145,9 +181,10 @@ async def _get_tavily_key(self, cfg: AstrBotConfig) -> str: """并发安全的从列表中获取并轮换Tavily API密钥。""" tavily_keys = cfg.get("provider_settings", {}).get("websearch_tavily_key", []) if not tavily_keys: - raise ValueError("错误:Tavily API密钥未在AstrBot中配置。") + raise ValueError("Error: Tavily API key is not configured in AstrBot.") async with self.tavily_key_lock: + self.tavily_key_index %= len(tavily_keys) key = tavily_keys[self.tavily_key_index] self.tavily_key_index = (self.tavily_key_index + 1) % len(tavily_keys) return key @@ -160,13 +197,7 @@ async def _web_search_tavily( ) -> list[SearchResult]: """使用 Tavily 搜索引擎进行搜索""" tavily_key = await self._get_tavily_key(cfg) - base_url = ( - cfg.get("provider_settings", {}) - .get("websearch_tavily_base_url", "https://api.tavily.com") - .rstrip("/") - ) - if timeout < 30: - timeout = 30 + base_url = self._get_tavily_base_url(cfg) url = f"{base_url}/search" header = { "Authorization": f"Bearer {tavily_key}", @@ -177,7 +208,7 @@ async def _web_search_tavily( url, json=payload, headers=header, - timeout=aiohttp.ClientTimeout(total=timeout), + timeout=self._normalize_timeout(timeout), ) as response: if response.status != 200: reason = await response.text() @@ -201,13 +232,7 @@ async def _extract_tavily( ) -> list[dict]: """使用 Tavily 提取网页内容""" tavily_key = await self._get_tavily_key(cfg) - base_url = ( - cfg.get("provider_settings", {}) - .get("websearch_tavily_base_url", "https://api.tavily.com") - .rstrip("/") - ) - if timeout < 30: - timeout = 30 + base_url = self._get_tavily_base_url(cfg) url = f"{base_url}/extract" header = { "Authorization": f"Bearer {tavily_key}", @@ -218,7 +243,7 @@ async def _extract_tavily( url, json=payload, headers=header, - timeout=aiohttp.ClientTimeout(total=timeout), + timeout=self._normalize_timeout(timeout), ) as response: if response.status != 200: reason = await response.text() @@ -243,7 +268,7 @@ async def search_from_search_engine( """搜索网络以回答用户的问题。当用户需要搜索网络以获取即时性的信息时调用此工具。 Args: - query(string): 和用户的问题最相关的搜索关键词,用于在 Google 上搜索。 + query(string): 和用户的问题最相关的搜索关键词,用于在搜索引擎上搜索。 max_results(number): 返回的最大搜索结果数量,默认为 5。 """ @@ -308,8 +333,6 @@ async def fetch_website_content( timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. """ - if timeout < 30: - timeout = 30 resp = await self._get_from_url(url, timeout=timeout) return resp @@ -342,8 +365,6 @@ async def search_from_tavily( timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. """ - if timeout < 30: - timeout = 30 logger.info(f"web_searcher - search_from_tavily: {query}") cfg = self.context.get_config(umo=event.unified_msg_origin) # websearch_link = cfg["provider_settings"].get("web_search_link", False) @@ -409,8 +430,6 @@ async def tavily_extract_web_page( timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. """ - if timeout < 30: - timeout = 30 cfg = self.context.get_config(umo=event.unified_msg_origin) if not cfg.get("provider_settings", {}).get("websearch_tavily_key", []): raise ValueError("Error: Tavily API key is not configured in AstrBot.") @@ -437,9 +456,10 @@ async def _get_bocha_key(self, cfg: AstrBotConfig) -> str: """并发安全的从列表中获取并轮换BoCha API密钥。""" bocha_keys = cfg.get("provider_settings", {}).get("websearch_bocha_key", []) if not bocha_keys: - raise ValueError("错误:BoCha API密钥未在AstrBot中配置。") + raise ValueError("Error: BoCha API key is not configured in AstrBot.") async with self.bocha_key_lock: + self.bocha_key_index %= len(bocha_keys) key = bocha_keys[self.bocha_key_index] self.bocha_key_index = (self.bocha_key_index + 1) % len(bocha_keys) return key @@ -452,8 +472,6 @@ async def _web_search_bocha( ) -> list[SearchResult]: """使用 BoCha 搜索引擎进行搜索""" bocha_key = await self._get_bocha_key(cfg) - if timeout < 30: - timeout = 30 url = "https://api.bochaai.com/v1/web-search" header = { "Authorization": f"Bearer {bocha_key}", @@ -464,7 +482,7 @@ async def _web_search_bocha( url, json=payload, headers=header, - timeout=aiohttp.ClientTimeout(total=timeout), + timeout=self._normalize_timeout(timeout), ) as response: if response.status != 200: reason = await response.text() @@ -545,8 +563,6 @@ async def search_from_bocha( timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. """ - if timeout < 30: - timeout = 30 logger.info(f"web_searcher - search_from_bocha: {query}") cfg = self.context.get_config(umo=event.unified_msg_origin) # websearch_link = cfg["provider_settings"].get("web_search_link", False) @@ -600,9 +616,10 @@ async def _get_exa_key(self, cfg: AstrBotConfig) -> str: """并发安全的从列表中获取并轮换 Exa API 密钥。""" exa_keys = cfg.get("provider_settings", {}).get("websearch_exa_key", []) if not exa_keys: - raise ValueError("错误:Exa API 密钥未在 AstrBot 中配置。") + raise ValueError("Error: Exa API key is not configured in AstrBot.") async with self.exa_key_lock: + self.exa_key_index %= len(exa_keys) key = exa_keys[self.exa_key_index] self.exa_key_index = (self.exa_key_index + 1) % len(exa_keys) return key @@ -615,11 +632,7 @@ async def _web_search_exa( ) -> list[SearchResult]: """使用 Exa 搜索引擎进行搜索""" exa_key = await self._get_exa_key(cfg) - base_url = ( - cfg.get("provider_settings", {}) - .get("websearch_exa_base_url", "https://api.exa.ai") - .rstrip("/") - ) + base_url = self._get_exa_base_url(cfg) url = f"{base_url}/search" header = { "x-api-key": exa_key, @@ -630,7 +643,7 @@ async def _web_search_exa( url, json=payload, headers=header, - timeout=aiohttp.ClientTimeout(total=timeout), + timeout=self._normalize_timeout(timeout), ) as response: if response.status != 200: reason = await response.text() @@ -670,8 +683,6 @@ async def search_from_exa( timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. """ - if timeout < 30: - timeout = 30 logger.info(f"web_searcher - search_from_exa: {query}") cfg = self.context.get_config(umo=event.unified_msg_origin) if not cfg.get("provider_settings", {}).get("websearch_exa_key", []): @@ -725,13 +736,7 @@ async def _extract_exa( ) -> list[dict]: """使用 Exa 提取网页内容""" exa_key = await self._get_exa_key(cfg) - base_url = ( - cfg.get("provider_settings", {}) - .get("websearch_exa_base_url", "https://api.exa.ai") - .rstrip("/") - ) - if timeout < 30: - timeout = 30 + base_url = self._get_exa_base_url(cfg) url = f"{base_url}/contents" header = { "x-api-key": exa_key, @@ -742,7 +747,7 @@ async def _extract_exa( url, json=payload, headers=header, - timeout=aiohttp.ClientTimeout(total=timeout), + timeout=self._normalize_timeout(timeout), ) as response: if response.status != 200: reason = await response.text() @@ -750,12 +755,7 @@ async def _extract_exa( f"Exa content extraction failed: {reason}, status: {response.status}", ) data = await response.json() - results: list[dict] = data.get("results", []) - if not results: - raise ValueError( - "Error: Exa content extraction does not return any results.", - ) - return results + return data.get("results", []) @llm_tool("exa_extract_web_page") async def exa_extract_web_page( @@ -772,8 +772,6 @@ async def exa_extract_web_page( timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. """ - if timeout < 30: - timeout = 30 cfg = self.context.get_config(umo=event.unified_msg_origin) if not cfg.get("provider_settings", {}).get("websearch_exa_key", []): raise ValueError("Error: Exa API key is not configured in AstrBot.") @@ -787,6 +785,8 @@ async def exa_extract_web_page( } results = await self._extract_exa(cfg, payload, timeout=timeout) + if not results: + return "Error: Exa content extraction does not return any results." ret_ls = [] for result in results: ret_ls.append(f"URL: {result.get('url', 'No URL')}") @@ -802,13 +802,7 @@ async def _find_similar_exa( ) -> list[SearchResult]: """使用 Exa 查找相似链接""" exa_key = await self._get_exa_key(cfg) - base_url = ( - cfg.get("provider_settings", {}) - .get("websearch_exa_base_url", "https://api.exa.ai") - .rstrip("/") - ) - if timeout < 30: - timeout = 30 + base_url = self._get_exa_base_url(cfg) url = f"{base_url}/findSimilar" header = { "x-api-key": exa_key, @@ -819,7 +813,7 @@ async def _find_similar_exa( url, json=payload, headers=header, - timeout=aiohttp.ClientTimeout(total=timeout), + timeout=self._normalize_timeout(timeout), ) as response: if response.status != 200: reason = await response.text() @@ -854,8 +848,6 @@ async def find_similar_links( timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. """ - if timeout < 30: - timeout = 30 logger.info(f"web_searcher - find_similar_links: {url}") cfg = self.context.get_config(umo=event.unified_msg_origin) if not cfg.get("provider_settings", {}).get("websearch_exa_key", []): @@ -912,81 +904,25 @@ async def edit_web_search_tools( return if not websearch_enable: - # pop tools - for tool_name in self.TOOLS: + for tool_name in self.MANAGED_TOOLS: tool_set.remove_tool(tool_name) return func_tool_mgr = self.context.get_llm_tool_manager() - if provider == "default": - web_search_t = func_tool_mgr.get_func("web_search") - fetch_url_t = func_tool_mgr.get_func("fetch_url") - if web_search_t and web_search_t.active: - tool_set.add_tool(web_search_t) - if fetch_url_t and fetch_url_t.active: - tool_set.add_tool(fetch_url_t) - tool_set.remove_tool("web_search_tavily") - tool_set.remove_tool("tavily_extract_web_page") - tool_set.remove_tool("AIsearch") - tool_set.remove_tool("web_search_bocha") - tool_set.remove_tool("web_search_exa") - tool_set.remove_tool("exa_extract_web_page") - tool_set.remove_tool("exa_find_similar") - elif provider == "tavily": - web_search_tavily = func_tool_mgr.get_func("web_search_tavily") - tavily_extract_web_page = func_tool_mgr.get_func("tavily_extract_web_page") - if web_search_tavily and web_search_tavily.active: - tool_set.add_tool(web_search_tavily) - if tavily_extract_web_page and tavily_extract_web_page.active: - tool_set.add_tool(tavily_extract_web_page) - tool_set.remove_tool("web_search") - tool_set.remove_tool("fetch_url") - tool_set.remove_tool("AIsearch") - tool_set.remove_tool("web_search_bocha") - tool_set.remove_tool("web_search_exa") - tool_set.remove_tool("exa_extract_web_page") - tool_set.remove_tool("exa_find_similar") - elif provider == "baidu_ai_search": + for tool_name in self.MANAGED_TOOLS: + tool_set.remove_tool(tool_name) + + if provider == "baidu_ai_search": try: await self.ensure_baidu_ai_search_mcp(event.unified_msg_origin) - aisearch_tool = func_tool_mgr.get_func("AIsearch") - if aisearch_tool and aisearch_tool.active: - tool_set.add_tool(aisearch_tool) - tool_set.remove_tool("web_search") - tool_set.remove_tool("fetch_url") - tool_set.remove_tool("web_search_tavily") - tool_set.remove_tool("tavily_extract_web_page") - tool_set.remove_tool("web_search_bocha") - tool_set.remove_tool("web_search_exa") - tool_set.remove_tool("exa_extract_web_page") - tool_set.remove_tool("exa_find_similar") + self._add_active_tools( + tool_set, + func_tool_mgr, + self.PROVIDER_TOOLS["baidu_ai_search"], + ) except Exception as e: logger.error(f"Cannot Initialize Baidu AI Search MCP Server: {e}") - elif provider == "bocha": - web_search_bocha = func_tool_mgr.get_func("web_search_bocha") - if web_search_bocha and web_search_bocha.active: - tool_set.add_tool(web_search_bocha) - tool_set.remove_tool("web_search") - tool_set.remove_tool("fetch_url") - tool_set.remove_tool("AIsearch") - tool_set.remove_tool("web_search_tavily") - tool_set.remove_tool("tavily_extract_web_page") - tool_set.remove_tool("web_search_exa") - tool_set.remove_tool("exa_extract_web_page") - tool_set.remove_tool("exa_find_similar") - elif provider == "exa": - web_search_exa = func_tool_mgr.get_func("web_search_exa") - exa_extract_web_page = func_tool_mgr.get_func("exa_extract_web_page") - exa_find_similar = func_tool_mgr.get_func("exa_find_similar") - if web_search_exa and web_search_exa.active: - tool_set.add_tool(web_search_exa) - if exa_extract_web_page and exa_extract_web_page.active: - tool_set.add_tool(exa_extract_web_page) - if exa_find_similar and exa_find_similar.active: - tool_set.add_tool(exa_find_similar) - tool_set.remove_tool("web_search") - tool_set.remove_tool("fetch_url") - tool_set.remove_tool("AIsearch") - tool_set.remove_tool("web_search_tavily") - tool_set.remove_tool("tavily_extract_web_page") - tool_set.remove_tool("web_search_bocha") + return + + tool_names = self.PROVIDER_TOOLS.get(provider, self.PROVIDER_TOOLS["default"]) + self._add_active_tools(tool_set, func_tool_mgr, tool_names) diff --git a/astrbot/core/astr_agent_hooks.py b/astrbot/core/astr_agent_hooks.py index 86f8a6c5b2..83c21a7462 100644 --- a/astrbot/core/astr_agent_hooks.py +++ b/astrbot/core/astr_agent_hooks.py @@ -9,6 +9,7 @@ from astrbot.core.astr_agent_context import AstrAgentContext from astrbot.core.pipeline.context_utils import call_event_hook from astrbot.core.star.star_handler import EventType +from astrbot.core.utils.web_search_utils import WEB_SEARCH_REFERENCE_TOOLS class MainAgentHooks(BaseAgentRunHooks[AstrAgentContext]): @@ -59,13 +60,7 @@ async def on_tool_end( platform_name = run_context.context.event.get_platform_name() if ( platform_name == "webchat" - and tool.name - in [ - "web_search_tavily", - "web_search_bocha", - "web_search_exa", - "exa_find_similar", - ] + and tool.name in WEB_SEARCH_REFERENCE_TOOLS and len(run_context.messages) > 0 and tool_result and len(tool_result.content) diff --git a/astrbot/core/knowledge_base/parsers/url_parser.py b/astrbot/core/knowledge_base/parsers/url_parser.py index d0c41cafa3..660e110fe6 100644 --- a/astrbot/core/knowledge_base/parsers/url_parser.py +++ b/astrbot/core/knowledge_base/parsers/url_parser.py @@ -2,6 +2,8 @@ import aiohttp +from astrbot.core.utils.web_search_utils import normalize_web_search_base_url + class URLExtractor: """URL 内容提取器,封装了 Tavily API 调用和密钥管理""" @@ -22,7 +24,11 @@ def __init__( self.tavily_keys = tavily_keys self.tavily_key_index = 0 self.tavily_key_lock = asyncio.Lock() - self.tavily_base_url = tavily_base_url.rstrip("/") + self.tavily_base_url = normalize_web_search_base_url( + tavily_base_url, + default="https://api.tavily.com", + provider_name="Tavily", + ) async def _get_tavily_key(self) -> str: """并发安全的从列表中获取并轮换Tavily API密钥。""" diff --git a/astrbot/core/utils/web_search_utils.py b/astrbot/core/utils/web_search_utils.py new file mode 100644 index 0000000000..acecaebce1 --- /dev/null +++ b/astrbot/core/utils/web_search_utils.py @@ -0,0 +1,64 @@ +import json +from typing import Any +from urllib.parse import urlparse + +WEB_SEARCH_REFERENCE_TOOLS = ( + "web_search_tavily", + "web_search_bocha", + "web_search_exa", + "exa_find_similar", +) + + +def normalize_web_search_base_url( + base_url: str | None, + *, + default: str, + provider_name: str, +) -> str: + normalized = (base_url or "").strip() + if not normalized: + normalized = default + normalized = normalized.rstrip("/") + + parsed = urlparse(normalized) + if parsed.scheme not in {"http", "https"} or not parsed.netloc: + raise ValueError( + f"Error: {provider_name} API Base URL must start with http:// or https://.", + ) + return normalized + + +def collect_web_search_results(accumulated_parts: list[dict[str, Any]]) -> dict: + web_search_results = {} + + for part in accumulated_parts: + if part.get("type") != "tool_call" or not part.get("tool_calls"): + continue + + for tool_call in part["tool_calls"]: + if tool_call.get( + "name" + ) not in WEB_SEARCH_REFERENCE_TOOLS or not tool_call.get("result"): + continue + + result = tool_call["result"] + try: + result_data = json.loads(result) if isinstance(result, str) else result + except json.JSONDecodeError: + continue + + if not isinstance(result_data, dict): + continue + + for item in result_data.get("results", []): + if not isinstance(item, dict): + continue + if idx := item.get("index"): + web_search_results[idx] = { + "url": item.get("url"), + "title": item.get("title"), + "snippet": item.get("snippet"), + } + + return web_search_results diff --git a/astrbot/dashboard/routes/chat.py b/astrbot/dashboard/routes/chat.py index 4c4fd0ce84..63b24f5927 100644 --- a/astrbot/dashboard/routes/chat.py +++ b/astrbot/dashboard/routes/chat.py @@ -23,6 +23,7 @@ from astrbot.core.utils.active_event_registry import active_event_registry from astrbot.core.utils.astrbot_path import get_astrbot_data_path from astrbot.core.utils.datetime_utils import to_utc_isoformat +from astrbot.core.utils.web_search_utils import collect_web_search_results from .route import Response, Route, RouteContext @@ -215,7 +216,7 @@ async def _create_attachment_from_file( def _extract_web_search_refs( self, accumulated_text: str, accumulated_parts: list ) -> dict: - """从消息中提取 web_search_tavily 的引用 + """从消息中提取网页搜索引用。 Args: accumulated_text: 累积的文本内容 @@ -224,38 +225,7 @@ def _extract_web_search_refs( Returns: 包含 used 列表的字典,记录被引用的搜索结果 """ - supported = [ - "web_search_tavily", - "web_search_bocha", - "web_search_exa", - "exa_find_similar", - ] - # 从 accumulated_parts 中找到所有 web_search_tavily 的工具调用结果 - web_search_results = {} - tool_call_parts = [ - p - for p in accumulated_parts - if p.get("type") == "tool_call" and p.get("tool_calls") - ] - - for part in tool_call_parts: - for tool_call in part["tool_calls"]: - if tool_call.get("name") not in supported or not tool_call.get( - "result" - ): - continue - try: - result_data = json.loads(tool_call["result"]) - for item in result_data.get("results", []): - if idx := item.get("index"): - web_search_results[idx] = { - "url": item.get("url"), - "title": item.get("title"), - "snippet": item.get("snippet"), - } - except (json.JSONDecodeError, KeyError): - pass - + web_search_results = collect_web_search_results(accumulated_parts) if not web_search_results: return {} diff --git a/astrbot/dashboard/routes/live_chat.py b/astrbot/dashboard/routes/live_chat.py index 25310cf61a..25edd10f34 100644 --- a/astrbot/dashboard/routes/live_chat.py +++ b/astrbot/dashboard/routes/live_chat.py @@ -22,6 +22,7 @@ from astrbot.core.platform.sources.webchat.webchat_queue_mgr import webchat_queue_mgr from astrbot.core.utils.astrbot_path import get_astrbot_data_path, get_astrbot_temp_path from astrbot.core.utils.datetime_utils import to_utc_isoformat +from astrbot.core.utils.web_search_utils import collect_web_search_results from .route import Route, RouteContext @@ -198,37 +199,7 @@ def _extract_web_search_refs( self, accumulated_text: str, accumulated_parts: list ) -> dict: """从消息中提取 web_search 引用。""" - supported = [ - "web_search_tavily", - "web_search_bocha", - "web_search_exa", - "exa_find_similar", - ] - web_search_results = {} - tool_call_parts = [ - p - for p in accumulated_parts - if p.get("type") == "tool_call" and p.get("tool_calls") - ] - - for part in tool_call_parts: - for tool_call in part["tool_calls"]: - if tool_call.get("name") not in supported or not tool_call.get( - "result" - ): - continue - try: - result_data = json.loads(tool_call["result"]) - for item in result_data.get("results", []): - if idx := item.get("index"): - web_search_results[idx] = { - "url": item.get("url"), - "title": item.get("title"), - "snippet": item.get("snippet"), - } - except (json.JSONDecodeError, KeyError): - pass - + web_search_results = collect_web_search_results(accumulated_parts) if not web_search_results: return {} diff --git a/dashboard/src/components/chat/MessageList.vue b/dashboard/src/components/chat/MessageList.vue index 63c8fabb2c..128e75af0c 100644 --- a/dashboard/src/components/chat/MessageList.vue +++ b/dashboard/src/components/chat/MessageList.vue @@ -200,6 +200,13 @@ setCustomComponents('message-list', { code_block: MarkdownCodeBlockNode }); +const WEB_SEARCH_REFERENCE_TOOLS = Object.freeze([ + 'web_search_tavily', + 'web_search_bocha', + 'web_search_exa', + 'exa_find_similar' +]); + export default { name: 'MessageList', components: { @@ -303,8 +310,7 @@ export default { part.tool_calls.forEach(toolCall => { // 检查是否是网页搜索工具调用 - const supportedTools = ['web_search_tavily', 'web_search_bocha', 'web_search_exa', 'exa_find_similar']; - if (!supportedTools.includes(toolCall.name) || !toolCall.result) { + if (!WEB_SEARCH_REFERENCE_TOOLS.includes(toolCall.name) || !toolCall.result) { return; } diff --git a/docs/en/use/websearch.md b/docs/en/use/websearch.md index b030c48756..35a5e70852 100644 --- a/docs/en/use/websearch.md +++ b/docs/en/use/websearch.md @@ -1,7 +1,7 @@ # Web Search -The web search feature aims to provide large language models with the ability to invoke search engines like Google, Bing, and Sogou to obtain recent world information, which can improve the accuracy of model responses and reduce hallucinations to some extent. +The web search feature aims to provide large language models with the ability to invoke search engines like Bing and Sogou to obtain recent world information, which can improve the accuracy of model responses and reduce hallucinations to some extent. AstrBot's built-in web search functionality relies on the large language model's `function calling` capability. If you're not familiar with function calling, please refer to: [Function Calling](/use/websearch). @@ -16,7 +16,7 @@ And other prompts with search intent to trigger the model to invoke the search t AstrBot supports 5 types of web search source integration: `default`, `Tavily`, `Baidu AI Search`, `BoCha`, and `Exa`. -The former uses AstrBot's built-in web search requester to query Google, Bing, and Sogou search engines, performing best in network environments with Google access. **We recommend using Tavily or Exa**. +The former uses AstrBot's built-in web search requester to query Bing and Sogou search engines. **We recommend using Tavily or Exa**. ![image](https://files.astrbot.app/docs/source/images/websearch/image.png) diff --git a/docs/zh/use/websearch.md b/docs/zh/use/websearch.md index 96a569aa70..613d697e1a 100644 --- a/docs/zh/use/websearch.md +++ b/docs/zh/use/websearch.md @@ -1,6 +1,6 @@ # 网页搜索 -网页搜索功能旨在提供大模型调用 Google,Bing,搜狗等搜索引擎以获取世界最近信息的能力,一定程度上能够提高大模型的回复准确度,减少幻觉。 +网页搜索功能旨在提供大模型调用 Bing、搜狗等搜索引擎以获取世界最近信息的能力,一定程度上能够提高大模型的回复准确度,减少幻觉。 AstrBot 内置的网页搜索功能依赖大模型提供 `函数调用` 能力。如果你不了解函数调用,请参考:[函数调用](/use/websearch)。 @@ -15,7 +15,7 @@ AstrBot 内置的网页搜索功能依赖大模型提供 `函数调用` 能力 AstrBot 支持 5 种网页搜索源接入方式:`默认`、`Tavily`、`百度 AI 搜索`、`BoCha`、`Exa`。 -前者使用 AstrBot 内置的网页搜索请求器请求 Google、Bing、搜狗搜索引擎,在能够使用 Google 的网络环境下表现最佳。**我们推荐使用 Tavily 或 Exa**。 +前者使用 AstrBot 内置的网页搜索请求器请求 Bing、搜狗搜索引擎。**我们推荐使用 Tavily 或 Exa**。 ![image](https://files.astrbot.app/docs/source/images/websearch/image.png) @@ -29,7 +29,7 @@ AstrBot 支持 5 种网页搜索源接入方式:`默认`、`Tavily`、`百度 #### 网页搜索(web_search) -使用 Google、Bing、搜狗等搜索引擎进行搜索。 +使用 Bing、搜狗等搜索引擎进行搜索。 | 参数 | 类型 | 必填 | 说明 | |------|------|------|------| @@ -168,4 +168,4 @@ Exa 独有的功能,根据给定的 URL 查找语义相似的网页。适合 | `include` | string | 否 | 限定搜索域名,多个域名用 `\|` 或 `,` 分隔,最多 100 个。示例:`qq.com` 或 `qq.com\|m.163.com` | | `exclude` | string | 否 | 排除搜索域名,多个域名用 `\|` 或 `,` 分隔,最多 100 个。示例:`qq.com` 或 `qq.com\|m.163.com` | | `count` | number | 否 | 返回的搜索结果数量,范围 1-50,默认 10(实际返回数量可能少于指定值) | -| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | \ No newline at end of file +| `timeout` | number | 否 | 请求超时时间(秒),最小 30,默认 30 | From 22e2c8bcf9d70b13900689685f019c393eeb398c Mon Sep 17 00:00:00 2001 From: piexian <64474352+piexian@users.noreply.github.com> Date: Sun, 5 Apr 2026 02:24:01 +0800 Subject: [PATCH 04/10] =?UTF-8?q?fix(websearch):=20=E7=BB=9F=E4=B8=80?= =?UTF-8?q?=E5=89=8D=E5=90=8E=E7=AB=AF=E7=BD=91=E9=A1=B5=E6=90=9C=E7=B4=A2?= =?UTF-8?q?=E5=BC=95=E7=94=A8=E6=8F=90=E5=8F=96=E9=80=BB=E8=BE=91=EF=BC=8C?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=89=8D=E7=AB=AF=20refs=20=E9=99=8D?= =?UTF-8?q?=E7=BA=A7=E8=8E=B7=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 重构 web_search_utils.py 为分层结构,新增 build_web_search_refs() 和 _extract_ref_indices() 支持从 标签提取引用索引 - 简化 chat.py/live_chat.py 中 ref 提取为调用 build_web_search_refs() - MessageList.vue 新增 getMessageRefs() 在后端未返回 refs 时前端自行降级提取 - 修复 chat.py 中消息保存条件判断逻辑 --- astrbot/core/utils/web_search_utils.py | 88 +++++++++++-- astrbot/dashboard/routes/chat.py | 62 ++++----- astrbot/dashboard/routes/live_chat.py | 27 ++-- dashboard/src/components/chat/MessageList.vue | 118 ++++++++++++------ tests/unit/test_web_search_utils.py | 90 +++++++++++++ 5 files changed, 282 insertions(+), 103 deletions(-) create mode 100644 tests/unit/test_web_search_utils.py diff --git a/astrbot/core/utils/web_search_utils.py b/astrbot/core/utils/web_search_utils.py index acecaebce1..0052440152 100644 --- a/astrbot/core/utils/web_search_utils.py +++ b/astrbot/core/utils/web_search_utils.py @@ -1,4 +1,5 @@ import json +import re from typing import Any from urllib.parse import urlparse @@ -29,9 +30,9 @@ def normalize_web_search_base_url( return normalized -def collect_web_search_results(accumulated_parts: list[dict[str, Any]]) -> dict: - web_search_results = {} - +def _iter_web_search_result_items( + accumulated_parts: list[dict[str, Any]], +): for part in accumulated_parts: if part.get("type") != "tool_call" or not part.get("tool_calls"): continue @@ -52,13 +53,78 @@ def collect_web_search_results(accumulated_parts: list[dict[str, Any]]) -> dict: continue for item in result_data.get("results", []): - if not isinstance(item, dict): - continue - if idx := item.get("index"): - web_search_results[idx] = { - "url": item.get("url"), - "title": item.get("title"), - "snippet": item.get("snippet"), - } + if isinstance(item, dict): + yield item + + +def _extract_ref_indices(accumulated_text: str) -> list[str]: + ref_indices: list[str] = [] + seen_indices: set[str] = set() + + for match in re.finditer(r"(.*?)", accumulated_text): + ref_index = match.group(1).strip() + if not ref_index or ref_index in seen_indices: + continue + ref_indices.append(ref_index) + seen_indices.add(ref_index) + + return ref_indices + + +def collect_web_search_ref_items( + accumulated_parts: list[dict[str, Any]], + favicon_cache: dict[str, str] | None = None, +) -> list[dict[str, Any]]: + web_search_refs: list[dict[str, Any]] = [] + seen_indices: set[str] = set() + + for item in _iter_web_search_result_items(accumulated_parts): + ref_index = item.get("index") + if not ref_index or ref_index in seen_indices: + continue + + payload = { + "index": ref_index, + "url": item.get("url"), + "title": item.get("title"), + "snippet": item.get("snippet"), + } + if favicon_cache and payload["url"] in favicon_cache: + payload["favicon"] = favicon_cache[payload["url"]] + + web_search_refs.append(payload) + seen_indices.add(ref_index) + + return web_search_refs + + +def build_web_search_refs( + accumulated_text: str, + accumulated_parts: list[dict[str, Any]], + favicon_cache: dict[str, str] | None = None, +) -> dict: + ordered_refs = collect_web_search_ref_items(accumulated_parts, favicon_cache) + if not ordered_refs: + return {} + + refs_by_index = {ref["index"]: ref for ref in ordered_refs} + ref_indices = _extract_ref_indices(accumulated_text) + used_refs = [refs_by_index[idx] for idx in ref_indices if idx in refs_by_index] + + if not used_refs: + used_refs = ordered_refs + + return {"used": used_refs} + + +def collect_web_search_results(accumulated_parts: list[dict[str, Any]]) -> dict: + web_search_results = {} + + for ref in collect_web_search_ref_items(accumulated_parts): + web_search_results[ref["index"]] = { + "url": ref.get("url"), + "title": ref.get("title"), + "snippet": ref.get("snippet"), + } return web_search_results diff --git a/astrbot/dashboard/routes/chat.py b/astrbot/dashboard/routes/chat.py index 63b24f5927..53e2fe1beb 100644 --- a/astrbot/dashboard/routes/chat.py +++ b/astrbot/dashboard/routes/chat.py @@ -1,7 +1,6 @@ import asyncio import json import os -import re import uuid from contextlib import asynccontextmanager from typing import cast @@ -23,7 +22,7 @@ from astrbot.core.utils.active_event_registry import active_event_registry from astrbot.core.utils.astrbot_path import get_astrbot_data_path from astrbot.core.utils.datetime_utils import to_utc_isoformat -from astrbot.core.utils.web_search_utils import collect_web_search_results +from astrbot.core.utils.web_search_utils import build_web_search_refs from .route import Response, Route, RouteContext @@ -216,35 +215,13 @@ async def _create_attachment_from_file( def _extract_web_search_refs( self, accumulated_text: str, accumulated_parts: list ) -> dict: - """从消息中提取网页搜索引用。 - - Args: - accumulated_text: 累积的文本内容 - accumulated_parts: 累积的消息部分列表 - - Returns: - 包含 used 列表的字典,记录被引用的搜索结果 - """ - web_search_results = collect_web_search_results(accumulated_parts) - if not web_search_results: - return {} - - # 从文本中提取所有 xxx 标签并去重 - ref_indices = { - m.strip() for m in re.findall(r"(.*?)", accumulated_text) - } - - # 构建被引用的结果列表 - used_refs = [] - for ref_index in ref_indices: - if ref_index not in web_search_results: - continue - payload = {"index": ref_index, **web_search_results[ref_index]} - if favicon := sp.temporary_cache.get("_ws_favicon", {}).get(payload["url"]): - payload["favicon"] = favicon - used_refs.append(payload) - - return {"used": used_refs} if used_refs else {} + """从消息中提取网页搜索引用。""" + favicon_cache = sp.temporary_cache.get("_ws_favicon", {}) + return build_web_search_refs( + accumulated_text, + accumulated_parts, + favicon_cache, + ) async def _save_bot_message( self, @@ -446,19 +423,27 @@ async def stream(): accumulated_parts.append(part) # 消息结束处理 + should_save = False if msg_type == "end": - break + should_save = bool( + accumulated_parts + or accumulated_text + or accumulated_reasoning + or refs + or agent_stats + ) elif ( (streaming and msg_type == "complete") or not streaming # or msg_type == "break" ): - if ( - chain_type == "tool_call" - or chain_type == "tool_call_result" + if chain_type not in ( + "tool_call", + "tool_call_result", + "agent_stats", ): - continue + should_save = True - # 提取 web_search_tavily 引用 + if should_save: try: refs = self._extract_web_search_refs( accumulated_text, @@ -499,6 +484,9 @@ async def stream(): # tool_calls = {} agent_stats = {} refs = {} + + if msg_type == "end": + break except BaseException as e: logger.exception(f"WebChat stream unexpected error: {e}", exc_info=True) finally: diff --git a/astrbot/dashboard/routes/live_chat.py b/astrbot/dashboard/routes/live_chat.py index 25edd10f34..b68a02c20e 100644 --- a/astrbot/dashboard/routes/live_chat.py +++ b/astrbot/dashboard/routes/live_chat.py @@ -1,7 +1,6 @@ import asyncio import json import os -import re import time import uuid import wave @@ -22,7 +21,7 @@ from astrbot.core.platform.sources.webchat.webchat_queue_mgr import webchat_queue_mgr from astrbot.core.utils.astrbot_path import get_astrbot_data_path, get_astrbot_temp_path from astrbot.core.utils.datetime_utils import to_utc_isoformat -from astrbot.core.utils.web_search_utils import collect_web_search_results +from astrbot.core.utils.web_search_utils import build_web_search_refs from .route import Route, RouteContext @@ -199,24 +198,12 @@ def _extract_web_search_refs( self, accumulated_text: str, accumulated_parts: list ) -> dict: """从消息中提取 web_search 引用。""" - web_search_results = collect_web_search_results(accumulated_parts) - if not web_search_results: - return {} - - ref_indices = { - m.strip() for m in re.findall(r"(.*?)", accumulated_text) - } - - used_refs = [] - for ref_index in ref_indices: - if ref_index not in web_search_results: - continue - payload = {"index": ref_index, **web_search_results[ref_index]} - if favicon := sp.temporary_cache.get("_ws_favicon", {}).get(payload["url"]): - payload["favicon"] = favicon - used_refs.append(payload) - - return {"used": used_refs} if used_refs else {} + favicon_cache = sp.temporary_cache.get("_ws_favicon", {}) + return build_web_search_refs( + accumulated_text, + accumulated_parts, + favicon_cache, + ) async def _save_bot_message( self, diff --git a/dashboard/src/components/chat/MessageList.vue b/dashboard/src/components/chat/MessageList.vue index 128e75af0c..4030be61a0 100644 --- a/dashboard/src/components/chat/MessageList.vue +++ b/dashboard/src/components/chat/MessageList.vue @@ -149,7 +149,7 @@ @click="$emit('replyMessage', msg, index)" :title="tm('actions.reply')" /> - + @@ -294,7 +294,81 @@ export default { this.extractWebSearchResults(); }, methods: { - // 从消息中提取 web_search_tavily 的搜索结果 + extractRefsFromToolCall(toolCall) { + if (!WEB_SEARCH_REFERENCE_TOOLS.includes(toolCall?.name) || !toolCall.result) { + return []; + } + + try { + const resultData = typeof toolCall.result === 'string' + ? JSON.parse(toolCall.result) + : toolCall.result; + + if (!resultData?.results || !Array.isArray(resultData.results)) { + return []; + } + + const refs = []; + const seenIndices = new Set(); + + resultData.results.forEach(item => { + if (!item?.index || seenIndices.has(item.index)) { + return; + } + + refs.push({ + index: item.index, + url: item.url, + title: item.title, + snippet: item.snippet + }); + seenIndices.add(item.index); + }); + + return refs; + } catch (e) { + console.error('Failed to parse web search result:', e); + return []; + } + }, + + collectMessageWebSearchRefs(messageParts) { + if (!Array.isArray(messageParts)) { + return []; + } + + const refs = []; + const seenIndices = new Set(); + + messageParts.forEach(part => { + if (part.type !== 'tool_call' || !Array.isArray(part.tool_calls)) { + return; + } + + part.tool_calls.forEach(toolCall => { + this.extractRefsFromToolCall(toolCall).forEach(ref => { + if (seenIndices.has(ref.index)) { + return; + } + refs.push(ref); + seenIndices.add(ref.index); + }); + }); + }); + + return refs; + }, + + getMessageRefs(content) { + if (content?.refs?.used?.length) { + return content.refs; + } + + const fallbackRefs = this.collectMessageWebSearchRefs(content?.message); + return fallbackRefs.length ? { used: fallbackRefs } : null; + }, + + // 从消息中提取网页搜索结果映射 extractWebSearchResults() { const results = {}; @@ -302,39 +376,13 @@ export default { if (msg.content.type !== 'bot' || !Array.isArray(msg.content.message)) { return; } - - msg.content.message.forEach(part => { - if (part.type !== 'tool_call' || !Array.isArray(part.tool_calls)) { - return; - } - - part.tool_calls.forEach(toolCall => { - // 检查是否是网页搜索工具调用 - if (!WEB_SEARCH_REFERENCE_TOOLS.includes(toolCall.name) || !toolCall.result) { - return; - } - - try { - // 解析工具调用结果 - const resultData = typeof toolCall.result === 'string' - ? JSON.parse(toolCall.result) - : toolCall.result; - - if (resultData.results && Array.isArray(resultData.results)) { - resultData.results.forEach(item => { - if (item.index) { - results[item.index] = { - url: item.url, - title: item.title, - snippet: item.snippet - }; - } - }); - } - } catch (e) { - console.error('Failed to parse web search result:', e); - } - }); + + this.collectMessageWebSearchRefs(msg.content.message).forEach(ref => { + results[ref.index] = { + url: ref.url, + title: ref.title, + snippet: ref.snippet + }; }); }); diff --git a/tests/unit/test_web_search_utils.py b/tests/unit/test_web_search_utils.py new file mode 100644 index 0000000000..7e32bbc7cc --- /dev/null +++ b/tests/unit/test_web_search_utils.py @@ -0,0 +1,90 @@ +import json + +from astrbot.core.utils.web_search_utils import ( + build_web_search_refs, + collect_web_search_ref_items, + collect_web_search_results, +) + + +def _make_web_search_parts() -> list[dict]: + return [ + { + "type": "tool_call", + "tool_calls": [ + { + "name": "web_search_exa", + "result": json.dumps( + { + "results": [ + { + "index": "a152.1", + "url": "https://example.com/1", + "title": "Example 1", + "snippet": "Snippet 1", + }, + { + "index": "a152.2", + "url": "https://example.com/2", + "title": "Example 2", + "snippet": "Snippet 2", + }, + ] + } + ), + } + ], + } + ] + + +def test_collect_web_search_results_builds_index_mapping(): + results = collect_web_search_results(_make_web_search_parts()) + + assert results == { + "a152.1": { + "url": "https://example.com/1", + "title": "Example 1", + "snippet": "Snippet 1", + }, + "a152.2": { + "url": "https://example.com/2", + "title": "Example 2", + "snippet": "Snippet 2", + }, + } + + +def test_collect_web_search_ref_items_preserves_order_and_favicon(): + refs = collect_web_search_ref_items( + _make_web_search_parts(), + {"https://example.com/2": "https://example.com/favicon.ico"}, + ) + + assert [ref["index"] for ref in refs] == ["a152.1", "a152.2"] + assert "favicon" not in refs[0] + assert refs[1]["favicon"] == "https://example.com/favicon.ico" + + +def test_build_web_search_refs_uses_explicit_ref_indices_in_text_order(): + refs = build_web_search_refs( + "Second a152.2 first a152.1", + _make_web_search_parts(), + ) + + assert [ref["index"] for ref in refs["used"]] == ["a152.2", "a152.1"] + + +def test_build_web_search_refs_falls_back_to_all_results_without_refs(): + refs = build_web_search_refs("No explicit refs here.", _make_web_search_parts()) + + assert [ref["index"] for ref in refs["used"]] == ["a152.1", "a152.2"] + + +def test_build_web_search_refs_ignores_tool_call_id_and_falls_back(): + refs = build_web_search_refs( + "call_a73499ddbaf845dba8310e44", + _make_web_search_parts(), + ) + + assert [ref["index"] for ref in refs["used"]] == ["a152.1", "a152.2"] From 370167fb397d6ff0e3bd4ee4f9ee4577c5098952 Mon Sep 17 00:00:00 2001 From: piexian <64474352+piexian@users.noreply.github.com> Date: Mon, 6 Apr 2026 21:20:00 +0800 Subject: [PATCH 05/10] =?UTF-8?q?fix(websearch):=20=E4=BF=AE=E5=A4=8D=20UU?= =?UTF-8?q?ID=20=E7=94=9F=E6=88=90=E9=80=BB=E8=BE=91=EF=BC=8C=E7=A1=AE?= =?UTF-8?q?=E4=BF=9D=E5=94=AF=E4=B8=80=E6=80=A7=EF=BC=9B=E6=9B=B4=E6=96=B0?= =?UTF-8?q?=20API=20Base=20URL=20=E9=94=99=E8=AF=AF=E6=8F=90=E7=A4=BA?= =?UTF-8?q?=E4=BF=A1=E6=81=AF=EF=BC=9B=E6=96=B0=E5=A2=9E=E6=B6=88=E6=81=AF?= =?UTF-8?q?=E5=BC=95=E7=94=A8=E7=BC=93=E5=AD=98=E6=9C=BA=E5=88=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- astrbot/builtin_stars/web_searcher/main.py | 10 +-- astrbot/core/utils/web_search_utils.py | 4 +- dashboard/src/components/chat/MessageList.vue | 67 +++++++++++++++++-- tests/unit/test_web_search_utils.py | 18 +++++ 4 files changed, 89 insertions(+), 10 deletions(-) diff --git a/astrbot/builtin_stars/web_searcher/main.py b/astrbot/builtin_stars/web_searcher/main.py index e0044969b6..f5972fd141 100644 --- a/astrbot/builtin_stars/web_searcher/main.py +++ b/astrbot/builtin_stars/web_searcher/main.py @@ -396,7 +396,7 @@ async def search_from_tavily( return "Error: Tavily web searcher does not return any results." ret_ls = [] - ref_uuid = str(uuid.uuid4())[:4] + ref_uuid = str(uuid.uuid4()) for idx, result in enumerate(results, 1): index = f"{ref_uuid}.{idx}" ret_ls.append( @@ -425,7 +425,7 @@ async def tavily_extract_web_page( """Extract the content of a web page using Tavily. Args: - url(string): Required. An URl to extract content from. + url(string): Required. A URL to extract content from. extract_depth(string): Optional. The depth of the extraction, must be one of 'basic', 'advanced'. Default is "basic". timeout(number): Optional. Request timeout in seconds. Minimum is 30. Default is 30. @@ -595,7 +595,7 @@ async def search_from_bocha( return "Error: BoCha web searcher does not return any results." ret_ls = [] - ref_uuid = str(uuid.uuid4())[:4] + ref_uuid = str(uuid.uuid4()) for idx, result in enumerate(results, 1): index = f"{ref_uuid}.{idx}" ret_ls.append( @@ -717,7 +717,7 @@ async def search_from_exa( return "Error: Exa web searcher does not return any results." ret_ls = [] - ref_uuid = str(uuid.uuid4())[:4] + ref_uuid = str(uuid.uuid4()) for idx, result in enumerate(results, 1): index = f"{ref_uuid}.{idx}" ret_ls.append( @@ -869,7 +869,7 @@ async def find_similar_links( return "Error: Exa find similar does not return any results." ret_ls = [] - ref_uuid = str(uuid.uuid4())[:4] + ref_uuid = str(uuid.uuid4()) for idx, result in enumerate(results, 1): index = f"{ref_uuid}.{idx}" ret_ls.append( diff --git a/astrbot/core/utils/web_search_utils.py b/astrbot/core/utils/web_search_utils.py index 0052440152..4c00d48f0f 100644 --- a/astrbot/core/utils/web_search_utils.py +++ b/astrbot/core/utils/web_search_utils.py @@ -25,7 +25,9 @@ def normalize_web_search_base_url( parsed = urlparse(normalized) if parsed.scheme not in {"http", "https"} or not parsed.netloc: raise ValueError( - f"Error: {provider_name} API Base URL must start with http:// or https://.", + f"Error: {provider_name} API Base URL must be a base host URL starting " + f"with http:// or https:// (for example, {default}), not a full endpoint " + f"path. Received: {normalized!r}.", ) return normalized diff --git a/dashboard/src/components/chat/MessageList.vue b/dashboard/src/components/chat/MessageList.vue index 4030be61a0..b67f35bb3f 100644 --- a/dashboard/src/components/chat/MessageList.vue +++ b/dashboard/src/components/chat/MessageList.vue @@ -274,7 +274,8 @@ export default { url: '' }, // Web search results mapping: { 'uuid.idx': { url, title, snippet } } - webSearchResults: {} + webSearchResults: {}, + messageRefsCache: new WeakMap() }; }, async mounted() { @@ -359,13 +360,66 @@ export default { return refs; }, + buildMessageRefsCacheKey(messageParts) { + if (!Array.isArray(messageParts)) { + return ''; + } + + const cacheParts = []; + + messageParts.forEach(part => { + if (part.type !== 'tool_call' || !Array.isArray(part.tool_calls)) { + return; + } + + part.tool_calls.forEach(toolCall => { + if (!WEB_SEARCH_REFERENCE_TOOLS.includes(toolCall?.name) || !toolCall.result) { + return; + } + + const rawResult = typeof toolCall.result === 'string' + ? toolCall.result + : JSON.stringify(toolCall.result); + + cacheParts.push(`${toolCall.id || toolCall.name}:${rawResult}`); + }); + }); + + return cacheParts.join('||'); + }, + + getCachedMessageRefs(content) { + if (!content || typeof content !== 'object') { + return null; + } + + const cacheKey = this.buildMessageRefsCacheKey(content.message); + if (!cacheKey) { + return null; + } + + const cachedEntry = this.messageRefsCache.get(content); + if (cachedEntry?.key === cacheKey) { + return cachedEntry.refs; + } + + const refs = this.collectMessageWebSearchRefs(content.message); + const normalizedRefs = refs.length ? { used: refs } : null; + + this.messageRefsCache.set(content, { + key: cacheKey, + refs: normalizedRefs + }); + + return normalizedRefs; + }, + getMessageRefs(content) { if (content?.refs?.used?.length) { return content.refs; } - const fallbackRefs = this.collectMessageWebSearchRefs(content?.message); - return fallbackRefs.length ? { used: fallbackRefs } : null; + return this.getCachedMessageRefs(content); }, // 从消息中提取网页搜索结果映射 @@ -377,7 +431,12 @@ export default { return; } - this.collectMessageWebSearchRefs(msg.content.message).forEach(ref => { + const refs = this.getMessageRefs(msg.content); + if (!refs?.used?.length) { + return; + } + + refs.used.forEach(ref => { results[ref.index] = { url: ref.url, title: ref.title, diff --git a/tests/unit/test_web_search_utils.py b/tests/unit/test_web_search_utils.py index 7e32bbc7cc..fc86f7c468 100644 --- a/tests/unit/test_web_search_utils.py +++ b/tests/unit/test_web_search_utils.py @@ -1,9 +1,12 @@ import json +import pytest + from astrbot.core.utils.web_search_utils import ( build_web_search_refs, collect_web_search_ref_items, collect_web_search_results, + normalize_web_search_base_url, ) @@ -88,3 +91,18 @@ def test_build_web_search_refs_ignores_tool_call_id_and_falls_back(): ) assert [ref["index"] for ref in refs["used"]] == ["a152.1", "a152.2"] + + +def test_normalize_web_search_base_url_reports_invalid_value(): + with pytest.raises(ValueError) as exc_info: + normalize_web_search_base_url( + "exa.ai/search", + default="https://api.exa.ai", + provider_name="Exa", + ) + + assert str(exc_info.value) == ( + "Error: Exa API Base URL must be a base host URL starting with " + "http:// or https:// (for example, https://api.exa.ai), not a full " + "endpoint path. Received: 'exa.ai/search'." + ) From 96e15f79ad4dc723b459b27358e77345b2b7ec2a Mon Sep 17 00:00:00 2001 From: piexian <64474352+piexian@users.noreply.github.com> Date: Mon, 6 Apr 2026 21:51:49 +0800 Subject: [PATCH 06/10] =?UTF-8?q?fix(websearch):=20=E6=94=BE=E5=AE=BD=20AP?= =?UTF-8?q?I=20Base=20URL=20=E6=A0=A1=E9=AA=8C=EF=BC=8C=E5=A2=9E=E5=BC=BA?= =?UTF-8?q?=20Tavily/Exa=20=E8=AF=B7=E6=B1=82=E6=8A=A5=E9=94=99=E6=8F=90?= =?UTF-8?q?=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- astrbot/builtin_stars/web_searcher/main.py | 49 +++++++++++++++++-- .../core/knowledge_base/parsers/url_parser.py | 5 +- astrbot/core/utils/web_search_utils.py | 5 +- tests/unit/test_web_search_utils.py | 40 ++++++++++++--- 4 files changed, 84 insertions(+), 15 deletions(-) diff --git a/astrbot/builtin_stars/web_searcher/main.py b/astrbot/builtin_stars/web_searcher/main.py index f5972fd141..82a38de04e 100644 --- a/astrbot/builtin_stars/web_searcher/main.py +++ b/astrbot/builtin_stars/web_searcher/main.py @@ -107,6 +107,15 @@ def _get_exa_base_url(self, cfg: AstrBotConfig) -> str: provider_name="Exa", ) + def _format_provider_request_error( + self, provider_name: str, action: str, url: str, reason: str, status: int + ) -> str: + return ( + f"{provider_name} {action} failed for URL {url}: {reason}, status: {status}. " + "If you configured an API Base URL, make sure it is a base URL or proxy " + "prefix rather than a specific endpoint path." + ) + def _add_active_tools( self, tool_set, func_tool_mgr, tool_names: tuple[str, ...] ) -> None: @@ -213,7 +222,13 @@ async def _web_search_tavily( if response.status != 200: reason = await response.text() raise Exception( - f"Tavily web search failed: {reason}, status: {response.status}", + self._format_provider_request_error( + "Tavily", + "web search", + url, + reason, + response.status, + ), ) data = await response.json() results = [] @@ -248,7 +263,13 @@ async def _extract_tavily( if response.status != 200: reason = await response.text() raise Exception( - f"Tavily web search failed: {reason}, status: {response.status}", + self._format_provider_request_error( + "Tavily", + "content extraction", + url, + reason, + response.status, + ), ) data = await response.json() results: list[dict] = data.get("results", []) @@ -648,7 +669,13 @@ async def _web_search_exa( if response.status != 200: reason = await response.text() raise Exception( - f"Exa web search failed: {reason}, status: {response.status}", + self._format_provider_request_error( + "Exa", + "web search", + url, + reason, + response.status, + ), ) data = await response.json() results = [] @@ -752,7 +779,13 @@ async def _extract_exa( if response.status != 200: reason = await response.text() raise Exception( - f"Exa content extraction failed: {reason}, status: {response.status}", + self._format_provider_request_error( + "Exa", + "content extraction", + url, + reason, + response.status, + ), ) data = await response.json() return data.get("results", []) @@ -818,7 +851,13 @@ async def _find_similar_exa( if response.status != 200: reason = await response.text() raise Exception( - f"Exa find similar failed: {reason}, status: {response.status}", + self._format_provider_request_error( + "Exa", + "find similar", + url, + reason, + response.status, + ), ) data = await response.json() results = [] diff --git a/astrbot/core/knowledge_base/parsers/url_parser.py b/astrbot/core/knowledge_base/parsers/url_parser.py index 660e110fe6..09a226f572 100644 --- a/astrbot/core/knowledge_base/parsers/url_parser.py +++ b/astrbot/core/knowledge_base/parsers/url_parser.py @@ -79,7 +79,10 @@ async def extract_text_from_url(self, url: str) -> str: if response.status != 200: reason = await response.text() raise OSError( - f"Tavily web extraction failed: {reason}, status: {response.status}" + f"Tavily web extraction failed for URL {api_url}: " + f"{reason}, status: {response.status}. If you configured " + "a Tavily API Base URL, make sure it is a base URL or " + "proxy prefix rather than a specific endpoint path." ) data = await response.json() diff --git a/astrbot/core/utils/web_search_utils.py b/astrbot/core/utils/web_search_utils.py index 4c00d48f0f..701852449b 100644 --- a/astrbot/core/utils/web_search_utils.py +++ b/astrbot/core/utils/web_search_utils.py @@ -25,9 +25,8 @@ def normalize_web_search_base_url( parsed = urlparse(normalized) if parsed.scheme not in {"http", "https"} or not parsed.netloc: raise ValueError( - f"Error: {provider_name} API Base URL must be a base host URL starting " - f"with http:// or https:// (for example, {default}), not a full endpoint " - f"path. Received: {normalized!r}.", + f"Error: {provider_name} API Base URL must start with http:// or " + f"https://. Proxy base paths are allowed. Received: {normalized!r}.", ) return normalized diff --git a/tests/unit/test_web_search_utils.py b/tests/unit/test_web_search_utils.py index fc86f7c468..b718ac3f31 100644 --- a/tests/unit/test_web_search_utils.py +++ b/tests/unit/test_web_search_utils.py @@ -93,16 +93,44 @@ def test_build_web_search_refs_ignores_tool_call_id_and_falls_back(): assert [ref["index"] for ref in refs["used"]] == ["a152.1", "a152.2"] -def test_normalize_web_search_base_url_reports_invalid_value(): +@pytest.mark.parametrize( + ("base_url", "expected_message"), + [ + ( + "exa.ai/search", + "Error: Exa API Base URL must start with http:// or https://. " + "Proxy base paths are allowed. Received: 'exa.ai/search'.", + ), + ], +) +def test_normalize_web_search_base_url_reports_invalid_value( + base_url: str, expected_message: str +): with pytest.raises(ValueError) as exc_info: normalize_web_search_base_url( - "exa.ai/search", + base_url, default="https://api.exa.ai", provider_name="Exa", ) - assert str(exc_info.value) == ( - "Error: Exa API Base URL must be a base host URL starting with " - "http:// or https:// (for example, https://api.exa.ai), not a full " - "endpoint path. Received: 'exa.ai/search'." + assert str(exc_info.value) == expected_message + + +@pytest.mark.parametrize( + ("base_url", "expected"), + [ + (" https://api.exa.ai/ ", "https://api.exa.ai"), + ("https://proxy.example.com/exa/", "https://proxy.example.com/exa"), + ("https://api.exa.ai/search", "https://api.exa.ai/search"), + ], +) +def test_normalize_web_search_base_url_accepts_proxy_paths( + base_url: str, expected: str +): + normalized = normalize_web_search_base_url( + base_url, + default="https://api.exa.ai", + provider_name="Exa", ) + + assert normalized == expected From 8e0a7232707ff3999d53220081abbf1d651e3846 Mon Sep 17 00:00:00 2001 From: piexian <64474352+piexian@users.noreply.github.com> Date: Sun, 19 Apr 2026 15:02:22 +0800 Subject: [PATCH 07/10] docs: simplify web search documentation to match upstream style --- docs/en/use/websearch.md | 119 +++++---------------------------------- docs/zh/use/websearch.md | 118 ++++---------------------------------- 2 files changed, 25 insertions(+), 212 deletions(-) diff --git a/docs/en/use/websearch.md b/docs/en/use/websearch.md index 3be460397e..ff47e03429 100644 --- a/docs/en/use/websearch.md +++ b/docs/en/use/websearch.md @@ -1,3 +1,4 @@ + # Web Search The web search feature gives large language models internet retrieval capability for recent information, which can improve response accuracy and reduce hallucinations to some extent. @@ -19,120 +20,26 @@ AstrBot currently supports 5 web search providers: `Tavily`, `BoCha`, `Baidu AI Go to `Configuration`, scroll down to find Web Search, where you can select `Tavily`, `BoCha`, `Baidu AI Search`, `Brave`, or `Exa`. -If you use Tavily as your web search source, you will get a better experience optimization on AstrBot ChatUI, including citation source display and more: - -![](https://files.astrbot.app/docs/source/images/websearch/image1.png) - -## Tavily - -Go to [Tavily](https://app.tavily.com/home) to get an API key, then fill it in the corresponding configuration item. - -To use a proxy or self-hosted instance, modify the `Tavily API Base URL` configuration item. - -Tavily exposes two tools: - -### Search (`web_search_tavily`) - -| Parameter | Type | Required | Default | Description | -|---|---|---|---|---| -| `query` | string | Yes | - | Search query | -| `max_results` | number | No | 7 | Maximum number of results to return. Range: 5-20 | -| `search_depth` | string | No | `basic` | Search depth. Must be `basic` or `advanced` | -| `topic` | string | No | `general` | Search topic. Must be `general` or `news` | -| `days` | number | No | 3 | Number of days back from today to include. Only available when `topic` is `news` | -| `time_range` | string | No | - | Time range for results. Must be `day`, `week`, `month`, or `year` | -| `start_date` | string | No | - | Start date in `YYYY-MM-DD` format | -| `end_date` | string | No | - | End date in `YYYY-MM-DD` format | -| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | - -### Extract Web Page (`tavily_extract_web_page`) - -| Parameter | Type | Required | Default | Description | -|---|---|---|---|---| -| `url` | string | Yes | - | The URL to extract content from | -| `extract_depth` | string | No | `basic` | Extraction depth. Must be `basic` or `advanced` | -| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | - -## Baidu AI Search - -Go to the [BCE Console](https://console.bce.baidu.com/iam/#/iam/apikey/list) to get an API key, then fill it in the `websearch_baidu_app_builder_key` configuration item. +### Tavily -Baidu AI Search exposes one tool: +Go to [Tavily](https://app.tavily.com/home) to get an API Key, then fill it in the corresponding configuration item. -### Search (`web_search_baidu`) +### BoCha -| Parameter | Type | Required | Default | Description | -|---|---|---|---|---| -| `query` | string | Yes | - | Search query | -| `top_k` | number | No | 10 | Number of web results to return. Maximum 50 | -| `search_recency_filter` | string | No | - | Time filter. Must be `week`, `month`, `semiyear`, or `year` | -| `site` | string | No | - | Restrict search to specific sites, separated by `,` or `\|` | -| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | +Get an API Key from the BoCha platform, then fill it in the corresponding configuration item. -## BoCha +### Baidu AI Search -Go to [BoCha](https://www.bocha.ai) to get an API key, then fill it in the corresponding configuration item. +Get an API Key from Baidu Qianfan APP Builder, then fill it in the corresponding configuration item. -BoCha exposes one tool: +### Brave -### Search (`web_search_bocha`) +Get an API Key from Brave Search, then fill it in the corresponding configuration item. -| Parameter | Type | Required | Default | Description | -|---|---|---|---|---| -| `query` | string | Yes | - | Search query | -| `freshness` | string | No | `noLimit` | Time range filter. Supported values: `noLimit`, `oneDay`, `oneWeek`, `oneMonth`, `oneYear`, `YYYY-MM-DD..YYYY-MM-DD`, or `YYYY-MM-DD` | -| `summary` | boolean | No | `false` | Whether to include a summary for each result | -| `include` | string | No | - | Domains to include. Multiple domains separated by `\|` or `,` | -| `exclude` | string | No | - | Domains to exclude. Multiple domains separated by `\|` or `,` | -| `count` | number | No | 10 | Number of results to return. Range: 1-50 | -| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | +### Exa -## Brave +Go to [Exa](https://dashboard.exa.ai) to get an API Key, then fill it in the corresponding configuration item. -Go to Brave Search to get an API key, then fill it in the corresponding configuration item. - -Brave exposes one tool: - -### Search (`web_search_brave`) - -| Parameter | Type | Required | Default | Description | -|---|---|---|---|---| -| `query` | string | Yes | - | Search query | -| `count` | number | No | 10 | Number of results to return. Range: 1-20 | -| `country` | string | No | `US` | Country code for region-specific results | -| `search_lang` | string | No | `zh-hans` | Brave language code | -| `freshness` | string | No | - | Time range. Must be `day`, `week`, `month`, or `year` | -| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | - -## Exa - -Go to [Exa](https://dashboard.exa.ai) to get an API key, then fill it in the corresponding configuration item. - -To use a proxy or self-hosted instance, modify the `Exa API Base URL` configuration item. - -Exa exposes three tools: - -### Search (`web_search_exa`) - -| Parameter | Type | Required | Default | Description | -|---|---|---|---|---| -| `query` | string | Yes | - | Search query | -| `max_results` | number | No | 10 | Maximum number of results to return. Range: 1-100 | -| `search_type` | string | No | `auto` | Search type. Must be `auto`, `neural`, `fast`, `instant`, or `deep` | -| `category` | string | No | - | Vertical category. Supported values: `company`, `people`, `research paper`, `news`, `personal site`, `financial report` | -| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | - -### Content Extraction (`exa_extract_web_page`) - -| Parameter | Type | Required | Default | Description | -|---|---|---|---|---| -| `url` | string | Yes | - | The URL to extract content from | -| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | - -### Find Similar (`exa_find_similar`) +If you use Tavily as your web search source, you will get a better experience optimization on AstrBot ChatUI, including citation source display and more: -| Parameter | Type | Required | Default | Description | -|---|---|---|---|---| -| `url` | string | Yes | - | The URL to find similar content for | -| `max_results` | number | No | 10 | Maximum number of results to return. Range: 1-100 | -| `timeout` | number | No | 30 | Request timeout in seconds. Minimum is 30 | +![](https://files.astrbot.app/docs/source/images/websearch/image1.png) diff --git a/docs/zh/use/websearch.md b/docs/zh/use/websearch.md index 93c38adb33..bdfdc99bc0 100644 --- a/docs/zh/use/websearch.md +++ b/docs/zh/use/websearch.md @@ -19,120 +19,26 @@ AstrBot 当前支持 5 种网页搜索源接入方式:`Tavily`、`BoCha`、` 进入 `配置`,下拉找到网页搜索,你可选择 `Tavily`、`BoCha`、`百度 AI 搜索`、`Brave` 或 `Exa`。 -如果你使用 Tavily 作为网页搜索源,在 AstrBot ChatUI 上会获得更好的引用来源展示体验: +### Tavily -![](https://files.astrbot.app/docs/source/images/websearch/image1.png) - -## Tavily - -前往 [Tavily](https://app.tavily.com/home) 获取 API Key,然后填写在相应配置项中。 - -如需使用代理或自建实例,可修改 `Tavily API Base URL` 配置项。 - -Tavily 提供两个工具: - -### 搜索(`web_search_tavily`) - -| 参数 | 类型 | 必填 | 默认值 | 说明 | -|---|---|---|---|---| -| `query` | string | 是 | - | 搜索关键词 | -| `max_results` | number | 否 | 7 | 返回最大结果数,范围 5-20 | -| `search_depth` | string | 否 | `basic` | 搜索深度,可选 `basic` 或 `advanced` | -| `topic` | string | 否 | `general` | 搜索主题,可选 `general` 或 `news` | -| `days` | number | 否 | 3 | 向前追溯的天数,仅 `topic=news` 时生效 | -| `time_range` | string | 否 | - | 时间范围,可选 `day`、`week`、`month`、`year` | -| `start_date` | string | 否 | - | 起始日期,格式 `YYYY-MM-DD` | -| `end_date` | string | 否 | - | 结束日期,格式 `YYYY-MM-DD` | -| `timeout` | number | 否 | 30 | 请求超时时间(秒),最小 30 | - -### 网页内容提取(`tavily_extract_web_page`) - -| 参数 | 类型 | 必填 | 默认值 | 说明 | -|---|---|---|---|---| -| `url` | string | 是 | - | 要提取内容的网页 URL | -| `extract_depth` | string | 否 | `basic` | 提取深度,可选 `basic` 或 `advanced` | -| `timeout` | number | 否 | 30 | 请求超时时间(秒),最小 30 | - -## 百度 AI 搜索 - -前往 [百度智能云控制台](https://console.bce.baidu.com/iam/#/iam/apikey/list) 获取 API Key,然后填写在 `websearch_baidu_app_builder_key` 配置项中。 - -百度 AI 搜索提供一个工具: - -### 搜索(`web_search_baidu`) +前往 [Tavily](https://app.tavily.com/home) 获取 API Key,然后填写在相应的配置项。 -| 参数 | 类型 | 必填 | 默认值 | 说明 | -|---|---|---|---|---| -| `query` | string | 是 | - | 搜索关键词 | -| `top_k` | number | 否 | 10 | 返回的网页结果数量,最大 50 | -| `search_recency_filter` | string | 否 | - | 时间范围,可选 `week`、`month`、`semiyear`、`year` | -| `site` | string | 否 | - | 限定搜索站点,多个站点可用 `,` 或 `\|` 分隔 | -| `timeout` | number | 否 | 30 | 请求超时时间(秒),最小 30 | +### BoCha -## BoCha +前往 BoCha 平台获取 API Key,然后填写在相应的配置项。 -前往 [BoCha](https://www.bocha.ai) 获取 API Key,然后填写在相应配置项中。 +### 百度 AI 搜索 -BoCha 提供一个工具: +前往百度千帆 APP Builder 获取 API Key,然后填写在相应的配置项。 -### 搜索(`web_search_bocha`) +### Brave -| 参数 | 类型 | 必填 | 默认值 | 说明 | -|---|---|---|---|---| -| `query` | string | 是 | - | 搜索关键词 | -| `freshness` | string | 否 | `noLimit` | 时间范围筛选,可选 `noLimit`、`oneDay`、`oneWeek`、`oneMonth`、`oneYear`、`YYYY-MM-DD..YYYY-MM-DD` 或 `YYYY-MM-DD` | -| `summary` | boolean | 否 | `false` | 是否返回每条结果的摘要 | -| `include` | string | 否 | - | 仅搜索指定域名,多个域名用 `\|` 或 `,` 分隔 | -| `exclude` | string | 否 | - | 排除指定域名,多个域名用 `\|` 或 `,` 分隔 | -| `count` | number | 否 | 10 | 返回结果数量,范围 1-50 | -| `timeout` | number | 否 | 30 | 请求超时时间(秒),最小 30 | +前往 Brave Search 获取 API Key,然后填写在相应的配置项。 -## Brave +### Exa -前往 Brave Search 获取 API Key,然后填写在相应配置项中。 +前往 [Exa](https://dashboard.exa.ai) 获取 API Key,然后填写在相应的配置项。 -Brave 提供一个工具: +如果你使用 Tavily 作为网页搜索源,在 AstrBot ChatUI 上会获得更好的体验优化,包括引用来源展示等: -### 搜索(`web_search_brave`) - -| 参数 | 类型 | 必填 | 默认值 | 说明 | -|---|---|---|---|---| -| `query` | string | 是 | - | 搜索关键词 | -| `count` | number | 否 | 10 | 返回结果数量,范围 1-20 | -| `country` | string | 否 | `US` | 国家/地区代码 | -| `search_lang` | string | 否 | `zh-hans` | 搜索语言代码 | -| `freshness` | string | 否 | - | 时间范围,可选 `day`、`week`、`month`、`year` | -| `timeout` | number | 否 | 30 | 请求超时时间(秒),最小 30 | - -## Exa - -前往 [Exa](https://dashboard.exa.ai) 获取 API Key,然后填写在相应配置项中。 - -如需使用代理或自建实例,可修改 `Exa API Base URL` 配置项。 - -Exa 提供三个工具: - -### 搜索(`web_search_exa`) - -| 参数 | 类型 | 必填 | 默认值 | 说明 | -|---|---|---|---|---| -| `query` | string | 是 | - | 搜索关键词 | -| `max_results` | number | 否 | 10 | 返回最大结果数,范围 1-100 | -| `search_type` | string | 否 | `auto` | 搜索类型,可选 `auto`、`neural`、`fast`、`instant`、`deep` | -| `category` | string | 否 | - | 垂直类别,可选 `company`、`people`、`research paper`、`news`、`personal site`、`financial report` | -| `timeout` | number | 否 | 30 | 请求超时时间(秒),最小 30 | - -### 内容提取(`exa_extract_web_page`) - -| 参数 | 类型 | 必填 | 默认值 | 说明 | -|---|---|---|---|---| -| `url` | string | 是 | - | 要提取内容的网页 URL | -| `timeout` | number | 否 | 30 | 请求超时时间(秒),最小 30 | - -### 相似链接(`exa_find_similar`) - -| 参数 | 类型 | 必填 | 默认值 | 说明 | -|---|---|---|---|---| -| `url` | string | 是 | - | 用于查找相似内容的网页 URL | -| `max_results` | number | 否 | 10 | 返回最大结果数,范围 1-100 | -| `timeout` | number | 否 | 30 | 请求超时时间(秒),最小 30 | +![](https://files.astrbot.app/docs/source/images/websearch/image1.png) From bab3b7cabad3de37c46636227ce44015c84cfb41 Mon Sep 17 00:00:00 2001 From: piexian <64474352+piexian@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:50:07 +0800 Subject: [PATCH 08/10] fix(web-search): validate base URLs against endpoint paths and expand Exa search types - Reject specific API endpoint paths (e.g., /search, /extract) in base URL normalization via new disallowed_path_suffixes parameter to prevent misconfiguration errors - Add deep-lite and deep-reasoning to valid Exa search types and normalize search_type input before validation - Add missing config parameter to BochaWebSearchTool builtin_tool decorator so provider status checks are properly registered --- .../core/knowledge_base/parsers/url_parser.py | 1 + astrbot/core/tools/web_search_tools.py | 19 ++- astrbot/core/utils/web_search_utils.py | 13 ++ tests/unit/test_web_search_tools.py | 112 ++++++++++++++++++ tests/unit/test_web_search_utils.py | 38 +++++- 5 files changed, 178 insertions(+), 5 deletions(-) create mode 100644 tests/unit/test_web_search_tools.py diff --git a/astrbot/core/knowledge_base/parsers/url_parser.py b/astrbot/core/knowledge_base/parsers/url_parser.py index 09a226f572..a8d6b3694e 100644 --- a/astrbot/core/knowledge_base/parsers/url_parser.py +++ b/astrbot/core/knowledge_base/parsers/url_parser.py @@ -28,6 +28,7 @@ def __init__( tavily_base_url, default="https://api.tavily.com", provider_name="Tavily", + disallowed_path_suffixes=("search", "extract"), ) async def _get_tavily_key(self) -> str: diff --git a/astrbot/core/tools/web_search_tools.py b/astrbot/core/tools/web_search_tools.py index 2614c920d8..a020b65927 100644 --- a/astrbot/core/tools/web_search_tools.py +++ b/astrbot/core/tools/web_search_tools.py @@ -46,6 +46,15 @@ "provider_settings.web_search": True, "provider_settings.websearch_provider": "exa", } +_EXA_SEARCH_TYPES = ( + "auto", + "fast", + "deep", + "deep-lite", + "deep-reasoning", + "instant", + "neural", +) @std_dataclass @@ -149,6 +158,7 @@ def _get_tavily_base_url(provider_settings: dict) -> str: provider_settings.get("websearch_tavily_base_url"), default="https://api.tavily.com", provider_name="Tavily", + disallowed_path_suffixes=("search", "extract"), ) @@ -157,6 +167,7 @@ def _get_exa_base_url(provider_settings: dict) -> str: provider_settings.get("websearch_exa_base_url"), default="https://api.exa.ai", provider_name="Exa", + disallowed_path_suffixes=("search", "contents", "findSimilar"), ) @@ -645,7 +656,7 @@ class ExaWebSearchTool(FunctionTool[AstrAgentContext]): }, "search_type": { "type": "string", - "description": 'Optional. Search type. Must be one of "auto", "neural", "fast", "instant", "deep". Default is "auto".', + "description": 'Optional. Search type. Must be one of "auto", "fast", "deep", "deep-lite", "deep-reasoning", "instant", "neural". Default is "auto".', }, "category": { "type": "string", @@ -665,8 +676,8 @@ async def call(self, context, **kwargs) -> ToolExecResult: if not provider_settings.get("websearch_exa_key", []): return "Error: Exa API key is not configured in AstrBot." - search_type = kwargs.get("search_type", "auto") - if search_type not in ("auto", "neural", "fast", "instant", "deep"): + search_type = str(kwargs.get("search_type", "auto")).strip().lower() + if search_type not in _EXA_SEARCH_TYPES: search_type = "auto" max_results = max(1, min(int(kwargs.get("max_results", 10)), 100)) @@ -794,7 +805,7 @@ async def call(self, context, **kwargs) -> ToolExecResult: return _search_result_payload(results) -@builtin_tool +@builtin_tool(config=_BOCHA_WEB_SEARCH_TOOL_CONFIG) @pydantic_dataclass class BochaWebSearchTool(FunctionTool[AstrAgentContext]): name: str = "web_search_bocha" diff --git a/astrbot/core/utils/web_search_utils.py b/astrbot/core/utils/web_search_utils.py index 665f371ea5..680cd1b58a 100644 --- a/astrbot/core/utils/web_search_utils.py +++ b/astrbot/core/utils/web_search_utils.py @@ -18,6 +18,7 @@ def normalize_web_search_base_url( *, default: str, provider_name: str, + disallowed_path_suffixes: tuple[str, ...] = (), ) -> str: normalized = (base_url or "").strip() if not normalized: @@ -30,6 +31,18 @@ def normalize_web_search_base_url( f"Error: {provider_name} API Base URL must start with http:// or " f"https://. Proxy base paths are allowed. Received: {normalized!r}.", ) + + last_path_segment = parsed.path.rstrip("/").rsplit("/", 1)[-1].lower() + invalid_suffixes = { + suffix.strip("/").lower() + for suffix in disallowed_path_suffixes + if suffix and suffix.strip("/") + } + if last_path_segment and last_path_segment in invalid_suffixes: + raise ValueError( + f"Error: {provider_name} API Base URL must be a base URL or proxy " + f"prefix, not a specific endpoint path. Received: {normalized!r}.", + ) return normalized diff --git a/tests/unit/test_web_search_tools.py b/tests/unit/test_web_search_tools.py new file mode 100644 index 0000000000..9db5c093f0 --- /dev/null +++ b/tests/unit/test_web_search_tools.py @@ -0,0 +1,112 @@ +from types import SimpleNamespace + +import pytest + +import astrbot.core.tools.registry as tool_registry +import astrbot.core.tools.web_search_tools as web_search_tools +from astrbot.core.knowledge_base.parsers.url_parser import URLExtractor +from astrbot.core.tools.web_search_tools import ExaWebSearchTool + + +def _make_tool_context(provider_settings: dict) -> SimpleNamespace: + cfg = {"provider_settings": provider_settings} + return SimpleNamespace( + context=SimpleNamespace( + context=SimpleNamespace(get_config=lambda umo=None: cfg), + event=SimpleNamespace(unified_msg_origin="test:private:session"), + ) + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("search_type", "expected"), + [ + ("deep-lite", "deep-lite"), + ("deep-reasoning", "deep-reasoning"), + ("instant", "instant"), + ("unsupported", "auto"), + ], +) +async def test_exa_web_search_tool_normalizes_search_type( + monkeypatch: pytest.MonkeyPatch, + search_type: str, + expected: str, +): + captured: dict[str, object] = {} + + async def fake_exa_search(provider_settings: dict, payload: dict, timeout: int): + captured["provider_settings"] = provider_settings + captured["payload"] = payload + captured["timeout"] = timeout + return [] + + monkeypatch.setattr(web_search_tools, "_exa_search", fake_exa_search) + + tool = ExaWebSearchTool() + result = await tool.call( + _make_tool_context({"websearch_exa_key": ["test-key"]}), + query="AstrBot", + search_type=search_type, + ) + + assert result == "Error: Exa web searcher does not return any results." + assert captured["payload"]["type"] == expected + + +def test_get_exa_base_url_rejects_endpoint_path(): + with pytest.raises(ValueError) as exc_info: + web_search_tools._get_exa_base_url( + {"websearch_exa_base_url": "https://api.exa.ai/search"} + ) + + assert str(exc_info.value) == ( + "Error: Exa API Base URL must be a base URL or proxy prefix, " + "not a specific endpoint path. Received: 'https://api.exa.ai/search'." + ) + + +def test_url_extractor_rejects_endpoint_base_url(): + with pytest.raises(ValueError) as exc_info: + URLExtractor( + ["test-key"], + tavily_base_url="https://api.tavily.com/extract", + ) + + assert str(exc_info.value) == ( + "Error: Tavily API Base URL must be a base URL or proxy prefix, " + "not a specific endpoint path. Received: 'https://api.tavily.com/extract'." + ) + + +def test_bocha_builtin_config_statuses_are_registered(): + rule = tool_registry._BUILTIN_TOOL_CONFIG_RULES.get("web_search_bocha") + + assert rule is not None + statuses = rule.evaluate( + { + "provider_settings": { + "web_search": True, + "websearch_provider": "bocha", + } + } + ) + + assert statuses == [ + { + "key": "provider_settings.web_search", + "operator": "equals", + "expected": True, + "actual": True, + "matched": True, + "message": None, + }, + { + "key": "provider_settings.websearch_provider", + "operator": "equals", + "expected": "bocha", + "actual": "bocha", + "matched": True, + "message": None, + } + ] diff --git a/tests/unit/test_web_search_utils.py b/tests/unit/test_web_search_utils.py index b718ac3f31..2f619eb66e 100644 --- a/tests/unit/test_web_search_utils.py +++ b/tests/unit/test_web_search_utils.py @@ -121,7 +121,6 @@ def test_normalize_web_search_base_url_reports_invalid_value( [ (" https://api.exa.ai/ ", "https://api.exa.ai"), ("https://proxy.example.com/exa/", "https://proxy.example.com/exa"), - ("https://api.exa.ai/search", "https://api.exa.ai/search"), ], ) def test_normalize_web_search_base_url_accepts_proxy_paths( @@ -134,3 +133,40 @@ def test_normalize_web_search_base_url_accepts_proxy_paths( ) assert normalized == expected + + +@pytest.mark.parametrize( + ("base_url", "provider_name", "disallowed_path_suffixes", "expected_message"), + [ + ( + "https://api.exa.ai/search", + "Exa", + ("search", "contents", "findSimilar"), + "Error: Exa API Base URL must be a base URL or proxy prefix, " + "not a specific endpoint path. Received: 'https://api.exa.ai/search'.", + ), + ( + "https://api.tavily.com/extract", + "Tavily", + ("search", "extract"), + "Error: Tavily API Base URL must be a base URL or proxy prefix, " + "not a specific endpoint path. Received: " + "'https://api.tavily.com/extract'.", + ), + ], +) +def test_normalize_web_search_base_url_rejects_endpoint_paths( + base_url: str, + provider_name: str, + disallowed_path_suffixes: tuple[str, ...], + expected_message: str, +): + with pytest.raises(ValueError) as exc_info: + normalize_web_search_base_url( + base_url, + default="https://api.exa.ai", + provider_name=provider_name, + disallowed_path_suffixes=disallowed_path_suffixes, + ) + + assert str(exc_info.value) == expected_message From 0b915ba26bc16fd0d6fe6bab54cb1ac2d223cf43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E6=B0=B8=E8=B5=AB?= <1259085392@qq.com> Date: Tue, 21 Apr 2026 09:46:32 +0900 Subject: [PATCH 09/10] fix(websearch): include live refs and Exa favicons --- astrbot/core/tools/web_search_tools.py | 2 + astrbot/dashboard/routes/chat.py | 14 ++-- astrbot/dashboard/routes/live_chat.py | 17 ++--- astrbot/dashboard/routes/message_events.py | 21 ++++++ tests/test_chat_route.py | 36 +++++++++ tests/unit/test_web_search_tools.py | 86 ++++++++++++++++++++++ 6 files changed, 156 insertions(+), 20 deletions(-) create mode 100644 astrbot/dashboard/routes/message_events.py diff --git a/astrbot/core/tools/web_search_tools.py b/astrbot/core/tools/web_search_tools.py index a020b65927..e776aa170d 100644 --- a/astrbot/core/tools/web_search_tools.py +++ b/astrbot/core/tools/web_search_tools.py @@ -416,6 +416,7 @@ async def _exa_search( title=item.get("title", ""), url=item.get("url", ""), snippet=(item.get("text") or "")[:500], + favicon=item.get("favicon"), ) for item in data.get("results", []) ] @@ -489,6 +490,7 @@ async def _exa_find_similar( title=item.get("title", ""), url=item.get("url", ""), snippet=(item.get("text") or "")[:500], + favicon=item.get("favicon"), ) for item in data.get("results", []) ] diff --git a/astrbot/dashboard/routes/chat.py b/astrbot/dashboard/routes/chat.py index 53e2fe1beb..193f9d62a2 100644 --- a/astrbot/dashboard/routes/chat.py +++ b/astrbot/dashboard/routes/chat.py @@ -24,6 +24,7 @@ from astrbot.core.utils.datetime_utils import to_utc_isoformat from astrbot.core.utils.web_search_utils import build_web_search_refs +from .message_events import build_message_saved_event from .route import Response, Route, RouteContext # SSE heartbeat message to keep the connection alive during long-running operations @@ -465,15 +466,10 @@ async def stream(): ) # 发送保存的消息信息给前端 if saved_record and not client_disconnected: - saved_info = { - "type": "message_saved", - "data": { - "id": saved_record.id, - "created_at": to_utc_isoformat( - saved_record.created_at - ), - }, - } + saved_info = build_message_saved_event( + saved_record, + refs, + ) try: yield f"data: {json.dumps(saved_info, ensure_ascii=False)}\n\n" except Exception: diff --git a/astrbot/dashboard/routes/live_chat.py b/astrbot/dashboard/routes/live_chat.py index b68a02c20e..509f240e4c 100644 --- a/astrbot/dashboard/routes/live_chat.py +++ b/astrbot/dashboard/routes/live_chat.py @@ -20,9 +20,9 @@ ) from astrbot.core.platform.sources.webchat.webchat_queue_mgr import webchat_queue_mgr from astrbot.core.utils.astrbot_path import get_astrbot_data_path, get_astrbot_temp_path -from astrbot.core.utils.datetime_utils import to_utc_isoformat from astrbot.core.utils.web_search_utils import build_web_search_refs +from .message_events import build_message_saved_event from .route import Route, RouteContext @@ -580,16 +580,11 @@ async def _handle_chat_message( if saved_record: await self._send_chat_payload( session, - { - "ct": "chat", - "type": "message_saved", - "data": { - "id": saved_record.id, - "created_at": to_utc_isoformat( - saved_record.created_at - ), - }, - }, + build_message_saved_event( + saved_record, + refs, + chat_mode=True, + ), ) accumulated_parts = [] diff --git a/astrbot/dashboard/routes/message_events.py b/astrbot/dashboard/routes/message_events.py new file mode 100644 index 0000000000..e701036094 --- /dev/null +++ b/astrbot/dashboard/routes/message_events.py @@ -0,0 +1,21 @@ +from astrbot.core.utils.datetime_utils import to_utc_isoformat + + +def build_message_saved_event( + saved_record, + refs: dict | None = None, + *, + chat_mode: bool = False, +) -> dict: + payload = { + "type": "message_saved", + "data": { + "id": saved_record.id, + "created_at": to_utc_isoformat(saved_record.created_at), + }, + } + if refs: + payload["data"]["refs"] = refs + if chat_mode: + payload["ct"] = "chat" + return payload diff --git a/tests/test_chat_route.py b/tests/test_chat_route.py index 47bd747a04..bbd95fe519 100644 --- a/tests/test_chat_route.py +++ b/tests/test_chat_route.py @@ -1,8 +1,12 @@ import asyncio +from datetime import datetime, timezone +from types import SimpleNamespace import pytest from astrbot.dashboard.routes.chat import _poll_webchat_stream_result +from astrbot.dashboard.routes.message_events import build_message_saved_event +from astrbot.core.utils.datetime_utils import to_utc_isoformat class _QueueThatRaises: @@ -54,3 +58,35 @@ async def test_poll_webchat_stream_result_returns_queue_payload(): assert result == payload assert should_break is False + + +@pytest.mark.parametrize("chat_mode", [False, True]) +def test_build_message_saved_event_includes_refs(chat_mode: bool): + saved_record = SimpleNamespace( + id=42, + created_at=datetime(2026, 4, 21, 12, 0, tzinfo=timezone.utc), + ) + refs = { + "used": [ + { + "index": "abcd.1", + "url": "https://example.com", + "title": "Example", + } + ] + } + + payload = build_message_saved_event(saved_record, refs, chat_mode=chat_mode) + + expected = { + "type": "message_saved", + "data": { + "id": 42, + "created_at": to_utc_isoformat(saved_record.created_at), + "refs": refs, + }, + } + if chat_mode: + expected["ct"] = "chat" + + assert payload == expected diff --git a/tests/unit/test_web_search_tools.py b/tests/unit/test_web_search_tools.py index 9db5c093f0..8177a517fd 100644 --- a/tests/unit/test_web_search_tools.py +++ b/tests/unit/test_web_search_tools.py @@ -8,6 +8,41 @@ from astrbot.core.tools.web_search_tools import ExaWebSearchTool +class _FakeResponse: + def __init__(self, payload: dict): + self.status = 200 + self._payload = payload + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return False + + async def json(self): + return self._payload + + async def text(self): + return "" + + +class _FakeSession: + def __init__(self, payload: dict, captured: dict[str, object]): + self._payload = payload + self._captured = captured + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return False + + def post(self, url: str, **kwargs): + self._captured["url"] = url + self._captured["kwargs"] = kwargs + return _FakeResponse(self._payload) + + def _make_tool_context(provider_settings: dict) -> SimpleNamespace: cfg = {"provider_settings": provider_settings} return SimpleNamespace( @@ -110,3 +145,54 @@ def test_bocha_builtin_config_statuses_are_registered(): "message": None, } ] + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("helper_name", "payload"), + [ + ( + "_exa_search", + {"query": "AstrBot"}, + ), + ( + "_exa_find_similar", + {"url": "https://example.com"}, + ), + ], +) +async def test_exa_helpers_preserve_favicon( + monkeypatch: pytest.MonkeyPatch, + helper_name: str, + payload: dict, +): + captured: dict[str, object] = {} + response_payload = { + "results": [ + { + "title": "Example", + "url": "https://example.com", + "text": "Snippet", + "favicon": "https://example.com/favicon.ico", + } + ] + } + + async def fake_get(provider_settings: dict) -> str: + return "test-key" + + monkeypatch.setattr(web_search_tools._EXA_KEY_ROTATOR, "get", fake_get) + monkeypatch.setattr( + web_search_tools.aiohttp, + "ClientSession", + lambda **kwargs: _FakeSession(response_payload, captured), + ) + + helper = getattr(web_search_tools, helper_name) + results = await helper( + {"websearch_exa_key": ["test-key"]}, + payload, + ) + + assert captured["url"] + assert results[0].favicon == "https://example.com/favicon.ico" From 2e26d0078da9ea402495b50a3d43b5a68710099b Mon Sep 17 00:00:00 2001 From: piexian <64474352+piexian@users.noreply.github.com> Date: Sat, 2 May 2026 22:45:03 +0800 Subject: [PATCH 10/10] feat(exa): add content extraction error handling and param normalization surface Exa content extraction status errors with URL and error tag details; extract count validation into reusable _normalize_count helper; pass llm_checkpoint_id through build_message_saved_event parameter --- astrbot/core/tools/web_search_tools.py | 53 +++++++++++- astrbot/dashboard/routes/chat.py | 4 +- astrbot/dashboard/routes/live_chat.py | 1 + astrbot/dashboard/routes/message_events.py | 3 + tests/test_chat_route.py | 29 ++++++- tests/unit/test_web_search_tools.py | 94 ++++++++++++++++++++++ 6 files changed, 178 insertions(+), 6 deletions(-) diff --git a/astrbot/core/tools/web_search_tools.py b/astrbot/core/tools/web_search_tools.py index 6a18fa43cb..73844215bb 100644 --- a/astrbot/core/tools/web_search_tools.py +++ b/astrbot/core/tools/web_search_tools.py @@ -146,6 +146,20 @@ def _normalize_timeout(timeout: int | float | str | None) -> aiohttp.ClientTimeo return aiohttp.ClientTimeout(total=max(timeout_value, MIN_WEB_SEARCH_TIMEOUT)) +def _normalize_count( + value: int | float | str | None, + *, + default: int, + minimum: int, + maximum: int, +) -> int: + try: + count = int(value) if value is not None else default + except (TypeError, ValueError): + count = default + return max(minimum, min(count, maximum)) + + def _cache_favicon(url: str, favicon: str | None) -> None: if favicon: sp.temporary_cache["_ws_favicon"][url] = favicon @@ -196,6 +210,26 @@ def _search_result_payload(results: list[SearchResult]) -> str: return json.dumps({"results": ret_ls}, ensure_ascii=False) +def _format_exa_contents_status_error(statuses: list[dict]) -> str | None: + failed_statuses = [ + status + for status in statuses + if status.get("status") and status["status"] != "success" + ] + if not failed_statuses: + return None + + errors = [] + for status in failed_statuses: + error = status.get("error") or {} + details = error.get("tag") or "unknown error" + http_status = error.get("httpStatusCode") + if http_status is not None: + details = f"{details} (HTTP {http_status})" + errors.append(f"{status.get('id', 'unknown URL')}: {details}") + return "Error: Exa content extraction failed: " + "; ".join(errors) + + async def _tavily_search( provider_settings: dict, payload: dict, @@ -526,6 +560,11 @@ async def _exa_extract( ) ) data = await response.json() + status_error = _format_exa_contents_status_error( + data.get("statuses", []), + ) + if status_error: + raise ValueError(status_error) return data.get("results", []) @@ -756,7 +795,12 @@ async def call(self, context, **kwargs) -> ToolExecResult: if search_type not in _EXA_SEARCH_TYPES: search_type = "auto" - max_results = max(1, min(int(kwargs.get("max_results", 10)), 100)) + max_results = _normalize_count( + kwargs.get("max_results"), + default=10, + minimum=1, + maximum=100, + ) payload = { "query": kwargs["query"], "numResults": max_results, @@ -871,7 +915,12 @@ async def call(self, context, **kwargs) -> ToolExecResult: provider_settings, { "url": url, - "numResults": max(1, min(int(kwargs.get("max_results", 10)), 100)), + "numResults": _normalize_count( + kwargs.get("max_results"), + default=10, + minimum=1, + maximum=100, + ), "contents": {"text": {"maxCharacters": 500}}, }, timeout=kwargs.get("timeout", MIN_WEB_SEARCH_TIMEOUT), diff --git a/astrbot/dashboard/routes/chat.py b/astrbot/dashboard/routes/chat.py index 49d3f60320..368348daf4 100644 --- a/astrbot/dashboard/routes/chat.py +++ b/astrbot/dashboard/routes/chat.py @@ -922,9 +922,7 @@ def build_attachment_saved_event(part: dict | None) -> str | None: saved_info = build_message_saved_event( saved_record, saved_refs, - ) - saved_info["data"]["llm_checkpoint_id"] = ( - llm_checkpoint_id + llm_checkpoint_id=llm_checkpoint_id, ) try: yield f"data: {json.dumps(saved_info, ensure_ascii=False)}\n\n" diff --git a/astrbot/dashboard/routes/live_chat.py b/astrbot/dashboard/routes/live_chat.py index a6e5961cab..8239eb6e30 100644 --- a/astrbot/dashboard/routes/live_chat.py +++ b/astrbot/dashboard/routes/live_chat.py @@ -600,6 +600,7 @@ async def send_attachment_saved_event(part: dict | None) -> None: build_message_saved_event( saved_record, saved_refs, + llm_checkpoint_id=llm_checkpoint_id, chat_mode=True, ), ) diff --git a/astrbot/dashboard/routes/message_events.py b/astrbot/dashboard/routes/message_events.py index e701036094..7207ee7361 100644 --- a/astrbot/dashboard/routes/message_events.py +++ b/astrbot/dashboard/routes/message_events.py @@ -5,6 +5,7 @@ def build_message_saved_event( saved_record, refs: dict | None = None, *, + llm_checkpoint_id: str | None = None, chat_mode: bool = False, ) -> dict: payload = { @@ -16,6 +17,8 @@ def build_message_saved_event( } if refs: payload["data"]["refs"] = refs + if llm_checkpoint_id is not None: + payload["data"]["llm_checkpoint_id"] = llm_checkpoint_id if chat_mode: payload["ct"] = "chat" return payload diff --git a/tests/test_chat_route.py b/tests/test_chat_route.py index bbd95fe519..2855fa179d 100644 --- a/tests/test_chat_route.py +++ b/tests/test_chat_route.py @@ -4,9 +4,9 @@ import pytest +from astrbot.core.utils.datetime_utils import to_utc_isoformat from astrbot.dashboard.routes.chat import _poll_webchat_stream_result from astrbot.dashboard.routes.message_events import build_message_saved_event -from astrbot.core.utils.datetime_utils import to_utc_isoformat class _QueueThatRaises: @@ -90,3 +90,30 @@ def test_build_message_saved_event_includes_refs(chat_mode: bool): expected["ct"] = "chat" assert payload == expected + + +@pytest.mark.parametrize("chat_mode", [False, True]) +def test_build_message_saved_event_includes_checkpoint_id(chat_mode: bool): + saved_record = SimpleNamespace( + id=42, + created_at=datetime(2026, 4, 21, 12, 0, tzinfo=timezone.utc), + ) + + payload = build_message_saved_event( + saved_record, + llm_checkpoint_id="checkpoint-1", + chat_mode=chat_mode, + ) + + expected = { + "type": "message_saved", + "data": { + "id": 42, + "created_at": to_utc_isoformat(saved_record.created_at), + "llm_checkpoint_id": "checkpoint-1", + }, + } + if chat_mode: + expected["ct"] = "chat" + + assert payload == expected diff --git a/tests/unit/test_web_search_tools.py b/tests/unit/test_web_search_tools.py index 93f7eb4388..507ba635ba 100644 --- a/tests/unit/test_web_search_tools.py +++ b/tests/unit/test_web_search_tools.py @@ -122,9 +122,14 @@ def test_normalize_legacy_web_search_config_migrates_firecrawl_and_exa_keys(): @pytest.mark.parametrize( ("search_type", "expected"), [ + ("auto", "auto"), + ("neural", "neural"), + ("fast", "fast"), ("deep-lite", "deep-lite"), + ("deep", "deep"), ("deep-reasoning", "deep-reasoning"), ("instant", "instant"), + (" INSTANT ", "instant"), ("unsupported", "auto"), ], ) @@ -154,6 +159,56 @@ async def fake_exa_search(provider_settings: dict, payload: dict, timeout: int): assert captured["payload"]["type"] == expected +@pytest.mark.asyncio +async def test_exa_web_search_tool_uses_default_for_invalid_max_results( + monkeypatch: pytest.MonkeyPatch, +): + captured: dict[str, object] = {} + + async def fake_exa_search(provider_settings: dict, payload: dict, timeout: int): + captured["payload"] = payload + return [] + + monkeypatch.setattr(tools, "_exa_search", fake_exa_search) + + tool = tools.ExaWebSearchTool() + result = await tool.call( + _context_with_provider_settings({"websearch_exa_key": ["test-key"]}), + query="AstrBot", + max_results="not-a-number", + ) + + assert result == "Error: Exa web searcher does not return any results." + assert captured["payload"]["numResults"] == 10 + + +@pytest.mark.asyncio +async def test_exa_find_similar_uses_default_for_invalid_max_results( + monkeypatch: pytest.MonkeyPatch, +): + captured: dict[str, object] = {} + + async def fake_exa_find_similar( + provider_settings: dict, + payload: dict, + timeout: int, + ): + captured["payload"] = payload + return [] + + monkeypatch.setattr(tools, "_exa_find_similar", fake_exa_find_similar) + + tool = tools.ExaFindSimilarTool() + result = await tool.call( + _context_with_provider_settings({"websearch_exa_key": ["test-key"]}), + url="https://example.com", + max_results="not-a-number", + ) + + assert result == "Error: Exa find similar does not return any results." + assert captured["payload"]["numResults"] == 10 + + def test_get_exa_base_url_rejects_endpoint_path(): with pytest.raises(ValueError) as exc_info: tools._get_exa_base_url({"websearch_exa_base_url": "https://api.exa.ai/search"}) @@ -255,6 +310,45 @@ async def fake_get(provider_settings: dict) -> str: assert results[0].favicon == "https://example.com/favicon.ico" +@pytest.mark.asyncio +async def test_exa_extract_raises_status_error(monkeypatch: pytest.MonkeyPatch): + response_payload = { + "results": [], + "statuses": [ + { + "id": "https://example.com/missing", + "status": "error", + "error": { + "tag": "CRAWL_NOT_FOUND", + "httpStatusCode": 404, + }, + } + ], + } + captured: dict[str, object] = {} + + async def fake_get(provider_settings: dict) -> str: + return "test-key" + + monkeypatch.setattr(tools._EXA_KEY_ROTATOR, "get", fake_get) + monkeypatch.setattr( + tools.aiohttp, + "ClientSession", + lambda **kwargs: _FakeExaSession(response_payload, captured), + ) + + with pytest.raises(ValueError) as exc_info: + await tools._exa_extract( + {"websearch_exa_key": ["test-key"]}, + {"urls": ["https://example.com/missing"], "text": True}, + ) + + assert str(exc_info.value) == ( + "Error: Exa content extraction failed: " + "https://example.com/missing: CRAWL_NOT_FOUND (HTTP 404)" + ) + + @pytest.mark.asyncio async def test_firecrawl_search_maps_web_results(monkeypatch): async def fake_firecrawl_search(provider_settings, payload):