From 1febb655df57a44753ed86779ce6320883a0d7bc Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 22 Apr 2026 22:44:55 +0000 Subject: [PATCH] add stagehand plugin --- .../playwright_crawler_stagehand/__init__.py | 0 .../browser_classes.py | 101 -------- .../stagehand_run.py | 66 ------ .../support_classes.py | 57 ----- docs/guides/playwright_crawler_stagehand.mdx | 66 ------ pyproject.toml | 3 + .../browsers/_stagehand_browser_controller.py | 193 +++++++++++++++ .../browsers/_stagehand_browser_plugin.py | 221 ++++++++++++++++++ src/crawlee/browsers/_types.py | 93 +++++++- src/crawlee/crawlers/_stagehand/__init__.py | 3 + .../crawlers/_stagehand/_stagehand_crawler.py | 99 ++++++++ uv.lock | 46 +++- 12 files changed, 654 insertions(+), 294 deletions(-) delete mode 100644 docs/guides/code_examples/playwright_crawler_stagehand/__init__.py delete mode 100644 docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py delete mode 100644 docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py delete mode 100644 docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py delete mode 100644 docs/guides/playwright_crawler_stagehand.mdx create mode 100644 src/crawlee/browsers/_stagehand_browser_controller.py create mode 100644 src/crawlee/browsers/_stagehand_browser_plugin.py create mode 100644 src/crawlee/crawlers/_stagehand/__init__.py create mode 100644 src/crawlee/crawlers/_stagehand/_stagehand_crawler.py diff --git a/docs/guides/code_examples/playwright_crawler_stagehand/__init__.py b/docs/guides/code_examples/playwright_crawler_stagehand/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py b/docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py deleted file mode 100644 index 67b76f1f47..0000000000 --- a/docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py +++ /dev/null @@ -1,101 +0,0 
@@ -from __future__ import annotations - -from datetime import datetime, timezone -from typing import TYPE_CHECKING, Any, cast - -from stagehand.context import StagehandContext -from typing_extensions import override - -from crawlee.browsers import ( - PlaywrightBrowserController, - PlaywrightBrowserPlugin, - PlaywrightPersistentBrowser, -) - -from .support_classes import CrawleeStagehandPage - -if TYPE_CHECKING: - from collections.abc import Mapping - - from playwright.async_api import Page - from stagehand import Stagehand - - from crawlee.proxy_configuration import ProxyInfo - - -class StagehandBrowserController(PlaywrightBrowserController): - @override - def __init__( - self, browser: PlaywrightPersistentBrowser, stagehand: Stagehand, **kwargs: Any - ) -> None: - # Initialize with browser context instead of browser instance - super().__init__(browser, **kwargs) - - self._stagehand = stagehand - self._stagehand_context: StagehandContext | None = None - - @override - async def new_page( - self, - browser_new_context_options: Mapping[str, Any] | None = None, - proxy_info: ProxyInfo | None = None, - ) -> Page: - # Initialize browser context if not already done - if not self._browser_context: - self._browser_context = await self._create_browser_context( - browser_new_context_options=browser_new_context_options, - proxy_info=proxy_info, - ) - - # Initialize Stagehand context if not already done - if not self._stagehand_context: - self._stagehand_context = await StagehandContext.init( - self._browser_context, self._stagehand - ) - - # Create a new page using Stagehand context - page = await self._stagehand_context.new_page() - - pw_page = page._page # noqa: SLF001 - - # Handle page close event - pw_page.on(event='close', f=self._on_page_close) - - # Update internal state - self._pages.append(pw_page) - self._last_page_opened_at = datetime.now(timezone.utc) - - self._total_opened_pages += 1 - - # Wrap StagehandPage to provide Playwright Page interface - return 
cast('Page', CrawleeStagehandPage(page)) - - -class StagehandPlugin(PlaywrightBrowserPlugin): - """Browser plugin that integrates Stagehand with Crawlee's browser management.""" - - @override - def __init__(self, stagehand: Stagehand, **kwargs: Any) -> None: - super().__init__(**kwargs) - - self._stagehand = stagehand - - @override - async def new_browser(self) -> StagehandBrowserController: - if not self._playwright: - raise RuntimeError('Playwright browser plugin is not initialized.') - - browser = PlaywrightPersistentBrowser( - # Stagehand can run only on a Chromium-based browser. - self._playwright.chromium, - self._user_data_dir, - self._browser_launch_options, - ) - - # Return custom controller with Stagehand - return StagehandBrowserController( - browser=browser, - stagehand=self._stagehand, - header_generator=None, - fingerprint_generator=None, - ) diff --git a/docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py b/docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py deleted file mode 100644 index 6cf8cc2689..0000000000 --- a/docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py +++ /dev/null @@ -1,66 +0,0 @@ -from __future__ import annotations - -import asyncio -import os -from typing import cast - -from stagehand import StagehandConfig, StagehandPage - -from crawlee import ConcurrencySettings -from crawlee.browsers import BrowserPool -from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext - -from .browser_classes import StagehandPlugin -from .support_classes import CrawleeStagehand - - -async def main() -> None: - # Configure local Stagehand with Gemini model - config = StagehandConfig( - env='LOCAL', - model_name='google/gemini-2.5-flash-preview-05-20', - model_api_key=os.getenv('GEMINI_API_KEY'), - ) - - # Create Stagehand instance - stagehand = CrawleeStagehand(config) - - # Create crawler with custom browser pool using Stagehand - crawler = PlaywrightCrawler( - # Limit the 
crawl to max requests. Remove or increase it for crawling all links. - max_requests_per_crawl=10, - # Custom browser pool. Gives users full control over browsers used by the crawler. - concurrency_settings=ConcurrencySettings(max_tasks_per_minute=10), - browser_pool=BrowserPool( - plugins=[ - StagehandPlugin(stagehand, browser_launch_options={'headless': True}) - ], - ), - ) - - # Define the default request handler, which will be called for every request. - @crawler.router.default_handler - async def request_handler(context: PlaywrightCrawlingContext) -> None: - context.log.info(f'Processing {context.request.url} ...') - - # Cast to StagehandPage for proper type hints in IDE - page = cast('StagehandPage', context.page) - - # Use regular Playwright method - playwright_title = await page.title() - context.log.info(f'Playwright page title: {playwright_title}') - - # highlight-start - # Use AI-powered extraction with natural language - gemini_title = await page.extract('Extract page title') - context.log.info(f'Gemini page title: {gemini_title}') - # highlight-end - - await context.enqueue_links() - - # Run the crawler with the initial list of URLs. 
- await crawler.run(['https://crawlee.dev/']) - - -if __name__ == '__main__': - asyncio.run(main()) diff --git a/docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py b/docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py deleted file mode 100644 index cccb62e989..0000000000 --- a/docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py +++ /dev/null @@ -1,57 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -from stagehand import Stagehand, StagehandPage - -if TYPE_CHECKING: - from types import TracebackType - - -class CrawleeStagehandPage: - """StagehandPage wrapper for Crawlee.""" - - def __init__(self, page: StagehandPage) -> None: - self._page = page - - async def goto( - self, - url: str, - *, - referer: str | None = None, - timeout: int | None = None, - wait_until: str | None = None, - ) -> Any: - """Navigate to the specified URL.""" - # Override goto to return navigation result that `PlaywrightCrawler` expects - return await self._page._page.goto( # noqa: SLF001 - url, - referer=referer, - timeout=timeout, - wait_until=wait_until, - ) - - def __getattr__(self, name: str) -> Any: - """Delegate all other methods to the underlying StagehandPage.""" - return getattr(self._page, name) - - async def __aenter__(self) -> CrawleeStagehandPage: - """Enter the context manager.""" - return self - - async def __aexit__( - self, - exc_type: type[BaseException] | None, - exc_value: BaseException | None, - exc_traceback: TracebackType | None, - ) -> None: - await self._page.close() - - -class CrawleeStagehand(Stagehand): - """Stagehand wrapper for Crawlee to disable the launch of Playwright.""" - - async def init(self) -> None: - # Skip Stagehand's own Playwright initialization - # Let Crawlee's PlaywrightBrowserPlugin manage the browser lifecycle - self._initialized = True diff --git a/docs/guides/playwright_crawler_stagehand.mdx b/docs/guides/playwright_crawler_stagehand.mdx 
deleted file mode 100644 index 59a34b4cd2..0000000000 --- a/docs/guides/playwright_crawler_stagehand.mdx +++ /dev/null @@ -1,66 +0,0 @@ ---- -id: playwright-crawler-stagehand -title: Playwright with Stagehand -description: How to integrate Stagehand AI-powered automation with PlaywrightCrawler. ---- - -import ApiLink from '@site/src/components/ApiLink'; -import CodeBlock from '@theme/CodeBlock'; - -import SupportClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/support_classes.py'; -import BrowserClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/browser_classes.py'; -import StagehandRun from '!!raw-loader!./code_examples/playwright_crawler_stagehand/stagehand_run.py'; - -[Stagehand](https://docs.stagehand.dev/) is a framework that combines [Playwright](https://playwright.dev/python/) with AI-driven natural language understanding and decision-making capabilities. With Stagehand, you can use natural language instructions to interact with web pages instead of writing complex selectors and automation logic. - -Stagehand supports multiple AI models through [`LiteLLM`](https://docs.litellm.ai/docs/). This guide demonstrates how to integrate Stagehand with `PlaywrightCrawler` using [Gemini](https://ai.google.dev/gemini-api/docs) as the AI model provider. - -:::info - -This guide is based on stagehand-python v0.4.0 with local configuration settings and may not be compatible with newer versions. - -::: - -## Get Gemini API key - -You need to register with [Google AI Studio](https://aistudio.google.com/) and navigate to [Get API key](https://aistudio.google.com/app/apikey) to obtain your API key. - -## Create support classes for Stagehand - -To integrate Stagehand with Crawlee, you need to create wrapper classes that allow `PlaywrightBrowserPlugin` to manage the Playwright lifecycle. 
- -Create `CrawleeStagehand` - a custom Stagehand subclass that overrides the `init` method to prevent Stagehand from launching its own Playwright instance. - -Create `CrawleeStagehandPage` - a wrapper class for `StagehandPage` that implements the [Playwright Page](https://playwright.dev/python/docs/next/api/class-page) behavior expected by `PlaywrightCrawler`. - - - {SupportClasses} - - -## Create browser integration classes - -You need to create a custom browser plugin and controller that properly initialize Stagehand and obtain browser pages from `StagehandContext`. - -Create `StagehandPlugin` - a subclass of `PlaywrightBrowserPlugin` that holds the Stagehand instance and creates `PlaywrightPersistentBrowser` instances. - -Create `StagehandBrowserController` - a subclass of `PlaywrightBrowserController` that lazily initializes `StagehandContext` and creates new pages with AI capabilities on demand. - - - {BrowserClasses} - - -## Create a crawler - -Now you can create a `PlaywrightCrawler` that uses Stagehand's AI capabilities to interact with web pages using natural language commands: - - - {StagehandRun} - - -The integration works through several key components: -- `CrawleeStagehand` prevents Stagehand from launching its own Playwright instance, allowing Crawlee to manage the browser lifecycle -- `StagehandPlugin` extends the Playwright browser plugin to create Stagehand-enabled browser instances -- `StagehandBrowserController` uses `StagehandContext` to create pages with AI capabilities -- `CrawleeStagehandPage` provides interface compatibility between Stagehand pages and Crawlee's expectations - -In the request handler, you can use natural language commands like `page.extract('Extract title page')` to perform intelligent data extraction without writing complex selectors. 
diff --git a/pyproject.toml b/pyproject.toml index 4a5444ea44..bd91dd98cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,9 @@ sql_postgres = [ "sqlalchemy[asyncio]>=2.0.0,<3.0.0", "asyncpg>=0.24.0" ] +stagehand = [ + "stagehand>=3.19.0", +] sql_sqlite = [ "sqlalchemy[asyncio]>=2.0.0,<3.0.0", "aiosqlite>=0.21.0", diff --git a/src/crawlee/browsers/_stagehand_browser_controller.py b/src/crawlee/browsers/_stagehand_browser_controller.py new file mode 100644 index 0000000000..c391471f2c --- /dev/null +++ b/src/crawlee/browsers/_stagehand_browser_controller.py @@ -0,0 +1,193 @@ +from __future__ import annotations + +from asyncio import Lock +from datetime import datetime, timedelta, timezone +from logging import getLogger +from typing import TYPE_CHECKING, Any, cast + +from playwright.async_api import Browser, BrowserContext, Page, ProxySettings +from typing_extensions import override + +from crawlee._utils.docs import docs_group +from crawlee.browsers._browser_controller import BrowserController +from crawlee.browsers._types import StagehandPage + +if TYPE_CHECKING: + from collections.abc import Mapping + + from stagehand import AsyncSession + + from crawlee.browsers._types import BrowserType + from crawlee.proxy_configuration import ProxyInfo + +logger = getLogger(__name__) + + +@docs_group('Browser management') +class StagehandBrowserController(BrowserController): + """Controller for managing a Stagehand-controlled browser instance. + + Bridges Crawlee's browser management with Stagehand: provides page creation via + Playwright (connected to Stagehand's browser via CDP) and exposes the Stagehand + session so the crawling context can access AI methods (act/extract/observe). + """ + + AUTOMATION_LIBRARY = 'stagehand' + + def __init__( + self, + browser: Browser, + session: AsyncSession, + *, + max_open_pages_per_browser: int = 20, + ) -> None: + """Initialize a new instance. + + Args: + browser: Playwright browser connected to Stagehand via CDP. 
+ session: Active Stagehand session used for AI operations. + max_open_pages_per_browser: Maximum number of pages open at the same time. + """ + self._browser = browser + self._session = session + self._max_open_pages_per_browser = max_open_pages_per_browser + + self._browser_context: BrowserContext | None = None + self._pages = list[Page]() + self._total_opened_pages = 0 + self._opening_pages_count = 0 + self._last_page_opened_at = datetime.now(timezone.utc) + self._context_creation_lock: Lock | None = None + + @property + @override + def pages(self) -> list[Page]: + return self._pages # type: ignore[return-value] + + @property + @override + def total_opened_pages(self) -> int: + return self._total_opened_pages + + @property + @override + def pages_count(self) -> int: + return len(self._pages) + + @property + @override + def last_page_opened_at(self) -> datetime: + return self._last_page_opened_at + + @property + @override + def idle_time(self) -> timedelta: + return datetime.now(timezone.utc) - self._last_page_opened_at + + @property + @override + def has_free_capacity(self) -> bool: + return (self.pages_count + self._opening_pages_count) < self._max_open_pages_per_browser + + @property + @override + def is_browser_connected(self) -> bool: + return self._browser.is_connected() + + @property + @override + def browser_type(self) -> BrowserType: + return 'chromium' + + async def _get_context_creation_lock(self) -> Lock: + if self._context_creation_lock is None: + self._context_creation_lock = Lock() + return self._context_creation_lock + + @override + async def new_page( + self, + browser_new_context_options: Mapping[str, Any] | None = None, + proxy_info: ProxyInfo | None = None, + ) -> StagehandPage: + """Create a new page in the Stagehand-managed browser. + + Args: + browser_new_context_options: Ignored. Context is managed by Stagehand via CDP. + proxy_info: Proxy configuration applied when creating the shared browser context. 
+ All pages share one context, so proxy is fixed on the first call. + + Returns: + The newly created page. + + Raises: + ValueError: If the browser has reached the maximum number of open pages. + """ + if not self.has_free_capacity: + raise ValueError('Cannot open more pages in this browser.') + + if browser_new_context_options: + logger.warning( + 'browser_new_context_options are ignored by StagehandBrowserController. ' + 'The existing CDP context is reused.' + ) + + self._opening_pages_count += 1 + + try: + async with await self._get_context_creation_lock(): + if self._browser_context is None: + if proxy_info: + self._browser_context = await self._browser.new_context( + proxy=ProxySettings( + server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}', + username=proxy_info.username, + password=proxy_info.password, + ) + ) + elif self._browser.contexts: + # Reuse the existing CDP context when no proxy is needed. + self._browser_context = self._browser.contexts[0] + else: + self._browser_context = await self._browser.new_context() + elif proxy_info: + logger.warning( + 'proxy_info is ignored for subsequent pages — all pages share the same browser context.' + ) + + raw_page = await self._browser_context.new_page() + page = StagehandPage(raw_page, self._session) + raw_page.on('close', lambda _: self._on_page_close(cast('Page', page))) + + self._pages.append(page) + self._last_page_opened_at = datetime.now(timezone.utc) + self._total_opened_pages += 1 + finally: + self._opening_pages_count -= 1 + + return page + + @override + async def close(self, *, force: bool = False) -> None: + """End the Stagehand session and close the browser connection. + + Args: + force: Whether to force close all open pages before closing. + + Raises: + ValueError: If there are still open pages when closing without force. 
+ """ + if self.pages_count > 0 and not force: + raise ValueError('Cannot close the browser while there are open pages.') + + try: + await self._session.end() + except Exception: + logger.warning('Failed to end Stagehand session gracefully.', exc_info=True) + + if self._browser.is_connected(): + await self._browser.close() + + def _on_page_close(self, page: Page) -> None: + """Handle actions after a page is closed.""" + self._pages.remove(page) diff --git a/src/crawlee/browsers/_stagehand_browser_plugin.py b/src/crawlee/browsers/_stagehand_browser_plugin.py new file mode 100644 index 0000000000..65a65dbf76 --- /dev/null +++ b/src/crawlee/browsers/_stagehand_browser_plugin.py @@ -0,0 +1,221 @@ +from __future__ import annotations + +from logging import getLogger +from typing import TYPE_CHECKING, Any + +from playwright.async_api import Playwright, async_playwright +from stagehand import AsyncStagehand +from typing_extensions import override + +from crawlee import service_locator +from crawlee._utils.context import ensure_context +from crawlee._utils.docs import docs_group +from crawlee.browsers._browser_plugin import BrowserPlugin +from crawlee.browsers._stagehand_browser_controller import StagehandBrowserController +from crawlee.browsers._types import StagehandOptions + +if TYPE_CHECKING: + from collections.abc import Mapping + from types import TracebackType + + from crawlee.browsers._browser_controller import BrowserController + from crawlee.browsers._types import BrowserType + +logger = getLogger(__name__) + + +@docs_group('Browser management') +class StagehandBrowserPlugin(BrowserPlugin): + """A plugin for managing Stagehand AI-powered browser automation. + + Stagehand creates and manages the browser instance (local binary or Browserbase cloud). + Playwright then connects to it via CDP, enabling both standard Playwright automation + and AI-powered operations in the same crawling context. 
+ + Only Chromium is supported because Stagehand relies on the Chrome DevTools Protocol. + """ + + AUTOMATION_LIBRARY = 'stagehand' + + def __init__( + self, + *, + stagehand_options: StagehandOptions | None = None, + browser_new_context_options: dict[str, Any] | None = None, + max_open_pages_per_browser: int = 20, + local_ready_timeout_s: float = 30.0, + ) -> None: + """Initialize a new instance. + + Args: + stagehand_options: Stagehand-specific configuration. Defaults to + ``StagehandOptions()`` if not provided. + browser_new_context_options: Options passed to Playwright's + ``browser.new_context`` after connecting via CDP. Refer to the + Playwright documentation for available options: + https://playwright.dev/python/docs/api/class-browser#browser-new-context. + max_open_pages_per_browser: Maximum number of pages that can be open per browser. + local_ready_timeout_s: Seconds to wait for the local Stagehand binary to + become ready. Only relevant when ``env='LOCAL'``. + """ + opts = stagehand_options or StagehandOptions() + config = service_locator.get_configuration() + + self._opts = opts + self._browser_new_context_options = browser_new_context_options or {} + self._max_open_pages_per_browser = max_open_pages_per_browser + + # headless comes from Configuration, same as PlaywrightBrowserPlugin. + # chrome_path is resolved lazily in __aenter__ once Playwright is available. 
+ self._headless = config.headless + self._chrome_path: str | None = config.default_browser_path + + is_local = opts.env == 'LOCAL' + self._stagehand_init_kwargs: dict[str, Any] = { + 'server': 'local' if is_local else 'remote', + 'local_headless': self._headless, + 'local_ready_timeout_s': local_ready_timeout_s, + } + if is_local: + self._stagehand_init_kwargs['model_api_key'] = opts.api_key + else: + self._stagehand_init_kwargs['browserbase_api_key'] = opts.api_key + self._stagehand_init_kwargs['browserbase_project_id'] = opts.project_id + + # AsyncStagehand is created lazily in __aenter__ so that chrome_path + # can be resolved from playwright.chromium.executable_path if not set. + self._stagehand_context_manager: AsyncStagehand | None = None + self._stagehand_client: AsyncStagehand | None = None + + self._playwright_context_manager = async_playwright() + self._playwright: Playwright | None = None + + # Flag to indicate the context state. + self._active = False + + @property + @override + def active(self) -> bool: + return self._active + + @property + @override + def browser_type(self) -> BrowserType: + return 'chromium' + + @property + @override + def browser_launch_options(self) -> Mapping[str, Any]: + """Return an empty mapping. + + Browser launch is managed by Stagehand, not Playwright directly. + """ + return {} + + @property + @override + def browser_new_context_options(self) -> Mapping[str, Any]: + """Return the options for the ``browser.new_context`` method. + + These options are passed to Playwright's ``browser.new_context`` after + connecting to the Stagehand-managed browser via CDP. Refer to the Playwright + documentation for available options: + https://playwright.dev/python/docs/api/class-browser#browser-new-context. 
+ """ + return self._browser_new_context_options + + @property + @override + def max_open_pages_per_browser(self) -> int: + return self._max_open_pages_per_browser + + @override + async def __aenter__(self) -> StagehandBrowserPlugin: + if self._active: + raise RuntimeError(f'The {self.__class__.__name__} is already active.') + + self._active = True + self._playwright = await self._playwright_context_manager.__aenter__() + + # Resolve Chromium path from Playwright's own installation when not set + # explicitly via Configuration. The stagehand binary needs an explicit path. + if self._chrome_path is None and self._opts.env == 'LOCAL': + self._chrome_path = self._playwright.chromium.executable_path + self._stagehand_init_kwargs['local_chrome_path'] = self._chrome_path + logger.debug(f'Resolved Chromium path from Playwright: {self._chrome_path}') + + self._stagehand_context_manager = AsyncStagehand(**self._stagehand_init_kwargs) + self._stagehand_client = await self._stagehand_context_manager.__aenter__() + + return self + + @override + async def __aexit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: + if not self._active: + raise RuntimeError(f'The {self.__class__.__name__} is not active.') + + if self._stagehand_context_manager is not None: + await self._stagehand_context_manager.__aexit__(exc_type, exc_value, exc_traceback) + + await self._playwright_context_manager.__aexit__(exc_type, exc_value, exc_traceback) + + self._stagehand_context_manager = None + self._playwright_context_manager = async_playwright() + self._stagehand_client = None + self._playwright = None + self._active = False + + @override + @ensure_context + async def new_browser(self) -> BrowserController: + if not self._playwright or not self._stagehand_client: + raise RuntimeError(f'{self.__class__.__name__} is not initialized.') + + session = await 
self._stagehand_client.sessions.start(**self._build_session_kwargs()) + + cdp_url = session.data.cdp_url + if not cdp_url: + raise RuntimeError( + f'No cdp_url returned from Stagehand (env={self._opts.env!r}). ' + 'Cannot connect Playwright to the browser.' + ) + + browser = await self._playwright.chromium.connect_over_cdp(cdp_url) + + return StagehandBrowserController( + browser, + session, + max_open_pages_per_browser=self._max_open_pages_per_browser, + ) + + def _build_session_kwargs(self) -> dict[str, Any]: + """Build keyword arguments for ``sessions.start``.""" + opts = self._opts + + if opts.env == 'BROWSERBASE': + browser_param: dict[str, Any] = {'type': 'browserbase'} + else: + launch_options: dict[str, Any] = {'headless': self._headless} + browser_param = { + 'type': 'local', + 'launchOptions': launch_options, + } # , 'local_chrome_path': self._chrome_path} + + kwargs: dict[str, Any] = { + 'model_name': opts.model, + 'browser': browser_param, + 'verbose': opts.verbose, + 'self_heal': opts.self_heal, + } + + if opts.dom_settle_timeout_ms is not None: + kwargs['dom_settle_timeout_ms'] = opts.dom_settle_timeout_ms + if opts.system_prompt is not None: + kwargs['system_prompt'] = opts.system_prompt + + return kwargs diff --git a/src/crawlee/browsers/_types.py b/src/crawlee/browsers/_types.py index c5976b086a..083c542268 100644 --- a/src/crawlee/browsers/_types.py +++ b/src/crawlee/browsers/_types.py @@ -1,10 +1,22 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Any, Literal + +from playwright.async_api import Page if TYPE_CHECKING: - from playwright.async_api import Page + from stagehand import AsyncSession + from stagehand.types.session_act_params import SessionActParams + from stagehand.types.session_act_response import SessionActResponse + from stagehand.types.session_execute_params import SessionExecuteParams + from stagehand.types.session_execute_response 
import SessionExecuteResponse + from stagehand.types.session_extract_params import SessionExtractParams + from stagehand.types.session_extract_response import SessionExtractResponse + from stagehand.types.session_observe_params import SessionObserveParams + from stagehand.types.session_observe_response import SessionObserveResponse + from typing_extensions import Unpack + BrowserType = Literal['chromium', 'firefox', 'webkit', 'chrome'] @@ -16,3 +28,80 @@ class CrawleePage: id: str browser_type: BrowserType page: Page + + +@dataclass +class StagehandOptions: + env: Literal['LOCAL', 'BROWSERBASE'] = 'LOCAL' + api_key: str | None = None + project_id: str | None = None + model: str = 'openai/gpt-4.1-mini' + verbose: Literal[0, 1, 2] = 0 + self_heal: bool = True + dom_settle_timeout_ms: float | None = None + system_prompt: str | None = None + + +class StagehandPage(Page): + """A Playwright `Page` enhanced with Stagehand AI methods. + + Wraps a Playwright `Page` and an `AsyncSession`, proxying all standard Playwright + methods transparently while adding `act()`, `extract()`, and `observe()` AI operations + bound to the current page. + """ + + def __init__(self, page: Page, session: AsyncSession) -> None: + self._page = page + self._session = session + + def __getattr__(self, name: str) -> Any: + return getattr(self._page, name) + + async def act(self, **kwargs: Unpack[SessionActParams]) -> SessionActResponse: + """Perform an action on the page using natural language. + + Args: + **kwargs: Parameters passed to ``AsyncSession.act()``. + The most common is ``instruction`` — a natural language description + of the action to perform, e.g. ``instruction='click the login button'``. + + Returns: + The action result from Stagehand. + """ + return await self._session.act(page=self._page, **kwargs) + + async def observe(self, **kwargs: Unpack[SessionObserveParams]) -> SessionObserveResponse: + """Observe the page and get AI-suggested actions. 
+ + Args: + **kwargs: Parameters passed to ``AsyncSession.observe()``. + Optionally pass ``instruction`` to narrow the observation scope. + + Returns: + Observation result with suggested actions. + """ + return await self._session.observe(page=self._page, **kwargs) + + async def extract(self, **kwargs: Unpack[SessionExtractParams]) -> SessionExtractResponse: + """Extract structured data from the page using natural language. + + Args: + **kwargs: Parameters passed to ``AsyncSession.extract()``. + Common parameters: ``instruction`` and ``schema`` (JSON Schema dict). + + Returns: + Extracted data matching the requested schema. + """ + return await self._session.extract(page=self._page, **kwargs) + + async def execute(self, **kwargs: Unpack[SessionExecuteParams]) -> SessionExecuteResponse: + """Execute arbitrary code on the page via natural language instructions. + + Args: + **kwargs: Parameters passed to ``AsyncSession.execute()``. + Common parameters: ``instruction`` describing the code to execute. + + Returns: + The result of the executed code. 
+ """ + return await self._session.execute(page=self._page, **kwargs) diff --git a/src/crawlee/crawlers/_stagehand/__init__.py b/src/crawlee/crawlers/_stagehand/__init__.py new file mode 100644 index 0000000000..1199f0cf2b --- /dev/null +++ b/src/crawlee/crawlers/_stagehand/__init__.py @@ -0,0 +1,3 @@ +from crawlee.crawlers._stagehand._stagehand_crawler import StagehandCrawler + +__all__ = ['StagehandCrawler'] diff --git a/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py new file mode 100644 index 0000000000..b367cc727a --- /dev/null +++ b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +import warnings +from typing import TYPE_CHECKING, Any + +from crawlee._utils.docs import docs_group +from crawlee.browsers import BrowserPool +from crawlee.browsers._stagehand_browser_plugin import StagehandBrowserPlugin +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + +if TYPE_CHECKING: + from typing_extensions import Unpack + + from crawlee.browsers._types import StagehandOptions + from crawlee.crawlers._basic import BasicCrawlerOptions + from crawlee.statistics import StatisticsState + + +@docs_group('Crawlers') +class StagehandCrawler(PlaywrightCrawler): + """A web crawler that integrates Stagehand AI-powered browser automation with Crawlee. + + Extends `PlaywrightCrawler` with a `StagehandBrowserPlugin` that manages a Stagehand + session per browser instance. 
Each page in the crawling context is a `StagehandPage`, + which exposes AI methods alongside all standard Playwright `Page` methods: + + - `page.act(**kwargs)` — perform actions using natural language + - `page.extract(**kwargs)` — extract structured data with AI + - `page.observe(**kwargs)` — get AI-suggested actions on the page + - `page.execute(**kwargs)` — execute code on the page via natural language instructions + + ### Usage + + ```python + from crawlee.crawlers import PlaywrightCrawlingContext + from crawlee.crawlers._stagehand import StagehandCrawler + + crawler = StagehandCrawler() + + @crawler.router.default_handler + async def handler(context: PlaywrightCrawlingContext) -> None: + await context.page.act(instruction='Click the login button') + data = await context.page.extract(instruction='Get the page title') + await context.push_data(data) + + await crawler.run(['https://example.com']) + ``` + """ + + def __init__( + self, + *, + stagehand_options: StagehandOptions | None = None, + browser_pool: BrowserPool | None = None, + browser_new_context_options: dict[str, Any] | None = None, + max_open_pages_per_browser: int = 20, + **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]], + ) -> None: + """Initialize a new instance. + + Args: + stagehand_options: Stagehand-specific configuration (model, API key, env, etc.). + Ignored if `browser_pool` is provided. + browser_pool: A pre-configured `BrowserPool`. All plugins must be instances of + `StagehandBrowserPlugin` (or its subclasses). If omitted, a pool is created + automatically from `stagehand_options`. + browser_new_context_options: Options passed to Playwright's `browser.new_context` + after connecting via CDP. Ignored if `browser_pool` is provided. + max_open_pages_per_browser: Maximum pages open per browser instance. + Ignored if `browser_pool` is provided. + kwargs: Additional keyword arguments forwarded to `BasicCrawler`. 
+ """ + if browser_pool is not None: + self._validate_browser_pool(browser_pool) + if stagehand_options is not None: + warnings.warn( + '`stagehand_options` is ignored when `browser_pool` is provided.', + stacklevel=2, + ) + else: + browser_pool = BrowserPool( + plugins=[ + StagehandBrowserPlugin( + stagehand_options=stagehand_options, + browser_new_context_options=browser_new_context_options, + max_open_pages_per_browser=max_open_pages_per_browser, + ) + ] + ) + + super().__init__(browser_pool=browser_pool, **kwargs) + + @staticmethod + def _validate_browser_pool(pool: BrowserPool) -> None: + invalid = [p for p in pool.plugins if not isinstance(p, StagehandBrowserPlugin)] + if invalid: + raise ValueError( + f'All BrowserPool plugins must be StagehandBrowserPlugin instances. Invalid plugins: {invalid}' + ) diff --git a/uv.lock b/uv.lock index f7df250dc9..6f53d2c868 100644 --- a/uv.lock +++ b/uv.lock @@ -9,7 +9,7 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-04-15T07:01:49.228326682Z" +exclude-newer = "2026-04-18T21:01:24.3365857Z" exclude-newer-span = "PT24H" [[package]] @@ -873,6 +873,9 @@ sql-sqlite = [ { name = "aiosqlite" }, { name = "sqlalchemy", extra = ["asyncio"] }, ] +stragehard = [ + { name = "stagehand" }, +] [package.dev-dependencies] dev = [ @@ -948,13 +951,14 @@ requires-dist = [ { name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-mysql'", specifier = ">=2.0.0,<3.0.0" }, { name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-postgres'", specifier = ">=2.0.0,<3.0.0" }, { name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-sqlite'", specifier = ">=2.0.0,<3.0.0" }, + { name = "stagehand", marker = "extra == 'stragehard'", specifier = ">=3.19.0" }, { name = "tldextract", specifier = ">=5.1.0" }, { name = "typer", marker = "extra == 'cli'", specifier = ">=0.12.0" }, { name = "typing-extensions", specifier = ">=4.1.0" }, { name = "wrapt", marker = "extra == 'otel'", specifier = ">=1.17.0" }, { 
name = "yarl", specifier = ">=1.18.0" }, ] -provides-extras = ["all", "adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-postgres", "sql-sqlite", "sql-mysql", "redis"] +provides-extras = ["all", "adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-postgres", "stragehard", "sql-sqlite", "sql-mysql", "redis"] [package.metadata.requires-dev] dev = [ @@ -1148,6 +1152,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, ] +[[package]] +name = "distro" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, +] + [[package]] name = "docspec" version = "2.2.1" @@ -3704,6 +3717,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + [[package]] name = "sortedcontainers" version = "2.4.0" @@ -3787,6 +3809,26 @@ asyncio = [ { name = "greenlet" }, ] +[[package]] +name = "stagehand" +version = "3.19.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "httpx" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d9/f8/ccd2bb2758a4eaf0af3846e097ff206e0aa76c8d3b5aa2bded77fb47825e/stagehand-3.19.5.tar.gz", hash = "sha256:3cb8279ac82051e584b34d26e87dc764f0ccad766a01625198ca578eb35f0b6c", size = 281033, upload-time = "2026-04-03T20:21:09.792Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/6f/a47bad258bfafc193ebb8e0e8c440e8028c9ab28b54a333b46aa3c0cff53/stagehand-3.19.5-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:14f39a4f8d30d77c089166185c705f66aade25432b903a663a937b3747439c26", size = 34495874, upload-time = "2026-04-03T20:21:07.366Z" }, + { url = "https://files.pythonhosted.org/packages/72/f7/e39868903121f1a80ae6eda088383362cd2d3a578c04493a2f83c1aac1da/stagehand-3.19.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:80ed0d732cb9c3e952ad851e071dad5775a9ea88d2787c006289d61097fd2609", size = 33193535, upload-time = "2026-04-03T20:21:18.536Z" }, + { url = 
"https://files.pythonhosted.org/packages/c8/0b/35cb92bb53e9539c0147892dbd0a227b43bf0d8adcd0a8e867dc5f2bf7fd/stagehand-3.19.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:aa947a5f6241f5953ac238cd9b0ab72e0cb87f559f97e5ee875f83dbc0c351d1", size = 37273148, upload-time = "2026-04-03T20:21:11.939Z" }, + { url = "https://files.pythonhosted.org/packages/7c/c7/dccf63cba1941b5710dc9968218e2883a937cf6534d644bb0c5222d3f40a/stagehand-3.19.5-py3-none-win_amd64.whl", hash = "sha256:e37bf630b99b4a9b7d95f151c56b296940db88b3049b68f0abb56f9e31cc6095", size = 30758357, upload-time = "2026-04-03T20:21:15.121Z" }, +] + [[package]] name = "text-unidecode" version = "1.3"