diff --git a/README.md b/README.md index e945edf4..81af8ed6 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,8 @@ Below is a comprehensive table of all available tools, how to use them with an a | Tool | Agent Usage | Use Case | |------|-------------|----------| | a2a_client | `provider = A2AClientToolProvider(known_agent_urls=["http://localhost:9000"]); agent = Agent(tools=provider.tools)` | Discover and communicate with A2A-compliant agents, send messages between agents | +| apify_run_actor | `agent.tool.apify_run_actor(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run any Apify Actor with arbitrary input | +| apify_scrape_url | `agent.tool.apify_scrape_url(url="https://example.com")` | Scrape a URL and return its content as markdown | | file_read | `agent.tool.file_read(path="path/to/file.txt")` | Reading configuration files, parsing code files, loading datasets | | file_write | `agent.tool.file_write(path="path/to/file.txt", content="file content")` | Writing results to files, creating new files, saving output data | | editor | `agent.tool.editor(command="view", path="path/to/file.py")` | Advanced file operations like syntax highlighting, pattern replacement, and multi-file edits | @@ -960,6 +962,47 @@ result = agent.tool.mongodb_memory( ) ``` +### Apify + +```python +from strands import Agent +from strands_tools.apify import APIFY_CORE_TOOLS + +agent = Agent(tools=APIFY_CORE_TOOLS) + +# Scrape a single URL and get Markdown content +content = agent.tool.apify_scrape_url(url="https://example.com") + +# Run an Actor and get results in one step +result = agent.tool.apify_run_actor_and_get_dataset( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, + dataset_items_limit=50, +) + +# Run a saved task (pre-configured Actor with default inputs) +run_info = agent.tool.apify_run_task(task_id="user/my-task") + +# Run a task and get results in one step +result 
= agent.tool.apify_run_task_and_get_dataset( + task_id="user/my-task", + task_input={"query": "override default input"}, + dataset_items_limit=50, +) + +# Run an Actor (get metadata only) +run_info = agent.tool.apify_run_actor( + actor_id="apify/google-search-scraper", + run_input={"queries": "AI agent frameworks"}, +) + +# Fetch dataset items separately +items = agent.tool.apify_get_dataset_items( + dataset_id="abc123", + limit=100, +) +``` + ## 🌍 Environment Variables Configuration Agents Tools provides extensive customization through environment variables. This allows you to configure tool behavior without modifying code, making it ideal for different environments (development, testing, production). @@ -1068,6 +1111,12 @@ The Mem0 Memory Tool supports three different backend configurations: - If `NEPTUNE_ANALYTICS_GRAPH_IDENTIFIER` is set, the tool will configure Neptune Analytics as graph store to enhance memory search - LLM configuration applies to all backend modes and allows customization of the language model used for memory processing +#### Apify Tool + +| Environment Variable | Description | Default | +|----------------------|-------------|---------| +| APIFY_API_TOKEN | Apify API token for authentication (required) | None | + #### Bright Data Tool | Environment Variable | Description | Default | diff --git a/docs/apify_tool.md b/docs/apify_tool.md new file mode 100644 index 00000000..ec0a9238 --- /dev/null +++ b/docs/apify_tool.md @@ -0,0 +1,205 @@ +# Apify + +The Apify tools (`apify.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform — running any [Actor](https://apify.com/store) or [task](https://docs.apify.com/platform/actors/running/tasks) by ID, fetching dataset results, and scraping individual URLs. 
+ +## Installation + +```bash +pip install strands-agents-tools[apify] +``` + +## Configuration + +Set your Apify API token as an environment variable: + +```bash +export APIFY_API_TOKEN=apify_api_your_token_here +``` + +Get your token from [Apify Console](https://console.apify.com/account/integrations) → Settings → API & Integrations → Personal API tokens. + +## Usage + +Register all core tools at once: + +```python +from strands import Agent +from strands_tools.apify import APIFY_CORE_TOOLS + +agent = Agent(tools=APIFY_CORE_TOOLS) +``` + +Or pick individual tools: + +```python +from strands import Agent +from strands_tools import apify + +agent = Agent(tools=[ + apify.apify_run_actor, + apify.apify_scrape_url, +]) +``` + +### Scrape a URL + +The simplest way to extract content from any web page. Uses the [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor under the hood and returns the page content as Markdown: + +```python +content = agent.tool.apify_scrape_url(url="https://example.com") +``` + +### Run an Actor + +Execute any Actor from [Apify Store](https://apify.com/store) by its ID. The call blocks until the Actor run finishes or the timeout is reached: + +```python +result = agent.tool.apify_run_actor( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, + timeout_secs=300, +) +``` + +The result is a JSON string containing run metadata: `run_id`, `status`, `dataset_id`, `started_at`, and `finished_at`. + +### Run an Actor and Get Results + +Combine running an Actor and fetching its dataset results in a single call: + +```python +result = agent.tool.apify_run_actor_and_get_dataset( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, + dataset_items_limit=50, +) +``` + +### Run a task + +Execute a saved [Actor task](https://docs.apify.com/platform/actors/running/tasks) — a pre-configured Actor with preset inputs. 
Use this when a task has already been set up in Apify Console: + +```python +result = agent.tool.apify_run_task( + task_id="user~my-task", + task_input={"query": "override input"}, + timeout_secs=300, +) +``` + +The result is a JSON string containing run metadata: `run_id`, `status`, `dataset_id`, `started_at`, and `finished_at`. + +### Run a task and get results + +Combine running a task and fetching its dataset results in a single call: + +```python +result = agent.tool.apify_run_task_and_get_dataset( + task_id="user~my-task", + dataset_items_limit=50, +) +``` + +### Fetch dataset items + +Retrieve results from a dataset by its ID. Useful after running an Actor to get the structured results separately, or to access any existing dataset: + +```python +items = agent.tool.apify_get_dataset_items( + dataset_id="abc123", + limit=100, + offset=0, +) +``` + +## Tool Parameters + +### apify_scrape_url + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `url` | string | Yes | — | The URL to scrape | +| `timeout_secs` | int | No | 120 | Maximum time in seconds to wait for scraping to finish | +| `crawler_type` | string | No | `"cheerio"` | Crawler engine to use. One of `"cheerio"` (fastest, no JS rendering), `"playwright:adaptive"` (fast, renders JS if present), or `"playwright:firefox"` (reliable, renders JS, best at avoiding blocking but slower) | + +**Returns:** Markdown content of the scraped page as a plain string. 
+ +### apify_run_actor + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `actor_id` | string | Yes | — | Actor identifier (e.g., `apify/website-content-crawler`) | +| `run_input` | dict | No | None | JSON-serializable input for the Actor | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor run to finish | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor run (uses Actor default if not set) | +| `build` | string | No | None | Actor build tag or number to run a specific version (uses latest build if not set) | + +**Returns:** JSON string with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`. + +### apify_run_task + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `task_id` | string | Yes | — | Task identifier (e.g., `user~my-task` or a task ID) | +| `task_input` | dict | No | None | JSON-serializable input to override the task's default input | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the task run to finish | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the task run (uses task default if not set) | + +**Returns:** JSON string with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`. 
+ +### apify_run_task_and_get_dataset + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `task_id` | string | Yes | — | Task identifier (e.g., `user~my-task` or a task ID) | +| `task_input` | dict | No | None | JSON-serializable input to override the task's default input | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the task run to finish | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the task run (uses task default if not set) | +| `dataset_items_limit` | int | No | 100 | Maximum number of dataset items to return | +| `dataset_items_offset` | int | No | 0 | Number of dataset items to skip for pagination | + +**Returns:** JSON string with run metadata plus an `items` array containing the dataset results. + +### apify_get_dataset_items + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `dataset_id` | string | Yes | — | The Apify dataset ID to fetch items from | +| `limit` | int | No | 100 | Maximum number of items to return | +| `offset` | int | No | 0 | Number of items to skip for pagination | + +**Returns:** JSON string containing an array of dataset items. 
+ +### apify_run_actor_and_get_dataset + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `actor_id` | string | Yes | — | Actor identifier (e.g., `apify/website-content-crawler`) | +| `run_input` | dict | No | None | JSON-serializable input for the Actor | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor run to finish | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor run (uses Actor default if not set) | +| `build` | string | No | None | Actor build tag or number to run a specific version (uses latest build if not set) | +| `dataset_items_limit` | int | No | 100 | Maximum number of dataset items to return | +| `dataset_items_offset` | int | No | 0 | Number of dataset items to skip for pagination | + +**Returns:** JSON string with run metadata plus an `items` array containing the dataset results. + +## Troubleshooting + +| Error | Cause | Fix | +|-------|-------|-----| +| `APIFY_API_TOKEN environment variable is not set` | Token not configured | Set the `APIFY_API_TOKEN` environment variable | +| `apify-client package is required` | Optional dependency not installed | Run `pip install strands-agents-tools[apify]` | +| `Actor ... finished with status FAILED` | Actor execution error | Check Actor input parameters and run logs in [Apify Console](https://console.apify.com) | +| `Task ... finished with status FAILED` | Task execution error | Check task configuration and run logs in [Apify Console](https://console.apify.com) | +| `Actor/task ... finished with status TIMED-OUT` | Timeout too short for the workload | Increase the `timeout_secs` parameter | +| `Task ... 
returned no run data` | Task `call()` returned `None` (wait timeout) | Increase the `timeout_secs` parameter | +| `No content returned for URL` | Website Content Crawler returned empty results | Verify the URL is accessible and returns content | + +## References + +- [Strands Agents Tools](https://strandsagents.com/latest/user-guide/concepts/tools/tools_overview/) +- [Apify Platform](https://apify.com) +- [Apify API Documentation](https://docs.apify.com/api/v2) +- [Apify Store](https://apify.com/store) +- [Apify Python Client](https://docs.apify.com/api/client/python/docs) diff --git a/pyproject.toml b/pyproject.toml index bf00325f..93e05c6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,9 @@ Homepage = "https://github.com/strands-agents/tools" Documentation = "https://strandsagents.com/" [project.optional-dependencies] +apify = [ + "apify-client>=2.5.0,<3.0.0", +] build = [ "hatch>=1.16.5", ] @@ -122,7 +125,7 @@ mongodb-memory = [ ] [tool.hatch.envs.hatch-static-analysis] -features = ["mem0-memory", "local-chromium-browser", "agent-core-browser", "agent-core-code-interpreter", "a2a-client", "diagram", "rss", "use-computer", "twelvelabs", "elasticsearch-memory", "mongodb-memory"] +features = ["mem0-memory", "local-chromium-browser", "agent-core-browser", "agent-core-code-interpreter", "a2a-client", "diagram", "rss", "use-computer", "twelvelabs", "elasticsearch-memory", "mongodb-memory", "apify"] dependencies = [ "strands-agents>=1.0.0", "mypy>=0.981,<1.0.0", @@ -141,7 +144,7 @@ lint-check = [ lint-fix = ["ruff check --fix"] [tool.hatch.envs.hatch-test] -features = ["mem0-memory", "local-chromium-browser", "agent-core-browser", "agent-core-code-interpreter", "a2a-client", "diagram", "rss", "use-computer", "twelvelabs", "elasticsearch-memory", "mongodb-memory"] +features = ["mem0-memory", "local-chromium-browser", "agent-core-browser", "agent-core-code-interpreter", "a2a-client", "diagram", "rss", "use-computer", "twelvelabs", "elasticsearch-memory", 
"mongodb-memory", "apify"] extra-dependencies = [ "moto>=5.1.0,<6.0.0", "pytest>=8.0.0,<10.0.0", diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py new file mode 100644 index 00000000..19f8696a --- /dev/null +++ b/src/strands_tools/apify.py @@ -0,0 +1,671 @@ +"""Apify platform tools for Strands Agents. + + +Apify is the world's largest marketplace of tools for web scraping, crawling, data extraction, and web automation. +These tools are called Actors, serverless cloud programs that take JSON input and store results +in a dataset (structured, tabular output) or key-value store (files and unstructured data). +Get structured data from social media, e-commerce, search engines, maps, travel sites, or any other website. + +Available Tools: +--------------- +- apify_run_actor: Run any Apify Actor with custom input +- apify_get_dataset_items: Fetch items from an Apify dataset with pagination +- apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step +- apify_run_task: Run a saved Actor task with optional input overrides +- apify_run_task_and_get_dataset: Run a task and fetch results in one step +- apify_scrape_url: Scrape a single URL and return content as Markdown + +Setup Requirements: +------------------ +1. Create an Apify account at https://apify.com +2. Get your API token: Apify Console > Settings > API & Integrations > Personal API tokens +3. Install the optional dependency: pip install strands-agents-tools[apify] +4. 
Set the environment variable: + APIFY_API_TOKEN=your_api_token_here + +Usage Examples: +-------------- +Register all core tools at once via the preset list: + +```python +from strands import Agent +from strands_tools.apify import APIFY_CORE_TOOLS + +agent = Agent(tools=APIFY_CORE_TOOLS) +``` + +Or pick individual tools for a smaller LLM tool surface: + +```python +from strands import Agent +from strands_tools import apify + +agent = Agent(tools=[ + apify.apify_scrape_url, + apify.apify_run_actor, +]) + +# Scrape a single URL +content = agent.tool.apify_scrape_url(url="https://example.com") + +# Run an Actor +result = agent.tool.apify_run_actor( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, +) +``` +""" + +import json +import logging +import os +from typing import Any, Dict, List, Literal, Optional, get_args +from urllib.parse import urlparse + +from rich.panel import Panel +from rich.text import Text +from strands import tool + +from strands_tools.utils import console_util + +logger = logging.getLogger(__name__) +console = console_util.create() + +try: + from apify_client import ApifyClient + from apify_client.errors import ApifyApiError + + HAS_APIFY_CLIENT = True +except ImportError: + HAS_APIFY_CLIENT = False + +# Attribution header - lets Apify track usage originating from strands-agents (analytics only) +TRACKING_HEADER = {"x-apify-integration-platform": "strands-agents"} +ERROR_PANEL_TITLE = "[bold red]Apify Error[/bold red]" +DEFAULT_TIMEOUT_SECS = 300 +DEFAULT_SCRAPE_TIMEOUT_SECS = 120 +DEFAULT_DATASET_ITEMS_LIMIT = 100 + +WEBSITE_CONTENT_CRAWLER = "apify/website-content-crawler" +CrawlerType = Literal["playwright:adaptive", "playwright:firefox", "cheerio"] +WEBSITE_CONTENT_CRAWLER_TYPES = get_args(CrawlerType) + + +# --- Helper functions --- + + +def _check_dependency() -> None: + """Raise ImportError if apify-client is not installed.""" + if not HAS_APIFY_CLIENT: + raise ImportError("apify-client 
package is required. Install with: pip install strands-agents-tools[apify]") + + +def _format_error(e: Exception) -> str: + """Map exceptions to user-friendly error messages, with special handling for ApifyApiError.""" + if HAS_APIFY_CLIENT and isinstance(e, ApifyApiError): + status_code = getattr(e, "status_code", None) + msg = getattr(e, "message", str(e)) + match status_code: + case 400: + return f"Invalid request: {msg}" + case 401: + return "Authentication failed. Verify your APIFY_API_TOKEN is valid." + case 402: + return "Insufficient Apify plan credits or subscription limits exceeded." + case 404: + return f"Resource not found: {msg}" + case 408: + return f"Actor run timed out: {msg}" + case 429: + return ( + "Rate limit exceeded. The Apify client retries automatically; " + "if this persists, reduce request frequency." + ) + case None: + return f"Apify API error: {msg}" + case _: + return f"Apify API error ({status_code}): {msg}" + return str(e) + + +def _error_result(e: Exception, tool_name: str) -> Dict[str, Any]: + """Build a structured error response and display an error panel.""" + message = _format_error(e) + logger.error("%s failed: %s", tool_name, message) + console.print(Panel(Text(message, style="red"), title=ERROR_PANEL_TITLE, border_style="red")) + return {"status": "error", "content": [{"text": message}]} + + +def _success_result(text: str, panel_body: str, panel_title: str) -> Dict[str, Any]: + """Build a structured success response and display a success panel.""" + console.print(Panel(panel_body, title=f"[bold cyan]{panel_title}[/bold cyan]", border_style="green")) + return {"status": "success", "content": [{"text": text}]} + + +class ApifyToolClient: + """Helper class encapsulating Apify API interactions via apify-client.""" + + def __init__(self) -> None: + token = os.getenv("APIFY_API_TOKEN", "") + if not token: + raise ValueError( + "APIFY_API_TOKEN environment variable is not set. 
class ApifyToolClient:
    """Helper class encapsulating Apify API interactions via apify-client.

    Thin wrapper around :class:`ApifyClient` that adds input validation,
    run-status checking, and a flattened run-metadata dict shared by all
    tool functions in this module. Instantiating it requires the
    APIFY_API_TOKEN environment variable to be set.
    """

    def __init__(self) -> None:
        token = os.getenv("APIFY_API_TOKEN", "")
        if not token:
            raise ValueError(
                "APIFY_API_TOKEN environment variable is not set. "
                "Get your token at https://console.apify.com/account/integrations"
            )
        # NOTE(review): assumes the installed apify-client version accepts a
        # custom `headers` mapping — confirm against the pinned client version.
        self.client: "ApifyClient" = ApifyClient(token, headers=TRACKING_HEADER)

    @staticmethod
    def _check_run_status(actor_run: Dict[str, Any], label: str) -> None:
        """Raise RuntimeError if the Actor run did not succeed."""
        # Any non-SUCCEEDED terminal status (FAILED, TIMED-OUT, ABORTED, ...)
        # is surfaced as an error including the run ID for console lookup.
        status = actor_run.get("status", "UNKNOWN")
        if status != "SUCCEEDED":
            run_id = actor_run.get("id", "N/A")
            raise RuntimeError(f"{label} finished with status {status}. Run ID: {run_id}")

    @staticmethod
    def _validate_url(url: str) -> None:
        """Raise ValueError if the URL does not have a valid HTTP(S) scheme and domain."""
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https"):
            raise ValueError(f"Invalid URL scheme '{parsed.scheme}'. Only http and https URLs are supported.")
        if not parsed.netloc:
            raise ValueError(f"Invalid URL '{url}'. A domain is required.")

    @staticmethod
    def _validate_identifier(value: str, name: str) -> None:
        """Raise ValueError if a required string identifier is empty or whitespace-only."""
        if not value.strip():
            raise ValueError(f"'{name}' must be a non-empty string.")

    @staticmethod
    def _validate_positive(value: int, name: str) -> None:
        """Raise ValueError if the value is not a positive integer (> 0)."""
        if value <= 0:
            raise ValueError(f"'{name}' must be a positive integer, got {value}.")

    @staticmethod
    def _validate_non_negative(value: int, name: str) -> None:
        """Raise ValueError if the value is negative."""
        if value < 0:
            raise ValueError(f"'{name}' must be a non-negative integer, got {value}.")

    def run_actor(
        self,
        actor_id: str,
        run_input: Optional[Dict[str, Any]] = None,
        timeout_secs: int = DEFAULT_TIMEOUT_SECS,
        memory_mbytes: Optional[int] = None,
        build: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Run an Apify Actor synchronously and return run metadata.

        Raises:
            ValueError: If actor_id is blank or numeric arguments are invalid.
            RuntimeError: If the call returns no run data or the run did not succeed.
        """
        self._validate_identifier(actor_id, "actor_id")
        self._validate_positive(timeout_secs, "timeout_secs")
        if memory_mbytes is not None:
            self._validate_positive(memory_mbytes, "memory_mbytes")

        # Only pass optional kwargs that were actually supplied, so the Actor's
        # own defaults apply otherwise.
        call_kwargs: Dict[str, Any] = {
            "run_input": run_input if run_input is not None else {},
            "timeout_secs": timeout_secs,
            "logger": None,  # Suppress verbose apify-client logging not useful to end users
        }
        if memory_mbytes is not None:
            call_kwargs["memory_mbytes"] = memory_mbytes
        if build is not None:
            call_kwargs["build"] = build

        actor_run = self.client.actor(actor_id).call(**call_kwargs)
        if actor_run is None:
            raise RuntimeError(f"Actor {actor_id} returned no run data (possible wait timeout).")
        self._check_run_status(actor_run, f"Actor {actor_id}")

        # Flatten the Apify run object to the small metadata dict used by tools.
        return {
            "run_id": actor_run.get("id"),
            "status": actor_run.get("status"),
            "dataset_id": actor_run.get("defaultDatasetId"),
            "started_at": actor_run.get("startedAt"),
            "finished_at": actor_run.get("finishedAt"),
        }

    def get_dataset_items(
        self,
        dataset_id: str,
        limit: int = DEFAULT_DATASET_ITEMS_LIMIT,
        offset: int = 0,
    ) -> List[Dict[str, Any]]:
        """Fetch items from an Apify dataset.

        Raises:
            ValueError: If dataset_id is blank, limit is not positive, or offset is negative.
        """
        self._validate_identifier(dataset_id, "dataset_id")
        self._validate_positive(limit, "limit")
        self._validate_non_negative(offset, "offset")

        result = self.client.dataset(dataset_id).list_items(limit=limit, offset=offset)
        return list(result.items)

    def run_actor_and_get_dataset(
        self,
        actor_id: str,
        run_input: Optional[Dict[str, Any]] = None,
        timeout_secs: int = DEFAULT_TIMEOUT_SECS,
        memory_mbytes: Optional[int] = None,
        build: Optional[str] = None,
        dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT,
        dataset_items_offset: int = 0,
    ) -> Dict[str, Any]:
        """Run an Actor synchronously, then fetch its default dataset items.

        Returns the run metadata dict from run_actor() with an extra "items" key.
        """
        # Validate dataset paging args up front so a bad limit/offset fails
        # before spending time (and credits) on the Actor run itself.
        self._validate_positive(dataset_items_limit, "dataset_items_limit")
        self._validate_non_negative(dataset_items_offset, "dataset_items_offset")

        run_metadata = self.run_actor(
            actor_id=actor_id,
            run_input=run_input,
            timeout_secs=timeout_secs,
            memory_mbytes=memory_mbytes,
            build=build,
        )
        dataset_id = run_metadata["dataset_id"]
        if not dataset_id:
            raise RuntimeError(f"Actor {actor_id} run has no default dataset.")
        items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit, offset=dataset_items_offset)
        return {**run_metadata, "items": items}

    def run_task(
        self,
        task_id: str,
        task_input: Optional[Dict[str, Any]] = None,
        timeout_secs: int = DEFAULT_TIMEOUT_SECS,
        memory_mbytes: Optional[int] = None,
    ) -> Dict[str, Any]:
        """Run an Apify task synchronously and return run metadata.

        Raises:
            ValueError: If task_id is blank or numeric arguments are invalid.
            RuntimeError: If the call returns no run data or the run did not succeed.
        """
        self._validate_identifier(task_id, "task_id")
        self._validate_positive(timeout_secs, "timeout_secs")
        if memory_mbytes is not None:
            self._validate_positive(memory_mbytes, "memory_mbytes")

        # NOTE(review): run_actor() passes logger=None to suppress client log
        # streaming; this call does not — confirm whether TaskClient.call
        # supports a `logger` kwarg in the pinned apify-client version.
        call_kwargs: Dict[str, Any] = {"timeout_secs": timeout_secs}
        if task_input is not None:
            call_kwargs["task_input"] = task_input
        if memory_mbytes is not None:
            call_kwargs["memory_mbytes"] = memory_mbytes

        task_run = self.client.task(task_id).call(**call_kwargs)
        if task_run is None:
            raise RuntimeError(f"Task {task_id} returned no run data (possible wait timeout).")
        self._check_run_status(task_run, f"Task {task_id}")

        return {
            "run_id": task_run.get("id"),
            "status": task_run.get("status"),
            "dataset_id": task_run.get("defaultDatasetId"),
            "started_at": task_run.get("startedAt"),
            "finished_at": task_run.get("finishedAt"),
        }

    def run_task_and_get_dataset(
        self,
        task_id: str,
        task_input: Optional[Dict[str, Any]] = None,
        timeout_secs: int = DEFAULT_TIMEOUT_SECS,
        memory_mbytes: Optional[int] = None,
        dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT,
        dataset_items_offset: int = 0,
    ) -> Dict[str, Any]:
        """Run a task synchronously, then fetch its default dataset items.

        Returns the run metadata dict from run_task() with an extra "items" key.
        """
        self._validate_positive(dataset_items_limit, "dataset_items_limit")
        self._validate_non_negative(dataset_items_offset, "dataset_items_offset")

        run_metadata = self.run_task(
            task_id=task_id,
            task_input=task_input,
            timeout_secs=timeout_secs,
            memory_mbytes=memory_mbytes,
        )
        dataset_id = run_metadata["dataset_id"]
        if not dataset_id:
            raise RuntimeError(f"Task {task_id} run has no default dataset.")
        items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit, offset=dataset_items_offset)
        return {**run_metadata, "items": items}

    def scrape_url(
        self,
        url: str,
        timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS,
        crawler_type: CrawlerType = "cheerio",
    ) -> str:
        """Scrape a single URL using Website Content Crawler and return Markdown.

        Raises:
            ValueError: If the URL or crawler_type is invalid.
            RuntimeError: If the crawl fails, has no dataset, or returns no content.
        """
        self._validate_url(url)
        self._validate_positive(timeout_secs, "timeout_secs")
        if crawler_type not in WEBSITE_CONTENT_CRAWLER_TYPES:
            raise ValueError(
                f"Invalid crawler_type '{crawler_type}'. Must be one of: {', '.join(WEBSITE_CONTENT_CRAWLER_TYPES)}."
            )

        # Pre-configured single-page crawl: exactly one start URL, one page.
        run_input: Dict[str, Any] = {
            "startUrls": [{"url": url}],
            "maxCrawlPages": 1,
            "crawlerType": crawler_type,
        }
        actor_run = self.client.actor(WEBSITE_CONTENT_CRAWLER).call(
            run_input=run_input,
            timeout_secs=timeout_secs,
            logger=None,  # Suppress verbose apify-client logging not useful to end users
        )
        if actor_run is None:
            raise RuntimeError("Website Content Crawler returned no run data (possible wait timeout).")
        self._check_run_status(actor_run, "Website Content Crawler")

        dataset_id = actor_run.get("defaultDatasetId")
        if not dataset_id:
            raise RuntimeError("Website Content Crawler run has no default dataset.")
        result = self.client.dataset(dataset_id).list_items(limit=1)
        items = list(result.items)

        if not items:
            raise RuntimeError(f"No content returned for URL: {url}")

        # Prefer the crawler's "markdown" field; `or` also falls back to the
        # plain "text" field when markdown is missing or empty.
        return str(items[0].get("markdown") or items[0].get("text", ""))
@tool
def apify_run_actor(
    actor_id: str,
    run_input: Optional[Dict[str, Any]] = None,
    timeout_secs: int = DEFAULT_TIMEOUT_SECS,
    memory_mbytes: Optional[int] = None,
    build: Optional[str] = None,
) -> Dict[str, Any]:
    """Run any Apify Actor and return the run metadata as JSON.

    Actors are serverless cloud programs on the Apify platform: they accept
    JSON input, perform a scraping or automation job, and persist output to a
    dataset. The run executes synchronously and only metadata is returned
    (run_id, status, dataset_id, timestamps). Use apify_run_actor_and_get_dataset
    to also fetch the output data in one call, or apify_scrape_url for quick
    single-URL extraction.

    Common Actors:
    - "apify/website-content-crawler" - scrape websites and extract content as Markdown
    - "apify/web-scraper" - general-purpose web scraper with JS rendering
    - "apify/google-search-scraper" - scrape Google search results

    Args:
        actor_id: Actor identifier in "username/actor-name" format,
            e.g. "apify/website-content-crawler". Find Actors at https://apify.com/store.
        run_input: JSON-serializable input for the Actor. Each Actor documents
            its own input schema on its Apify Store README.
        timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300.
        memory_mbytes: Memory allocation in MB for the Actor run (Actor default when omitted).
        build: Actor build tag or number to run a specific version (latest build when omitted).

    Returns:
        Dict with status and content containing run metadata: run_id, status,
        dataset_id, started_at, finished_at.
    """
    try:
        _check_dependency()
        run_meta = ApifyToolClient().run_actor(
            actor_id=actor_id,
            run_input=run_input,
            timeout_secs=timeout_secs,
            memory_mbytes=memory_mbytes,
            build=build,
        )
        summary = (
            f"[green]Actor run completed[/green]\n"
            f"Actor: {actor_id}\n"
            f"Run ID: {run_meta['run_id']}\n"
            f"Status: {run_meta['status']}\n"
            f"Dataset ID: {run_meta['dataset_id']}"
        )
        return _success_result(
            text=json.dumps(run_meta, indent=2, default=str),
            panel_body=summary,
            panel_title="Apify: Run Actor",
        )
    except Exception as exc:
        return _error_result(exc, "apify_run_actor")
+ """ + try: + _check_dependency() + client = ApifyToolClient() + items = client.get_dataset_items(dataset_id=dataset_id, limit=limit, offset=offset) + return _success_result( + text=json.dumps(items, indent=2, default=str), + panel_body=( + f"[green]Dataset items retrieved[/green]\nDataset ID: {dataset_id}\nItems returned: {len(items)}" + ), + panel_title="Apify: Dataset Items", + ) + except Exception as e: + return _error_result(e, "apify_get_dataset_items") + + +@tool +def apify_run_actor_and_get_dataset( + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + build: Optional[str] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, +) -> Dict[str, Any]: + """Run an Apify Actor and fetch its dataset results in one step. + + Convenience tool that combines running an Actor and fetching its default dataset + items into a single call. Use this when you want both the run metadata and the + result data without making two separate tool calls. + + Args: + actor_id: Actor identifier in "username/actor-name" format, + e.g. "apify/website-content-crawler". Find Actors at https://apify.com/store. + run_input: JSON-serializable input for the Actor. Each Actor defines its own + input schema - check the Actor README on Apify Store for required fields. + timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default `memory` value if not set. + build: Actor build tag or number to run a specific version. Uses latest build if not set. + dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. + dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. 
@tool
def apify_run_actor_and_get_dataset(
    actor_id: str,
    run_input: Optional[Dict[str, Any]] = None,
    timeout_secs: int = DEFAULT_TIMEOUT_SECS,
    memory_mbytes: Optional[int] = None,
    build: Optional[str] = None,
    dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT,
    dataset_items_offset: int = 0,
) -> Dict[str, Any]:
    """Run an Apify Actor and fetch its dataset results in one step.

    Convenience tool combining an Actor run with retrieval of its default
    dataset items, avoiding two separate tool calls when both the run metadata
    and the result data are wanted.

    Args:
        actor_id: Actor identifier in "username/actor-name" format,
            e.g. "apify/website-content-crawler". Find Actors at https://apify.com/store.
        run_input: JSON-serializable input for the Actor. Each Actor documents
            its own input schema on its Apify Store README.
        timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300.
        memory_mbytes: Memory allocation in MB for the Actor run (Actor default when omitted).
        build: Actor build tag or number to run a specific version (latest build when omitted).
        dataset_items_limit: Maximum number of dataset items to return. Defaults to 100.
        dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0.

    Returns:
        Dict with status and content containing run metadata (run_id, status,
        dataset_id, started_at, finished_at) plus an "items" array with the
        dataset results.
    """
    try:
        _check_dependency()
        outcome = ApifyToolClient().run_actor_and_get_dataset(
            actor_id=actor_id,
            run_input=run_input,
            timeout_secs=timeout_secs,
            memory_mbytes=memory_mbytes,
            build=build,
            dataset_items_limit=dataset_items_limit,
            dataset_items_offset=dataset_items_offset,
        )
        summary = (
            f"[green]Actor run completed with dataset[/green]\n"
            f"Actor: {actor_id}\n"
            f"Run ID: {outcome['run_id']}\n"
            f"Status: {outcome['status']}\n"
            f"Dataset ID: {outcome['dataset_id']}\n"
            f"Items returned: {len(outcome['items'])}"
        )
        return _success_result(
            text=json.dumps(outcome, indent=2, default=str),
            panel_body=summary,
            panel_title="Apify: Run Actor + Dataset",
        )
    except Exception as exc:
        return _error_result(exc, "apify_run_actor_and_get_dataset")
+ + Returns: + Dict with status and content containing run metadata: run_id, status, dataset_id, + started_at, finished_at. + """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_task( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Task run completed[/green]\n" + f"Task: {task_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}" + ), + panel_title="Apify: Run Task", + ) + except Exception as e: + return _error_result(e, "apify_run_task") + + +@tool +def apify_run_task_and_get_dataset( + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, +) -> Dict[str, Any]: + """Run a saved Apify task and fetch its dataset results in one step. + + Convenience tool that combines running a task and fetching its default dataset + items into a single call. Use this when you want both the run metadata and the + result data without making two separate tool calls. + + Args: + task_id: Task identifier in "username/task-name" format or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input fields. + timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the task run. Uses task default `memory` value if not set. + dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. + dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. 
+ + Returns: + Dict with status and content containing run metadata (run_id, status, dataset_id, + started_at, finished_at) plus an "items" array containing the dataset results. + """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_task_and_get_dataset( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + dataset_items_limit=dataset_items_limit, + dataset_items_offset=dataset_items_offset, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Task run completed with dataset[/green]\n" + f"Task: {task_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}" + ), + panel_title="Apify: Run Task + Dataset", + ) + except Exception as e: + return _error_result(e, "apify_run_task_and_get_dataset") + + +@tool +def apify_scrape_url( + url: str, + timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, + crawler_type: CrawlerType = "cheerio", +) -> Dict[str, Any]: + """Scrape a single URL and return its content as Markdown. + + Uses the Website Content Crawler Actor under the hood, pre-configured for + fast single-page scraping. This is the simplest way to extract readable content + from any web page — no Actor input schema needed. For multi-page crawls, use + apify_run_actor_and_get_dataset with "apify/website-content-crawler" directly. + + Args: + url: The URL to scrape, e.g. "https://example.com". + timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. + crawler_type: Crawler engine to use. One of: + - "cheerio" (default): Fastest, no JavaScript rendering. Best for static HTML. + - "playwright:adaptive": Renders JS only when needed. Good general-purpose choice. + - "playwright:firefox": Full JS rendering, best at bypassing anti-bot protection but slowest. 
+ + Returns: + Dict with status and content containing the Markdown content of the scraped page. + """ + try: + _check_dependency() + client = ApifyToolClient() + content = client.scrape_url(url=url, timeout_secs=timeout_secs, crawler_type=crawler_type) + return _success_result( + text=content, + panel_body=( + f"[green]URL scraped successfully[/green]\nURL: {url}\nContent length: {len(content)} characters" + ), + panel_title="Apify: Scrape URL", + ) + except Exception as e: + return _error_result(e, "apify_scrape_url") + + +APIFY_CORE_TOOLS = [ + apify_run_actor, + apify_get_dataset_items, + apify_run_actor_and_get_dataset, + apify_run_task, + apify_run_task_and_get_dataset, + apify_scrape_url, +] diff --git a/tests/test_apify.py b/tests/test_apify.py new file mode 100644 index 00000000..78f15694 --- /dev/null +++ b/tests/test_apify.py @@ -0,0 +1,778 @@ +"""Tests for the Apify tools.""" + +import json +from unittest.mock import MagicMock, patch + +import pytest + +from strands_tools import apify +from strands_tools.apify import ( + ApifyToolClient, + apify_get_dataset_items, + apify_run_actor, + apify_run_actor_and_get_dataset, + apify_run_task, + apify_run_task_and_get_dataset, + apify_scrape_url, +) + +MOCK_ACTOR_RUN = { + "id": "run-HG7ml5fB1hCp8YEBA", + "actId": "actor~my-scraper", + "userId": "user-abc123", + "startedAt": "2026-03-15T14:30:00.000Z", + "finishedAt": "2026-03-15T14:35:22.000Z", + "status": "SUCCEEDED", + "statusMessage": "Actor finished successfully", + "defaultDatasetId": "dataset-WkC9gct8rq1uR5vDZ", + "defaultKeyValueStoreId": "kvs-Xb3A8gct8rq1uR5vD", + "buildNumber": "1.2.3", +} + +MOCK_FAILED_RUN = { + **MOCK_ACTOR_RUN, + "status": "FAILED", + "statusMessage": "Actor failed with an error", +} + +MOCK_TIMED_OUT_RUN = { + **MOCK_ACTOR_RUN, + "status": "TIMED-OUT", + "statusMessage": "Actor run timed out", +} + +MOCK_DATASET_ITEMS = [ + {"url": "https://example.com/product/1", "title": "Widget A", "price": 19.99, "currency": "USD"}, + {"url": 
"https://example.com/product/2", "title": "Widget B", "price": 29.99, "currency": "USD"}, + {"url": "https://example.com/product/3", "title": "Widget C", "price": 39.99, "currency": "EUR"}, +] + +MOCK_SCRAPED_ITEM = { + "url": "https://example.com", + "markdown": "# Example Domain\n\nThis domain is for use in illustrative examples.", + "text": "Example Domain. This domain is for use in illustrative examples.", +} + + +def _make_apify_api_error(status_code: int, message: str) -> Exception: + """Create an ApifyApiError instance for testing without calling its real __init__.""" + from apify_client.errors import ApifyApiError + + error = ApifyApiError.__new__(ApifyApiError) + Exception.__init__(error, message) + error.status_code = status_code + error.message = message + return error + + +@pytest.fixture +def mock_apify_client(): + """Create a mock ApifyClient with pre-configured responses.""" + client = MagicMock() + + mock_actor = MagicMock() + mock_actor.call.return_value = MOCK_ACTOR_RUN + client.actor.return_value = mock_actor + + mock_task = MagicMock() + mock_task.call.return_value = MOCK_ACTOR_RUN + client.task.return_value = mock_task + + mock_dataset = MagicMock() + mock_list_result = MagicMock() + mock_list_result.items = MOCK_DATASET_ITEMS + mock_dataset.list_items.return_value = mock_list_result + client.dataset.return_value = mock_dataset + + return client + + +@pytest.fixture +def mock_apify_env(monkeypatch): + """Set required Apify environment variables.""" + monkeypatch.setenv("APIFY_API_TOKEN", "test-token-12345") + + +# --- Module import --- + + +def test_apify_module_is_importable(): + """Verify that the apify module can be imported from strands_tools.""" + assert apify is not None + assert apify.__name__ == "strands_tools.apify" + + +# --- ApifyToolClient --- + + +def test_client_missing_token(monkeypatch): + """ApifyToolClient raises ValueError when APIFY_API_TOKEN is not set.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + with 
pytest.raises(ValueError, match="APIFY_API_TOKEN"): + ApifyToolClient() + + +def test_client_uses_env_token(mock_apify_env): + """ApifyToolClient passes the env token to ApifyClient.""" + with patch("strands_tools.apify.ApifyClient") as MockClient: + ApifyToolClient() + MockClient.assert_called_once_with( + "test-token-12345", + headers={"x-apify-integration-platform": "strands-agents"}, + ) + + +# --- apify_run_actor --- + + +def test_run_actor_success(mock_apify_env, mock_apify_client): + """Successful Actor run returns structured result with run metadata.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper", run_input={"url": "https://example.com"}) + + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + assert data["status"] == "SUCCEEDED" + assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" + assert "started_at" in data + assert "finished_at" in data + mock_apify_client.actor.assert_called_once_with("actor/my-scraper") + + +def test_run_actor_default_input(mock_apify_env, mock_apify_client): + """Actor run defaults run_input to empty dict when not provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper") + + assert result["status"] == "success" + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs["run_input"] == {} + + +def test_run_actor_explicit_empty_input(mock_apify_env, mock_apify_client): + """Actor run passes through an explicitly empty dict instead of treating it as falsy.""" + empty_input: dict = {} + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper", run_input=empty_input) + + assert result["status"] == "success" + call_kwargs = 
mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs["run_input"] is empty_input + + +def test_run_actor_with_memory(mock_apify_env, mock_apify_client): + """Actor run passes memory_mbytes when provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_run_actor(actor_id="actor/my-scraper", memory_mbytes=512) + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs["memory_mbytes"] == 512 + + +def test_run_actor_failure(mock_apify_env, mock_apify_client): + """Actor run returns error dict when Actor fails.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] + + +def test_run_actor_timeout(mock_apify_env, mock_apify_client): + """Actor run returns error dict when Actor times out.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_TIMED_OUT_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper") + + assert result["status"] == "error" + assert "TIMED-OUT" in result["content"][0]["text"] + + +def test_run_actor_api_exception(mock_apify_env, mock_apify_client): + """Actor run returns error dict on API exceptions.""" + mock_apify_client.actor.return_value.call.side_effect = Exception("Connection failed") + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper") + + assert result["status"] == "error" + assert "Connection failed" in result["content"][0]["text"] + + +def test_run_actor_none_response(mock_apify_env, mock_apify_client): + """Actor run returns error dict when ActorClient.call() returns None.""" + 
mock_apify_client.actor.return_value.call.return_value = None + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper") + + assert result["status"] == "error" + assert "no run data" in result["content"][0]["text"] + + +def test_run_actor_apify_api_error_401(mock_apify_env, mock_apify_client): + """Actor run returns friendly message for 401 authentication errors.""" + error = _make_apify_api_error(401, "Unauthorized") + mock_apify_client.actor.return_value.call.side_effect = error + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper") + + assert result["status"] == "error" + assert "Authentication failed" in result["content"][0]["text"] + + +def test_run_actor_apify_api_error_404(mock_apify_env, mock_apify_client): + """Actor run returns friendly message for 404 not-found errors.""" + error = _make_apify_api_error(404, "Actor not found") + mock_apify_client.actor.return_value.call.side_effect = error + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/nonexistent") + + assert result["status"] == "error" + assert "Resource not found" in result["content"][0]["text"] + + +# --- apify_get_dataset_items --- + + +def test_get_dataset_items_success(mock_apify_env, mock_apify_client): + """Successful dataset retrieval returns structured result with items.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_get_dataset_items(dataset_id="dataset-WkC9gct8rq1uR5vDZ") + + assert result["status"] == "success" + items = json.loads(result["content"][0]["text"]) + assert len(items) == 3 + assert items[0]["title"] == "Widget A" + assert items[2]["currency"] == "EUR" + mock_apify_client.dataset.assert_called_once_with("dataset-WkC9gct8rq1uR5vDZ") + + +def 
test_get_dataset_items_with_pagination(mock_apify_env, mock_apify_client): + """dataset retrieval passes limit and offset.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_get_dataset_items(dataset_id="dataset-xyz", limit=50, offset=10) + + mock_apify_client.dataset.return_value.list_items.assert_called_once_with(limit=50, offset=10) + + +def test_get_dataset_items_empty(mock_apify_env, mock_apify_client): + """Empty dataset returns a structured result with empty JSON array.""" + mock_list_result = MagicMock() + mock_list_result.items = [] + mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_get_dataset_items(dataset_id="dataset-empty") + + assert result["status"] == "success" + items = json.loads(result["content"][0]["text"]) + assert items == [] + + +# --- apify_run_actor_and_get_dataset --- + + +def test_run_actor_and_get_dataset_success(mock_apify_env, mock_apify_client): + """Combined run + dataset fetch returns structured result with metadata and items.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor_and_get_dataset( + actor_id="actor/my-scraper", + run_input={"url": "https://example.com"}, + dataset_items_limit=50, + ) + + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + assert data["status"] == "SUCCEEDED" + assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" + assert len(data["items"]) == 3 + assert data["items"][0]["title"] == "Widget A" + + +def test_run_actor_and_get_dataset_no_dataset_id(mock_apify_env, mock_apify_client): + """Combined tool returns error when the Actor run has no default dataset.""" + run_no_dataset = {**MOCK_ACTOR_RUN, "defaultDatasetId": None} + mock_apify_client.actor.return_value.call.return_value = 
def test_run_actor_and_get_dataset_actor_failure(mock_apify_env, mock_apify_client):
    """Combined tool returns error dict when the Actor fails."""
    mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN

    with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client):
        result = apify_run_actor_and_get_dataset(actor_id="actor/my-scraper")

    assert result["status"] == "error"
    assert "FAILED" in result["content"][0]["text"]


# --- apify_run_task ---


def test_run_task_success(mock_apify_env, mock_apify_client):
    """Successful task run returns structured result with run metadata."""
    with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client):
        result = apify_run_task(task_id="user~my-task", task_input={"query": "test"})

    assert result["status"] == "success"
    data = json.loads(result["content"][0]["text"])
    assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA"
    assert data["status"] == "SUCCEEDED"
    assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ"
    mock_apify_client.task.assert_called_once_with("user~my-task")


def test_run_task_no_input(mock_apify_env, mock_apify_client):
    """Task run omits task_input kwarg when not provided."""
    with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client):
        result = apify_run_task(task_id="user~my-task")

    assert result["status"] == "success"
    call_kwargs = mock_apify_client.task.return_value.call.call_args.kwargs
    assert "task_input" not in call_kwargs


def test_run_task_with_memory(mock_apify_env, mock_apify_client):
    """Task run passes memory_mbytes when provided."""
    with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client):
        apify_run_task(task_id="user~my-task", memory_mbytes=1024)

    call_kwargs = mock_apify_client.task.return_value.call.call_args.kwargs
    assert call_kwargs["memory_mbytes"] == 1024


def test_run_task_failure(mock_apify_env, mock_apify_client):
    """Task run returns error dict when task fails."""
    mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN

    with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client):
        result = apify_run_task(task_id="user~my-task")

    assert result["status"] == "error"
    assert "FAILED" in result["content"][0]["text"]


def test_run_task_none_response(mock_apify_env, mock_apify_client):
    """Task run returns error dict when TaskClient.call() returns None."""
    mock_apify_client.task.return_value.call.return_value = None

    with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client):
        result = apify_run_task(task_id="user~my-task")

    assert result["status"] == "error"
    assert "no run data" in result["content"][0]["text"]


def test_run_task_apify_api_error_401(mock_apify_env, mock_apify_client):
    """Task run returns friendly message for 401 authentication errors."""
    error = _make_apify_api_error(401, "Unauthorized")
    mock_apify_client.task.return_value.call.side_effect = error

    with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client):
        result = apify_run_task(task_id="user~my-task")

    assert result["status"] == "error"
    assert "Authentication failed" in result["content"][0]["text"]


# --- apify_run_task_and_get_dataset ---


def test_run_task_and_get_dataset_success(mock_apify_env, mock_apify_client):
    """Combined task run + dataset fetch returns structured result with metadata and items."""
    with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client):
        result = apify_run_task_and_get_dataset(
            task_id="user~my-task",
            task_input={"query": "test"},
            dataset_items_limit=50,
        )

    assert result["status"] == "success"
    data = json.loads(result["content"][0]["text"])
    assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA"
    assert len(data["items"]) == 3
    assert data["items"][0]["title"] == "Widget A"


def test_run_task_and_get_dataset_no_dataset_id(mock_apify_env, mock_apify_client):
    """Combined task tool returns error when the task run has no default dataset."""
    run_no_dataset = {**MOCK_ACTOR_RUN, "defaultDatasetId": None}
    mock_apify_client.task.return_value.call.return_value = run_no_dataset

    with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client):
        result = apify_run_task_and_get_dataset(task_id="user~my-task")

    assert result["status"] == "error"
    assert "no default dataset" in result["content"][0]["text"]


def test_run_task_and_get_dataset_task_failure(mock_apify_env, mock_apify_client):
    """Combined task tool returns error dict when the task fails."""
    mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN

    with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client):
        result = apify_run_task_and_get_dataset(task_id="user~my-task")

    assert result["status"] == "error"
    assert "FAILED" in result["content"][0]["text"]


# --- apify_scrape_url ---


def test_scrape_url_success(mock_apify_env, mock_apify_client):
    """Scrape URL returns structured result with markdown content."""
    mock_list_result = MagicMock()
    mock_list_result.items = [MOCK_SCRAPED_ITEM]
    mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result

    with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client):
        result = apify_scrape_url(url="https://example.com")

    assert result["status"] == "success"
    assert "Example Domain" in result["content"][0]["text"]
    mock_apify_client.actor.assert_called_once_with("apify/website-content-crawler")


def test_scrape_url_none_response(mock_apify_env, mock_apify_client):
    """Scrape URL returns error dict when ActorClient.call() returns None."""
    mock_apify_client.actor.return_value.call.return_value = None

    with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client):
        result = apify_scrape_url(url="https://example.com")

    assert result["status"] == "error"
    assert "no run data" in result["content"][0]["text"]


def test_scrape_url_no_dataset_id(mock_apify_env, mock_apify_client):
    """Scrape URL returns error when the crawler run has no default dataset."""
    run_no_dataset = {**MOCK_ACTOR_RUN, "defaultDatasetId": None}
    mock_apify_client.actor.return_value.call.return_value = run_no_dataset

    with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client):
        result = apify_scrape_url(url="https://example.com")

    assert result["status"] == "error"
    assert "no default dataset" in result["content"][0]["text"]


def test_scrape_url_no_content(mock_apify_env, mock_apify_client):
    """Scrape URL returns error dict when no content is returned."""
    mock_list_result = MagicMock()
    mock_list_result.items = []
    mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result

    with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client):
        result = apify_scrape_url(url="https://example.com")

    assert result["status"] == "error"
    assert "No content returned" in result["content"][0]["text"]


def test_scrape_url_crawler_failure(mock_apify_env, mock_apify_client):
    """Scrape URL returns error dict when the crawler Actor fails."""
    mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN

    with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client):
        result = apify_scrape_url(url="https://example.com")

    assert result["status"] == "error"
    assert "FAILED" in result["content"][0]["text"]


def test_scrape_url_falls_back_to_text(mock_apify_env, mock_apify_client):
    """Scrape URL falls back to text field when markdown is absent."""
    item_without_markdown = {"url": "https://example.com", "text": "Plain text content"}
    mock_list_result = MagicMock()
    mock_list_result.items = [item_without_markdown]
    mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result

    with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client):
        result = apify_scrape_url(url="https://example.com")

    assert result["status"] == "success"
    assert result["content"][0]["text"] == "Plain text content"


def test_scrape_url_invalid_url_scheme(mock_apify_env):
    """apify_scrape_url returns error for invalid URL scheme."""
    result = apify_scrape_url(url="ftp://example.com")

    assert result["status"] == "error"
    assert "Invalid URL scheme" in result["content"][0]["text"]


def test_scrape_url_missing_scheme(mock_apify_env):
    """apify_scrape_url returns error for URL without http/https scheme."""
    result = apify_scrape_url(url="example.com")

    assert result["status"] == "error"
    assert "Invalid URL scheme" in result["content"][0]["text"]


# --- Parameter validation ---


def test_run_actor_empty_actor_id(mock_apify_env):
    """apify_run_actor returns error for whitespace-only actor_id."""
    result = apify_run_actor(actor_id="   ")

    assert result["status"] == "error"
    assert "actor_id" in result["content"][0]["text"]


def test_run_actor_zero_timeout(mock_apify_env):
    """apify_run_actor returns error for non-positive timeout_secs."""
    result = apify_run_actor(actor_id="actor/valid", timeout_secs=0)

    assert result["status"] == "error"
    assert "timeout_secs" in result["content"][0]["text"]


def test_run_actor_negative_timeout(mock_apify_env):
    """apify_run_actor returns error for negative timeout_secs."""
    result = apify_run_actor(actor_id="actor/valid", timeout_secs=-5)

    assert result["status"] == "error"
    assert "timeout_secs" in result["content"][0]["text"]
def test_run_actor_zero_memory(mock_apify_env):
    """apify_run_actor returns error for non-positive memory_mbytes."""
    result = apify_run_actor(actor_id="actor/valid", memory_mbytes=0)

    assert result["status"] == "error"
    assert "memory_mbytes" in result["content"][0]["text"]


def test_run_task_empty_task_id(mock_apify_env):
    """apify_run_task returns error for whitespace-only task_id."""
    result = apify_run_task(task_id="   ")

    assert result["status"] == "error"
    assert "task_id" in result["content"][0]["text"]


def test_run_task_zero_timeout(mock_apify_env):
    """apify_run_task returns error for non-positive timeout_secs."""
    result = apify_run_task(task_id="user~my-task", timeout_secs=0)

    assert result["status"] == "error"
    assert "timeout_secs" in result["content"][0]["text"]


def test_run_task_zero_memory(mock_apify_env):
    """apify_run_task returns error for non-positive memory_mbytes."""
    result = apify_run_task(task_id="user~my-task", memory_mbytes=0)

    assert result["status"] == "error"
    assert "memory_mbytes" in result["content"][0]["text"]


def test_get_dataset_items_empty_dataset_id(mock_apify_env):
    """apify_get_dataset_items returns error for whitespace-only dataset_id."""
    result = apify_get_dataset_items(dataset_id="   ")

    assert result["status"] == "error"
    assert "dataset_id" in result["content"][0]["text"]


def test_get_dataset_items_zero_limit(mock_apify_env):
    """apify_get_dataset_items returns error for non-positive limit."""
    result = apify_get_dataset_items(dataset_id="dataset-abc", limit=0)

    assert result["status"] == "error"
    assert "limit" in result["content"][0]["text"]


def test_get_dataset_items_negative_offset(mock_apify_env):
    """apify_get_dataset_items returns error for negative offset."""
    result = apify_get_dataset_items(dataset_id="dataset-abc", offset=-1)

    assert result["status"] == "error"
    assert "offset" in result["content"][0]["text"]


def test_run_actor_and_get_dataset_zero_dataset_limit(mock_apify_env):
    """apify_run_actor_and_get_dataset returns error for non-positive dataset_items_limit."""
    result = apify_run_actor_and_get_dataset(actor_id="actor/valid", dataset_items_limit=0)

    assert result["status"] == "error"
    assert "dataset_items_limit" in result["content"][0]["text"]


def test_run_actor_and_get_dataset_negative_dataset_offset(mock_apify_env):
    """apify_run_actor_and_get_dataset returns error for negative dataset_items_offset."""
    result = apify_run_actor_and_get_dataset(actor_id="actor/valid", dataset_items_offset=-1)

    assert result["status"] == "error"
    assert "dataset_items_offset" in result["content"][0]["text"]


def test_run_task_and_get_dataset_zero_dataset_limit(mock_apify_env):
    """apify_run_task_and_get_dataset returns error for non-positive dataset_items_limit."""
    result = apify_run_task_and_get_dataset(task_id="user~my-task", dataset_items_limit=0)

    assert result["status"] == "error"
    assert "dataset_items_limit" in result["content"][0]["text"]


def test_run_task_and_get_dataset_negative_dataset_offset(mock_apify_env):
    """apify_run_task_and_get_dataset returns error for negative dataset_items_offset."""
    result = apify_run_task_and_get_dataset(task_id="user~my-task", dataset_items_offset=-1)

    assert result["status"] == "error"
    assert "dataset_items_offset" in result["content"][0]["text"]


def test_scrape_url_zero_timeout(mock_apify_env):
    """apify_scrape_url returns error for non-positive timeout_secs."""
    result = apify_scrape_url(url="https://example.com", timeout_secs=0)

    assert result["status"] == "error"
    assert "timeout_secs" in result["content"][0]["text"]


def test_scrape_url_invalid_crawler_type(mock_apify_env):
    """apify_scrape_url returns error for unsupported crawler_type."""
    result = apify_scrape_url(url="https://example.com", crawler_type="invalid")

    assert result["status"] == "error"
    assert "crawler_type" in result["content"][0]["text"]


def test_scrape_url_missing_domain(mock_apify_env):
    """apify_scrape_url returns error for URL with no domain."""
    result = apify_scrape_url(url="https://")

    assert result["status"] == "error"
    assert "domain" in result["content"][0]["text"].lower()


# --- Dependency guard ---


def test_missing_apify_client_run_actor(mock_apify_env):
    """apify_run_actor returns error dict when apify-client is not installed."""
    with patch("strands_tools.apify.HAS_APIFY_CLIENT", False):
        result = apify_run_actor(actor_id="test/actor")

    assert result["status"] == "error"
    assert "apify-client" in result["content"][0]["text"]


def test_missing_apify_client_get_dataset(mock_apify_env):
    """apify_get_dataset_items returns error dict when apify-client is not installed."""
    with patch("strands_tools.apify.HAS_APIFY_CLIENT", False):
        result = apify_get_dataset_items(dataset_id="dataset-123")

    assert result["status"] == "error"
    assert "apify-client" in result["content"][0]["text"]


def test_missing_apify_client_run_and_get(mock_apify_env):
    """apify_run_actor_and_get_dataset returns error dict when apify-client is not installed."""
    with patch("strands_tools.apify.HAS_APIFY_CLIENT", False):
        result = apify_run_actor_and_get_dataset(actor_id="test/actor")

    assert result["status"] == "error"
    assert "apify-client" in result["content"][0]["text"]


def test_missing_apify_client_run_task(mock_apify_env):
    """apify_run_task returns error dict when apify-client is not installed."""
    with patch("strands_tools.apify.HAS_APIFY_CLIENT", False):
        result = apify_run_task(task_id="user~my-task")

    assert result["status"] == "error"
    assert "apify-client" in result["content"][0]["text"]


def test_missing_apify_client_run_task_and_get(mock_apify_env):
    """apify_run_task_and_get_dataset returns error dict when apify-client is not installed."""
    with patch("strands_tools.apify.HAS_APIFY_CLIENT", False):
        result = apify_run_task_and_get_dataset(task_id="user~my-task")

    assert result["status"] == "error"
    assert "apify-client" in result["content"][0]["text"]


def test_missing_apify_client_scrape_url(mock_apify_env):
    """apify_scrape_url returns error dict when apify-client is not installed."""
    with patch("strands_tools.apify.HAS_APIFY_CLIENT", False):
        result = apify_scrape_url(url="https://example.com")

    assert result["status"] == "error"
    assert "apify-client" in result["content"][0]["text"]


# --- Missing token from tool entry points ---


def test_run_actor_missing_token(monkeypatch):
    """apify_run_actor returns error dict when APIFY_API_TOKEN is missing."""
    monkeypatch.delenv("APIFY_API_TOKEN", raising=False)
    result = apify_run_actor(actor_id="test/actor")

    assert result["status"] == "error"
    assert "APIFY_API_TOKEN" in result["content"][0]["text"]


def test_get_dataset_items_missing_token(monkeypatch):
    """apify_get_dataset_items returns error dict when APIFY_API_TOKEN is missing."""
    monkeypatch.delenv("APIFY_API_TOKEN", raising=False)
    result = apify_get_dataset_items(dataset_id="dataset-123")

    assert result["status"] == "error"
    assert "APIFY_API_TOKEN" in result["content"][0]["text"]


def test_run_actor_and_get_dataset_missing_token(monkeypatch):
    """apify_run_actor_and_get_dataset returns error dict when APIFY_API_TOKEN is missing."""
    monkeypatch.delenv("APIFY_API_TOKEN", raising=False)
    result = apify_run_actor_and_get_dataset(actor_id="test/actor")

    assert result["status"] == "error"
    assert "APIFY_API_TOKEN" in result["content"][0]["text"]


def test_run_task_missing_token(monkeypatch):
    """apify_run_task returns error dict when APIFY_API_TOKEN is missing."""
    monkeypatch.delenv("APIFY_API_TOKEN", raising=False)
    result = apify_run_task(task_id="user~my-task")

    assert result["status"] == "error"
    assert "APIFY_API_TOKEN" in result["content"][0]["text"]


def test_run_task_and_get_dataset_missing_token(monkeypatch):
    """apify_run_task_and_get_dataset returns error dict when APIFY_API_TOKEN is missing."""
    monkeypatch.delenv("APIFY_API_TOKEN", raising=False)
    result = apify_run_task_and_get_dataset(task_id="user~my-task")

    assert result["status"] == "error"
    assert "APIFY_API_TOKEN" in result["content"][0]["text"]


def test_scrape_url_missing_token(monkeypatch):
    """apify_scrape_url returns error dict when APIFY_API_TOKEN is missing."""
    monkeypatch.delenv("APIFY_API_TOKEN", raising=False)
    result = apify_scrape_url(url="https://example.com")

    assert result["status"] == "error"
    assert "APIFY_API_TOKEN" in result["content"][0]["text"]