From 6ff22559820ee06ddd4b9d97f4a616ad5aae7af7 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 18 Mar 2026 15:20:00 +0100 Subject: [PATCH 01/24] feat: implementing the first tool --- README.md | 4 + pyproject.toml | 7 +- src/strands_tools/apify.py | 377 +++++++++++++++++++++++++++++++++++++ tests/test_apify.py | 317 +++++++++++++++++++++++++++++++ 4 files changed, 703 insertions(+), 2 deletions(-) create mode 100644 src/strands_tools/apify.py create mode 100644 tests/test_apify.py diff --git a/README.md b/README.md index e945edf4..8ec23e98 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,10 @@ Below is a comprehensive table of all available tools, how to use them with an a | Tool | Agent Usage | Use Case | |------|-------------|----------| | a2a_client | `provider = A2AClientToolProvider(known_agent_urls=["http://localhost:9000"]); agent = Agent(tools=provider.tools)` | Discover and communicate with A2A-compliant agents, send messages between agents | +| apify_run_actor | `agent.tool.apify_run_actor(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run any Apify Actor by ID with arbitrary input | +| apify_get_dataset_items | `agent.tool.apify_get_dataset_items(dataset_id="abc123", limit=50)` | Fetch items from an Apify Dataset | +| apify_run_actor_and_get_dataset | `agent.tool.apify_run_actor_and_get_dataset(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run an Actor and fetch its Dataset results in one step | +| apify_scrape_url | `agent.tool.apify_scrape_url(url="https://example.com")` | Scrape a URL and return its content as markdown | | file_read | `agent.tool.file_read(path="path/to/file.txt")` | Reading configuration files, parsing code files, loading datasets | | file_write | `agent.tool.file_write(path="path/to/file.txt", content="file content")` | Writing results to files, creating new files, saving output data | | editor | 
`agent.tool.editor(command="view", path="path/to/file.py")` | Advanced file operations like syntax highlighting, pattern replacement, and multi-file edits | diff --git a/pyproject.toml b/pyproject.toml index bf00325f..de75e0be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,9 @@ Homepage = "https://github.com/strands-agents/tools" Documentation = "https://strandsagents.com/" [project.optional-dependencies] +apify = [ + "apify-client>=1.0.0", +] build = [ "hatch>=1.16.5", ] @@ -122,7 +125,7 @@ mongodb-memory = [ ] [tool.hatch.envs.hatch-static-analysis] -features = ["mem0-memory", "local-chromium-browser", "agent-core-browser", "agent-core-code-interpreter", "a2a-client", "diagram", "rss", "use-computer", "twelvelabs", "elasticsearch-memory", "mongodb-memory"] +features = ["mem0-memory", "local-chromium-browser", "agent-core-browser", "agent-core-code-interpreter", "a2a-client", "diagram", "rss", "use-computer", "twelvelabs", "elasticsearch-memory", "mongodb-memory", "apify"] dependencies = [ "strands-agents>=1.0.0", "mypy>=0.981,<1.0.0", @@ -141,7 +144,7 @@ lint-check = [ lint-fix = ["ruff check --fix"] [tool.hatch.envs.hatch-test] -features = ["mem0-memory", "local-chromium-browser", "agent-core-browser", "agent-core-code-interpreter", "a2a-client", "diagram", "rss", "use-computer", "twelvelabs", "elasticsearch-memory", "mongodb-memory"] +features = ["mem0-memory", "local-chromium-browser", "agent-core-browser", "agent-core-code-interpreter", "a2a-client", "diagram", "rss", "use-computer", "twelvelabs", "elasticsearch-memory", "mongodb-memory", "apify"] extra-dependencies = [ "moto>=5.1.0,<6.0.0", "pytest>=8.0.0,<10.0.0", diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py new file mode 100644 index 00000000..96e5f9b8 --- /dev/null +++ b/src/strands_tools/apify.py @@ -0,0 +1,377 @@ +"""Apify platform integration tool for Strands Agents. 
+ +Provides capabilities to run Apify Actors, retrieve Datasets, and scrape URLs +using the Apify platform programmatically. + +Available tools: +- apify_run_actor: Run any Apify Actor by ID with arbitrary input +- apify_get_dataset_items: Fetch items from an Apify Dataset +- apify_run_actor_and_get_dataset: Run an Actor and fetch its Dataset results in one step +- apify_scrape_url: Scrape a URL and return its content as markdown + +Setup Requirements: +------------------ +1. Create an Apify account at https://apify.com +2. Obtain your API token: Apify Console β†’ Settings β†’ API & Integrations β†’ Personal API tokens +3. Install the optional dependency: pip install -e ".[apify]" +4. Set the environment variable: + APIFY_API_TOKEN=your_api_token_here + +Usage with Strands Agent: +```python +from strands import Agent +from strands_tools import apify + +agent = Agent(tools=[ + apify.apify_run_actor, + apify.apify_get_dataset_items, + apify.apify_run_actor_and_get_dataset, + apify.apify_scrape_url, +]) + +# Run an Actor +result = agent.tool.apify_run_actor( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, +) + +# Scrape a single URL +content = agent.tool.apify_scrape_url(url="https://example.com") +``` + +!!!!!!!!!!!!! IMPORTANT: !!!!!!!!!!!!! + +Environment Variables: +- APIFY_API_TOKEN: Your Apify API token (required) + Obtain from https://console.apify.com/account/integrations + +Example .env configuration: + APIFY_API_TOKEN=apify_api_1a2B3cD4eF5gH6iJ7kL8m + +!!!!!!!!!!!!! IMPORTANT: !!!!!!!!!!!!! + +See the function docstrings for complete parameter documentation. 
+""" + +import json +import logging +import os +from typing import Any, Dict, List, Optional + +from rich.panel import Panel +from rich.text import Text +from strands import tool + +from strands_tools.utils import console_util + +logger = logging.getLogger(__name__) +console = console_util.create() + +try: + from apify_client import ApifyClient + + HAS_APIFY_CLIENT = True +except ImportError: + HAS_APIFY_CLIENT = False + +WEBSITE_CONTENT_CRAWLER = "apify/website-content-crawler" +TRACKING_HEADER = {"x-apify-integration-platform": "strands"} + + +def _check_dependency() -> None: + """Raise ImportError if apify-client is not installed.""" + if not HAS_APIFY_CLIENT: + raise ImportError("apify-client package is required. Install with: pip install strands-agents-tools[apify]") + + +class ApifyToolClient: + """Helper class encapsulating Apify API interactions via apify-client.""" + + def __init__(self) -> None: + token = os.getenv("APIFY_API_TOKEN", "") + if not token: + raise ValueError( + "APIFY_API_TOKEN environment variable is not set. " + "Get your token at https://console.apify.com/account/integrations" + ) + self.client: "ApifyClient" = ApifyClient(token, headers=TRACKING_HEADER) + + def run_actor( + self, + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = 300, + memory_mbytes: Optional[int] = None, + ) -> Dict[str, Any]: + """Run an Apify Actor synchronously and return run metadata.""" + call_kwargs: Dict[str, Any] = { + "run_input": run_input or {}, + "timeout_secs": timeout_secs, + } + if memory_mbytes is not None: + call_kwargs["memory_mbytes"] = memory_mbytes + + actor_run = self.client.actor(actor_id).call(**call_kwargs) + + status = actor_run.get("status", "UNKNOWN") + if status not in ("SUCCEEDED",): + raise RuntimeError(f"Actor {actor_id} finished with status {status}. 
Run ID: {actor_run.get('id', 'N/A')}") + + return { + "run_id": actor_run.get("id"), + "status": status, + "dataset_id": actor_run.get("defaultDatasetId"), + "started_at": actor_run.get("startedAt"), + "finished_at": actor_run.get("finishedAt"), + } + + def get_dataset_items( + self, + dataset_id: str, + limit: int = 100, + offset: int = 0, + ) -> List[Dict[str, Any]]: + """Fetch items from an Apify Dataset.""" + result = self.client.dataset(dataset_id).list_items(limit=limit, offset=offset) + return list(result.items) + + def run_actor_and_get_dataset( + self, + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = 300, + memory_mbytes: Optional[int] = None, + dataset_items_limit: int = 100, + ) -> Dict[str, Any]: + """Run an Actor synchronously, then fetch its default Dataset items.""" + run_metadata = self.run_actor( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + ) + dataset_id = run_metadata["dataset_id"] + items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit) + return {**run_metadata, "items": items} + + def scrape_url(self, url: str, timeout_secs: int = 120) -> str: + """Scrape a single URL using Website Content Crawler and return markdown.""" + run_input = { + "startUrls": [{"url": url}], + "maxCrawlPages": 1, + } + actor_run = self.client.actor(WEBSITE_CONTENT_CRAWLER).call( + run_input=run_input, + timeout_secs=timeout_secs, + ) + + status = actor_run.get("status", "UNKNOWN") + if status not in ("SUCCEEDED",): + raise RuntimeError( + f"Website Content Crawler finished with status {status}. 
Run ID: {actor_run.get('id', 'N/A')}" + ) + + dataset_id = actor_run.get("defaultDatasetId") + result = self.client.dataset(dataset_id).list_items(limit=1) + items = list(result.items) + + if not items: + raise RuntimeError(f"No content returned for URL: {url}") + + return str(items[0].get("markdown") or items[0].get("text", "")) + + +# --- Tool functions --- + + +@tool +def apify_run_actor( + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = 300, + memory_mbytes: Optional[int] = None, +) -> str: + """Run any Apify Actor by its ID or name and return the run metadata as JSON. + + Executes the Actor synchronously - blocks until the Actor Run finishes or the timeout + is reached. Use this when you need to run a specific Actor and then inspect or process + the results separately. + + Common Actors: + - "apify/website-content-crawler" - scrape websites and extract content + - "apify/web-scraper" - general-purpose web scraper + - "apify/google-search-scraper" - scrape Google search results + + Args: + actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". + run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. + timeout_secs: Maximum time in seconds to wait for the Actor Run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor Run. Uses Actor default if not set. + + Returns: + JSON string with run metadata: run_id, status, dataset_id, started_at, finished_at. 
+ """ + _check_dependency() + try: + client = ApifyToolClient() + result = client.run_actor( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + ) + panel = Panel( + f"[green]Actor Run completed[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}", + title="[bold cyan]Apify: Run Actor[/bold cyan]", + border_style="green", + ) + console.print(panel) + return json.dumps(result, indent=2, default=str) + except Exception as e: + error_panel = Panel( + Text(str(e), style="red"), + title="[bold red]Apify Error[/bold red]", + border_style="red", + ) + console.print(error_panel) + raise + + +@tool +def apify_get_dataset_items( + dataset_id: str, + limit: int = 100, + offset: int = 0, +) -> str: + """Fetch items from an existing Apify Dataset and return them as JSON. + + Use this after running an Actor to retrieve the structured results from its + default Dataset, or to access any Dataset by ID. + + Args: + dataset_id: The Apify Dataset ID to fetch items from. + limit: Maximum number of items to return. Defaults to 100. + offset: Number of items to skip for pagination. Defaults to 0. + + Returns: + JSON string containing an array of Dataset items. 
+ """ + _check_dependency() + try: + client = ApifyToolClient() + items = client.get_dataset_items(dataset_id=dataset_id, limit=limit, offset=offset) + panel = Panel( + f"[green]Dataset items retrieved[/green]\nDataset ID: {dataset_id}\nItems returned: {len(items)}", + title="[bold cyan]Apify: Dataset Items[/bold cyan]", + border_style="green", + ) + console.print(panel) + return json.dumps(items, indent=2, default=str) + except Exception as e: + error_panel = Panel( + Text(str(e), style="red"), + title="[bold red]Apify Error[/bold red]", + border_style="red", + ) + console.print(error_panel) + raise + + +@tool +def apify_run_actor_and_get_dataset( + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = 300, + memory_mbytes: Optional[int] = None, + dataset_items_limit: int = 100, +) -> str: + """Run an Apify Actor and fetch its Dataset results in one step. + + Convenience tool that combines running an Actor and fetching its default Dataset + items into a single call. Use this when you want both the run metadata and the + result data without making two separate tool calls. + + Args: + actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". + run_input: JSON-serializable input for the Actor. + timeout_secs: Maximum time in seconds to wait for the Actor Run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor Run. + dataset_items_limit: Maximum number of Dataset items to return. Defaults to 100. + + Returns: + JSON string with run metadata (run_id, status, dataset_id, started_at, finished_at) + plus an "items" array containing the Dataset results. 
+ """ + _check_dependency() + try: + client = ApifyToolClient() + result = client.run_actor_and_get_dataset( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + dataset_items_limit=dataset_items_limit, + ) + panel = Panel( + f"[green]Actor Run completed with dataset[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}", + title="[bold cyan]Apify: Run Actor + Dataset[/bold cyan]", + border_style="green", + ) + console.print(panel) + return json.dumps(result, indent=2, default=str) + except Exception as e: + error_panel = Panel( + Text(str(e), style="red"), + title="[bold red]Apify Error[/bold red]", + border_style="red", + ) + console.print(error_panel) + raise + + +@tool +def apify_scrape_url( + url: str, + timeout_secs: int = 120, +) -> str: + """Scrape a single URL and return its content as markdown. + + Uses the Apify Website Content Crawler Actor under the hood, pre-configured for + fast single-page scraping. This is the simplest way to extract readable content + from any web page. + + Args: + url: The URL to scrape, e.g. "https://example.com". + timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. + + Returns: + Markdown content of the scraped page as a plain string. 
+ """ + _check_dependency() + try: + client = ApifyToolClient() + content = client.scrape_url(url=url, timeout_secs=timeout_secs) + panel = Panel( + f"[green]URL scraped successfully[/green]\nURL: {url}\nContent length: {len(content)} characters", + title="[bold cyan]Apify: Scrape URL[/bold cyan]", + border_style="green", + ) + console.print(panel) + return content + except Exception as e: + error_panel = Panel( + Text(str(e), style="red"), + title="[bold red]Apify Error[/bold red]", + border_style="red", + ) + console.print(error_panel) + raise diff --git a/tests/test_apify.py b/tests/test_apify.py new file mode 100644 index 00000000..d8a3c5e4 --- /dev/null +++ b/tests/test_apify.py @@ -0,0 +1,317 @@ +"""Tests for the Apify tools.""" + +import json +from unittest.mock import MagicMock, patch + +import pytest + +from strands_tools import apify +from strands_tools.apify import ( + ApifyToolClient, + apify_get_dataset_items, + apify_run_actor, + apify_run_actor_and_get_dataset, + apify_scrape_url, +) + +MOCK_ACTOR_RUN = { + "id": "run-HG7ml5fB1hCp8YEBA", + "actId": "janedoe~my-scraper", + "userId": "user-abc123", + "startedAt": "2026-03-15T14:30:00.000Z", + "finishedAt": "2026-03-15T14:35:22.000Z", + "status": "SUCCEEDED", + "statusMessage": "Actor finished successfully", + "defaultDatasetId": "dataset-WkC9gct8rq1uR5vDZ", + "defaultKeyValueStoreId": "kvs-Xb3A8gct8rq1uR5vD", + "buildNumber": "1.2.3", +} + +MOCK_FAILED_RUN = { + **MOCK_ACTOR_RUN, + "status": "FAILED", + "statusMessage": "Actor failed with an error", +} + +MOCK_TIMED_OUT_RUN = { + **MOCK_ACTOR_RUN, + "status": "TIMED-OUT", + "statusMessage": "Actor run timed out", +} + +MOCK_DATASET_ITEMS = [ + {"url": "https://example.com/product/1", "title": "Widget A", "price": 19.99, "currency": "USD"}, + {"url": "https://example.com/product/2", "title": "Widget B", "price": 29.99, "currency": "USD"}, + {"url": "https://example.com/product/3", "title": "Widget C", "price": 39.99, "currency": "EUR"}, +] + 
+MOCK_SCRAPED_ITEM = { + "url": "https://example.com", + "markdown": "# Example Domain\n\nThis domain is for use in illustrative examples.", + "text": "Example Domain. This domain is for use in illustrative examples.", +} + + +@pytest.fixture +def mock_apify_client(): + """Create a mock ApifyClient with pre-configured responses.""" + client = MagicMock() + + mock_actor = MagicMock() + mock_actor.call.return_value = MOCK_ACTOR_RUN + client.actor.return_value = mock_actor + + mock_dataset = MagicMock() + mock_list_result = MagicMock() + mock_list_result.items = MOCK_DATASET_ITEMS + mock_dataset.list_items.return_value = mock_list_result + client.dataset.return_value = mock_dataset + + return client + + +@pytest.fixture +def mock_apify_env(monkeypatch): + """Set required Apify environment variables.""" + monkeypatch.setenv("APIFY_API_TOKEN", "test-token-12345") + + +# --- Module import --- + + +def test_apify_module_is_importable(): + """Verify that the apify tool module can be imported from strands_tools.""" + assert apify is not None + assert apify.__name__ == "strands_tools.apify" + + +# --- ApifyToolClient --- + + +def test_client_missing_token(monkeypatch): + """ApifyToolClient raises ValueError when APIFY_API_TOKEN is not set.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + with pytest.raises(ValueError, match="APIFY_API_TOKEN"): + ApifyToolClient() + + +def test_client_uses_env_token(mock_apify_env): + """ApifyToolClient passes the env token to ApifyClient.""" + with patch("strands_tools.apify.ApifyClient") as MockClient: + ApifyToolClient() + MockClient.assert_called_once_with("test-token-12345") + + +# --- apify_run_actor --- + + +def test_run_actor_success(mock_apify_env, mock_apify_client): + """Successful Actor Run returns JSON with run metadata.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="janedoe/my-scraper", run_input={"url": "https://example.com"}) + + data = 
json.loads(result) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + assert data["status"] == "SUCCEEDED" + assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" + assert "started_at" in data + assert "finished_at" in data + mock_apify_client.actor.assert_called_once_with("janedoe/my-scraper") + + +def test_run_actor_with_memory(mock_apify_env, mock_apify_client): + """Actor Run passes memory_mbytes when provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_run_actor(actor_id="janedoe/my-scraper", memory_mbytes=512) + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs["memory_mbytes"] == 512 + + +def test_run_actor_failure(mock_apify_env, mock_apify_client): + """Actor Run raises RuntimeError when Actor fails.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + with pytest.raises(RuntimeError, match="FAILED"): + apify_run_actor(actor_id="janedoe/my-scraper") + + +def test_run_actor_timeout(mock_apify_env, mock_apify_client): + """Actor Run raises RuntimeError when Actor times out.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_TIMED_OUT_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + with pytest.raises(RuntimeError, match="TIMED-OUT"): + apify_run_actor(actor_id="janedoe/my-scraper") + + +def test_run_actor_api_exception(mock_apify_env, mock_apify_client): + """Actor Run re-raises exceptions from the Apify client.""" + mock_apify_client.actor.return_value.call.side_effect = Exception("Connection failed") + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + with pytest.raises(Exception, match="Connection failed"): + apify_run_actor(actor_id="janedoe/my-scraper") + + +# --- apify_get_dataset_items --- + + +def test_get_dataset_items_success(mock_apify_env, 
mock_apify_client): + """Successful dataset retrieval returns JSON array of items.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_get_dataset_items(dataset_id="dataset-WkC9gct8rq1uR5vDZ") + + items = json.loads(result) + assert len(items) == 3 + assert items[0]["title"] == "Widget A" + assert items[2]["currency"] == "EUR" + mock_apify_client.dataset.assert_called_once_with("dataset-WkC9gct8rq1uR5vDZ") + + +def test_get_dataset_items_with_pagination(mock_apify_env, mock_apify_client): + """Dataset retrieval passes limit and offset.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_get_dataset_items(dataset_id="dataset-xyz", limit=50, offset=10) + + mock_apify_client.dataset.return_value.list_items.assert_called_once_with(limit=50, offset=10) + + +def test_get_dataset_items_empty(mock_apify_env, mock_apify_client): + """Empty dataset returns an empty JSON array.""" + mock_list_result = MagicMock() + mock_list_result.items = [] + mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_get_dataset_items(dataset_id="dataset-empty") + + items = json.loads(result) + assert items == [] + + +# --- apify_run_actor_and_get_dataset --- + + +def test_run_actor_and_get_dataset_success(mock_apify_env, mock_apify_client): + """Combined run + dataset fetch returns run metadata and items.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor_and_get_dataset( + actor_id="janedoe/my-scraper", + run_input={"url": "https://example.com"}, + dataset_items_limit=50, + ) + + data = json.loads(result) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + assert data["status"] == "SUCCEEDED" + assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" + assert len(data["items"]) == 3 + assert data["items"][0]["title"] == 
"Widget A" + + +def test_run_actor_and_get_dataset_actor_failure(mock_apify_env, mock_apify_client): + """Combined tool raises when the Actor fails.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + with pytest.raises(RuntimeError, match="FAILED"): + apify_run_actor_and_get_dataset(actor_id="janedoe/my-scraper") + + +# --- apify_scrape_url --- + + +def test_scrape_url_success(mock_apify_env, mock_apify_client): + """Scrape URL returns markdown content from the crawled page.""" + mock_list_result = MagicMock() + mock_list_result.items = [MOCK_SCRAPED_ITEM] + mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_scrape_url(url="https://example.com") + + assert "Example Domain" in result + mock_apify_client.actor.assert_called_once_with("apify/website-content-crawler") + + +def test_scrape_url_no_content(mock_apify_env, mock_apify_client): + """Scrape URL raises when no content is returned.""" + mock_list_result = MagicMock() + mock_list_result.items = [] + mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + with pytest.raises(RuntimeError, match="No content returned"): + apify_scrape_url(url="https://example.com") + + +def test_scrape_url_crawler_failure(mock_apify_env, mock_apify_client): + """Scrape URL raises when the crawler Actor fails.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + with pytest.raises(RuntimeError, match="FAILED"): + apify_scrape_url(url="https://example.com") + + +def test_scrape_url_falls_back_to_text(mock_apify_env, mock_apify_client): + """Scrape URL falls back to text field 
when markdown is absent.""" + item_without_markdown = {"url": "https://example.com", "text": "Plain text content"} + mock_list_result = MagicMock() + mock_list_result.items = [item_without_markdown] + mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_scrape_url(url="https://example.com") + + assert result == "Plain text content" + + +# --- Dependency guard --- + + +def test_missing_apify_client_run_actor(mock_apify_env): + """apify_run_actor raises ImportError when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + with pytest.raises(ImportError, match="apify-client"): + apify_run_actor(actor_id="test/actor") + + +def test_missing_apify_client_get_dataset(mock_apify_env): + """apify_get_dataset_items raises ImportError when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + with pytest.raises(ImportError, match="apify-client"): + apify_get_dataset_items(dataset_id="dataset-123") + + +def test_missing_apify_client_run_and_get(mock_apify_env): + """apify_run_actor_and_get_dataset raises ImportError when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + with pytest.raises(ImportError, match="apify-client"): + apify_run_actor_and_get_dataset(actor_id="test/actor") + + +def test_missing_apify_client_scrape_url(mock_apify_env): + """apify_scrape_url raises ImportError when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + with pytest.raises(ImportError, match="apify-client"): + apify_scrape_url(url="https://example.com") + + +# --- Missing token from tool entry points --- + + +def test_run_actor_missing_token(monkeypatch): + """apify_run_actor raises ValueError when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + with 
pytest.raises(ValueError, match="APIFY_API_TOKEN"): + apify_run_actor(actor_id="test/actor") + + +def test_scrape_url_missing_token(monkeypatch): + """apify_scrape_url raises ValueError when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + with pytest.raises(ValueError, match="APIFY_API_TOKEN"): + apify_scrape_url(url="https://example.com") From 5f01fa3d82966e63a83c4d7fcd15042ba03edc9d Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 18 Mar 2026 16:19:55 +0100 Subject: [PATCH 02/24] chore: renamed platform integration tag --- src/strands_tools/apify.py | 2 +- tests/test_apify.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 96e5f9b8..a43a7c85 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -75,7 +75,7 @@ HAS_APIFY_CLIENT = False WEBSITE_CONTENT_CRAWLER = "apify/website-content-crawler" -TRACKING_HEADER = {"x-apify-integration-platform": "strands"} +TRACKING_HEADER = {"x-apify-integration-platform": "strands-agents"} def _check_dependency() -> None: diff --git a/tests/test_apify.py b/tests/test_apify.py index d8a3c5e4..31644288 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -99,7 +99,10 @@ def test_client_uses_env_token(mock_apify_env): """ApifyToolClient passes the env token to ApifyClient.""" with patch("strands_tools.apify.ApifyClient") as MockClient: ApifyToolClient() - MockClient.assert_called_once_with("test-token-12345") + MockClient.assert_called_once_with( + "test-token-12345", + headers={"x-apify-integration-platform": "strands-agents"}, + ) # --- apify_run_actor --- From c685127f3f1019f7e2f03837c654ca7c13233f90 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 19 Mar 2026 14:34:23 +0100 Subject: [PATCH 03/24] feat: updading apify tool docs --- README.md | 43 +++++++++++ docs/apify_tool.md | 145 +++++++++++++++++++++++++++++++++++++ src/strands_tools/apify.py | 66 
++++------------- 3 files changed, 201 insertions(+), 53 deletions(-) create mode 100644 docs/apify_tool.md diff --git a/README.md b/README.md index 8ec23e98..457bf57b 100644 --- a/README.md +++ b/README.md @@ -964,6 +964,43 @@ result = agent.tool.mongodb_memory( ) ``` +### Apify Core Tools + +```python +from strands import Agent +from strands_tools import apify + +agent = Agent(tools=[ + apify.apify_run_actor, + apify.apify_get_dataset_items, + apify.apify_run_actor_and_get_dataset, + apify.apify_scrape_url, +]) + +# Scrape a single URL and get markdown content +content = agent.tool.apify_scrape_url(url="https://example.com") + +# Run an Actor and get results in one step +result = agent.tool.apify_run_actor_and_get_dataset( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, + dataset_items_limit=50, +) + +# Run an Actor (get metadata only) +run_info = agent.tool.apify_run_actor( + actor_id="apify/google-search-scraper", + run_input={"queries": "AI agent frameworks"}, +) + +# Fetch Dataset items separately +items = agent.tool.apify_get_dataset_items( + dataset_id="abc123", + limit=100, +) +``` + + ## 🌍 Environment Variables Configuration Agents Tools provides extensive customization through environment variables. This allows you to configure tool behavior without modifying code, making it ideal for different environments (development, testing, production). 
@@ -1072,6 +1109,12 @@ The Mem0 Memory Tool supports three different backend configurations: - If `NEPTUNE_ANALYTICS_GRAPH_IDENTIFIER` is set, the tool will configure Neptune Analytics as graph store to enhance memory search - LLM configuration applies to all backend modes and allows customization of the language model used for memory processing +#### Apify Tool + +| Environment Variable | Description | Default | +|----------------------|-------------|---------| +| APIFY_API_TOKEN | Apify API token for authentication (required) | None | + #### Bright Data Tool | Environment Variable | Description | Default | diff --git a/docs/apify_tool.md b/docs/apify_tool.md new file mode 100644 index 00000000..9c930e1c --- /dev/null +++ b/docs/apify_tool.md @@ -0,0 +1,145 @@ +# Apify Core Tools + +The Apify core tools (`apify.py`) provide the foundational building blocks for interacting with the [Apify](https://apify.com) platform from Strands Agents. These generic tools let you run any [Actor](https://apify.com/store) by ID, fetch Dataset results, and scrape individual URLs. + +For higher-level, domain-specific tools see: +- [Apify Social Media Tools](apify_social_media_tool.md) β€” simplified wrappers for Instagram, LinkedIn, Twitter/X, TikTok, and Facebook scrapers +- [Apify Search Tools](apify_search_tool.md) β€” simplified wrappers for Google Search, Google Maps, YouTube, web crawling, and e-commerce scrapers + +## Installation + +```bash +pip install strands-agents-tools[apify] +``` + +## Configuration + +Set your Apify API token as an environment variable: + +```bash +export APIFY_API_TOKEN=apify_api_your_token_here +``` + +Get your token from the [Apify Console](https://console.apify.com/account/integrations) β†’ Settings β†’ API & Integrations β†’ Personal API tokens. 
+ +## Usage + +```python +from strands import Agent +from strands_tools import apify + +agent = Agent(tools=[ + apify.apify_run_actor, + apify.apify_scrape_url, + apify.apify_get_dataset_items, + apify.apify_run_actor_and_get_dataset, +]) +``` + +### Scrape a URL + +The simplest way to extract content from any web page. Uses the [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor under the hood and returns the page content as Markdown: + +```python +content = agent.tool.apify_scrape_url(url="https://example.com") +``` + +### Run an Actor + +Execute any Actor from the [Apify Store](https://apify.com/store) by its ID. The call blocks until the Actor Run finishes or the timeout is reached: + +```python +result = agent.tool.apify_run_actor( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, + timeout_secs=300, +) +``` + +The result is a JSON string containing run metadata: `run_id`, `status`, `dataset_id`, `started_at`, and `finished_at`. + +### Run an Actor and Get Results + +Combine running an Actor and fetching its Dataset results in a single call: + +```python +result = agent.tool.apify_run_actor_and_get_dataset( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, + dataset_items_limit=50, +) +``` + +### Fetch Dataset Items + +Retrieve results from a Dataset by its ID. 
Useful after running an Actor to get the structured results separately, or to access any existing Dataset: + +```python +items = agent.tool.apify_get_dataset_items( + dataset_id="abc123", + limit=100, + offset=0, +) +``` + +## Tool Parameters + +### apify_scrape_url + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `url` | string | Yes | β€” | The URL to scrape | +| `timeout_secs` | int | No | 120 | Maximum time in seconds to wait for scraping to finish | + +**Returns:** Markdown content of the scraped page as a plain string. + +### apify_run_actor + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `actor_id` | string | Yes | β€” | Actor identifier (e.g., `apify/website-content-crawler`) | +| `run_input` | dict | No | `{}` | JSON-serializable input for the Actor | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor Run to finish | +| `memory_mbytes` | int | No | Actor default | Memory allocation in MB for the Actor Run | + +**Returns:** JSON string with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`. + +### apify_get_dataset_items + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `dataset_id` | string | Yes | β€” | The Apify Dataset ID to fetch items from | +| `limit` | int | No | 100 | Maximum number of items to return | +| `offset` | int | No | 0 | Number of items to skip for pagination | + +**Returns:** JSON string containing an array of Dataset items. 
+ +### apify_run_actor_and_get_dataset + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `actor_id` | string | Yes | β€” | Actor identifier (e.g., `apify/website-content-crawler`) | +| `run_input` | dict | No | `{}` | JSON-serializable input for the Actor | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor Run to finish | +| `memory_mbytes` | int | No | Actor default | Memory allocation in MB for the Actor Run | +| `dataset_items_limit` | int | No | 100 | Maximum number of Dataset items to return | + +**Returns:** JSON string with run metadata plus an `items` array containing the Dataset results. + +## Troubleshooting + +| Error | Cause | Fix | +|-------|-------|-----| +| `APIFY_API_TOKEN environment variable is not set` | Token not configured | Set the `APIFY_API_TOKEN` environment variable | +| `apify-client package is required` | Optional dependency not installed | Run `pip install strands-agents-tools[apify]` | +| `Actor ... finished with status FAILED` | Actor execution error | Check Actor input parameters and run logs in the [Apify Console](https://console.apify.com) | +| `Actor ... 
finished with status TIMED-OUT` | Timeout too short for the workload | Increase the `timeout_secs` parameter | +| `No content returned for URL` | Website Content Crawler returned empty results | Verify the URL is accessible and returns content | + +## References + +- [Strands Agents Tools](https://strandsagents.com/latest/user-guide/concepts/tools/tools_overview/) +- [Apify Platform](https://apify.com) +- [Apify API Documentation](https://docs.apify.com/api/v2) +- [Apify Store](https://apify.com/store) +- [Apify Python Client](https://docs.apify.com/api/client/python/docs) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index a43a7c85..a272a8bf 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -1,56 +1,16 @@ -"""Apify platform integration tool for Strands Agents. - -Provides capabilities to run Apify Actors, retrieve Datasets, and scrape URLs -using the Apify platform programmatically. - -Available tools: -- apify_run_actor: Run any Apify Actor by ID with arbitrary input -- apify_get_dataset_items: Fetch items from an Apify Dataset -- apify_run_actor_and_get_dataset: Run an Actor and fetch its Dataset results in one step -- apify_scrape_url: Scrape a URL and return its content as markdown - -Setup Requirements: ------------------- -1. Create an Apify account at https://apify.com -2. Obtain your API token: Apify Console β†’ Settings β†’ API & Integrations β†’ Personal API tokens -3. Install the optional dependency: pip install -e ".[apify]" -4. 
Set the environment variable: - APIFY_API_TOKEN=your_api_token_here - -Usage with Strands Agent: -```python -from strands import Agent -from strands_tools import apify - -agent = Agent(tools=[ - apify.apify_run_actor, - apify.apify_get_dataset_items, - apify.apify_run_actor_and_get_dataset, - apify.apify_scrape_url, -]) - -# Run an Actor -result = agent.tool.apify_run_actor( - actor_id="apify/website-content-crawler", - run_input={"startUrls": [{"url": "https://example.com"}]}, -) - -# Scrape a single URL -content = agent.tool.apify_scrape_url(url="https://example.com") -``` - -!!!!!!!!!!!!! IMPORTANT: !!!!!!!!!!!!! - -Environment Variables: -- APIFY_API_TOKEN: Your Apify API token (required) - Obtain from https://console.apify.com/account/integrations - -Example .env configuration: - APIFY_API_TOKEN=apify_api_1a2B3cD4eF5gH6iJ7kL8m - -!!!!!!!!!!!!! IMPORTANT: !!!!!!!!!!!!! - -See the function docstrings for complete parameter documentation. +"""Core Apify platform tools for Strands Agents. + +Provides the foundational building blocks for interacting with the Apify platform: +run any Actor by ID, fetch Dataset results, and scrape individual URLs. +For domain-specific wrappers see apify_social_media.py and apify_search.py. + +Setup: + 1. Create an Apify account at https://apify.com + 2. Get your API token: Console > Settings > API & Integrations + 3. export APIFY_API_TOKEN=your_token + 4. pip install strands-agents-tools[apify] + +See docs/apify_tool.md for usage examples, parameter reference, and troubleshooting. 
""" import json From 4deefa82415691095b87df706c070afc0b451805 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Fri, 20 Mar 2026 13:44:40 +0100 Subject: [PATCH 04/24] feat: update Apify tool dependencies and enhance documentation Made-with: Cursor --- README.md | 3 +- docs/apify_tool.md | 16 +-- pyproject.toml | 2 +- src/strands_tools/apify.py | 263 +++++++++++++++++++++++-------------- tests/test_apify.py | 213 ++++++++++++++++++++++-------- 5 files changed, 333 insertions(+), 164 deletions(-) diff --git a/README.md b/README.md index 457bf57b..516b83ac 100644 --- a/README.md +++ b/README.md @@ -964,7 +964,7 @@ result = agent.tool.mongodb_memory( ) ``` -### Apify Core Tools +### Apify ```python from strands import Agent @@ -1000,7 +1000,6 @@ items = agent.tool.apify_get_dataset_items( ) ``` - ## 🌍 Environment Variables Configuration Agents Tools provides extensive customization through environment variables. This allows you to configure tool behavior without modifying code, making it ideal for different environments (development, testing, production). diff --git a/docs/apify_tool.md b/docs/apify_tool.md index 9c930e1c..d4cf3bfd 100644 --- a/docs/apify_tool.md +++ b/docs/apify_tool.md @@ -1,10 +1,6 @@ -# Apify Core Tools +# Apify -The Apify core tools (`apify.py`) provide the foundational building blocks for interacting with the [Apify](https://apify.com) platform from Strands Agents. These generic tools let you run any [Actor](https://apify.com/store) by ID, fetch Dataset results, and scrape individual URLs. 
- -For higher-level, domain-specific tools see: -- [Apify Social Media Tools](apify_social_media_tool.md) β€” simplified wrappers for Instagram, LinkedIn, Twitter/X, TikTok, and Facebook scrapers -- [Apify Search Tools](apify_search_tool.md) β€” simplified wrappers for Google Search, Google Maps, YouTube, web crawling, and e-commerce scrapers +The Apify tools (`apify.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform β€” running any [Actor](https://apify.com/store) by ID, fetching Dataset results, and scraping individual URLs. ## Installation @@ -98,9 +94,9 @@ items = agent.tool.apify_get_dataset_items( | Parameter | Type | Required | Default | Description | |-----------|------|----------|---------|-------------| | `actor_id` | string | Yes | β€” | Actor identifier (e.g., `apify/website-content-crawler`) | -| `run_input` | dict | No | `{}` | JSON-serializable input for the Actor | +| `run_input` | dict | No | None | JSON-serializable input for the Actor | | `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor Run to finish | -| `memory_mbytes` | int | No | Actor default | Memory allocation in MB for the Actor Run | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor Run (uses Actor default if not set) | **Returns:** JSON string with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`. 
@@ -119,9 +115,9 @@ items = agent.tool.apify_get_dataset_items( | Parameter | Type | Required | Default | Description | |-----------|------|----------|---------|-------------| | `actor_id` | string | Yes | β€” | Actor identifier (e.g., `apify/website-content-crawler`) | -| `run_input` | dict | No | `{}` | JSON-serializable input for the Actor | +| `run_input` | dict | No | None | JSON-serializable input for the Actor | | `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor Run to finish | -| `memory_mbytes` | int | No | Actor default | Memory allocation in MB for the Actor Run | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor Run (uses Actor default if not set) | | `dataset_items_limit` | int | No | 100 | Maximum number of Dataset items to return | **Returns:** JSON string with run metadata plus an `items` array containing the Dataset results. diff --git a/pyproject.toml b/pyproject.toml index de75e0be..93e05c6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ Documentation = "https://strandsagents.com/" [project.optional-dependencies] apify = [ - "apify-client>=1.0.0", + "apify-client>=2.5.0,<3.0.0", ] build = [ "hatch>=1.16.5", diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index a272a8bf..c85f3cc7 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -1,22 +1,64 @@ -"""Core Apify platform tools for Strands Agents. - -Provides the foundational building blocks for interacting with the Apify platform: -run any Actor by ID, fetch Dataset results, and scrape individual URLs. -For domain-specific wrappers see apify_social_media.py and apify_search.py. - -Setup: - 1. Create an Apify account at https://apify.com - 2. Get your API token: Console > Settings > API & Integrations - 3. export APIFY_API_TOKEN=your_token - 4. pip install strands-agents-tools[apify] - -See docs/apify_tool.md for usage examples, parameter reference, and troubleshooting. 
+"""Apify platform tools for Strands Agents. + +This module provides web scraping, data extraction, and automation capabilities +using the Apify platform. It lets you run any Actor by ID, fetch Dataset results, +and scrape individual URLs. + +Key Features: +------------ +1. Actor Execution: + β€’ apify_run_actor: Run any Apify Actor by ID with custom input + β€’ apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step + +2. Data Retrieval: + β€’ apify_get_dataset_items: Fetch items from an Apify Dataset with pagination + β€’ apify_scrape_url: Scrape a single URL and return content as Markdown + +3. Error Handling: + β€’ Graceful API error handling with descriptive messages + β€’ Dependency checking (apify-client optional install) + β€’ Timeout management for Actor Runs + +Setup Requirements: +------------------ +1. Create an Apify account at https://apify.com +2. Obtain your API token: Apify Console > Settings > API & Integrations > Personal API tokens +3. Install the optional dependency: pip install strands-agents-tools[apify] +4. 
Set the environment variable: + APIFY_API_TOKEN=your_api_token_here + +Example .env configuration: + APIFY_API_TOKEN=apify_api_1a2B3cD4eF5gH6iJ7kL8m + +Usage Examples: +-------------- +```python +from strands import Agent +from strands_tools import apify + +agent = Agent(tools=[ + apify.apify_run_actor, + apify.apify_get_dataset_items, + apify.apify_run_actor_and_get_dataset, + apify.apify_scrape_url, +]) + +# Scrape a single URL +content = agent.tool.apify_scrape_url(url="https://example.com") + +# Run an Actor +result = agent.tool.apify_run_actor( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, +) +``` """ import json import logging import os from typing import Any, Dict, List, Optional +from urllib.parse import urlparse from rich.panel import Panel from rich.text import Text @@ -29,6 +71,7 @@ try: from apify_client import ApifyClient + from apify_client.errors import ApifyApiError HAS_APIFY_CLIENT = True except ImportError: @@ -36,6 +79,7 @@ WEBSITE_CONTENT_CRAWLER = "apify/website-content-crawler" TRACKING_HEADER = {"x-apify-integration-platform": "strands-agents"} +ERROR_PANEL_TITLE = "[bold red]Apify Error[/bold red]" def _check_dependency() -> None: @@ -44,6 +88,49 @@ def _check_dependency() -> None: raise ImportError("apify-client package is required. Install with: pip install strands-agents-tools[apify]") +def _validate_url(url: str) -> None: + """Raise ValueError if the URL does not have a valid HTTP(S) scheme and domain.""" + parsed = urlparse(url) + if parsed.scheme not in ("http", "https"): + raise ValueError(f"Invalid URL scheme '{parsed.scheme}'. Only http and https URLs are supported.") + if not parsed.netloc: + raise ValueError(f"Invalid URL '{url}'. 
A domain is required.") + + +def _format_error(e: Exception) -> str: + """Map exceptions to user-friendly error messages, with special handling for ApifyApiError.""" + if HAS_APIFY_CLIENT and isinstance(e, ApifyApiError): + status_code = getattr(e, "status_code", None) + msg = getattr(e, "message", str(e)) + match status_code: + case 401: + return "Authentication failed. Verify your APIFY_API_TOKEN is valid." + case 404: + return f"Resource not found: {msg}" + case 429: + return ( + "Rate limit exceeded. The Apify client retries automatically; " + "if this persists, reduce request frequency." + ) + case _: + return f"Apify API error ({status_code}): {msg}" + return str(e) + + +def _error_result(e: Exception, tool_name: str) -> Dict[str, Any]: + """Build a structured error response and display an error panel.""" + message = _format_error(e) + logger.error("%s failed: %s", tool_name, message) + console.print(Panel(Text(message, style="red"), title=ERROR_PANEL_TITLE, border_style="red")) + return {"status": "error", "content": [{"text": message}]} + + +def _success_result(text: str, panel_body: str, panel_title: str) -> Dict[str, Any]: + """Build a structured success response and display a success panel.""" + console.print(Panel(panel_body, title=f"[bold cyan]{panel_title}[/bold cyan]", border_style="green")) + return {"status": "success", "content": [{"text": text}]} + + class ApifyToolClient: """Helper class encapsulating Apify API interactions via apify-client.""" @@ -56,6 +143,14 @@ def __init__(self) -> None: ) self.client: "ApifyClient" = ApifyClient(token, headers=TRACKING_HEADER) + @staticmethod + def _check_run_status(actor_run: Dict[str, Any], label: str) -> None: + """Raise RuntimeError if the Actor Run did not succeed.""" + status = actor_run.get("status", "UNKNOWN") + if status != "SUCCEEDED": + run_id = actor_run.get("id", "N/A") + raise RuntimeError(f"{label} finished with status {status}. 
Run ID: {run_id}") + def run_actor( self, actor_id: str, @@ -72,14 +167,11 @@ def run_actor( call_kwargs["memory_mbytes"] = memory_mbytes actor_run = self.client.actor(actor_id).call(**call_kwargs) - - status = actor_run.get("status", "UNKNOWN") - if status not in ("SUCCEEDED",): - raise RuntimeError(f"Actor {actor_id} finished with status {status}. Run ID: {actor_run.get('id', 'N/A')}") + self._check_run_status(actor_run, f"Actor {actor_id}") return { "run_id": actor_run.get("id"), - "status": status, + "status": actor_run.get("status"), "dataset_id": actor_run.get("defaultDatasetId"), "started_at": actor_run.get("startedAt"), "finished_at": actor_run.get("finishedAt"), @@ -116,7 +208,7 @@ def run_actor_and_get_dataset( def scrape_url(self, url: str, timeout_secs: int = 120) -> str: """Scrape a single URL using Website Content Crawler and return markdown.""" - run_input = { + run_input: Dict[str, Any] = { "startUrls": [{"url": url}], "maxCrawlPages": 1, } @@ -124,12 +216,7 @@ def scrape_url(self, url: str, timeout_secs: int = 120) -> str: run_input=run_input, timeout_secs=timeout_secs, ) - - status = actor_run.get("status", "UNKNOWN") - if status not in ("SUCCEEDED",): - raise RuntimeError( - f"Website Content Crawler finished with status {status}. Run ID: {actor_run.get('id', 'N/A')}" - ) + self._check_run_status(actor_run, "Website Content Crawler") dataset_id = actor_run.get("defaultDatasetId") result = self.client.dataset(dataset_id).list_items(limit=1) @@ -150,7 +237,7 @@ def apify_run_actor( run_input: Optional[Dict[str, Any]] = None, timeout_secs: int = 300, memory_mbytes: Optional[int] = None, -) -> str: +) -> Dict[str, Any]: """Run any Apify Actor by its ID or name and return the run metadata as JSON. Executes the Actor synchronously - blocks until the Actor Run finishes or the timeout @@ -169,10 +256,11 @@ def apify_run_actor( memory_mbytes: Memory allocation in MB for the Actor Run. Uses Actor default if not set. 
Returns: - JSON string with run metadata: run_id, status, dataset_id, started_at, finished_at. + Dict with status and content containing run metadata: run_id, status, dataset_id, + started_at, finished_at. """ - _check_dependency() try: + _check_dependency() client = ApifyToolClient() result = client.run_actor( actor_id=actor_id, @@ -180,25 +268,19 @@ def apify_run_actor( timeout_secs=timeout_secs, memory_mbytes=memory_mbytes, ) - panel = Panel( - f"[green]Actor Run completed[/green]\n" - f"Actor: {actor_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}", - title="[bold cyan]Apify: Run Actor[/bold cyan]", - border_style="green", + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Actor Run completed[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}" + ), + panel_title="Apify: Run Actor", ) - console.print(panel) - return json.dumps(result, indent=2, default=str) except Exception as e: - error_panel = Panel( - Text(str(e), style="red"), - title="[bold red]Apify Error[/bold red]", - border_style="red", - ) - console.print(error_panel) - raise + return _error_result(e, "apify_run_actor") @tool @@ -206,7 +288,7 @@ def apify_get_dataset_items( dataset_id: str, limit: int = 100, offset: int = 0, -) -> str: +) -> Dict[str, Any]: """Fetch items from an existing Apify Dataset and return them as JSON. Use this after running an Actor to retrieve the structured results from its @@ -218,27 +300,21 @@ def apify_get_dataset_items( offset: Number of items to skip for pagination. Defaults to 0. Returns: - JSON string containing an array of Dataset items. + Dict with status and content containing an array of Dataset items. 
""" - _check_dependency() try: + _check_dependency() client = ApifyToolClient() items = client.get_dataset_items(dataset_id=dataset_id, limit=limit, offset=offset) - panel = Panel( - f"[green]Dataset items retrieved[/green]\nDataset ID: {dataset_id}\nItems returned: {len(items)}", - title="[bold cyan]Apify: Dataset Items[/bold cyan]", - border_style="green", + return _success_result( + text=json.dumps(items, indent=2, default=str), + panel_body=( + f"[green]Dataset items retrieved[/green]\nDataset ID: {dataset_id}\nItems returned: {len(items)}" + ), + panel_title="Apify: Dataset Items", ) - console.print(panel) - return json.dumps(items, indent=2, default=str) except Exception as e: - error_panel = Panel( - Text(str(e), style="red"), - title="[bold red]Apify Error[/bold red]", - border_style="red", - ) - console.print(error_panel) - raise + return _error_result(e, "apify_get_dataset_items") @tool @@ -248,7 +324,7 @@ def apify_run_actor_and_get_dataset( timeout_secs: int = 300, memory_mbytes: Optional[int] = None, dataset_items_limit: int = 100, -) -> str: +) -> Dict[str, Any]: """Run an Apify Actor and fetch its Dataset results in one step. Convenience tool that combines running an Actor and fetching its default Dataset @@ -263,11 +339,11 @@ def apify_run_actor_and_get_dataset( dataset_items_limit: Maximum number of Dataset items to return. Defaults to 100. Returns: - JSON string with run metadata (run_id, status, dataset_id, started_at, finished_at) - plus an "items" array containing the Dataset results. + Dict with status and content containing run metadata (run_id, status, dataset_id, + started_at, finished_at) plus an "items" array containing the Dataset results. 
""" - _check_dependency() try: + _check_dependency() client = ApifyToolClient() result = client.run_actor_and_get_dataset( actor_id=actor_id, @@ -276,33 +352,27 @@ def apify_run_actor_and_get_dataset( memory_mbytes=memory_mbytes, dataset_items_limit=dataset_items_limit, ) - panel = Panel( - f"[green]Actor Run completed with dataset[/green]\n" - f"Actor: {actor_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}\n" - f"Items returned: {len(result['items'])}", - title="[bold cyan]Apify: Run Actor + Dataset[/bold cyan]", - border_style="green", + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Actor Run completed with dataset[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}" + ), + panel_title="Apify: Run Actor + Dataset", ) - console.print(panel) - return json.dumps(result, indent=2, default=str) except Exception as e: - error_panel = Panel( - Text(str(e), style="red"), - title="[bold red]Apify Error[/bold red]", - border_style="red", - ) - console.print(error_panel) - raise + return _error_result(e, "apify_run_actor_and_get_dataset") @tool def apify_scrape_url( url: str, timeout_secs: int = 120, -) -> str: +) -> Dict[str, Any]: """Scrape a single URL and return its content as markdown. Uses the Apify Website Content Crawler Actor under the hood, pre-configured for @@ -314,24 +384,19 @@ def apify_scrape_url( timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. Returns: - Markdown content of the scraped page as a plain string. + Dict with status and content containing the markdown content of the scraped page. 
""" - _check_dependency() try: + _validate_url(url) + _check_dependency() client = ApifyToolClient() content = client.scrape_url(url=url, timeout_secs=timeout_secs) - panel = Panel( - f"[green]URL scraped successfully[/green]\nURL: {url}\nContent length: {len(content)} characters", - title="[bold cyan]Apify: Scrape URL[/bold cyan]", - border_style="green", + return _success_result( + text=content, + panel_body=( + f"[green]URL scraped successfully[/green]\nURL: {url}\nContent length: {len(content)} characters" + ), + panel_title="Apify: Scrape URL", ) - console.print(panel) - return content except Exception as e: - error_panel = Panel( - Text(str(e), style="red"), - title="[bold red]Apify Error[/bold red]", - border_style="red", - ) - console.print(error_panel) - raise + return _error_result(e, "apify_scrape_url") diff --git a/tests/test_apify.py b/tests/test_apify.py index 31644288..19ae534b 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -16,7 +16,7 @@ MOCK_ACTOR_RUN = { "id": "run-HG7ml5fB1hCp8YEBA", - "actId": "janedoe~my-scraper", + "actId": "aimee~my-scraper", "userId": "user-abc123", "startedAt": "2026-03-15T14:30:00.000Z", "finishedAt": "2026-03-15T14:35:22.000Z", @@ -52,6 +52,17 @@ } +def _make_apify_api_error(status_code: int, message: str) -> Exception: + """Create an ApifyApiError instance for testing without calling its real __init__.""" + from apify_client.errors import ApifyApiError + + error = ApifyApiError.__new__(ApifyApiError) + Exception.__init__(error, message) + error.status_code = status_code + error.message = message + return error + + @pytest.fixture def mock_apify_client(): """Create a mock ApifyClient with pre-configured responses.""" @@ -109,64 +120,106 @@ def test_client_uses_env_token(mock_apify_env): def test_run_actor_success(mock_apify_env, mock_apify_client): - """Successful Actor Run returns JSON with run metadata.""" + """Successful Actor Run returns structured result with run metadata.""" with 
patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor(actor_id="janedoe/my-scraper", run_input={"url": "https://example.com"}) + result = apify_run_actor(actor_id="aimee/my-scraper", run_input={"url": "https://example.com"}) - data = json.loads(result) + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" assert data["status"] == "SUCCEEDED" assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" assert "started_at" in data assert "finished_at" in data - mock_apify_client.actor.assert_called_once_with("janedoe/my-scraper") + mock_apify_client.actor.assert_called_once_with("aimee/my-scraper") + + +def test_run_actor_default_input(mock_apify_env, mock_apify_client): + """Actor Run defaults run_input to empty dict when not provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="aimee/my-scraper") + + assert result["status"] == "success" + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs["run_input"] == {} def test_run_actor_with_memory(mock_apify_env, mock_apify_client): """Actor Run passes memory_mbytes when provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - apify_run_actor(actor_id="janedoe/my-scraper", memory_mbytes=512) + apify_run_actor(actor_id="aimee/my-scraper", memory_mbytes=512) call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs assert call_kwargs["memory_mbytes"] == 512 def test_run_actor_failure(mock_apify_env, mock_apify_client): - """Actor Run raises RuntimeError when Actor fails.""" + """Actor Run returns error dict when Actor fails.""" mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - with pytest.raises(RuntimeError, match="FAILED"): - 
apify_run_actor(actor_id="janedoe/my-scraper") + result = apify_run_actor(actor_id="aimee/my-scraper") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] def test_run_actor_timeout(mock_apify_env, mock_apify_client): - """Actor Run raises RuntimeError when Actor times out.""" + """Actor Run returns error dict when Actor times out.""" mock_apify_client.actor.return_value.call.return_value = MOCK_TIMED_OUT_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - with pytest.raises(RuntimeError, match="TIMED-OUT"): - apify_run_actor(actor_id="janedoe/my-scraper") + result = apify_run_actor(actor_id="aimee/my-scraper") + + assert result["status"] == "error" + assert "TIMED-OUT" in result["content"][0]["text"] def test_run_actor_api_exception(mock_apify_env, mock_apify_client): - """Actor Run re-raises exceptions from the Apify client.""" + """Actor Run returns error dict on API exceptions.""" mock_apify_client.actor.return_value.call.side_effect = Exception("Connection failed") with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - with pytest.raises(Exception, match="Connection failed"): - apify_run_actor(actor_id="janedoe/my-scraper") + result = apify_run_actor(actor_id="aimee/my-scraper") + + assert result["status"] == "error" + assert "Connection failed" in result["content"][0]["text"] + + +def test_run_actor_apify_api_error_401(mock_apify_env, mock_apify_client): + """Actor Run returns friendly message for 401 authentication errors.""" + error = _make_apify_api_error(401, "Unauthorized") + mock_apify_client.actor.return_value.call.side_effect = error + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="aimee/my-scraper") + + assert result["status"] == "error" + assert "Authentication failed" in result["content"][0]["text"] + + +def test_run_actor_apify_api_error_404(mock_apify_env, mock_apify_client): + 
"""Actor Run returns friendly message for 404 not-found errors.""" + error = _make_apify_api_error(404, "Actor not found") + mock_apify_client.actor.return_value.call.side_effect = error + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="aimee/nonexistent") + + assert result["status"] == "error" + assert "Resource not found" in result["content"][0]["text"] # --- apify_get_dataset_items --- def test_get_dataset_items_success(mock_apify_env, mock_apify_client): - """Successful dataset retrieval returns JSON array of items.""" + """Successful dataset retrieval returns structured result with items.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_get_dataset_items(dataset_id="dataset-WkC9gct8rq1uR5vDZ") - items = json.loads(result) + assert result["status"] == "success" + items = json.loads(result["content"][0]["text"]) assert len(items) == 3 assert items[0]["title"] == "Widget A" assert items[2]["currency"] == "EUR" @@ -182,7 +235,7 @@ def test_get_dataset_items_with_pagination(mock_apify_env, mock_apify_client): def test_get_dataset_items_empty(mock_apify_env, mock_apify_client): - """Empty dataset returns an empty JSON array.""" + """Empty dataset returns a structured result with empty JSON array.""" mock_list_result = MagicMock() mock_list_result.items = [] mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result @@ -190,7 +243,8 @@ def test_get_dataset_items_empty(mock_apify_env, mock_apify_client): with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_get_dataset_items(dataset_id="dataset-empty") - items = json.loads(result) + assert result["status"] == "success" + items = json.loads(result["content"][0]["text"]) assert items == [] @@ -198,15 +252,16 @@ def test_get_dataset_items_empty(mock_apify_env, mock_apify_client): def test_run_actor_and_get_dataset_success(mock_apify_env, 
mock_apify_client): - """Combined run + dataset fetch returns run metadata and items.""" + """Combined run + dataset fetch returns structured result with metadata and items.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_run_actor_and_get_dataset( - actor_id="janedoe/my-scraper", + actor_id="aimee/my-scraper", run_input={"url": "https://example.com"}, dataset_items_limit=50, ) - data = json.loads(result) + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" assert data["status"] == "SUCCEEDED" assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" @@ -215,19 +270,21 @@ def test_run_actor_and_get_dataset_success(mock_apify_env, mock_apify_client): def test_run_actor_and_get_dataset_actor_failure(mock_apify_env, mock_apify_client): - """Combined tool raises when the Actor fails.""" + """Combined tool returns error dict when the Actor fails.""" mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - with pytest.raises(RuntimeError, match="FAILED"): - apify_run_actor_and_get_dataset(actor_id="janedoe/my-scraper") + result = apify_run_actor_and_get_dataset(actor_id="aimee/my-scraper") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] # --- apify_scrape_url --- def test_scrape_url_success(mock_apify_env, mock_apify_client): - """Scrape URL returns markdown content from the crawled page.""" + """Scrape URL returns structured result with markdown content.""" mock_list_result = MagicMock() mock_list_result.items = [MOCK_SCRAPED_ITEM] mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result @@ -235,28 +292,33 @@ def test_scrape_url_success(mock_apify_env, mock_apify_client): with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = 
apify_scrape_url(url="https://example.com") - assert "Example Domain" in result + assert result["status"] == "success" + assert "Example Domain" in result["content"][0]["text"] mock_apify_client.actor.assert_called_once_with("apify/website-content-crawler") def test_scrape_url_no_content(mock_apify_env, mock_apify_client): - """Scrape URL raises when no content is returned.""" + """Scrape URL returns error dict when no content is returned.""" mock_list_result = MagicMock() mock_list_result.items = [] mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - with pytest.raises(RuntimeError, match="No content returned"): - apify_scrape_url(url="https://example.com") + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "No content returned" in result["content"][0]["text"] def test_scrape_url_crawler_failure(mock_apify_env, mock_apify_client): - """Scrape URL raises when the crawler Actor fails.""" + """Scrape URL returns error dict when the crawler Actor fails.""" mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - with pytest.raises(RuntimeError, match="FAILED"): - apify_scrape_url(url="https://example.com") + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] def test_scrape_url_falls_back_to_text(mock_apify_env, mock_apify_client): @@ -269,52 +331,99 @@ def test_scrape_url_falls_back_to_text(mock_apify_env, mock_apify_client): with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_scrape_url(url="https://example.com") - assert result == "Plain text content" + assert result["status"] == "success" + assert result["content"][0]["text"] == "Plain text content" + + +def 
test_scrape_url_invalid_url_scheme(mock_apify_env): + """apify_scrape_url returns error for invalid URL scheme.""" + result = apify_scrape_url(url="ftp://example.com") + + assert result["status"] == "error" + assert "Invalid URL scheme" in result["content"][0]["text"] + + +def test_scrape_url_missing_scheme(mock_apify_env): + """apify_scrape_url returns error for URL without http/https scheme.""" + result = apify_scrape_url(url="example.com") + + assert result["status"] == "error" + assert "Invalid URL scheme" in result["content"][0]["text"] # --- Dependency guard --- def test_missing_apify_client_run_actor(mock_apify_env): - """apify_run_actor raises ImportError when apify-client is not installed.""" + """apify_run_actor returns error dict when apify-client is not installed.""" with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): - with pytest.raises(ImportError, match="apify-client"): - apify_run_actor(actor_id="test/actor") + result = apify_run_actor(actor_id="test/actor") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] def test_missing_apify_client_get_dataset(mock_apify_env): - """apify_get_dataset_items raises ImportError when apify-client is not installed.""" + """apify_get_dataset_items returns error dict when apify-client is not installed.""" with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): - with pytest.raises(ImportError, match="apify-client"): - apify_get_dataset_items(dataset_id="dataset-123") + result = apify_get_dataset_items(dataset_id="dataset-123") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] def test_missing_apify_client_run_and_get(mock_apify_env): - """apify_run_actor_and_get_dataset raises ImportError when apify-client is not installed.""" + """apify_run_actor_and_get_dataset returns error dict when apify-client is not installed.""" with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): - with pytest.raises(ImportError, 
match="apify-client"): - apify_run_actor_and_get_dataset(actor_id="test/actor") + result = apify_run_actor_and_get_dataset(actor_id="test/actor") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] def test_missing_apify_client_scrape_url(mock_apify_env): - """apify_scrape_url raises ImportError when apify-client is not installed.""" + """apify_scrape_url returns error dict when apify-client is not installed.""" with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): - with pytest.raises(ImportError, match="apify-client"): - apify_scrape_url(url="https://example.com") + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] # --- Missing token from tool entry points --- def test_run_actor_missing_token(monkeypatch): - """apify_run_actor raises ValueError when APIFY_API_TOKEN is missing.""" + """apify_run_actor returns error dict when APIFY_API_TOKEN is missing.""" monkeypatch.delenv("APIFY_API_TOKEN", raising=False) - with pytest.raises(ValueError, match="APIFY_API_TOKEN"): - apify_run_actor(actor_id="test/actor") + result = apify_run_actor(actor_id="test/actor") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] + + +def test_get_dataset_items_missing_token(monkeypatch): + """apify_get_dataset_items returns error dict when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + result = apify_get_dataset_items(dataset_id="dataset-123") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] + + +def test_run_actor_and_get_dataset_missing_token(monkeypatch): + """apify_run_actor_and_get_dataset returns error dict when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + result = apify_run_actor_and_get_dataset(actor_id="test/actor") + + assert result["status"] == "error" + 
assert "APIFY_API_TOKEN" in result["content"][0]["text"] def test_scrape_url_missing_token(monkeypatch): - """apify_scrape_url raises ValueError when APIFY_API_TOKEN is missing.""" + """apify_scrape_url returns error dict when APIFY_API_TOKEN is missing.""" monkeypatch.delenv("APIFY_API_TOKEN", raising=False) - with pytest.raises(ValueError, match="APIFY_API_TOKEN"): - apify_scrape_url(url="https://example.com") + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] From dd2d6fda98e6edb387a33bab4e7a7c74f6b287d6 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Fri, 20 Mar 2026 14:26:51 +0100 Subject: [PATCH 05/24] feat: add task execution tools to Apify integration and create unit tests for it --- src/strands_tools/apify.py | 160 ++++++++++++++++++++++++++++++++++++- tests/test_apify.py | 141 ++++++++++++++++++++++++++++++++ 2 files changed, 299 insertions(+), 2 deletions(-) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index c85f3cc7..9f707134 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -10,11 +10,15 @@ β€’ apify_run_actor: Run any Apify Actor by ID with custom input β€’ apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step -2. Data Retrieval: +2. Task Execution: + β€’ apify_run_task: Run a saved Actor Task by ID with optional input overrides + β€’ apify_run_task_and_get_dataset: Run a Task and fetch results in one step + +3. Data Retrieval: β€’ apify_get_dataset_items: Fetch items from an Apify Dataset with pagination β€’ apify_scrape_url: Scrape a single URL and return content as Markdown -3. Error Handling: +4. 
Error Handling: β€’ Graceful API error handling with descriptive messages β€’ Dependency checking (apify-client optional install) β€’ Timeout management for Actor Runs @@ -38,8 +42,10 @@ agent = Agent(tools=[ apify.apify_run_actor, + apify.apify_run_task, apify.apify_get_dataset_items, apify.apify_run_actor_and_get_dataset, + apify.apify_run_task_and_get_dataset, apify.apify_scrape_url, ]) @@ -162,6 +168,7 @@ def run_actor( call_kwargs: Dict[str, Any] = { "run_input": run_input or {}, "timeout_secs": timeout_secs, + "logger": None, } if memory_mbytes is not None: call_kwargs["memory_mbytes"] = memory_mbytes @@ -203,6 +210,56 @@ def run_actor_and_get_dataset( memory_mbytes=memory_mbytes, ) dataset_id = run_metadata["dataset_id"] + if not dataset_id: + raise RuntimeError(f"Actor {actor_id} run has no default Dataset.") + items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit) + return {**run_metadata, "items": items} + + def run_task( + self, + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = 300, + memory_mbytes: Optional[int] = None, + ) -> Dict[str, Any]: + """Run an Apify Task synchronously and return run metadata.""" + call_kwargs: Dict[str, Any] = {"timeout_secs": timeout_secs} + if task_input is not None: + call_kwargs["task_input"] = task_input + if memory_mbytes is not None: + call_kwargs["memory_mbytes"] = memory_mbytes + + task_run = self.client.task(task_id).call(**call_kwargs) + if task_run is None: + raise RuntimeError(f"Task {task_id} returned no run data (possible wait timeout).") + self._check_run_status(task_run, f"Task {task_id}") + + return { + "run_id": task_run.get("id"), + "status": task_run.get("status"), + "dataset_id": task_run.get("defaultDatasetId"), + "started_at": task_run.get("startedAt"), + "finished_at": task_run.get("finishedAt"), + } + + def run_task_and_get_dataset( + self, + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = 300, + 
memory_mbytes: Optional[int] = None, + dataset_items_limit: int = 100, + ) -> Dict[str, Any]: + """Run a Task synchronously, then fetch its default Dataset items.""" + run_metadata = self.run_task( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + ) + dataset_id = run_metadata["dataset_id"] + if not dataset_id: + raise RuntimeError(f"Task {task_id} run has no default Dataset.") items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit) return {**run_metadata, "items": items} @@ -215,6 +272,7 @@ def scrape_url(self, url: str, timeout_secs: int = 120) -> str: actor_run = self.client.actor(WEBSITE_CONTENT_CRAWLER).call( run_input=run_input, timeout_secs=timeout_secs, + logger=None, ) self._check_run_status(actor_run, "Website Content Crawler") @@ -368,6 +426,104 @@ def apify_run_actor_and_get_dataset( return _error_result(e, "apify_run_actor_and_get_dataset") +@tool +def apify_run_task( + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = 300, + memory_mbytes: Optional[int] = None, +) -> Dict[str, Any]: + """Run an Apify Task by its ID or name and return the run metadata as JSON. + + Tasks are saved Actor configurations with preset inputs. Use this when a Task + has already been configured in the Apify Console, so you don't need to specify + the full Actor input every time. + + Args: + task_id: Task identifier, e.g. "janedoe~my-task" or a Task ID string. + task_input: Optional JSON-serializable input to override the Task's default input. + timeout_secs: Maximum time in seconds to wait for the Task Run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Task Run. Uses Task default if not set. + + Returns: + Dict with status and content containing run metadata: run_id, status, dataset_id, + started_at, finished_at. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_task( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Task Run completed[/green]\n" + f"Task: {task_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}" + ), + panel_title="Apify: Run Task", + ) + except Exception as e: + return _error_result(e, "apify_run_task") + + +@tool +def apify_run_task_and_get_dataset( + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = 300, + memory_mbytes: Optional[int] = None, + dataset_items_limit: int = 100, +) -> Dict[str, Any]: + """Run an Apify Task and fetch its Dataset results in one step. + + Convenience tool that combines running a Task and fetching its default Dataset + items into a single call. Use this when you want both the run metadata and the + result data without making two separate tool calls. + + Args: + task_id: Task identifier, e.g. "janedoe~my-task" or a Task ID string. + task_input: Optional JSON-serializable input to override the Task's default input. + timeout_secs: Maximum time in seconds to wait for the Task Run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Task Run. + dataset_items_limit: Maximum number of Dataset items to return. Defaults to 100. + + Returns: + Dict with status and content containing run metadata (run_id, status, dataset_id, + started_at, finished_at) plus an "items" array containing the Dataset results. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_task_and_get_dataset( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + dataset_items_limit=dataset_items_limit, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Task Run completed with dataset[/green]\n" + f"Task: {task_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}" + ), + panel_title="Apify: Run Task + Dataset", + ) + except Exception as e: + return _error_result(e, "apify_run_task_and_get_dataset") + + @tool def apify_scrape_url( url: str, diff --git a/tests/test_apify.py b/tests/test_apify.py index 19ae534b..f4ed99f0 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -11,6 +11,8 @@ apify_get_dataset_items, apify_run_actor, apify_run_actor_and_get_dataset, + apify_run_task, + apify_run_task_and_get_dataset, apify_scrape_url, ) @@ -72,6 +74,10 @@ def mock_apify_client(): mock_actor.call.return_value = MOCK_ACTOR_RUN client.actor.return_value = mock_actor + mock_task = MagicMock() + mock_task.call.return_value = MOCK_ACTOR_RUN + client.task.return_value = mock_task + mock_dataset = MagicMock() mock_list_result = MagicMock() mock_list_result.items = MOCK_DATASET_ITEMS @@ -280,6 +286,105 @@ def test_run_actor_and_get_dataset_actor_failure(mock_apify_env, mock_apify_clie assert "FAILED" in result["content"][0]["text"] +# --- apify_run_task --- + + +def test_run_task_success(mock_apify_env, mock_apify_client): + """Successful Task Run returns structured result with run metadata.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task(task_id="janedoe~my-task", task_input={"query": "test"}) + + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) + assert 
data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + assert data["status"] == "SUCCEEDED" + assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" + mock_apify_client.task.assert_called_once_with("janedoe~my-task") + + +def test_run_task_no_input(mock_apify_env, mock_apify_client): + """Task Run omits task_input kwarg when not provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task(task_id="janedoe~my-task") + + assert result["status"] == "success" + call_kwargs = mock_apify_client.task.return_value.call.call_args.kwargs + assert "task_input" not in call_kwargs + + +def test_run_task_with_memory(mock_apify_env, mock_apify_client): + """Task Run passes memory_mbytes when provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_run_task(task_id="janedoe~my-task", memory_mbytes=1024) + + call_kwargs = mock_apify_client.task.return_value.call.call_args.kwargs + assert call_kwargs["memory_mbytes"] == 1024 + + +def test_run_task_failure(mock_apify_env, mock_apify_client): + """Task Run returns error dict when Task fails.""" + mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task(task_id="janedoe~my-task") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] + + +def test_run_task_none_response(mock_apify_env, mock_apify_client): + """Task Run returns error dict when TaskClient.call() returns None.""" + mock_apify_client.task.return_value.call.return_value = None + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task(task_id="janedoe~my-task") + + assert result["status"] == "error" + assert "no run data" in result["content"][0]["text"] + + +def test_run_task_apify_api_error_401(mock_apify_env, mock_apify_client): + """Task Run returns friendly message for 401 
authentication errors.""" + error = _make_apify_api_error(401, "Unauthorized") + mock_apify_client.task.return_value.call.side_effect = error + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task(task_id="janedoe~my-task") + + assert result["status"] == "error" + assert "Authentication failed" in result["content"][0]["text"] + + +# --- apify_run_task_and_get_dataset --- + + +def test_run_task_and_get_dataset_success(mock_apify_env, mock_apify_client): + """Combined Task run + dataset fetch returns structured result with metadata and items.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task_and_get_dataset( + task_id="janedoe~my-task", + task_input={"query": "test"}, + dataset_items_limit=50, + ) + + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + assert len(data["items"]) == 3 + assert data["items"][0]["title"] == "Widget A" + + +def test_run_task_and_get_dataset_task_failure(mock_apify_env, mock_apify_client): + """Combined Task tool returns error dict when the Task fails.""" + mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task_and_get_dataset(task_id="janedoe~my-task") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] + + # --- apify_scrape_url --- @@ -381,6 +486,24 @@ def test_missing_apify_client_run_and_get(mock_apify_env): assert "apify-client" in result["content"][0]["text"] +def test_missing_apify_client_run_task(mock_apify_env): + """apify_run_task returns error dict when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + result = apify_run_task(task_id="janedoe~my-task") + + assert result["status"] == "error" + assert "apify-client" in 
result["content"][0]["text"] + + +def test_missing_apify_client_run_task_and_get(mock_apify_env): + """apify_run_task_and_get_dataset returns error dict when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + result = apify_run_task_and_get_dataset(task_id="janedoe~my-task") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] + + def test_missing_apify_client_scrape_url(mock_apify_env): """apify_scrape_url returns error dict when apify-client is not installed.""" with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): @@ -420,6 +543,24 @@ def test_run_actor_and_get_dataset_missing_token(monkeypatch): assert "APIFY_API_TOKEN" in result["content"][0]["text"] +def test_run_task_missing_token(monkeypatch): + """apify_run_task returns error dict when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + result = apify_run_task(task_id="janedoe~my-task") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] + + +def test_run_task_and_get_dataset_missing_token(monkeypatch): + """apify_run_task_and_get_dataset returns error dict when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + result = apify_run_task_and_get_dataset(task_id="janedoe~my-task") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] + + def test_scrape_url_missing_token(monkeypatch): """apify_scrape_url returns error dict when APIFY_API_TOKEN is missing.""" monkeypatch.delenv("APIFY_API_TOKEN", raising=False) From f823eaef9680e359b4a6884c2854f9bbfae3041e Mon Sep 17 00:00:00 2001 From: David Omrai Date: Fri, 20 Mar 2026 14:58:27 +0100 Subject: [PATCH 06/24] feat: edit docs for apify tools --- README.md | 14 ++++++++++ docs/apify_tool.md | 56 ++++++++++++++++++++++++++++++++++++-- src/strands_tools/apify.py | 4 +-- tests/test_apify.py | 50 
+++++++++++++++++----------------- 4 files changed, 95 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 516b83ac..45a3d6e8 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,8 @@ Below is a comprehensive table of all available tools, how to use them with an a | apify_run_actor | `agent.tool.apify_run_actor(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run any Apify Actor by ID with arbitrary input | | apify_get_dataset_items | `agent.tool.apify_get_dataset_items(dataset_id="abc123", limit=50)` | Fetch items from an Apify Dataset | | apify_run_actor_and_get_dataset | `agent.tool.apify_run_actor_and_get_dataset(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run an Actor and fetch its Dataset results in one step | +| apify_run_task | `agent.tool.apify_run_task(task_id="user~my-task")` | Run a saved Apify Task by ID with optional input overrides | +| apify_run_task_and_get_dataset | `agent.tool.apify_run_task_and_get_dataset(task_id="user~my-task", dataset_items_limit=50)` | Run a Task and fetch its Dataset results in one step | | apify_scrape_url | `agent.tool.apify_scrape_url(url="https://example.com")` | Scrape a URL and return its content as markdown | | file_read | `agent.tool.file_read(path="path/to/file.txt")` | Reading configuration files, parsing code files, loading datasets | | file_write | `agent.tool.file_write(path="path/to/file.txt", content="file content")` | Writing results to files, creating new files, saving output data | @@ -972,8 +974,10 @@ from strands_tools import apify agent = Agent(tools=[ apify.apify_run_actor, + apify.apify_run_task, apify.apify_get_dataset_items, apify.apify_run_actor_and_get_dataset, + apify.apify_run_task_and_get_dataset, apify.apify_scrape_url, ]) @@ -987,6 +991,16 @@ result = agent.tool.apify_run_actor_and_get_dataset( dataset_items_limit=50, ) +# Run a saved Task (pre-configured 
Actor with default inputs) +run_info = agent.tool.apify_run_task(task_id="user~my-task") + +# Run a Task and get results in one step +result = agent.tool.apify_run_task_and_get_dataset( + task_id="user~my-task", + task_input={"query": "override default input"}, + dataset_items_limit=50, +) + # Run an Actor (get metadata only) run_info = agent.tool.apify_run_actor( actor_id="apify/google-search-scraper", diff --git a/docs/apify_tool.md b/docs/apify_tool.md index d4cf3bfd..2a436246 100644 --- a/docs/apify_tool.md +++ b/docs/apify_tool.md @@ -1,6 +1,6 @@ # Apify -The Apify tools (`apify.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform β€” running any [Actor](https://apify.com/store) by ID, fetching Dataset results, and scraping individual URLs. +The Apify tools (`apify.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform β€” running any [Actor](https://apify.com/store) or [Task](https://docs.apify.com/platform/actors/running/tasks) by ID, fetching Dataset results, and scraping individual URLs. ## Installation @@ -26,9 +26,11 @@ from strands_tools import apify agent = Agent(tools=[ apify.apify_run_actor, + apify.apify_run_task, apify.apify_scrape_url, apify.apify_get_dataset_items, apify.apify_run_actor_and_get_dataset, + apify.apify_run_task_and_get_dataset, ]) ``` @@ -66,6 +68,31 @@ result = agent.tool.apify_run_actor_and_get_dataset( ) ``` +### Run a Task + +Execute a saved [Actor Task](https://docs.apify.com/platform/actors/running/tasks) β€” a pre-configured Actor with preset inputs. Use this when a Task has already been set up in the Apify Console: + +```python +result = agent.tool.apify_run_task( + task_id="user~my-task", + task_input={"query": "override input"}, + timeout_secs=300, +) +``` + +The result is a JSON string containing run metadata: `run_id`, `status`, `dataset_id`, `started_at`, and `finished_at`. 
+ +### Run a Task and Get Results + +Combine running a Task and fetching its Dataset results in a single call: + +```python +result = agent.tool.apify_run_task_and_get_dataset( + task_id="user~my-task", + dataset_items_limit=50, +) +``` + ### Fetch Dataset Items Retrieve results from a Dataset by its ID. Useful after running an Actor to get the structured results separately, or to access any existing Dataset: @@ -100,6 +127,29 @@ items = agent.tool.apify_get_dataset_items( **Returns:** JSON string with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`. +### apify_run_task + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `task_id` | string | Yes | β€” | Task identifier (e.g., `user~my-task` or a Task ID) | +| `task_input` | dict | No | None | JSON-serializable input to override the Task's default input | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Task Run to finish | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the Task Run (uses Task default if not set) | + +**Returns:** JSON string with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`. + +### apify_run_task_and_get_dataset + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `task_id` | string | Yes | β€” | Task identifier (e.g., `user~my-task` or a Task ID) | +| `task_input` | dict | No | None | JSON-serializable input to override the Task's default input | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Task Run to finish | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the Task Run (uses Task default if not set) | +| `dataset_items_limit` | int | No | 100 | Maximum number of Dataset items to return | + +**Returns:** JSON string with run metadata plus an `items` array containing the Dataset results. 
+ ### apify_get_dataset_items | Parameter | Type | Required | Default | Description | @@ -129,7 +179,9 @@ items = agent.tool.apify_get_dataset_items( | `APIFY_API_TOKEN environment variable is not set` | Token not configured | Set the `APIFY_API_TOKEN` environment variable | | `apify-client package is required` | Optional dependency not installed | Run `pip install strands-agents-tools[apify]` | | `Actor ... finished with status FAILED` | Actor execution error | Check Actor input parameters and run logs in the [Apify Console](https://console.apify.com) | -| `Actor ... finished with status TIMED-OUT` | Timeout too short for the workload | Increase the `timeout_secs` parameter | +| `Task ... finished with status FAILED` | Task execution error | Check Task configuration and run logs in the [Apify Console](https://console.apify.com) | +| `Actor/Task ... finished with status TIMED-OUT` | Timeout too short for the workload | Increase the `timeout_secs` parameter | +| `Task ... returned no run data` | Task `call()` returned `None` (wait timeout) | Increase the `timeout_secs` parameter | | `No content returned for URL` | Website Content Crawler returned empty results | Verify the URL is accessible and returns content | ## References diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 9f707134..6fc61eba 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -440,7 +440,7 @@ def apify_run_task( the full Actor input every time. Args: - task_id: Task identifier, e.g. "janedoe~my-task" or a Task ID string. + task_id: Task identifier, e.g. "user~my-task" or a Task ID string. task_input: Optional JSON-serializable input to override the Task's default input. timeout_secs: Maximum time in seconds to wait for the Task Run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the Task Run. Uses Task default if not set. 
@@ -488,7 +488,7 @@ def apify_run_task_and_get_dataset( result data without making two separate tool calls. Args: - task_id: Task identifier, e.g. "janedoe~my-task" or a Task ID string. + task_id: Task identifier, e.g. "user~my-task" or a Task ID string. task_input: Optional JSON-serializable input to override the Task's default input. timeout_secs: Maximum time in seconds to wait for the Task Run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the Task Run. diff --git a/tests/test_apify.py b/tests/test_apify.py index f4ed99f0..a88a085b 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -18,7 +18,7 @@ MOCK_ACTOR_RUN = { "id": "run-HG7ml5fB1hCp8YEBA", - "actId": "aimee~my-scraper", + "actId": "actor~my-scraper", "userId": "user-abc123", "startedAt": "2026-03-15T14:30:00.000Z", "finishedAt": "2026-03-15T14:35:22.000Z", @@ -128,7 +128,7 @@ def test_client_uses_env_token(mock_apify_env): def test_run_actor_success(mock_apify_env, mock_apify_client): """Successful Actor Run returns structured result with run metadata.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor(actor_id="aimee/my-scraper", run_input={"url": "https://example.com"}) + result = apify_run_actor(actor_id="actor/my-scraper", run_input={"url": "https://example.com"}) assert result["status"] == "success" data = json.loads(result["content"][0]["text"]) @@ -137,13 +137,13 @@ def test_run_actor_success(mock_apify_env, mock_apify_client): assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" assert "started_at" in data assert "finished_at" in data - mock_apify_client.actor.assert_called_once_with("aimee/my-scraper") + mock_apify_client.actor.assert_called_once_with("actor/my-scraper") def test_run_actor_default_input(mock_apify_env, mock_apify_client): """Actor Run defaults run_input to empty dict when not provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = 
apify_run_actor(actor_id="aimee/my-scraper") + result = apify_run_actor(actor_id="actor/my-scraper") assert result["status"] == "success" call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs @@ -153,7 +153,7 @@ def test_run_actor_default_input(mock_apify_env, mock_apify_client): def test_run_actor_with_memory(mock_apify_env, mock_apify_client): """Actor Run passes memory_mbytes when provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - apify_run_actor(actor_id="aimee/my-scraper", memory_mbytes=512) + apify_run_actor(actor_id="actor/my-scraper", memory_mbytes=512) call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs assert call_kwargs["memory_mbytes"] == 512 @@ -164,7 +164,7 @@ def test_run_actor_failure(mock_apify_env, mock_apify_client): mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor(actor_id="aimee/my-scraper") + result = apify_run_actor(actor_id="actor/my-scraper") assert result["status"] == "error" assert "FAILED" in result["content"][0]["text"] @@ -175,7 +175,7 @@ def test_run_actor_timeout(mock_apify_env, mock_apify_client): mock_apify_client.actor.return_value.call.return_value = MOCK_TIMED_OUT_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor(actor_id="aimee/my-scraper") + result = apify_run_actor(actor_id="actor/my-scraper") assert result["status"] == "error" assert "TIMED-OUT" in result["content"][0]["text"] @@ -186,7 +186,7 @@ def test_run_actor_api_exception(mock_apify_env, mock_apify_client): mock_apify_client.actor.return_value.call.side_effect = Exception("Connection failed") with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor(actor_id="aimee/my-scraper") + result = apify_run_actor(actor_id="actor/my-scraper") assert 
result["status"] == "error" assert "Connection failed" in result["content"][0]["text"] @@ -198,7 +198,7 @@ def test_run_actor_apify_api_error_401(mock_apify_env, mock_apify_client): mock_apify_client.actor.return_value.call.side_effect = error with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor(actor_id="aimee/my-scraper") + result = apify_run_actor(actor_id="actor/my-scraper") assert result["status"] == "error" assert "Authentication failed" in result["content"][0]["text"] @@ -210,7 +210,7 @@ def test_run_actor_apify_api_error_404(mock_apify_env, mock_apify_client): mock_apify_client.actor.return_value.call.side_effect = error with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor(actor_id="aimee/nonexistent") + result = apify_run_actor(actor_id="actor/nonexistent") assert result["status"] == "error" assert "Resource not found" in result["content"][0]["text"] @@ -261,7 +261,7 @@ def test_run_actor_and_get_dataset_success(mock_apify_env, mock_apify_client): """Combined run + dataset fetch returns structured result with metadata and items.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_run_actor_and_get_dataset( - actor_id="aimee/my-scraper", + actor_id="actor/my-scraper", run_input={"url": "https://example.com"}, dataset_items_limit=50, ) @@ -280,7 +280,7 @@ def test_run_actor_and_get_dataset_actor_failure(mock_apify_env, mock_apify_clie mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor_and_get_dataset(actor_id="aimee/my-scraper") + result = apify_run_actor_and_get_dataset(actor_id="actor/my-scraper") assert result["status"] == "error" assert "FAILED" in result["content"][0]["text"] @@ -292,20 +292,20 @@ def test_run_actor_and_get_dataset_actor_failure(mock_apify_env, mock_apify_clie 
def test_run_task_success(mock_apify_env, mock_apify_client): """Successful Task Run returns structured result with run metadata.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_task(task_id="janedoe~my-task", task_input={"query": "test"}) + result = apify_run_task(task_id="user~my-task", task_input={"query": "test"}) assert result["status"] == "success" data = json.loads(result["content"][0]["text"]) assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" assert data["status"] == "SUCCEEDED" assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" - mock_apify_client.task.assert_called_once_with("janedoe~my-task") + mock_apify_client.task.assert_called_once_with("user~my-task") def test_run_task_no_input(mock_apify_env, mock_apify_client): """Task Run omits task_input kwarg when not provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_task(task_id="janedoe~my-task") + result = apify_run_task(task_id="user~my-task") assert result["status"] == "success" call_kwargs = mock_apify_client.task.return_value.call.call_args.kwargs @@ -315,7 +315,7 @@ def test_run_task_no_input(mock_apify_env, mock_apify_client): def test_run_task_with_memory(mock_apify_env, mock_apify_client): """Task Run passes memory_mbytes when provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - apify_run_task(task_id="janedoe~my-task", memory_mbytes=1024) + apify_run_task(task_id="user~my-task", memory_mbytes=1024) call_kwargs = mock_apify_client.task.return_value.call.call_args.kwargs assert call_kwargs["memory_mbytes"] == 1024 @@ -326,7 +326,7 @@ def test_run_task_failure(mock_apify_env, mock_apify_client): mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_task(task_id="janedoe~my-task") + result = 
apify_run_task(task_id="user~my-task") assert result["status"] == "error" assert "FAILED" in result["content"][0]["text"] @@ -337,7 +337,7 @@ def test_run_task_none_response(mock_apify_env, mock_apify_client): mock_apify_client.task.return_value.call.return_value = None with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_task(task_id="janedoe~my-task") + result = apify_run_task(task_id="user~my-task") assert result["status"] == "error" assert "no run data" in result["content"][0]["text"] @@ -349,7 +349,7 @@ def test_run_task_apify_api_error_401(mock_apify_env, mock_apify_client): mock_apify_client.task.return_value.call.side_effect = error with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_task(task_id="janedoe~my-task") + result = apify_run_task(task_id="user~my-task") assert result["status"] == "error" assert "Authentication failed" in result["content"][0]["text"] @@ -362,7 +362,7 @@ def test_run_task_and_get_dataset_success(mock_apify_env, mock_apify_client): """Combined Task run + dataset fetch returns structured result with metadata and items.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_run_task_and_get_dataset( - task_id="janedoe~my-task", + task_id="user~my-task", task_input={"query": "test"}, dataset_items_limit=50, ) @@ -379,7 +379,7 @@ def test_run_task_and_get_dataset_task_failure(mock_apify_env, mock_apify_client mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_task_and_get_dataset(task_id="janedoe~my-task") + result = apify_run_task_and_get_dataset(task_id="user~my-task") assert result["status"] == "error" assert "FAILED" in result["content"][0]["text"] @@ -489,7 +489,7 @@ def test_missing_apify_client_run_and_get(mock_apify_env): def 
test_missing_apify_client_run_task(mock_apify_env): """apify_run_task returns error dict when apify-client is not installed.""" with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): - result = apify_run_task(task_id="janedoe~my-task") + result = apify_run_task(task_id="user~my-task") assert result["status"] == "error" assert "apify-client" in result["content"][0]["text"] @@ -498,7 +498,7 @@ def test_missing_apify_client_run_task(mock_apify_env): def test_missing_apify_client_run_task_and_get(mock_apify_env): """apify_run_task_and_get_dataset returns error dict when apify-client is not installed.""" with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): - result = apify_run_task_and_get_dataset(task_id="janedoe~my-task") + result = apify_run_task_and_get_dataset(task_id="user~my-task") assert result["status"] == "error" assert "apify-client" in result["content"][0]["text"] @@ -546,7 +546,7 @@ def test_run_actor_and_get_dataset_missing_token(monkeypatch): def test_run_task_missing_token(monkeypatch): """apify_run_task returns error dict when APIFY_API_TOKEN is missing.""" monkeypatch.delenv("APIFY_API_TOKEN", raising=False) - result = apify_run_task(task_id="janedoe~my-task") + result = apify_run_task(task_id="user~my-task") assert result["status"] == "error" assert "APIFY_API_TOKEN" in result["content"][0]["text"] @@ -555,7 +555,7 @@ def test_run_task_missing_token(monkeypatch): def test_run_task_and_get_dataset_missing_token(monkeypatch): """apify_run_task_and_get_dataset returns error dict when APIFY_API_TOKEN is missing.""" monkeypatch.delenv("APIFY_API_TOKEN", raising=False) - result = apify_run_task_and_get_dataset(task_id="janedoe~my-task") + result = apify_run_task_and_get_dataset(task_id="user~my-task") assert result["status"] == "error" assert "APIFY_API_TOKEN" in result["content"][0]["text"] From bcf09508f52cccc40aad003bb7158c23768e0c11 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 23 Mar 2026 18:13:27 +0100 Subject: [PATCH 07/24] feat: 
enhance Apify tool with validation methods and default parameters --- src/strands_tools/apify.py | 150 +++++++++++++++++++++++++++++-------- 1 file changed, 119 insertions(+), 31 deletions(-) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 6fc61eba..bdac5124 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -86,6 +86,13 @@ WEBSITE_CONTENT_CRAWLER = "apify/website-content-crawler" TRACKING_HEADER = {"x-apify-integration-platform": "strands-agents"} ERROR_PANEL_TITLE = "[bold red]Apify Error[/bold red]" +DEFAULT_TIMEOUT_SECS = 300 +DEFAULT_SCRAPE_TIMEOUT_SECS = 120 +DEFAULT_DATASET_ITEMS_LIMIT = 100 +VALID_CRAWLER_TYPES = ("playwright:adaptive", "playwright:firefox", "cheerio") + + +# --- Helper functions --- def _check_dependency() -> None: @@ -94,25 +101,22 @@ def _check_dependency() -> None: raise ImportError("apify-client package is required. Install with: pip install strands-agents-tools[apify]") -def _validate_url(url: str) -> None: - """Raise ValueError if the URL does not have a valid HTTP(S) scheme and domain.""" - parsed = urlparse(url) - if parsed.scheme not in ("http", "https"): - raise ValueError(f"Invalid URL scheme '{parsed.scheme}'. Only http and https URLs are supported.") - if not parsed.netloc: - raise ValueError(f"Invalid URL '{url}'. A domain is required.") - - def _format_error(e: Exception) -> str: """Map exceptions to user-friendly error messages, with special handling for ApifyApiError.""" if HAS_APIFY_CLIENT and isinstance(e, ApifyApiError): status_code = getattr(e, "status_code", None) msg = getattr(e, "message", str(e)) match status_code: + case 400: + return f"Invalid request: {msg}" case 401: return "Authentication failed. Verify your APIFY_API_TOKEN is valid." + case 402: + return "Insufficient Apify plan credits or subscription limits exceeded." case 404: return f"Resource not found: {msg}" + case 408: + return f"Actor Run timed out: {msg}" case 429: return ( "Rate limit exceeded. 
The Apify client retries automatically; " @@ -157,23 +161,60 @@ def _check_run_status(actor_run: Dict[str, Any], label: str) -> None: run_id = actor_run.get("id", "N/A") raise RuntimeError(f"{label} finished with status {status}. Run ID: {run_id}") + @staticmethod + def _validate_url(url: str) -> None: + """Raise ValueError if the URL does not have a valid HTTP(S) scheme and domain.""" + parsed = urlparse(url) + if parsed.scheme not in ("http", "https"): + raise ValueError(f"Invalid URL scheme '{parsed.scheme}'. Only http and https URLs are supported.") + if not parsed.netloc: + raise ValueError(f"Invalid URL '{url}'. A domain is required.") + + @staticmethod + def _validate_identifier(value: str, name: str) -> None: + """Raise ValueError if a required string identifier is empty or whitespace-only.""" + if not value.strip(): + raise ValueError(f"'{name}' must be a non-empty string.") + + @staticmethod + def _validate_positive(value: int, name: str) -> None: + """Raise ValueError if the value is not a positive integer (> 0).""" + if value <= 0: + raise ValueError(f"'{name}' must be a positive integer, got {value}.") + + @staticmethod + def _validate_non_negative(value: int, name: str) -> None: + """Raise ValueError if the value is negative.""" + if value < 0: + raise ValueError(f"'{name}' must be a non-negative integer, got {value}.") + def run_actor( self, actor_id: str, run_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = 300, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, + build: Optional[str] = None, ) -> Dict[str, Any]: """Run an Apify Actor synchronously and return run metadata.""" + self._validate_identifier(actor_id, "actor_id") + self._validate_positive(timeout_secs, "timeout_secs") + if memory_mbytes is not None: + self._validate_positive(memory_mbytes, "memory_mbytes") + call_kwargs: Dict[str, Any] = { "run_input": run_input or {}, "timeout_secs": timeout_secs, - "logger": None, + "logger": None, # Suppress 
verbose apify-client logging not useful to end users } if memory_mbytes is not None: call_kwargs["memory_mbytes"] = memory_mbytes + if build is not None: + call_kwargs["build"] = build actor_run = self.client.actor(actor_id).call(**call_kwargs) + if actor_run is None: + raise RuntimeError(f"Actor {actor_id} returned no run data (possible wait timeout).") self._check_run_status(actor_run, f"Actor {actor_id}") return { @@ -187,10 +228,14 @@ def run_actor( def get_dataset_items( self, dataset_id: str, - limit: int = 100, + limit: int = DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0, ) -> List[Dict[str, Any]]: """Fetch items from an Apify Dataset.""" + self._validate_identifier(dataset_id, "dataset_id") + self._validate_positive(limit, "limit") + self._validate_non_negative(offset, "offset") + result = self.client.dataset(dataset_id).list_items(limit=limit, offset=offset) return list(result.items) @@ -198,31 +243,42 @@ def run_actor_and_get_dataset( self, actor_id: str, run_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = 300, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, - dataset_items_limit: int = 100, + build: Optional[str] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, ) -> Dict[str, Any]: """Run an Actor synchronously, then fetch its default Dataset items.""" + self._validate_positive(dataset_items_limit, "dataset_items_limit") + self._validate_non_negative(dataset_items_offset, "dataset_items_offset") + run_metadata = self.run_actor( actor_id=actor_id, run_input=run_input, timeout_secs=timeout_secs, memory_mbytes=memory_mbytes, + build=build, ) dataset_id = run_metadata["dataset_id"] if not dataset_id: raise RuntimeError(f"Actor {actor_id} run has no default Dataset.") - items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit) + items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit, offset=dataset_items_offset) 
return {**run_metadata, "items": items} def run_task( self, task_id: str, task_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = 300, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, ) -> Dict[str, Any]: """Run an Apify Task synchronously and return run metadata.""" + self._validate_identifier(task_id, "task_id") + self._validate_positive(timeout_secs, "timeout_secs") + if memory_mbytes is not None: + self._validate_positive(memory_mbytes, "memory_mbytes") + call_kwargs: Dict[str, Any] = {"timeout_secs": timeout_secs} if task_input is not None: call_kwargs["task_input"] = task_input @@ -246,11 +302,15 @@ def run_task_and_get_dataset( self, task_id: str, task_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = 300, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, - dataset_items_limit: int = 100, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, ) -> Dict[str, Any]: """Run a Task synchronously, then fetch its default Dataset items.""" + self._validate_positive(dataset_items_limit, "dataset_items_limit") + self._validate_non_negative(dataset_items_offset, "dataset_items_offset") + run_metadata = self.run_task( task_id=task_id, task_input=task_input, @@ -260,19 +320,32 @@ def run_task_and_get_dataset( dataset_id = run_metadata["dataset_id"] if not dataset_id: raise RuntimeError(f"Task {task_id} run has no default Dataset.") - items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit) + items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit, offset=dataset_items_offset) return {**run_metadata, "items": items} - def scrape_url(self, url: str, timeout_secs: int = 120) -> str: + def scrape_url( + self, + url: str, + timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, + crawler_type: str = "cheerio", + ) -> str: """Scrape a single URL using Website Content Crawler and return markdown.""" + 
self._validate_url(url) + self._validate_positive(timeout_secs, "timeout_secs") + if crawler_type not in VALID_CRAWLER_TYPES: + raise ValueError( + f"Invalid crawler_type '{crawler_type}'. Must be one of: {', '.join(VALID_CRAWLER_TYPES)}." + ) + run_input: Dict[str, Any] = { "startUrls": [{"url": url}], "maxCrawlPages": 1, + "crawlerType": crawler_type, } actor_run = self.client.actor(WEBSITE_CONTENT_CRAWLER).call( run_input=run_input, timeout_secs=timeout_secs, - logger=None, + logger=None, # Suppress verbose apify-client logging not useful to end users ) self._check_run_status(actor_run, "Website Content Crawler") @@ -293,8 +366,9 @@ def scrape_url(self, url: str, timeout_secs: int = 120) -> str: def apify_run_actor( actor_id: str, run_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = 300, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, + build: Optional[str] = None, ) -> Dict[str, Any]: """Run any Apify Actor by its ID or name and return the run metadata as JSON. @@ -312,6 +386,7 @@ def apify_run_actor( run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. timeout_secs: Maximum time in seconds to wait for the Actor Run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the Actor Run. Uses Actor default if not set. + build: Actor Build tag or number to run a specific version. Uses latest Build if not set. Returns: Dict with status and content containing run metadata: run_id, status, dataset_id, @@ -325,6 +400,7 @@ def apify_run_actor( run_input=run_input, timeout_secs=timeout_secs, memory_mbytes=memory_mbytes, + build=build, ) return _success_result( text=json.dumps(result, indent=2, default=str), @@ -344,7 +420,7 @@ def apify_run_actor( @tool def apify_get_dataset_items( dataset_id: str, - limit: int = 100, + limit: int = DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0, ) -> Dict[str, Any]: """Fetch items from an existing Apify Dataset and return them as JSON. 
@@ -379,9 +455,11 @@ def apify_get_dataset_items( def apify_run_actor_and_get_dataset( actor_id: str, run_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = 300, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, - dataset_items_limit: int = 100, + build: Optional[str] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, ) -> Dict[str, Any]: """Run an Apify Actor and fetch its Dataset results in one step. @@ -394,7 +472,9 @@ def apify_run_actor_and_get_dataset( run_input: JSON-serializable input for the Actor. timeout_secs: Maximum time in seconds to wait for the Actor Run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the Actor Run. + build: Actor Build tag or number to run a specific version. Uses latest Build if not set. dataset_items_limit: Maximum number of Dataset items to return. Defaults to 100. + dataset_items_offset: Number of Dataset items to skip for pagination. Defaults to 0. Returns: Dict with status and content containing run metadata (run_id, status, dataset_id, @@ -408,7 +488,9 @@ def apify_run_actor_and_get_dataset( run_input=run_input, timeout_secs=timeout_secs, memory_mbytes=memory_mbytes, + build=build, dataset_items_limit=dataset_items_limit, + dataset_items_offset=dataset_items_offset, ) return _success_result( text=json.dumps(result, indent=2, default=str), @@ -430,7 +512,7 @@ def apify_run_actor_and_get_dataset( def apify_run_task( task_id: str, task_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = 300, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, ) -> Dict[str, Any]: """Run an Apify Task by its ID or name and return the run metadata as JSON. 
@@ -477,9 +559,10 @@ def apify_run_task( def apify_run_task_and_get_dataset( task_id: str, task_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = 300, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, - dataset_items_limit: int = 100, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, ) -> Dict[str, Any]: """Run an Apify Task and fetch its Dataset results in one step. @@ -493,6 +576,7 @@ def apify_run_task_and_get_dataset( timeout_secs: Maximum time in seconds to wait for the Task Run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the Task Run. dataset_items_limit: Maximum number of Dataset items to return. Defaults to 100. + dataset_items_offset: Number of Dataset items to skip for pagination. Defaults to 0. Returns: Dict with status and content containing run metadata (run_id, status, dataset_id, @@ -507,6 +591,7 @@ def apify_run_task_and_get_dataset( timeout_secs=timeout_secs, memory_mbytes=memory_mbytes, dataset_items_limit=dataset_items_limit, + dataset_items_offset=dataset_items_offset, ) return _success_result( text=json.dumps(result, indent=2, default=str), @@ -527,7 +612,8 @@ def apify_run_task_and_get_dataset( @tool def apify_scrape_url( url: str, - timeout_secs: int = 120, + timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, + crawler_type: str = "cheerio", ) -> Dict[str, Any]: """Scrape a single URL and return its content as markdown. @@ -538,15 +624,17 @@ def apify_scrape_url( Args: url: The URL to scrape, e.g. "https://example.com". timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. + crawler_type: Crawler engine to use. One of "playwright:adaptive" (fast, renders JS if + present, recommended default), "playwright:firefox" (reliable, renders JS, best at + avoiding blocking but slower), or "cheerio" (fastest, no JS rendering). 
Returns: Dict with status and content containing the markdown content of the scraped page. """ try: - _validate_url(url) _check_dependency() client = ApifyToolClient() - content = client.scrape_url(url=url, timeout_secs=timeout_secs) + content = client.scrape_url(url=url, timeout_secs=timeout_secs, crawler_type=crawler_type) return _success_result( text=content, panel_body=( From d15a3371927b5bbff7230ac12eaba338040599d3 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 23 Mar 2026 18:31:50 +0100 Subject: [PATCH 08/24] feat: create validation tests --- tests/test_apify.py | 139 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) diff --git a/tests/test_apify.py b/tests/test_apify.py index a88a085b..70b3aca5 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -456,6 +456,145 @@ def test_scrape_url_missing_scheme(mock_apify_env): assert "Invalid URL scheme" in result["content"][0]["text"] +# --- Parameter validation --- + + +def test_run_actor_empty_actor_id(mock_apify_env): + """apify_run_actor returns error for whitespace-only actor_id.""" + result = apify_run_actor(actor_id=" ") + + assert result["status"] == "error" + assert "actor_id" in result["content"][0]["text"] + + +def test_run_actor_zero_timeout(mock_apify_env): + """apify_run_actor returns error for non-positive timeout_secs.""" + result = apify_run_actor(actor_id="actor/valid", timeout_secs=0) + + assert result["status"] == "error" + assert "timeout_secs" in result["content"][0]["text"] + + +def test_run_actor_negative_timeout(mock_apify_env): + """apify_run_actor returns error for negative timeout_secs.""" + result = apify_run_actor(actor_id="actor/valid", timeout_secs=-5) + + assert result["status"] == "error" + assert "timeout_secs" in result["content"][0]["text"] + + +def test_run_actor_zero_memory(mock_apify_env): + """apify_run_actor returns error for non-positive memory_mbytes.""" + result = apify_run_actor(actor_id="actor/valid", memory_mbytes=0) + + 
assert result["status"] == "error" + assert "memory_mbytes" in result["content"][0]["text"] + + +def test_run_task_empty_task_id(mock_apify_env): + """apify_run_task returns error for whitespace-only task_id.""" + result = apify_run_task(task_id=" ") + + assert result["status"] == "error" + assert "task_id" in result["content"][0]["text"] + + +def test_run_task_zero_timeout(mock_apify_env): + """apify_run_task returns error for non-positive timeout_secs.""" + result = apify_run_task(task_id="user~my-task", timeout_secs=0) + + assert result["status"] == "error" + assert "timeout_secs" in result["content"][0]["text"] + + +def test_run_task_zero_memory(mock_apify_env): + """apify_run_task returns error for non-positive memory_mbytes.""" + result = apify_run_task(task_id="user~my-task", memory_mbytes=0) + + assert result["status"] == "error" + assert "memory_mbytes" in result["content"][0]["text"] + + +def test_get_dataset_items_empty_dataset_id(mock_apify_env): + """apify_get_dataset_items returns error for whitespace-only dataset_id.""" + result = apify_get_dataset_items(dataset_id=" ") + + assert result["status"] == "error" + assert "dataset_id" in result["content"][0]["text"] + + +def test_get_dataset_items_zero_limit(mock_apify_env): + """apify_get_dataset_items returns error for non-positive limit.""" + result = apify_get_dataset_items(dataset_id="dataset-abc", limit=0) + + assert result["status"] == "error" + assert "limit" in result["content"][0]["text"] + + +def test_get_dataset_items_negative_offset(mock_apify_env): + """apify_get_dataset_items returns error for negative offset.""" + result = apify_get_dataset_items(dataset_id="dataset-abc", offset=-1) + + assert result["status"] == "error" + assert "offset" in result["content"][0]["text"] + + +def test_run_actor_and_get_dataset_zero_dataset_limit(mock_apify_env): + """apify_run_actor_and_get_dataset returns error for non-positive dataset_items_limit.""" + result = 
apify_run_actor_and_get_dataset(actor_id="actor/valid", dataset_items_limit=0) + + assert result["status"] == "error" + assert "dataset_items_limit" in result["content"][0]["text"] + + +def test_run_actor_and_get_dataset_negative_dataset_offset(mock_apify_env): + """apify_run_actor_and_get_dataset returns error for negative dataset_items_offset.""" + result = apify_run_actor_and_get_dataset(actor_id="actor/valid", dataset_items_offset=-1) + + assert result["status"] == "error" + assert "dataset_items_offset" in result["content"][0]["text"] + + +def test_run_task_and_get_dataset_zero_dataset_limit(mock_apify_env): + """apify_run_task_and_get_dataset returns error for non-positive dataset_items_limit.""" + result = apify_run_task_and_get_dataset(task_id="user~my-task", dataset_items_limit=0) + + assert result["status"] == "error" + assert "dataset_items_limit" in result["content"][0]["text"] + + +def test_run_task_and_get_dataset_negative_dataset_offset(mock_apify_env): + """apify_run_task_and_get_dataset returns error for negative dataset_items_offset.""" + result = apify_run_task_and_get_dataset(task_id="user~my-task", dataset_items_offset=-1) + + assert result["status"] == "error" + assert "dataset_items_offset" in result["content"][0]["text"] + + +def test_scrape_url_zero_timeout(mock_apify_env): + """apify_scrape_url returns error for non-positive timeout_secs.""" + result = apify_scrape_url(url="https://example.com", timeout_secs=0) + + assert result["status"] == "error" + assert "timeout_secs" in result["content"][0]["text"] + + +def test_scrape_url_invalid_crawler_type(mock_apify_env): + """apify_scrape_url returns error for unsupported crawler_type.""" + result = apify_scrape_url(url="https://example.com", crawler_type="invalid") + + assert result["status"] == "error" + assert "crawler_type" in result["content"][0]["text"] + + +def test_scrape_url_missing_domain(mock_apify_env): + """apify_scrape_url returns error for URL with no domain.""" + result = 
apify_scrape_url(url="https://") + + assert result["status"] == "error" + assert "domain" in result["content"][0]["text"].lower() + + # --- Dependency guard --- From 2d8bfbeca34823e1ed99ea0c485df9322ebaf80e Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 24 Mar 2026 12:56:55 +0100 Subject: [PATCH 09/24] feat: standardize terminology in apify tool documentation and code --- README.md | 18 +++--- docs/apify_tool.md | 64 ++++++++++---------- src/strands_tools/apify.py | 116 ++++++++++++++++++------------------- tests/test_apify.py | 34 +++++------ 4 files changed, 116 insertions(+), 116 deletions(-) diff --git a/README.md b/README.md index 45a3d6e8..0ed290cf 100644 --- a/README.md +++ b/README.md @@ -100,10 +100,10 @@ Below is a comprehensive table of all available tools, how to use them with an a |------|-------------|----------| | a2a_client | `provider = A2AClientToolProvider(known_agent_urls=["http://localhost:9000"]); agent = Agent(tools=provider.tools)` | Discover and communicate with A2A-compliant agents, send messages between agents | | apify_run_actor | `agent.tool.apify_run_actor(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run any Apify Actor by ID with arbitrary input | -| apify_get_dataset_items | `agent.tool.apify_get_dataset_items(dataset_id="abc123", limit=50)` | Fetch items from an Apify Dataset | -| apify_run_actor_and_get_dataset | `agent.tool.apify_run_actor_and_get_dataset(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run an Actor and fetch its Dataset results in one step | -| apify_run_task | `agent.tool.apify_run_task(task_id="user~my-task")` | Run a saved Apify Task by ID with optional input overrides | -| apify_run_task_and_get_dataset | `agent.tool.apify_run_task_and_get_dataset(task_id="user~my-task", dataset_items_limit=50)` | Run a Task and fetch its Dataset results in one step | +| apify_get_dataset_items | 
`agent.tool.apify_get_dataset_items(dataset_id="abc123", limit=50)` | Fetch items from an Apify dataset | +| apify_run_actor_and_get_dataset | `agent.tool.apify_run_actor_and_get_dataset(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run an Actor and fetch its dataset results in one step | +| apify_run_task | `agent.tool.apify_run_task(task_id="user/my-task")` | Run a saved Apify task by ID with optional input overrides | +| apify_run_task_and_get_dataset | `agent.tool.apify_run_task_and_get_dataset(task_id="user/my-task", dataset_items_limit=50)` | Run a task and fetch its dataset results in one step | | apify_scrape_url | `agent.tool.apify_scrape_url(url="https://example.com")` | Scrape a URL and return its content as markdown | | file_read | `agent.tool.file_read(path="path/to/file.txt")` | Reading configuration files, parsing code files, loading datasets | | file_write | `agent.tool.file_write(path="path/to/file.txt", content="file content")` | Writing results to files, creating new files, saving output data | @@ -991,12 +991,12 @@ result = agent.tool.apify_run_actor_and_get_dataset( dataset_items_limit=50, ) -# Run a saved Task (pre-configured Actor with default inputs) -run_info = agent.tool.apify_run_task(task_id="user~my-task") +# Run a saved task (pre-configured Actor with default inputs) +run_info = agent.tool.apify_run_task(task_id="user/my-task") -# Run a Task and get results in one step +# Run a task and get results in one step result = agent.tool.apify_run_task_and_get_dataset( - task_id="user~my-task", + task_id="user/my-task", task_input={"query": "override default input"}, dataset_items_limit=50, ) @@ -1007,7 +1007,7 @@ run_info = agent.tool.apify_run_actor( run_input={"queries": "AI agent frameworks"}, ) -# Fetch Dataset items separately +# Fetch dataset items separately items = agent.tool.apify_get_dataset_items( dataset_id="abc123", limit=100, diff --git a/docs/apify_tool.md 
b/docs/apify_tool.md index 2a436246..58803bd9 100644 --- a/docs/apify_tool.md +++ b/docs/apify_tool.md @@ -1,6 +1,6 @@ # Apify -The Apify tools (`apify.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform β€” running any [Actor](https://apify.com/store) or [Task](https://docs.apify.com/platform/actors/running/tasks) by ID, fetching Dataset results, and scraping individual URLs. +The Apify tools (`apify.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform β€” running any [Actor](https://apify.com/store) or [task](https://docs.apify.com/platform/actors/running/tasks) by ID, fetching dataset results, and scraping individual URLs. ## Installation @@ -16,7 +16,7 @@ Set your Apify API token as an environment variable: export APIFY_API_TOKEN=apify_api_your_token_here ``` -Get your token from the [Apify Console](https://console.apify.com/account/integrations) β†’ Settings β†’ API & Integrations β†’ Personal API tokens. +Get your token from [Apify Console](https://console.apify.com/account/integrations) β†’ Settings β†’ API & Integrations β†’ Personal API tokens. ## Usage @@ -44,7 +44,7 @@ content = agent.tool.apify_scrape_url(url="https://example.com") ### Run an Actor -Execute any Actor from the [Apify Store](https://apify.com/store) by its ID. The call blocks until the Actor Run finishes or the timeout is reached: +Execute any Actor from [Apify Store](https://apify.com/store) by its ID. 
The call blocks until the Actor run finishes or the timeout is reached: ```python result = agent.tool.apify_run_actor( @@ -58,7 +58,7 @@ The result is a JSON string containing run metadata: `run_id`, `status`, `datase ### Run an Actor and Get Results -Combine running an Actor and fetching its Dataset results in a single call: +Combine running an Actor and fetching its dataset results in a single call: ```python result = agent.tool.apify_run_actor_and_get_dataset( @@ -68,9 +68,9 @@ result = agent.tool.apify_run_actor_and_get_dataset( ) ``` -### Run a Task +### Run a task -Execute a saved [Actor Task](https://docs.apify.com/platform/actors/running/tasks) β€” a pre-configured Actor with preset inputs. Use this when a Task has already been set up in the Apify Console: +Execute a saved [Actor task](https://docs.apify.com/platform/actors/running/tasks) β€” a pre-configured Actor with preset inputs. Use this when a task has already been set up in Apify Console: ```python result = agent.tool.apify_run_task( @@ -82,9 +82,9 @@ result = agent.tool.apify_run_task( The result is a JSON string containing run metadata: `run_id`, `status`, `dataset_id`, `started_at`, and `finished_at`. -### Run a Task and Get Results +### Run a task and get results -Combine running a Task and fetching its Dataset results in a single call: +Combine running a task and fetching its dataset results in a single call: ```python result = agent.tool.apify_run_task_and_get_dataset( @@ -93,9 +93,9 @@ result = agent.tool.apify_run_task_and_get_dataset( ) ``` -### Fetch Dataset Items +### Fetch dataset items -Retrieve results from a Dataset by its ID. Useful after running an Actor to get the structured results separately, or to access any existing Dataset: +Retrieve results from a dataset by its ID. 
Useful after running an Actor to get the structured results separately, or to access any existing dataset: ```python items = agent.tool.apify_get_dataset_items( @@ -122,8 +122,8 @@ items = agent.tool.apify_get_dataset_items( |-----------|------|----------|---------|-------------| | `actor_id` | string | Yes | β€” | Actor identifier (e.g., `apify/website-content-crawler`) | | `run_input` | dict | No | None | JSON-serializable input for the Actor | -| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor Run to finish | -| `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor Run (uses Actor default if not set) | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor run to finish | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor run (uses Actor default if not set) | **Returns:** JSON string with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`. @@ -131,10 +131,10 @@ items = agent.tool.apify_get_dataset_items( | Parameter | Type | Required | Default | Description | |-----------|------|----------|---------|-------------| -| `task_id` | string | Yes | β€” | Task identifier (e.g., `user~my-task` or a Task ID) | -| `task_input` | dict | No | None | JSON-serializable input to override the Task's default input | -| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Task Run to finish | -| `memory_mbytes` | int | No | None | Memory allocation in MB for the Task Run (uses Task default if not set) | +| `task_id` | string | Yes | β€” | Task identifier (e.g., `user~my-task` or a task ID) | +| `task_input` | dict | No | None | JSON-serializable input to override the task's default input | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the task run to finish | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the task run (uses task default if not set) | **Returns:** JSON string with run 
metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`. @@ -142,23 +142,23 @@ items = agent.tool.apify_get_dataset_items( | Parameter | Type | Required | Default | Description | |-----------|------|----------|---------|-------------| -| `task_id` | string | Yes | β€” | Task identifier (e.g., `user~my-task` or a Task ID) | -| `task_input` | dict | No | None | JSON-serializable input to override the Task's default input | -| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Task Run to finish | -| `memory_mbytes` | int | No | None | Memory allocation in MB for the Task Run (uses Task default if not set) | -| `dataset_items_limit` | int | No | 100 | Maximum number of Dataset items to return | +| `task_id` | string | Yes | β€” | Task identifier (e.g., `user~my-task` or a task ID) | +| `task_input` | dict | No | None | JSON-serializable input to override the task's default input | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the task run to finish | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the task run (uses task default if not set) | +| `dataset_items_limit` | int | No | 100 | Maximum number of dataset items to return | -**Returns:** JSON string with run metadata plus an `items` array containing the Dataset results. +**Returns:** JSON string with run metadata plus an `items` array containing the dataset results. ### apify_get_dataset_items | Parameter | Type | Required | Default | Description | |-----------|------|----------|---------|-------------| -| `dataset_id` | string | Yes | β€” | The Apify Dataset ID to fetch items from | +| `dataset_id` | string | Yes | β€” | The Apify dataset ID to fetch items from | | `limit` | int | No | 100 | Maximum number of items to return | | `offset` | int | No | 0 | Number of items to skip for pagination | -**Returns:** JSON string containing an array of Dataset items. +**Returns:** JSON string containing an array of dataset items. 
### apify_run_actor_and_get_dataset @@ -166,11 +166,11 @@ items = agent.tool.apify_get_dataset_items( |-----------|------|----------|---------|-------------| | `actor_id` | string | Yes | β€” | Actor identifier (e.g., `apify/website-content-crawler`) | | `run_input` | dict | No | None | JSON-serializable input for the Actor | -| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor Run to finish | -| `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor Run (uses Actor default if not set) | -| `dataset_items_limit` | int | No | 100 | Maximum number of Dataset items to return | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor run to finish | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor run (uses Actor default if not set) | +| `dataset_items_limit` | int | No | 100 | Maximum number of dataset items to return | -**Returns:** JSON string with run metadata plus an `items` array containing the Dataset results. +**Returns:** JSON string with run metadata plus an `items` array containing the dataset results. ## Troubleshooting @@ -178,10 +178,10 @@ items = agent.tool.apify_get_dataset_items( |-------|-------|-----| | `APIFY_API_TOKEN environment variable is not set` | Token not configured | Set the `APIFY_API_TOKEN` environment variable | | `apify-client package is required` | Optional dependency not installed | Run `pip install strands-agents-tools[apify]` | -| `Actor ... finished with status FAILED` | Actor execution error | Check Actor input parameters and run logs in the [Apify Console](https://console.apify.com) | -| `Task ... finished with status FAILED` | Task execution error | Check Task configuration and run logs in the [Apify Console](https://console.apify.com) | -| `Actor/Task ... finished with status TIMED-OUT` | Timeout too short for the workload | Increase the `timeout_secs` parameter | -| `Task ... 
returned no run data` | Task `call()` returned `None` (wait timeout) | Increase the `timeout_secs` parameter | +| `Actor ... finished with status FAILED` | Actor execution error | Check Actor input parameters and run logs in [Apify Console](https://console.apify.com) | +| `Task ... finished with status FAILED` | task execution error | Check task configuration and run logs in [Apify Console](https://console.apify.com) | +| `Actor/task ... finished with status TIMED-OUT` | Timeout too short for the workload | Increase the `timeout_secs` parameter | +| `Task ... returned no run data` | task `call()` returned `None` (wait timeout) | Increase the `timeout_secs` parameter | | `No content returned for URL` | Website Content Crawler returned empty results | Verify the URL is accessible and returns content | ## References diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index bdac5124..12176eae 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -1,27 +1,27 @@ """Apify platform tools for Strands Agents. This module provides web scraping, data extraction, and automation capabilities -using the Apify platform. It lets you run any Actor by ID, fetch Dataset results, -and scrape individual URLs. +using the Apify platform. It lets you run any Actor, task, fetch dataset +results, and scrape individual URLs. Key Features: ------------ 1. Actor Execution: - β€’ apify_run_actor: Run any Apify Actor by ID with custom input + β€’ apify_run_actor: Run any Apify Actor with custom input β€’ apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step 2. Task Execution: - β€’ apify_run_task: Run a saved Actor Task by ID with optional input overrides - β€’ apify_run_task_and_get_dataset: Run a Task and fetch results in one step + β€’ apify_run_task: Run a saved Actor task with optional input overrides + β€’ apify_run_task_and_get_dataset: Run a task and fetch results in one step 3. 
Data Retrieval: - β€’ apify_get_dataset_items: Fetch items from an Apify Dataset with pagination + β€’ apify_get_dataset_items: Fetch items from an Apify dataset with pagination β€’ apify_scrape_url: Scrape a single URL and return content as Markdown 4. Error Handling: β€’ Graceful API error handling with descriptive messages β€’ Dependency checking (apify-client optional install) - β€’ Timeout management for Actor Runs + β€’ Timeout management for Actor runs Setup Requirements: ------------------ @@ -116,7 +116,7 @@ def _format_error(e: Exception) -> str: case 404: return f"Resource not found: {msg}" case 408: - return f"Actor Run timed out: {msg}" + return f"Actor run timed out: {msg}" case 429: return ( "Rate limit exceeded. The Apify client retries automatically; " @@ -155,7 +155,7 @@ def __init__(self) -> None: @staticmethod def _check_run_status(actor_run: Dict[str, Any], label: str) -> None: - """Raise RuntimeError if the Actor Run did not succeed.""" + """Raise RuntimeError if the Actor run did not succeed.""" status = actor_run.get("status", "UNKNOWN") if status != "SUCCEEDED": run_id = actor_run.get("id", "N/A") @@ -231,7 +231,7 @@ def get_dataset_items( limit: int = DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0, ) -> List[Dict[str, Any]]: - """Fetch items from an Apify Dataset.""" + """Fetch items from an Apify dataset.""" self._validate_identifier(dataset_id, "dataset_id") self._validate_positive(limit, "limit") self._validate_non_negative(offset, "offset") @@ -249,7 +249,7 @@ def run_actor_and_get_dataset( dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, dataset_items_offset: int = 0, ) -> Dict[str, Any]: - """Run an Actor synchronously, then fetch its default Dataset items.""" + """Run an Actor synchronously, then fetch its default dataset items.""" self._validate_positive(dataset_items_limit, "dataset_items_limit") self._validate_non_negative(dataset_items_offset, "dataset_items_offset") @@ -262,7 +262,7 @@ def run_actor_and_get_dataset( ) 
dataset_id = run_metadata["dataset_id"] if not dataset_id: - raise RuntimeError(f"Actor {actor_id} run has no default Dataset.") + raise RuntimeError(f"Actor {actor_id} run has no default dataset.") items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit, offset=dataset_items_offset) return {**run_metadata, "items": items} @@ -273,7 +273,7 @@ def run_task( timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, ) -> Dict[str, Any]: - """Run an Apify Task synchronously and return run metadata.""" + """Run an Apify task synchronously and return run metadata.""" self._validate_identifier(task_id, "task_id") self._validate_positive(timeout_secs, "timeout_secs") if memory_mbytes is not None: @@ -307,7 +307,7 @@ def run_task_and_get_dataset( dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, dataset_items_offset: int = 0, ) -> Dict[str, Any]: - """Run a Task synchronously, then fetch its default Dataset items.""" + """Run a task synchronously, then fetch its default dataset items.""" self._validate_positive(dataset_items_limit, "dataset_items_limit") self._validate_non_negative(dataset_items_offset, "dataset_items_offset") @@ -319,7 +319,7 @@ def run_task_and_get_dataset( ) dataset_id = run_metadata["dataset_id"] if not dataset_id: - raise RuntimeError(f"Task {task_id} run has no default Dataset.") + raise RuntimeError(f"Task {task_id} run has no default dataset.") items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit, offset=dataset_items_offset) return {**run_metadata, "items": items} @@ -370,9 +370,9 @@ def apify_run_actor( memory_mbytes: Optional[int] = None, build: Optional[str] = None, ) -> Dict[str, Any]: - """Run any Apify Actor by its ID or name and return the run metadata as JSON. + """Run any Apify Actor and return the run metadata as JSON. 
- Executes the Actor synchronously - blocks until the Actor Run finishes or the timeout + Executes the Actor synchronously - blocks until the Actor run finishes or the timeout is reached. Use this when you need to run a specific Actor and then inspect or process the results separately. @@ -384,9 +384,9 @@ def apify_run_actor( Args: actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. - timeout_secs: Maximum time in seconds to wait for the Actor Run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor Run. Uses Actor default if not set. - build: Actor Build tag or number to run a specific version. Uses latest Build if not set. + timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. + build: Actor build tag or number to run a specific version. Uses latest build if not set. Returns: Dict with status and content containing run metadata: run_id, status, dataset_id, @@ -405,7 +405,7 @@ def apify_run_actor( return _success_result( text=json.dumps(result, indent=2, default=str), panel_body=( - f"[green]Actor Run completed[/green]\n" + f"[green]Actor run completed[/green]\n" f"Actor: {actor_id}\n" f"Run ID: {result['run_id']}\n" f"Status: {result['status']}\n" @@ -423,18 +423,18 @@ def apify_get_dataset_items( limit: int = DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0, ) -> Dict[str, Any]: - """Fetch items from an existing Apify Dataset and return them as JSON. + """Fetch items from an existing Apify dataset and return them as JSON. Use this after running an Actor to retrieve the structured results from its - default Dataset, or to access any Dataset by ID. + default dataset, or to access any dataset by ID. Args: - dataset_id: The Apify Dataset ID to fetch items from. 
+ dataset_id: The Apify dataset ID to fetch items from. limit: Maximum number of items to return. Defaults to 100. offset: Number of items to skip for pagination. Defaults to 0. Returns: - Dict with status and content containing an array of Dataset items. + Dict with status and content containing an array of dataset items. """ try: _check_dependency() @@ -461,24 +461,24 @@ def apify_run_actor_and_get_dataset( dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, dataset_items_offset: int = 0, ) -> Dict[str, Any]: - """Run an Apify Actor and fetch its Dataset results in one step. + """Run an Apify Actor and fetch its dataset results in one step. - Convenience tool that combines running an Actor and fetching its default Dataset - items into a single call. Use this when you want both the run metadata and the + Convenience tool that combines running an Actor and fetching its default + dataset items into a single call. Use this when you want both the run metadata and the result data without making two separate tool calls. Args: actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". run_input: JSON-serializable input for the Actor. - timeout_secs: Maximum time in seconds to wait for the Actor Run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor Run. - build: Actor Build tag or number to run a specific version. Uses latest Build if not set. - dataset_items_limit: Maximum number of Dataset items to return. Defaults to 100. - dataset_items_offset: Number of Dataset items to skip for pagination. Defaults to 0. + timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor run. + build: Actor build tag or number to run a specific version. Uses latest build if not set. + dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. + dataset_items_offset: Number of dataset items to skip for pagination. 
Defaults to 0. Returns: Dict with status and content containing run metadata (run_id, status, dataset_id, - started_at, finished_at) plus an "items" array containing the Dataset results. + started_at, finished_at) plus an "items" array containing the dataset results. """ try: _check_dependency() @@ -495,7 +495,7 @@ def apify_run_actor_and_get_dataset( return _success_result( text=json.dumps(result, indent=2, default=str), panel_body=( - f"[green]Actor Run completed with dataset[/green]\n" + f"[green]Actor run completed with dataset[/green]\n" f"Actor: {actor_id}\n" f"Run ID: {result['run_id']}\n" f"Status: {result['status']}\n" @@ -515,17 +515,17 @@ def apify_run_task( timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, ) -> Dict[str, Any]: - """Run an Apify Task by its ID or name and return the run metadata as JSON. + """Run an Apify task and return the run metadata as JSON. - Tasks are saved Actor configurations with preset inputs. Use this when a Task - has already been configured in the Apify Console, so you don't need to specify + Tasks are saved Actor configurations with preset inputs. Use this when a task + has already been configured in Apify Console, so you don't need to specify the full Actor input every time. Args: - task_id: Task identifier, e.g. "user~my-task" or a Task ID string. - task_input: Optional JSON-serializable input to override the Task's default input. - timeout_secs: Maximum time in seconds to wait for the Task Run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Task Run. Uses Task default if not set. + task_id: Task identifier, e.g. "user/my-task" or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input. + timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. 
Returns: Dict with status and content containing run metadata: run_id, status, dataset_id, @@ -543,7 +543,7 @@ def apify_run_task( return _success_result( text=json.dumps(result, indent=2, default=str), panel_body=( - f"[green]Task Run completed[/green]\n" + f"[green]Task run completed[/green]\n" f"Task: {task_id}\n" f"Run ID: {result['run_id']}\n" f"Status: {result['status']}\n" @@ -564,23 +564,23 @@ def apify_run_task_and_get_dataset( dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, dataset_items_offset: int = 0, ) -> Dict[str, Any]: - """Run an Apify Task and fetch its Dataset results in one step. + """Run an Apify task and fetch its dataset results in one step. - Convenience tool that combines running a Task and fetching its default Dataset - items into a single call. Use this when you want both the run metadata and the + Convenience tool that combines running a task and fetching its default + dataset items into a single call. Use this when you want both the run metadata and the result data without making two separate tool calls. Args: - task_id: Task identifier, e.g. "user~my-task" or a Task ID string. - task_input: Optional JSON-serializable input to override the Task's default input. - timeout_secs: Maximum time in seconds to wait for the Task Run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Task Run. - dataset_items_limit: Maximum number of Dataset items to return. Defaults to 100. - dataset_items_offset: Number of Dataset items to skip for pagination. Defaults to 0. + task_id: Task identifier, e.g. "user/my-task" or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input. + timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the task run. + dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. + dataset_items_offset: Number of dataset items to skip for pagination. 
Defaults to 0. Returns: Dict with status and content containing run metadata (run_id, status, dataset_id, - started_at, finished_at) plus an "items" array containing the Dataset results. + started_at, finished_at) plus an "items" array containing the dataset results. """ try: _check_dependency() @@ -596,7 +596,7 @@ def apify_run_task_and_get_dataset( return _success_result( text=json.dumps(result, indent=2, default=str), panel_body=( - f"[green]Task Run completed with dataset[/green]\n" + f"[green]Task run completed with dataset[/green]\n" f"Task: {task_id}\n" f"Run ID: {result['run_id']}\n" f"Status: {result['status']}\n" @@ -617,16 +617,16 @@ def apify_scrape_url( ) -> Dict[str, Any]: """Scrape a single URL and return its content as markdown. - Uses the Apify Website Content Crawler Actor under the hood, pre-configured for + Uses the Website Content Crawler Actor under the hood, pre-configured for fast single-page scraping. This is the simplest way to extract readable content from any web page. Args: url: The URL to scrape, e.g. "https://example.com". timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. - crawler_type: Crawler engine to use. One of "playwright:adaptive" (fast, renders JS if - present, recommended default), "playwright:firefox" (reliable, renders JS, best at - avoiding blocking but slower), or "cheerio" (fastest, no JS rendering). + crawler_type: Crawler engine to use. One of "cheerio" (fastest, no JS rendering, + default), "playwright:adaptive" (fast, renders JS if present), or + "playwright:firefox" (reliable, renders JS, best at avoiding blocking but slower). Returns: Dict with status and content containing the markdown content of the scraped page. 
diff --git a/tests/test_apify.py b/tests/test_apify.py index 70b3aca5..3c9ec899 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -126,7 +126,7 @@ def test_client_uses_env_token(mock_apify_env): def test_run_actor_success(mock_apify_env, mock_apify_client): - """Successful Actor Run returns structured result with run metadata.""" + """Successful Actor run returns structured result with run metadata.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_run_actor(actor_id="actor/my-scraper", run_input={"url": "https://example.com"}) @@ -141,7 +141,7 @@ def test_run_actor_success(mock_apify_env, mock_apify_client): def test_run_actor_default_input(mock_apify_env, mock_apify_client): - """Actor Run defaults run_input to empty dict when not provided.""" + """Actor run defaults run_input to empty dict when not provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_run_actor(actor_id="actor/my-scraper") @@ -151,7 +151,7 @@ def test_run_actor_default_input(mock_apify_env, mock_apify_client): def test_run_actor_with_memory(mock_apify_env, mock_apify_client): - """Actor Run passes memory_mbytes when provided.""" + """Actor run passes memory_mbytes when provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): apify_run_actor(actor_id="actor/my-scraper", memory_mbytes=512) @@ -160,7 +160,7 @@ def test_run_actor_with_memory(mock_apify_env, mock_apify_client): def test_run_actor_failure(mock_apify_env, mock_apify_client): - """Actor Run returns error dict when Actor fails.""" + """Actor run returns error dict when Actor fails.""" mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): @@ -171,7 +171,7 @@ def test_run_actor_failure(mock_apify_env, mock_apify_client): def test_run_actor_timeout(mock_apify_env, mock_apify_client): - """Actor Run 
returns error dict when Actor times out.""" + """Actor run returns error dict when Actor times out.""" mock_apify_client.actor.return_value.call.return_value = MOCK_TIMED_OUT_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): @@ -182,7 +182,7 @@ def test_run_actor_timeout(mock_apify_env, mock_apify_client): def test_run_actor_api_exception(mock_apify_env, mock_apify_client): - """Actor Run returns error dict on API exceptions.""" + """Actor run returns error dict on API exceptions.""" mock_apify_client.actor.return_value.call.side_effect = Exception("Connection failed") with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): @@ -193,7 +193,7 @@ def test_run_actor_api_exception(mock_apify_env, mock_apify_client): def test_run_actor_apify_api_error_401(mock_apify_env, mock_apify_client): - """Actor Run returns friendly message for 401 authentication errors.""" + """Actor run returns friendly message for 401 authentication errors.""" error = _make_apify_api_error(401, "Unauthorized") mock_apify_client.actor.return_value.call.side_effect = error @@ -205,7 +205,7 @@ def test_run_actor_apify_api_error_401(mock_apify_env, mock_apify_client): def test_run_actor_apify_api_error_404(mock_apify_env, mock_apify_client): - """Actor Run returns friendly message for 404 not-found errors.""" + """Actor run returns friendly message for 404 not-found errors.""" error = _make_apify_api_error(404, "Actor not found") mock_apify_client.actor.return_value.call.side_effect = error @@ -233,7 +233,7 @@ def test_get_dataset_items_success(mock_apify_env, mock_apify_client): def test_get_dataset_items_with_pagination(mock_apify_env, mock_apify_client): - """Dataset retrieval passes limit and offset.""" + """dataset retrieval passes limit and offset.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): apify_get_dataset_items(dataset_id="dataset-xyz", limit=50, offset=10) @@ -290,7 +290,7 @@ def 
test_run_actor_and_get_dataset_actor_failure(mock_apify_env, mock_apify_clie def test_run_task_success(mock_apify_env, mock_apify_client): - """Successful Task Run returns structured result with run metadata.""" + """Successful task run returns structured result with run metadata.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_run_task(task_id="user~my-task", task_input={"query": "test"}) @@ -303,7 +303,7 @@ def test_run_task_success(mock_apify_env, mock_apify_client): def test_run_task_no_input(mock_apify_env, mock_apify_client): - """Task Run omits task_input kwarg when not provided.""" + """task run omits task_input kwarg when not provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_run_task(task_id="user~my-task") @@ -313,7 +313,7 @@ def test_run_task_no_input(mock_apify_env, mock_apify_client): def test_run_task_with_memory(mock_apify_env, mock_apify_client): - """Task Run passes memory_mbytes when provided.""" + """task run passes memory_mbytes when provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): apify_run_task(task_id="user~my-task", memory_mbytes=1024) @@ -322,7 +322,7 @@ def test_run_task_with_memory(mock_apify_env, mock_apify_client): def test_run_task_failure(mock_apify_env, mock_apify_client): - """Task Run returns error dict when Task fails.""" + """task run returns error dict when task fails.""" mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): @@ -333,7 +333,7 @@ def test_run_task_failure(mock_apify_env, mock_apify_client): def test_run_task_none_response(mock_apify_env, mock_apify_client): - """Task Run returns error dict when TaskClient.call() returns None.""" + """task run returns error dict when TaskClient.call() returns None.""" mock_apify_client.task.return_value.call.return_value = None with 
patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): @@ -344,7 +344,7 @@ def test_run_task_none_response(mock_apify_env, mock_apify_client): def test_run_task_apify_api_error_401(mock_apify_env, mock_apify_client): - """Task Run returns friendly message for 401 authentication errors.""" + """task run returns friendly message for 401 authentication errors.""" error = _make_apify_api_error(401, "Unauthorized") mock_apify_client.task.return_value.call.side_effect = error @@ -359,7 +359,7 @@ def test_run_task_apify_api_error_401(mock_apify_env, mock_apify_client): def test_run_task_and_get_dataset_success(mock_apify_env, mock_apify_client): - """Combined Task run + dataset fetch returns structured result with metadata and items.""" + """Combined task run + dataset fetch returns structured result with metadata and items.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_run_task_and_get_dataset( task_id="user~my-task", @@ -375,7 +375,7 @@ def test_run_task_and_get_dataset_success(mock_apify_env, mock_apify_client): def test_run_task_and_get_dataset_task_failure(mock_apify_env, mock_apify_client): - """Combined Task tool returns error dict when the Task fails.""" + """Combined task tool returns error dict when the task fails.""" mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): From 1ef943d257e06bbd60b1080cba6e8c66756f9138 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 25 Mar 2026 13:08:01 +0100 Subject: [PATCH 10/24] feat: refactor Apify tools into core module and update docs --- README.md | 13 +- docs/apify_tool.md | 23 +- src/strands_tools/apify.py | 345 +----------------------------- src/strands_tools/apify_core.py | 366 ++++++++++++++++++++++++++++++++ tests/test_apify.py | 16 +- 5 files changed, 401 insertions(+), 362 deletions(-) create mode 100644 src/strands_tools/apify_core.py diff --git 
a/README.md b/README.md index 0ed290cf..d79af517 100644 --- a/README.md +++ b/README.md @@ -970,16 +970,9 @@ result = agent.tool.mongodb_memory( ```python from strands import Agent -from strands_tools import apify - -agent = Agent(tools=[ - apify.apify_run_actor, - apify.apify_run_task, - apify.apify_get_dataset_items, - apify.apify_run_actor_and_get_dataset, - apify.apify_run_task_and_get_dataset, - apify.apify_scrape_url, -]) +from strands_tools.apify_core import APIFY_CORE_TOOLS + +agent = Agent(tools=APIFY_CORE_TOOLS) # Scrape a single URL and get markdown content content = agent.tool.apify_scrape_url(url="https://example.com") diff --git a/docs/apify_tool.md b/docs/apify_tool.md index 58803bd9..36358192 100644 --- a/docs/apify_tool.md +++ b/docs/apify_tool.md @@ -1,6 +1,6 @@ # Apify -The Apify tools (`apify.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform β€” running any [Actor](https://apify.com/store) or [task](https://docs.apify.com/platform/actors/running/tasks) by ID, fetching dataset results, and scraping individual URLs. +The Apify core tools (`apify_core.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform β€” running any [Actor](https://apify.com/store) or [task](https://docs.apify.com/platform/actors/running/tasks) by ID, fetching dataset results, and scraping individual URLs. 
## Installation @@ -20,17 +20,24 @@ Get your token from [Apify Console](https://console.apify.com/account/integratio ## Usage +Register all core tools at once: + +```python +from strands import Agent +from strands_tools.apify_core import APIFY_CORE_TOOLS + +agent = Agent(tools=APIFY_CORE_TOOLS) +``` + +Or pick individual tools: + ```python from strands import Agent -from strands_tools import apify +from strands_tools import apify_core agent = Agent(tools=[ - apify.apify_run_actor, - apify.apify_run_task, - apify.apify_scrape_url, - apify.apify_get_dataset_items, - apify.apify_run_actor_and_get_dataset, - apify.apify_run_task_and_get_dataset, + apify_core.apify_run_actor, + apify_core.apify_scrape_url, ]) ``` diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 12176eae..cb63ae70 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -1,27 +1,11 @@ -"""Apify platform tools for Strands Agents. +"""Shared base for Apify platform tools. -This module provides web scraping, data extraction, and automation capabilities -using the Apify platform. It lets you run any Actor, task, fetch dataset -results, and scrape individual URLs. +This module provides the shared infrastructure used by all Apify tool modules +(e.g. apify_core, apify_social). It contains the API client, error handling, +response helpers, and constants. It does NOT contain any @tool functions itself. -Key Features: ------------- -1. Actor Execution: - β€’ apify_run_actor: Run any Apify Actor with custom input - β€’ apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step - -2. Task Execution: - β€’ apify_run_task: Run a saved Actor task with optional input overrides - β€’ apify_run_task_and_get_dataset: Run a task and fetch results in one step - -3. Data Retrieval: - β€’ apify_get_dataset_items: Fetch items from an Apify dataset with pagination - β€’ apify_scrape_url: Scrape a single URL and return content as Markdown - -4. 
Error Handling: - β€’ Graceful API error handling with descriptive messages - β€’ Dependency checking (apify-client optional install) - β€’ Timeout management for Actor runs +Tool modules import from here: + from strands_tools.apify import ApifyToolClient, _check_dependency, ... Setup Requirements: ------------------ @@ -30,37 +14,8 @@ 3. Install the optional dependency: pip install strands-agents-tools[apify] 4. Set the environment variable: APIFY_API_TOKEN=your_api_token_here - -Example .env configuration: - APIFY_API_TOKEN=apify_api_1a2B3cD4eF5gH6iJ7kL8m - -Usage Examples: --------------- -```python -from strands import Agent -from strands_tools import apify - -agent = Agent(tools=[ - apify.apify_run_actor, - apify.apify_run_task, - apify.apify_get_dataset_items, - apify.apify_run_actor_and_get_dataset, - apify.apify_run_task_and_get_dataset, - apify.apify_scrape_url, -]) - -# Scrape a single URL -content = agent.tool.apify_scrape_url(url="https://example.com") - -# Run an Actor -result = agent.tool.apify_run_actor( - actor_id="apify/website-content-crawler", - run_input={"startUrls": [{"url": "https://example.com"}]}, -) -``` """ -import json import logging import os from typing import Any, Dict, List, Optional @@ -68,7 +23,6 @@ from rich.panel import Panel from rich.text import Text -from strands import tool from strands_tools.utils import console_util @@ -357,290 +311,3 @@ def scrape_url( raise RuntimeError(f"No content returned for URL: {url}") return str(items[0].get("markdown") or items[0].get("text", "")) - - -# --- Tool functions --- - - -@tool -def apify_run_actor( - actor_id: str, - run_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = DEFAULT_TIMEOUT_SECS, - memory_mbytes: Optional[int] = None, - build: Optional[str] = None, -) -> Dict[str, Any]: - """Run any Apify Actor and return the run metadata as JSON. - - Executes the Actor synchronously - blocks until the Actor run finishes or the timeout - is reached. 
Use this when you need to run a specific Actor and then inspect or process - the results separately. - - Common Actors: - - "apify/website-content-crawler" - scrape websites and extract content - - "apify/web-scraper" - general-purpose web scraper - - "apify/google-search-scraper" - scrape Google search results - - Args: - actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". - run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. - timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. - build: Actor build tag or number to run a specific version. Uses latest build if not set. - - Returns: - Dict with status and content containing run metadata: run_id, status, dataset_id, - started_at, finished_at. - """ - try: - _check_dependency() - client = ApifyToolClient() - result = client.run_actor( - actor_id=actor_id, - run_input=run_input, - timeout_secs=timeout_secs, - memory_mbytes=memory_mbytes, - build=build, - ) - return _success_result( - text=json.dumps(result, indent=2, default=str), - panel_body=( - f"[green]Actor run completed[/green]\n" - f"Actor: {actor_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}" - ), - panel_title="Apify: Run Actor", - ) - except Exception as e: - return _error_result(e, "apify_run_actor") - - -@tool -def apify_get_dataset_items( - dataset_id: str, - limit: int = DEFAULT_DATASET_ITEMS_LIMIT, - offset: int = 0, -) -> Dict[str, Any]: - """Fetch items from an existing Apify dataset and return them as JSON. - - Use this after running an Actor to retrieve the structured results from its - default dataset, or to access any dataset by ID. - - Args: - dataset_id: The Apify dataset ID to fetch items from. - limit: Maximum number of items to return. Defaults to 100. 
- offset: Number of items to skip for pagination. Defaults to 0. - - Returns: - Dict with status and content containing an array of dataset items. - """ - try: - _check_dependency() - client = ApifyToolClient() - items = client.get_dataset_items(dataset_id=dataset_id, limit=limit, offset=offset) - return _success_result( - text=json.dumps(items, indent=2, default=str), - panel_body=( - f"[green]Dataset items retrieved[/green]\nDataset ID: {dataset_id}\nItems returned: {len(items)}" - ), - panel_title="Apify: Dataset Items", - ) - except Exception as e: - return _error_result(e, "apify_get_dataset_items") - - -@tool -def apify_run_actor_and_get_dataset( - actor_id: str, - run_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = DEFAULT_TIMEOUT_SECS, - memory_mbytes: Optional[int] = None, - build: Optional[str] = None, - dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, - dataset_items_offset: int = 0, -) -> Dict[str, Any]: - """Run an Apify Actor and fetch its dataset results in one step. - - Convenience tool that combines running an Actor and fetching its default - dataset items into a single call. Use this when you want both the run metadata and the - result data without making two separate tool calls. - - Args: - actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". - run_input: JSON-serializable input for the Actor. - timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor run. - build: Actor build tag or number to run a specific version. Uses latest build if not set. - dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. - dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. - - Returns: - Dict with status and content containing run metadata (run_id, status, dataset_id, - started_at, finished_at) plus an "items" array containing the dataset results. 
- """ - try: - _check_dependency() - client = ApifyToolClient() - result = client.run_actor_and_get_dataset( - actor_id=actor_id, - run_input=run_input, - timeout_secs=timeout_secs, - memory_mbytes=memory_mbytes, - build=build, - dataset_items_limit=dataset_items_limit, - dataset_items_offset=dataset_items_offset, - ) - return _success_result( - text=json.dumps(result, indent=2, default=str), - panel_body=( - f"[green]Actor run completed with dataset[/green]\n" - f"Actor: {actor_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}\n" - f"Items returned: {len(result['items'])}" - ), - panel_title="Apify: Run Actor + Dataset", - ) - except Exception as e: - return _error_result(e, "apify_run_actor_and_get_dataset") - - -@tool -def apify_run_task( - task_id: str, - task_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = DEFAULT_TIMEOUT_SECS, - memory_mbytes: Optional[int] = None, -) -> Dict[str, Any]: - """Run an Apify task and return the run metadata as JSON. - - Tasks are saved Actor configurations with preset inputs. Use this when a task - has already been configured in Apify Console, so you don't need to specify - the full Actor input every time. - - Args: - task_id: Task identifier, e.g. "user/my-task" or a task ID string. - task_input: Optional JSON-serializable input to override the task's default input. - timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. - - Returns: - Dict with status and content containing run metadata: run_id, status, dataset_id, - started_at, finished_at. 
- """ - try: - _check_dependency() - client = ApifyToolClient() - result = client.run_task( - task_id=task_id, - task_input=task_input, - timeout_secs=timeout_secs, - memory_mbytes=memory_mbytes, - ) - return _success_result( - text=json.dumps(result, indent=2, default=str), - panel_body=( - f"[green]Task run completed[/green]\n" - f"Task: {task_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}" - ), - panel_title="Apify: Run Task", - ) - except Exception as e: - return _error_result(e, "apify_run_task") - - -@tool -def apify_run_task_and_get_dataset( - task_id: str, - task_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = DEFAULT_TIMEOUT_SECS, - memory_mbytes: Optional[int] = None, - dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, - dataset_items_offset: int = 0, -) -> Dict[str, Any]: - """Run an Apify task and fetch its dataset results in one step. - - Convenience tool that combines running a task and fetching its default - dataset items into a single call. Use this when you want both the run metadata and the - result data without making two separate tool calls. - - Args: - task_id: Task identifier, e.g. "user/my-task" or a task ID string. - task_input: Optional JSON-serializable input to override the task's default input. - timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the task run. - dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. - dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. - - Returns: - Dict with status and content containing run metadata (run_id, status, dataset_id, - started_at, finished_at) plus an "items" array containing the dataset results. 
- """ - try: - _check_dependency() - client = ApifyToolClient() - result = client.run_task_and_get_dataset( - task_id=task_id, - task_input=task_input, - timeout_secs=timeout_secs, - memory_mbytes=memory_mbytes, - dataset_items_limit=dataset_items_limit, - dataset_items_offset=dataset_items_offset, - ) - return _success_result( - text=json.dumps(result, indent=2, default=str), - panel_body=( - f"[green]Task run completed with dataset[/green]\n" - f"Task: {task_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}\n" - f"Items returned: {len(result['items'])}" - ), - panel_title="Apify: Run Task + Dataset", - ) - except Exception as e: - return _error_result(e, "apify_run_task_and_get_dataset") - - -@tool -def apify_scrape_url( - url: str, - timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, - crawler_type: str = "cheerio", -) -> Dict[str, Any]: - """Scrape a single URL and return its content as markdown. - - Uses the Website Content Crawler Actor under the hood, pre-configured for - fast single-page scraping. This is the simplest way to extract readable content - from any web page. - - Args: - url: The URL to scrape, e.g. "https://example.com". - timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. - crawler_type: Crawler engine to use. One of "cheerio" (fastest, no JS rendering, - default), "playwright:adaptive" (fast, renders JS if present), or - "playwright:firefox" (reliable, renders JS, best at avoiding blocking but slower). - - Returns: - Dict with status and content containing the markdown content of the scraped page. 
- """ - try: - _check_dependency() - client = ApifyToolClient() - content = client.scrape_url(url=url, timeout_secs=timeout_secs, crawler_type=crawler_type) - return _success_result( - text=content, - panel_body=( - f"[green]URL scraped successfully[/green]\nURL: {url}\nContent length: {len(content)} characters" - ), - panel_title="Apify: Scrape URL", - ) - except Exception as e: - return _error_result(e, "apify_scrape_url") diff --git a/src/strands_tools/apify_core.py b/src/strands_tools/apify_core.py new file mode 100644 index 00000000..69d330ae --- /dev/null +++ b/src/strands_tools/apify_core.py @@ -0,0 +1,366 @@ +"""Core Apify platform tools for Strands Agents. + +This module provides web scraping, data extraction, and automation capabilities +using the Apify platform. It lets you run any Actor, task, fetch dataset +results, and scrape individual URLs. + +Available Tools: +--------------- +- apify_run_actor: Run any Apify Actor with custom input +- apify_get_dataset_items: Fetch items from an Apify dataset with pagination +- apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step +- apify_run_task: Run a saved Actor task with optional input overrides +- apify_run_task_and_get_dataset: Run a task and fetch results in one step +- apify_scrape_url: Scrape a single URL and return content as Markdown + +Setup Requirements: +------------------ +1. Create an Apify account at https://apify.com +2. Obtain your API token: Apify Console > Settings > API & Integrations > Personal API tokens +3. Install the optional dependency: pip install strands-agents-tools[apify] +4. 
Set the environment variable: + APIFY_API_TOKEN=your_api_token_here + +Usage Examples: +-------------- +Register all core tools at once via the preset list: + +```python +from strands import Agent +from strands_tools.apify_core import APIFY_CORE_TOOLS + +agent = Agent(tools=APIFY_CORE_TOOLS) +``` + +Or pick individual tools for a smaller LLM tool surface: + +```python +from strands import Agent +from strands_tools import apify_core + +agent = Agent(tools=[ + apify_core.apify_scrape_url, + apify_core.apify_run_actor, +]) + +# Scrape a single URL +content = agent.tool.apify_scrape_url(url="https://example.com") + +# Run an Actor +result = agent.tool.apify_run_actor( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, +) +``` +""" + +import json +from typing import Any, Dict, Optional + +from strands import tool + +from strands_tools.apify import ( + DEFAULT_DATASET_ITEMS_LIMIT, + DEFAULT_SCRAPE_TIMEOUT_SECS, + DEFAULT_TIMEOUT_SECS, + ApifyToolClient, + _check_dependency, + _error_result, + _success_result, +) + + +@tool +def apify_run_actor( + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + build: Optional[str] = None, +) -> Dict[str, Any]: + """Run any Apify Actor and return the run metadata as JSON. + + Executes the Actor synchronously - blocks until the Actor run finishes or the timeout + is reached. Use this when you need to run a specific Actor and then inspect or process + the results separately. + + Common Actors: + - "apify/website-content-crawler" - scrape websites and extract content + - "apify/web-scraper" - general-purpose web scraper + - "apify/google-search-scraper" - scrape Google search results + + Args: + actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". + run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. 
+ timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. + build: Actor build tag or number to run a specific version. Uses latest build if not set. + + Returns: + Dict with status and content containing run metadata: run_id, status, dataset_id, + started_at, finished_at. + """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_actor( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + build=build, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Actor run completed[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}" + ), + panel_title="Apify: Run Actor", + ) + except Exception as e: + return _error_result(e, "apify_run_actor") + + +@tool +def apify_get_dataset_items( + dataset_id: str, + limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + offset: int = 0, +) -> Dict[str, Any]: + """Fetch items from an existing Apify dataset and return them as JSON. + + Use this after running an Actor to retrieve the structured results from its + default dataset, or to access any dataset by ID. + + Args: + dataset_id: The Apify dataset ID to fetch items from. + limit: Maximum number of items to return. Defaults to 100. + offset: Number of items to skip for pagination. Defaults to 0. + + Returns: + Dict with status and content containing an array of dataset items. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + items = client.get_dataset_items(dataset_id=dataset_id, limit=limit, offset=offset) + return _success_result( + text=json.dumps(items, indent=2, default=str), + panel_body=( + f"[green]Dataset items retrieved[/green]\nDataset ID: {dataset_id}\nItems returned: {len(items)}" + ), + panel_title="Apify: Dataset Items", + ) + except Exception as e: + return _error_result(e, "apify_get_dataset_items") + + +@tool +def apify_run_actor_and_get_dataset( + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + build: Optional[str] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, +) -> Dict[str, Any]: + """Run an Apify Actor and fetch its dataset results in one step. + + Convenience tool that combines running an Actor and fetching its default + dataset items into a single call. Use this when you want both the run metadata and the + result data without making two separate tool calls. + + Args: + actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". + run_input: JSON-serializable input for the Actor. + timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor run. + build: Actor build tag or number to run a specific version. Uses latest build if not set. + dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. + dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. + + Returns: + Dict with status and content containing run metadata (run_id, status, dataset_id, + started_at, finished_at) plus an "items" array containing the dataset results. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_actor_and_get_dataset( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + build=build, + dataset_items_limit=dataset_items_limit, + dataset_items_offset=dataset_items_offset, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Actor run completed with dataset[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}" + ), + panel_title="Apify: Run Actor + Dataset", + ) + except Exception as e: + return _error_result(e, "apify_run_actor_and_get_dataset") + + +@tool +def apify_run_task( + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, +) -> Dict[str, Any]: + """Run an Apify task and return the run metadata as JSON. + + Tasks are saved Actor configurations with preset inputs. Use this when a task + has already been configured in Apify Console, so you don't need to specify + the full Actor input every time. + + Args: + task_id: Task identifier, e.g. "user/my-task" or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input. + timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. + + Returns: + Dict with status and content containing run metadata: run_id, status, dataset_id, + started_at, finished_at. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_task( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Task run completed[/green]\n" + f"Task: {task_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}" + ), + panel_title="Apify: Run Task", + ) + except Exception as e: + return _error_result(e, "apify_run_task") + + +@tool +def apify_run_task_and_get_dataset( + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, +) -> Dict[str, Any]: + """Run an Apify task and fetch its dataset results in one step. + + Convenience tool that combines running a task and fetching its default + dataset items into a single call. Use this when you want both the run metadata and the + result data without making two separate tool calls. + + Args: + task_id: Task identifier, e.g. "user/my-task" or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input. + timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the task run. + dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. + dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. + + Returns: + Dict with status and content containing run metadata (run_id, status, dataset_id, + started_at, finished_at) plus an "items" array containing the dataset results. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_task_and_get_dataset( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + dataset_items_limit=dataset_items_limit, + dataset_items_offset=dataset_items_offset, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Task run completed with dataset[/green]\n" + f"Task: {task_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}" + ), + panel_title="Apify: Run Task + Dataset", + ) + except Exception as e: + return _error_result(e, "apify_run_task_and_get_dataset") + + +@tool +def apify_scrape_url( + url: str, + timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, + crawler_type: str = "cheerio", +) -> Dict[str, Any]: + """Scrape a single URL and return its content as markdown. + + Uses the Website Content Crawler Actor under the hood, pre-configured for + fast single-page scraping. This is the simplest way to extract readable content + from any web page. + + Args: + url: The URL to scrape, e.g. "https://example.com". + timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. + crawler_type: Crawler engine to use. One of "cheerio" (fastest, no JS rendering, + default), "playwright:adaptive" (fast, renders JS if present), or + "playwright:firefox" (reliable, renders JS, best at avoiding blocking but slower). + + Returns: + Dict with status and content containing the markdown content of the scraped page. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + content = client.scrape_url(url=url, timeout_secs=timeout_secs, crawler_type=crawler_type) + return _success_result( + text=content, + panel_body=( + f"[green]URL scraped successfully[/green]\nURL: {url}\nContent length: {len(content)} characters" + ), + panel_title="Apify: Scrape URL", + ) + except Exception as e: + return _error_result(e, "apify_scrape_url") + + +# Pre-built list of all core tools for convenient agent registration. +# Usage: Agent(tools=APIFY_CORE_TOOLS) +APIFY_CORE_TOOLS = [ + apify_run_actor, + apify_get_dataset_items, + apify_run_actor_and_get_dataset, + apify_run_task, + apify_run_task_and_get_dataset, + apify_scrape_url, +] diff --git a/tests/test_apify.py b/tests/test_apify.py index 3c9ec899..963225fd 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -5,9 +5,9 @@ import pytest -from strands_tools import apify -from strands_tools.apify import ( - ApifyToolClient, +from strands_tools import apify, apify_core +from strands_tools.apify import ApifyToolClient +from strands_tools.apify_core import ( apify_get_dataset_items, apify_run_actor, apify_run_actor_and_get_dataset, @@ -96,12 +96,18 @@ def mock_apify_env(monkeypatch): # --- Module import --- -def test_apify_module_is_importable(): - """Verify that the apify tool module can be imported from strands_tools.""" +def test_apify_base_module_is_importable(): + """Verify that the apify base module can be imported from strands_tools.""" assert apify is not None assert apify.__name__ == "strands_tools.apify" +def test_apify_core_module_is_importable(): + """Verify that the apify_core tool module can be imported from strands_tools.""" + assert apify_core is not None + assert apify_core.__name__ == "strands_tools.apify_core" + + # --- ApifyToolClient --- From ff2494ef7fb4399bfe9dd8b90f1c0915ea4d78ce Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 25 Mar 2026 14:28:42 +0100 Subject: [PATCH 11/24] feat: refactor the 
tool, use one file only --- README.md | 2 +- docs/apify_tool.md | 10 +- src/strands_tools/apify.py | 349 +++++++++++++++++++++++++++++- src/strands_tools/apify_core.py | 366 -------------------------------- tests/test_apify.py | 16 +- 5 files changed, 354 insertions(+), 389 deletions(-) delete mode 100644 src/strands_tools/apify_core.py diff --git a/README.md b/README.md index d79af517..67d9833a 100644 --- a/README.md +++ b/README.md @@ -970,7 +970,7 @@ result = agent.tool.mongodb_memory( ```python from strands import Agent -from strands_tools.apify_core import APIFY_CORE_TOOLS +from strands_tools.apify import APIFY_CORE_TOOLS agent = Agent(tools=APIFY_CORE_TOOLS) diff --git a/docs/apify_tool.md b/docs/apify_tool.md index 36358192..f1455cdb 100644 --- a/docs/apify_tool.md +++ b/docs/apify_tool.md @@ -1,6 +1,6 @@ # Apify -The Apify core tools (`apify_core.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform β€” running any [Actor](https://apify.com/store) or [task](https://docs.apify.com/platform/actors/running/tasks) by ID, fetching dataset results, and scraping individual URLs. +The Apify tools (`apify.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform β€” running any [Actor](https://apify.com/store) or [task](https://docs.apify.com/platform/actors/running/tasks) by ID, fetching dataset results, and scraping individual URLs. 
## Installation @@ -24,7 +24,7 @@ Register all core tools at once: ```python from strands import Agent -from strands_tools.apify_core import APIFY_CORE_TOOLS +from strands_tools.apify import APIFY_CORE_TOOLS agent = Agent(tools=APIFY_CORE_TOOLS) ``` @@ -33,11 +33,11 @@ Or pick individual tools: ```python from strands import Agent -from strands_tools import apify_core +from strands_tools import apify agent = Agent(tools=[ - apify_core.apify_run_actor, - apify_core.apify_scrape_url, + apify.apify_run_actor, + apify.apify_scrape_url, ]) ``` diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index cb63ae70..5855ab83 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -1,11 +1,17 @@ -"""Shared base for Apify platform tools. +"""Apify platform tools for Strands Agents. -This module provides the shared infrastructure used by all Apify tool modules -(e.g. apify_core, apify_social). It contains the API client, error handling, -response helpers, and constants. It does NOT contain any @tool functions itself. +This module provides web scraping, data extraction, and automation capabilities +using the Apify platform. It lets you run any Actor, task, fetch dataset +results, and scrape individual URLs. -Tool modules import from here: - from strands_tools.apify import ApifyToolClient, _check_dependency, ... +Available Tools: +--------------- +- apify_run_actor: Run any Apify Actor with custom input +- apify_get_dataset_items: Fetch items from an Apify dataset with pagination +- apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step +- apify_run_task: Run a saved Actor task with optional input overrides +- apify_run_task_and_get_dataset: Run a task and fetch results in one step +- apify_scrape_url: Scrape a single URL and return content as Markdown Setup Requirements: ------------------ @@ -14,8 +20,41 @@ 3. Install the optional dependency: pip install strands-agents-tools[apify] 4. 
Set the environment variable: APIFY_API_TOKEN=your_api_token_here + +Usage Examples: +-------------- +Register all core tools at once via the preset list: + +```python +from strands import Agent +from strands_tools.apify import APIFY_CORE_TOOLS + +agent = Agent(tools=APIFY_CORE_TOOLS) +``` + +Or pick individual tools for a smaller LLM tool surface: + +```python +from strands import Agent +from strands_tools import apify + +agent = Agent(tools=[ + apify.apify_scrape_url, + apify.apify_run_actor, +]) + +# Scrape a single URL +content = agent.tool.apify_scrape_url(url="https://example.com") + +# Run an Actor +result = agent.tool.apify_run_actor( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, +) +``` """ +import json import logging import os from typing import Any, Dict, List, Optional @@ -23,6 +62,7 @@ from rich.panel import Panel from rich.text import Text +from strands import tool from strands_tools.utils import console_util @@ -311,3 +351,300 @@ def scrape_url( raise RuntimeError(f"No content returned for URL: {url}") return str(items[0].get("markdown") or items[0].get("text", "")) + + +# --- Tool functions --- + + +@tool +def apify_run_actor( + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + build: Optional[str] = None, +) -> Dict[str, Any]: + """Run any Apify Actor and return the run metadata as JSON. + + Executes the Actor synchronously - blocks until the Actor run finishes or the timeout + is reached. Use this when you need to run a specific Actor and then inspect or process + the results separately. + + Common Actors: + - "apify/website-content-crawler" - scrape websites and extract content + - "apify/web-scraper" - general-purpose web scraper + - "apify/google-search-scraper" - scrape Google search results + + Args: + actor_id: Actor identifier, e.g. 
"apify/website-content-crawler" or "username/actor-name". + run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. + timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. + build: Actor build tag or number to run a specific version. Uses latest build if not set. + + Returns: + Dict with status and content containing run metadata: run_id, status, dataset_id, + started_at, finished_at. + """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_actor( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + build=build, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Actor run completed[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}" + ), + panel_title="Apify: Run Actor", + ) + except Exception as e: + return _error_result(e, "apify_run_actor") + + +@tool +def apify_get_dataset_items( + dataset_id: str, + limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + offset: int = 0, +) -> Dict[str, Any]: + """Fetch items from an existing Apify dataset and return them as JSON. + + Use this after running an Actor to retrieve the structured results from its + default dataset, or to access any dataset by ID. + + Args: + dataset_id: The Apify dataset ID to fetch items from. + limit: Maximum number of items to return. Defaults to 100. + offset: Number of items to skip for pagination. Defaults to 0. + + Returns: + Dict with status and content containing an array of dataset items. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + items = client.get_dataset_items(dataset_id=dataset_id, limit=limit, offset=offset) + return _success_result( + text=json.dumps(items, indent=2, default=str), + panel_body=( + f"[green]Dataset items retrieved[/green]\nDataset ID: {dataset_id}\nItems returned: {len(items)}" + ), + panel_title="Apify: Dataset Items", + ) + except Exception as e: + return _error_result(e, "apify_get_dataset_items") + + +@tool +def apify_run_actor_and_get_dataset( + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + build: Optional[str] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, +) -> Dict[str, Any]: + """Run an Apify Actor and fetch its dataset results in one step. + + Convenience tool that combines running an Actor and fetching its default + dataset items into a single call. Use this when you want both the run metadata and the + result data without making two separate tool calls. + + Args: + actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". + run_input: JSON-serializable input for the Actor. + timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor run. + build: Actor build tag or number to run a specific version. Uses latest build if not set. + dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. + dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. + + Returns: + Dict with status and content containing run metadata (run_id, status, dataset_id, + started_at, finished_at) plus an "items" array containing the dataset results. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_actor_and_get_dataset( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + build=build, + dataset_items_limit=dataset_items_limit, + dataset_items_offset=dataset_items_offset, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Actor run completed with dataset[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}" + ), + panel_title="Apify: Run Actor + Dataset", + ) + except Exception as e: + return _error_result(e, "apify_run_actor_and_get_dataset") + + +@tool +def apify_run_task( + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, +) -> Dict[str, Any]: + """Run an Apify task and return the run metadata as JSON. + + Tasks are saved Actor configurations with preset inputs. Use this when a task + has already been configured in Apify Console, so you don't need to specify + the full Actor input every time. + + Args: + task_id: Task identifier, e.g. "user/my-task" or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input. + timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. + + Returns: + Dict with status and content containing run metadata: run_id, status, dataset_id, + started_at, finished_at. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_task( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Task run completed[/green]\n" + f"Task: {task_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}" + ), + panel_title="Apify: Run Task", + ) + except Exception as e: + return _error_result(e, "apify_run_task") + + +@tool +def apify_run_task_and_get_dataset( + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, +) -> Dict[str, Any]: + """Run an Apify task and fetch its dataset results in one step. + + Convenience tool that combines running a task and fetching its default + dataset items into a single call. Use this when you want both the run metadata and the + result data without making two separate tool calls. + + Args: + task_id: Task identifier, e.g. "user/my-task" or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input. + timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the task run. + dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. + dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. + + Returns: + Dict with status and content containing run metadata (run_id, status, dataset_id, + started_at, finished_at) plus an "items" array containing the dataset results. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_task_and_get_dataset( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + dataset_items_limit=dataset_items_limit, + dataset_items_offset=dataset_items_offset, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Task run completed with dataset[/green]\n" + f"Task: {task_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}" + ), + panel_title="Apify: Run Task + Dataset", + ) + except Exception as e: + return _error_result(e, "apify_run_task_and_get_dataset") + + +@tool +def apify_scrape_url( + url: str, + timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, + crawler_type: str = "cheerio", +) -> Dict[str, Any]: + """Scrape a single URL and return its content as markdown. + + Uses the Website Content Crawler Actor under the hood, pre-configured for + fast single-page scraping. This is the simplest way to extract readable content + from any web page. + + Args: + url: The URL to scrape, e.g. "https://example.com". + timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. + crawler_type: Crawler engine to use. One of "cheerio" (fastest, no JS rendering, + default), "playwright:adaptive" (fast, renders JS if present), or + "playwright:firefox" (reliable, renders JS, best at avoiding blocking but slower). + + Returns: + Dict with status and content containing the markdown content of the scraped page. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + content = client.scrape_url(url=url, timeout_secs=timeout_secs, crawler_type=crawler_type) + return _success_result( + text=content, + panel_body=( + f"[green]URL scraped successfully[/green]\nURL: {url}\nContent length: {len(content)} characters" + ), + panel_title="Apify: Scrape URL", + ) + except Exception as e: + return _error_result(e, "apify_scrape_url") + + +APIFY_CORE_TOOLS = [ + apify_run_actor, + apify_get_dataset_items, + apify_run_actor_and_get_dataset, + apify_run_task, + apify_run_task_and_get_dataset, + apify_scrape_url, +] diff --git a/src/strands_tools/apify_core.py b/src/strands_tools/apify_core.py deleted file mode 100644 index 69d330ae..00000000 --- a/src/strands_tools/apify_core.py +++ /dev/null @@ -1,366 +0,0 @@ -"""Core Apify platform tools for Strands Agents. - -This module provides web scraping, data extraction, and automation capabilities -using the Apify platform. It lets you run any Actor, task, fetch dataset -results, and scrape individual URLs. - -Available Tools: ---------------- -- apify_run_actor: Run any Apify Actor with custom input -- apify_get_dataset_items: Fetch items from an Apify dataset with pagination -- apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step -- apify_run_task: Run a saved Actor task with optional input overrides -- apify_run_task_and_get_dataset: Run a task and fetch results in one step -- apify_scrape_url: Scrape a single URL and return content as Markdown - -Setup Requirements: ------------------- -1. Create an Apify account at https://apify.com -2. Obtain your API token: Apify Console > Settings > API & Integrations > Personal API tokens -3. Install the optional dependency: pip install strands-agents-tools[apify] -4. 
Set the environment variable: - APIFY_API_TOKEN=your_api_token_here - -Usage Examples: --------------- -Register all core tools at once via the preset list: - -```python -from strands import Agent -from strands_tools.apify_core import APIFY_CORE_TOOLS - -agent = Agent(tools=APIFY_CORE_TOOLS) -``` - -Or pick individual tools for a smaller LLM tool surface: - -```python -from strands import Agent -from strands_tools import apify_core - -agent = Agent(tools=[ - apify_core.apify_scrape_url, - apify_core.apify_run_actor, -]) - -# Scrape a single URL -content = agent.tool.apify_scrape_url(url="https://example.com") - -# Run an Actor -result = agent.tool.apify_run_actor( - actor_id="apify/website-content-crawler", - run_input={"startUrls": [{"url": "https://example.com"}]}, -) -``` -""" - -import json -from typing import Any, Dict, Optional - -from strands import tool - -from strands_tools.apify import ( - DEFAULT_DATASET_ITEMS_LIMIT, - DEFAULT_SCRAPE_TIMEOUT_SECS, - DEFAULT_TIMEOUT_SECS, - ApifyToolClient, - _check_dependency, - _error_result, - _success_result, -) - - -@tool -def apify_run_actor( - actor_id: str, - run_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = DEFAULT_TIMEOUT_SECS, - memory_mbytes: Optional[int] = None, - build: Optional[str] = None, -) -> Dict[str, Any]: - """Run any Apify Actor and return the run metadata as JSON. - - Executes the Actor synchronously - blocks until the Actor run finishes or the timeout - is reached. Use this when you need to run a specific Actor and then inspect or process - the results separately. - - Common Actors: - - "apify/website-content-crawler" - scrape websites and extract content - - "apify/web-scraper" - general-purpose web scraper - - "apify/google-search-scraper" - scrape Google search results - - Args: - actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". - run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. 
- timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. - build: Actor build tag or number to run a specific version. Uses latest build if not set. - - Returns: - Dict with status and content containing run metadata: run_id, status, dataset_id, - started_at, finished_at. - """ - try: - _check_dependency() - client = ApifyToolClient() - result = client.run_actor( - actor_id=actor_id, - run_input=run_input, - timeout_secs=timeout_secs, - memory_mbytes=memory_mbytes, - build=build, - ) - return _success_result( - text=json.dumps(result, indent=2, default=str), - panel_body=( - f"[green]Actor run completed[/green]\n" - f"Actor: {actor_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}" - ), - panel_title="Apify: Run Actor", - ) - except Exception as e: - return _error_result(e, "apify_run_actor") - - -@tool -def apify_get_dataset_items( - dataset_id: str, - limit: int = DEFAULT_DATASET_ITEMS_LIMIT, - offset: int = 0, -) -> Dict[str, Any]: - """Fetch items from an existing Apify dataset and return them as JSON. - - Use this after running an Actor to retrieve the structured results from its - default dataset, or to access any dataset by ID. - - Args: - dataset_id: The Apify dataset ID to fetch items from. - limit: Maximum number of items to return. Defaults to 100. - offset: Number of items to skip for pagination. Defaults to 0. - - Returns: - Dict with status and content containing an array of dataset items. 
- """ - try: - _check_dependency() - client = ApifyToolClient() - items = client.get_dataset_items(dataset_id=dataset_id, limit=limit, offset=offset) - return _success_result( - text=json.dumps(items, indent=2, default=str), - panel_body=( - f"[green]Dataset items retrieved[/green]\nDataset ID: {dataset_id}\nItems returned: {len(items)}" - ), - panel_title="Apify: Dataset Items", - ) - except Exception as e: - return _error_result(e, "apify_get_dataset_items") - - -@tool -def apify_run_actor_and_get_dataset( - actor_id: str, - run_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = DEFAULT_TIMEOUT_SECS, - memory_mbytes: Optional[int] = None, - build: Optional[str] = None, - dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, - dataset_items_offset: int = 0, -) -> Dict[str, Any]: - """Run an Apify Actor and fetch its dataset results in one step. - - Convenience tool that combines running an Actor and fetching its default - dataset items into a single call. Use this when you want both the run metadata and the - result data without making two separate tool calls. - - Args: - actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". - run_input: JSON-serializable input for the Actor. - timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor run. - build: Actor build tag or number to run a specific version. Uses latest build if not set. - dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. - dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. - - Returns: - Dict with status and content containing run metadata (run_id, status, dataset_id, - started_at, finished_at) plus an "items" array containing the dataset results. 
- """ - try: - _check_dependency() - client = ApifyToolClient() - result = client.run_actor_and_get_dataset( - actor_id=actor_id, - run_input=run_input, - timeout_secs=timeout_secs, - memory_mbytes=memory_mbytes, - build=build, - dataset_items_limit=dataset_items_limit, - dataset_items_offset=dataset_items_offset, - ) - return _success_result( - text=json.dumps(result, indent=2, default=str), - panel_body=( - f"[green]Actor run completed with dataset[/green]\n" - f"Actor: {actor_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}\n" - f"Items returned: {len(result['items'])}" - ), - panel_title="Apify: Run Actor + Dataset", - ) - except Exception as e: - return _error_result(e, "apify_run_actor_and_get_dataset") - - -@tool -def apify_run_task( - task_id: str, - task_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = DEFAULT_TIMEOUT_SECS, - memory_mbytes: Optional[int] = None, -) -> Dict[str, Any]: - """Run an Apify task and return the run metadata as JSON. - - Tasks are saved Actor configurations with preset inputs. Use this when a task - has already been configured in Apify Console, so you don't need to specify - the full Actor input every time. - - Args: - task_id: Task identifier, e.g. "user/my-task" or a task ID string. - task_input: Optional JSON-serializable input to override the task's default input. - timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. - - Returns: - Dict with status and content containing run metadata: run_id, status, dataset_id, - started_at, finished_at. 
- """ - try: - _check_dependency() - client = ApifyToolClient() - result = client.run_task( - task_id=task_id, - task_input=task_input, - timeout_secs=timeout_secs, - memory_mbytes=memory_mbytes, - ) - return _success_result( - text=json.dumps(result, indent=2, default=str), - panel_body=( - f"[green]Task run completed[/green]\n" - f"Task: {task_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}" - ), - panel_title="Apify: Run Task", - ) - except Exception as e: - return _error_result(e, "apify_run_task") - - -@tool -def apify_run_task_and_get_dataset( - task_id: str, - task_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = DEFAULT_TIMEOUT_SECS, - memory_mbytes: Optional[int] = None, - dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, - dataset_items_offset: int = 0, -) -> Dict[str, Any]: - """Run an Apify task and fetch its dataset results in one step. - - Convenience tool that combines running a task and fetching its default - dataset items into a single call. Use this when you want both the run metadata and the - result data without making two separate tool calls. - - Args: - task_id: Task identifier, e.g. "user/my-task" or a task ID string. - task_input: Optional JSON-serializable input to override the task's default input. - timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the task run. - dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. - dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. - - Returns: - Dict with status and content containing run metadata (run_id, status, dataset_id, - started_at, finished_at) plus an "items" array containing the dataset results. 
- """ - try: - _check_dependency() - client = ApifyToolClient() - result = client.run_task_and_get_dataset( - task_id=task_id, - task_input=task_input, - timeout_secs=timeout_secs, - memory_mbytes=memory_mbytes, - dataset_items_limit=dataset_items_limit, - dataset_items_offset=dataset_items_offset, - ) - return _success_result( - text=json.dumps(result, indent=2, default=str), - panel_body=( - f"[green]Task run completed with dataset[/green]\n" - f"Task: {task_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}\n" - f"Items returned: {len(result['items'])}" - ), - panel_title="Apify: Run Task + Dataset", - ) - except Exception as e: - return _error_result(e, "apify_run_task_and_get_dataset") - - -@tool -def apify_scrape_url( - url: str, - timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, - crawler_type: str = "cheerio", -) -> Dict[str, Any]: - """Scrape a single URL and return its content as markdown. - - Uses the Website Content Crawler Actor under the hood, pre-configured for - fast single-page scraping. This is the simplest way to extract readable content - from any web page. - - Args: - url: The URL to scrape, e.g. "https://example.com". - timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. - crawler_type: Crawler engine to use. One of "cheerio" (fastest, no JS rendering, - default), "playwright:adaptive" (fast, renders JS if present), or - "playwright:firefox" (reliable, renders JS, best at avoiding blocking but slower). - - Returns: - Dict with status and content containing the markdown content of the scraped page. 
- """ - try: - _check_dependency() - client = ApifyToolClient() - content = client.scrape_url(url=url, timeout_secs=timeout_secs, crawler_type=crawler_type) - return _success_result( - text=content, - panel_body=( - f"[green]URL scraped successfully[/green]\nURL: {url}\nContent length: {len(content)} characters" - ), - panel_title="Apify: Scrape URL", - ) - except Exception as e: - return _error_result(e, "apify_scrape_url") - - -# Pre-built list of all core tools for convenient agent registration. -# Usage: Agent(tools=APIFY_CORE_TOOLS) -APIFY_CORE_TOOLS = [ - apify_run_actor, - apify_get_dataset_items, - apify_run_actor_and_get_dataset, - apify_run_task, - apify_run_task_and_get_dataset, - apify_scrape_url, -] diff --git a/tests/test_apify.py b/tests/test_apify.py index 963225fd..038f8211 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -5,9 +5,9 @@ import pytest -from strands_tools import apify, apify_core -from strands_tools.apify import ApifyToolClient -from strands_tools.apify_core import ( +from strands_tools import apify +from strands_tools.apify import ( + ApifyToolClient, apify_get_dataset_items, apify_run_actor, apify_run_actor_and_get_dataset, @@ -96,18 +96,12 @@ def mock_apify_env(monkeypatch): # --- Module import --- -def test_apify_base_module_is_importable(): - """Verify that the apify base module can be imported from strands_tools.""" +def test_apify_module_is_importable(): + """Verify that the apify module can be imported from strands_tools.""" assert apify is not None assert apify.__name__ == "strands_tools.apify" -def test_apify_core_module_is_importable(): - """Verify that the apify_core tool module can be imported from strands_tools.""" - assert apify_core is not None - assert apify_core.__name__ == "strands_tools.apify_core" - - # --- ApifyToolClient --- From 488a168d0cf3c7b48ee932ad0b2433a690688614 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 26 Mar 2026 09:57:56 +0100 Subject: [PATCH 12/24] docs: add missing tools 
parameters --- docs/apify_tool.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/apify_tool.md b/docs/apify_tool.md index f1455cdb..46e9e800 100644 --- a/docs/apify_tool.md +++ b/docs/apify_tool.md @@ -120,6 +120,7 @@ items = agent.tool.apify_get_dataset_items( |-----------|------|----------|---------|-------------| | `url` | string | Yes | β€” | The URL to scrape | | `timeout_secs` | int | No | 120 | Maximum time in seconds to wait for scraping to finish | +| `crawler_type` | string | No | `"cheerio"` | Crawler engine to use. One of `"cheerio"` (fastest, no JS rendering), `"playwright:adaptive"` (fast, renders JS if present), or `"playwright:firefox"` (reliable, renders JS, best at avoiding blocking but slower) | **Returns:** Markdown content of the scraped page as a plain string. @@ -131,6 +132,7 @@ items = agent.tool.apify_get_dataset_items( | `run_input` | dict | No | None | JSON-serializable input for the Actor | | `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor run to finish | | `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor run (uses Actor default if not set) | +| `build` | string | No | None | Actor build tag or number to run a specific version (uses latest build if not set) | **Returns:** JSON string with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`. @@ -154,6 +156,7 @@ items = agent.tool.apify_get_dataset_items( | `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the task run to finish | | `memory_mbytes` | int | No | None | Memory allocation in MB for the task run (uses task default if not set) | | `dataset_items_limit` | int | No | 100 | Maximum number of dataset items to return | +| `dataset_items_offset` | int | No | 0 | Number of dataset items to skip for pagination | **Returns:** JSON string with run metadata plus an `items` array containing the dataset results. 
@@ -175,7 +178,9 @@ items = agent.tool.apify_get_dataset_items( | `run_input` | dict | No | None | JSON-serializable input for the Actor | | `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor run to finish | | `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor run (uses Actor default if not set) | +| `build` | string | No | None | Actor build tag or number to run a specific version (uses latest build if not set) | | `dataset_items_limit` | int | No | 100 | Maximum number of dataset items to return | +| `dataset_items_offset` | int | No | 0 | Number of dataset items to skip for pagination | **Returns:** JSON string with run metadata plus an `items` array containing the dataset results. From 1b9675f339003ae2bc97f1c5fbc8fe6b9c43894d Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Thu, 2 Apr 2026 11:46:55 +0200 Subject: [PATCH 13/24] fix: Update Apify tools documentation for improved clarity and expanded details on input, usage, and examples. --- src/strands_tools/apify.py | 82 ++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 5855ab83..1505cb60 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -1,8 +1,10 @@ """Apify platform tools for Strands Agents. -This module provides web scraping, data extraction, and automation capabilities -using the Apify platform. It lets you run any Actor, task, fetch dataset -results, and scrape individual URLs. +Apify is a large marketplace of tools for web scraping, data extraction, +and web automation. These tools are called Actors β€” serverless cloud applications that +take JSON input and store results in a dataset (structured, tabular output) or key-value +store (files and unstructured data). Actors exist for social media, e-commerce, search +engines, maps, travel sites, and many other sources. 
Available Tools: --------------- @@ -16,7 +18,7 @@ Setup Requirements: ------------------ 1. Create an Apify account at https://apify.com -2. Obtain your API token: Apify Console > Settings > API & Integrations > Personal API tokens +2. Get your API token: Apify Console > Settings > API & Integrations > Personal API tokens 3. Install the optional dependency: pip install strands-agents-tools[apify] 4. Set the environment variable: APIFY_API_TOKEN=your_api_token_here @@ -366,18 +368,22 @@ def apify_run_actor( ) -> Dict[str, Any]: """Run any Apify Actor and return the run metadata as JSON. - Executes the Actor synchronously - blocks until the Actor run finishes or the timeout - is reached. Use this when you need to run a specific Actor and then inspect or process - the results separately. + An Actor is a serverless cloud app on the Apify platform β€” it takes JSON input, + runs the scraping or automation job, and writes results to a dataset. This tool + executes the Actor synchronously and returns run metadata only (run_id, status, + dataset_id, timestamps). Use apify_run_actor_and_get_dataset to also fetch the + output data in one call, or apify_scrape_url for quick single-URL extraction. Common Actors: - - "apify/website-content-crawler" - scrape websites and extract content - - "apify/web-scraper" - general-purpose web scraper - - "apify/google-search-scraper" - scrape Google search results + - "apify/website-content-crawler" β€” scrape websites and extract content as markdown + - "apify/web-scraper" β€” general-purpose web scraper with JS rendering + - "apify/google-search-scraper" β€” scrape Google search results Args: - actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". - run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. + actor_id: Actor identifier in "username/actor-name" format, + e.g. "apify/website-content-crawler". Find Actors at https://apify.com/store. 
+ run_input: JSON-serializable input for the Actor. Each Actor defines its own + input schema β€” check the Actor README on Apify Store for required fields. timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. build: Actor build tag or number to run a specific version. Uses latest build if not set. @@ -419,8 +425,9 @@ def apify_get_dataset_items( ) -> Dict[str, Any]: """Fetch items from an existing Apify dataset and return them as JSON. - Use this after running an Actor to retrieve the structured results from its - default dataset, or to access any dataset by ID. + Every Actor run writes its output to a dataset β€” a structured, append-only store + for tabular data. Use the dataset_id from the run metadata returned by apify_run_actor + or apify_run_task. Use offset for pagination through large datasets. Args: dataset_id: The Apify dataset ID to fetch items from. @@ -457,15 +464,17 @@ def apify_run_actor_and_get_dataset( ) -> Dict[str, Any]: """Run an Apify Actor and fetch its dataset results in one step. - Convenience tool that combines running an Actor and fetching its default - dataset items into a single call. Use this when you want both the run metadata and the + Convenience tool that combines running an Actor and fetching its default dataset + items into a single call. Use this when you want both the run metadata and the result data without making two separate tool calls. Args: - actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". - run_input: JSON-serializable input for the Actor. + actor_id: Actor identifier in "username/actor-name" format, + e.g. "apify/website-content-crawler". Find Actors at https://apify.com/store. + run_input: JSON-serializable input for the Actor. Each Actor defines its own + input schema β€” check the Actor README on Apify Store for required fields. 
timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor run. + memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. build: Actor build tag or number to run a specific version. Uses latest build if not set. dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. @@ -509,15 +518,16 @@ def apify_run_task( timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, ) -> Dict[str, Any]: - """Run an Apify task and return the run metadata as JSON. + """Run a saved Apify task and return the run metadata as JSON. - Tasks are saved Actor configurations with preset inputs. Use this when a task - has already been configured in Apify Console, so you don't need to specify - the full Actor input every time. + Tasks are saved Actor configurations with preset inputs, managed in Apify Console. + Use this when a task has already been configured, so you don't need to specify + the full Actor input every time. Use apify_run_task_and_get_dataset to also fetch + the output data in one call. Args: - task_id: Task identifier, e.g. "user/my-task" or a task ID string. - task_input: Optional JSON-serializable input to override the task's default input. + task_id: Task identifier in "username~task-name" format or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input fields. timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. @@ -558,17 +568,17 @@ def apify_run_task_and_get_dataset( dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, dataset_items_offset: int = 0, ) -> Dict[str, Any]: - """Run an Apify task and fetch its dataset results in one step. 
+ """Run a saved Apify task and fetch its dataset results in one step. - Convenience tool that combines running a task and fetching its default - dataset items into a single call. Use this when you want both the run metadata and the + Convenience tool that combines running a task and fetching its default dataset + items into a single call. Use this when you want both the run metadata and the result data without making two separate tool calls. Args: - task_id: Task identifier, e.g. "user/my-task" or a task ID string. - task_input: Optional JSON-serializable input to override the task's default input. + task_id: Task identifier in "username~task-name" format or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input fields. timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the task run. + memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. @@ -613,14 +623,16 @@ def apify_scrape_url( Uses the Website Content Crawler Actor under the hood, pre-configured for fast single-page scraping. This is the simplest way to extract readable content - from any web page. + from any web page β€” no Actor input schema needed. For multi-page crawls, use + apify_run_actor_and_get_dataset with "apify/website-content-crawler" directly. Args: url: The URL to scrape, e.g. "https://example.com". timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. - crawler_type: Crawler engine to use. One of "cheerio" (fastest, no JS rendering, - default), "playwright:adaptive" (fast, renders JS if present), or - "playwright:firefox" (reliable, renders JS, best at avoiding blocking but slower). + crawler_type: Crawler engine to use. 
One of: + - "cheerio" (default): Fastest, no JavaScript rendering. Best for static HTML. + - "playwright:adaptive": Renders JS only when needed. Good general-purpose choice. + - "playwright:firefox": Full JS rendering, best at bypassing anti-bot protection but slowest. Returns: Dict with status and content containing the markdown content of the scraped page. From 46daa97100e2e2a67d1f371eec0a6155d4053118 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 14:57:12 +0200 Subject: [PATCH 14/24] docs: keep most important tools in readme --- README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/README.md b/README.md index 67d9833a..3838222b 100644 --- a/README.md +++ b/README.md @@ -99,11 +99,7 @@ Below is a comprehensive table of all available tools, how to use them with an a | Tool | Agent Usage | Use Case | |------|-------------|----------| | a2a_client | `provider = A2AClientToolProvider(known_agent_urls=["http://localhost:9000"]); agent = Agent(tools=provider.tools)` | Discover and communicate with A2A-compliant agents, send messages between agents | -| apify_run_actor | `agent.tool.apify_run_actor(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run any Apify Actor by ID with arbitrary input | -| apify_get_dataset_items | `agent.tool.apify_get_dataset_items(dataset_id="abc123", limit=50)` | Fetch items from an Apify dataset | -| apify_run_actor_and_get_dataset | `agent.tool.apify_run_actor_and_get_dataset(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run an Actor and fetch its dataset results in one step | -| apify_run_task | `agent.tool.apify_run_task(task_id="user/my-task")` | Run a saved Apify task by ID with optional input overrides | -| apify_run_task_and_get_dataset | `agent.tool.apify_run_task_and_get_dataset(task_id="user/my-task", dataset_items_limit=50)` | Run a task and fetch its dataset results in one 
step | +| apify_run_actor | `agent.tool.apify_run_actor(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run any Apify Actor with arbitrary input | | apify_scrape_url | `agent.tool.apify_scrape_url(url="https://example.com")` | Scrape a URL and return its content as markdown | | file_read | `agent.tool.file_read(path="path/to/file.txt")` | Reading configuration files, parsing code files, loading datasets | | file_write | `agent.tool.file_write(path="path/to/file.txt", content="file content")` | Writing results to files, creating new files, saving output data | From 19500c7dec491448eb4fcf320084a874a7738677 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 15:04:08 +0200 Subject: [PATCH 15/24] feat: update crawler type constants in Apify tool --- src/strands_tools/apify.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 5855ab83..adbe38b1 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -77,13 +77,14 @@ except ImportError: HAS_APIFY_CLIENT = False -WEBSITE_CONTENT_CRAWLER = "apify/website-content-crawler" TRACKING_HEADER = {"x-apify-integration-platform": "strands-agents"} ERROR_PANEL_TITLE = "[bold red]Apify Error[/bold red]" DEFAULT_TIMEOUT_SECS = 300 DEFAULT_SCRAPE_TIMEOUT_SECS = 120 DEFAULT_DATASET_ITEMS_LIMIT = 100 -VALID_CRAWLER_TYPES = ("playwright:adaptive", "playwright:firefox", "cheerio") + +WEBSITE_CONTENT_CRAWLER = "apify/website-content-crawler" +WEBSITE_CONTENT_CRAWLER_TYPES = ("playwright:adaptive", "playwright:firefox", "cheerio") # --- Helper functions --- @@ -326,9 +327,9 @@ def scrape_url( """Scrape a single URL using Website Content Crawler and return markdown.""" self._validate_url(url) self._validate_positive(timeout_secs, "timeout_secs") - if crawler_type not in VALID_CRAWLER_TYPES: + if crawler_type not in WEBSITE_CONTENT_CRAWLER_TYPES: raise ValueError( - 
f"Invalid crawler_type '{crawler_type}'. Must be one of: {', '.join(VALID_CRAWLER_TYPES)}." + f"Invalid crawler_type '{crawler_type}'. Must be one of: {', '.join(WEBSITE_CONTENT_CRAWLER_TYPES)}." ) run_input: Dict[str, Any] = { From 4405ebe8554eece9257ec77050c0dddb7969a994 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 15:29:52 +0200 Subject: [PATCH 16/24] feat: use Literal for crawler types in Apify tool --- src/strands_tools/apify.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index adbe38b1..9f1eb080 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -57,7 +57,7 @@ import json import logging import os -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Literal, Optional, get_args from urllib.parse import urlparse from rich.panel import Panel @@ -84,7 +84,8 @@ DEFAULT_DATASET_ITEMS_LIMIT = 100 WEBSITE_CONTENT_CRAWLER = "apify/website-content-crawler" -WEBSITE_CONTENT_CRAWLER_TYPES = ("playwright:adaptive", "playwright:firefox", "cheerio") +CrawlerType = Literal["playwright:adaptive", "playwright:firefox", "cheerio"] +WEBSITE_CONTENT_CRAWLER_TYPES = get_args(CrawlerType) # --- Helper functions --- @@ -322,7 +323,7 @@ def scrape_url( self, url: str, timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, - crawler_type: str = "cheerio", + crawler_type: CrawlerType = "cheerio", ) -> str: """Scrape a single URL using Website Content Crawler and return markdown.""" self._validate_url(url) @@ -608,7 +609,7 @@ def apify_run_task_and_get_dataset( def apify_scrape_url( url: str, timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, - crawler_type: str = "cheerio", + crawler_type: CrawlerType = "cheerio", ) -> Dict[str, Any]: """Scrape a single URL and return its content as markdown. 
From ab930ad36dcb4cbe94e5615f81d1748130862747 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 15:39:26 +0200 Subject: [PATCH 17/24] feat: add comment for tracking header --- src/strands_tools/apify.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 9f1eb080..17efac93 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -77,6 +77,7 @@ except ImportError: HAS_APIFY_CLIENT = False +# Attribution header - lets Apify track usage originating from strands-agents (analytics only) TRACKING_HEADER = {"x-apify-integration-platform": "strands-agents"} ERROR_PANEL_TITLE = "[bold red]Apify Error[/bold red]" DEFAULT_TIMEOUT_SECS = 300 From b07d7c195e997d401df58b6f2ff12425759a1931 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 15:55:09 +0200 Subject: [PATCH 18/24] feat: add error handling for missing actor run data and dataset in Apify tool --- src/strands_tools/apify.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 17efac93..06a03811 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -344,9 +344,13 @@ def scrape_url( timeout_secs=timeout_secs, logger=None, # Suppress verbose apify-client logging not useful to end users ) + if actor_run is None: + raise RuntimeError("Website Content Crawler returned no run data (possible wait timeout).") self._check_run_status(actor_run, "Website Content Crawler") dataset_id = actor_run.get("defaultDatasetId") + if not dataset_id: + raise RuntimeError("Website Content Crawler run has no default dataset.") result = self.client.dataset(dataset_id).list_items(limit=1) items = list(result.items) From 30412f7135777658d0950f7172eda6f7727758d7 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 15:57:59 +0200 Subject: [PATCH 19/24] feat: add unit tests for new tools guarding --- tests/test_apify.py | 58 
+++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/test_apify.py b/tests/test_apify.py index 038f8211..a34ae6b7 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -192,6 +192,17 @@ def test_run_actor_api_exception(mock_apify_env, mock_apify_client): assert "Connection failed" in result["content"][0]["text"] +def test_run_actor_none_response(mock_apify_env, mock_apify_client): + """Actor run returns error dict when ActorClient.call() returns None.""" + mock_apify_client.actor.return_value.call.return_value = None + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper") + + assert result["status"] == "error" + assert "no run data" in result["content"][0]["text"] + + def test_run_actor_apify_api_error_401(mock_apify_env, mock_apify_client): """Actor run returns friendly message for 401 authentication errors.""" error = _make_apify_api_error(401, "Unauthorized") @@ -275,6 +286,18 @@ def test_run_actor_and_get_dataset_success(mock_apify_env, mock_apify_client): assert data["items"][0]["title"] == "Widget A" +def test_run_actor_and_get_dataset_no_dataset_id(mock_apify_env, mock_apify_client): + """Combined tool returns error when the Actor run has no default dataset.""" + run_no_dataset = {**MOCK_ACTOR_RUN, "defaultDatasetId": None} + mock_apify_client.actor.return_value.call.return_value = run_no_dataset + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor_and_get_dataset(actor_id="actor/my-scraper") + + assert result["status"] == "error" + assert "no default dataset" in result["content"][0]["text"] + + def test_run_actor_and_get_dataset_actor_failure(mock_apify_env, mock_apify_client): """Combined tool returns error dict when the Actor fails.""" mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN @@ -374,6 +397,18 @@ def 
test_run_task_and_get_dataset_success(mock_apify_env, mock_apify_client): assert data["items"][0]["title"] == "Widget A" +def test_run_task_and_get_dataset_no_dataset_id(mock_apify_env, mock_apify_client): + """Combined task tool returns error when the task run has no default dataset.""" + run_no_dataset = {**MOCK_ACTOR_RUN, "defaultDatasetId": None} + mock_apify_client.task.return_value.call.return_value = run_no_dataset + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task_and_get_dataset(task_id="user~my-task") + + assert result["status"] == "error" + assert "no default dataset" in result["content"][0]["text"] + + def test_run_task_and_get_dataset_task_failure(mock_apify_env, mock_apify_client): """Combined task tool returns error dict when the task fails.""" mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN @@ -402,6 +437,29 @@ def test_scrape_url_success(mock_apify_env, mock_apify_client): mock_apify_client.actor.assert_called_once_with("apify/website-content-crawler") +def test_scrape_url_none_response(mock_apify_env, mock_apify_client): + """Scrape URL returns error dict when ActorClient.call() returns None.""" + mock_apify_client.actor.return_value.call.return_value = None + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "no run data" in result["content"][0]["text"] + + +def test_scrape_url_no_dataset_id(mock_apify_env, mock_apify_client): + """Scrape URL returns error when the crawler run has no default dataset.""" + run_no_dataset = {**MOCK_ACTOR_RUN, "defaultDatasetId": None} + mock_apify_client.actor.return_value.call.return_value = run_no_dataset + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "no default 
dataset" in result["content"][0]["text"] + + def test_scrape_url_no_content(mock_apify_env, mock_apify_client): """Scrape URL returns error dict when no content is returned.""" mock_list_result = MagicMock() From 4732185ec6169edca0b398f2b9d69957ef037a01 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 16:03:24 +0200 Subject: [PATCH 20/24] fix: ensure explicit empty input is correctly passed to Apify actor --- src/strands_tools/apify.py | 2 +- tests/test_apify.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 06a03811..509a2052 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -200,7 +200,7 @@ def run_actor( self._validate_positive(memory_mbytes, "memory_mbytes") call_kwargs: Dict[str, Any] = { - "run_input": run_input or {}, + "run_input": run_input if run_input is not None else {}, "timeout_secs": timeout_secs, "logger": None, # Suppress verbose apify-client logging not useful to end users } diff --git a/tests/test_apify.py b/tests/test_apify.py index a34ae6b7..78f15694 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -150,6 +150,17 @@ def test_run_actor_default_input(mock_apify_env, mock_apify_client): assert call_kwargs["run_input"] == {} +def test_run_actor_explicit_empty_input(mock_apify_env, mock_apify_client): + """Actor run passes through an explicitly empty dict instead of treating it as falsy.""" + empty_input: dict = {} + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper", run_input=empty_input) + + assert result["status"] == "success" + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs["run_input"] is empty_input + + def test_run_actor_with_memory(mock_apify_env, mock_apify_client): """Actor run passes memory_mbytes when provided.""" with patch("strands_tools.apify.ApifyClient", 
return_value=mock_apify_client): From b1a792cc556130d4053ac6f6bdfe14649bd5bc9e Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 16:10:11 +0200 Subject: [PATCH 21/24] fix: add error status message for None --- src/strands_tools/apify.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 509a2052..168613a6 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -119,6 +119,8 @@ def _format_error(e: Exception) -> str: "Rate limit exceeded. The Apify client retries automatically; " "if this persists, reduce request frequency." ) + case None: + return f"Apify API error: {msg}" case _: return f"Apify API error ({status_code}): {msg}" return str(e) From 07799a14e313eef296b3c1e610709c6c31dd4f91 Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Wed, 8 Apr 2026 08:59:43 +0200 Subject: [PATCH 22/24] fix: Improve docs using apify-writing-style --- README.md | 10 +++++----- src/strands_tools/apify.py | 28 ++++++++++++++-------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 67d9833a..60062155 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ Strands Agents Tools is a community-driven project that provides a powerful set - πŸ“ **File Operations** - Read, write, and edit files with syntax highlighting and intelligent modifications - πŸ–₯️ **Shell Integration** - Execute and interact with shell commands securely -- 🧠 **Memory** - Store user and agent memories across agent runs to provide personalized experiences with both Mem0, Amazon Bedrock Knowledge Bases, Elasticsearch, and MongoDB Atlas +- 🧠 **Memory** - Store user and agent memories across agent runs to provide personalized experiences with Mem0, Amazon Bedrock Knowledge Bases, Elasticsearch, and MongoDB Atlas - πŸ•ΈοΈ **Web Infrastructure** - Perform web searches, extract page content, and crawl websites with Tavily and Exa-powered tools - 🌐 **HTTP Client** - Make API 
requests with comprehensive authentication support - πŸ’¬ **Slack Client** - Real-time Slack events, message processing, and Slack API access @@ -104,7 +104,7 @@ Below is a comprehensive table of all available tools, how to use them with an a | apify_run_actor_and_get_dataset | `agent.tool.apify_run_actor_and_get_dataset(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run an Actor and fetch its dataset results in one step | | apify_run_task | `agent.tool.apify_run_task(task_id="user/my-task")` | Run a saved Apify task by ID with optional input overrides | | apify_run_task_and_get_dataset | `agent.tool.apify_run_task_and_get_dataset(task_id="user/my-task", dataset_items_limit=50)` | Run a task and fetch its dataset results in one step | -| apify_scrape_url | `agent.tool.apify_scrape_url(url="https://example.com")` | Scrape a URL and return its content as markdown | +| apify_scrape_url | `agent.tool.apify_scrape_url(url="https://example.com")` | Scrape a URL and return its content as Markdown | | file_read | `agent.tool.file_read(path="path/to/file.txt")` | Reading configuration files, parsing code files, loading datasets | | file_write | `agent.tool.file_write(path="path/to/file.txt", content="file content")` | Writing results to files, creating new files, saving output data | | editor | `agent.tool.editor(command="view", path="path/to/file.py")` | Advanced file operations like syntax highlighting, pattern replacement, and multi-file edits | @@ -206,7 +206,7 @@ result = agent.tool.mcp_client( tool_args={"x": 10, "y": 20} ) -# Connect to a SSE-based server +# Connect to an SSE-based server agent.tool.mcp_client( action="connect", connection_id="web_server", @@ -277,7 +277,7 @@ response = agent.tool.http_request( auth_token="your_token_here" ) -# Convert HTML webpages to markdown for better readability +# Convert HTML webpages to Markdown for better readability response = agent.tool.http_request( method="GET", 
url="https://example.com/article", @@ -974,7 +974,7 @@ from strands_tools.apify import APIFY_CORE_TOOLS agent = Agent(tools=APIFY_CORE_TOOLS) -# Scrape a single URL and get markdown content +# Scrape a single URL and get Markdown content content = agent.tool.apify_scrape_url(url="https://example.com") # Run an Actor and get results in one step diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 1505cb60..ae4f6dee 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -1,10 +1,10 @@ """Apify platform tools for Strands Agents. -Apify is a large marketplace of tools for web scraping, data extraction, -and web automation. These tools are called Actors β€” serverless cloud applications that -take JSON input and store results in a dataset (structured, tabular output) or key-value -store (files and unstructured data). Actors exist for social media, e-commerce, search -engines, maps, travel sites, and many other sources. + +Apify is the world's largest marketplace of tools for web scraping, crawling, data extraction, and web automation. +These tools are called Actors, serverless cloud programs that take JSON input and store results +in a dataset (structured, tabular output) or key-value store (files and unstructured data). +Get structured data from social media, e-commerce, search engines, maps, travel sites, or any other website. Available Tools: --------------- @@ -325,7 +325,7 @@ def scrape_url( timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, crawler_type: str = "cheerio", ) -> str: - """Scrape a single URL using Website Content Crawler and return markdown.""" + """Scrape a single URL using Website Content Crawler and return Markdown.""" self._validate_url(url) self._validate_positive(timeout_secs, "timeout_secs") if crawler_type not in VALID_CRAWLER_TYPES: @@ -375,15 +375,15 @@ def apify_run_actor( output data in one call, or apify_scrape_url for quick single-URL extraction. 
Common Actors: - - "apify/website-content-crawler" β€” scrape websites and extract content as markdown - - "apify/web-scraper" β€” general-purpose web scraper with JS rendering + - "apify/website-content-crawler" - scrape websites and extract content as Markdown + - "apify/web-scraper" - general-purpose web scraper with JS rendering - "apify/google-search-scraper" β€” scrape Google search results Args: actor_id: Actor identifier in "username/actor-name" format, e.g. "apify/website-content-crawler". Find Actors at https://apify.com/store. run_input: JSON-serializable input for the Actor. Each Actor defines its own - input schema β€” check the Actor README on Apify Store for required fields. + input schema - check the Actor README on Apify Store for required fields. timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. build: Actor build tag or number to run a specific version. Uses latest build if not set. @@ -472,7 +472,7 @@ def apify_run_actor_and_get_dataset( actor_id: Actor identifier in "username/actor-name" format, e.g. "apify/website-content-crawler". Find Actors at https://apify.com/store. run_input: JSON-serializable input for the Actor. Each Actor defines its own - input schema β€” check the Actor README on Apify Store for required fields. + input schema - check the Actor README on Apify Store for required fields. timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. build: Actor build tag or number to run a specific version. Uses latest build if not set. @@ -526,7 +526,7 @@ def apify_run_task( the output data in one call. Args: - task_id: Task identifier in "username~task-name" format or a task ID string. + task_id: Task identifier in "username/task-name" format or a task ID string. 
task_input: Optional JSON-serializable input to override the task's default input fields. timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. @@ -575,7 +575,7 @@ def apify_run_task_and_get_dataset( result data without making two separate tool calls. Args: - task_id: Task identifier in "username~task-name" format or a task ID string. + task_id: Task identifier in "username/task-name" format or a task ID string. task_input: Optional JSON-serializable input to override the task's default input fields. timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. @@ -619,7 +619,7 @@ def apify_scrape_url( timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, crawler_type: str = "cheerio", ) -> Dict[str, Any]: - """Scrape a single URL and return its content as markdown. + """Scrape a single URL and return its content as Markdown. Uses the Website Content Crawler Actor under the hood, pre-configured for fast single-page scraping. This is the simplest way to extract readable content @@ -635,7 +635,7 @@ def apify_scrape_url( - "playwright:firefox": Full JS rendering, best at bypassing anti-bot protection but slowest. Returns: - Dict with status and content containing the markdown content of the scraped page. + Dict with status and content containing the Markdown content of the scraped page. 
""" try: _check_dependency() From 81810c4af18f3020ceb5c912438aa45e068babc6 Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Wed, 8 Apr 2026 09:04:09 +0200 Subject: [PATCH 23/24] fix: Improve docs using apify-writing-style --- docs/apify_tool.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/apify_tool.md b/docs/apify_tool.md index 46e9e800..ec0a9238 100644 --- a/docs/apify_tool.md +++ b/docs/apify_tool.md @@ -191,9 +191,9 @@ items = agent.tool.apify_get_dataset_items( | `APIFY_API_TOKEN environment variable is not set` | Token not configured | Set the `APIFY_API_TOKEN` environment variable | | `apify-client package is required` | Optional dependency not installed | Run `pip install strands-agents-tools[apify]` | | `Actor ... finished with status FAILED` | Actor execution error | Check Actor input parameters and run logs in [Apify Console](https://console.apify.com) | -| `Task ... finished with status FAILED` | task execution error | Check task configuration and run logs in [Apify Console](https://console.apify.com) | +| `Task ... finished with status FAILED` | Task execution error | Check task configuration and run logs in [Apify Console](https://console.apify.com) | | `Actor/task ... finished with status TIMED-OUT` | Timeout too short for the workload | Increase the `timeout_secs` parameter | -| `Task ... returned no run data` | task `call()` returned `None` (wait timeout) | Increase the `timeout_secs` parameter | +| `Task ... 
returned no run data` | Task `call()` returned `None` (wait timeout) | Increase the `timeout_secs` parameter | | `No content returned for URL` | Website Content Crawler returned empty results | Verify the URL is accessible and returns content | ## References From 2eab80c4ed6949e782489b216c092bd4a4d2b2d4 Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Tue, 14 Apr 2026 14:08:06 +0200 Subject: [PATCH 24/24] fix: Improve docs using apify-writing-style --- README.md | 6 +++--- src/strands_tools/apify.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 9e75e82a..81af8ed6 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ Strands Agents Tools is a community-driven project that provides a powerful set - πŸ“ **File Operations** - Read, write, and edit files with syntax highlighting and intelligent modifications - πŸ–₯️ **Shell Integration** - Execute and interact with shell commands securely -- 🧠 **Memory** - Store user and agent memories across agent runs to provide personalized experiences with Mem0, Amazon Bedrock Knowledge Bases, Elasticsearch, and MongoDB Atlas +- 🧠 **Memory** - Store user and agent memories across agent runs to provide personalized experiences with Mem0, Amazon Bedrock Knowledge Bases, Elasticsearch, and MongoDB Atlas - πŸ•ΈοΈ **Web Infrastructure** - Perform web searches, extract page content, and crawl websites with Tavily and Exa-powered tools - 🌐 **HTTP Client** - Make API requests with comprehensive authentication support - πŸ’¬ **Slack Client** - Real-time Slack events, message processing, and Slack API access @@ -202,7 +202,7 @@ result = agent.tool.mcp_client( tool_args={"x": 10, "y": 20} ) -# Connect to an SSE-based server +# Connect to an SSE-based server agent.tool.mcp_client( action="connect", connection_id="web_server", @@ -273,7 +273,7 @@ response = agent.tool.http_request( auth_token="your_token_here" ) -# Convert HTML webpages to Markdown for better readability +# 
Convert HTML webpages to markdown for better readability response = agent.tool.http_request( method="GET", url="https://example.com/article", diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 3f054603..19f8696a 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -394,7 +394,7 @@ def apify_run_actor( run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema - check the Actor README on Apify Store for required fields. timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. + memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default `memory` value if not set. build: Actor build tag or number to run a specific version. Uses latest build if not set. Returns: @@ -483,7 +483,7 @@ def apify_run_actor_and_get_dataset( run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema - check the Actor README on Apify Store for required fields. timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. + memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default `memory` value if not set. build: Actor build tag or number to run a specific version. Uses latest build if not set. dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. @@ -538,7 +538,7 @@ def apify_run_task( task_id: Task identifier in "username/task-name" format or a task ID string. task_input: Optional JSON-serializable input to override the task's default input fields. timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the task run. 
Uses task default if not set. + memory_mbytes: Memory allocation in MB for the task run. Uses task default `memory` value if not set. Returns: Dict with status and content containing run metadata: run_id, status, dataset_id, @@ -587,7 +587,7 @@ def apify_run_task_and_get_dataset( task_id: Task identifier in "username/task-name" format or a task ID string. task_input: Optional JSON-serializable input to override the task's default input fields. timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. + memory_mbytes: Memory allocation in MB for the task run. Uses task default `memory` value if not set. dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0.