diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index e64941d2..677332de 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -9,57 +9,9 @@ on: jobs: unit-tests: - name: Unit Tests (Python ${{ matrix.python-version }}) - runs-on: ${{ matrix.os }} - - strategy: - fail-fast: false - matrix: - os: ${{ github.event_name == 'pull_request' && fromJSON('["ubuntu-latest"]') || fromJSON('["ubuntu-latest", "macos-latest", "windows-latest"]') }} - python-version: ['3.10', '3.12'] - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Install uv - uses: astral-sh/setup-uv@v4 - - - name: Install dependencies - run: | - uv sync - - - name: Install Playwright browsers - run: | - uv run playwright install chromium - - - name: Run unit tests - run: | - uv run pytest tests/ -m "unit or not integration" --cov --cov-report=xml --cov-report=term - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v4 - with: - file: ./coverage.xml - flags: unittests - name: codecov-${{ matrix.os }}-py${{ matrix.python-version }} - token: ${{ secrets.CODECOV_TOKEN }} - if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12' - - integration-tests: - name: Integration Tests + name: Unit Tests runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - test-group: [smart-scraper, multi-graph, file-formats] - steps: - name: Checkout code uses: actions/checkout@v4 @@ -67,147 +19,16 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.11' + python-version: '3.12' - name: Install uv uses: astral-sh/setup-uv@v4 - name: Install dependencies - run: | - uv sync + run: uv sync - name: Install Playwright browsers - run: | - uv run playwright install chromium - - - name: Run integration tests - env: - OPENAI_APIKEY: ${{ 
secrets.OPENAI_APIKEY }} - ANTHROPIC_APIKEY: ${{ secrets.ANTHROPIC_APIKEY }} - GROQ_APIKEY: ${{ secrets.GROQ_APIKEY }} - run: | - uv run pytest tests/integration/ -m integration --integration -v - - - name: Upload test results - uses: actions/upload-artifact@v4 - if: always() - with: - name: integration-test-results-${{ matrix.test-group }} - path: | - htmlcov/ - benchmark_results/ - - benchmark-tests: - name: Performance Benchmarks - runs-on: ubuntu-latest - if: github.event_name == 'push' && github.ref == 'refs/heads/main' - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Install uv - uses: astral-sh/setup-uv@v4 - - - name: Install dependencies - run: | - uv sync - - - name: Run performance benchmarks - env: - OPENAI_APIKEY: ${{ secrets.OPENAI_APIKEY }} - run: | - uv run pytest tests/ -m benchmark --benchmark -v - - - name: Upload benchmark results - uses: actions/upload-artifact@v4 - with: - name: benchmark-results - path: benchmark_results/ - - - name: Compare with baseline - if: github.event_name == 'pull_request' - run: | - # Download baseline from main branch - # Compare and comment on PR if regression detected - echo "Benchmark comparison would run here" - - code-quality: - name: Code Quality Checks - runs-on: ubuntu-latest - if: github.event_name == 'push' - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Install uv - uses: astral-sh/setup-uv@v4 - - - name: Install dependencies - run: | - uv sync - - - name: Run Ruff linting - run: | - uv run ruff check scrapegraphai/ tests/ - - - name: Run Black formatting check - run: | - uv run black --check scrapegraphai/ tests/ - - - name: Run isort check - run: | - uv run isort --check-only scrapegraphai/ tests/ - - - name: Run type checking with mypy - run: | - uv run mypy scrapegraphai/ 
- continue-on-error: true + run: uv run playwright install chromium - test-coverage-report: - name: Test Coverage Report - needs: [unit-tests, integration-tests] - runs-on: ubuntu-latest - if: always() - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Download coverage artifacts - uses: actions/download-artifact@v4 - - - name: Generate coverage report - run: | - echo "Coverage report generation would run here" - - - name: Comment coverage on PR - if: github.event_name == 'pull_request' - uses: py-cov-action/python-coverage-comment-action@v3 - with: - GITHUB_TOKEN: ${{ github.token }} - - test-summary: - name: Test Summary - needs: [unit-tests, integration-tests, code-quality] - runs-on: ubuntu-latest - if: always() - - steps: - - name: Check test results - run: | - echo "All test jobs completed" - echo "Unit tests: ${{ needs.unit-tests.result }}" - echo "Integration tests: ${{ needs.integration-tests.result }}" - echo "Code quality: ${{ needs.code-quality.result }}" + - name: Run unit tests + run: uv run pytest tests/ -m "unit or not integration" diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f85e869..59c99f64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,35 @@ +## [2.0.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.76.0...v2.0.0) (2026-04-19) + + +### ⚠ BREAKING CHANGES + +* requires scrapegraph-py v2.0.0+ + +Co-Authored-By: Claude Opus 4.6 (1M context) + +### Features + +* add scrapegraph-py PR [#84](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/84) SDK compatibility ([e8b2a28](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e8b2a28f4708882ca6cedfdd979b37dda26ef6c2)), closes [#82](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/82) +* align with scrapegraph-py v2 API surface from PR [#82](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/82) ([c0f5fd5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c0f5fd592395f7f0e54bdf7367f3aff46bb6e420)) +* migrate to scrapegraph-py v2 API 
surface ([fd23bb0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fd23bb0544b3bb867c34b009ed7bcecda86e3ac5)), closes [ScrapeGraphAI/scrapegraph-py#82](https://github.com/ScrapeGraphAI/scrapegraph-py/issues/82) + + +### CI + +* bump min Python to 3.12 and trim test suite ([5fda03f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5fda03fb6bd774dee4e89661dd1eaa0ef2d2c4ed)) + +## [1.76.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.75.1...v1.76.0) (2026-04-09) + + +### Features + +* add PlasmateLoader as lightweight scraping backend (no Chrome needed) ([9dd1fb5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9dd1fb54ed2ad08e968444cde0cd052f0c32f60a)), closes [#1055](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/1055) + + +### CI + +* reduce GitHub Actions costs by ~85% on PRs ([403080a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/403080ad82c2097b111d3472cc0c6d4ee709c6fe)) + ## [1.75.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.75.0...v1.75.1) (2026-03-24) diff --git a/README.md b/README.md index 89e01f1f..d16bdf3e 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,12 @@ # 🕷️ ScrapeGraphAI: You Only Scrape Once +

+ + ScrapeGraphAI + +

+ [English](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/README.md) | [中文](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/chinese.md) | [日本語](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/japanese.md) | [한국어](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/korean.md) | [Русский](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/russian.md) | [Türkçe](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/turkish.md) @@ -13,14 +19,10 @@ | [Português](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/portuguese.md) [![PyPI Downloads](https://static.pepy.tech/personalized-badge/scrapegraphai?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/scrapegraphai) -[![linting: pylint](https://img.shields.io/badge/linting-pylint-yellowgreen?style=for-the-badge)](https://github.com/pylint-dev/pylint) -[![Pylint](https://img.shields.io/github/actions/workflow/status/VinciGit00/Scrapegraph-ai/code-quality.yml?label=Pylint&logo=github&style=for-the-badge)](https://github.com/VinciGit00/Scrapegraph-ai/actions/workflows/code-quality.yml) -[![CodeQL](https://img.shields.io/github/actions/workflow/status/VinciGit00/Scrapegraph-ai/codeql.yml?label=CodeQL&logo=github&style=for-the-badge)](https://github.com/VinciGit00/Scrapegraph-ai/actions/workflows/codeql.yml) + [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=for-the-badge)](https://opensource.org/licenses/MIT) [![](https://dcbadge.vercel.app/api/server/gkxQDAjfeX)](https://discord.gg/gkxQDAjfeX) -[![API Banner](https://raw.githubusercontent.com/ScrapeGraphAI/Scrapegraph-ai/main/docs/assets/api_banner.png)](https://scrapegraphai.com/?utm_source=github&utm_medium=readme&utm_campaign=api_banner&utm_content=api_banner_image) -

VinciGit00%2FScrapegraph-ai | Trendshift

@@ -29,11 +31,6 @@ Just say which information you want to extract and the library will do it for you! -

- ScrapeGraphAI Hero -

- - ## 🚀 Integrations ScrapeGraphAI offers seamless integration with popular frameworks and tools to enhance your scraping capabilities. Whether you're building with Python or Node.js, using LLM frameworks, or working with no-code platforms, we've got you covered with our comprehensive integration options.. diff --git a/docs/assets/scrapegraphai_logo.png b/docs/assets/scrapegraphai_logo.png index ca24928b..020c92dc 100644 Binary files a/docs/assets/scrapegraphai_logo.png and b/docs/assets/scrapegraphai_logo.png differ diff --git a/docs/assets/scrapegraphai_logo.svg b/docs/assets/scrapegraphai_logo.svg index 33285d62..8545571a 100644 --- a/docs/assets/scrapegraphai_logo.svg +++ b/docs/assets/scrapegraphai_logo.svg @@ -1,145 +1,15 @@ - - - - + + + + + + + + + + + + + + + diff --git a/examples/markdownify/markdownify_scrapegraphai.py b/examples/markdownify/markdownify_scrapegraphai.py index de36607d..30d9713f 100644 --- a/examples/markdownify/markdownify_scrapegraphai.py +++ b/examples/markdownify/markdownify_scrapegraphai.py @@ -1,35 +1,20 @@ """ -Example script demonstrating the markdownify functionality +Scrape a webpage as clean markdown using scrapegraph-py v2 API. +Replaces the old markdownify() call with scrape(). 
""" +import json import os + from dotenv import load_dotenv from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -def main(): - # Load environment variables - load_dotenv() - # Set up logging - sgai_logger.set_logging(level="INFO") +load_dotenv() - # Initialize the client - api_key = os.getenv("SCRAPEGRAPH_API_KEY") - if not api_key: - raise ValueError("SCRAPEGRAPH_API_KEY environment variable not found") - sgai_client = Client(api_key=api_key) +api_key = os.getenv("SCRAPEGRAPH_API_KEY") +if not api_key: + raise ValueError("SCRAPEGRAPH_API_KEY environment variable not found") - # Example 1: Convert a website to Markdown - print("Example 1: Converting website to Markdown") - print("-" * 50) - response = sgai_client.markdownify( - website_url="https://example.com" - ) - print("Markdown output:") - print(response["result"]) # Access the result key from the dictionary - print("\nMetadata:") - print(response.get("metadata", {})) # Use get() with default value - print("\n" + "=" * 50 + "\n") -if __name__ == "__main__": - main() +with Client(api_key=api_key) as client: + response = client.scrape(url="https://example.com") + print(json.dumps(response, indent=2)) diff --git a/examples/markdownify/markdownify_scrapegraphai_v3.py b/examples/markdownify/markdownify_scrapegraphai_v3.py new file mode 100644 index 00000000..8e670f3d --- /dev/null +++ b/examples/markdownify/markdownify_scrapegraphai_v3.py @@ -0,0 +1,24 @@ +""" +Scrape a webpage as markdown using the scrapegraph-py v3 API (PR #84). +Uses ScrapeGraphAI client + ScrapeRequest model + ApiResult wrapper. 
+""" + +import json +import os + +from dotenv import load_dotenv +from scrapegraph_py import ScrapeGraphAI, ScrapeRequest + +load_dotenv() + +api_key = os.getenv("SGAI_API_KEY") or os.getenv("SCRAPEGRAPH_API_KEY") +if not api_key: + raise ValueError("SGAI_API_KEY not found in environment variables") + +with ScrapeGraphAI(api_key=api_key) as sgai: + result = sgai.scrape(ScrapeRequest(url="https://example.com")) + + if result.status == "success": + print(json.dumps(result.data.model_dump(by_alias=True), indent=2, default=str)) + else: + raise RuntimeError(result.error) diff --git a/examples/search_graph/scrapegraphai/searchscraper_scrapegraphai.py b/examples/search_graph/scrapegraphai/searchscraper_scrapegraphai.py index e88a92ce..2f8bce3f 100644 --- a/examples/search_graph/scrapegraphai/searchscraper_scrapegraphai.py +++ b/examples/search_graph/scrapegraphai/searchscraper_scrapegraphai.py @@ -1,83 +1,20 @@ """ -Example implementation of search-based scraping using Scrapegraph AI. -This example demonstrates how to use the searchscraper to extract information from the web. +Search the web and extract AI-structured results using scrapegraph-py v2 API. +Replaces the old searchscraper() call with search(). """ +import json import os -from typing import Dict, Any + from dotenv import load_dotenv from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -def format_response(response: Dict[str, Any]) -> None: - """ - Format and print the search response in a readable way. 
- - Args: - response (Dict[str, Any]): The response from the search API - """ - print("\n" + "="*50) - print("SEARCH RESULTS") - print("="*50) - - # Print request ID - print(f"\nRequest ID: {response['request_id']}") - - # Print number of sources - urls = response.get('reference_urls', []) - print(f"\nSources Processed: {len(urls)}") - - # Print the extracted information - print("\nExtracted Information:") - print("-"*30) - if isinstance(response['result'], dict): - for key, value in response['result'].items(): - print(f"\n{key.upper()}:") - if isinstance(value, list): - for item in value: - print(f" • {item}") - else: - print(f" {value}") - else: - print(response['result']) - - # Print source URLs - if urls: - print("\nSources:") - print("-"*30) - for i, url in enumerate(urls, 1): - print(f"{i}. {url}") - print("\n" + "="*50) - -def main(): - # Load environment variables - load_dotenv() - - # Get API key - api_key = os.getenv("SCRAPEGRAPH_API_KEY") - if not api_key: - raise ValueError("SCRAPEGRAPH_API_KEY not found in environment variables") - - # Configure logging - sgai_logger.set_logging(level="INFO") - - # Initialize client - sgai_client = Client(api_key=api_key) - - try: - # Basic search scraper example - print("\nSearching for information...") - search_response = sgai_client.searchscraper( - user_prompt="Extract webpage information" - ) - format_response(search_response) +load_dotenv() - except Exception as e: - print(f"\nError occurred: {str(e)}") - finally: - # Always close the client - sgai_client.close() +api_key = os.getenv("SCRAPEGRAPH_API_KEY") +if not api_key: + raise ValueError("SCRAPEGRAPH_API_KEY not found in environment variables") -if __name__ == "__main__": - main() +with Client(api_key=api_key) as client: + response = client.search(query="Extract webpage information") + print(json.dumps(response, indent=2)) diff --git a/examples/search_graph/scrapegraphai/searchscraper_scrapegraphai_v3.py 
b/examples/search_graph/scrapegraphai/searchscraper_scrapegraphai_v3.py new file mode 100644 index 00000000..5e7fce49 --- /dev/null +++ b/examples/search_graph/scrapegraphai/searchscraper_scrapegraphai_v3.py @@ -0,0 +1,24 @@ +""" +Search the web using the scrapegraph-py v3 API (PR #84). +Uses ScrapeGraphAI client + SearchRequest model + ApiResult wrapper. +""" + +import json +import os + +from dotenv import load_dotenv +from scrapegraph_py import ScrapeGraphAI, SearchRequest + +load_dotenv() + +api_key = os.getenv("SGAI_API_KEY") or os.getenv("SCRAPEGRAPH_API_KEY") +if not api_key: + raise ValueError("SGAI_API_KEY not found in environment variables") + +with ScrapeGraphAI(api_key=api_key) as sgai: + result = sgai.search(SearchRequest(query="Extract webpage information")) + + if result.status == "success": + print(json.dumps(result.data.model_dump(by_alias=True), indent=2, default=str)) + else: + raise RuntimeError(result.error) diff --git a/examples/smart_scraper_graph/scrapegraphai/smartscraper_scrapegraphai.py b/examples/smart_scraper_graph/scrapegraphai/smartscraper_scrapegraphai.py index 47181cbb..944a6a6e 100644 --- a/examples/smart_scraper_graph/scrapegraphai/smartscraper_scrapegraphai.py +++ b/examples/smart_scraper_graph/scrapegraphai/smartscraper_scrapegraphai.py @@ -1,45 +1,23 @@ """ -Example implementation using scrapegraph-py client directly. +Extract structured data from a webpage using scrapegraph-py v2 API. +Replaces the old smartscraper() call with extract(). 
""" +import json import os + from dotenv import load_dotenv from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -def main(): - # Load environment variables from .env file - load_dotenv() - - # Get API key from environment variables - api_key = os.getenv("SCRAPEGRAPH_API_KEY") - if not api_key: - raise ValueError("SCRAPEGRAPH_API_KEY non trovato nelle variabili d'ambiente") - - # Set up logging - sgai_logger.set_logging(level="INFO") - - # Initialize the client with API key from environment - sgai_client = Client(api_key=api_key) - - try: - # SmartScraper request - response = sgai_client.smartscraper( - website_url="https://scrapegraphai.com", - user_prompt="Extract the founders' informations" - ) - # Print the response - print(f"Request ID: {response['request_id']}") - print(f"Result: {response['result']}") - if response.get('reference_urls'): - print(f"Reference URLs: {response['reference_urls']}") +load_dotenv() - except Exception as e: - print(f"Error occurred: {str(e)}") - finally: - # Always close the client - sgai_client.close() +api_key = os.getenv("SCRAPEGRAPH_API_KEY") +if not api_key: + raise ValueError("SCRAPEGRAPH_API_KEY not found in environment variables") -if __name__ == "__main__": - main() +with Client(api_key=api_key) as client: + response = client.extract( + url="https://scrapegraphai.com", + prompt="Extract the founders' informations", + ) + print(json.dumps(response, indent=2)) diff --git a/examples/smart_scraper_graph/scrapegraphai/smartscraper_scrapegraphai_v3.py b/examples/smart_scraper_graph/scrapegraphai/smartscraper_scrapegraphai_v3.py new file mode 100644 index 00000000..184c1c5f --- /dev/null +++ b/examples/smart_scraper_graph/scrapegraphai/smartscraper_scrapegraphai_v3.py @@ -0,0 +1,29 @@ +""" +Extract structured data using the scrapegraph-py v3 API (PR #84). +Uses ScrapeGraphAI client + ExtractRequest model + ApiResult wrapper. 
+""" + +import json +import os + +from dotenv import load_dotenv +from scrapegraph_py import ExtractRequest, ScrapeGraphAI + +load_dotenv() + +api_key = os.getenv("SGAI_API_KEY") or os.getenv("SCRAPEGRAPH_API_KEY") +if not api_key: + raise ValueError("SGAI_API_KEY not found in environment variables") + +with ScrapeGraphAI(api_key=api_key) as sgai: + result = sgai.extract( + ExtractRequest( + url="https://scrapegraphai.com", + prompt="Extract the founders' informations", + ) + ) + + if result.status == "success": + print(json.dumps(result.data.model_dump(by_alias=True), indent=2, default=str)) + else: + raise RuntimeError(result.error) diff --git a/media/banner.png b/media/banner.png new file mode 100644 index 00000000..8b06be50 Binary files /dev/null and b/media/banner.png differ diff --git a/pyproject.toml b/pyproject.toml index 6537bbcf..a929d3bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.75.1" +version = "2.0.0" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
authors = [ @@ -32,7 +32,7 @@ dependencies = [ "jsonschema>=4.25.1", "duckduckgo-search>=8.1.1", "pydantic>=2.12.5", - "scrapegraph-py>=1.44.0", + "scrapegraph-py>=2.0.0", ] readme = "README.md" @@ -65,7 +65,7 @@ classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", ] -requires-python = ">=3.10,<4.0" +requires-python = ">=3.12,<4.0" [project.optional-dependencies] burr = ["burr[start]==0.22.1"] diff --git a/scrapegraphai/docloaders/__init__.py b/scrapegraphai/docloaders/__init__.py index f4310c99..99b99c64 100644 --- a/scrapegraphai/docloaders/__init__.py +++ b/scrapegraphai/docloaders/__init__.py @@ -4,10 +4,12 @@ from .browser_base import browser_base_fetch from .chromium import ChromiumLoader +from .plasmate import PlasmateLoader from .scrape_do import scrape_do_fetch __all__ = [ "browser_base_fetch", "ChromiumLoader", + "PlasmateLoader", "scrape_do_fetch", ] diff --git a/scrapegraphai/docloaders/plasmate.py b/scrapegraphai/docloaders/plasmate.py new file mode 100644 index 00000000..46dcab62 --- /dev/null +++ b/scrapegraphai/docloaders/plasmate.py @@ -0,0 +1,203 @@ +""" +PlasmateLoader — lightweight page fetcher using Plasmate (https://github.com/plasmate-labs/plasmate). + +Plasmate is an open-source Rust browser engine that outputs a Structured Object Model (SOM) +instead of raw HTML. It requires no Chrome process, uses ~64MB RAM per session vs ~300MB, +and delivers 10-100x fewer tokens per page — lowering LLM costs for AI-powered scraping. + +Install: pip install plasmate +Docs: https://plasmate.app +""" + +import asyncio +import subprocess +import shutil +from typing import AsyncIterator, Iterator, List, Optional + +from langchain_community.document_loaders.base import BaseLoader +from langchain_core.documents import Document + +from ..utils import get_logger + +logger = get_logger("plasmate-loader") + +_INSTALL_MSG = ( + "plasmate is required for PlasmateLoader. 
" + "Install it with: pip install plasmate\n" + "Docs: https://plasmate.app" +) + + +def _check_plasmate() -> str: + """Return the path to the plasmate binary, or raise ImportError.""" + path = shutil.which("plasmate") + if path is None: + # Also check the Python-installed entry point location + try: + import plasmate as _p # noqa: F401 + path = shutil.which("plasmate") + except ImportError: + pass + if path is None: + raise ImportError(_INSTALL_MSG) + return path + + +class PlasmateLoader(BaseLoader): + """Fetches pages using Plasmate — a lightweight Rust browser engine that outputs + Structured Object Model (SOM) instead of raw HTML. + + Advantages over ChromiumLoader for static / server-rendered pages: + - No Chrome/Playwright required — single binary, installs via pip + - ~64MB RAM per session vs ~300MB for Chromium + - 10-100x fewer tokens per page (SOM strips nav, ads, boilerplate) + - Drops into existing ScrapeGraphAI workflows with minimal config changes + + For SPAs or pages that require JavaScript rendering, set ``fallback_to_chrome=True`` + to automatically retry with ChromiumLoader on empty or error responses. + + Attributes: + urls: List of URLs to fetch. + output_format: Plasmate output format — ``"text"`` (default, most compatible), + ``"som"`` (full JSON), or ``"markdown"``. + timeout: Per-request timeout in seconds. Defaults to 30. + selector: Optional ARIA role or CSS id selector to scope extraction + (e.g. ``"main"`` or ``"#content"``). + extra_headers: Optional dict of HTTP headers to pass to each request. + fallback_to_chrome: If True, retry with ChromiumLoader when Plasmate + returns empty content (useful for JS-heavy SPAs). Defaults to False. + chrome_kwargs: Extra kwargs forwarded to ChromiumLoader when fallback is used. 
+ + Example:: + + from scrapegraphai.docloaders import PlasmateLoader + + loader = PlasmateLoader( + urls=["https://docs.python.org/3/library/json.html"], + output_format="text", + timeout=30, + ) + docs = loader.load() + print(docs[0].page_content[:500]) + """ + + def __init__( + self, + urls: List[str], + *, + output_format: str = "text", + timeout: int = 30, + selector: Optional[str] = None, + extra_headers: Optional[dict] = None, + fallback_to_chrome: bool = False, + **chrome_kwargs, + ): + if output_format not in ("som", "text", "markdown", "links"): + raise ValueError( + f"output_format must be one of 'som', 'text', 'markdown', 'links'; got {output_format!r}" + ) + self.urls = urls + self.output_format = output_format + self.timeout = timeout + self.selector = selector + self.extra_headers = extra_headers or {} + self.fallback_to_chrome = fallback_to_chrome + self.chrome_kwargs = chrome_kwargs + + def _build_cmd(self, url: str) -> List[str]: + """Build the plasmate CLI command for a given URL.""" + cmd = [ + "plasmate", "fetch", url, + "--format", self.output_format, + "--timeout", str(self.timeout * 1000), # plasmate uses milliseconds + ] + if self.selector: + cmd += ["--selector", self.selector] + for key, value in self.extra_headers.items(): + cmd += ["--header", f"{key}: {value}"] + return cmd + + def _fetch_url(self, url: str) -> str: + """Synchronously fetch a URL via the plasmate binary.""" + binary = _check_plasmate() + cmd = self._build_cmd(url) + cmd[0] = binary # use resolved path + + logger.info(f"[PlasmateLoader] Fetching: {url}") + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=self.timeout + 5, # outer kill timeout slightly above plasmate's + ) + if result.returncode != 0: + logger.warning( + f"[PlasmateLoader] plasmate exited {result.returncode} for {url}: {result.stderr[:200]}" + ) + return "" + content = result.stdout.strip() + logger.info(f"[PlasmateLoader] Got {len(content)} chars from {url}") + return 
content + except subprocess.TimeoutExpired: + logger.warning(f"[PlasmateLoader] Timeout fetching {url}") + return "" + except FileNotFoundError: + raise ImportError(_INSTALL_MSG) + + def _fallback_fetch(self, url: str) -> str: + """Fall back to ChromiumLoader when Plasmate returns empty content.""" + from .chromium import ChromiumLoader + + logger.info(f"[PlasmateLoader] Falling back to ChromiumLoader for: {url}") + loader = ChromiumLoader([url], **self.chrome_kwargs) + docs = loader.load() + return docs[0].page_content if docs else "" + + def lazy_load(self) -> Iterator[Document]: + """Yield Documents one at a time, fetching each URL synchronously.""" + for url in self.urls: + content = self._fetch_url(url) + + if not content.strip() and self.fallback_to_chrome: + content = self._fallback_fetch(url) + + if not content.strip(): + logger.warning(f"[PlasmateLoader] Empty content for {url} — skipping") + continue + + yield Document( + page_content=content, + metadata={ + "source": url, + "loader": "plasmate", + "format": self.output_format, + }, + ) + + async def _async_fetch_url(self, url: str) -> str: + """Asynchronously fetch a URL by running the plasmate binary in a thread pool.""" + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, self._fetch_url, url) + + async def alazy_load(self) -> AsyncIterator[Document]: + """Asynchronously yield Documents, fetching all URLs concurrently.""" + tasks = [self._async_fetch_url(url) for url in self.urls] + results = await asyncio.gather(*tasks) + + for url, content in zip(self.urls, results): + if not content.strip() and self.fallback_to_chrome: + content = self._fallback_fetch(url) + + if not content.strip(): + logger.warning(f"[PlasmateLoader] Empty content for {url} — skipping") + continue + + yield Document( + page_content=content, + metadata={ + "source": url, + "loader": "plasmate", + "format": self.output_format, + }, + ) diff --git a/scrapegraphai/graphs/smart_scraper_graph.py 
b/scrapegraphai/graphs/smart_scraper_graph.py index ffcd3dbe..27c6cb17 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -77,33 +77,20 @@ def _create_graph(self) -> BaseGraph: BaseGraph: A graph instance representing the web scraping workflow. """ if self.llm_model == "scrapegraphai/smart-scraper": - try: - from scrapegraph_py import Client - from scrapegraph_py.logger import sgai_logger - except ImportError: - raise ImportError( - "scrapegraph_py is not installed. Please install it using 'pip install scrapegraph-py'." - ) + from ..integrations.scrapegraph_py_compat import extract as sgai_extract - sgai_logger.set_logging(level="INFO") - - # Initialize the client with explicit API key - sgai_client = Client(api_key=self.config.get("api_key")) - - # SmartScraper request - response = sgai_client.smartscraper( - website_url=self.source, - user_prompt=self.prompt, + response = sgai_extract( + api_key=self.config.get("api_key"), + url=self.source, + prompt=self.prompt, + schema=self.schema, ) - # Use logging instead of print for better production practices - if "request_id" in response and "result" in response: - logger.info(f"Request ID: {response['request_id']}") - logger.info(f"Result: {response['result']}") - else: - logger.warning("Missing expected keys in response.") - - sgai_client.close() + if isinstance(response, dict): + if "id" in response: + logger.info(f"Request ID: {response['id']}") + if "data" in response: + logger.info(f"Result: {response['data']}") return response diff --git a/scrapegraphai/integrations/scrapegraph_py_compat.py b/scrapegraphai/integrations/scrapegraph_py_compat.py new file mode 100644 index 00000000..651d194f --- /dev/null +++ b/scrapegraphai/integrations/scrapegraph_py_compat.py @@ -0,0 +1,109 @@ +""" +Compatibility layer for scrapegraph-py SDK. 
+ +Supports both the v2 `Client` API (PR #82) and the newer `ScrapeGraphAI` +API (PR #84) which uses Pydantic request models and an ApiResult wrapper. +""" + +from __future__ import annotations + +from typing import Any, Optional, Type + +from pydantic import BaseModel + + +def _detect_api() -> str: + try: + from scrapegraph_py import ScrapeGraphAI # noqa: F401 + + return "v3" + except ImportError: + pass + try: + from scrapegraph_py import Client # noqa: F401 + + return "v2" + except ImportError as e: + raise ImportError( + "scrapegraph_py is not installed. Install it with 'pip install scrapegraph-py'." + ) from e + + +def _schema_to_dict(schema: Optional[Type[BaseModel]]) -> Optional[dict]: + if schema is None: + return None + if isinstance(schema, dict): + return schema + if isinstance(schema, type) and issubclass(schema, BaseModel): + return schema.model_json_schema() + return None + + +def _unwrap_result(result: Any) -> dict: + if hasattr(result, "status") and hasattr(result, "data"): + if result.status != "success": + raise RuntimeError( + getattr(result, "error", "scrapegraph-py request failed") + ) + data = result.data + if hasattr(data, "model_dump"): + return data.model_dump(by_alias=True, exclude_none=True) + return data if isinstance(data, dict) else {"data": data} + return result + + +def extract( + api_key: Optional[str], + url: str, + prompt: str, + schema: Optional[Type[BaseModel]] = None, +) -> dict: + """Call the scrapegraph-py extract endpoint across SDK versions.""" + api = _detect_api() + + if api == "v3": + from scrapegraph_py import ExtractRequest, ScrapeGraphAI + + kwargs: dict[str, Any] = {"url": url, "prompt": prompt} + schema_dict = _schema_to_dict(schema) + if schema_dict is not None: + kwargs["schema_"] = schema_dict + with ScrapeGraphAI(api_key=api_key) as client: + return _unwrap_result(client.extract(ExtractRequest(**kwargs))) + + from scrapegraph_py import Client + + with Client(api_key=api_key) as client: + return 
client.extract(url=url, prompt=prompt, output_schema=schema) + + +def scrape(api_key: Optional[str], url: str) -> dict: + """Call the scrapegraph-py scrape endpoint across SDK versions.""" + api = _detect_api() + + if api == "v3": + from scrapegraph_py import ScrapeGraphAI, ScrapeRequest + + with ScrapeGraphAI(api_key=api_key) as client: + return _unwrap_result(client.scrape(ScrapeRequest(url=url))) + + from scrapegraph_py import Client + + with Client(api_key=api_key) as client: + return client.scrape(url=url) + + +def search(api_key: Optional[str], query: str) -> dict: + """Call the scrapegraph-py search endpoint across SDK versions.""" + api = _detect_api() + + if api == "v3": + from scrapegraph_py import ScrapeGraphAI, SearchRequest + + with ScrapeGraphAI(api_key=api_key) as client: + return _unwrap_result(client.search(SearchRequest(query=query))) + + from scrapegraph_py import Client + + with Client(api_key=api_key) as client: + return client.search(query=query) diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 88b73b63..ada86e59 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -83,6 +83,10 @@ def __init__( None if node_config is None else node_config.get("scrape_do", None) ) + self.plasmate = ( + None if node_config is None else node_config.get("plasmate", None) + ) + self.storage_state = ( None if node_config is None else node_config.get("storage_state", None) ) @@ -351,6 +355,19 @@ def handle_web_source(self, state, source): ) document = [Document(page_content=data, metadata={"source": source})] + elif self.plasmate is not None: + from ..docloaders.plasmate import PlasmateLoader + + plasmate_cfg = self.plasmate if isinstance(self.plasmate, dict) else {} + loader = PlasmateLoader( + [source], + output_format=plasmate_cfg.get("output_format", "text"), + timeout=plasmate_cfg.get("timeout", self.timeout or 30), + selector=plasmate_cfg.get("selector"), + 
# ---------------------------------------------------------------------------
# Shared helpers
# ---------------------------------------------------------------------------

def _make_loader(urls=None, **kwargs):
    """Build a PlasmateLoader, defaulting to a single example.com URL."""
    target_urls = ["https://example.com"] if urls is None else urls
    return PlasmateLoader(target_urls, **kwargs)


def _mock_run(stdout: str, returncode: int = 0):
    """Return a MagicMock standing in for subprocess.CompletedProcess."""
    return MagicMock(stdout=stdout, returncode=returncode, stderr="")


def _collect_async(loader):
    """Drain ``loader.alazy_load()`` on a fresh event loop and return a list."""

    async def _gather():
        collected = []
        async for document in loader.alazy_load():
            collected.append(document)
        return collected

    return asyncio.run(_gather())


# ---------------------------------------------------------------------------
# Constructor behaviour
# ---------------------------------------------------------------------------

def test_init_defaults():
    """A loader built without options exposes the default settings."""
    plasmate = _make_loader()
    assert plasmate.output_format == "text"
    assert plasmate.timeout == 30
    assert plasmate.selector is None
    assert plasmate.extra_headers == {}
    assert plasmate.fallback_to_chrome is False


def test_init_custom_params():
    """Explicit constructor options are stored on the loader."""
    plasmate = _make_loader(
        output_format="som",
        timeout=60,
        selector="main",
        extra_headers={"X-Custom": "value"},
        fallback_to_chrome=True,
    )
    assert plasmate.output_format == "som"
    assert plasmate.timeout == 60
    assert plasmate.selector == "main"
    assert plasmate.extra_headers == {"X-Custom": "value"}
    assert plasmate.fallback_to_chrome is True


def test_init_invalid_format():
    """An unsupported output_format is rejected with ValueError."""
    with pytest.raises(ValueError, match="output_format"):
        _make_loader(output_format="html")


# ---------------------------------------------------------------------------
# Command-line construction
# ---------------------------------------------------------------------------

def test_build_cmd_defaults():
    """The default command carries the URL, format and timeout arguments."""
    argv = _make_loader(urls=["https://example.com"])._build_cmd("https://example.com")
    assert "plasmate" in argv[0]
    for token in ("fetch", "https://example.com", "--format", "text", "--timeout", "30000"):
        assert token in argv


def test_build_cmd_with_selector():
    """A configured selector follows the --selector flag."""
    argv = _make_loader(selector="main")._build_cmd("https://example.com")
    assert "--selector" in argv
    assert argv[argv.index("--selector") + 1] == "main"


def test_build_cmd_with_headers():
    """Extra headers are passed as the value after --header."""
    argv = _make_loader(
        extra_headers={"Authorization": "Bearer token"}
    )._build_cmd("https://example.com")
    assert "--header" in argv
    header_value = argv[argv.index("--header") + 1]
    assert "Authorization: Bearer token" in header_value


# ---------------------------------------------------------------------------
# lazy_load: successful fetches
# ---------------------------------------------------------------------------

@patch("shutil.which", return_value="/usr/local/bin/plasmate")
@patch("subprocess.run")
def test_lazy_load_yields_document(fake_run, fake_which):
    """One URL with content produces one Document with plasmate metadata."""
    fake_run.return_value = _mock_run("Page content extracted by Plasmate")
    documents = list(_make_loader(urls=["https://example.com"]).lazy_load())
    assert len(documents) == 1
    only = documents[0]
    assert isinstance(only, Document)
    assert "Page content" in only.page_content
    assert only.metadata["source"] == "https://example.com"
    assert only.metadata["loader"] == "plasmate"
    assert only.metadata["format"] == "text"


@patch("shutil.which", return_value="/usr/local/bin/plasmate")
@patch("subprocess.run")
def test_lazy_load_multiple_urls(fake_run, fake_which):
    """Documents come back in the same order as the input URLs."""
    fake_run.side_effect = [
        _mock_run("Content for first"),
        _mock_run("Content for second"),
    ]
    documents = list(
        _make_loader(urls=["https://first.com", "https://second.com"]).lazy_load()
    )
    assert len(documents) == 2
    for document, marker in zip(documents, ("first", "second")):
        assert marker in document.page_content


@patch("shutil.which", return_value="/usr/local/bin/plasmate")
@patch("subprocess.run")
def test_lazy_load_markdown_format(fake_run, fake_which):
    """The requested output format is recorded in the document metadata."""
    fake_run.return_value = _mock_run("# Heading\n\nSome text")
    documents = list(_make_loader(output_format="markdown").lazy_load())
    first = documents[0]
    assert first.metadata["format"] == "markdown"
    assert "# Heading" in first.page_content


# ---------------------------------------------------------------------------
# lazy_load: failures
# ---------------------------------------------------------------------------

@patch("shutil.which", return_value="/usr/local/bin/plasmate")
@patch("subprocess.run")
def test_lazy_load_skips_empty_content(fake_run, fake_which, caplog):
    """Empty stdout yields no documents and logs an 'Empty content' message."""
    fake_run.return_value = _mock_run("")
    documents = list(_make_loader().lazy_load())
    assert documents == []
    assert "Empty content" in caplog.text


@patch("shutil.which", return_value="/usr/local/bin/plasmate")
@patch("subprocess.run")
def test_lazy_load_nonzero_returncode_skips(fake_run, fake_which, caplog):
    """A non-zero exit status with empty stdout yields no documents."""
    fake_run.return_value = _mock_run("", returncode=1)
    documents = list(_make_loader().lazy_load())
    assert documents == []


@patch("shutil.which", return_value="/usr/local/bin/plasmate")
@patch("subprocess.run")
def test_lazy_load_timeout_skips(fake_run, fake_which, caplog):
    """A subprocess timeout yields no documents and logs a 'Timeout' message."""
    fake_run.side_effect = subprocess.TimeoutExpired(cmd="plasmate", timeout=30)
    documents = list(_make_loader().lazy_load())
    assert documents == []
    assert "Timeout" in caplog.text


@patch("shutil.which", return_value=None)
def test_lazy_load_no_binary_raises(fake_which):
    """A missing plasmate binary surfaces as ImportError."""
    plasmate = _make_loader()
    with pytest.raises(ImportError, match="plasmate is required"):
        list(plasmate.lazy_load())


# ---------------------------------------------------------------------------
# Chrome fallback
# ---------------------------------------------------------------------------

@patch("shutil.which", return_value="/usr/local/bin/plasmate")
@patch("subprocess.run")
def test_fallback_to_chrome_on_empty(fake_run, fake_which):
    """With fallback enabled, empty Plasmate output is replaced by Chrome's."""
    fake_run.return_value = _mock_run("")

    chrome_stub = MagicMock()
    chrome_stub.load.return_value = [
        Document(
            page_content="Chrome fallback",
            metadata={"source": "https://example.com"},
        )
    ]

    with patch(
        "scrapegraphai.docloaders.plasmate.ChromiumLoader",
        return_value=chrome_stub,
    ):
        documents = list(_make_loader(fallback_to_chrome=True).lazy_load())

    assert len(documents) == 1
    assert "Chrome fallback" in documents[0].page_content


@patch("shutil.which", return_value="/usr/local/bin/plasmate")
@patch("subprocess.run")
def test_no_fallback_when_content_present(fake_run, fake_which):
    """When Plasmate returns content, Chrome fallback should not be called."""
    fake_run.return_value = _mock_run("Real Plasmate content")

    with patch("scrapegraphai.docloaders.plasmate.ChromiumLoader") as chrome_cls:
        documents = list(_make_loader(fallback_to_chrome=True).lazy_load())

    chrome_cls.assert_not_called()
    assert len(documents) == 1
    assert "Real Plasmate content" in documents[0].page_content


# ---------------------------------------------------------------------------
# alazy_load
# ---------------------------------------------------------------------------

@patch("shutil.which", return_value="/usr/local/bin/plasmate")
@patch("subprocess.run")
def test_alazy_load_yields_documents(fake_run, fake_which):
    """The async path yields one document per URL."""
    fake_run.side_effect = [
        _mock_run("Async content A"),
        _mock_run("Async content B"),
    ]
    documents = _collect_async(_make_loader(urls=["https://a.com", "https://b.com"]))
    assert len(documents) == 2
    seen_sources = {document.metadata["source"] for document in documents}
    assert "https://a.com" in seen_sources
    assert "https://b.com" in seen_sources


@patch("shutil.which", return_value="/usr/local/bin/plasmate")
@patch("subprocess.run")
def test_alazy_load_skips_empty(fake_run, fake_which):
    """The async path yields nothing for empty Plasmate output."""
    fake_run.return_value = _mock_run("")
    assert _collect_async(_make_loader()) == []


# ---------------------------------------------------------------------------
# Empty URL list
# ---------------------------------------------------------------------------

@patch("shutil.which", return_value="/usr/local/bin/plasmate")
def test_lazy_load_empty_urls(fake_which):
    """No URLs means no documents on the sync path."""
    assert list(_make_loader(urls=[]).lazy_load()) == []


@patch("shutil.which", return_value="/usr/local/bin/plasmate")
def test_alazy_load_empty_urls(fake_which):
    """No URLs means no documents on the async path."""
    assert _collect_async(_make_loader(urls=[])) == []