From 42c99cd4cd9448dc5a2961c0c8a522107c75ed34 Mon Sep 17 00:00:00 2001
From: FrancescoSaverioZuppichini
Date: Tue, 21 Apr 2026 11:58:51 +0200
Subject: [PATCH 1/6] feat!: replace *Request objects with inline kwargs on
 public methods

BREAKING CHANGE: Public client methods now accept keyword arguments
directly instead of Pydantic *Request objects. This matches the JS SDK
and the conventional ergonomics of Python SDKs such as OpenAI,
Anthropic, and Stripe.

Before:

    sgai.scrape(ScrapeRequest(url="https://example.com", formats=[...]))

After:

    sgai.scrape("https://example.com", formats=[...])

The *Request classes remain exported for users who want to build
payloads manually, but are no longer accepted by the client methods.
Structured types (FetchConfig, format configs, schema dicts) are still
passed as-is.

- Refactored ScrapeGraphAI + AsyncScrapeGraphAI sync/async methods
- Added _compact helper to strip None kwargs so Pydantic defaults apply
- Updated all 32 examples and README
- Updated unit + integration tests

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 README.md                                     |  90 ++++----
 examples/crawl/crawl_basic.py                 |  10 +-
 examples/crawl/crawl_basic_async.py           |  12 +-
 examples/crawl/crawl_with_formats.py          |  13 +-
 examples/crawl/crawl_with_formats_async.py    |  13 +-
 examples/extract/extract_basic.py             |  10 +-
 examples/extract/extract_basic_async.py       |  12 +-
 examples/extract/extract_with_schema.py       |  10 +-
 examples/extract/extract_with_schema_async.py |  12 +-
 examples/monitor/monitor_basic.py             |  34 +--
 examples/monitor/monitor_basic_async.py       |  38 ++--
 examples/monitor/monitor_with_webhook.py      |  34 +--
 .../monitor/monitor_with_webhook_async.py     |  38 ++--
 examples/scrape/scrape_basic.py               |   9 +-
 examples/scrape/scrape_basic_async.py         |  12 +-
 examples/scrape/scrape_json_extraction.py     |  10 +-
 .../scrape/scrape_json_extraction_async.py    |  12 +-
 examples/scrape/scrape_multi_format.py        |  12 +-
 examples/scrape/scrape_multi_format_async.py  |  13 +-
 examples/scrape/scrape_pdf.py                 |   9 +-
 examples/scrape/scrape_pdf_async.py           |  12 +-
 examples/scrape/scrape_with_fetchconfig.py    |   9 +-
 .../scrape/scrape_with_fetchconfig_async.py   |  12 +-
 examples/search/search_basic.py               |   9 +-
 examples/search/search_basic_async.py         |  12 +-
 examples/search/search_with_extraction.py     |  10 +-
 .../search/search_with_extraction_async.py    |  12 +-
 examples/utilities/credits.py                 |   1 +
 examples/utilities/credits_async.py           |   4 +
 examples/utilities/health.py                  |   1 +
 examples/utilities/health_async.py            |   4 +
 examples/utilities/history.py                 |   5 +-
 examples/utilities/history_async.py           |  12 +-
 src/scrapegraph_py/async_client.py            | 212 +++++++++++++++---
 src/scrapegraph_py/client.py                  | 212 +++++++++++++++---
 tests/test_client.py                          | 150 +++++-------
 tests/test_integration.py                     |  65 ++----
 uv.lock                                       |   2 +-
 38 files changed, 751 insertions(+), 396 deletions(-)

diff --git a/README.md b/README.md
index c601dfdd..9262da51 100644
--- a/README.md
+++ b/README.md
@@ -22,14 +22,12 @@ uv add scrapegraph-py
 ## Quick Start
 
 ```python
-from scrapegraph_py import ScrapeGraphAI, ScrapeRequest
+from scrapegraph_py import ScrapeGraphAI
 
 # reads SGAI_API_KEY from env, or pass explicitly: ScrapeGraphAI(api_key="...")
 sgai = ScrapeGraphAI()
 
-result = sgai.scrape(ScrapeRequest(
-    url="https://example.com",
-))
+result = sgai.scrape("https://example.com")
 
 if result.status == "success":
     print(result.data["results"]["markdown"]["data"])
@@ -56,14 +54,14 @@ Scrape a webpage in multiple formats (markdown, html, screenshot, json, etc).
```python from scrapegraph_py import ( - ScrapeGraphAI, ScrapeRequest, FetchConfig, + ScrapeGraphAI, FetchConfig, MarkdownFormatConfig, ScreenshotFormatConfig, JsonFormatConfig ) sgai = ScrapeGraphAI() -res = sgai.scrape(ScrapeRequest( - url="https://example.com", +res = sgai.scrape( + "https://example.com", formats=[ MarkdownFormatConfig(mode="reader"), ScreenshotFormatConfig(full_page=True, width=1440, height=900), @@ -80,7 +78,7 @@ res = sgai.scrape(ScrapeRequest( cookies={"session": "abc"}, country="us", ), -)) +) ``` **Formats:** @@ -98,18 +96,17 @@ res = sgai.scrape(ScrapeRequest( Extract structured data from a URL, HTML, or markdown using AI. ```python -from scrapegraph_py import ScrapeGraphAI, ExtractRequest +from scrapegraph_py import ScrapeGraphAI sgai = ScrapeGraphAI() -res = sgai.extract(ExtractRequest( - url="https://example.com", +res = sgai.extract( prompt="Extract product names and prices", + url="https://example.com", schema={"type": "object", "properties": {...}}, # optional mode="reader", # optional - fetch_config=FetchConfig(...), # optional -)) -# Or pass html/markdown directly instead of url + # Or pass html/markdown directly instead of url +) ``` ### search @@ -117,20 +114,19 @@ res = sgai.extract(ExtractRequest( Search the web and optionally extract structured data. ```python -from scrapegraph_py import ScrapeGraphAI, SearchRequest +from scrapegraph_py import ScrapeGraphAI sgai = ScrapeGraphAI() -res = sgai.search(SearchRequest( - query="best programming languages 2024", +res = sgai.search( + "best programming languages 2024", num_results=5, # 1-20, default 3 format="markdown", # "markdown" | "html" prompt="Extract key points", # optional, for AI extraction schema={...}, # optional time_range="past_week", # optional location_geo_code="us", # optional - fetch_config=FetchConfig(...), # optional -)) +) ``` ### crawl @@ -138,21 +134,20 @@ res = sgai.search(SearchRequest( Crawl a website and its linked pages. ```python -from scrapegraph_py import ScrapeGraphAI, CrawlRequest, MarkdownFormatConfig +from scrapegraph_py import ScrapeGraphAI, MarkdownFormatConfig sgai = ScrapeGraphAI() # Start a crawl -start = sgai.crawl.start(CrawlRequest( - url="https://example.com", +start = sgai.crawl.start( + "https://example.com", formats=[MarkdownFormatConfig()], max_pages=50, max_depth=2, max_links_per_page=10, include_patterns=["/blog/*"], exclude_patterns=["/admin/*"], - fetch_config=FetchConfig(...), -)) +) # Check status status = sgai.crawl.get(start.data["id"]) @@ -168,24 +163,23 @@ sgai.crawl.delete(crawl_id) Monitor a webpage for changes on a schedule. ```python -from scrapegraph_py import ScrapeGraphAI, MonitorCreateRequest, MarkdownFormatConfig +from scrapegraph_py import ScrapeGraphAI, MarkdownFormatConfig sgai = ScrapeGraphAI() # Create a monitor -mon = sgai.monitor.create(MonitorCreateRequest( - url="https://example.com", +mon = sgai.monitor.create( + "https://example.com", + "0 * * * *", # cron expression name="Price Monitor", - interval="0 * * * *", # cron expression formats=[MarkdownFormatConfig()], webhook_url="https://...", # optional - fetch_config=FetchConfig(...), -)) +) # Manage monitors sgai.monitor.list() sgai.monitor.get(cron_id) -sgai.monitor.update(cron_id, MonitorUpdateRequest(interval="0 */6 * * *")) +sgai.monitor.update(cron_id, interval="0 */6 * * *") sgai.monitor.pause(cron_id) sgai.monitor.resume(cron_id) sgai.monitor.delete(cron_id) @@ -196,15 +190,15 @@ sgai.monitor.delete(cron_id) Fetch request history. 
```python -from scrapegraph_py import ScrapeGraphAI, HistoryFilter +from scrapegraph_py import ScrapeGraphAI sgai = ScrapeGraphAI() -history = sgai.history.list(HistoryFilter( +history = sgai.history.list( service="scrape", # optional filter page=1, limit=20, -)) +) entry = sgai.history.get("request-id") ``` @@ -229,11 +223,11 @@ All methods have async equivalents via `AsyncScrapeGraphAI`: ```python import asyncio -from scrapegraph_py import AsyncScrapeGraphAI, ScrapeRequest +from scrapegraph_py import AsyncScrapeGraphAI async def main(): async with AsyncScrapeGraphAI() as sgai: - result = await sgai.scrape(ScrapeRequest(url="https://example.com")) + result = await sgai.scrape("https://example.com") if result.status == "success": print(result.data["results"]["markdown"]["data"]) else: @@ -246,30 +240,24 @@ asyncio.run(main()) ```python async with AsyncScrapeGraphAI() as sgai: - res = await sgai.extract(ExtractRequest( - url="https://example.com", + res = await sgai.extract( prompt="Extract product names and prices", - )) + url="https://example.com", + ) ``` ### Async Search ```python async with AsyncScrapeGraphAI() as sgai: - res = await sgai.search(SearchRequest( - query="best programming languages 2024", - num_results=5, - )) + res = await sgai.search("best programming languages 2024", num_results=5) ``` ### Async Crawl ```python async with AsyncScrapeGraphAI() as sgai: - start = await sgai.crawl.start(CrawlRequest( - url="https://example.com", - max_pages=50, - )) + start = await sgai.crawl.start("https://example.com", max_pages=50) status = await sgai.crawl.get(start.data["id"]) ``` @@ -277,11 +265,11 @@ async with AsyncScrapeGraphAI() as sgai: ```python async with AsyncScrapeGraphAI() as sgai: - mon = await sgai.monitor.create(MonitorCreateRequest( - url="https://example.com", + mon = await sgai.monitor.create( + "https://example.com", + "0 * * * *", name="Price Monitor", - interval="0 * * * *", - )) + ) ``` ## Examples diff --git a/examples/crawl/crawl_basic.py b/examples/crawl/crawl_basic.py index 0c8cfdf3..c173f8de 100644 --- a/examples/crawl/crawl_basic.py +++ b/examples/crawl/crawl_basic.py @@ -1,16 +1,18 @@ from dotenv import load_dotenv + load_dotenv() import time -from scrapegraph_py import ScrapeGraphAI, CrawlRequest + +from scrapegraph_py import ScrapeGraphAI sgai = ScrapeGraphAI() -start_res = sgai.crawl.start(CrawlRequest( - url="https://scrapegraphai.com/", +start_res = sgai.crawl.start( + "https://scrapegraphai.com/", max_pages=5, max_depth=2, -)) +) if start_res.status != "success" or not start_res.data: print("Failed to start:", start_res.error) diff --git a/examples/crawl/crawl_basic_async.py b/examples/crawl/crawl_basic_async.py index 30fd0b79..286b02be 100644 --- a/examples/crawl/crawl_basic_async.py +++ b/examples/crawl/crawl_basic_async.py @@ -1,16 +1,19 @@ from dotenv import load_dotenv + load_dotenv() import asyncio -from scrapegraph_py import AsyncScrapeGraphAI, CrawlRequest + +from scrapegraph_py import AsyncScrapeGraphAI + async def main(): async with AsyncScrapeGraphAI() as sgai: - start_res = await sgai.crawl.start(CrawlRequest( - url="https://scrapegraphai.com/", + start_res = await sgai.crawl.start( + "https://scrapegraphai.com/", max_pages=5, max_depth=2, - )) + ) if start_res.status != "success" or not start_res.data: print("Failed to start:", start_res.error) @@ -33,4 +36,5 @@ async def main(): for page in get_res.data.pages: print(f" {page.url} - {page.status}") + asyncio.run(main()) diff --git a/examples/crawl/crawl_with_formats.py 
b/examples/crawl/crawl_with_formats.py index 1026b384..018675f3 100644 --- a/examples/crawl/crawl_with_formats.py +++ b/examples/crawl/crawl_with_formats.py @@ -1,25 +1,26 @@ from dotenv import load_dotenv + load_dotenv() import time + from scrapegraph_py import ( - ScrapeGraphAI, - CrawlRequest, - MarkdownFormatConfig, LinksFormatConfig, + MarkdownFormatConfig, + ScrapeGraphAI, ) sgai = ScrapeGraphAI() -start_res = sgai.crawl.start(CrawlRequest( - url="https://scrapegraphai.com/", +start_res = sgai.crawl.start( + "https://scrapegraphai.com/", max_pages=3, max_depth=1, formats=[ MarkdownFormatConfig(), LinksFormatConfig(), ], -)) +) if start_res.status != "success" or not start_res.data: print("Failed to start:", start_res.error) diff --git a/examples/crawl/crawl_with_formats_async.py b/examples/crawl/crawl_with_formats_async.py index d238a58c..2976a952 100644 --- a/examples/crawl/crawl_with_formats_async.py +++ b/examples/crawl/crawl_with_formats_async.py @@ -1,25 +1,27 @@ from dotenv import load_dotenv + load_dotenv() import asyncio + from scrapegraph_py import ( AsyncScrapeGraphAI, - CrawlRequest, - MarkdownFormatConfig, LinksFormatConfig, + MarkdownFormatConfig, ) + async def main(): async with AsyncScrapeGraphAI() as sgai: - start_res = await sgai.crawl.start(CrawlRequest( - url="https://scrapegraphai.com/", + start_res = await sgai.crawl.start( + "https://scrapegraphai.com/", max_pages=3, max_depth=1, formats=[ MarkdownFormatConfig(), LinksFormatConfig(), ], - )) + ) if start_res.status != "success" or not start_res.data: print("Failed to start:", start_res.error) @@ -44,4 +46,5 @@ async def main(): print(f" Status: {page.status}") print(f" Depth: {page.depth}") + asyncio.run(main()) diff --git a/examples/extract/extract_basic.py b/examples/extract/extract_basic.py index 5bb82aca..9235c440 100644 --- a/examples/extract/extract_basic.py +++ b/examples/extract/extract_basic.py @@ -1,15 +1,17 @@ from dotenv import load_dotenv + load_dotenv() import json -from scrapegraph_py import ScrapeGraphAI, ExtractRequest + +from scrapegraph_py import ScrapeGraphAI sgai = ScrapeGraphAI() -res = sgai.extract(ExtractRequest( +res = sgai.extract( + "What is this page about? Extract the main heading and description.", url="https://example.com", - prompt="What is this page about? Extract the main heading and description.", -)) +) if res.status == "success": print("Extracted:", json.dumps(res.data.json_data, indent=2)) diff --git a/examples/extract/extract_basic_async.py b/examples/extract/extract_basic_async.py index be98fcf4..c8d6d22d 100644 --- a/examples/extract/extract_basic_async.py +++ b/examples/extract/extract_basic_async.py @@ -1,16 +1,19 @@ from dotenv import load_dotenv + load_dotenv() import asyncio import json -from scrapegraph_py import AsyncScrapeGraphAI, ExtractRequest + +from scrapegraph_py import AsyncScrapeGraphAI + async def main(): async with AsyncScrapeGraphAI() as sgai: - res = await sgai.extract(ExtractRequest( + res = await sgai.extract( + "What is this page about? Extract the main heading and description.", url="https://example.com", - prompt="What is this page about? 
Extract the main heading and description.", - )) + ) if res.status == "success": print("Extracted:", json.dumps(res.data.json_data, indent=2)) @@ -18,4 +21,5 @@ async def main(): else: print("Failed:", res.error) + asyncio.run(main()) diff --git a/examples/extract/extract_with_schema.py b/examples/extract/extract_with_schema.py index b10a68d1..a79e9f58 100644 --- a/examples/extract/extract_with_schema.py +++ b/examples/extract/extract_with_schema.py @@ -1,14 +1,16 @@ from dotenv import load_dotenv + load_dotenv() import json -from scrapegraph_py import ScrapeGraphAI, ExtractRequest + +from scrapegraph_py import ScrapeGraphAI sgai = ScrapeGraphAI() -res = sgai.extract(ExtractRequest( +res = sgai.extract( + "Extract structured information about this page", url="https://example.com", - prompt="Extract structured information about this page", schema={ "type": "object", "properties": { @@ -21,7 +23,7 @@ }, "required": ["title"], }, -)) +) if res.status == "success": print("Extracted:", json.dumps(res.data.json_data, indent=2)) diff --git a/examples/extract/extract_with_schema_async.py b/examples/extract/extract_with_schema_async.py index 6a6641d8..1d43a082 100644 --- a/examples/extract/extract_with_schema_async.py +++ b/examples/extract/extract_with_schema_async.py @@ -1,15 +1,18 @@ from dotenv import load_dotenv + load_dotenv() import asyncio import json -from scrapegraph_py import AsyncScrapeGraphAI, ExtractRequest + +from scrapegraph_py import AsyncScrapeGraphAI + async def main(): async with AsyncScrapeGraphAI() as sgai: - res = await sgai.extract(ExtractRequest( + res = await sgai.extract( + "Extract structured information about this page", url="https://example.com", - prompt="Extract structured information about this page", schema={ "type": "object", "properties": { @@ -22,7 +25,7 @@ async def main(): }, "required": ["title"], }, - )) + ) if res.status == "success": print("Extracted:", json.dumps(res.data.json_data, indent=2)) @@ -31,4 +34,5 @@ async def main(): else: print("Failed:", res.error) + asyncio.run(main()) diff --git a/examples/monitor/monitor_basic.py b/examples/monitor/monitor_basic.py index 5a500200..bc3fb4f9 100644 --- a/examples/monitor/monitor_basic.py +++ b/examples/monitor/monitor_basic.py @@ -1,28 +1,32 @@ from dotenv import load_dotenv + load_dotenv() import json import signal import time -from scrapegraph_py import ScrapeGraphAI, MonitorCreateRequest, JsonFormatConfig + +from scrapegraph_py import JsonFormatConfig, ScrapeGraphAI sgai = ScrapeGraphAI() -res = sgai.monitor.create(MonitorCreateRequest( - url="https://time.is/", +res = sgai.monitor.create( + "https://time.is/", + "*/10 * * * *", name="Time Monitor", - interval="*/10 * * * *", - formats=[JsonFormatConfig( - prompt="Extract the current time", - schema={ - "type": "object", - "properties": { - "time": {"type": "string"}, + formats=[ + JsonFormatConfig( + prompt="Extract the current time", + schema={ + "type": "object", + "properties": { + "time": {"type": "string"}, + }, + "required": ["time"], }, - "required": ["time"], - }, - )], -)) + ) + ], +) if res.status != "success" or not res.data: print("Failed to create monitor:", res.error) @@ -33,12 +37,14 @@ print(f"Interval: {res.data.interval}") print("\nPolling for activity (Ctrl+C to stop)...\n") + def cleanup(_sig, _frame): print("\nStopping monitor...") sgai.monitor.delete(monitor_id) print("Monitor deleted") exit(0) + signal.signal(signal.SIGINT, cleanup) seen_ids = set() diff --git a/examples/monitor/monitor_basic_async.py 
b/examples/monitor/monitor_basic_async.py index 137e621f..0d74a200 100644 --- a/examples/monitor/monitor_basic_async.py +++ b/examples/monitor/monitor_basic_async.py @@ -1,27 +1,32 @@ from dotenv import load_dotenv + load_dotenv() import asyncio import json -from scrapegraph_py import AsyncScrapeGraphAI, MonitorCreateRequest, JsonFormatConfig + +from scrapegraph_py import AsyncScrapeGraphAI, JsonFormatConfig + async def main(): async with AsyncScrapeGraphAI() as sgai: - res = await sgai.monitor.create(MonitorCreateRequest( - url="https://time.is/", + res = await sgai.monitor.create( + "https://time.is/", + "*/10 * * * *", name="Time Monitor", - interval="*/10 * * * *", - formats=[JsonFormatConfig( - prompt="Extract the current time", - schema={ - "type": "object", - "properties": { - "time": {"type": "string"}, + formats=[ + JsonFormatConfig( + prompt="Extract the current time", + schema={ + "type": "object", + "properties": { + "time": {"type": "string"}, + }, + "required": ["time"], }, - "required": ["time"], - }, - )], - )) + ) + ], + ) if res.status != "success" or not res.data: print("Failed to create monitor:", res.error) @@ -44,7 +49,9 @@ async def main(): seen_ids.add(tick.id) changes = "CHANGED" if tick.changed else "no change" - print(f"[{tick.created_at}] {tick.status} - {changes} ({tick.elapsed_ms}ms)") + print( + f"[{tick.created_at}] {tick.status} - {changes} ({tick.elapsed_ms}ms)" + ) diffs = tick.diffs.model_dump(exclude_none=True) if diffs: print(f" Diffs: {json.dumps(diffs, indent=2)}") @@ -58,4 +65,5 @@ async def main(): await sgai.monitor.delete(monitor_id) print("Monitor deleted") + asyncio.run(main()) diff --git a/examples/monitor/monitor_with_webhook.py b/examples/monitor/monitor_with_webhook.py index 710865d6..936f8f77 100644 --- a/examples/monitor/monitor_with_webhook.py +++ b/examples/monitor/monitor_with_webhook.py @@ -1,29 +1,33 @@ from dotenv import load_dotenv + load_dotenv() import json import signal import time -from scrapegraph_py import ScrapeGraphAI, MonitorCreateRequest, JsonFormatConfig + +from scrapegraph_py import JsonFormatConfig, ScrapeGraphAI sgai = ScrapeGraphAI() -res = sgai.monitor.create(MonitorCreateRequest( - url="https://time.is/", +res = sgai.monitor.create( + "https://time.is/", + "*/10 * * * *", name="Time Monitor with Webhook", - interval="*/10 * * * *", webhook_url="https://your-webhook-endpoint.com/hook", - formats=[JsonFormatConfig( - prompt="Extract the current time", - schema={ - "type": "object", - "properties": { - "time": {"type": "string"}, + formats=[ + JsonFormatConfig( + prompt="Extract the current time", + schema={ + "type": "object", + "properties": { + "time": {"type": "string"}, + }, + "required": ["time"], }, - "required": ["time"], - }, - )], -)) + ) + ], +) if res.status != "success" or not res.data: print("Failed to create monitor:", res.error) @@ -35,12 +39,14 @@ print("Webhook configured") print("\nPolling for activity (Ctrl+C to stop)...\n") + def cleanup(_sig, _frame): print("\nStopping monitor...") sgai.monitor.delete(monitor_id) print("Monitor deleted") exit(0) + signal.signal(signal.SIGINT, cleanup) seen_ids = set() diff --git a/examples/monitor/monitor_with_webhook_async.py b/examples/monitor/monitor_with_webhook_async.py index faac49d8..a58f7308 100644 --- a/examples/monitor/monitor_with_webhook_async.py +++ b/examples/monitor/monitor_with_webhook_async.py @@ -1,28 +1,33 @@ from dotenv import load_dotenv + load_dotenv() import asyncio import json -from scrapegraph_py import AsyncScrapeGraphAI, 
MonitorCreateRequest, JsonFormatConfig + +from scrapegraph_py import AsyncScrapeGraphAI, JsonFormatConfig + async def main(): async with AsyncScrapeGraphAI() as sgai: - res = await sgai.monitor.create(MonitorCreateRequest( - url="https://time.is/", + res = await sgai.monitor.create( + "https://time.is/", + "*/10 * * * *", name="Time Monitor with Webhook", - interval="*/10 * * * *", webhook_url="https://your-webhook-endpoint.com/hook", - formats=[JsonFormatConfig( - prompt="Extract the current time", - schema={ - "type": "object", - "properties": { - "time": {"type": "string"}, + formats=[ + JsonFormatConfig( + prompt="Extract the current time", + schema={ + "type": "object", + "properties": { + "time": {"type": "string"}, + }, + "required": ["time"], }, - "required": ["time"], - }, - )], - )) + ) + ], + ) if res.status != "success" or not res.data: print("Failed to create monitor:", res.error) @@ -46,7 +51,9 @@ async def main(): seen_ids.add(tick.id) changes = "CHANGED" if tick.changed else "no change" - print(f"[{tick.created_at}] {tick.status} - {changes} ({tick.elapsed_ms}ms)") + print( + f"[{tick.created_at}] {tick.status} - {changes} ({tick.elapsed_ms}ms)" + ) diffs = tick.diffs.model_dump(exclude_none=True) if diffs: print(f" Diffs: {json.dumps(diffs, indent=2)}") @@ -60,4 +67,5 @@ async def main(): await sgai.monitor.delete(monitor_id) print("Monitor deleted") + asyncio.run(main()) diff --git a/examples/scrape/scrape_basic.py b/examples/scrape/scrape_basic.py index 5c4c800c..b9766a1f 100644 --- a/examples/scrape/scrape_basic.py +++ b/examples/scrape/scrape_basic.py @@ -1,14 +1,15 @@ from dotenv import load_dotenv + load_dotenv() -from scrapegraph_py import ScrapeGraphAI, ScrapeRequest, MarkdownFormatConfig +from scrapegraph_py import MarkdownFormatConfig, ScrapeGraphAI sgai = ScrapeGraphAI() -res = sgai.scrape(ScrapeRequest( - url="https://example.com", +res = sgai.scrape( + "https://example.com", formats=[MarkdownFormatConfig()], -)) +) if res.status == "success": print("Markdown:", res.data.results.get("markdown", {}).get("data")) diff --git a/examples/scrape/scrape_basic_async.py b/examples/scrape/scrape_basic_async.py index 2e2e2ce2..1f8fcba4 100644 --- a/examples/scrape/scrape_basic_async.py +++ b/examples/scrape/scrape_basic_async.py @@ -1,15 +1,18 @@ from dotenv import load_dotenv + load_dotenv() import asyncio -from scrapegraph_py import AsyncScrapeGraphAI, ScrapeRequest, MarkdownFormatConfig + +from scrapegraph_py import AsyncScrapeGraphAI, MarkdownFormatConfig + async def main(): async with AsyncScrapeGraphAI() as sgai: - res = await sgai.scrape(ScrapeRequest( - url="https://example.com", + res = await sgai.scrape( + "https://example.com", formats=[MarkdownFormatConfig()], - )) + ) if res.status == "success": print("Markdown:", res.data.results.get("markdown", {}).get("data")) @@ -17,4 +20,5 @@ async def main(): else: print("Failed:", res.error) + asyncio.run(main()) diff --git a/examples/scrape/scrape_json_extraction.py b/examples/scrape/scrape_json_extraction.py index 7511f00e..22278657 100644 --- a/examples/scrape/scrape_json_extraction.py +++ b/examples/scrape/scrape_json_extraction.py @@ -1,13 +1,15 @@ from dotenv import load_dotenv + load_dotenv() import json -from scrapegraph_py import ScrapeGraphAI, ScrapeRequest, JsonFormatConfig + +from scrapegraph_py import JsonFormatConfig, ScrapeGraphAI sgai = ScrapeGraphAI() -res = sgai.scrape(ScrapeRequest( - url="https://example.com", +res = sgai.scrape( + "https://example.com", formats=[ JsonFormatConfig( prompt="Extract 
the company name, tagline, and list of features", @@ -25,7 +27,7 @@ }, ), ], -)) +) if res.status == "success": json_result = res.data.results.get("json", {}) diff --git a/examples/scrape/scrape_json_extraction_async.py b/examples/scrape/scrape_json_extraction_async.py index f61d0df5..18b063e1 100644 --- a/examples/scrape/scrape_json_extraction_async.py +++ b/examples/scrape/scrape_json_extraction_async.py @@ -1,14 +1,17 @@ from dotenv import load_dotenv + load_dotenv() import asyncio import json -from scrapegraph_py import AsyncScrapeGraphAI, ScrapeRequest, JsonFormatConfig + +from scrapegraph_py import AsyncScrapeGraphAI, JsonFormatConfig + async def main(): async with AsyncScrapeGraphAI() as sgai: - res = await sgai.scrape(ScrapeRequest( - url="https://example.com", + res = await sgai.scrape( + "https://example.com", formats=[ JsonFormatConfig( prompt="Extract the company name, tagline, and list of features", @@ -26,7 +29,7 @@ async def main(): }, ), ], - )) + ) if res.status == "success": json_result = res.data.results.get("json", {}) @@ -44,4 +47,5 @@ async def main(): else: print("Failed:", res.error) + asyncio.run(main()) diff --git a/examples/scrape/scrape_multi_format.py b/examples/scrape/scrape_multi_format.py index 4e157287..e8834e85 100644 --- a/examples/scrape/scrape_multi_format.py +++ b/examples/scrape/scrape_multi_format.py @@ -1,24 +1,24 @@ from dotenv import load_dotenv + load_dotenv() from scrapegraph_py import ( - ScrapeGraphAI, - ScrapeRequest, - MarkdownFormatConfig, LinksFormatConfig, + MarkdownFormatConfig, + ScrapeGraphAI, ScreenshotFormatConfig, ) sgai = ScrapeGraphAI() -res = sgai.scrape(ScrapeRequest( - url="https://example.com", +res = sgai.scrape( + "https://example.com", formats=[ MarkdownFormatConfig(), LinksFormatConfig(), ScreenshotFormatConfig(width=1280, height=720), ], -)) +) if res.status == "success": results = res.data.results diff --git a/examples/scrape/scrape_multi_format_async.py b/examples/scrape/scrape_multi_format_async.py index cb56891c..0fcc6050 100644 --- a/examples/scrape/scrape_multi_format_async.py +++ b/examples/scrape/scrape_multi_format_async.py @@ -1,25 +1,27 @@ from dotenv import load_dotenv + load_dotenv() import asyncio + from scrapegraph_py import ( AsyncScrapeGraphAI, - ScrapeRequest, - MarkdownFormatConfig, LinksFormatConfig, + MarkdownFormatConfig, ScreenshotFormatConfig, ) + async def main(): async with AsyncScrapeGraphAI() as sgai: - res = await sgai.scrape(ScrapeRequest( - url="https://example.com", + res = await sgai.scrape( + "https://example.com", formats=[ MarkdownFormatConfig(), LinksFormatConfig(), ScreenshotFormatConfig(width=1280, height=720), ], - )) + ) if res.status == "success": results = res.data.results @@ -40,4 +42,5 @@ async def main(): else: print("Failed:", res.error) + asyncio.run(main()) diff --git a/examples/scrape/scrape_pdf.py b/examples/scrape/scrape_pdf.py index ad4992dd..948faea3 100644 --- a/examples/scrape/scrape_pdf.py +++ b/examples/scrape/scrape_pdf.py @@ -1,15 +1,16 @@ from dotenv import load_dotenv + load_dotenv() -from scrapegraph_py import ScrapeGraphAI, ScrapeRequest, MarkdownFormatConfig +from scrapegraph_py import MarkdownFormatConfig, ScrapeGraphAI sgai = ScrapeGraphAI() -res = sgai.scrape(ScrapeRequest( - url="https://pdfobject.com/pdf/sample.pdf", +res = sgai.scrape( + "https://pdfobject.com/pdf/sample.pdf", content_type="application/pdf", formats=[MarkdownFormatConfig()], -)) +) if res.status == "success": print("Markdown:", res.data.results.get("markdown", {}).get("data")) diff 
--git a/examples/scrape/scrape_pdf_async.py b/examples/scrape/scrape_pdf_async.py index 8ac100b4..06f8990d 100644 --- a/examples/scrape/scrape_pdf_async.py +++ b/examples/scrape/scrape_pdf_async.py @@ -1,16 +1,19 @@ from dotenv import load_dotenv + load_dotenv() import asyncio -from scrapegraph_py import AsyncScrapeGraphAI, ScrapeRequest, MarkdownFormatConfig + +from scrapegraph_py import AsyncScrapeGraphAI, MarkdownFormatConfig + async def main(): async with AsyncScrapeGraphAI() as sgai: - res = await sgai.scrape(ScrapeRequest( - url="https://pdfobject.com/pdf/sample.pdf", + res = await sgai.scrape( + "https://pdfobject.com/pdf/sample.pdf", content_type="application/pdf", formats=[MarkdownFormatConfig()], - )) + ) if res.status == "success": print("Markdown:", res.data.results.get("markdown", {}).get("data")) @@ -18,4 +21,5 @@ async def main(): else: print("Failed:", res.error) + asyncio.run(main()) diff --git a/examples/scrape/scrape_with_fetchconfig.py b/examples/scrape/scrape_with_fetchconfig.py index bc3a89bb..e24e902c 100644 --- a/examples/scrape/scrape_with_fetchconfig.py +++ b/examples/scrape/scrape_with_fetchconfig.py @@ -1,12 +1,13 @@ from dotenv import load_dotenv + load_dotenv() -from scrapegraph_py import ScrapeGraphAI, ScrapeRequest, MarkdownFormatConfig, FetchConfig +from scrapegraph_py import FetchConfig, MarkdownFormatConfig, ScrapeGraphAI sgai = ScrapeGraphAI() -res = sgai.scrape(ScrapeRequest( - url="https://example.com", +res = sgai.scrape( + "https://example.com", formats=[MarkdownFormatConfig()], fetch_config=FetchConfig( mode="js", @@ -14,7 +15,7 @@ wait=2000, stealth=True, ), -)) +) if res.status == "success": print("Markdown:", res.data.results.get("markdown", {}).get("data")) diff --git a/examples/scrape/scrape_with_fetchconfig_async.py b/examples/scrape/scrape_with_fetchconfig_async.py index f0fafde7..33d9f676 100644 --- a/examples/scrape/scrape_with_fetchconfig_async.py +++ b/examples/scrape/scrape_with_fetchconfig_async.py @@ -1,13 +1,16 @@ from dotenv import load_dotenv + load_dotenv() import asyncio -from scrapegraph_py import AsyncScrapeGraphAI, ScrapeRequest, MarkdownFormatConfig, FetchConfig + +from scrapegraph_py import AsyncScrapeGraphAI, FetchConfig, MarkdownFormatConfig + async def main(): async with AsyncScrapeGraphAI() as sgai: - res = await sgai.scrape(ScrapeRequest( - url="https://example.com", + res = await sgai.scrape( + "https://example.com", formats=[MarkdownFormatConfig()], fetch_config=FetchConfig( mode="js", @@ -15,7 +18,7 @@ async def main(): wait=2000, stealth=True, ), - )) + ) if res.status == "success": print("Markdown:", res.data.results.get("markdown", {}).get("data")) @@ -23,4 +26,5 @@ async def main(): else: print("Failed:", res.error) + asyncio.run(main()) diff --git a/examples/search/search_basic.py b/examples/search/search_basic.py index 8a84ba9e..cc51b8aa 100644 --- a/examples/search/search_basic.py +++ b/examples/search/search_basic.py @@ -1,14 +1,15 @@ from dotenv import load_dotenv + load_dotenv() -from scrapegraph_py import ScrapeGraphAI, SearchRequest +from scrapegraph_py import ScrapeGraphAI sgai = ScrapeGraphAI() -res = sgai.search(SearchRequest( - query="best programming languages 2024", +res = sgai.search( + "best programming languages 2024", num_results=3, -)) +) if res.status == "success": for result in res.data.results: diff --git a/examples/search/search_basic_async.py b/examples/search/search_basic_async.py index cb758919..2a1dce7a 100644 --- a/examples/search/search_basic_async.py +++ 
b/examples/search/search_basic_async.py @@ -1,15 +1,18 @@ from dotenv import load_dotenv + load_dotenv() import asyncio -from scrapegraph_py import AsyncScrapeGraphAI, SearchRequest + +from scrapegraph_py import AsyncScrapeGraphAI + async def main(): async with AsyncScrapeGraphAI() as sgai: - res = await sgai.search(SearchRequest( - query="best programming languages 2024", + res = await sgai.search( + "best programming languages 2024", num_results=3, - )) + ) if res.status == "success": for result in res.data.results: @@ -19,4 +22,5 @@ async def main(): else: print("Failed:", res.error) + asyncio.run(main()) diff --git a/examples/search/search_with_extraction.py b/examples/search/search_with_extraction.py index 5bb043c6..f1c43ded 100644 --- a/examples/search/search_with_extraction.py +++ b/examples/search/search_with_extraction.py @@ -1,13 +1,15 @@ from dotenv import load_dotenv + load_dotenv() import json -from scrapegraph_py import ScrapeGraphAI, SearchRequest + +from scrapegraph_py import ScrapeGraphAI sgai = ScrapeGraphAI() -res = sgai.search(SearchRequest( - query="best programming languages 2024", +res = sgai.search( + "best programming languages 2024", num_results=3, prompt="Summarize the top programming languages mentioned and why they are recommended", schema={ @@ -25,7 +27,7 @@ }, }, }, -)) +) if res.status == "success": print("=== Search Results ===") diff --git a/examples/search/search_with_extraction_async.py b/examples/search/search_with_extraction_async.py index fc0487d8..ff8e9cb4 100644 --- a/examples/search/search_with_extraction_async.py +++ b/examples/search/search_with_extraction_async.py @@ -1,14 +1,17 @@ from dotenv import load_dotenv + load_dotenv() import asyncio import json -from scrapegraph_py import AsyncScrapeGraphAI, SearchRequest + +from scrapegraph_py import AsyncScrapeGraphAI + async def main(): async with AsyncScrapeGraphAI() as sgai: - res = await sgai.search(SearchRequest( - query="best programming languages 2024", + res = await sgai.search( + "best programming languages 2024", num_results=3, prompt="Summarize the top programming languages mentioned and why they are recommended", schema={ @@ -26,7 +29,7 @@ async def main(): }, }, }, - )) + ) if res.status == "success": print("=== Search Results ===") @@ -39,4 +42,5 @@ async def main(): else: print("Failed:", res.error) + asyncio.run(main()) diff --git a/examples/utilities/credits.py b/examples/utilities/credits.py index d240c573..dc4b8413 100644 --- a/examples/utilities/credits.py +++ b/examples/utilities/credits.py @@ -1,4 +1,5 @@ from dotenv import load_dotenv + load_dotenv() from scrapegraph_py import ScrapeGraphAI diff --git a/examples/utilities/credits_async.py b/examples/utilities/credits_async.py index 1bcec401..eb3d0c54 100644 --- a/examples/utilities/credits_async.py +++ b/examples/utilities/credits_async.py @@ -1,9 +1,12 @@ from dotenv import load_dotenv + load_dotenv() import asyncio + from scrapegraph_py import AsyncScrapeGraphAI + async def main(): async with AsyncScrapeGraphAI() as sgai: res = await sgai.credits() @@ -18,4 +21,5 @@ async def main(): else: print("Failed:", res.error) + asyncio.run(main()) diff --git a/examples/utilities/health.py b/examples/utilities/health.py index e723b1b9..fce043f0 100644 --- a/examples/utilities/health.py +++ b/examples/utilities/health.py @@ -1,4 +1,5 @@ from dotenv import load_dotenv + load_dotenv() from scrapegraph_py import ScrapeGraphAI diff --git a/examples/utilities/health_async.py b/examples/utilities/health_async.py index f29678ef..7b6ecb1f 
100644 --- a/examples/utilities/health_async.py +++ b/examples/utilities/health_async.py @@ -1,9 +1,12 @@ from dotenv import load_dotenv + load_dotenv() import asyncio + from scrapegraph_py import AsyncScrapeGraphAI + async def main(): async with AsyncScrapeGraphAI() as sgai: res = await sgai.health() @@ -18,4 +21,5 @@ async def main(): else: print("Failed:", res.error) + asyncio.run(main()) diff --git a/examples/utilities/history.py b/examples/utilities/history.py index cd91e3c7..2755b4b7 100644 --- a/examples/utilities/history.py +++ b/examples/utilities/history.py @@ -1,11 +1,12 @@ from dotenv import load_dotenv + load_dotenv() -from scrapegraph_py import ScrapeGraphAI, HistoryFilter +from scrapegraph_py import ScrapeGraphAI sgai = ScrapeGraphAI() -res = sgai.history.list(HistoryFilter(limit=5)) +res = sgai.history.list(limit=5) if res.status == "success": data = res.data diff --git a/examples/utilities/history_async.py b/examples/utilities/history_async.py index 8fc7f284..a6a367f9 100644 --- a/examples/utilities/history_async.py +++ b/examples/utilities/history_async.py @@ -1,17 +1,22 @@ from dotenv import load_dotenv + load_dotenv() import asyncio -from scrapegraph_py import AsyncScrapeGraphAI, HistoryFilter + +from scrapegraph_py import AsyncScrapeGraphAI + async def main(): async with AsyncScrapeGraphAI() as sgai: - res = await sgai.history.list(HistoryFilter(limit=5)) + res = await sgai.history.list(limit=5) if res.status == "success": data = res.data print(f"Total: {data.pagination.total}") - print(f"Page: {data.pagination.page} / {(data.pagination.total // data.pagination.limit) + 1}") + print( + f"Page: {data.pagination.page} / {(data.pagination.total // data.pagination.limit) + 1}" + ) for entry in data.data: print(f"\n ID: {entry.id}") @@ -22,4 +27,5 @@ async def main(): else: print("Failed:", res.error) + asyncio.run(main()) diff --git a/src/scrapegraph_py/async_client.py b/src/scrapegraph_py/async_client.py index 720d0742..2fc8dedd 100644 --- a/src/scrapegraph_py/async_client.py +++ b/src/scrapegraph_py/async_client.py @@ -6,18 +6,24 @@ import sys import time from datetime import datetime +from typing import Literal import httpx from pydantic import BaseModel, TypeAdapter from .env import env from .schemas import ( + ApiFetchContentType, + ApiHtmlMode, ApiResult, + ApiService, + ApiTimeRange, CrawlRequest, CrawlResponse, CreditsResponse, ExtractRequest, ExtractResponse, + FetchConfig, HealthResponse, HistoryEntry, HistoryFilter, @@ -27,6 +33,7 @@ MonitorCreateRequest, MonitorResponse, MonitorUpdateRequest, + ScrapeFormatEntry, ScrapeRequest, ScrapeResponse, SearchRequest, @@ -66,12 +73,45 @@ def _serialize(model: BaseModel) -> dict: return model.model_dump(mode="json", exclude_none=True, by_alias=True) +# Strips None kwargs so Pydantic fields with non-None defaults (formats, max_depth, etc.) +# fall back to their defaults instead of raising a ValidationError on None. 
+def _compact(**kwargs) -> dict: + return {k: v for k, v in kwargs.items() if v is not None} + + class AsyncCrawlResource: def __init__(self, client: AsyncScrapeGraphAI): self._client = client - async def start(self, params: CrawlRequest) -> ApiResult[CrawlResponse]: - return await self._client._post("/crawl", params, CrawlResponse) + async def start( + self, + url: str, + *, + formats: list[ScrapeFormatEntry] | None = None, + max_depth: int | None = None, + max_pages: int | None = None, + max_links_per_page: int | None = None, + allow_external: bool | None = None, + include_patterns: list[str] | None = None, + exclude_patterns: list[str] | None = None, + content_types: list[ApiFetchContentType] | None = None, + fetch_config: FetchConfig | None = None, + ) -> ApiResult[CrawlResponse]: + req = CrawlRequest( + **_compact( + url=url, + formats=formats, + max_depth=max_depth, + max_pages=max_pages, + max_links_per_page=max_links_per_page, + allow_external=allow_external, + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + content_types=content_types, + fetch_config=fetch_config, + ) + ) + return await self._client._post("/crawl", req, CrawlResponse) async def get(self, id: str) -> ApiResult[CrawlResponse]: return await self._client._get(f"/crawl/{id}", CrawlResponse) @@ -90,8 +130,27 @@ class AsyncMonitorResource: def __init__(self, client: AsyncScrapeGraphAI): self._client = client - async def create(self, params: MonitorCreateRequest) -> ApiResult[MonitorResponse]: - return await self._client._post("/monitor", params, MonitorResponse) + async def create( + self, + url: str, + interval: str, + *, + name: str | None = None, + formats: list[ScrapeFormatEntry] | None = None, + webhook_url: str | None = None, + fetch_config: FetchConfig | None = None, + ) -> ApiResult[MonitorResponse]: + req = MonitorCreateRequest( + **_compact( + url=url, + interval=interval, + name=name, + formats=formats, + webhook_url=webhook_url, + fetch_config=fetch_config, + ) + ) + return await self._client._post("/monitor", req, MonitorResponse) async def list(self) -> ApiResult[list[MonitorResponse]]: return await self._client._get("/monitor", list[MonitorResponse]) @@ -99,8 +158,26 @@ async def list(self) -> ApiResult[list[MonitorResponse]]: async def get(self, id: str) -> ApiResult[MonitorResponse]: return await self._client._get(f"/monitor/{id}", MonitorResponse) - async def update(self, id: str, params: MonitorUpdateRequest) -> ApiResult[MonitorResponse]: - return await self._client._patch(f"/monitor/{id}", params, MonitorResponse) + async def update( + self, + id: str, + *, + name: str | None = None, + formats: list[ScrapeFormatEntry] | None = None, + webhook_url: str | None = None, + interval: str | None = None, + fetch_config: FetchConfig | None = None, + ) -> ApiResult[MonitorResponse]: + req = MonitorUpdateRequest( + **_compact( + name=name, + formats=formats, + webhook_url=webhook_url, + interval=interval, + fetch_config=fetch_config, + ) + ) + return await self._client._patch(f"/monitor/{id}", req, MonitorResponse) async def delete(self, id: str) -> ApiResult[dict]: return await self._client._delete(f"/monitor/{id}") @@ -112,26 +189,48 @@ async def resume(self, id: str) -> ApiResult[MonitorResponse]: return await self._client._post_empty(f"/monitor/{id}/resume", MonitorResponse) async def activity( - self, id: str, params: MonitorActivityRequest | None = None + self, + id: str, + *, + limit: int | None = None, + cursor: str | None = None, ) -> ApiResult[MonitorActivityResponse]: - p = 
params.model_dump(by_alias=True, exclude_none=True) if params else None - return await self._client._get(f"/monitor/{id}/activity", MonitorActivityResponse, params=p) + kwargs = _compact(limit=limit, cursor=cursor) + qs = ( + MonitorActivityRequest(**kwargs).model_dump( + by_alias=True, exclude_none=True, exclude_defaults=True + ) + if kwargs + else None + ) + return await self._client._get( + f"/monitor/{id}/activity", MonitorActivityResponse, params=qs or None + ) class AsyncHistoryResource: def __init__(self, client: AsyncScrapeGraphAI): self._client = client - async def list(self, params: HistoryFilter | None = None) -> ApiResult[HistoryPage]: - qs = {} - if params: - if params.page: - qs["page"] = str(params.page) - if params.limit: - qs["limit"] = str(params.limit) - if params.service: - qs["service"] = params.service - return await self._client._get("/history", HistoryPage, params=qs if qs else None) + async def list( + self, + *, + page: int | None = None, + limit: int | None = None, + service: ApiService | None = None, + ) -> ApiResult[HistoryPage]: + kwargs = _compact(page=page, limit=limit, service=service) + if not kwargs: + return await self._client._get("/history", HistoryPage) + params = HistoryFilter(**kwargs) + qs: dict[str, str] = {} + if page is not None: + qs["page"] = str(params.page) + if limit is not None: + qs["limit"] = str(params.limit) + if service is not None: + qs["service"] = params.service + return await self._client._get("/history", HistoryPage, params=qs or None) async def get(self, id: str) -> ApiResult[HistoryEntry]: return await self._client._get(f"/history/{id}", HistoryEntry) @@ -227,14 +326,77 @@ async def _patch[T](self, path: str, body: BaseModel, response_type: type[T]) -> async def _delete(self, path: str) -> ApiResult[dict]: return await self._request("DELETE", path, dict) - async def scrape(self, params: ScrapeRequest) -> ApiResult[ScrapeResponse]: - return await self._post("/scrape", params, ScrapeResponse) + async def scrape( + self, + url: str, + *, + formats: list[ScrapeFormatEntry] | None = None, + fetch_config: FetchConfig | None = None, + content_type: ApiFetchContentType | None = None, + ) -> ApiResult[ScrapeResponse]: + req = ScrapeRequest( + **_compact( + url=url, + formats=formats, + fetch_config=fetch_config, + content_type=content_type, + ) + ) + return await self._post("/scrape", req, ScrapeResponse) - async def extract(self, params: ExtractRequest) -> ApiResult[ExtractResponse]: - return await self._post("/extract", params, ExtractResponse) + async def extract( + self, + prompt: str, + *, + url: str | None = None, + html: str | None = None, + markdown: str | None = None, + schema: dict[str, object] | None = None, + mode: ApiHtmlMode | None = None, + fetch_config: FetchConfig | None = None, + content_type: ApiFetchContentType | None = None, + ) -> ApiResult[ExtractResponse]: + req = ExtractRequest( + **_compact( + prompt=prompt, + url=url, + html=html, + markdown=markdown, + schema=schema, + mode=mode, + fetch_config=fetch_config, + content_type=content_type, + ) + ) + return await self._post("/extract", req, ExtractResponse) - async def search(self, params: SearchRequest) -> ApiResult[SearchResponse]: - return await self._post("/search", params, SearchResponse) + async def search( + self, + query: str, + *, + num_results: int | None = None, + format: Literal["html", "markdown"] | None = None, + mode: ApiHtmlMode | None = None, + prompt: str | None = None, + schema: dict[str, object] | None = None, + fetch_config: FetchConfig | 
None = None, + location_geo_code: str | None = None, + time_range: ApiTimeRange | None = None, + ) -> ApiResult[SearchResponse]: + req = SearchRequest( + **_compact( + query=query, + num_results=num_results, + format=format, + mode=mode, + prompt=prompt, + schema=schema, + fetch_config=fetch_config, + location_geo_code=location_geo_code, + time_range=time_range, + ) + ) + return await self._post("/search", req, SearchResponse) async def credits(self) -> ApiResult[CreditsResponse]: return await self._get("/credits", CreditsResponse) diff --git a/src/scrapegraph_py/client.py b/src/scrapegraph_py/client.py index d1c7c95c..532e11c8 100644 --- a/src/scrapegraph_py/client.py +++ b/src/scrapegraph_py/client.py @@ -6,18 +6,24 @@ import sys import time from datetime import datetime +from typing import Literal import httpx from pydantic import BaseModel, TypeAdapter from .env import env from .schemas import ( + ApiFetchContentType, + ApiHtmlMode, ApiResult, + ApiService, + ApiTimeRange, CrawlRequest, CrawlResponse, CreditsResponse, ExtractRequest, ExtractResponse, + FetchConfig, HealthResponse, HistoryEntry, HistoryFilter, @@ -27,6 +33,7 @@ MonitorCreateRequest, MonitorResponse, MonitorUpdateRequest, + ScrapeFormatEntry, ScrapeRequest, ScrapeResponse, SearchRequest, @@ -66,12 +73,45 @@ def _serialize(model: BaseModel) -> dict: return model.model_dump(mode="json", exclude_none=True, by_alias=True) +# Strips None kwargs so Pydantic fields with non-None defaults (formats, max_depth, etc.) +# fall back to their defaults instead of raising a ValidationError on None. +def _compact(**kwargs) -> dict: + return {k: v for k, v in kwargs.items() if v is not None} + + class CrawlResource: def __init__(self, client: ScrapeGraphAI): self._client = client - def start(self, params: CrawlRequest) -> ApiResult[CrawlResponse]: - return self._client._post("/crawl", params, CrawlResponse) + def start( + self, + url: str, + *, + formats: list[ScrapeFormatEntry] | None = None, + max_depth: int | None = None, + max_pages: int | None = None, + max_links_per_page: int | None = None, + allow_external: bool | None = None, + include_patterns: list[str] | None = None, + exclude_patterns: list[str] | None = None, + content_types: list[ApiFetchContentType] | None = None, + fetch_config: FetchConfig | None = None, + ) -> ApiResult[CrawlResponse]: + req = CrawlRequest( + **_compact( + url=url, + formats=formats, + max_depth=max_depth, + max_pages=max_pages, + max_links_per_page=max_links_per_page, + allow_external=allow_external, + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + content_types=content_types, + fetch_config=fetch_config, + ) + ) + return self._client._post("/crawl", req, CrawlResponse) def get(self, id: str) -> ApiResult[CrawlResponse]: return self._client._get(f"/crawl/{id}", CrawlResponse) @@ -90,8 +130,27 @@ class MonitorResource: def __init__(self, client: ScrapeGraphAI): self._client = client - def create(self, params: MonitorCreateRequest) -> ApiResult[MonitorResponse]: - return self._client._post("/monitor", params, MonitorResponse) + def create( + self, + url: str, + interval: str, + *, + name: str | None = None, + formats: list[ScrapeFormatEntry] | None = None, + webhook_url: str | None = None, + fetch_config: FetchConfig | None = None, + ) -> ApiResult[MonitorResponse]: + req = MonitorCreateRequest( + **_compact( + url=url, + interval=interval, + name=name, + formats=formats, + webhook_url=webhook_url, + fetch_config=fetch_config, + ) + ) + return self._client._post("/monitor", req, 
MonitorResponse) def list(self) -> ApiResult[list[MonitorResponse]]: return self._client._get("/monitor", list[MonitorResponse]) @@ -99,8 +158,26 @@ def list(self) -> ApiResult[list[MonitorResponse]]: def get(self, id: str) -> ApiResult[MonitorResponse]: return self._client._get(f"/monitor/{id}", MonitorResponse) - def update(self, id: str, params: MonitorUpdateRequest) -> ApiResult[MonitorResponse]: - return self._client._patch(f"/monitor/{id}", params, MonitorResponse) + def update( + self, + id: str, + *, + name: str | None = None, + formats: list[ScrapeFormatEntry] | None = None, + webhook_url: str | None = None, + interval: str | None = None, + fetch_config: FetchConfig | None = None, + ) -> ApiResult[MonitorResponse]: + req = MonitorUpdateRequest( + **_compact( + name=name, + formats=formats, + webhook_url=webhook_url, + interval=interval, + fetch_config=fetch_config, + ) + ) + return self._client._patch(f"/monitor/{id}", req, MonitorResponse) def delete(self, id: str) -> ApiResult[dict]: return self._client._delete(f"/monitor/{id}") @@ -112,26 +189,48 @@ def resume(self, id: str) -> ApiResult[MonitorResponse]: return self._client._post_empty(f"/monitor/{id}/resume", MonitorResponse) def activity( - self, id: str, params: MonitorActivityRequest | None = None + self, + id: str, + *, + limit: int | None = None, + cursor: str | None = None, ) -> ApiResult[MonitorActivityResponse]: - p = params.model_dump(by_alias=True, exclude_none=True) if params else None - return self._client._get(f"/monitor/{id}/activity", MonitorActivityResponse, params=p) + kwargs = _compact(limit=limit, cursor=cursor) + qs = ( + MonitorActivityRequest(**kwargs).model_dump( + by_alias=True, exclude_none=True, exclude_defaults=True + ) + if kwargs + else None + ) + return self._client._get( + f"/monitor/{id}/activity", MonitorActivityResponse, params=qs or None + ) class HistoryResource: def __init__(self, client: ScrapeGraphAI): self._client = client - def list(self, params: HistoryFilter | None = None) -> ApiResult[HistoryPage]: - qs = {} - if params: - if params.page: - qs["page"] = str(params.page) - if params.limit: - qs["limit"] = str(params.limit) - if params.service: - qs["service"] = params.service - return self._client._get("/history", HistoryPage, params=qs if qs else None) + def list( + self, + *, + page: int | None = None, + limit: int | None = None, + service: ApiService | None = None, + ) -> ApiResult[HistoryPage]: + kwargs = _compact(page=page, limit=limit, service=service) + if not kwargs: + return self._client._get("/history", HistoryPage) + params = HistoryFilter(**kwargs) + qs: dict[str, str] = {} + if page is not None: + qs["page"] = str(params.page) + if limit is not None: + qs["limit"] = str(params.limit) + if service is not None: + qs["service"] = params.service + return self._client._get("/history", HistoryPage, params=qs or None) def get(self, id: str) -> ApiResult[HistoryEntry]: return self._client._get(f"/history/{id}", HistoryEntry) @@ -227,14 +326,77 @@ def _patch[T](self, path: str, body: BaseModel, response_type: type[T]) -> ApiRe def _delete(self, path: str) -> ApiResult[dict]: return self._request("DELETE", path, dict) - def scrape(self, params: ScrapeRequest) -> ApiResult[ScrapeResponse]: - return self._post("/scrape", params, ScrapeResponse) + def scrape( + self, + url: str, + *, + formats: list[ScrapeFormatEntry] | None = None, + fetch_config: FetchConfig | None = None, + content_type: ApiFetchContentType | None = None, + ) -> ApiResult[ScrapeResponse]: + req = ScrapeRequest( + 
**_compact( + url=url, + formats=formats, + fetch_config=fetch_config, + content_type=content_type, + ) + ) + return self._post("/scrape", req, ScrapeResponse) - def extract(self, params: ExtractRequest) -> ApiResult[ExtractResponse]: - return self._post("/extract", params, ExtractResponse) + def extract( + self, + prompt: str, + *, + url: str | None = None, + html: str | None = None, + markdown: str | None = None, + schema: dict[str, object] | None = None, + mode: ApiHtmlMode | None = None, + fetch_config: FetchConfig | None = None, + content_type: ApiFetchContentType | None = None, + ) -> ApiResult[ExtractResponse]: + req = ExtractRequest( + **_compact( + prompt=prompt, + url=url, + html=html, + markdown=markdown, + schema=schema, + mode=mode, + fetch_config=fetch_config, + content_type=content_type, + ) + ) + return self._post("/extract", req, ExtractResponse) - def search(self, params: SearchRequest) -> ApiResult[SearchResponse]: - return self._post("/search", params, SearchResponse) + def search( + self, + query: str, + *, + num_results: int | None = None, + format: Literal["html", "markdown"] | None = None, + mode: ApiHtmlMode | None = None, + prompt: str | None = None, + schema: dict[str, object] | None = None, + fetch_config: FetchConfig | None = None, + location_geo_code: str | None = None, + time_range: ApiTimeRange | None = None, + ) -> ApiResult[SearchResponse]: + req = SearchRequest( + **_compact( + query=query, + num_results=num_results, + format=format, + mode=mode, + prompt=prompt, + schema=schema, + fetch_config=fetch_config, + location_geo_code=location_geo_code, + time_range=time_range, + ) + ) + return self._post("/search", req, SearchResponse) def credits(self) -> ApiResult[CreditsResponse]: return self._get("/credits", CreditsResponse) diff --git a/tests/test_client.py b/tests/test_client.py index 6c7a8849..b6d1c1e3 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -4,20 +4,14 @@ import pytest from scrapegraph_py import ( - CrawlRequest, - ExtractRequest, FetchConfig, - HistoryFilter, HtmlFormatConfig, ImagesFormatConfig, JsonFormatConfig, LinksFormatConfig, MarkdownFormatConfig, - MonitorCreateRequest, ScrapeGraphAI, - ScrapeRequest, ScreenshotFormatConfig, - SearchRequest, ) API_KEY = "test-sgai-key" @@ -41,7 +35,7 @@ def test_success(self): } with patch.object(httpx.Client, "request", return_value=mock_response(body)) as mock: sgai = ScrapeGraphAI(api_key=API_KEY) - res = sgai.scrape(ScrapeRequest(url="https://example.com")) + res = sgai.scrape("https://example.com") assert res.status == "success" assert res.data.results == body["results"] @@ -60,17 +54,15 @@ def test_with_fetch_config_js_mode(self): with patch.object(httpx.Client, "request", return_value=mock_response(body)) as mock: sgai = ScrapeGraphAI(api_key=API_KEY) res = sgai.scrape( - ScrapeRequest( - url="https://example.com", - fetch_config=FetchConfig( - mode="js", - stealth=True, - timeout=45000, - wait=2000, - scrolls=3, - ), - formats=[MarkdownFormatConfig()], - ) + "https://example.com", + fetch_config=FetchConfig( + mode="js", + stealth=True, + timeout=45000, + wait=2000, + scrolls=3, + ), + formats=[MarkdownFormatConfig()], ) assert res.status == "success" @@ -87,15 +79,13 @@ def test_with_fetch_config_headers_cookies(self): with patch.object(httpx.Client, "request", return_value=mock_response(body)) as mock: sgai = ScrapeGraphAI(api_key=API_KEY) res = sgai.scrape( - ScrapeRequest( - url="https://example.com", - fetch_config=FetchConfig( - mode="fast", - headers={"X-Custom": "test"}, 
- cookies={"session": "abc123"}, - ), - formats=[HtmlFormatConfig()], - ) + "https://example.com", + fetch_config=FetchConfig( + mode="fast", + headers={"X-Custom": "test"}, + cookies={"session": "abc123"}, + ), + formats=[HtmlFormatConfig()], ) assert res.status == "success" @@ -113,18 +103,16 @@ def test_multiple_formats(self): }, "metadata": {"contentType": "text/html"}, } - with patch.object(httpx.Client, "request", return_value=mock_response(body)) as mock: + with patch.object(httpx.Client, "request", return_value=mock_response(body)): sgai = ScrapeGraphAI(api_key=API_KEY) res = sgai.scrape( - ScrapeRequest( - url="https://example.com", - formats=[ - MarkdownFormatConfig(mode="reader"), - HtmlFormatConfig(mode="prune"), - LinksFormatConfig(), - ImagesFormatConfig(), - ], - ) + "https://example.com", + formats=[ + MarkdownFormatConfig(mode="reader"), + HtmlFormatConfig(mode="prune"), + LinksFormatConfig(), + ImagesFormatConfig(), + ], ) assert res.status == "success" @@ -141,15 +129,13 @@ def test_json_format_with_schema(self): with patch.object(httpx.Client, "request", return_value=mock_response(body)) as mock: sgai = ScrapeGraphAI(api_key=API_KEY) res = sgai.scrape( - ScrapeRequest( - url="https://example.com", - formats=[ - JsonFormatConfig( - prompt="Extract product info", - schema={"type": "object", "properties": {"title": {"type": "string"}}}, - ), - ], - ) + "https://example.com", + formats=[ + JsonFormatConfig( + prompt="Extract product info", + schema={"type": "object", "properties": {"title": {"type": "string"}}}, + ), + ], ) assert res.status == "success" @@ -165,10 +151,8 @@ def test_screenshot_format(self): with patch.object(httpx.Client, "request", return_value=mock_response(body)) as mock: sgai = ScrapeGraphAI(api_key=API_KEY) res = sgai.scrape( - ScrapeRequest( - url="https://example.com", - formats=[ScreenshotFormatConfig(full_page=True, width=1920, height=1080)], - ) + "https://example.com", + formats=[ScreenshotFormatConfig(full_page=True, width=1920, height=1080)], ) assert res.status == "success" @@ -181,7 +165,7 @@ def test_http_401_error(self): httpx.Client, "request", return_value=mock_response({"detail": "Invalid key"}, 401) ): sgai = ScrapeGraphAI(api_key=API_KEY) - res = sgai.scrape(ScrapeRequest(url="https://example.com")) + res = sgai.scrape("https://example.com") assert res.status == "error" assert "Invalid or missing API key" in res.error @@ -189,7 +173,7 @@ def test_http_401_error(self): def test_http_402_error(self): with patch.object(httpx.Client, "request", return_value=mock_response({}, 402)): sgai = ScrapeGraphAI(api_key=API_KEY) - res = sgai.scrape(ScrapeRequest(url="https://example.com")) + res = sgai.scrape("https://example.com") assert res.status == "error" assert "Insufficient credits" in res.error @@ -197,7 +181,7 @@ def test_http_402_error(self): def test_http_429_error(self): with patch.object(httpx.Client, "request", return_value=mock_response({}, 429)): sgai = ScrapeGraphAI(api_key=API_KEY) - res = sgai.scrape(ScrapeRequest(url="https://example.com")) + res = sgai.scrape("https://example.com") assert res.status == "error" assert "Rate limited" in res.error @@ -205,7 +189,7 @@ def test_http_429_error(self): def test_timeout_error(self): with patch.object(httpx.Client, "request", side_effect=httpx.TimeoutException("timeout")): sgai = ScrapeGraphAI(api_key=API_KEY) - res = sgai.scrape(ScrapeRequest(url="https://example.com")) + res = sgai.scrape("https://example.com") assert res.status == "error" assert "timed out" in res.error @@ -219,13 
+203,11 @@ def test_success(self): "usage": {"promptTokens": 100, "completionTokens": 50}, "metadata": {}, } - with patch.object(httpx.Client, "request", return_value=mock_response(body)) as mock: + with patch.object(httpx.Client, "request", return_value=mock_response(body)): sgai = ScrapeGraphAI(api_key=API_KEY) res = sgai.extract( - ExtractRequest( - url="https://example.com", - prompt="What is this page about?", - ) + prompt="What is this page about?", + url="https://example.com", ) assert res.status == "success" @@ -241,11 +223,9 @@ def test_with_schema(self): with patch.object(httpx.Client, "request", return_value=mock_response(body)) as mock: sgai = ScrapeGraphAI(api_key=API_KEY) res = sgai.extract( - ExtractRequest( - url="https://example.com", - prompt="Extract data", - schema={"type": "object"}, - ) + prompt="Extract data", + url="https://example.com", + schema={"type": "object"}, ) assert res.status == "success" @@ -261,7 +241,7 @@ def test_success(self): } with patch.object(httpx.Client, "request", return_value=mock_response(body)): sgai = ScrapeGraphAI(api_key=API_KEY) - res = sgai.search(SearchRequest(query="test query", num_results=5)) + res = sgai.search("test query", num_results=5) assert res.status == "success" assert len(res.data.results) == 1 @@ -274,12 +254,7 @@ def test_with_extraction(self): } with patch.object(httpx.Client, "request", return_value=mock_response(body)) as mock: sgai = ScrapeGraphAI(api_key=API_KEY) - res = sgai.search( - SearchRequest( - query="test", - prompt="Summarize results", - ) - ) + res = sgai.search("test", prompt="Summarize results") assert res.status == "success" _, kwargs = mock.call_args @@ -292,11 +267,9 @@ def test_start(self): with patch.object(httpx.Client, "request", return_value=mock_response(body)) as mock: sgai = ScrapeGraphAI(api_key=API_KEY) res = sgai.crawl.start( - CrawlRequest( - url="https://example.com", - max_pages=10, - max_depth=2, - ) + "https://example.com", + max_pages=10, + max_depth=2, ) assert res.status == "success" @@ -338,14 +311,12 @@ def test_create(self): "createdAt": "2024-01-01T00:00:00Z", "updatedAt": "2024-01-01T00:00:00Z", } - with patch.object(httpx.Client, "request", return_value=mock_response(body)) as mock: + with patch.object(httpx.Client, "request", return_value=mock_response(body)): sgai = ScrapeGraphAI(api_key=API_KEY) res = sgai.monitor.create( - MonitorCreateRequest( - url="https://example.com", - name="Test Monitor", - interval="0 * * * *", - ) + "https://example.com", + "0 * * * *", + name="Test Monitor", ) assert res.status == "success" @@ -399,9 +370,9 @@ def test_list(self): ], "pagination": {"page": 1, "limit": 20, "total": 100}, } - with patch.object(httpx.Client, "request", return_value=mock_response(body)) as mock: + with patch.object(httpx.Client, "request", return_value=mock_response(body)): sgai = ScrapeGraphAI(api_key=API_KEY) - res = sgai.history.list(HistoryFilter(limit=5, service="scrape")) + res = sgai.history.list(limit=5, service="scrape") assert res.status == "success" assert res.data.pagination.total == 100 @@ -471,14 +442,9 @@ def test_snake_to_camel(self): with patch.object(httpx.Client, "request", return_value=mock_response({})) as mock: sgai = ScrapeGraphAI(api_key=API_KEY) sgai.scrape( - ScrapeRequest( - url="https://example.com", - content_type="application/pdf", - fetch_config=FetchConfig( - mode="js", - timeout=30000, - ), - ) + "https://example.com", + content_type="application/pdf", + fetch_config=FetchConfig(mode="js", timeout=30000), ) _, kwargs = mock.call_args 
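Aside: the `_compact` helper that every rewritten public method funnels its kwargs through is used above but never defined in these hunks. Its call sites — e.g. `ScrapeRequest(**_compact(url=url, formats=formats, ...))`, where omitted arguments arrive as `None` but the Pydantic models should still apply their own field defaults — pin down the behavior; a minimal sketch consistent with them would be:

```python
def _compact(**kwargs: object) -> dict[str, object]:
    """Drop None-valued kwargs so omitted arguments fall through to the
    Pydantic field defaults (e.g. ScrapeRequest's default formats list)."""
    return {key: value for key, value in kwargs.items() if value is not None}
```

This also fits the `if not kwargs:` guard after `_compact(page=page, limit=limit, service=service)` in `history.list` later in the series: with no filters supplied, the helper returns an empty dict, which the method can special-case.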
diff --git a/tests/test_integration.py b/tests/test_integration.py index c1686b84..14fbf91a 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -9,16 +9,11 @@ pytest.skip("SGAI_API_KEY env var required for integration tests", allow_module_level=True) from scrapegraph_py import ( - CrawlRequest, - ExtractRequest, FetchConfig, - HistoryFilter, ImagesFormatConfig, LinksFormatConfig, MarkdownFormatConfig, ScrapeGraphAI, - ScrapeRequest, - SearchRequest, ) sgai = ScrapeGraphAI() @@ -33,17 +28,15 @@ def test_credits(self): assert "plan" in res.data def test_scrape_default_format(self): - res = sgai.scrape(ScrapeRequest(url="https://example.com")) + res = sgai.scrape("https://example.com") print("scrape default:", res.status, res.error) assert res.status == "success" assert res.data["results"].get("markdown") is not None def test_scrape_single_format(self): res = sgai.scrape( - ScrapeRequest( - url="https://example.com", - formats=[MarkdownFormatConfig()], - ) + "https://example.com", + formats=[MarkdownFormatConfig()], ) print("scrape single:", res.status, res.error) assert res.status == "success" @@ -51,14 +44,12 @@ def test_scrape_single_format(self): def test_scrape_multiple_formats(self): res = sgai.scrape( - ScrapeRequest( - url="https://example.com", - formats=[ - MarkdownFormatConfig(mode="reader"), - LinksFormatConfig(), - ImagesFormatConfig(), - ], - ) + "https://example.com", + formats=[ + MarkdownFormatConfig(mode="reader"), + LinksFormatConfig(), + ImagesFormatConfig(), + ], ) print("scrape multi:", res.status, res.error) assert res.status == "success" @@ -67,11 +58,9 @@ def test_scrape_multiple_formats(self): def test_scrape_pdf(self): res = sgai.scrape( - ScrapeRequest( - url="https://pdfobject.com/pdf/sample.pdf", - content_type="application/pdf", - formats=[MarkdownFormatConfig()], - ) + "https://pdfobject.com/pdf/sample.pdf", + content_type="application/pdf", + formats=[MarkdownFormatConfig()], ) print("scrape PDF:", res.status, res.error) assert res.status == "success" @@ -79,48 +68,34 @@ def test_scrape_pdf(self): def test_scrape_with_fetch_config(self): res = sgai.scrape( - ScrapeRequest( - url="https://example.com", - fetch_config=FetchConfig(mode="fast", timeout=15000), - formats=[MarkdownFormatConfig()], - ) + "https://example.com", + fetch_config=FetchConfig(mode="fast", timeout=15000), + formats=[MarkdownFormatConfig()], ) print("scrape fetchConfig:", res.status, res.error) assert res.status == "success" def test_extract(self): res = sgai.extract( - ExtractRequest( - url="https://example.com", - prompt="What is this page about?", - ) + prompt="What is this page about?", + url="https://example.com", ) print("extract:", res.status, res.error) assert res.status == "success" def test_search(self): - res = sgai.search( - SearchRequest( - query="anthropic claude", - num_results=2, - ) - ) + res = sgai.search("anthropic claude", num_results=2) print("search:", res.status, res.error) assert res.status == "success" assert len(res.data["results"]) > 0 def test_history_list(self): - res = sgai.history.list(HistoryFilter(limit=5)) + res = sgai.history.list(limit=5) print("history.list:", res.status, res.data.get("pagination") if res.data else None) assert res.status == "success" def test_crawl_start_and_get(self): - start_res = sgai.crawl.start( - CrawlRequest( - url="https://example.com", - max_pages=2, - ) - ) + start_res = sgai.crawl.start("https://example.com", max_pages=2) print( "crawl.start:", start_res.status, diff --git a/uv.lock b/uv.lock index 
7151348f..9243100d 100644 --- a/uv.lock +++ b/uv.lock @@ -279,7 +279,7 @@ wheels = [ [[package]] name = "scrapegraph-py" -version = "2.0.0" +version = "2.0.1" source = { editable = "." } dependencies = [ { name = "httpx" }, From 35e0ba3c15198cef75e71e60e38b63b5c91d94eb Mon Sep 17 00:00:00 2001 From: FrancescoSaverioZuppichini Date: Tue, 21 Apr 2026 12:47:20 +0200 Subject: [PATCH 2/6] chore(release): 2.1.0 Bump minor for new kwargs-based public API (see #88). Co-Authored-By: Claude Opus 4.7 (1M context) --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 74580b04..5c37ab66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraph-py" -version = "2.0.1" +version = "2.1.0" description = "Official Python SDK for ScrapeGraph AI API" readme = "README.md" license = "MIT" diff --git a/uv.lock b/uv.lock index 9243100d..f0bfaeb0 100644 --- a/uv.lock +++ b/uv.lock @@ -279,7 +279,7 @@ wheels = [ [[package]] name = "scrapegraph-py" -version = "2.0.1" +version = "2.1.0" source = { editable = "." } dependencies = [ { name = "httpx" }, From f4ff5fdbd0ebd4a8270e1d8232c7f1b9f6f43845 Mon Sep 17 00:00:00 2001 From: FrancescoSaverioZuppichini Date: Tue, 21 Apr 2026 14:21:01 +0200 Subject: [PATCH 3/6] refactor: rename ScrapeFormatEntry -> FormatConfig The union type was awkwardly named. FormatConfig matches the naming of its members (MarkdownFormatConfig, HtmlFormatConfig, ...) and reads naturally in signatures: formats: list[FormatConfig]. Also exported from the top-level package so users can type-annotate their own code. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/scrapegraph_py/__init__.py | 2 ++ src/scrapegraph_py/async_client.py | 10 +++++----- src/scrapegraph_py/client.py | 10 +++++----- src/scrapegraph_py/schemas.py | 10 +++++----- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/scrapegraph_py/__init__.py b/src/scrapegraph_py/__init__.py index 6f6894dd..af023119 100644 --- a/src/scrapegraph_py/__init__.py +++ b/src/scrapegraph_py/__init__.py @@ -10,6 +10,7 @@ ExtractRequest, ExtractResponse, FetchConfig, + FormatConfig, HealthResponse, HistoryEntry, HistoryFilter, @@ -66,6 +67,7 @@ "HealthResponse", "TokenUsage", "FetchConfig", + "FormatConfig", "MarkdownFormatConfig", "HtmlFormatConfig", "ScreenshotFormatConfig", diff --git a/src/scrapegraph_py/async_client.py b/src/scrapegraph_py/async_client.py index 2fc8dedd..4d88b0f2 100644 --- a/src/scrapegraph_py/async_client.py +++ b/src/scrapegraph_py/async_client.py @@ -24,6 +24,7 @@ ExtractRequest, ExtractResponse, FetchConfig, + FormatConfig, HealthResponse, HistoryEntry, HistoryFilter, @@ -33,7 +34,6 @@ MonitorCreateRequest, MonitorResponse, MonitorUpdateRequest, - ScrapeFormatEntry, ScrapeRequest, ScrapeResponse, SearchRequest, @@ -87,7 +87,7 @@ async def start( self, url: str, *, - formats: list[ScrapeFormatEntry] | None = None, + formats: list[FormatConfig] | None = None, max_depth: int | None = None, max_pages: int | None = None, max_links_per_page: int | None = None, @@ -136,7 +136,7 @@ async def create( interval: str, *, name: str | None = None, - formats: list[ScrapeFormatEntry] | None = None, + formats: list[FormatConfig] | None = None, webhook_url: str | None = None, fetch_config: FetchConfig | None = None, ) -> ApiResult[MonitorResponse]: @@ -163,7 +163,7 @@ async def update( id: str, *, name: str | None = None, - formats: list[ScrapeFormatEntry] | None = None, + formats: list[FormatConfig] | 
None = None, webhook_url: str | None = None, interval: str | None = None, fetch_config: FetchConfig | None = None, @@ -330,7 +330,7 @@ async def scrape( self, url: str, *, - formats: list[ScrapeFormatEntry] | None = None, + formats: list[FormatConfig] | None = None, fetch_config: FetchConfig | None = None, content_type: ApiFetchContentType | None = None, ) -> ApiResult[ScrapeResponse]: diff --git a/src/scrapegraph_py/client.py b/src/scrapegraph_py/client.py index 532e11c8..301be571 100644 --- a/src/scrapegraph_py/client.py +++ b/src/scrapegraph_py/client.py @@ -24,6 +24,7 @@ ExtractRequest, ExtractResponse, FetchConfig, + FormatConfig, HealthResponse, HistoryEntry, HistoryFilter, @@ -33,7 +34,6 @@ MonitorCreateRequest, MonitorResponse, MonitorUpdateRequest, - ScrapeFormatEntry, ScrapeRequest, ScrapeResponse, SearchRequest, @@ -87,7 +87,7 @@ def start( self, url: str, *, - formats: list[ScrapeFormatEntry] | None = None, + formats: list[FormatConfig] | None = None, max_depth: int | None = None, max_pages: int | None = None, max_links_per_page: int | None = None, @@ -136,7 +136,7 @@ def create( interval: str, *, name: str | None = None, - formats: list[ScrapeFormatEntry] | None = None, + formats: list[FormatConfig] | None = None, webhook_url: str | None = None, fetch_config: FetchConfig | None = None, ) -> ApiResult[MonitorResponse]: @@ -163,7 +163,7 @@ def update( id: str, *, name: str | None = None, - formats: list[ScrapeFormatEntry] | None = None, + formats: list[FormatConfig] | None = None, webhook_url: str | None = None, interval: str | None = None, fetch_config: FetchConfig | None = None, @@ -330,7 +330,7 @@ def scrape( self, url: str, *, - formats: list[ScrapeFormatEntry] | None = None, + formats: list[FormatConfig] | None = None, fetch_config: FetchConfig | None = None, content_type: ApiFetchContentType | None = None, ) -> ApiResult[ScrapeResponse]: diff --git a/src/scrapegraph_py/schemas.py b/src/scrapegraph_py/schemas.py index 4db89a21..c13aa41f 100644 --- a/src/scrapegraph_py/schemas.py +++ b/src/scrapegraph_py/schemas.py @@ -120,7 +120,7 @@ class BrandingFormatConfig(CamelModel): type: Literal["branding"] = "branding" -ScrapeFormatEntry = ( +FormatConfig = ( MarkdownFormatConfig | HtmlFormatConfig | ScreenshotFormatConfig @@ -136,7 +136,7 @@ class ScrapeRequest(CamelModel): url: HttpUrl content_type: ApiFetchContentType | None = None fetch_config: FetchConfig | None = None - formats: list[ScrapeFormatEntry] = Field(default_factory=lambda: [MarkdownFormatConfig()]) + formats: list[FormatConfig] = Field(default_factory=lambda: [MarkdownFormatConfig()]) @model_validator(mode="after") def validate_unique_formats(self): @@ -184,7 +184,7 @@ def validate_schema_requires_prompt(self): class MonitorCreateRequest(CamelModel): url: HttpUrl name: Annotated[str, Field(max_length=200)] | None = None - formats: list[ScrapeFormatEntry] = Field(default_factory=lambda: [MarkdownFormatConfig()]) + formats: list[FormatConfig] = Field(default_factory=lambda: [MarkdownFormatConfig()]) webhook_url: HttpUrl | None = None interval: Annotated[str, Field(min_length=1, max_length=100)] fetch_config: FetchConfig | None = None @@ -199,7 +199,7 @@ def validate_unique_formats(self): class MonitorUpdateRequest(CamelModel): name: Annotated[str, Field(max_length=200)] | None = None - formats: list[ScrapeFormatEntry] | None = None + formats: list[FormatConfig] | None = None webhook_url: HttpUrl | None = None interval: Annotated[str, Field(min_length=1, max_length=100)] | None = None fetch_config: FetchConfig | None 
= None @@ -215,7 +215,7 @@ def validate_unique_formats(self): class CrawlRequest(CamelModel): url: HttpUrl - formats: list[ScrapeFormatEntry] = Field(default_factory=lambda: [MarkdownFormatConfig()]) + formats: list[FormatConfig] = Field(default_factory=lambda: [MarkdownFormatConfig()]) max_depth: int = Field(default=2, ge=0) max_pages: int = Field(default=50, ge=1, le=1000) max_links_per_page: int = Field(default=10, ge=1) From dc1005d17b85829751c9cc790f03854a5a446e08 Mon Sep 17 00:00:00 2001 From: FrancescoSaverioZuppichini Date: Tue, 21 Apr 2026 14:21:18 +0200 Subject: [PATCH 4/6] chore: remove accidentally committed playground.py and gitignore it Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8e87e541..731e92ae 100644 --- a/.gitignore +++ b/.gitignore @@ -45,3 +45,4 @@ htmlcov/ # Misc .bfg-report/ +playground.py From 83914783efc1c24ca53596184354fecadc694b19 Mon Sep 17 00:00:00 2001 From: FrancescoSaverioZuppichini Date: Tue, 21 Apr 2026 14:28:32 +0200 Subject: [PATCH 5/6] refactor: drop Api* prefix from type aliases Renames: - ApiService -> Service - ApiHtmlMode -> HtmlMode - ApiFetchMode -> FetchMode - ApiTimeRange -> TimeRange - ApiCrawlStatus -> CrawlStatus - ApiCrawlPageStatus -> CrawlPageStatus - ApiHistoryStatus -> HistoryStatus - ApiFetchContentType -> FetchContentType - ApiMonitorTickStatus -> MonitorTickStatus Removed (unused): - ApiStatus - ApiScrapeFormat - ApiHistoryService (duplicate of ApiService) Kept: ApiResult. The prefix earns its keep there since it's semantically distinct from the *Response types and "Result[T]" alone would be too generic. All aliases now exported from scrapegraph_py for user type annotations. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/scrapegraph_py/__init__.py | 18 ++++++++++ src/scrapegraph_py/async_client.py | 22 ++++++------ src/scrapegraph_py/client.py | 22 ++++++------ src/scrapegraph_py/schemas.py | 57 ++++++++++++++---------------- 4 files changed, 66 insertions(+), 53 deletions(-) diff --git a/src/scrapegraph_py/__init__.py b/src/scrapegraph_py/__init__.py index af023119..6c7777e2 100644 --- a/src/scrapegraph_py/__init__.py +++ b/src/scrapegraph_py/__init__.py @@ -4,18 +4,24 @@ ApiResult, BrandingFormatConfig, CrawlPage, + CrawlPageStatus, CrawlRequest, CrawlResponse, + CrawlStatus, CreditsResponse, ExtractRequest, ExtractResponse, FetchConfig, + FetchContentType, + FetchMode, FormatConfig, HealthResponse, HistoryEntry, HistoryFilter, HistoryPage, + HistoryStatus, HtmlFormatConfig, + HtmlMode, ImagesFormatConfig, JsonFormatConfig, LinksFormatConfig, @@ -27,6 +33,7 @@ MonitorResponse, MonitorResult, MonitorTickEntry, + MonitorTickStatus, MonitorUpdateRequest, ScrapeRequest, ScrapeResponse, @@ -34,7 +41,9 @@ SearchRequest, SearchResponse, SearchResult, + Service, SummaryFormatConfig, + TimeRange, TokenUsage, ) @@ -52,6 +61,8 @@ "CrawlRequest", "CrawlResponse", "CrawlPage", + "CrawlPageStatus", + "CrawlStatus", "MonitorCreateRequest", "MonitorUpdateRequest", "MonitorResponse", @@ -60,14 +71,21 @@ "MonitorActivityRequest", "MonitorActivityResponse", "MonitorTickEntry", + "MonitorTickStatus", "HistoryFilter", "HistoryPage", "HistoryEntry", + "HistoryStatus", "CreditsResponse", "HealthResponse", "TokenUsage", "FetchConfig", + "FetchContentType", + "FetchMode", "FormatConfig", + "HtmlMode", + "Service", + "TimeRange", "MarkdownFormatConfig", "HtmlFormatConfig", "ScreenshotFormatConfig", diff --git a/src/scrapegraph_py/async_client.py 
b/src/scrapegraph_py/async_client.py index 4d88b0f2..c0da3370 100644 --- a/src/scrapegraph_py/async_client.py +++ b/src/scrapegraph_py/async_client.py @@ -13,22 +13,20 @@ from .env import env from .schemas import ( - ApiFetchContentType, - ApiHtmlMode, ApiResult, - ApiService, - ApiTimeRange, CrawlRequest, CrawlResponse, CreditsResponse, ExtractRequest, ExtractResponse, FetchConfig, + FetchContentType, FormatConfig, HealthResponse, HistoryEntry, HistoryFilter, HistoryPage, + HtmlMode, MonitorActivityRequest, MonitorActivityResponse, MonitorCreateRequest, @@ -38,6 +36,8 @@ ScrapeResponse, SearchRequest, SearchResponse, + Service, + TimeRange, ) _SERVER_TIMING_RE = re.compile(r"dur=(\d+(?:\.\d+)?)") @@ -94,7 +94,7 @@ async def start( allow_external: bool | None = None, include_patterns: list[str] | None = None, exclude_patterns: list[str] | None = None, - content_types: list[ApiFetchContentType] | None = None, + content_types: list[FetchContentType] | None = None, fetch_config: FetchConfig | None = None, ) -> ApiResult[CrawlResponse]: req = CrawlRequest( @@ -217,7 +217,7 @@ async def list( *, page: int | None = None, limit: int | None = None, - service: ApiService | None = None, + service: Service | None = None, ) -> ApiResult[HistoryPage]: kwargs = _compact(page=page, limit=limit, service=service) if not kwargs: @@ -332,7 +332,7 @@ async def scrape( *, formats: list[FormatConfig] | None = None, fetch_config: FetchConfig | None = None, - content_type: ApiFetchContentType | None = None, + content_type: FetchContentType | None = None, ) -> ApiResult[ScrapeResponse]: req = ScrapeRequest( **_compact( @@ -352,9 +352,9 @@ async def extract( html: str | None = None, markdown: str | None = None, schema: dict[str, object] | None = None, - mode: ApiHtmlMode | None = None, + mode: HtmlMode | None = None, fetch_config: FetchConfig | None = None, - content_type: ApiFetchContentType | None = None, + content_type: FetchContentType | None = None, ) -> ApiResult[ExtractResponse]: req = ExtractRequest( **_compact( @@ -376,12 +376,12 @@ async def search( *, num_results: int | None = None, format: Literal["html", "markdown"] | None = None, - mode: ApiHtmlMode | None = None, + mode: HtmlMode | None = None, prompt: str | None = None, schema: dict[str, object] | None = None, fetch_config: FetchConfig | None = None, location_geo_code: str | None = None, - time_range: ApiTimeRange | None = None, + time_range: TimeRange | None = None, ) -> ApiResult[SearchResponse]: req = SearchRequest( **_compact( diff --git a/src/scrapegraph_py/client.py b/src/scrapegraph_py/client.py index 301be571..71474759 100644 --- a/src/scrapegraph_py/client.py +++ b/src/scrapegraph_py/client.py @@ -13,22 +13,20 @@ from .env import env from .schemas import ( - ApiFetchContentType, - ApiHtmlMode, ApiResult, - ApiService, - ApiTimeRange, CrawlRequest, CrawlResponse, CreditsResponse, ExtractRequest, ExtractResponse, FetchConfig, + FetchContentType, FormatConfig, HealthResponse, HistoryEntry, HistoryFilter, HistoryPage, + HtmlMode, MonitorActivityRequest, MonitorActivityResponse, MonitorCreateRequest, @@ -38,6 +36,8 @@ ScrapeResponse, SearchRequest, SearchResponse, + Service, + TimeRange, ) _SERVER_TIMING_RE = re.compile(r"dur=(\d+(?:\.\d+)?)") @@ -94,7 +94,7 @@ def start( allow_external: bool | None = None, include_patterns: list[str] | None = None, exclude_patterns: list[str] | None = None, - content_types: list[ApiFetchContentType] | None = None, + content_types: list[FetchContentType] | None = None, fetch_config: FetchConfig | None = None, ) 
-> ApiResult[CrawlResponse]: req = CrawlRequest( @@ -217,7 +217,7 @@ def list( *, page: int | None = None, limit: int | None = None, - service: ApiService | None = None, + service: Service | None = None, ) -> ApiResult[HistoryPage]: kwargs = _compact(page=page, limit=limit, service=service) if not kwargs: @@ -332,7 +332,7 @@ def scrape( *, formats: list[FormatConfig] | None = None, fetch_config: FetchConfig | None = None, - content_type: ApiFetchContentType | None = None, + content_type: FetchContentType | None = None, ) -> ApiResult[ScrapeResponse]: req = ScrapeRequest( **_compact( @@ -352,9 +352,9 @@ def extract( html: str | None = None, markdown: str | None = None, schema: dict[str, object] | None = None, - mode: ApiHtmlMode | None = None, + mode: HtmlMode | None = None, fetch_config: FetchConfig | None = None, - content_type: ApiFetchContentType | None = None, + content_type: FetchContentType | None = None, ) -> ApiResult[ExtractResponse]: req = ExtractRequest( **_compact( @@ -376,12 +376,12 @@ def search( *, num_results: int | None = None, format: Literal["html", "markdown"] | None = None, - mode: ApiHtmlMode | None = None, + mode: HtmlMode | None = None, prompt: str | None = None, schema: dict[str, object] | None = None, fetch_config: FetchConfig | None = None, location_geo_code: str | None = None, - time_range: ApiTimeRange | None = None, + time_range: TimeRange | None = None, ) -> ApiResult[SearchResponse]: req = SearchRequest( **_compact( diff --git a/src/scrapegraph_py/schemas.py b/src/scrapegraph_py/schemas.py index c13aa41f..3cb37bd3 100644 --- a/src/scrapegraph_py/schemas.py +++ b/src/scrapegraph_py/schemas.py @@ -16,20 +16,15 @@ class ResponseModel(BaseModel): model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True, extra="allow") -ApiService = Literal["scrape", "extract", "search", "monitor", "crawl"] -ApiStatus = Literal["completed", "failed"] -ApiHtmlMode = Literal["normal", "reader", "prune"] -ApiFetchMode = Literal["auto", "fast", "js"] -ApiScrapeFormat = Literal[ - "markdown", "html", "links", "images", "summary", "json", "branding", "screenshot" -] -ApiTimeRange = Literal["past_hour", "past_24_hours", "past_week", "past_month", "past_year"] -ApiCrawlStatus = Literal["running", "completed", "failed", "paused", "deleted"] -ApiCrawlPageStatus = Literal["completed", "failed", "skipped"] -ApiHistoryService = Literal["scrape", "extract", "search", "monitor", "crawl"] -ApiHistoryStatus = Literal["completed", "failed", "running", "paused", "deleted"] - -ApiFetchContentType = Literal[ +Service = Literal["scrape", "extract", "search", "monitor", "crawl"] +HtmlMode = Literal["normal", "reader", "prune"] +FetchMode = Literal["auto", "fast", "js"] +TimeRange = Literal["past_hour", "past_24_hours", "past_week", "past_month", "past_year"] +CrawlStatus = Literal["running", "completed", "failed", "paused", "deleted"] +CrawlPageStatus = Literal["completed", "failed", "skipped"] +HistoryStatus = Literal["completed", "failed", "running", "paused", "deleted"] + +FetchContentType = Literal[ "text/html", "application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -68,7 +63,7 @@ class MockConfig(CamelModel): class FetchConfig(CamelModel): - mode: ApiFetchMode = "auto" + mode: FetchMode = "auto" stealth: bool = False timeout: int = Field(default=30000, ge=1000, le=60000) wait: int = Field(default=0, ge=0, le=30000) @@ -81,12 +76,12 @@ class FetchConfig(CamelModel): class MarkdownFormatConfig(CamelModel): type: Literal["markdown"] = 
"markdown" - mode: ApiHtmlMode = "normal" + mode: HtmlMode = "normal" class HtmlFormatConfig(CamelModel): type: Literal["html"] = "html" - mode: ApiHtmlMode = "normal" + mode: HtmlMode = "normal" class ScreenshotFormatConfig(CamelModel): @@ -101,7 +96,7 @@ class JsonFormatConfig(CamelModel): type: Literal["json"] = "json" prompt: Annotated[str, Field(min_length=1, max_length=10000)] schema_: dict[str, object] | None = Field(default=None, alias="schema") - mode: ApiHtmlMode = "normal" + mode: HtmlMode = "normal" class LinksFormatConfig(CamelModel): @@ -134,7 +129,7 @@ class BrandingFormatConfig(CamelModel): class ScrapeRequest(CamelModel): url: HttpUrl - content_type: ApiFetchContentType | None = None + content_type: FetchContentType | None = None fetch_config: FetchConfig | None = None formats: list[FormatConfig] = Field(default_factory=lambda: [MarkdownFormatConfig()]) @@ -150,10 +145,10 @@ class ExtractRequest(CamelModel): url: HttpUrl | None = None html: str | None = None markdown: str | None = None - mode: ApiHtmlMode = "normal" + mode: HtmlMode = "normal" prompt: Annotated[str, Field(min_length=1, max_length=10000)] schema_: dict[str, object] | None = Field(default=None, alias="schema") - content_type: ApiFetchContentType | None = None + content_type: FetchContentType | None = None fetch_config: FetchConfig | None = None @model_validator(mode="after") @@ -167,12 +162,12 @@ class SearchRequest(CamelModel): query: Annotated[str, Field(min_length=1, max_length=500)] num_results: int = Field(default=3, ge=1, le=20) format: Literal["html", "markdown"] = "markdown" - mode: ApiHtmlMode = "prune" + mode: HtmlMode = "prune" fetch_config: FetchConfig | None = None prompt: Annotated[str, Field(min_length=1, max_length=10000)] | None = None schema_: dict[str, object] | None = Field(default=None, alias="schema") location_geo_code: Annotated[str, Field(max_length=10)] | None = None - time_range: ApiTimeRange | None = None + time_range: TimeRange | None = None @model_validator(mode="after") def validate_schema_requires_prompt(self): @@ -222,7 +217,7 @@ class CrawlRequest(CamelModel): allow_external: bool = False include_patterns: list[str] | None = None exclude_patterns: list[str] | None = None - content_types: list[ApiFetchContentType] | None = None + content_types: list[FetchContentType] | None = None fetch_config: FetchConfig | None = None @model_validator(mode="after") @@ -236,7 +231,7 @@ def validate_unique_formats(self): class HistoryFilter(CamelModel): page: int = Field(default=1, ge=1) limit: int = Field(default=20, ge=1, le=100) - service: ApiService | None = None + service: Service | None = None class TokenUsage(ResponseModel): @@ -315,7 +310,7 @@ class SearchResponse(ResponseModel): class CrawlPage(ResponseModel): url: str - status: ApiCrawlPageStatus + status: CrawlPageStatus depth: int parent_url: str | None links: list[str] @@ -331,7 +326,7 @@ class CrawlPage(ResponseModel): class CrawlResponse(ResponseModel): id: str - status: ApiCrawlStatus + status: CrawlStatus reason: str | None = None total: int finished: int @@ -403,12 +398,12 @@ class MonitorResponse(ResponseModel): model_config = ConfigDict(extra="allow") -ApiMonitorTickStatus = Literal["completed", "failed", "paused", "running"] +MonitorTickStatus = Literal["completed", "failed", "paused", "running"] class MonitorTickEntry(ResponseModel): id: str - status: ApiMonitorTickStatus + status: MonitorTickStatus created_at: str elapsed_ms: int changed: bool @@ -432,8 +427,8 @@ class MonitorActivityRequest(CamelModel): class 
HistoryEntry(ResponseModel): id: str - service: ApiHistoryService - status: ApiHistoryStatus + service: Service + status: HistoryStatus error: object | None elapsed_ms: int created_at: str From d85a759817d4f22ec04aa5358ad825e8ec30905d Mon Sep 17 00:00:00 2001 From: FrancescoSaverioZuppichini Date: Tue, 21 Apr 2026 14:39:12 +0200 Subject: [PATCH 6/6] chore(release): reset version to the published 2.0.1 The 2.1.0 bump in patch 2 was premature; keep pyproject.toml and uv.lock at the currently released 2.0.1 until the release is actually cut. --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5c37ab66..74580b04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraph-py" -version = "2.1.0" +version = "2.0.1" description = "Official Python SDK for ScrapeGraph AI API" readme = "README.md" license = "MIT" diff --git a/uv.lock b/uv.lock index f0bfaeb0..9243100d 100644 --- a/uv.lock +++ b/uv.lock @@ -279,7 +279,7 @@ wheels = [ [[package]] name = "scrapegraph-py" -version = "2.1.0" +version = "2.0.1" source = { editable = "." } dependencies = [ { name = "httpx" },
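Taken together, patches 3 and 5 converge on the same goal: `FormatConfig` and the un-prefixed `Literal` aliases are importable from the package root so downstream code can be fully typed. Purely as an illustration — hypothetical user code, not part of the SDK or of this series — a typed helper layer might read:

```python
from scrapegraph_py import (
    ApiResult,
    FetchConfig,
    FormatConfig,
    MarkdownFormatConfig,
    ScrapeGraphAI,
    ScrapeResponse,
    SearchResponse,
    TimeRange,
)


def scrape_many(
    sgai: ScrapeGraphAI,
    urls: list[str],
    formats: list[FormatConfig] | None = None,
) -> list[ApiResult[ScrapeResponse]]:
    # FormatConfig is the exported union of the *FormatConfig models, so user
    # code can annotate format lists without reaching into scrapegraph_py.schemas.
    formats = formats or [MarkdownFormatConfig(mode="reader")]
    fetch = FetchConfig(mode="fast", timeout=15000)
    return [sgai.scrape(url, formats=formats, fetch_config=fetch) for url in urls]


def recent_results(
    sgai: ScrapeGraphAI, query: str, time_range: TimeRange = "past_24_hours"
) -> ApiResult[SearchResponse]:
    # TimeRange (formerly ApiTimeRange) narrows the argument to the literal
    # values the API accepts, so typos fail at type-check time, not at runtime.
    return sgai.search(query, num_results=5, time_range=time_range)
```

Because the aliases are `Literal` unions rather than plain `str`, a checker such as mypy or pyright rejects an invalid value like `time_range="yesterday"` before any request is made.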