Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ htmlcov/

# Misc
.bfg-report/
playground.py
90 changes: 39 additions & 51 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,12 @@ uv add scrapegraph-py
## Quick Start

```python
from scrapegraph_py import ScrapeGraphAI, ScrapeRequest
from scrapegraph_py import ScrapeGraphAI

# reads SGAI_API_KEY from env, or pass explicitly: ScrapeGraphAI(api_key="...")
sgai = ScrapeGraphAI()

result = sgai.scrape(ScrapeRequest(
url="https://example.com",
))
result = sgai.scrape("https://example.com")

if result.status == "success":
print(result.data["results"]["markdown"]["data"])
Expand All @@ -56,14 +54,14 @@ Scrape a webpage in multiple formats (markdown, html, screenshot, json, etc).

```python
from scrapegraph_py import (
ScrapeGraphAI, ScrapeRequest, FetchConfig,
ScrapeGraphAI, FetchConfig,
MarkdownFormatConfig, ScreenshotFormatConfig, JsonFormatConfig
)

sgai = ScrapeGraphAI()

res = sgai.scrape(ScrapeRequest(
url="https://example.com",
res = sgai.scrape(
"https://example.com",
formats=[
MarkdownFormatConfig(mode="reader"),
ScreenshotFormatConfig(full_page=True, width=1440, height=900),
Expand All @@ -80,7 +78,7 @@ res = sgai.scrape(ScrapeRequest(
cookies={"session": "abc"},
country="us",
),
))
)
```

**Formats:**
Expand All @@ -98,61 +96,58 @@ res = sgai.scrape(ScrapeRequest(
Extract structured data from a URL, HTML, or markdown using AI.

```python
from scrapegraph_py import ScrapeGraphAI, ExtractRequest
from scrapegraph_py import ScrapeGraphAI, FetchConfig

sgai = ScrapeGraphAI()

res = sgai.extract(ExtractRequest(
url="https://example.com",
res = sgai.extract(
prompt="Extract product names and prices",
url="https://example.com",
schema={"type": "object", "properties": {...}}, # optional
mode="reader", # optional
fetch_config=FetchConfig(...), # optional
))
# Or pass html/markdown directly instead of url
# Or pass html/markdown directly instead of url
)
```

### search

Search the web and optionally extract structured data.

```python
from scrapegraph_py import ScrapeGraphAI, SearchRequest
from scrapegraph_py import ScrapeGraphAI, FetchConfig

sgai = ScrapeGraphAI()

res = sgai.search(SearchRequest(
query="best programming languages 2024",
res = sgai.search(
"best programming languages 2024",
num_results=5, # 1-20, default 3
format="markdown", # "markdown" | "html"
prompt="Extract key points", # optional, for AI extraction
schema={...}, # optional
time_range="past_week", # optional
location_geo_code="us", # optional
fetch_config=FetchConfig(...), # optional
))
)
```

### crawl

Crawl a website and its linked pages.

```python
from scrapegraph_py import ScrapeGraphAI, CrawlRequest, MarkdownFormatConfig
from scrapegraph_py import ScrapeGraphAI, FetchConfig, MarkdownFormatConfig

sgai = ScrapeGraphAI()

# Start a crawl
start = sgai.crawl.start(CrawlRequest(
url="https://example.com",
start = sgai.crawl.start(
"https://example.com",
formats=[MarkdownFormatConfig()],
max_pages=50,
max_depth=2,
max_links_per_page=10,
include_patterns=["/blog/*"],
exclude_patterns=["/admin/*"],
fetch_config=FetchConfig(...),
))
)

# Check status
status = sgai.crawl.get(start.data["id"])
Expand All @@ -168,24 +163,23 @@ sgai.crawl.delete(crawl_id)
Monitor a webpage for changes on a schedule.

```python
from scrapegraph_py import ScrapeGraphAI, MonitorCreateRequest, MarkdownFormatConfig
from scrapegraph_py import ScrapeGraphAI, FetchConfig, MarkdownFormatConfig

sgai = ScrapeGraphAI()

# Create a monitor
mon = sgai.monitor.create(MonitorCreateRequest(
url="https://example.com",
mon = sgai.monitor.create(
"https://example.com",
"0 * * * *", # cron expression
name="Price Monitor",
interval="0 * * * *", # cron expression
formats=[MarkdownFormatConfig()],
webhook_url="https://...", # optional
fetch_config=FetchConfig(...),
))
)

# Manage monitors
sgai.monitor.list()
sgai.monitor.get(cron_id)
sgai.monitor.update(cron_id, MonitorUpdateRequest(interval="0 */6 * * *"))
sgai.monitor.update(cron_id, interval="0 */6 * * *")
sgai.monitor.pause(cron_id)
sgai.monitor.resume(cron_id)
sgai.monitor.delete(cron_id)
Expand All @@ -196,15 +190,15 @@ sgai.monitor.delete(cron_id)
Fetch request history.

```python
from scrapegraph_py import ScrapeGraphAI, HistoryFilter
from scrapegraph_py import ScrapeGraphAI

sgai = ScrapeGraphAI()

history = sgai.history.list(HistoryFilter(
history = sgai.history.list(
service="scrape", # optional filter
page=1,
limit=20,
))
)

entry = sgai.history.get("request-id")
```
Expand All @@ -229,11 +223,11 @@ All methods have async equivalents via `AsyncScrapeGraphAI`:

```python
import asyncio
from scrapegraph_py import AsyncScrapeGraphAI, ScrapeRequest
from scrapegraph_py import AsyncScrapeGraphAI

async def main():
async with AsyncScrapeGraphAI() as sgai:
result = await sgai.scrape(ScrapeRequest(url="https://example.com"))
result = await sgai.scrape("https://example.com")
if result.status == "success":
print(result.data["results"]["markdown"]["data"])
else:
Expand All @@ -246,42 +240,36 @@ asyncio.run(main())

```python
async with AsyncScrapeGraphAI() as sgai:
res = await sgai.extract(ExtractRequest(
url="https://example.com",
res = await sgai.extract(
prompt="Extract product names and prices",
))
url="https://example.com",
)
```

### Async Search

```python
async with AsyncScrapeGraphAI() as sgai:
res = await sgai.search(SearchRequest(
query="best programming languages 2024",
num_results=5,
))
res = await sgai.search("best programming languages 2024", num_results=5)
```

### Async Crawl

```python
async with AsyncScrapeGraphAI() as sgai:
start = await sgai.crawl.start(CrawlRequest(
url="https://example.com",
max_pages=50,
))
start = await sgai.crawl.start("https://example.com", max_pages=50)
status = await sgai.crawl.get(start.data["id"])
```

### Async Monitor

```python
async with AsyncScrapeGraphAI() as sgai:
mon = await sgai.monitor.create(MonitorCreateRequest(
url="https://example.com",
mon = await sgai.monitor.create(
"https://example.com",
"0 * * * *",
name="Price Monitor",
interval="0 * * * *",
))
)
```

## Examples
Expand Down
10 changes: 6 additions & 4 deletions examples/crawl/crawl_basic.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
from dotenv import load_dotenv

load_dotenv()

import time
from scrapegraph_py import ScrapeGraphAI, CrawlRequest

from scrapegraph_py import ScrapeGraphAI

sgai = ScrapeGraphAI()

start_res = sgai.crawl.start(CrawlRequest(
url="https://scrapegraphai.com/",
start_res = sgai.crawl.start(
"https://scrapegraphai.com/",
max_pages=5,
max_depth=2,
))
)

if start_res.status != "success" or not start_res.data:
print("Failed to start:", start_res.error)
Expand Down
12 changes: 8 additions & 4 deletions examples/crawl/crawl_basic_async.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
from dotenv import load_dotenv

load_dotenv()

import asyncio
from scrapegraph_py import AsyncScrapeGraphAI, CrawlRequest

from scrapegraph_py import AsyncScrapeGraphAI


async def main():
async with AsyncScrapeGraphAI() as sgai:
start_res = await sgai.crawl.start(CrawlRequest(
url="https://scrapegraphai.com/",
start_res = await sgai.crawl.start(
"https://scrapegraphai.com/",
max_pages=5,
max_depth=2,
))
)

if start_res.status != "success" or not start_res.data:
print("Failed to start:", start_res.error)
Expand All @@ -33,4 +36,5 @@ async def main():
for page in get_res.data.pages:
print(f" {page.url} - {page.status}")


asyncio.run(main())
13 changes: 7 additions & 6 deletions examples/crawl/crawl_with_formats.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,26 @@
from dotenv import load_dotenv

load_dotenv()

import time

from scrapegraph_py import (
ScrapeGraphAI,
CrawlRequest,
MarkdownFormatConfig,
LinksFormatConfig,
MarkdownFormatConfig,
ScrapeGraphAI,
)

sgai = ScrapeGraphAI()

start_res = sgai.crawl.start(CrawlRequest(
url="https://scrapegraphai.com/",
start_res = sgai.crawl.start(
"https://scrapegraphai.com/",
max_pages=3,
max_depth=1,
formats=[
MarkdownFormatConfig(),
LinksFormatConfig(),
],
))
)

if start_res.status != "success" or not start_res.data:
print("Failed to start:", start_res.error)
Expand Down
13 changes: 8 additions & 5 deletions examples/crawl/crawl_with_formats_async.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,27 @@
from dotenv import load_dotenv

load_dotenv()

import asyncio

from scrapegraph_py import (
AsyncScrapeGraphAI,
CrawlRequest,
MarkdownFormatConfig,
LinksFormatConfig,
MarkdownFormatConfig,
)


async def main():
async with AsyncScrapeGraphAI() as sgai:
start_res = await sgai.crawl.start(CrawlRequest(
url="https://scrapegraphai.com/",
start_res = await sgai.crawl.start(
"https://scrapegraphai.com/",
max_pages=3,
max_depth=1,
formats=[
MarkdownFormatConfig(),
LinksFormatConfig(),
],
))
)

if start_res.status != "success" or not start_res.data:
print("Failed to start:", start_res.error)
Expand All @@ -44,4 +46,5 @@ async def main():
print(f" Status: {page.status}")
print(f" Depth: {page.depth}")


asyncio.run(main())
10 changes: 6 additions & 4 deletions examples/extract/extract_basic.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
from dotenv import load_dotenv

load_dotenv()

import json
from scrapegraph_py import ScrapeGraphAI, ExtractRequest

from scrapegraph_py import ScrapeGraphAI

sgai = ScrapeGraphAI()

res = sgai.extract(ExtractRequest(
res = sgai.extract(
"What is this page about? Extract the main heading and description.",
url="https://example.com",
prompt="What is this page about? Extract the main heading and description.",
))
)

if res.status == "success":
print("Extracted:", json.dumps(res.data.json_data, indent=2))
Expand Down
Loading
Loading