-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathsimple_scrape.py
More file actions
65 lines (48 loc) · 1.83 KB
/
simple_scrape.py
File metadata and controls
65 lines (48 loc) · 1.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
"""
Simple Scraping Example
This example demonstrates:
- Using the Scrape SDK for simple page scraping
- Getting page content in markdown and HTML formats
- Capturing page screenshots
- No workflow needed — just URL and format
Site: Wikipedia (https://en.wikipedia.org)
"""
import asyncio
import json
import os
import sys
from dotenv import load_dotenv
from maxun import Scrape, Config
async def main():
    """Scrape one Wikipedia article with the Scrape SDK and print a summary.

    Builds a ``Scrape`` client from the ``MAXUN_API_KEY`` environment
    variable (required) and ``MAXUN_BASE_URL`` (optional, defaults to
    ``https://app.maxun.dev/api/sdk/``), creates a robot for the
    "Web scraping" article requesting markdown, HTML and two screenshot
    formats, runs it, and prints the raw result followed by content
    lengths, a short text preview and the screenshot entries.

    Raises:
        KeyError: If ``MAXUN_API_KEY`` is not set in the environment.
    """
    scraper = Scrape(Config(
        api_key=os.environ["MAXUN_API_KEY"],
        base_url=os.environ.get("MAXUN_BASE_URL", "https://app.maxun.dev/api/sdk/"),
    ))

    robot = await scraper.create(
        "Wikipedia Web Scraping Article",
        "https://en.wikipedia.org/wiki/Web_scraping",
        formats=["markdown", "html", "screenshot-fullpage", "screenshot-visible"],
    )
    print(f"Robot created: {robot.id}")

    result = await robot.run()
    print("Result:", json.dumps(result, indent=2))

    # NOTE(review): assumes the run result is a dict with optional "data" and
    # "screenshots" entries -- confirm against the SDK.
    # `or` is used instead of a `.get` default so that a key that is present
    # but null is handled the same way as a missing key.
    data = result.get("data") or {}
    screenshots = result.get("screenshots") or []

    print("\n=== Scraping Completed ===")
    print(f"Text length: {len(data.get('text') or '')} characters")
    print(f"Markdown length: {len(data.get('markdown') or '')} characters")
    print(f"HTML length: {len(data.get('html') or '')} characters")
    print(f"Screenshots: {len(screenshots)}")

    text = data.get("text")
    if text:
        print(f"\nText preview (first 200 chars):\n{text[:200]}...")

    if screenshots:
        print("\nScreenshot URLs:")
        for i, screenshot in enumerate(screenshots, 1):
            # An entry is either a plain string or a mapping whose "data"
            # field holds the value to display.
            url = screenshot if isinstance(screenshot, str) else screenshot.get("data")
            print(f"  {i}. {url}")
# Script entry point. load_dotenv() runs first so the API-key check below
# sees any variables it added to the environment.
load_dotenv()

if os.environ.get("MAXUN_API_KEY"):
    asyncio.run(main())
else:
    # A missing or empty key is reported on stderr with a non-zero exit
    # status before any SDK call is attempted.
    print("Error: MAXUN_API_KEY environment variable is required", file=sys.stderr)
    sys.exit(1)