From c1a6f09e271ac7d8335e10acbaa9624fcd23742b Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Wed, 15 Apr 2026 21:55:36 +0200 Subject: [PATCH 1/2] feat!: migrate CLI to scrapegraph-js v2 API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aligns the CLI with scrapegraph-js PR #13 (v2 SDK). The v2 API consolidates endpoints and drops legacy ones; the CLI follows suit. Commands kept (rewritten against v2 types): - scrape — multi-format (markdown/html/screenshot/branding/links/images/summary/json) - crawl — polls until the job reaches a terminal state - history — new response shape (data/pagination), service filter optional - credits, validate — re-wired to getCredits / checkHealth Commands added: - extract — structured extraction with prompt + schema - search — web search + optional extraction - monitor — create/list/get/update/delete/pause/resume/activity Commands removed (no longer in v2 API): - smart-scraper (use `scrape -f json -p ...` or `extract`) - search-scraper (use `search`) - markdownify (use `scrape` — markdown is the default format) - sitemap, agentic-scraper, generate-schema Other changes: - package.json: scrapegraph-js pinned to github:ScrapeGraphAI/scrapegraph-js#096c110, CLI bumped 0.2.1 → 1.0.0 to track SDK v2.0.0 - src/lib/env.ts: bridges legacy SGAI_TIMEOUT_S / JUST_SCRAPE_TIMEOUT_S → SGAI_TIMEOUT (renamed by SDK v2) - README + smoke test updated Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 341 ++++++++++++++------------------ bun.lock | 6 +- package.json | 4 +- src/cli.ts | 11 +- src/commands/agentic-scraper.ts | 51 ----- src/commands/crawl.ts | 125 ++++++++---- src/commands/credits.ts | 6 +- src/commands/extract.ts | 63 ++++++ src/commands/generate-schema.ts | 37 ---- src/commands/history.ts | 95 ++++----- src/commands/markdownify.ts | 40 ---- src/commands/monitor.ts | 206 +++++++++++++++++++ src/commands/scrape.ts | 102 ++++++++-- src/commands/search-scraper.ts | 52 ----- src/commands/search.ts | 67 +++++++ src/commands/sitemap.ts | 31 --- src/commands/smart-scraper.ts | 57 ------ src/commands/validate.ts | 6 +- src/lib/env.ts | 7 +- tests/smoke.test.ts | 12 +- 20 files changed, 738 insertions(+), 581 deletions(-) delete mode 100644 src/commands/agentic-scraper.ts create mode 100644 src/commands/extract.ts delete mode 100644 src/commands/generate-schema.ts delete mode 100644 src/commands/markdownify.ts create mode 100644 src/commands/monitor.ts delete mode 100644 src/commands/search-scraper.ts create mode 100644 src/commands/search.ts delete mode 100644 src/commands/sitemap.ts delete mode 100644 src/commands/smart-scraper.ts diff --git a/README.md b/README.md index 5871dea..7fb3911 100644 --- a/README.md +++ b/README.md @@ -5,22 +5,14 @@ Made with love by the [ScrapeGraphAI team](https://scrapegraphai.com) 💜 ![Demo Video](/assets/demo.gif) -Command-line interface for [ScrapeGraph AI](https://scrapegraphai.com) — AI-powered web scraping, data extraction, search, and crawling. +Command-line interface for [ScrapeGraph AI](https://scrapegraphai.com) — AI-powered web scraping, data extraction, search, crawling, and page-change monitoring. + +> **v1.0.0 — SDK v2 migration.** This release migrates the CLI to the [scrapegraph-js v2 SDK](https://github.com/ScrapeGraphAI/scrapegraph-js/pull/13). The v1 endpoints (`smart-scraper`, `search-scraper`, `markdownify`, `sitemap`, `agentic-scraper`, `generate-schema`) have been removed. 
Use `scrape --format …` for multi-format output, `extract` for structured data, and the new `monitor` command for page-change tracking.

## Project Structure

```
just-scrape/
-├── docs/                       # API response docs per endpoint
-│   ├── smartscraper.md
-│   ├── searchscraper.md
-│   ├── markdownify.md
-│   ├── crawl.md
-│   ├── scrape.md
-│   ├── agenticscraper.md
-│   ├── generate-schema.md
-│   ├── sitemap.md
-│   └── credits.md
├── src/
│   ├── cli.ts                  # Entry point, citty main command + subcommands
│   ├── lib/
│   │   ├── folders.ts          # API key resolution + interactive prompt
│   │   └── log.ts              # Logger factory + syntax-highlighted JSON output
│   ├── commands/
-│   │   ├── smart-scraper.ts
-│   │   ├── search-scraper.ts
-│   │   ├── markdownify.ts
-│   │   ├── crawl.ts
-│   │   ├── sitemap.ts
-│   │   ├── scrape.ts
-│   │   ├── agentic-scraper.ts
-│   │   ├── generate-schema.ts
-│   │   ├── history.ts
-│   │   ├── credits.ts
-│   │   └── validate.ts
+│   │   ├── scrape.ts           # scrape — multi-format (markdown/html/screenshot/json/…)
+│   │   ├── extract.ts          # extract — structured extraction with prompt + schema
+│   │   ├── search.ts           # search — web search + extraction
+│   │   ├── crawl.ts            # crawl — multi-page crawling (polls until done)
+│   │   ├── monitor.ts          # monitor — create/list/get/update/delete/pause/resume/activity
+│   │   ├── history.ts          # history — paginated browser for past requests
+│   │   ├── credits.ts          # credits — balance + job quotas
+│   │   └── validate.ts         # validate — API key health check
│   └── utils/
│       └── banner.ts           # ASCII banner + version from package.json
├── dist/                       # Build output (git-ignored)
│   └── cli.mjs                 # Bundled ESM with shebang
+├── tests/
+│   └── smoke.test.ts           # SDK export smoke test
├── package.json
├── tsconfig.json
├── tsup.config.ts
@@ -55,7 +46,7 @@ just-scrape/

```bash
npm install -g just-scrape   # npm (recommended)
pnpm add -g just-scrape      # pnpm
-yarn global add just-scrape  # yarn
+yarn global add just-scrape  # yarn
bun add -g just-scrape       # bun
npx just-scrape --help       # or run without installing
bunx just-scrape --help      # bun equivalent
```

Package: [just-scrape](https://www.npmjs.com/package/just-scrape) on npm.

## Coding Agent Skill

-You can use just-scrape as a skill for AI coding agents via [Vercel's skills.sh](https://skills.sh) with tis tutorial.
+You can use just-scrape as a skill for AI coding agents via [Vercel's skills.sh](https://skills.sh) with this tutorial.
Or you can manually install it:

```bash
@@ -90,278 +81,216 @@ Four ways to provide it (checked in order):

| Variable | Description | Default |
|---|---|---|
| `SGAI_API_KEY` | ScrapeGraph API key | — |
-| `JUST_SCRAPE_API_URL` | Override API base URL | `https://api.scrapegraphai.com/v1` |
-| `JUST_SCRAPE_TIMEOUT_S` | Request/polling timeout in seconds | `120` |
-| `JUST_SCRAPE_DEBUG` | Set to `1` to enable debug logging to stderr | `0` |
+| `SGAI_API_URL` | Override API base URL (also `JUST_SCRAPE_API_URL`) | `https://api.scrapegraphai.com/api/v2` |
+| `SGAI_TIMEOUT` | Request timeout in seconds (also legacy `SGAI_TIMEOUT_S` / `JUST_SCRAPE_TIMEOUT_S`) | `120` |
+| `SGAI_DEBUG` | Set to `1` to enable debug logging to stderr (also `JUST_SCRAPE_DEBUG`) | `0` |

## JSON Mode (`--json`)

All commands support `--json` for machine-readable output. When set, banner, spinners, and interactive prompts are suppressed — only minified JSON on stdout (saves tokens when piped to AI agents).

```bash
-just-scrape credits --json | jq '.remaining_credits'
-just-scrape smart-scraper https://example.com -p "Extract data" --json > result.json
-just-scrape history smartscraper --json | jq '.requests[].status'
+just-scrape credits --json | jq '.remaining'
+just-scrape scrape https://example.com --json > result.json
+just-scrape history scrape --json | jq '.[].id'
```

---

-## Smart Scraper
-
-Extract structured data from any URL using AI. [docs](https://docs.scrapegraphai.com/services/smartscraper)
-
-### Usage
-
-```bash
-just-scrape smart-scraper <url> -p "<prompt>"                    # Extract data with AI
-just-scrape smart-scraper <url> -p "<prompt>" --schema '<json>'  # Enforce output schema
-just-scrape smart-scraper <url> -p "<prompt>" --scrolls <n>      # Infinite scroll (0-100)
-just-scrape smart-scraper <url> -p "<prompt>" --pages <n>        # Multi-page (1-100)
-just-scrape smart-scraper <url> -p "<prompt>" --stealth          # Anti-bot bypass (+4 credits)
-just-scrape smart-scraper <url> -p "<prompt>" --cookies '<json>' --headers '<json>'
-just-scrape smart-scraper <url> -p "<prompt>" --plain-text       # Plain text instead of JSON
-```
-
-### Examples
-
-```bash
-# Extract product listings from an e-commerce page
-just-scrape smart-scraper https://store.example.com/shoes -p "Extract all product names, prices, and ratings"
-
-# Extract with a strict schema, scrolling to load more content
-just-scrape smart-scraper https://news.example.com -p "Get all article headlines and dates" \
-  --schema '{"type":"object","properties":{"articles":{"type":"array","items":{"type":"object","properties":{"title":{"type":"string"},"date":{"type":"string"}}}}}}' \
-  --scrolls 5
-
-# Scrape a JS-heavy SPA behind anti-bot protection
-just-scrape smart-scraper https://app.example.com/dashboard -p "Extract user stats" \
-  --stealth
-```
-
-## Search Scraper
+## Scrape

-Search the web and extract structured data from results. [docs](https://docs.scrapegraphai.com/services/searchscraper)
+Multi-format scraping from a URL. Supports `markdown`, `html`, `screenshot`, `branding`, `links`, `images`, `summary`, `json` — comma-separated for multi-format output. [docs](https://docs.scrapegraphai.com/api-reference/scrape)

### Usage

```bash
-just-scrape search-scraper "<prompt>"                    # AI-powered web search
-just-scrape search-scraper "<prompt>" --num-results <n>  # Sources to scrape (3-20, default 3)
-just-scrape search-scraper "<prompt>" --no-extraction    # Markdown only (2 credits vs 10)
-just-scrape search-scraper "<prompt>" --schema '<json>'  # Enforce output schema
-just-scrape search-scraper "<prompt>" --stealth --headers '<json>'
+just-scrape scrape <url>                               # markdown (default)
+just-scrape scrape <url> -f html                       # raw HTML
+just-scrape scrape <url> -f markdown,screenshot,links  # multi-format
+just-scrape scrape <url> -f json -p "<prompt>"         # AI extraction
+just-scrape scrape <url> -f json -p "..." --schema '<json-schema>'  # enforce schema
+just-scrape scrape <url> --html-mode reader            # readable-view cleanup
+just-scrape scrape <url> -m js --stealth               # JS rendering + anti-bot
+just-scrape scrape <url> --country us --scrolls 3      # geo + infinite scroll
```

### Examples

```bash
-# Research a topic across multiple sources
-just-scrape search-scraper "What are the best Python web frameworks in 2025?" --num-results 10
-
-# Get raw markdown from search results (cheaper)
-just-scrape search-scraper "React vs Vue comparison" --no-extraction --num-results 5
-
-# Structured output with schema
-just-scrape search-scraper "Top 5 cloud providers pricing" \
-  --schema '{"type":"object","properties":{"providers":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"free_tier":{"type":"string"}}}}}}'
-```
-
-## Markdownify
-
-Convert any webpage to clean markdown. 
[docs](https://docs.scrapegraphai.com/services/markdownify)
-
-### Usage
-
-```bash
-just-scrape markdownify <url>                     # Convert to markdown
-just-scrape markdownify <url> --stealth           # Anti-bot bypass (+4 credits)
-just-scrape markdownify <url> --headers '<json>'  # Custom headers
-```
-
-### Examples

```bash
-# Convert a blog post to markdown
-just-scrape markdownify https://blog.example.com/my-article
+# Replace legacy markdownify
+just-scrape scrape https://blog.example.com/post

-# Convert a JS-rendered page behind Cloudflare
-just-scrape markdownify https://protected.example.com --stealth
+# Replace legacy smart-scraper (AI extraction)
+just-scrape scrape https://store.example.com -f json \
+  -p "Extract product name, price, and rating"

-# Pipe markdown to a file
-just-scrape markdownify https://docs.example.com/api --json | jq -r '.result' > api-docs.md
+# Multi-format: markdown + screenshot in one request
+just-scrape scrape https://example.com -f markdown,screenshot

+# Structured extraction with schema
+just-scrape scrape https://news.example.com -f json \
+  -p "All articles" \
+  --schema '{"type":"object","properties":{"articles":{"type":"array","items":{"type":"object","properties":{"title":{"type":"string"},"date":{"type":"string"}}}}}}'
```

-## Crawl
+## Extract

-Crawl multiple pages and extract data from each. [docs](https://docs.scrapegraphai.com/services/smartcrawler)
+Structured data extraction from a URL using a prompt (and optional schema). [docs](https://docs.scrapegraphai.com/api-reference/extract)

### Usage

```bash
-just-scrape crawl <url> -p "<prompt>"                    # Crawl + extract
-just-scrape crawl <url> -p "<prompt>" --max-pages <n>    # Max pages (default 10)
-just-scrape crawl <url> -p "<prompt>" --depth <n>        # Crawl depth (default 1)
-just-scrape crawl <url> --no-extraction --max-pages <n>  # Markdown only (2 credits/page)
-just-scrape crawl <url> -p "<prompt>" --schema '<json>'  # Enforce output schema
-just-scrape crawl <url> -p "<prompt>" --rules '<json>'   # Crawl rules (include_paths, same_domain)
-just-scrape crawl <url> -p "<prompt>" --no-sitemap       # Skip sitemap discovery
-just-scrape crawl <url> -p "<prompt>" --stealth          # Anti-bot bypass
+just-scrape extract <url> -p "<prompt>"                    # AI extraction
+just-scrape extract <url> -p "<prompt>" --schema '<json-schema>'  # enforce schema
+just-scrape extract <url> -p "<prompt>" --html-mode reader # content-only mode
+just-scrape extract <url> -p "<prompt>" --stealth --scrolls 5
+just-scrape extract <url> -p "<prompt>" --cookies '<json>' --headers '<json>'
```

### Examples

```bash
-# Crawl a docs site and extract all code examples
-just-scrape crawl https://docs.example.com -p "Extract all code snippets with their language" \
-  --max-pages 20 --depth 3
-
-# Crawl only blog pages, skip everything else
-just-scrape crawl https://example.com -p "Extract article titles and summaries" \
-  --rules '{"include_paths":["/blog/*"],"same_domain":true}' --max-pages 50
-
-# Get raw markdown from all pages (no AI extraction, cheaper)
-just-scrape crawl https://example.com --no-extraction --max-pages 10
+# Extract product data
+just-scrape extract https://store.example.com/shoes \
+  -p "Extract all product names, prices, and ratings"
+
+# Behind a login (pass cookies)
+just-scrape extract https://app.example.com/dashboard \
+  -p "Extract user stats" \
+  --cookies '{"session":"abc123"}'
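+
+# Enforce a schema (illustrative — same JSON-schema string format as `scrape -f json --schema`)
+just-scrape extract https://news.example.com \
+  -p "All articles" \
+  --schema '{"type":"object","properties":{"articles":{"type":"array","items":{"type":"object","properties":{"title":{"type":"string"},"date":{"type":"string"}}}}}}'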
```

-## Sitemap
+## Search

-Get all URLs from a website's sitemap. [docs](https://docs.scrapegraphai.com/services/sitemap)
+Web search + AI extraction across multiple result pages. [docs](https://docs.scrapegraphai.com/api-reference/search)

### Usage

```bash
-just-scrape sitemap <url>
+just-scrape search "<query>"                     # default 3 results, markdown
+just-scrape search "<query>" --num-results 10    # 1-20 results
+just-scrape search "<query>" -p "<prompt>"       # extract across results
+just-scrape search "<query>" -p "..." --schema '<json-schema>'  # structured output
+just-scrape search "<query>" --country us --time-range past_week
+just-scrape search "<query>" --format html       # raw HTML per result
```

### Examples

```bash
-# List all pages on a site
-just-scrape sitemap https://example.com
+# Research a topic across multiple sources
+just-scrape search "best Python web frameworks 2025" --num-results 10

-# Pipe URLs to another tool
-just-scrape sitemap https://example.com --json | jq -r '.urls[]'
+# Structured comparison
+just-scrape search "Top 5 cloud providers pricing" \
+  -p "Extract provider name, free tier, and starting price" \
+  --schema '{"type":"object","properties":{"providers":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"free_tier":{"type":"string"},"price":{"type":"string"}}}}}}'
```

-## Scrape
+## Crawl

-Get raw HTML content from a URL. [docs](https://docs.scrapegraphai.com/services/scrape)
+Multi-page crawl starting from a URL. Polls until the job reaches a terminal state. [docs](https://docs.scrapegraphai.com/api-reference/crawl)

### Usage

```bash
-just-scrape scrape <url>                        # Raw HTML
-just-scrape scrape <url> --stealth              # Anti-bot bypass (+4 credits)
-just-scrape scrape <url> --branding             # Extract branding (+2 credits)
-just-scrape scrape <url> --country-code <code>  # Geo-targeting
+just-scrape crawl <url>                           # markdown, defaults (50 pages, depth 2)
+just-scrape crawl <url> -f markdown,links         # multi-format per page
+just-scrape crawl <url> --max-pages 200 --max-depth 4
+just-scrape crawl <url> --max-links-per-page 25
+just-scrape crawl <url> --allow-external          # follow cross-domain links
+just-scrape crawl <url> --include-patterns '["/blog/.*"]' --exclude-patterns '["/tag/.*"]'
+just-scrape crawl <url> -m js --stealth
```

### Examples

```bash
-# Get raw HTML of a page
-just-scrape scrape https://example.com
+# Crawl a docs site and collect all markdown
+just-scrape crawl https://docs.example.com --max-pages 100

-# Scrape a geo-restricted page with anti-bot bypass
-just-scrape scrape https://store.example.com --stealth --country-code DE
+# Only crawl blog posts
+just-scrape crawl https://example.com \
+  --include-patterns '["/blog/.*"]' --max-pages 50

-# Extract branding info (logos, colors, fonts)
-just-scrape scrape https://example.com --branding
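+
+# Inspect the terminal job object (status/finished/total fields, per src/commands/crawl.ts)
+just-scrape crawl https://example.com --json | jq '{status, finished, total}'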
```

-## Agentic Scraper
+## Monitor

-Browser automation with AI — login, click, navigate, fill forms. [docs](https://docs.scrapegraphai.com/services/agenticscraper)
+Create and manage page-change monitors. Each monitor re-scrapes a URL on a schedule and diffs the result against the previous tick. [docs](https://docs.scrapegraphai.com/api-reference/monitor)

-### Usage
+### Actions

```bash
-just-scrape agentic-scraper <url> -s "<steps>"                # Run browser steps
-just-scrape agentic-scraper <url> -s "<steps>" --ai-extraction -p "<prompt>"
-just-scrape agentic-scraper <url> -s "<steps>" --schema '<json>'
-just-scrape agentic-scraper <url> -s "<steps>" --use-session  # Persist browser session
+just-scrape monitor create --url <url> --interval <cron> [-f <formats>] [--webhook-url <url>]
+just-scrape monitor list                               # all monitors
+just-scrape monitor get --id <id>
+just-scrape monitor update --id <id> [--interval ...] [--name ...] [-f <formats>] [--webhook-url ...]
+just-scrape monitor delete --id <id>
+just-scrape monitor pause --id <id>
+just-scrape monitor resume --id <id>
+just-scrape monitor activity --id <id> [--limit <n>] [--cursor <cursor>]
```

### Examples

```bash
-# Log in and extract dashboard data
-just-scrape agentic-scraper https://app.example.com/login \
-  -s "Fill email with user@test.com,Fill password with secret,Click Sign In" \
-  --ai-extraction -p "Extract all dashboard metrics"
+# Monitor a price page hourly, webhook on change
+just-scrape monitor create \
+  --url https://store.example.com/product/42 \
+  --interval "0 * * * *" \
+  -f markdown,screenshot \
+  --webhook-url https://hooks.example.com/price-change

-# Navigate through a multi-step form
-just-scrape agentic-scraper https://example.com/wizard \
-  -s "Click Next,Select Premium plan,Fill name with John,Click Submit"
+# List active monitors
+just-scrape monitor list

-# Persistent session across multiple runs
-just-scrape agentic-scraper https://app.example.com \
-  -s "Click Settings" --use-session
-```
-
-## Generate Schema
-
-Generate a JSON schema from a natural language description.
-
-### Usage
-
-```bash
-just-scrape generate-schema "<description>"     # AI generates a schema
-just-scrape generate-schema "<description>" --existing-schema '<json>'
-```
+# Pause a monitor temporarily
+just-scrape monitor pause --id <id>

-### Examples
-
-```bash
-# Generate a schema for product data
-just-scrape generate-schema "E-commerce product with name, price, ratings, and reviews array"
+# Paginate tick history
+just-scrape monitor activity --id <id> --limit 20

-# Refine an existing schema
-just-scrape generate-schema "Add an availability field" \
-  --existing-schema '{"type":"object","properties":{"name":{"type":"string"},"price":{"type":"number"}}}'
```

## History

-Browse request history for any service. Interactive by default — arrow keys to navigate, select to view details, "Load more" for infinite scroll.
+Browse request history across all services. Interactive by default — arrow keys to navigate, select to view details, "Load more" for pagination. [docs](https://docs.scrapegraphai.com/api-reference/history)

### Usage

```bash
-just-scrape history <service>                  # Interactive browser
-just-scrape history <service> <request-id>     # Fetch specific request
-just-scrape history <service> --page <n>       # Start from page (default 1)
-just-scrape history <service> --page-size <n>  # Results per page (default 10, max 100)
-just-scrape history <service> --json           # Raw JSON (pipeable)
+just-scrape history                            # all services, interactive
+just-scrape history <service>                  # scrape | extract | search | monitor | crawl
+just-scrape history <service> <request-id>     # fetch specific request
+just-scrape history <service> --page <n> --page-size <n>
+just-scrape history --json                     # raw JSON (pipeable)
```

-Services: `markdownify`, `smartscraper`, `searchscraper`, `scrape`, `crawl`, `agentic-scraper`, `sitemap`
-
### Examples

```bash
-# Browse your smart-scraper history interactively
-just-scrape history smartscraper
+# Browse scrape history interactively
+just-scrape history scrape

-# Jump to a specific request by ID
-just-scrape history smartscraper abc123-def456-7890
+# Fetch a specific request by id
+just-scrape history scrape 550e8400-e29b-41d4-a716-446655440000

-# Export crawl history as JSON
-just-scrape history crawl --json --page-size 100 | jq '.requests[] | {id: .request_id, status}'
+# Export crawl history
+just-scrape history crawl --json --page-size 100 | jq '.[].id'
```

## Credits

-Check your credit balance.
+Check your credit balance and per-job quotas (crawl, monitor).

```bash
just-scrape credits
-just-scrape credits --json | jq '.remaining_credits'
+just-scrape credits --json | jq '.remaining'
+just-scrape credits --json | jq '.jobs.monitor'
```

## Validate

-Validate your API key (health check).
+Validate your API key (calls the SDK's `checkHealth` / `/health` endpoint).

```bash
just-scrape validate
@@ -369,6 +298,25 @@ just-scrape validate

---

+## Migration from v0.x (v1 API) to v1.0 (v2 API)
+
+The v2 API consolidates and renames endpoints. The CLI now reflects that:
+
+| v0.x command | v1.0 equivalent |
+|---|---|
+| `smart-scraper <url> -p "..."` | `scrape <url> -f json -p "..."` (or `extract <url> -p "..."`) |
+| `markdownify <url>` | `scrape <url>` (markdown is the default format) |
+| `search-scraper "..."` | `search "..."` (query now positional, `-p` for extraction) |
+| `scrape <url>` (raw HTML) | `scrape <url> -f html` |
+| `crawl <url> -p "..."` | `crawl <url> -f markdown` (crawl no longer runs AI extraction; run `extract`/`scrape` on the pages) |
+| `sitemap <url>` | Removed (fetch sitemap XML directly, or use `crawl` with `--include-patterns`) |
+| `agentic-scraper` | Removed |
+| `generate-schema` | Removed |
+
+`credits` and `validate` work the same; response shapes changed (see SDK v2 types).
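+
+Side by side, a typical v0.x call and its v1.0 rewrite (illustrative commands built from the table above):
+
+```bash
+# v0.x
+just-scrape smart-scraper https://store.example.com -p "Extract product names and prices"
+
+# v1.0
+just-scrape scrape https://store.example.com -f json -p "Extract product names and prices"
+```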
+
+---
+
## Contributing

### From Source
@@ -392,7 +340,7 @@ bun run dev --help

| CLI Framework | **citty** (unjs) |
| Prompts | **@clack/prompts** |
| Styling | **chalk** v5 (ESM) |
-| SDK | **scrapegraph-js** |
+| SDK | **scrapegraph-js** v2 |
| Env | **dotenv** |
| Lint / Format | **Biome** |
| Target | **Node.js 22+**, ESM-only |

@@ -405,6 +353,7 @@ bun run build # Bundle ESM to dist/cli.mjs
bun run lint # Lint + format check
bun run format # Auto-format
bun run check # Type-check + lint
+bun test # Run unit tests
```

## License
diff --git a/bun.lock b/bun.lock
index 5a7bd89..5b4c862 100644
--- a/bun.lock
+++ b/bun.lock
@@ -9,7 +9,7 @@
      "chalk": "^5.4.1",
      "citty": "^0.1.6",
      "dotenv": "^17.2.4",
-      "scrapegraph-js": "^1.0.0",
+      "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#096c110",
    },
    "devDependencies": {
      "@biomejs/biome": "^1.9.4",
@@ -229,7 +229,7 @@
    "rollup": ["rollup@4.57.1", "", { "dependencies": { "@types/estree": "1.0.8" }, "optionalDependencies": { "@rollup/rollup-android-arm-eabi": "4.57.1", "@rollup/rollup-android-arm64": "4.57.1", "@rollup/rollup-darwin-arm64": "4.57.1", "@rollup/rollup-darwin-x64": "4.57.1", "@rollup/rollup-freebsd-arm64": "4.57.1", "@rollup/rollup-freebsd-x64": "4.57.1", "@rollup/rollup-linux-arm-gnueabihf": "4.57.1", "@rollup/rollup-linux-arm-musleabihf": "4.57.1", "@rollup/rollup-linux-arm64-gnu": "4.57.1", "@rollup/rollup-linux-arm64-musl": "4.57.1", "@rollup/rollup-linux-loong64-gnu": "4.57.1", "@rollup/rollup-linux-loong64-musl": "4.57.1", "@rollup/rollup-linux-ppc64-gnu": "4.57.1", "@rollup/rollup-linux-ppc64-musl": "4.57.1", "@rollup/rollup-linux-riscv64-gnu": "4.57.1", "@rollup/rollup-linux-riscv64-musl": "4.57.1", "@rollup/rollup-linux-s390x-gnu": "4.57.1", "@rollup/rollup-linux-x64-gnu": "4.57.1", "@rollup/rollup-linux-x64-musl": "4.57.1", "@rollup/rollup-openbsd-x64": "4.57.1", "@rollup/rollup-openharmony-arm64": "4.57.1", "@rollup/rollup-win32-arm64-msvc": "4.57.1", "@rollup/rollup-win32-ia32-msvc": "4.57.1", "@rollup/rollup-win32-x64-gnu": "4.57.1", "@rollup/rollup-win32-x64-msvc": "4.57.1", "fsevents": "~2.3.2" }, "bin": { "rollup": "dist/bin/rollup" } }, "sha512-oQL6lgK3e2QZeQ7gcgIkS2YZPg5slw37hYufJ3edKlfQSGGm8ICoxswK15ntSzF/a8+h7ekRy7k7oWc3BQ7y8A=="],

-    "scrapegraph-js": ["scrapegraph-js@1.0.0", "", {}, 
"sha512-eQn8/HRfJHjCoj2yia5yHWQTYUae/bYNhLEx00ZXF+GLKpgUJT0OCGUQM13WGSX5cgw9onz5EiaDJDbzcbeYtQ=="], + "scrapegraph-js": ["scrapegraph-js@github:ScrapeGraphAI/scrapegraph-js#096c110", { "dependencies": { "zod": "^4.3.6" } }, "ScrapeGraphAI-scrapegraph-js-096c110"], "sisteransi": ["sisteransi@1.0.5", "", {}, "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg=="], @@ -256,5 +256,7 @@ "ufo": ["ufo@1.6.3", "", {}, "sha512-yDJTmhydvl5lJzBmy/hyOAA0d+aqCBuwl818haVdYCRrWV84o7YyeVm4QlVHStqNrrJSTb6jKuFAVqAFsr+K3Q=="], "undici-types": ["undici-types@6.21.0", "", {}, "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ=="], + + "zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="], } } diff --git a/package.json b/package.json index 55c9e7b..c3d5793 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "just-scrape", - "version": "0.2.1", + "version": "1.0.0", "description": "ScrapeGraph AI CLI tool", "type": "module", "main": "dist/cli.mjs", @@ -28,7 +28,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": "^17.2.4", - "scrapegraph-js": "^1.0.0" + "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#096c110" }, "devDependencies": { "@biomejs/biome": "^1.9.4", diff --git a/src/cli.ts b/src/cli.ts index 483a94c..ce66fb0 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -12,14 +12,11 @@ const main = defineCommand({ description: "ScrapeGraph AI CLI tool", }, subCommands: { - "smart-scraper": () => import("./commands/smart-scraper.js").then((m) => m.default), - "search-scraper": () => import("./commands/search-scraper.js").then((m) => m.default), - markdownify: () => import("./commands/markdownify.js").then((m) => m.default), - crawl: () => import("./commands/crawl.js").then((m) => m.default), - sitemap: () => import("./commands/sitemap.js").then((m) => m.default), scrape: () => import("./commands/scrape.js").then((m) => m.default), - "agentic-scraper": () => import("./commands/agentic-scraper.js").then((m) => m.default), - "generate-schema": () => import("./commands/generate-schema.js").then((m) => m.default), + extract: () => import("./commands/extract.js").then((m) => m.default), + search: () => import("./commands/search.js").then((m) => m.default), + crawl: () => import("./commands/crawl.js").then((m) => m.default), + monitor: () => import("./commands/monitor.js").then((m) => m.default), history: () => import("./commands/history.js").then((m) => m.default), credits: () => import("./commands/credits.js").then((m) => m.default), validate: () => import("./commands/validate.js").then((m) => m.default), diff --git a/src/commands/agentic-scraper.ts b/src/commands/agentic-scraper.ts deleted file mode 100644 index 67e9b5b..0000000 --- a/src/commands/agentic-scraper.ts +++ /dev/null @@ -1,51 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "agentic-scraper", - description: "Browser automation with AI (login, click, navigate, fill forms)", - }, - args: { - url: { - type: "positional", - description: "Starting URL", - required: true, - }, - steps: { - type: "string", - alias: "s", - description: 'Comma-separated browser steps (e.g. 
"Click login,Fill email with x")', - }, - prompt: { - type: "string", - alias: "p", - description: "Extraction prompt (used with --ai-extraction)", - }, - schema: { type: "string", description: "Output JSON schema (as JSON string)" }, - "ai-extraction": { type: "boolean", description: "Enable AI extraction after steps" }, - "use-session": { type: "boolean", description: "Persist browser session across requests" }, - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/agenticscraper"); - const key = await resolveApiKey(!!args.json); - - const steps = args.steps ? args.steps.split(",").map((s) => s.trim()) : []; - const params: scrapegraphai.AgenticScraperParams = { url: args.url, steps }; - if (args.prompt) params.user_prompt = args.prompt; - if (args.schema) params.output_schema = JSON.parse(args.schema); - if (args["ai-extraction"]) params.ai_extraction = true; - if (args["use-session"]) params.use_session = true; - - out.start("Running browser automation"); - const result = await scrapegraphai.agenticScraper(key, params); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/commands/crawl.ts b/src/commands/crawl.ts index d55101d..45036dd 100644 --- a/src/commands/crawl.ts +++ b/src/commands/crawl.ts @@ -1,62 +1,115 @@ import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; +import { crawl } from "scrapegraph-js"; +import type { ApiCrawlRequest, ApiScrapeFormatEntry } from "scrapegraph-js"; import { resolveApiKey } from "../lib/folders.js"; import * as log from "../lib/log.js"; +const FORMATS = [ + "markdown", + "html", + "screenshot", + "branding", + "links", + "images", + "summary", +] as const; +type Format = (typeof FORMATS)[number]; + +const POLL_INTERVAL_MS = 3000; + +function buildFormat(f: Format): ApiScrapeFormatEntry { + if (f === "markdown" || f === "html") return { type: f, mode: "normal" }; + return { type: f } as ApiScrapeFormatEntry; +} + export default defineCommand({ meta: { name: "crawl", - description: "Crawl and extract data from multiple pages", + description: "Crawl pages starting from a URL", }, args: { url: { type: "positional", - description: "Starting URL to crawl", + description: "Starting URL", required: true, }, - prompt: { + format: { + type: "string", + alias: "f", + description: `Per-page format(s), comma-separated: ${FORMATS.join(", ")} (default: markdown)`, + }, + "max-pages": { type: "string", description: "Maximum pages to crawl (default 50, max 1000)" }, + "max-depth": { type: "string", description: "Crawl depth (default 2)" }, + "max-links-per-page": { type: "string", description: "Max links per page (default 10)" }, + "allow-external": { type: "boolean", description: "Allow crawling external domains" }, + "include-patterns": { type: "string", - alias: "p", - description: "Extraction prompt (required when extraction mode is on)", + description: "JSON array of regex patterns to include", }, - "no-extraction": { - type: "boolean", - description: "Return markdown only (2 credits/page instead of 10)", + "exclude-patterns": { + type: "string", + description: "JSON array of regex patterns to exclude", }, - "max-pages": { type: "string", description: "Maximum pages to crawl (default 10)" }, - depth: { type: "string", description: "Crawl depth (default 1)" }, - schema: { type: "string", description: 
"Output JSON schema (as JSON string)" }, - rules: { type: "string", description: "Crawl rules as JSON object string" }, - "no-sitemap": { type: "boolean", description: "Disable sitemap-based URL discovery" }, - stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, + mode: { type: "string", alias: "m", description: "Fetch mode: auto (default), fast, js" }, + stealth: { type: "boolean", description: "Enable stealth mode" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, run: async ({ args }) => { const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/smartcrawler"); - const key = await resolveApiKey(!!args.json); - - const base: Record = { url: args.url }; - if (args["max-pages"]) base.max_pages = Number(args["max-pages"]); - if (args.depth) base.depth = Number(args.depth); - if (args.rules) base.rules = JSON.parse(args.rules); - if (args["no-sitemap"]) base.sitemap = false; - if (args.stealth) base.stealth = true; - - if (args["no-extraction"]) { - base.extraction_mode = false; - } else { - if (args.prompt) base.prompt = args.prompt; - if (args.schema) base.schema = JSON.parse(args.schema); + out.docs("https://docs.scrapegraphai.com/api-reference/crawl"); + const apiKey = await resolveApiKey(!!args.json); + + const requested = (args.format ?? "markdown") + .split(",") + .map((f) => f.trim()) + .filter(Boolean); + for (const f of requested) { + if (!FORMATS.includes(f as Format)) + out.error(`Unknown format: ${f}. Valid: ${FORMATS.join(", ")}`); } + const formats = requested.map((f) => buildFormat(f as Format)); - const params = base as scrapegraphai.CrawlParams; + const params: ApiCrawlRequest = { url: args.url, formats }; + const mut = params as Record; + if (args["max-pages"]) mut.maxPages = Number(args["max-pages"]); + if (args["max-depth"]) mut.maxDepth = Number(args["max-depth"]); + if (args["max-links-per-page"]) mut.maxLinksPerPage = Number(args["max-links-per-page"]); + if (args["allow-external"]) mut.allowExternal = true; + if (args["include-patterns"]) mut.includePatterns = JSON.parse(args["include-patterns"]); + if (args["exclude-patterns"]) mut.excludePatterns = JSON.parse(args["exclude-patterns"]); - out.start("Crawling"); - const result = await scrapegraphai.crawl(key, params, out.poll); - out.stop(result.elapsedMs); + const fetchConfig: Record = {}; + if (args.mode) fetchConfig.mode = args.mode; + if (args.stealth) fetchConfig.stealth = true; + if (Object.keys(fetchConfig).length > 0) mut.fetchConfig = fetchConfig; - if (result.data) out.result(result.data); - else out.error(result.error); + out.start("Starting crawl"); + const job = await crawl.start(apiKey, params); + if (!job.data) { + out.error(job.error); + return; + } + const jobId = job.data.id; + let totalElapsed = job.elapsedMs; + + while (true) { + await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS)); + const status = await crawl.get(apiKey, jobId); + totalElapsed += status.elapsedMs; + if (!status.data) { + out.error(status.error); + return; + } + out.poll(`${status.data.status} (${status.data.finished}/${status.data.total})`); + if ( + status.data.status === "completed" || + status.data.status === "failed" || + status.data.status === "deleted" + ) { + out.stop(totalElapsed); + out.result(status.data); + return; + } + } }, }); diff --git a/src/commands/credits.ts b/src/commands/credits.ts index 0d7b75f..a02a506 100644 --- a/src/commands/credits.ts +++ b/src/commands/credits.ts @@ -1,5 +1,5 @@ import { defineCommand } from 
"citty"; -import * as scrapegraphai from "scrapegraph-js"; +import { getCredits } from "scrapegraph-js"; import { resolveApiKey } from "../lib/folders.js"; import * as log from "../lib/log.js"; @@ -13,10 +13,10 @@ export default defineCommand({ }, run: async ({ args }) => { const out = log.create(!!args.json); - const key = await resolveApiKey(!!args.json); + const apiKey = await resolveApiKey(!!args.json); out.start("Fetching credits"); - const result = await scrapegraphai.getCredits(key); + const result = await getCredits(apiKey); out.stop(result.elapsedMs); if (result.data) out.result(result.data); diff --git a/src/commands/extract.ts b/src/commands/extract.ts new file mode 100644 index 0000000..0f116fe --- /dev/null +++ b/src/commands/extract.ts @@ -0,0 +1,63 @@ +import { defineCommand } from "citty"; +import { extract } from "scrapegraph-js"; +import type { ApiExtractRequest } from "scrapegraph-js"; +import { resolveApiKey } from "../lib/folders.js"; +import * as log from "../lib/log.js"; + +export default defineCommand({ + meta: { + name: "extract", + description: "Extract structured data from a URL using AI", + }, + args: { + url: { + type: "positional", + description: "Website URL to extract from", + required: true, + }, + prompt: { + type: "string", + alias: "p", + description: "Extraction prompt", + required: true, + }, + schema: { type: "string", description: "Output JSON schema (JSON string)" }, + mode: { type: "string", description: "Fetch mode: auto (default), fast, js" }, + "html-mode": { + type: "string", + description: "HTML processing mode: normal (default), reader, prune", + }, + stealth: { type: "boolean", description: "Enable stealth mode" }, + scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" }, + cookies: { type: "string", description: "Cookies as JSON object string" }, + headers: { type: "string", description: "Custom headers as JSON object string" }, + country: { type: "string", description: "ISO country code for geo-targeting" }, + json: { type: "boolean", description: "Output raw JSON (pipeable)" }, + }, + run: async ({ args }) => { + const out = log.create(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/extract"); + const apiKey = await resolveApiKey(!!args.json); + + const fetchConfig: Record = {}; + if (args.mode) fetchConfig.mode = args.mode; + if (args.stealth) fetchConfig.stealth = true; + if (args.scrolls) fetchConfig.scrolls = Number(args.scrolls); + if (args.cookies) fetchConfig.cookies = JSON.parse(args.cookies); + if (args.headers) fetchConfig.headers = JSON.parse(args.headers); + if (args.country) fetchConfig.country = args.country; + + const params: ApiExtractRequest = { url: args.url, prompt: args.prompt }; + if (args.schema) (params as Record).schema = JSON.parse(args.schema); + if (args["html-mode"]) (params as Record).mode = args["html-mode"]; + if (Object.keys(fetchConfig).length > 0) + (params as Record).fetchConfig = fetchConfig; + + out.start("Extracting"); + const result = await extract(apiKey, params); + out.stop(result.elapsedMs); + + if (result.data) out.result(result.data); + else out.error(result.error); + }, +}); diff --git a/src/commands/generate-schema.ts b/src/commands/generate-schema.ts deleted file mode 100644 index 8d77e57..0000000 --- a/src/commands/generate-schema.ts +++ /dev/null @@ -1,37 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - 
-export default defineCommand({
-  meta: {
-    name: "generate-schema",
-    description: "Generate a JSON schema from a natural language prompt",
-  },
-  args: {
-    prompt: {
-      type: "positional",
-      description: "Describe the schema you need",
-      required: true,
-    },
-    "existing-schema": {
-      type: "string",
-      description: "Existing schema to modify (as JSON string)",
-    },
-    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
-  },
-  run: async ({ args }) => {
-    const out = log.create(!!args.json);
-    const key = await resolveApiKey(!!args.json);
-
-    const params: scrapegraphai.GenerateSchemaParams = { user_prompt: args.prompt };
-    if (args["existing-schema"]) params.existing_schema = JSON.parse(args["existing-schema"]);
-
-    out.start("Generating schema");
-    const result = await scrapegraphai.generateSchema(key, params);
-    out.stop(result.elapsedMs);
-
-    if (result.data) out.result(result.data);
-    else out.error(result.error);
-  },
-});
diff --git a/src/commands/history.ts b/src/commands/history.ts
index 99ab59e..97c8a76 100644
--- a/src/commands/history.ts
+++ b/src/commands/history.ts
@@ -1,86 +1,89 @@
import * as p from "@clack/prompts";
import chalk from "chalk";
import { defineCommand } from "citty";
-import { HISTORY_SERVICES } from "scrapegraph-js";
-import * as scrapegraphai from "scrapegraph-js";
+import { history } from "scrapegraph-js";
+import type { ApiHistoryEntry, ApiHistoryService } from "scrapegraph-js";
import { resolveApiKey } from "../lib/folders.js";
import * as log from "../lib/log.js";

-const VALID = HISTORY_SERVICES.join(", ");
+const SERVICES = ["scrape", "extract", "search", "monitor", "crawl"] as const;
+const VALID = SERVICES.join(", ");
const LOAD_MORE = "__load_more__";

-function getId(row: Record<string, unknown>): string {
-  return String(row.request_id ?? row.crawl_id ?? row.id ?? "unknown");
+function entryUrl(e: ApiHistoryEntry): string {
+  const params = e.params as Record<string, unknown>;
+  return String(params.url ?? params.query ?? "");
}

-function label(row: Record<string, unknown>): string {
-  const id = getId(row);
-  const short = id.length > 12 ? `${id.slice(0, 12)}…` : id;
-  const status = String(row.status ?? "—");
-  const url = String(row.website_url ?? row.url ?? row.user_prompt ?? "");
+function entryLabel(e: ApiHistoryEntry): string {
+  const short = e.id.length > 12 ? `${e.id.slice(0, 12)}…` : e.id;
+  const url = entryUrl(e);
  const urlShort = url.length > 50 ? `${url.slice(0, 49)}…` : url;
-  const color =
-    status === "completed" || status === "done"
-      ? chalk.green
-      : status === "failed"
-        ? chalk.red
-        : chalk.yellow;
-
-  return `${chalk.dim(short)} ${color(status)} ${urlShort}`;
+  const color =
+    e.status === "completed" ? chalk.green : e.status === "failed" ? chalk.red : chalk.yellow;
+  return `${chalk.dim(short)} ${color(e.status)} ${urlShort}`;
}

-function hint(row: Record<string, unknown>): string {
-  const ts = row.created_at ?? row.timestamp ?? row.updated_at;
-  if (!ts) return "";
-  const d = new Date(String(ts));
-  return Number.isNaN(d.getTime()) ? String(ts) : d.toLocaleString();
+function entryHint(e: ApiHistoryEntry): string {
+  if (!e.createdAt) return "";
+  const d = new Date(e.createdAt);
+  return Number.isNaN(d.getTime()) ? e.createdAt : d.toLocaleString();
}

export default defineCommand({
  meta: {
    name: "history",
-    description: "View request history for a service",
+    description: "View request history",
  },
  args: {
    service: {
      type: "positional",
-      description: `Service name (${VALID})`,
-      required: true,
+      description: `Service (optional): ${VALID}`,
+      required: false,
    },
    page: { type: "string", description: "Page number (default: 1)" },
-    "page-size": { type: "string", description: "Results per page (default: 10, max: 100)" },
+    "page-size": { type: "string", description: "Results per page (default: 20, max: 100)" },
    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
  },
  run: async ({ args }) => {
    const quiet = !!args.json;
    const out = log.create(quiet);
-    const key = await resolveApiKey(quiet);
-    const service = args.service as scrapegraphai.HistoryParams["service"];
+    const apiKey = await resolveApiKey(quiet);
+    const service = args.service as ApiHistoryService | undefined;
+    if (service && !SERVICES.includes(service)) out.error(`Invalid service. Valid: ${VALID}`);
    const requestId = (args as { _: string[] })._.at(1);
-    const pageSize = args["page-size"] ? Number(args["page-size"]) : 10;
+    const limit = args["page-size"] ? Number(args["page-size"]) : 20;
    let page = args.page ? Number(args.page) : 1;

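+    // v2 history responses are shaped { data, pagination }; hasMore is derived from pagination.total (v1 used next_key).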
    const fetchPage = async (pg: number) => {
-      const r = await scrapegraphai.history(key, { service, page: pg, page_size: pageSize });
-      if (r.status === "error") out.error(r.error);
-      const d = r.data as { requests: Record<string, unknown>[]; next_key?: string };
-      return { rows: d.requests ?? [], hasMore: !!d.next_key, ms: r.elapsedMs };
+      const r = await history.list(apiKey, {
+        page: pg,
+        limit,
+        ...(service ? { service } : {}),
+      });
+      if (!r.data) out.error(r.error);
+      const d = r.data as { data: ApiHistoryEntry[]; pagination: { total: number } };
+      return {
+        rows: d.data ?? [],
+        hasMore: (d.pagination?.total ?? 0) > pg * limit,
+        ms: r.elapsedMs,
+      };
    };

-    if (quiet || requestId) {
+    if (requestId) {
+      const result = await history.get(apiKey, requestId);
+      if (result.data) out.result(result.data);
+      else out.error(result.error);
+      return;
+    }
+
+    if (quiet) {
      const { rows } = await fetchPage(page);
-      if (requestId) {
-        const match = rows.find((r) => getId(r) === requestId);
-        if (!match) out.error(`Request ${requestId} not found on page ${page}`);
-        out.result(match);
-        return;
-      }
      out.result(rows);
      return;
    }

-    out.start(`Fetching ${service} history`);
+    out.start(`Fetching ${service ?? "all"} history`);
    const first = await fetchPage(page);
    out.stop(first.ms);

@@ -94,9 +97,9 @@ export default defineCommand({

    while (true) {
      const options = allRows.map((row) => ({
-        value: getId(row),
-        label: label(row),
-        hint: hint(row),
+        value: row.id,
+        label: entryLabel(row),
+        hint: entryHint(row),
      }));

      if (hasMore) {
@@ -136,7 +139,7 @@ export default defineCommand({
        continue;
      }

-      const match = allRows.find((r) => getId(r) === selected);
+      const match = allRows.find((r) => r.id === selected);
      if (match) out.result(match);

      const back = await p.confirm({ message: "Back to list?" 
}); diff --git a/src/commands/markdownify.ts b/src/commands/markdownify.ts deleted file mode 100644 index ccfc494..0000000 --- a/src/commands/markdownify.ts +++ /dev/null @@ -1,40 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "markdownify", - description: "Convert a webpage to clean markdown", - }, - args: { - url: { - type: "positional", - description: "Website URL to convert", - required: true, - }, - stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, - headers: { type: "string", description: "Custom headers as JSON object string" }, - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/markdownify"); - const key = await resolveApiKey(!!args.json); - - const params: scrapegraphai.MarkdownifyParams = { - website_url: args.url, - }; - - if (args.stealth) params.stealth = true; - if (args.headers) params.headers = JSON.parse(args.headers); - - out.start("Converting to markdown"); - const result = await scrapegraphai.markdownify(key, params); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/commands/monitor.ts b/src/commands/monitor.ts new file mode 100644 index 0000000..fbdfe6b --- /dev/null +++ b/src/commands/monitor.ts @@ -0,0 +1,206 @@ +import * as p from "@clack/prompts"; +import chalk from "chalk"; +import { defineCommand } from "citty"; +import { monitor } from "scrapegraph-js"; +import type { + ApiMonitorCreateInput, + ApiMonitorUpdateInput, + ApiScrapeFormatEntry, +} from "scrapegraph-js"; +import { resolveApiKey } from "../lib/folders.js"; +import * as log from "../lib/log.js"; + +const ACTIONS = [ + "create", + "list", + "get", + "update", + "delete", + "pause", + "resume", + "activity", +] as const; +type Action = (typeof ACTIONS)[number]; + +const FORMATS = [ + "markdown", + "html", + "screenshot", + "branding", + "links", + "images", + "summary", +] as const; + +function buildFormats(raw: string): ApiScrapeFormatEntry[] { + return raw + .split(",") + .map((f) => f.trim()) + .filter(Boolean) + .map((f) => { + if (f === "markdown" || f === "html") return { type: f, mode: "normal" as const }; + return { type: f } as ApiScrapeFormatEntry; + }); +} + +export default defineCommand({ + meta: { + name: "monitor", + description: `Manage page-change monitors (${ACTIONS.join(", ")})`, + }, + args: { + action: { + type: "positional", + description: `Action: ${ACTIONS.join(", ")}`, + required: true, + }, + id: { type: "string", description: "Monitor ID (cronId)" }, + url: { type: "string", description: "URL to monitor (create)" }, + name: { type: "string", description: "Monitor name" }, + interval: { + type: "string", + description: "Cron expression or shorthand (e.g. 
'0 * * * *', '1h') — required for create",
+    },
+    format: {
+      type: "string",
+      alias: "f",
+      description: `Formats to track, comma-separated: ${FORMATS.join(", ")} (default: markdown)`,
+    },
+    "webhook-url": { type: "string", description: "Webhook URL for change notifications" },
+    mode: { type: "string", alias: "m", description: "Fetch mode: auto (default), fast, js" },
+    stealth: { type: "boolean", description: "Enable stealth mode" },
+    limit: { type: "string", description: "Ticks per page for activity (max 100)" },
+    cursor: { type: "string", description: "Pagination cursor for activity" },
+    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
+  },
+  run: async ({ args }) => {
+    const out = log.create(!!args.json);
+    out.docs("https://docs.scrapegraphai.com/api-reference/monitor");
+    const apiKey = await resolveApiKey(!!args.json);
+    const action = args.action as Action;
+
+    if (!ACTIONS.includes(action)) {
+      out.error(`Unknown action: ${action}. Valid: ${ACTIONS.join(", ")}`);
+      return;
+    }
+
+    const needsId: Action[] = ["get", "update", "delete", "pause", "resume", "activity"];
+    if (needsId.includes(action) && !args.id) {
+      out.error(`--id is required for ${action}`);
+      return;
+    }
+
+    const fetchConfig: Record<string, unknown> = {};
+    if (args.mode) fetchConfig.mode = args.mode;
+    if (args.stealth) fetchConfig.stealth = true;
+
+    switch (action) {
+      case "create": {
+        if (!args.url) return out.error("--url is required for create");
+        if (!args.interval) return out.error("--interval is required for create");
+
+        const params: ApiMonitorCreateInput = {
+          url: args.url,
+          interval: args.interval,
+          formats: buildFormats(args.format ?? "markdown"),
+        };
+        const mut = params as Record<string, unknown>;
+        if (args.name) mut.name = args.name;
+        if (args["webhook-url"]) mut.webhookUrl = args["webhook-url"];
+        if (Object.keys(fetchConfig).length > 0) mut.fetchConfig = fetchConfig;
+
+        out.start("Creating monitor");
+        const result = await monitor.create(apiKey, params);
+        out.stop(result.elapsedMs);
+        if (result.data) out.result(result.data);
+        else out.error(result.error);
+        return;
+      }
+
+      case "list": {
+        out.start("Fetching monitors");
+        const result = await monitor.list(apiKey);
+        out.stop(result.elapsedMs);
+        if (!result.data) return out.error(result.error);
+        if (args.json) return out.result(result.data);
+        if (result.data.length === 0) {
+          p.log.warning("No monitors found.");
+          return;
+        }
+        for (const m of result.data) {
+          const color = m.status === "active" ? 
chalk.green : chalk.yellow;
+          p.log.info(
+            `${chalk.dim(m.cronId)} ${color(m.status)} ${m.config.url} ${chalk.dim(m.interval)}`,
+          );
+        }
+        return;
+      }
+
+      case "get": {
+        out.start("Fetching monitor");
+        const result = await monitor.get(apiKey, args.id as string);
+        out.stop(result.elapsedMs);
+        if (result.data) out.result(result.data);
+        else out.error(result.error);
+        return;
+      }
+
+      case "update": {
+        const params: ApiMonitorUpdateInput = {};
+        const mut = params as Record<string, unknown>;
+        if (args.name) mut.name = args.name;
+        if (args.interval) mut.interval = args.interval;
+        if (args["webhook-url"]) mut.webhookUrl = args["webhook-url"];
+        if (args.format) mut.formats = buildFormats(args.format);
+        if (Object.keys(fetchConfig).length > 0) mut.fetchConfig = fetchConfig;
+
+        out.start("Updating monitor");
+        const result = await monitor.update(apiKey, args.id as string, params);
+        out.stop(result.elapsedMs);
+        if (result.data) out.result(result.data);
+        else out.error(result.error);
+        return;
+      }
+
+      case "delete": {
+        out.start("Deleting monitor");
+        const result = await monitor.delete(apiKey, args.id as string);
+        out.stop(result.elapsedMs);
+        if (result.data) out.result(result.data);
+        else out.error(result.error);
+        return;
+      }
+
+      case "pause": {
+        out.start("Pausing monitor");
+        const result = await monitor.pause(apiKey, args.id as string);
+        out.stop(result.elapsedMs);
+        if (result.data) out.result(result.data);
+        else out.error(result.error);
+        return;
+      }
+
+      case "resume": {
+        out.start("Resuming monitor");
+        const result = await monitor.resume(apiKey, args.id as string);
+        out.stop(result.elapsedMs);
+        if (result.data) out.result(result.data);
+        else out.error(result.error);
+        return;
+      }
+
+      case "activity": {
+        const qp: { limit?: number; cursor?: string } = {};
+        if (args.limit) qp.limit = Number(args.limit);
+        if (args.cursor) qp.cursor = args.cursor;
+
+        out.start("Fetching monitor activity");
+        const result = await monitor.activity(apiKey, args.id as string, qp);
+        out.stop(result.elapsedMs);
+        if (result.data) out.result(result.data);
+        else out.error(result.error);
+        return;
+      }
+    }
+  },
+});
diff --git a/src/commands/scrape.ts b/src/commands/scrape.ts
index b0517eb..6ce0106 100644
--- a/src/commands/scrape.ts
+++ b/src/commands/scrape.ts
@@ -1,12 +1,25 @@
import { defineCommand } from "citty";
-import * as scrapegraphai from "scrapegraph-js";
+import { scrape } from "scrapegraph-js";
+import type { ApiScrapeFormatEntry, ApiScrapeRequest } from "scrapegraph-js";
import { resolveApiKey } from "../lib/folders.js";
import * as log from "../lib/log.js";

+const FORMATS = [
+  "markdown",
+  "html",
+  "screenshot",
+  "branding",
+  "links",
+  "images",
+  "summary",
+  "json",
+] as const;
+type Format = (typeof FORMATS)[number];
+
export default defineCommand({
  meta: {
    name: "scrape",
-    description: "Get raw HTML content from a URL",
+    description: `Scrape a URL (formats: ${FORMATS.join(", ")})`,
  },
  args: {
    url: {
@@ -14,24 +27,89 @@ export default defineCommand({
      description: "Website URL to scrape",
      required: true,
    },
+    format: {
+      type: "string",
+      alias: "f",
+      description: `Output format(s), comma-separated: ${FORMATS.join(", ")} (default: markdown)`,
+    },
+    prompt: {
+      type: "string",
+      alias: "p",
+      description: "Prompt (required 
when format includes json)", + }, + schema: { type: "string", description: "JSON schema for json format (JSON string)" }, + "html-mode": { + type: "string", + description: "HTML/markdown extraction mode: normal (default), reader, prune", + }, + mode: { type: "string", alias: "m", description: "Fetch mode: auto (default), fast, js" }, + stealth: { type: "boolean", description: "Enable stealth mode" }, + scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" }, + country: { type: "string", description: "ISO country code for geo-targeting" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, run: async ({ args }) => { const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/scrape"); - const key = await resolveApiKey(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/scrape"); + const apiKey = await resolveApiKey(!!args.json); + + const htmlMode = (args["html-mode"] ?? "normal") as "normal" | "reader" | "prune"; + const requested = (args.format ?? "markdown") + .split(",") + .map((f) => f.trim()) + .filter(Boolean); + + const formats: ApiScrapeFormatEntry[] = []; + for (const f of requested) { + if (!FORMATS.includes(f as Format)) { + out.error(`Unknown format: ${f}. Valid: ${FORMATS.join(", ")}`); + } + switch (f as Format) { + case "markdown": + formats.push({ type: "markdown", mode: htmlMode }); + break; + case "html": + formats.push({ type: "html", mode: htmlMode }); + break; + case "json": + if (!args.prompt) out.error("--prompt is required when format includes json"); + formats.push({ + type: "json", + prompt: args.prompt as string, + ...(args.schema ? { schema: JSON.parse(args.schema) } : {}), + mode: htmlMode, + }); + break; + case "screenshot": + formats.push({ type: "screenshot" }); + break; + case "branding": + formats.push({ type: "branding" }); + break; + case "links": + formats.push({ type: "links" }); + break; + case "images": + formats.push({ type: "images" }); + break; + case "summary": + formats.push({ type: "summary" }); + break; + } + } - const params: scrapegraphai.ScrapeParams = { website_url: args.url }; + const fetchConfig: Record = {}; + if (args.mode) fetchConfig.mode = args.mode; + if (args.stealth) fetchConfig.stealth = true; + if (args.scrolls) fetchConfig.scrolls = Number(args.scrolls); + if (args.country) fetchConfig.country = args.country; - if (args.stealth) params.stealth = true; - if (args.branding) params.branding = true; - if (args["country-code"]) params.country_code = args["country-code"]; + const params: ApiScrapeRequest = { url: args.url, formats }; + if (Object.keys(fetchConfig).length > 0) + (params as unknown as Record).fetchConfig = fetchConfig; out.start("Scraping"); - const result = await scrapegraphai.scrape(key, params); + const result = await scrape(apiKey, params); out.stop(result.elapsedMs); if (result.data) out.result(result.data); diff --git a/src/commands/search-scraper.ts b/src/commands/search-scraper.ts deleted file mode 100644 index 041e32c..0000000 --- a/src/commands/search-scraper.ts +++ /dev/null @@ -1,52 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "search-scraper", - description: "Search the web and extract data with AI", - }, - args: { - prompt: { - type: "positional", - description: "Search query and extraction instructions", - required: 
true, - }, - "num-results": { - type: "string", - description: "Number of websites to scrape (3-20, default 3)", - }, - "no-extraction": { - type: "boolean", - description: "Return markdown only (2 credits/site instead of 10)", - }, - schema: { type: "string", description: "Output JSON schema (as JSON string)" }, - stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, - headers: { type: "string", description: "Custom headers as JSON object string" }, - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/searchscraper"); - const key = await resolveApiKey(!!args.json); - - const params: scrapegraphai.SearchScraperParams = { - user_prompt: args.prompt, - }; - - if (args["num-results"]) params.num_results = Number(args["num-results"]); - if (args["no-extraction"]) params.extraction_mode = false; - if (args.schema) params.output_schema = JSON.parse(args.schema); - if (args.stealth) params.stealth = true; - if (args.headers) params.headers = JSON.parse(args.headers); - - out.start("Searching"); - const result = await scrapegraphai.searchScraper(key, params); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/commands/search.ts b/src/commands/search.ts new file mode 100644 index 0000000..b574942 --- /dev/null +++ b/src/commands/search.ts @@ -0,0 +1,67 @@ +import { defineCommand } from "citty"; +import { search } from "scrapegraph-js"; +import type { ApiSearchRequest } from "scrapegraph-js"; +import { resolveApiKey } from "../lib/folders.js"; +import * as log from "../lib/log.js"; + +export default defineCommand({ + meta: { + name: "search", + description: "Search the web and extract data with AI", + }, + args: { + query: { + type: "positional", + description: "Search query", + required: true, + }, + "num-results": { + type: "string", + description: "Number of results to scrape (1-20, default 3)", + }, + prompt: { + type: "string", + alias: "p", + description: "Extraction prompt applied to results", + }, + schema: { type: "string", description: "Output JSON schema (JSON string)" }, + format: { type: "string", description: "Result format: markdown (default) or html" }, + country: { + type: "string", + description: "2-letter country code for geo-targeting (e.g. 
'us', 'de')", + }, + "time-range": { + type: "string", + description: "Recency filter: past_hour, past_24_hours, past_week, past_month, past_year", + }, + stealth: { type: "boolean", description: "Enable stealth mode" }, + headers: { type: "string", description: "Custom headers as JSON object string" }, + json: { type: "boolean", description: "Output raw JSON (pipeable)" }, + }, + run: async ({ args }) => { + const out = log.create(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/search"); + const apiKey = await resolveApiKey(!!args.json); + + const params: ApiSearchRequest = { query: args.query }; + const mut = params as Record; + if (args["num-results"]) mut.numResults = Number(args["num-results"]); + if (args.prompt) mut.prompt = args.prompt; + if (args.schema) mut.schema = JSON.parse(args.schema); + if (args.format) mut.format = args.format; + if (args.country) mut.locationGeoCode = args.country; + if (args["time-range"]) mut.timeRange = args["time-range"]; + + const fetchConfig: Record = {}; + if (args.stealth) fetchConfig.stealth = true; + if (args.headers) fetchConfig.headers = JSON.parse(args.headers); + if (Object.keys(fetchConfig).length > 0) mut.fetchConfig = fetchConfig; + + out.start("Searching"); + const result = await search(apiKey, params); + out.stop(result.elapsedMs); + + if (result.data) out.result(result.data); + else out.error(result.error); + }, +}); diff --git a/src/commands/sitemap.ts b/src/commands/sitemap.ts deleted file mode 100644 index 2120b16..0000000 --- a/src/commands/sitemap.ts +++ /dev/null @@ -1,31 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "sitemap", - description: "Get all URLs from a website's sitemap", - }, - args: { - url: { - type: "positional", - description: "Website URL", - required: true, - }, - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/sitemap"); - const key = await resolveApiKey(!!args.json); - - out.start("Fetching sitemap"); - const result = await scrapegraphai.sitemap(key, { website_url: args.url }); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/commands/smart-scraper.ts b/src/commands/smart-scraper.ts deleted file mode 100644 index be3d2a4..0000000 --- a/src/commands/smart-scraper.ts +++ /dev/null @@ -1,57 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "smart-scraper", - description: "Extract structured data from a URL using AI", - }, - args: { - url: { - type: "positional", - description: "Website URL to scrape", - required: true, - }, - prompt: { - type: "string", - alias: "p", - description: "Extraction prompt", - required: true, - }, - schema: { type: "string", description: "Output JSON schema (as JSON string)" }, - scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" }, - pages: { type: "string", description: "Total pages to scrape (1-100)" }, - stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, - cookies: { type: "string", description: 
"Cookies as JSON object string" }, - headers: { type: "string", description: "Custom headers as JSON object string" }, - "plain-text": { type: "boolean", description: "Return plain text instead of JSON" }, - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/smartscraper"); - const key = await resolveApiKey(!!args.json); - - const params: scrapegraphai.SmartScraperParams = { - website_url: args.url, - user_prompt: args.prompt, - }; - - if (args.schema) params.output_schema = JSON.parse(args.schema); - if (args.scrolls) params.number_of_scrolls = Number(args.scrolls); - if (args.pages) params.total_pages = Number(args.pages); - if (args.stealth) params.stealth = true; - if (args.cookies) params.cookies = JSON.parse(args.cookies); - if (args.headers) params.headers = JSON.parse(args.headers); - if (args["plain-text"]) params.plain_text = true; - - out.start("Scraping"); - const result = await scrapegraphai.smartScraper(key, params); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/commands/validate.ts b/src/commands/validate.ts index dd2c81d..4f69b51 100644 --- a/src/commands/validate.ts +++ b/src/commands/validate.ts @@ -1,5 +1,5 @@ import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; +import { checkHealth } from "scrapegraph-js"; import { resolveApiKey } from "../lib/folders.js"; import * as log from "../lib/log.js"; @@ -13,10 +13,10 @@ export default defineCommand({ }, run: async ({ args }) => { const out = log.create(!!args.json); - const key = await resolveApiKey(!!args.json); + const apiKey = await resolveApiKey(!!args.json); out.start("Checking API health"); - const result = await scrapegraphai.checkHealth(key); + const result = await checkHealth(apiKey); out.stop(result.elapsedMs); if (result.data) out.result(result.data); diff --git a/src/lib/env.ts b/src/lib/env.ts index 8777ab0..6658268 100644 --- a/src/lib/env.ts +++ b/src/lib/env.ts @@ -10,8 +10,11 @@ if (process.env.JUST_SCRAPE_API_URL && !process.env.SGAI_API_URL) if (process.env.JUST_SCRAPE_DEBUG === "1" && !process.env.SGAI_DEBUG) process.env.SGAI_DEBUG = "1"; -if (process.env.JUST_SCRAPE_TIMEOUT_S && !process.env.SGAI_TIMEOUT_S) - process.env.SGAI_TIMEOUT_S = process.env.JUST_SCRAPE_TIMEOUT_S; +// SDK v2 renamed SGAI_TIMEOUT_S -> SGAI_TIMEOUT. Bridge legacy vars for back-compat. +if (!process.env.SGAI_TIMEOUT) { + const legacy = process.env.JUST_SCRAPE_TIMEOUT_S ?? 
+  if (legacy) process.env.SGAI_TIMEOUT = legacy;
+}
 
 function loadConfigFile(): Record<string, string> {
   if (!existsSync(CONFIG_PATH)) return {};
diff --git a/tests/smoke.test.ts b/tests/smoke.test.ts
index e2fab44..eecfd7b 100644
--- a/tests/smoke.test.ts
+++ b/tests/smoke.test.ts
@@ -1,7 +1,11 @@
 import { expect, test } from "bun:test";
-import { HISTORY_SERVICES, smartScraper } from "scrapegraph-js";
+import { ScrapeGraphAI, crawl, extract, monitor, scrape, search } from "scrapegraph-js";
 
-test("sdk exports are available", () => {
-  expect(typeof smartScraper).toBe("function");
-  expect(HISTORY_SERVICES.length).toBeGreaterThan(0);
+test("v2 SDK exports are available", () => {
+  expect(typeof scrape).toBe("function");
+  expect(typeof extract).toBe("function");
+  expect(typeof search).toBe("function");
+  expect(typeof crawl.start).toBe("function");
+  expect(typeof monitor.create).toBe("function");
+  expect(typeof ScrapeGraphAI).toBe("function");
 });

From 82b813727aaef09d12c574fb50c384f49b99ee7a Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Wed, 15 Apr 2026 21:59:01 +0200
Subject: [PATCH 2/2] fix(ci): build scrapegraph-js in-place before tsc/test

scrapegraph-js is pinned to a GitHub commit (PR #13 head) that ships
without a prebuilt dist/, so module resolution fails on a fresh install.
Build it in-place after bun install so tsc/biome/bun test can resolve it.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .github/workflows/ci.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b404665..c5f6bbd 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,6 +14,10 @@ jobs:
       - uses: actions/checkout@v4
       - uses: oven-sh/setup-bun@v2
       - run: bun install
+      # scrapegraph-js is pinned to a GitHub commit (PR #13 head, not yet on npm)
+      # and ships without a prebuilt dist/ — build it in-place so module
+      # resolution works.
+      - run: cd node_modules/scrapegraph-js && bun install && bun run build
       - run: bun test
 
   lint:
@@ -23,4 +27,5 @@
       - uses: actions/checkout@v4
       - uses: oven-sh/setup-bun@v2
       - run: bun install
+      - run: cd node_modules/scrapegraph-js && bun install && bun run build
       - run: bun run check