Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ jobs:
- uses: actions/checkout@v4
- uses: oven-sh/setup-bun@v2
- run: bun install
# scrapegraph-js is pinned to a GitHub commit (PR #13 head, not yet on npm)
# and ships without a prebuilt dist/ — build it in-place so module
# resolution works.
- run: cd node_modules/scrapegraph-js && bun install && bun run build
- run: bun test

lint:
Expand All @@ -23,4 +27,5 @@ jobs:
- uses: actions/checkout@v4
- uses: oven-sh/setup-bun@v2
- run: bun install
- run: cd node_modules/scrapegraph-js && bun install && bun run build
- run: bun run check
341 changes: 145 additions & 196 deletions README.md

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "just-scrape",
"version": "0.2.1",
"version": "1.0.0",
"description": "ScrapeGraph AI CLI tool",
"type": "module",
"main": "dist/cli.mjs",
Expand Down Expand Up @@ -28,7 +28,7 @@
"chalk": "^5.4.1",
"citty": "^0.1.6",
"dotenv": "^17.2.4",
"scrapegraph-js": "^1.0.0"
"scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#096c110"
},
"devDependencies": {
"@biomejs/biome": "^1.9.4",
Expand Down
11 changes: 4 additions & 7 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,11 @@ const main = defineCommand({
description: "ScrapeGraph AI CLI tool",
},
subCommands: {
"smart-scraper": () => import("./commands/smart-scraper.js").then((m) => m.default),
"search-scraper": () => import("./commands/search-scraper.js").then((m) => m.default),
markdownify: () => import("./commands/markdownify.js").then((m) => m.default),
crawl: () => import("./commands/crawl.js").then((m) => m.default),
sitemap: () => import("./commands/sitemap.js").then((m) => m.default),
scrape: () => import("./commands/scrape.js").then((m) => m.default),
"agentic-scraper": () => import("./commands/agentic-scraper.js").then((m) => m.default),
"generate-schema": () => import("./commands/generate-schema.js").then((m) => m.default),
extract: () => import("./commands/extract.js").then((m) => m.default),
search: () => import("./commands/search.js").then((m) => m.default),
crawl: () => import("./commands/crawl.js").then((m) => m.default),
monitor: () => import("./commands/monitor.js").then((m) => m.default),
history: () => import("./commands/history.js").then((m) => m.default),
credits: () => import("./commands/credits.js").then((m) => m.default),
validate: () => import("./commands/validate.js").then((m) => m.default),
Expand Down
51 changes: 0 additions & 51 deletions src/commands/agentic-scraper.ts

This file was deleted.

125 changes: 89 additions & 36 deletions src/commands/crawl.ts
Original file line number Diff line number Diff line change
@@ -1,62 +1,115 @@
import { defineCommand } from "citty";
import * as scrapegraphai from "scrapegraph-js";
import { crawl } from "scrapegraph-js";
import type { ApiCrawlRequest, ApiScrapeFormatEntry } from "scrapegraph-js";
import { resolveApiKey } from "../lib/folders.js";
import * as log from "../lib/log.js";

const FORMATS = [
"markdown",
"html",
"screenshot",
"branding",
"links",
"images",
"summary",
] as const;
type Format = (typeof FORMATS)[number];

const POLL_INTERVAL_MS = 3000;

function buildFormat(f: Format): ApiScrapeFormatEntry {
if (f === "markdown" || f === "html") return { type: f, mode: "normal" };
return { type: f } as ApiScrapeFormatEntry;
}

export default defineCommand({
meta: {
name: "crawl",
description: "Crawl and extract data from multiple pages",
description: "Crawl pages starting from a URL",
},
args: {
url: {
type: "positional",
description: "Starting URL to crawl",
description: "Starting URL",
required: true,
},
prompt: {
format: {
type: "string",
alias: "f",
description: `Per-page format(s), comma-separated: ${FORMATS.join(", ")} (default: markdown)`,
},
"max-pages": { type: "string", description: "Maximum pages to crawl (default 50, max 1000)" },
"max-depth": { type: "string", description: "Crawl depth (default 2)" },
"max-links-per-page": { type: "string", description: "Max links per page (default 10)" },
"allow-external": { type: "boolean", description: "Allow crawling external domains" },
"include-patterns": {
type: "string",
alias: "p",
description: "Extraction prompt (required when extraction mode is on)",
description: "JSON array of regex patterns to include",
},
"no-extraction": {
type: "boolean",
description: "Return markdown only (2 credits/page instead of 10)",
"exclude-patterns": {
type: "string",
description: "JSON array of regex patterns to exclude",
},
"max-pages": { type: "string", description: "Maximum pages to crawl (default 10)" },
depth: { type: "string", description: "Crawl depth (default 1)" },
schema: { type: "string", description: "Output JSON schema (as JSON string)" },
rules: { type: "string", description: "Crawl rules as JSON object string" },
"no-sitemap": { type: "boolean", description: "Disable sitemap-based URL discovery" },
stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" },
mode: { type: "string", alias: "m", description: "Fetch mode: auto (default), fast, js" },
stealth: { type: "boolean", description: "Enable stealth mode" },
json: { type: "boolean", description: "Output raw JSON (pipeable)" },
},
run: async ({ args }) => {
const out = log.create(!!args.json);
out.docs("https://docs.scrapegraphai.com/services/smartcrawler");
const key = await resolveApiKey(!!args.json);

const base: Record<string, unknown> = { url: args.url };
if (args["max-pages"]) base.max_pages = Number(args["max-pages"]);
if (args.depth) base.depth = Number(args.depth);
if (args.rules) base.rules = JSON.parse(args.rules);
if (args["no-sitemap"]) base.sitemap = false;
if (args.stealth) base.stealth = true;

if (args["no-extraction"]) {
base.extraction_mode = false;
} else {
if (args.prompt) base.prompt = args.prompt;
if (args.schema) base.schema = JSON.parse(args.schema);
out.docs("https://docs.scrapegraphai.com/api-reference/crawl");
const apiKey = await resolveApiKey(!!args.json);

const requested = (args.format ?? "markdown")
.split(",")
.map((f) => f.trim())
.filter(Boolean);
for (const f of requested) {
if (!FORMATS.includes(f as Format))
out.error(`Unknown format: ${f}. Valid: ${FORMATS.join(", ")}`);
}
const formats = requested.map((f) => buildFormat(f as Format));

const params = base as scrapegraphai.CrawlParams;
const params: ApiCrawlRequest = { url: args.url, formats };
const mut = params as Record<string, unknown>;
if (args["max-pages"]) mut.maxPages = Number(args["max-pages"]);
if (args["max-depth"]) mut.maxDepth = Number(args["max-depth"]);
if (args["max-links-per-page"]) mut.maxLinksPerPage = Number(args["max-links-per-page"]);
if (args["allow-external"]) mut.allowExternal = true;
if (args["include-patterns"]) mut.includePatterns = JSON.parse(args["include-patterns"]);
if (args["exclude-patterns"]) mut.excludePatterns = JSON.parse(args["exclude-patterns"]);

out.start("Crawling");
const result = await scrapegraphai.crawl(key, params, out.poll);
out.stop(result.elapsedMs);
const fetchConfig: Record<string, unknown> = {};
if (args.mode) fetchConfig.mode = args.mode;
if (args.stealth) fetchConfig.stealth = true;
if (Object.keys(fetchConfig).length > 0) mut.fetchConfig = fetchConfig;

if (result.data) out.result(result.data);
else out.error(result.error);
out.start("Starting crawl");
const job = await crawl.start(apiKey, params);
if (!job.data) {
out.error(job.error);
return;
}
const jobId = job.data.id;
let totalElapsed = job.elapsedMs;

while (true) {
await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
const status = await crawl.get(apiKey, jobId);
totalElapsed += status.elapsedMs;
if (!status.data) {
out.error(status.error);
return;
}
out.poll(`${status.data.status} (${status.data.finished}/${status.data.total})`);
if (
status.data.status === "completed" ||
status.data.status === "failed" ||
status.data.status === "deleted"
) {
out.stop(totalElapsed);
out.result(status.data);
return;
}
}
},
});
6 changes: 3 additions & 3 deletions src/commands/credits.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { defineCommand } from "citty";
import * as scrapegraphai from "scrapegraph-js";
import { getCredits } from "scrapegraph-js";
import { resolveApiKey } from "../lib/folders.js";
import * as log from "../lib/log.js";

Expand All @@ -13,10 +13,10 @@ export default defineCommand({
},
run: async ({ args }) => {
const out = log.create(!!args.json);
const key = await resolveApiKey(!!args.json);
const apiKey = await resolveApiKey(!!args.json);

out.start("Fetching credits");
const result = await scrapegraphai.getCredits(key);
const result = await getCredits(apiKey);
out.stop(result.elapsedMs);

if (result.data) out.result(result.data);
Expand Down
63 changes: 63 additions & 0 deletions src/commands/extract.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import { defineCommand } from "citty";
import { extract } from "scrapegraph-js";
import type { ApiExtractRequest } from "scrapegraph-js";
import { resolveApiKey } from "../lib/folders.js";
import * as log from "../lib/log.js";

/**
 * `extract` command: run AI-powered structured extraction against a single
 * URL via the ScrapeGraph `extract` endpoint.
 *
 * Flags map onto two layers of the request:
 *  - top-level request fields (`schema`, `html-mode` -> `mode`)
 *  - the optional `fetchConfig` object (mode, stealth, scrolls, cookies,
 *    headers, country), attached only when at least one flag is set.
 *
 * NOTE(review): JSON-valued flags (--schema, --cookies, --headers) are parsed
 * with JSON.parse and will throw on malformed input — same behavior as the
 * sibling commands in this package.
 */
export default defineCommand({
  meta: {
    name: "extract",
    description: "Extract structured data from a URL using AI",
  },
  args: {
    url: {
      type: "positional",
      description: "Website URL to extract from",
      required: true,
    },
    prompt: {
      type: "string",
      alias: "p",
      description: "Extraction prompt",
      required: true,
    },
    schema: { type: "string", description: "Output JSON schema (JSON string)" },
    mode: { type: "string", description: "Fetch mode: auto (default), fast, js" },
    "html-mode": {
      type: "string",
      description: "HTML processing mode: normal (default), reader, prune",
    },
    stealth: { type: "boolean", description: "Enable stealth mode" },
    scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" },
    cookies: { type: "string", description: "Cookies as JSON object string" },
    headers: { type: "string", description: "Custom headers as JSON object string" },
    country: { type: "string", description: "ISO country code for geo-targeting" },
    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
  },
  run: async ({ args }) => {
    const out = log.create(!!args.json);
    out.docs("https://docs.scrapegraphai.com/api-reference/extract");
    const apiKey = await resolveApiKey(!!args.json);

    // Gather fetch-layer options; the object is only attached to the request
    // when at least one of these flags was supplied.
    const fetchOptions: Record<string, unknown> = {};
    if (args.mode) fetchOptions.mode = args.mode;
    if (args.stealth) fetchOptions.stealth = true;
    if (args.scrolls) fetchOptions.scrolls = Number(args.scrolls);
    if (args.cookies) fetchOptions.cookies = JSON.parse(args.cookies);
    if (args.headers) fetchOptions.headers = JSON.parse(args.headers);
    if (args.country) fetchOptions.country = args.country;

    // Base request; optional fields are written through a widened alias since
    // they are not part of the required ApiExtractRequest shape.
    const request: ApiExtractRequest = { url: args.url, prompt: args.prompt };
    const extras = request as Record<string, unknown>;
    if (args.schema) extras.schema = JSON.parse(args.schema);
    if (args["html-mode"]) extras.mode = args["html-mode"];
    if (Object.keys(fetchOptions).length > 0) extras.fetchConfig = fetchOptions;

    out.start("Extracting");
    const result = await extract(apiKey, request);
    out.stop(result.elapsedMs);

    if (result.data) out.result(result.data);
    else out.error(result.error);
  },
});
Loading
Loading