Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ jobs:
- uses: actions/checkout@v4
- uses: oven-sh/setup-bun@v2
- run: bun install
# scrapegraph-js is pinned to a GitHub commit (PR #13 head, not yet on npm)
# and ships without a prebuilt dist/ — build it in-place so module
# resolution works.
- run: cd node_modules/scrapegraph-js && bun install && bun run build
- run: bun test

lint:
Expand All @@ -23,4 +27,5 @@ jobs:
- uses: actions/checkout@v4
- uses: oven-sh/setup-bun@v2
- run: bun install
- run: cd node_modules/scrapegraph-js && bun install && bun run build
- run: bun run check
341 changes: 145 additions & 196 deletions README.md

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "just-scrape",
"version": "0.2.1",
"version": "1.0.0",
"description": "ScrapeGraph AI CLI tool",
"type": "module",
"main": "dist/cli.mjs",
Expand Down Expand Up @@ -28,7 +28,7 @@
"chalk": "^5.4.1",
"citty": "^0.1.6",
"dotenv": "^17.2.4",
"scrapegraph-js": "^1.0.0"
"scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#096c110"
},
"devDependencies": {
"@biomejs/biome": "^1.9.4",
Expand Down
11 changes: 4 additions & 7 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,11 @@ const main = defineCommand({
description: "ScrapeGraph AI CLI tool",
},
subCommands: {
"smart-scraper": () => import("./commands/smart-scraper.js").then((m) => m.default),
"search-scraper": () => import("./commands/search-scraper.js").then((m) => m.default),
markdownify: () => import("./commands/markdownify.js").then((m) => m.default),
crawl: () => import("./commands/crawl.js").then((m) => m.default),
sitemap: () => import("./commands/sitemap.js").then((m) => m.default),
scrape: () => import("./commands/scrape.js").then((m) => m.default),
"agentic-scraper": () => import("./commands/agentic-scraper.js").then((m) => m.default),
"generate-schema": () => import("./commands/generate-schema.js").then((m) => m.default),
extract: () => import("./commands/extract.js").then((m) => m.default),
search: () => import("./commands/search.js").then((m) => m.default),
crawl: () => import("./commands/crawl.js").then((m) => m.default),
monitor: () => import("./commands/monitor.js").then((m) => m.default),
history: () => import("./commands/history.js").then((m) => m.default),
credits: () => import("./commands/credits.js").then((m) => m.default),
validate: () => import("./commands/validate.js").then((m) => m.default),
Expand Down
51 changes: 0 additions & 51 deletions src/commands/agentic-scraper.ts

This file was deleted.

125 changes: 89 additions & 36 deletions src/commands/crawl.ts
Original file line number Diff line number Diff line change
@@ -1,62 +1,115 @@
import { defineCommand } from "citty";
import * as scrapegraphai from "scrapegraph-js";
import { crawl } from "scrapegraph-js";
import type { ApiCrawlRequest, ApiScrapeFormatEntry } from "scrapegraph-js";
import { resolveApiKey } from "../lib/folders.js";
import * as log from "../lib/log.js";

const FORMATS = [
"markdown",
"html",
"screenshot",
"branding",
"links",
"images",
"summary",
] as const;
type Format = (typeof FORMATS)[number];

const POLL_INTERVAL_MS = 3000;

function buildFormat(f: Format): ApiScrapeFormatEntry {
if (f === "markdown" || f === "html") return { type: f, mode: "normal" };
return { type: f } as ApiScrapeFormatEntry;
}

export default defineCommand({
meta: {
name: "crawl",
description: "Crawl and extract data from multiple pages",
description: "Crawl pages starting from a URL",
},
args: {
url: {
type: "positional",
description: "Starting URL to crawl",
description: "Starting URL",
required: true,
},
prompt: {
format: {
type: "string",
alias: "f",
description: `Per-page format(s), comma-separated: ${FORMATS.join(", ")} (default: markdown)`,
},
"max-pages": { type: "string", description: "Maximum pages to crawl (default 50, max 1000)" },
"max-depth": { type: "string", description: "Crawl depth (default 2)" },
"max-links-per-page": { type: "string", description: "Max links per page (default 10)" },
"allow-external": { type: "boolean", description: "Allow crawling external domains" },
"include-patterns": {
type: "string",
alias: "p",
description: "Extraction prompt (required when extraction mode is on)",
description: "JSON array of regex patterns to include",
},
"no-extraction": {
type: "boolean",
description: "Return markdown only (2 credits/page instead of 10)",
"exclude-patterns": {
type: "string",
description: "JSON array of regex patterns to exclude",
},
"max-pages": { type: "string", description: "Maximum pages to crawl (default 10)" },
depth: { type: "string", description: "Crawl depth (default 1)" },
schema: { type: "string", description: "Output JSON schema (as JSON string)" },
rules: { type: "string", description: "Crawl rules as JSON object string" },
"no-sitemap": { type: "boolean", description: "Disable sitemap-based URL discovery" },
stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" },
mode: { type: "string", alias: "m", description: "Fetch mode: auto (default), fast, js" },
stealth: { type: "boolean", description: "Enable stealth mode" },
json: { type: "boolean", description: "Output raw JSON (pipeable)" },
},
run: async ({ args }) => {
const out = log.create(!!args.json);
out.docs("https://docs.scrapegraphai.com/services/smartcrawler");
const key = await resolveApiKey(!!args.json);

const base: Record<string, unknown> = { url: args.url };
if (args["max-pages"]) base.max_pages = Number(args["max-pages"]);
if (args.depth) base.depth = Number(args.depth);
if (args.rules) base.rules = JSON.parse(args.rules);
if (args["no-sitemap"]) base.sitemap = false;
if (args.stealth) base.stealth = true;

if (args["no-extraction"]) {
base.extraction_mode = false;
} else {
if (args.prompt) base.prompt = args.prompt;
if (args.schema) base.schema = JSON.parse(args.schema);
out.docs("https://docs.scrapegraphai.com/api-reference/crawl");
const apiKey = await resolveApiKey(!!args.json);

const requested = (args.format ?? "markdown")
.split(",")
.map((f) => f.trim())
.filter(Boolean);
for (const f of requested) {
if (!FORMATS.includes(f as Format))
out.error(`Unknown format: ${f}. Valid: ${FORMATS.join(", ")}`);
}
const formats = requested.map((f) => buildFormat(f as Format));

const params = base as scrapegraphai.CrawlParams;
const params: ApiCrawlRequest = { url: args.url, formats };
const mut = params as Record<string, unknown>;
if (args["max-pages"]) mut.maxPages = Number(args["max-pages"]);
if (args["max-depth"]) mut.maxDepth = Number(args["max-depth"]);
if (args["max-links-per-page"]) mut.maxLinksPerPage = Number(args["max-links-per-page"]);
if (args["allow-external"]) mut.allowExternal = true;
if (args["include-patterns"]) mut.includePatterns = JSON.parse(args["include-patterns"]);
if (args["exclude-patterns"]) mut.excludePatterns = JSON.parse(args["exclude-patterns"]);

out.start("Crawling");
const result = await scrapegraphai.crawl(key, params, out.poll);
out.stop(result.elapsedMs);
const fetchConfig: Record<string, unknown> = {};
if (args.mode) fetchConfig.mode = args.mode;
if (args.stealth) fetchConfig.stealth = true;
if (Object.keys(fetchConfig).length > 0) mut.fetchConfig = fetchConfig;

if (result.data) out.result(result.data);
else out.error(result.error);
out.start("Starting crawl");
const job = await crawl.start(apiKey, params);
if (!job.data) {
out.error(job.error);
return;
}
const jobId = job.data.id;
let totalElapsed = job.elapsedMs;

while (true) {
await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
const status = await crawl.get(apiKey, jobId);
totalElapsed += status.elapsedMs;
if (!status.data) {
out.error(status.error);
return;
}
out.poll(`${status.data.status} (${status.data.finished}/${status.data.total})`);
if (
status.data.status === "completed" ||
status.data.status === "failed" ||
status.data.status === "deleted"
) {
out.stop(totalElapsed);
out.result(status.data);
return;
}
}
},
});
6 changes: 3 additions & 3 deletions src/commands/credits.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { defineCommand } from "citty";
import * as scrapegraphai from "scrapegraph-js";
import { getCredits } from "scrapegraph-js";
import { resolveApiKey } from "../lib/folders.js";
import * as log from "../lib/log.js";

Expand All @@ -13,10 +13,10 @@ export default defineCommand({
},
run: async ({ args }) => {
const out = log.create(!!args.json);
const key = await resolveApiKey(!!args.json);
const apiKey = await resolveApiKey(!!args.json);

out.start("Fetching credits");
const result = await scrapegraphai.getCredits(key);
const result = await getCredits(apiKey);
out.stop(result.elapsedMs);

if (result.data) out.result(result.data);
Expand Down
63 changes: 63 additions & 0 deletions src/commands/extract.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import { defineCommand } from "citty";
import { extract } from "scrapegraph-js";
import type { ApiExtractRequest } from "scrapegraph-js";
import { resolveApiKey } from "../lib/folders.js";
import * as log from "../lib/log.js";

/**
 * `extract` command: run AI-powered structured extraction against a single
 * URL via the ScrapeGraph `extract` endpoint.
 *
 * Flags map onto two layers of the request:
 *  - top-level request fields (`schema`, `html-mode` -> `mode`)
 *  - the optional `fetchConfig` object (mode, stealth, scrolls, cookies,
 *    headers, country), attached only when at least one flag is set.
 *
 * NOTE(review): JSON-valued flags (--schema, --cookies, --headers) are parsed
 * with JSON.parse and will throw on malformed input — same behavior as the
 * sibling commands in this package.
 */
export default defineCommand({
  meta: {
    name: "extract",
    description: "Extract structured data from a URL using AI",
  },
  args: {
    url: {
      type: "positional",
      description: "Website URL to extract from",
      required: true,
    },
    prompt: {
      type: "string",
      alias: "p",
      description: "Extraction prompt",
      required: true,
    },
    schema: { type: "string", description: "Output JSON schema (JSON string)" },
    mode: { type: "string", description: "Fetch mode: auto (default), fast, js" },
    "html-mode": {
      type: "string",
      description: "HTML processing mode: normal (default), reader, prune",
    },
    stealth: { type: "boolean", description: "Enable stealth mode" },
    scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" },
    cookies: { type: "string", description: "Cookies as JSON object string" },
    headers: { type: "string", description: "Custom headers as JSON object string" },
    country: { type: "string", description: "ISO country code for geo-targeting" },
    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
  },
  run: async ({ args }) => {
    const out = log.create(!!args.json);
    out.docs("https://docs.scrapegraphai.com/api-reference/extract");
    const apiKey = await resolveApiKey(!!args.json);

    // Gather fetch-layer options; the object is only attached to the request
    // when at least one of these flags was supplied.
    const fetchOptions: Record<string, unknown> = {};
    if (args.mode) fetchOptions.mode = args.mode;
    if (args.stealth) fetchOptions.stealth = true;
    if (args.scrolls) fetchOptions.scrolls = Number(args.scrolls);
    if (args.cookies) fetchOptions.cookies = JSON.parse(args.cookies);
    if (args.headers) fetchOptions.headers = JSON.parse(args.headers);
    if (args.country) fetchOptions.country = args.country;

    // Base request; optional fields are written through a widened alias since
    // they are not part of the required ApiExtractRequest shape.
    const request: ApiExtractRequest = { url: args.url, prompt: args.prompt };
    const extras = request as Record<string, unknown>;
    if (args.schema) extras.schema = JSON.parse(args.schema);
    if (args["html-mode"]) extras.mode = args["html-mode"];
    if (Object.keys(fetchOptions).length > 0) extras.fetchConfig = fetchOptions;

    out.start("Extracting");
    const result = await extract(apiKey, request);
    out.stop(result.elapsedMs);

    if (result.data) out.result(result.data);
    else out.error(result.error);
  },
});
Loading
Loading