From f36dd77c5344e9ffceac21add03b0bded2752c79 Mon Sep 17 00:00:00 2001 From: Abimael Martell <1450169+abimaelmartell@users.noreply.github.com> Date: Thu, 11 Jun 2026 17:52:31 -0700 Subject: [PATCH 1/3] feat(cli): add video discovery output --- README.md | 9 ++- src/__tests__/commands/scrape.test.ts | 9 +-- src/__tests__/commands/search.test.ts | 9 +-- src/__tests__/utils/options.test.ts | 9 ++- src/__tests__/utils/output.test.ts | 82 +++++++++++++++++++++++++++ src/commands/scrape.ts | 1 + src/index.ts | 12 +++- src/types/scrape.ts | 42 +++++++++++++- src/utils/options.ts | 1 + src/utils/output.ts | 18 ++++++ 10 files changed, 173 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index de6aa9da32..a2cdd2751e 100644 --- a/README.md +++ b/README.md @@ -218,6 +218,9 @@ firecrawl https://spa-app.com --wait-for 3000 # Get all links from a page firecrawl https://example.com --format links +# Discover videos on a page (prints video URLs) +firecrawl https://example.com/product --format video + # Screenshot + markdown firecrawl https://example.com --format markdown --screenshot @@ -704,8 +707,9 @@ firecrawl https://example.com --format links --pretty ### Format Behavior -- **Single format**: Outputs raw content (markdown text, HTML, etc.) +- **Single format**: Outputs raw content (markdown text, HTML, links, image URLs, video URLs, etc.) 
- **Multiple formats**: Outputs JSON with all requested data +- **Video metadata**: Use `--format video --json` to include thumbnails, descriptions, and other `videos` metadata ```bash # Raw markdown output @@ -713,6 +717,9 @@ firecrawl https://example.com --format markdown # JSON output with multiple formats firecrawl https://example.com --format markdown,links,images + +# Full video metadata +firecrawl https://example.com/product --format video --json ``` --- diff --git a/src/__tests__/commands/scrape.test.ts b/src/__tests__/commands/scrape.test.ts index aaeae6b967..44d864eaee 100644 --- a/src/__tests__/commands/scrape.test.ts +++ b/src/__tests__/commands/scrape.test.ts @@ -447,12 +447,9 @@ describe('executeScrape', () => { describe('Type safety', () => { it('should accept valid ScrapeFormat types', async () => { - const formatList: Array<'markdown' | 'html' | 'rawHtml' | 'links'> = [ - 'markdown', - 'html', - 'rawHtml', - 'links', - ]; + const formatList: Array< + 'markdown' | 'html' | 'rawHtml' | 'links' | 'video' + > = ['markdown', 'html', 'rawHtml', 'links', 'video']; for (const format of formatList) { mockClient.scrape.mockResolvedValue({ [format]: 'test' }); diff --git a/src/__tests__/commands/search.test.ts b/src/__tests__/commands/search.test.ts index f559683815..36836bb492 100644 --- a/src/__tests__/commands/search.test.ts +++ b/src/__tests__/commands/search.test.ts @@ -703,12 +703,9 @@ describe('executeSearch', () => { }); it('should accept valid scrape format types', async () => { - const formatList: Array<'markdown' | 'html' | 'rawHtml' | 'links'> = [ - 'markdown', - 'html', - 'rawHtml', - 'links', - ]; + const formatList: Array< + 'markdown' | 'html' | 'rawHtml' | 'links' | 'video' + > = ['markdown', 'html', 'rawHtml', 'links', 'video']; for (const format of formatList) { mockHttpPost.mockResolvedValue( diff --git a/src/__tests__/utils/options.test.ts b/src/__tests__/utils/options.test.ts index eaed0f5ab3..1cd16b0e6f 100644 --- 
a/src/__tests__/utils/options.test.ts +++ b/src/__tests__/utils/options.test.ts @@ -51,6 +51,10 @@ describe('Option Parsing Utilities', () => { it('should parse single attributes format', () => { expect(parseFormats('attributes')).toEqual(['attributes']); }); + + it('should parse single video format', () => { + expect(parseFormats('video')).toEqual(['video']); + }); }); describe('Multiple format parsing', () => { @@ -67,12 +71,15 @@ describe('Option Parsing Utilities', () => { }); it('should handle all common formats together', () => { - expect(parseFormats('markdown,html,links,images,screenshot')).toEqual([ + expect( + parseFormats('markdown,html,links,images,screenshot,video') + ).toEqual([ 'markdown', 'html', 'links', 'images', 'screenshot', + 'video', ]); }); }); diff --git a/src/__tests__/utils/output.test.ts b/src/__tests__/utils/output.test.ts index 8667768ece..3bae673a3b 100644 --- a/src/__tests__/utils/output.test.ts +++ b/src/__tests__/utils/output.test.ts @@ -201,6 +201,53 @@ describe('Output Utilities', () => { ); }); + it('should output newline-separated video URLs for single video format', () => { + vi.mocked(fs.existsSync).mockReturnValue(true); + + handleScrapeOutput( + { + success: true, + data: { + videos: [ + { + url: 'https://cdn.example.com/product.mp4', + sourceURL: 'https://example.com/product', + source: 'script', + }, + { + url: 'https://cdn.example.com/demo.mp4', + sourceURL: 'https://example.com/product', + source: 'html', + }, + ], + }, + }, + ['video'] + ); + + expect(stdoutWriteSpy).toHaveBeenCalledWith( + 'https://cdn.example.com/product.mp4\nhttps://cdn.example.com/demo.mp4\n' + ); + }); + + it('should output legacy video URL for single video format', () => { + vi.mocked(fs.existsSync).mockReturnValue(true); + + handleScrapeOutput( + { + success: true, + data: { + video: 'https://storage.example.com/video.mp4', + }, + }, + ['video'] + ); + + expect(stdoutWriteSpy).toHaveBeenCalledWith( + 'https://storage.example.com/video.mp4\n' + ); 
+ }); + it('should output summary for single summary format', () => { vi.mocked(fs.existsSync).mockReturnValue(true); @@ -263,6 +310,41 @@ describe('Output Utilities', () => { expect(parsed.links).toEqual(['https://example.com']); }); + it('should include videos in JSON output for multiple formats', () => { + vi.mocked(fs.existsSync).mockReturnValue(true); + + handleScrapeOutput( + { + success: true, + data: { + markdown: '# Test', + videos: [ + { + url: 'https://cdn.example.com/product.mp4', + sourceURL: 'https://example.com/product', + source: 'script', + thumbnail: 'https://cdn.example.com/poster.jpg', + }, + ], + metadata: { title: 'Test' }, + }, + }, + ['markdown', 'video'] + ); + + const output = stdoutWriteSpy.mock.calls[0][0]; + const parsed = JSON.parse(output); + expect(parsed.markdown).toBe('# Test'); + expect(parsed.videos).toEqual([ + { + url: 'https://cdn.example.com/product.mp4', + sourceURL: 'https://example.com/product', + source: 'script', + thumbnail: 'https://cdn.example.com/poster.jpg', + }, + ]); + }); + it('should output pretty JSON when pretty flag is true', () => { vi.mocked(fs.existsSync).mockReturnValue(true); diff --git a/src/commands/scrape.ts b/src/commands/scrape.ts index 67b31bbd7b..8abb33b22a 100644 --- a/src/commands/scrape.ts +++ b/src/commands/scrape.ts @@ -383,6 +383,7 @@ async function runWizard( { name: 'html', value: 'html' }, { name: 'links', value: 'links' }, { name: 'images', value: 'images' }, + { name: 'video', value: 'video' }, { name: 'summary', value: 'summary' }, { name: 'screenshot', value: 'screenshot' }, { name: 'full page screenshot', value: 'fullPageScreenshot' }, diff --git a/src/index.ts b/src/index.ts index 44801d7339..e391da32b9 100644 --- a/src/index.ts +++ b/src/index.ts @@ -320,7 +320,7 @@ function createScrapeCommand(): Command { .option('-H, --html', 'Output raw HTML (shortcut for --format html)') .option( '-f, --format <formats>', - 'Output format(s). 
Multiple formats can be specified with commas (e.g., "markdown,links,images"). Available: markdown, html, rawHtml, links, images, screenshot, summary, changeTracking, json, attributes, branding. Single format outputs raw content; multiple formats output JSON.' + 'Output format(s). Multiple formats can be specified with commas (e.g., "markdown,links,images"). Available: markdown, html, rawHtml, links, images, screenshot, summary, changeTracking, json, attributes, branding, video. Single format outputs raw content; multiple formats output JSON.' ) .option('--only-main-content', 'Include only main content', false) .option( @@ -501,7 +501,7 @@ function createDownloadCommand(): Command { .option('--allow-subdomains', 'Include subdomains', false) .option( '-f, --format <formats>', - 'Output format(s), comma-separated (default: markdown). Available: markdown, html, rawHtml, links, images, summary, json' + 'Output format(s), comma-separated (default: markdown). Available: markdown, html, rawHtml, links, images, summary, json, video' ) .option('-H, --html', 'Download as HTML (shortcut for --format html)') .option( @@ -847,6 +847,12 @@ Max upload size: 50 MB url: 'file://' + file, format: format ?? 'markdown', }); + if (scrapeOptions.formats?.includes('video')) { + console.error( + 'Error: The video format is not supported for parse. Use scrape with --format video for webpage video discovery.' + ); + process.exit(1); + } await handleParseCommand({ file, @@ -913,7 +919,7 @@ function createSearchCommand(): Command { .option('--scrape', 'Enable scraping of search results', false) .option( '--scrape-formats <formats>', - 'Comma-separated scrape formats when --scrape is enabled: markdown, html, rawHtml, links, etc. (default: markdown)' + 'Comma-separated scrape formats when --scrape is enabled: markdown, html, rawHtml, links, video, etc. 
(default: markdown)' ) .option( '--only-main-content', diff --git a/src/types/scrape.ts b/src/types/scrape.ts index 6cdbe89e8c..ac57b4f830 100644 --- a/src/types/scrape.ts +++ b/src/types/scrape.ts @@ -13,7 +13,45 @@ export type ScrapeFormat = | 'changeTracking' | 'json' | 'attributes' - | 'branding'; + | 'branding' + | 'video'; + +export interface VideoItem { + url: string; + sourceURL: string; + source: string; + kind?: string; + provider?: string; + title?: string; + thumbnail?: string; + description?: string; + duration?: string; + mimeType?: string; + width?: number; + height?: number; + metadata?: Record<string, unknown>; +} + +export interface ScrapeDocument { + markdown?: string; + html?: string; + rawHtml?: string; + links?: string[]; + images?: string[]; + screenshot?: string; + summary?: string; + audio?: string; + video?: string; + videos?: VideoItem[]; + answer?: string; + highlights?: string; + warning?: string; + actions?: Record<string, unknown>; + changeTracking?: Record<string, unknown>; + branding?: Record<string, unknown>; + metadata?: Record<string, unknown>; + [key: string]: unknown; +} export interface ScrapeLocation { /** ISO 3166-1 alpha-2 country code (e.g., 'US', 'DE', 'BR') */ @@ -76,6 +114,6 @@ export interface ScrapeOptions { export interface ScrapeResult { success: boolean; - data?: any; + data?: ScrapeDocument; error?: string; } diff --git a/src/utils/options.ts b/src/utils/options.ts index 878e6b5f54..bca086ea15 100644 --- a/src/utils/options.ts +++ b/src/utils/options.ts @@ -23,6 +23,7 @@ const VALID_FORMATS: ScrapeFormat[] = [ 'json', 'attributes', 'branding', + 'video', ]; /** diff --git a/src/utils/output.ts b/src/utils/output.ts index 4c7842c065..ffefee7c57 100644 --- a/src/utils/output.ts +++ b/src/utils/output.ts @@ -31,6 +31,7 @@ const RAW_TEXT_FORMATS: ScrapeFormat[] = [ 'links', 'images', 'summary', + 'video', ]; /** @@ -99,6 +100,18 @@ function extractContent(data: any, format: ScrapeFormat): string | null { return 
data.summary || data[format] || null; } + // Handle video format. 
`video` is the legacy provider-download URL, while + // `videos` is the generic page-level discovery array with metadata. + if (format === 'video') { + if (Array.isArray(data.videos)) { + return data.videos + .map((video: any) => video?.url) + .filter((url: unknown): url is string => typeof url === 'string') + .join('\n'); + } + return data.video || data[format] || null; + } + return null; } @@ -116,6 +129,11 @@ function extractMultipleFormats( if (data[key] !== undefined) { result[key] = data[key]; + if (format === 'video' && data.videos !== undefined) { + result.videos = data.videos; + } + } else if (format === 'video' && data.videos !== undefined) { + result.videos = data.videos; } else if (format === 'html' && data.rawHtml !== undefined) { // Fallback for html -> rawHtml result[key] = data.rawHtml; From 38d72990ac1e966066840af4db54782fe7c2dfa7 Mon Sep 17 00:00:00 2001 From: Abimael Martell <1450169+abimaelmartell@users.noreply.github.com> Date: Fri, 12 Jun 2026 11:00:42 -0700 Subject: [PATCH 2/3] chore(cli): bump version --- README.md | 4 ++-- package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a2cdd2751e..efaa206f6e 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ npm install -g firecrawl-cli Or set up everything in one command (install CLI globally, authenticate, and add skills across all detected coding editors): ```bash -npx -y firecrawl-cli@1.19.6 init -y --browser +npx -y firecrawl-cli@1.19.7 init -y --browser ``` - `-y` runs setup non-interactively @@ -678,7 +678,7 @@ firecrawl --status ``` ``` - 🔥 firecrawl cli v1.19.6 + 🔥 firecrawl cli v1.19.7 ● Authenticated via stored credentials Concurrency: 0/100 jobs (parallel scrape limit) diff --git a/package.json b/package.json index c529441788..869ce32a46 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "firecrawl-cli", - "version": "1.19.6", + "version": "1.19.7", "description": "Command-line interface for Firecrawl. 
Scrape, crawl, and extract data from any website directly from your terminal.", "main": "dist/index.js", "bin": { From 5aef5b678246e64b46cf3c27f6ae439413c49296 Mon Sep 17 00:00:00 2001 From: Abimael Martell <1450169+abimaelmartell@users.noreply.github.com> Date: Fri, 12 Jun 2026 11:17:47 -0700 Subject: [PATCH 3/3] fix(cli): save video downloads --- src/__tests__/commands/scrape.test.ts | 74 ++++++++++++++++++++++++++- src/commands/scrape.ts | 34 ++++++++++++ 2 files changed, 107 insertions(+), 1 deletion(-) diff --git a/src/__tests__/commands/scrape.test.ts b/src/__tests__/commands/scrape.test.ts index 44d864eaee..c469857605 100644 --- a/src/__tests__/commands/scrape.test.ts +++ b/src/__tests__/commands/scrape.test.ts @@ -2,8 +2,11 @@ * Tests for scrape command */ +import fs from 'fs'; +import os from 'os'; +import path from 'path'; import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; -import { executeScrape } from '../../commands/scrape'; +import { executeScrape, handleAllScrapeCommand } from '../../commands/scrape'; import { getClient } from '../../utils/client'; import { initializeConfig } from '../../utils/config'; import { setupTest, teardownTest } from '../utils/mock-client'; @@ -480,4 +483,73 @@ describe('executeScrape', () => { }); }); }); + + describe('download output', () => { + it('should save video URLs and metadata for video format downloads', async () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'firecrawl-cli-')); + const previousCwd = process.cwd(); + const stderrSpy = vi + .spyOn(process.stderr, 'write') + .mockImplementation(() => true); + + const videos = [ + { + url: 'https://cdn.example.com/video-a.mp4', + sourceURL: 'https://example.com/product', + source: 'script', + kind: 'file', + provider: 'cdn.example.com', + thumbnail: 'https://cdn.example.com/thumb-a.jpg', + }, + { + url: 'https://cdn.example.com/video-b.mp4', + sourceURL: 'https://example.com/product', + source: 'script', + kind: 'file', + provider: 
'cdn.example.com', + }, + ]; + + mockClient.map = vi.fn().mockResolvedValue({ + links: [{ url: 'https://example.com/product' }], + }); + mockClient.scrape.mockResolvedValue({ videos }); + + try { + process.chdir(tmpDir); + initializeConfig({ apiUrl: 'http://localhost:3002' }); + + await handleAllScrapeCommand( + 'https://example.com/product', + { + url: 'https://example.com/product', + apiUrl: 'http://localhost:3002', + formats: ['video'], + }, + { yes: true, limit: 1 } + ); + + const outputDir = path.join( + tmpDir, + '.firecrawl', + 'example.com', + 'product' + ); + const videosTxt = path.join(outputDir, 'videos.txt'); + const videosJson = path.join(outputDir, 'videos.json'); + + expect(fs.readFileSync(videosTxt, 'utf-8')).toBe( + 'https://cdn.example.com/video-a.mp4\nhttps://cdn.example.com/video-b.mp4' + ); + expect(JSON.parse(fs.readFileSync(videosJson, 'utf-8'))).toEqual( + videos + ); + expect(fs.existsSync(path.join(outputDir, 'index.json'))).toBe(false); + } finally { + process.chdir(previousCwd); + stderrSpy.mockRestore(); + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }); + }); }); diff --git a/src/commands/scrape.ts b/src/commands/scrape.ts index 8abb33b22a..a74e4b4aca 100644 --- a/src/commands/scrape.ts +++ b/src/commands/scrape.ts @@ -311,6 +311,24 @@ function urlToNestedPath(url: string, filename: string = 'index.md'): string { } } +function getVideoUrls(data?: ScrapeResult['data']): string[] { + if (!data) return []; + + if (Array.isArray(data.videos)) { + return data.videos + .map((video) => video?.url) + .filter( + (url): url is string => typeof url === 'string' && url.length > 0 + ); + } + + if (typeof data.video === 'string' && data.video.length > 0) { + return [data.video]; + } + + return []; +} + /** * Map an entire site and scrape all discovered URLs. * Organizes results into nested directories based on URL paths. 
@@ -657,6 +675,22 @@ export async function handleAllScrapeCommand( fs.writeFileSync(filepath, result.data.images.join('\n'), 'utf-8'); savedFiles.push(filepath); } + } else if (fmt === 'video') { + const videoUrls = getVideoUrls(result.data); + if (videoUrls.length > 0) { + const filepath = path.join(dir, 'videos.txt'); + fs.writeFileSync(filepath, videoUrls.join('\n'), 'utf-8'); + savedFiles.push(filepath); + } + if (Array.isArray(result.data?.videos)) { + const filepath = path.join(dir, 'videos.json'); + fs.writeFileSync( + filepath, + JSON.stringify(result.data.videos, null, 2), + 'utf-8' + ); + savedFiles.push(filepath); + } } else if (fmt === 'summary') { if (result.data?.summary) { const filepath = path.join(dir, 'summary.md');