trigger.dev/docs/guides/python/python-crawl4ai.mdx at c3ef5beb9168546d4082b78bfb88aeeeee40b169 · triggerdotdev/trigger.dev

title	sidebarTitle	description
Python headless browser web crawler example	Python headless web crawler	Learn how to use Python, Crawl4AI and Playwright to create a headless browser web crawler with Trigger.dev.

import ScrapingWarning from "/snippets/web-scraping-warning.mdx"; import PythonLearnMore from "/snippets/python-learn-more.mdx";

Prerequisites

A project with Trigger.dev initialized
Python installed on your local machine

Overview

This demo showcases how to use Trigger.dev with Python to build a web crawler that uses a headless browser to navigate websites and extract content.

Features

Trigger.dev for background task orchestration
Our Python build extension to install the dependencies and run the Python script
Crawl4AI, an open source LLM friendly web crawler
A custom Playwright extension to create a headless chromium browser

GitHub repo

<Card title="View the project on GitHub" icon="GitHub" href="https://github.com/triggerdotdev/examples/tree/main/python-crawl4ai">

Click here to view the full code for this project in our examples repository on GitHub. You can fork it and use it as a starting point for your own project.

</Card>

The code

Build configuration

After you've initialized your project with Trigger.dev, add these build settings to your trigger.config.ts file:

import { defineConfig } from "@trigger.dev/sdk/v3";
import { pythonExtension } from "@trigger.dev/python/extension";
import type { BuildContext, BuildExtension } from "@trigger.dev/core/v3/build";

// Trigger.dev project configuration, exported as the module's default export.
export default defineConfig({
  // Placeholder: replace with the project ref copied from your Trigger.dev dashboard.
  project: "<project ref>",
  // Your other config settings...
  build: {
    // Both entries below are needed for this example.
    extensions: [
      // Python build extension: installs the Python dependencies and lets
      // tasks run the Python script.
      pythonExtension(),
      // Custom extension (defined later in this file) that sets up a headless
      // Chromium browser with Playwright.
      installPlaywrightChromium(),
    ],
  },
});

/**
 * Custom build extension that installs Playwright and Chromium.
 *
 * When the build completes, it registers a "playwright" layer containing the
 * image instructions that install Chromium's system dependencies, Playwright
 * and the Chromium browser, plus the deploy-time environment variables that
 * point Playwright at the installed browser.
 *
 * @returns The build extension to list under `build.extensions`.
 */
export function installPlaywrightChromium(): BuildExtension {
  // Directory the browser is installed into at image build time; the same
  // value is exported to the deploy environment below.
  const browsersPath = "/ms-playwright";

  const extension: BuildExtension = {
    name: "InstallPlaywrightChromium",
    onBuildComplete(context: BuildContext) {
      const dockerSteps = [
        // Base and Chromium dependencies. A backslash followed by a newline
        // inside a template literal is a line continuation, so this whole
        // literal is a single RUN instruction.
        `RUN apt-get update && apt-get install -y --no-install-recommends \
          curl unzip npm libnspr4 libatk1.0-0 libatk-bridge2.0-0 libatspi2.0-0 \
          libasound2 libnss3 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \
          libgbm1 libxkbcommon0 \
          && apt-get clean && rm -rf /var/lib/apt/lists/*`,

        // Install Playwright, then download Chromium into the browsers directory
        `RUN npm install -g playwright`,
        `RUN mkdir -p ${browsersPath}`,
        `RUN PLAYWRIGHT_BROWSERS_PATH=${browsersPath} python -m playwright install --with-deps chromium`,
      ];

      // Environment applied on deploy. NOTE(review): the two SKIP flags
      // presumably stop Playwright from re-downloading or re-validating the
      // browser at runtime — confirm against the Playwright docs.
      const deployEnv = {
        PLAYWRIGHT_BROWSERS_PATH: browsersPath,
        PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD: "1",
        PLAYWRIGHT_SKIP_BROWSER_VALIDATION: "1",
      };

      context.addLayer({
        id: "playwright",
        image: { instructions: dockerSteps },
        deploy: {
          env: deployEnv,
          override: true,
        },
      });
    },
  };

  return extension;
}

Learn more about the trigger.config.ts file including setting default retry settings, customizing the build environment, and more.

Task code

This task uses the python.runScript method to run the crawl-url.py script with the given URL as an argument. You can see the original task in our examples repository here.

import { logger, schemaTask, task } from "@trigger.dev/sdk/v3";
import { python } from "@trigger.dev/python";
import { z } from "zod";

/**
 * Task that runs the `crawl-url.py` script for a given URL and returns the
 * script's standard output.
 *
 * The payload is validated against the schema below, so `url` must be a
 * string in URL format.
 */
export const convertUrlToMarkdown = schemaTask({
  id: "convert-url-to-markdown",
  schema: z.object({ url: z.string().url() }),
  run: async ({ url }) => {
    // The URL is passed to the script as its only command-line argument.
    const scriptArgs = [url];
    const scriptResult = await python.runScript("./src/python/crawl-url.py", scriptArgs);

    // Log the whole script result alongside the URL for debugging.
    logger.debug("convert-url-to-markdown", { url, result: scriptResult });

    return scriptResult.stdout;
  },
});

Add a requirements.txt file

Add the following to your requirements.txt file. This is required in Python projects to install the dependencies.

crawl4ai
playwright
urllib3<2.0.0

The Python script

The Python script is a simple script using Crawl4AI that takes a URL and returns the markdown content of the page. You can see the original script in our examples repository here.

import asyncio
import sys
from crawl4ai import *

async def main(url: str):
    """Crawl ``url`` with Crawl4AI and print the page's markdown to stdout."""
    async with AsyncWebCrawler() as web_crawler:
        page = await web_crawler.arun(url=url)
        # The markdown is written to stdout, which is how the script reports
        # its result.
        print(page.markdown)

if __name__ == "__main__":
    # The first positional argument is the URL to crawl; any further
    # arguments are ignored.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python crawl-url.py <url>")
        sys.exit(1)
    asyncio.run(main(cli_args[0]))

Testing your task

Create a virtual environment python -m venv venv
Activate the virtual environment, depending on your OS: On Mac/Linux: source venv/bin/activate, on Windows: venv\Scripts\activate
Install the Python dependencies pip install -r requirements.txt
If you haven't already, copy your project ref from your Trigger.dev dashboard and add it to the trigger.config.ts file.
Run the Trigger.dev dev CLI command with npx trigger.dev@latest dev (it may ask you to authorize the CLI if you haven't already).
Test the task in the dashboard, using a URL of your choice.

Deploying your task

Deploy the task to production using the CLI command npx trigger.dev@latest deploy

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Prerequisites

Overview

Features

GitHub repo

The code

Build configuration

Task code

Add a requirements.txt file

The Python script

Testing your task

Deploying your task

Uh oh!

FilesExpand file tree

python-crawl4ai.mdx

Latest commit

History

python-crawl4ai.mdx

File metadata and controls

Prerequisites

Overview

Features

GitHub repo

The code

Build configuration

Task code

Add a requirements.txt file

The Python script

Testing your task

Deploying your task