From 43a1671a10840e42e75f595d5887fdf715a3e3e6 Mon Sep 17 00:00:00 2001 From: Ege BULUT Date: Tue, 23 Jun 2026 11:53:35 +0300 Subject: [PATCH] fix: lazy imports to prevent torchcodec FFmpeg DLL crash on Windows (#1089) Remove BaseLoader inheritance from ChromiumLoader and use lazy imports for PyPDFLoader and AsyncChromiumLoader to avoid triggering the sentence_transformers -> torchcodec -> FFmpeg native DLL loading chain at import time, which crashes on systems where FFmpeg DLLs are not available. Also add torchcodec mock to conftest.py for the test suite. Changes: - chromium.py: remove BaseLoader import/inheritance, add load()/aload() - docloaders/__init__.py: lazy __getattr__ for ChromiumLoader, PlasmateLoader - fetch_node.py: lazy PyPDFLoader import - robots_node.py: lazy AsyncChromiumLoader import - conftest.py: torchcodec module mock --- scrapegraphai/docloaders/__init__.py | 19 +++++++++++++++++-- scrapegraphai/docloaders/chromium.py | 11 +++++++++-- scrapegraphai/nodes/fetch_node.py | 2 +- scrapegraphai/nodes/robots_node.py | 2 +- tests/conftest.py | 15 +++++++++++++++ 5 files changed, 43 insertions(+), 6 deletions(-) diff --git a/scrapegraphai/docloaders/__init__.py b/scrapegraphai/docloaders/__init__.py index 99b99c64..a4e8e383 100644 --- a/scrapegraphai/docloaders/__init__.py +++ b/scrapegraphai/docloaders/__init__.py @@ -1,12 +1,27 @@ """ This module handles document loading functionalities for the ScrapeGraphAI application. + +Note: ChromiumLoader and PlasmateLoader are lazy-imported to avoid triggering +torchcodec/FFmpeg DLL loading at import time (sentence_transformers -> torchcodec chain). """ from .browser_base import browser_base_fetch -from .chromium import ChromiumLoader -from .plasmate import PlasmateLoader from .scrape_do import scrape_do_fetch +_LAZY_MODULES = { + "ChromiumLoader": ".chromium", + "PlasmateLoader": ".plasmate", +} + + +def __getattr__(name): + if name in _LAZY_MODULES: + import importlib + module = importlib.import_module(_LAZY_MODULES[name], __package__) + return getattr(module, name) + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + __all__ = [ "browser_base_fetch", "ChromiumLoader", diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py index d8cd00ae..a522582c 100644 --- a/scrapegraphai/docloaders/chromium.py +++ b/scrapegraphai/docloaders/chromium.py @@ -3,7 +3,6 @@ import aiohttp import async_timeout -from langchain_community.document_loaders.base import BaseLoader from langchain_core.documents import Document from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy @@ -11,7 +10,7 @@ logger = get_logger("web-loader") -class ChromiumLoader(BaseLoader): +class ChromiumLoader: """Scrapes HTML pages from URLs using a (headless) instance of the Chromium web driver with proxy protection. @@ -436,6 +435,14 @@ async def ascrape_with_js_support( finally: await browser.close() + def load(self) -> List[Document]: + """Load all documents synchronously.""" + return list(self.lazy_load()) + + async def aload(self) -> List[Document]: + """Load all documents asynchronously.""" + return [doc async for doc in self.alazy_load()] + def lazy_load(self) -> Iterator[Document]: """ Lazily load text content from the provided URLs. diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index ada86e59..c55b96f6 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -7,7 +7,6 @@ import concurrent.futures import requests -from langchain_community.document_loaders import PyPDFLoader from langchain_core.documents import Document from langchain_openai import AzureChatOpenAI, ChatOpenAI @@ -182,6 +181,7 @@ def load_file_content(self, source, input_type): """ if input_type == "pdf": + from langchain_community.document_loaders import PyPDFLoader loader = PyPDFLoader(source) # PyPDFLoader.load() can be blocking for large PDFs. Run it in a thread and # enforce the configured timeout if provided. diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py index aa8da848..eee9c79f 100644 --- a/scrapegraphai/nodes/robots_node.py +++ b/scrapegraphai/nodes/robots_node.py @@ -7,7 +7,6 @@ from langchain_core.output_parsers import CommaSeparatedListOutputParser from langchain_core.prompts import PromptTemplate -from langchain_community.document_loaders import AsyncChromiumLoader from ..helpers import robots_dictionary from ..prompts import TEMPLATE_ROBOT @@ -90,6 +89,7 @@ def execute(self, state: dict) -> dict: else: parsed_url = urlparse(source) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + from langchain_community.document_loaders import AsyncChromiumLoader loader = AsyncChromiumLoader(f"{base_url}/robots.txt") document = loader.load() if "ollama" in self.llm_model.model: diff --git a/tests/conftest.py b/tests/conftest.py index a9f06374..175b8497 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,9 +14,24 @@ from typing import Any, Dict from unittest.mock import Mock +import sys +import types + import pytest from dotenv import load_dotenv +# Mock torchcodec to prevent FFmpeg DLL crashes at import time. +# sentence_transformers -> torchcodec -> FFmpeg native DLLs can't load on some systems. +_tc = types.ModuleType("torchcodec") +_tc.__version__ = "0.0.0" +_tc.__file__ = "" +_tc.__spec__ = types.ModuleType("spec") +_tc.__spec__.name = "torchcodec" +_tc.__spec__.loader = None +_tc.__spec__.submodule_search_locations = [] +if "torchcodec" not in sys.modules: + sys.modules["torchcodec"] = _tc + # Load environment variables load_dotenv()