From 97e3c7565aa5bbc77a0a3708b2d49444b91de262 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 9 Jun 2026 21:45:16 +0000 Subject: [PATCH 1/2] expose `impersonate` flag on HTTP crawlers --- .../code_examples/http_headers/set_headers.py | 33 +++++ docs/guides/http_headers.mdx | 125 ++++++++++++++++++ .../_abstract_http/_abstract_http_crawler.py | 16 +++ .../test_beautifulsoup_crawler.py | 31 +++++ .../unit/crawlers/_http/test_http_crawler.py | 29 ++++ .../crawlers/_parsel/test_parsel_crawler.py | 31 +++++ 6 files changed, 265 insertions(+) create mode 100644 docs/guides/code_examples/http_headers/set_headers.py create mode 100644 docs/guides/http_headers.mdx diff --git a/docs/guides/code_examples/http_headers/set_headers.py b/docs/guides/code_examples/http_headers/set_headers.py new file mode 100644 index 0000000000..050bc5ac67 --- /dev/null +++ b/docs/guides/code_examples/http_headers/set_headers.py @@ -0,0 +1,33 @@ +import asyncio + +from crawlee import HttpHeaders, Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext +from crawlee.http_clients import ImpitHttpClient + + +async def main() -> None: + # Set default headers on the client. They are sent on every request. + http_client = ImpitHttpClient(headers={'X-Api-Key': 'secret'}) + + crawler = HttpCrawler(http_client=http_client) + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + # `httpbin.org/headers` echoes the received request headers back. + response = (await context.http_response.read()).decode() + context.log.info(response) + + # Add a header for this request only. It merges with the client defaults. + request = Request.from_url( + 'https://httpbin.org/headers', + headers=HttpHeaders({'Accept': 'application/json'}), + # Both requests target the same URL. Without a distinct `unique_key`, + # deduplication would drop this one. 
+ unique_key='set-headers-example', + ) + + await crawler.run(['https://httpbin.org/headers', request]) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/http_headers.mdx b/docs/guides/http_headers.mdx new file mode 100644 index 0000000000..d32658101a --- /dev/null +++ b/docs/guides/http_headers.mdx @@ -0,0 +1,125 @@ +--- +id: http-headers +title: HTTP headers +description: Learn how HTTP headers work in web scraping and how to work with them in Crawlee. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import SetHeadersExample from '!!raw-loader!roa-loader!./code_examples/http_headers/set_headers.py'; + +Every request a crawler sends includes [HTTP headers](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers). These headers tell the server who is making the request, what content is acceptable, and in what language. The server reads them and decides what to return. The same URL can return different content, a different status code, or a blocked page depending on the headers it sees. This guide covers the headers that shape a scraping request, like `User-Agent`, `Accept-Language`, and `Content-Type`, what Crawlee sends by default, and how to change them. + +## What headers do + +Headers are key-value metadata attached to a request. Some of them shape what you get back. Others identify you or carry state. The ones that matter most for scraping are below. + +### Identity headers + +`User-Agent` identifies the client. Many sites serve different markup to a browser than to a crawler. Some reject requests whose `User-Agent` doesn't look like a real browser. It is one of the basic headers a server uses to identify the client, though not the only one. + +`Referer` says which page the request came from. Some sites gate content, images, or API responses behind an expected `Referer`. 
A direct request with no `Referer`, or the wrong one, gets a different answer than a click from inside the site. + +### Content negotiation + +These headers tell the server what the client can handle. The server uses them to pick what to send. + +`Accept` lists the formats the client wants. The same endpoint can return HTML to one `Accept` and JSON to another. If you need data from an API, try setting it to `application/json` to get JSON instead of a rendered page. + +`Accept-Language` lists the languages the client prefers, in priority order. It is a preference, not a switch. A server honors it only for content it actually serves in more than one language, and ignores it otherwise. Where it applies, it changes translated text, date and number formats, and sometimes currency. Set it to match the locale you expect, then confirm from the response that the server applied it. + +`Accept-Encoding` lists the compression formats the client accepts, such as `gzip`, `br`, or `zstd`. The server compresses the body to one of them. This matters for cost. Without compression the response body can be several times larger, and when you route traffic through a metered [proxy](./proxy-management) that extra volume is billed bandwidth. Crawlee's HTTP clients advertise the formats they support and decompress the response for you, so you receive the smaller body and read it as plain bytes. + +### Request body + +`Content-Type` declares the format of the body you send, not the format you want back. It applies whenever a request carries a body, for example a `POST` that submits a form or JSON. An API that expects `application/json` can reject a payload sent as `application/x-www-form-urlencoded`, and a form endpoint can reject the reverse. Set it to match the body you attach. + +`Content-Length` is derived from the body for you, so you don't set it by hand. + +`Origin` says which site the request was initiated from. 
Some APIs check it on requests that carry a body and reject the ones that don't match an expected value. + +### Authentication and stateful headers + +`Cookie` carries session and login state. Crawlee manages cookies through [sessions](./session-management), so you rarely set this one by hand. + +`Authorization` carries credentials, such as a bearer token or basic auth. APIs commonly require it. Set it on the request when the target needs authenticated access. Treat its value as a secret, and don't send it through a [proxy you don't control](./security-of-web-scraping#untrusted-proxies). + +### Client hints and fingerprinting headers + +`sec-ch-ua`, `sec-fetch-*`, and similar client hints describe the browser, its platform, and how the request was initiated. Real browsers send them. Most automated clients don't. Anti-bot systems read them to separate a browser from automated traffic. + +### Non-standard headers + +A server can read any header it wants, not only the standard ones. AJAX endpoints often expect `X-Requested-With: XMLHttpRequest`. A site can require a custom `X-Api-Key` or `X-CSRF-Token`. A mobile app's backend usually expects its own set, such as an app version in `X-App-Version`, a device id in `X-Device-Id`, or a token the app attaches itself. There is no fixed list. When a request works in a browser or an app but fails from a crawler, capture the full set of headers the original sends and look for one you're missing. + +### Headers don't guarantee a result + +A header is a request, not a command. The server decides what to do with it. It can ignore a header entirely, so a value you set has no effect on the response. It can reject a value it doesn't accept and answer with an error. And some headers only take effect in combination with others. Setting a header is the first step. Confirm from the response that it did what you expected. + +## Default headers in Crawlee + +All built-in HTTP clients impersonate a browser by default. 
Instead of a bare library `User-Agent` like `python-httpx/0.27`, they send a realistic set of browser-like headers: a browser `User-Agent`, an `Accept`, an `Accept-Language`, and client hints where the client supports them. This makes a crawl look like normal browser traffic and avoids the simplest forms of blocking. + +Each client implements impersonation its own way: + +- `ImpitHttpClient` (the default) impersonates Firefox at the TLS and HTTP layer through the [`impit`](https://pypi.org/project/impit/) library. +- `HttpxHttpClient` uses a `HeaderGenerator` to add `Accept`, `Accept-Language`, and `User-Agent`. +- `CurlImpersonateHttpClient` impersonates Chrome at the TLS and HTTP layer through [`curl-cffi`](https://curl-cffi.readthedocs.io/). + +The header values match a specific version of a real browser, so the whole set stays internally consistent rather than a mix that no real client would send. For more on staying unblocked, see the [avoid blocking](./avoid-blocking) guide. + +## When impersonation hurts + +Browser-like headers are the right default for scraping normal web pages. They are the wrong default for some APIs and custom endpoints. + +A server can expect specific header values that differ from the ones a browser sends. When the headers don't match what it expects, the response can be wrong: an error, a redirect, or a payload meant for a different client. The browser-like values Crawlee adds are part of that mismatch. An endpoint can answer correctly to a plain request and break once an `Accept-Language` or a full browser header set is attached. + +If a request behaves differently through Crawlee than through a minimal client, the injected headers are the first thing to check. Inspect what your crawler actually sends by requesting an echo endpoint such as `https://httpbin.org/headers` and reading the response. + +## Turning impersonation off + +The HTTP crawlers (`HttpCrawler`, `BeautifulSoupCrawler`, `ParselCrawler`) take an `impersonate` flag. 
Set it to `False` to stop the default HTTP client from injecting browser-like headers: + +```python +from crawlee.crawlers import HttpCrawler +from crawlee.http_clients import ImpitHttpClient + +# Disable browser-like headers on the default client. +crawler = HttpCrawler(impersonate=False) + +# The same result when you supply the client yourself. +crawler = HttpCrawler(http_client=ImpitHttpClient(browser=None)) +``` + +The flag only configures the default client. If you pass your own `http_client`, it's ignored, and you turn impersonation off on that client directly. The option is named differently on each one: + +- `ImpitHttpClient(browser=None)` +- `HttpxHttpClient(header_generator=None)` +- `CurlImpersonateHttpClient(impersonate=None)` + +## Setting your own headers + +To send the same custom headers on every request, set them on the HTTP client. To add a header for a single request, pass it on the `Request`. The two sets are merged, and if both define the same header, the per-request value wins. + +The example below sets `X-Api-Key` on the client and `Accept` on one of two requests to an echo endpoint. The client header reaches both requests, and the per-request `Accept` is added only to the request that sets it. Impersonation stays on, so the echo also returns the full browser header set, with the request's `Accept` in place of the impersonated one: + +<RunnableCodeBlock className="language-python" language="python"> + {SetHeadersExample} +</RunnableCodeBlock> + +`HttpxHttpClient` and `CurlImpersonateHttpClient` take the same `headers` argument. + +Header names are case-insensitive, and `HttpHeaders` normalizes the casing for you, so `user-agent` and `User-Agent` refer to the same header. + +## Header order and fingerprinting + +Anti-bot systems look at more than header values. They look at which headers are present, their casing, and the order they arrive in. Real browsers send a consistent, recognizable set. A request that has a browser `User-Agent` but the wrong header order, or missing client hints, still looks automated. 
+ +This is why `ImpitHttpClient` and `CurlImpersonateHttpClient` replicate the browser at the transport layer rather than just attaching headers. Setting a browser `User-Agent` on a plain client is not enough to pass these checks. If a target uses fingerprinting, prefer an impersonating client over hand-set headers. + +## Conclusion + +Headers decide what a server sends back. Crawlee impersonates a browser by default, which keeps a crawl unblocked on normal pages but can break endpoints that expect different headers. Turn impersonation off with `impersonate=False` when you target such an endpoint, set custom headers on the client or per request, and reach for an impersonating client when the target fingerprints its traffic. + +If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 8d15a1d801..67c8487df2 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -16,6 +16,7 @@ from crawlee._utils.urls import to_absolute_url_iterator from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline from crawlee.errors import SessionError +from crawlee.http_clients import ImpitHttpClient from crawlee.statistics import StatisticsState from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult @@ -72,8 +73,20 @@ def __init__( *, parser: AbstractHttpParser[TParseResult, TSelectResult], navigation_timeout: timedelta | None = None, + impersonate: bool = True, **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]], ) -> None: + """Initialize a new instance. 
+ + Args: + parser: An instance of `AbstractHttpParser` used for parsing HTTP responses. + navigation_timeout: Timeout for navigation (the process between sending a request and calling the request + handler) + impersonate: Whether the default HTTP client should impersonate a browser by sending browser-like + headers. This applies only to the default client. If you pass your own `http_client`, this flag + is ignored and you configure impersonation on that client directly. + kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. + """ self._parser = parser self._navigation_timeout = navigation_timeout or timedelta(minutes=1) self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = [] @@ -86,6 +99,9 @@ def __init__( 'AbstractHttpCrawler._create_static_content_crawler_pipeline() method to initialize it.' ) + if impersonate is False and 'http_client' not in kwargs: + kwargs['http_client'] = ImpitHttpClient(browser=None) + kwargs.setdefault('_logger', logging.getLogger(self.__class__.__name__)) super().__init__(**kwargs) diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py index 9a828b0078..2f88c88a1a 100644 --- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py +++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py @@ -1,10 +1,12 @@ from __future__ import annotations import asyncio +import json import sys from datetime import timedelta from typing import TYPE_CHECKING from unittest import mock +from unittest.mock import AsyncMock import pytest @@ -501,3 +503,32 @@ def test_import_error_handled() -> None: sys.modules.pop(mod_name, None) with pytest.raises(ImportError): from crawlee.crawlers import BeautifulSoupCrawler # noqa: F401 PLC0415 + + +@pytest.mark.parametrize( + 'impersonate', + [ + pytest.param(False, id='impersonate_disabled'), + pytest.param(True, id='impersonate_enabled'), + ], +) 
+async def test_impersonate_option(server_url: URL, *, impersonate: bool) -> None: + crawler = BeautifulSoupCrawler(impersonate=impersonate) + + call_mock = AsyncMock() + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + await call_mock(json.loads(await context.http_response.read())) + + await crawler.run([str(server_url / 'headers')]) + + call_mock.assert_called_once() + headers = call_mock.call_args[0][0] + + if impersonate: + assert 'Mozilla' in headers.get('user-agent', '') + assert headers.get('priority', '') == 'u=0, i' + else: + assert headers.get('user-agent', '') == '' + assert headers.get('priority', '') == '' diff --git a/tests/unit/crawlers/_http/test_http_crawler.py b/tests/unit/crawlers/_http/test_http_crawler.py index 7d6fbfda0b..dc40559c04 100644 --- a/tests/unit/crawlers/_http/test_http_crawler.py +++ b/tests/unit/crawlers/_http/test_http_crawler.py @@ -686,3 +686,32 @@ async def failed_request_handler(context: BasicCrawlingContext, _error: Exceptio } await queue.drop() + + +@pytest.mark.parametrize( + 'impersonate', + [ + pytest.param(False, id='impersonate_disabled'), + pytest.param(True, id='impersonate_enabled'), + ], +) +async def test_impersonate_option(server_url: URL, *, impersonate: bool) -> None: + crawler = HttpCrawler(impersonate=impersonate) + + call_mock = AsyncMock() + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + await call_mock(json.loads(await context.http_response.read())) + + await crawler.run([str(server_url / 'headers')]) + + call_mock.assert_called_once() + headers = call_mock.call_args[0][0] + + if impersonate: + assert 'Mozilla' in headers.get('user-agent', '') + assert headers.get('priority', '') == 'u=0, i' + else: + assert headers.get('user-agent', '') == '' + assert headers.get('priority', '') == '' diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py 
b/tests/unit/crawlers/_parsel/test_parsel_crawler.py index 02f5b61a86..5b95620277 100644 --- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py +++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py @@ -1,8 +1,10 @@ from __future__ import annotations +import json import sys from typing import TYPE_CHECKING from unittest import mock +from unittest.mock import AsyncMock import pytest @@ -498,3 +500,32 @@ async def request_handler(context: ParselCrawlingContext) -> None: mock.call(str(server_url / 'page_3')), ] visit.assert_has_calls(expected_visit_calls, any_order=True) + + +@pytest.mark.parametrize( + 'impersonate', + [ + pytest.param(False, id='impersonate_disabled'), + pytest.param(True, id='impersonate_enabled'), + ], +) +async def test_impersonate_option(server_url: URL, *, impersonate: bool) -> None: + crawler = ParselCrawler(impersonate=impersonate) + + call_mock = AsyncMock() + + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + await call_mock(json.loads(await context.http_response.read())) + + await crawler.run([str(server_url / 'headers')]) + + call_mock.assert_called_once() + headers = call_mock.call_args[0][0] + + if impersonate: + assert 'Mozilla' in headers.get('user-agent', '') + assert headers.get('priority', '') == 'u=0, i' + else: + assert headers.get('user-agent', '') == '' + assert headers.get('priority', '') == '' From 681bc2801a4e74f1305113c948db0683c4425907 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 10 Jun 2026 12:07:39 +0000 Subject: [PATCH 2/2] add warning --- .../_abstract_http/_abstract_http_crawler.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 67c8487df2..7b93ee1d33 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -2,6 
+2,7 @@ import asyncio import logging +import warnings from abc import ABC from datetime import timedelta from typing import TYPE_CHECKING, Generic @@ -47,6 +48,12 @@ class HttpCrawlerOptions( navigation_timeout: NotRequired[timedelta | None] """Timeout for the HTTP request.""" + impersonate: NotRequired[bool] + """Whether the default HTTP client should impersonate a browser by sending browser-like headers. This applies only + to the default client. If you pass your own `http_client`, this flag is ignored and you configure impersonation + on that client directly. + """ + @docs_group('Crawlers') class AbstractHttpCrawler( @@ -101,6 +108,15 @@ def __init__( if impersonate is False and 'http_client' not in kwargs: kwargs['http_client'] = ImpitHttpClient(browser=None) + elif impersonate is False and 'http_client' in kwargs: + warnings.warn( + ( + '`impersonate` option is ignored when custom `http_client` is provided. ' + 'Please configure impersonation directly on the `http_client` instance.' + ), + category=UserWarning, + stacklevel=2, + ) kwargs.setdefault('_logger', logging.getLogger(self.__class__.__name__)) super().__init__(**kwargs)