Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Replace HeaderGenerator implementation by browserforge implementation #960

Open
wants to merge 37 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
b0d52f2
Draft of integration of browserforge fingerprint generation.
Pijukatel Dec 16, 2024
be15847
Works with page.evaluate.
Pijukatel Dec 17, 2024
a9415ec
Use add_init_script
Pijukatel Dec 17, 2024
36727a1
WIP
Pijukatel Dec 18, 2024
42eff80
Fix format, type check and tests.
Pijukatel Dec 18, 2024
998cbb6
Fix rootcause for flakiness in fingerprint generation
Pijukatel Dec 18, 2024
e1025c8
Use browserforge.injector code for fingerprints
Pijukatel Dec 19, 2024
33fdd6e
Merge remote-tracking branch 'origin/master' into integrate-browserfo…
Pijukatel Dec 19, 2024
85ba877
Regenerate poetry lock after merge
Pijukatel Dec 19, 2024
6e35c1d
Remove unintentional change to headless test
Pijukatel Dec 19, 2024
3f96456
Merge branch 'master' into integrate-browserforge-fingerprints
Pijukatel Jan 3, 2025
ddfabea
chore: revert React version bump
barjin Jan 3, 2025
3d37bca
Merge remote-tracking branch 'origin/master' into integrate-browserfo…
Pijukatel Jan 10, 2025
1b8e6a3
Add ScreenFingerprint and NavigatorFingerprint
Pijukatel Jan 10, 2025
9828a36
Add Fingerprint and their options types
Pijukatel Jan 13, 2025
f733c07
Add adapter tests
Pijukatel Jan 13, 2025
97011d9
Integrate into pw_crawler
Pijukatel Jan 13, 2025
debe900
Further integration into our code.
Pijukatel Jan 14, 2025
3d8340c
Finalize draft.
Pijukatel Jan 14, 2025
3d9b170
Set fingerprint generator as top level argument to pw crawler
Pijukatel Jan 14, 2025
25aa4e2
Revert unnecessary change to function doc string.
Pijukatel Jan 14, 2025
5e46b78
Make test adapter-generic.
Pijukatel Jan 14, 2025
69b6974
Add types to __init__ if fingerprint_suite
Pijukatel Jan 14, 2025
27479be
Remove FingerprintGeneratorOptions
Pijukatel Jan 20, 2025
751f67c
Merge remote-tracking branch 'origin/master' into integrate-browserfo…
Pijukatel Jan 23, 2025
1cbadb0
Review comments
Pijukatel Jan 23, 2025
8e44acd
Handle inconsistent result from browserforge fingerprint generator
Pijukatel Jan 24, 2025
d8001e7
Apply suggestions from code review
Pijukatel Jan 27, 2025
07acbfa
Docs
Pijukatel Jan 27, 2025
866fe98
Make sure browserforge files are downloaded before tests.
Pijukatel Jan 27, 2025
acc720f
HeaderGenerator from browserforge
Pijukatel Jan 28, 2025
f360603
Tests in progress
Pijukatel Jan 29, 2025
11cc913
Wait for Accept Language fix in browserforge
Pijukatel Jan 29, 2025
5f9192e
Workaround for missing changes in upstream repo
Pijukatel Feb 5, 2025
03b8610
Merge remote-tracking branch 'origin/master' into remove-header-gener…
Pijukatel Feb 6, 2025
b7d3427
Poetry lock from master
Pijukatel Feb 6, 2025
6937d0f
Add sec headers constraint for chromium
Pijukatel Feb 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,11 +214,19 @@ async def _create_browser_context(
)

if self._header_generator:
common_headers = self._header_generator.get_common_headers()
sec_ch_ua_headers = self._header_generator.get_sec_ch_ua_headers(browser_type=self.browser_type)
user_agent_header = self._header_generator.get_user_agent_header(browser_type=self.browser_type)
headers = dict(common_headers | sec_ch_ua_headers | user_agent_header)
extra_http_headers = headers
extra_http_headers = dict(
self._header_generator.get_specific_headers(
header_names={
'Accept',
'Accept-Language',
'User-Agent',
'sec-ch-ua',
'sec-ch-ua-mobile',
'sec-ch-ua-platform',
},
browser_type=self.browser_type,
)
)
else:
extra_http_headers = None

Expand Down
108 changes: 106 additions & 2 deletions src/crawlee/fingerprint_suite/_browserforge_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,66 @@
from copy import deepcopy
from typing import TYPE_CHECKING, Any

from browserforge.bayesian_network import extract_json
from browserforge.fingerprints import Fingerprint as bf_Fingerprint
from browserforge.fingerprints import FingerprintGenerator as bf_FingerprintGenerator
from browserforge.fingerprints import Screen
from browserforge.headers.generator import DATA_DIR
from browserforge.headers.generator import HeaderGenerator as bf_HeaderGenerator
from typing_extensions import override

from crawlee._utils.docs import docs_group

from ._consts import BROWSER_TYPE_HEADER_KEYWORD
from ._fingerprint_generator import FingerprintGenerator

if TYPE_CHECKING:
from ._types import HeaderGeneratorOptions, ScreenOptions
from ._types import HeaderGeneratorOptions, ScreenOptions, SupportedBrowserType


class PatchedHeaderGenerator(bf_HeaderGenerator):
    """Browserforge `HeaderGenerator` that contains patches not accepted in upstream repo."""

    def _get_accept_language_header(self, locales: tuple[str, ...]) -> str:
        """Generate the `Accept-Language` header value for the given locales.

        Patched version due to PR of upstream repo not being merged: https://github.com/daijro/browserforge/pull/24

        Parameters:
            locales (tuple[str, ...]): Locales ordered from the most to the least preferred. Must not be empty.

        Returns:
            str: Accept-Language header string, e.g. `en-US,en;q=0.9`.

        Raises:
            IndexError: If `locales` is empty.
        """
        # First locale does not include quality factor, q=1 is considered as implicit.
        # Each following locale gets a weight lowered by 0.1 (0.9, 0.8, ...). The weight is floored at 0.1:
        # without the floor the tenth additional locale would be emitted with `q=0.0` (meaning "not acceptable")
        # and any later one with a negative weight, which is not a valid quality value.
        additional_locales = [
            f'{locale};q={max(0.1, 0.9 - index * 0.1):.1f}' for index, locale in enumerate(locales[1:])
        ]
        return ','.join((locales[0], *additional_locales))


class PatchedFingerprintGenerator(bf_FingerprintGenerator):
    """Browserforge `FingerprintGenerator` that contains patches not accepted in upstream repo."""

    def __init__(  # type:ignore[no-untyped-def] # Upstream repo types missing.
        self,
        *,
        screen: Screen | None = None,
        strict: bool = False,
        mock_webrtc: bool = False,
        slim: bool = False,
        **header_kwargs,  # noqa:ANN003 # Upstream repo types missing.
    ) -> None:
        """Set up the fingerprint generator and swap in the patched header generator.

        Parameters:
            screen (Screen, optional): Screen constraints for the generated fingerprint.
            strict (bool, optional): Whether to raise an exception if the constraints are too strict.
                Default is False.
            mock_webrtc (bool, optional): Whether to mock WebRTC when injecting the fingerprint.
                Default is False.
            slim (bool, optional): Disables performance-heavy evasions when injecting the fingerprint.
                Default is False.
            **header_kwargs: Header generation options. They are passed only to `PatchedHeaderGenerator`,
                not to the parent initializer.
        """
        # The parent initializer receives just the fingerprint-related options.
        super().__init__(
            screen=screen,
            strict=strict,
            mock_webrtc=mock_webrtc,
            slim=slim,
        )
        # Overwrite `self.header_generator` after the parent initializer ran, so that `PatchedHeaderGenerator`
        # is the header generator consistently used by this instance.
        patched_header_generator = PatchedHeaderGenerator(**header_kwargs)
        self.header_generator = patched_header_generator


@docs_group('Classes')
Expand Down Expand Up @@ -62,7 +111,7 @@ def __init__(
bf_options['screen'] = Screen(**screen_options.model_dump())

self._options = {**bf_options, **bf_header_options}
self._generator = bf_FingerprintGenerator()
self._generator = PatchedFingerprintGenerator()

@override
def generate(self) -> bf_Fingerprint:
Expand All @@ -78,3 +127,58 @@ def generate(self) -> bf_Fingerprint:
if attempt == max_attempts:
raise
raise RuntimeError('Failed to generate fingerprint.')


class BrowserforgeHeaderGenerator:
    """`HeaderGenerator` adapter for fingerprint generator from `browserforge`."""

    def __init__(self) -> None:
        self._generator = PatchedHeaderGenerator(locale=['en-US', 'en'])

    def generate(self, browser_type: SupportedBrowserType = 'chromium') -> dict[str, str]:
        """Generate headers for the requested browser type.

        `browserforge` understands `chromium` in the general sense, i.e. not just Google Chrome, but also other
        chromium based browsers. For example this Safari user agent can be generated for `chromium` input:
        `Mozilla/5.0 (iPhone; CPU iPhone OS 18_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko)
        CriOS/130.0.6723.90 Mobile/15E148 Safari/604.1`
        To remain consistent with previous implementation only subset of `chromium` header will be allowed.

        Generated candidates are rejected and regenerated until one of them has a `User-Agent` containing
        a keyword from `BROWSER_TYPE_HEADER_KEYWORD[browser_type]` and, for `chromium`, also contains all of
        the `sec-ch-ua`, `sec-ch-ua-mobile` and `sec-ch-ua-platform` headers.

        Raises:
            RuntimeError: If no acceptable headers were generated within the attempt budget.
        """
        wants_chromium = browser_type == 'chromium'

        # browserforge header generation can be flaky, so basic QA is enforced on each candidate.
        # 10 attempts is the base budget. `chromium` gets 50 more, because from the `BrowserForge` point of view
        # even `chromium` headers without the `sec-...` headers are valid, so more candidates get rejected here.
        attempts_left = 10 + 50 if wants_chromium else 10

        # `browserforge` knows the `webkit` browser type under the name `safari`.
        bf_browser_type = browser_type if browser_type != 'webkit' else 'safari'

        while attempts_left > 0:
            attempts_left -= 1
            candidate: dict[str, str] = self._generator.generate(browser=bf_browser_type)

            expected_keywords = BROWSER_TYPE_HEADER_KEYWORD[browser_type]
            if not any(keyword in candidate['User-Agent'] for keyword in expected_keywords):
                # User agent does not look like the requested browser type, try again.
                continue

            if wants_chromium and not self._contains_all_sec_headers(candidate):
                # Chromium based browser, but not from the subset this adapter accepts, try again.
                continue

            return candidate

        raise RuntimeError('Failed to generate header.')

    def _contains_all_sec_headers(self, headers: dict[str, str]) -> bool:
        """Tell whether `headers` contain every one of the three `sec-ch-ua*` headers."""
        for required_header in ('sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform'):
            if required_header not in headers:
                return False
        return True


def get_available_header_network() -> dict:
    """Load the header network that contains possible header values.

    The network is read with `extract_json` from the `header-network.zip` file located in the `DATA_DIR`
    imported from `browserforge.headers.generator`.
    """
    header_network_archive = DATA_DIR / 'header-network.zip'
    return extract_json(header_network_archive)


def get_available_header_values(header_network: dict, node_name: str | set[str]) -> set[str]:
    """Get set of possible header values from available header network.

    Args:
        header_network: Header network with a `nodes` list. Each node is read through its `name` and
            `possibleValues` keys.
        node_name: Name of the node to look up, or a set of names (for example differently cased variants
            of the same header).

    Returns:
        Union of `possibleValues` of every node whose name is one of the requested names. Empty set if no
        node matches.

    Raises:
        KeyError: If `header_network` has no `nodes` key or a node misses `name`; also if a matching node
            misses `possibleValues`.
    """
    node_names = {node_name} if isinstance(node_name, str) else node_name
    available_values: set[str] = set()
    for node in header_network['nodes']:
        if node['name'] in node_names:
            # Keep collecting instead of returning on the first match, so that the result covers all
            # requested names and does not depend on the order of nodes in the network.
            available_values.update(node['possibleValues'])
    return available_values
Loading
Loading