Skip to content

Commit

Permalink
Merge pull request #19 from jxlil/refactor/curl_options
Browse files Browse the repository at this point in the history
Refactor RequestParser and curl_options
  • Loading branch information
jxlil authored Aug 9, 2024
2 parents 8f89f1f + d328cde commit 4f2f393
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 90 deletions.
16 changes: 5 additions & 11 deletions scrapy_impersonate/handler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Type, TypeVar

from curl_cffi.requests import AsyncSession
from curl_cffi.curl import CurlOpt
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.crawler import Crawler
from scrapy.http import Headers, Request, Response
Expand All @@ -11,7 +10,7 @@
from scrapy.utils.reactor import verify_installed_reactor
from twisted.internet.defer import Deferred

from scrapy_impersonate.parser import RequestParser
from scrapy_impersonate.parser import CurlOptionsParser, RequestParser

ImpersonateHandler = TypeVar("ImpersonateHandler", bound="ImpersonateDownloadHandler")

Expand All @@ -35,15 +34,10 @@ def download_request(self, request: Request, spider: Spider) -> Deferred:

@deferred_f_from_coro_f
async def _download_request(self, request: Request, spider: Spider) -> Response:
# Add support for proxy auth headers
curl_options = {}
proxy_header = []
if b'Proxy-Authorization' in request.headers:
proxy_header_authorization=b'Proxy-Authorization: '+ request.headers.pop(b'Proxy-Authorization')[0]
proxy_header.append(proxy_header_authorization)
curl_options[CurlOpt.PROXYHEADER] = proxy_header
async with AsyncSession(max_clients=1,curl_options=curl_options) as client:
response = await client.request(**RequestParser(request).as_dict()) # type: ignore
curl_options = CurlOptionsParser(request).as_dict()
async with AsyncSession(max_clients=1, curl_options=curl_options) as client:
request_args = RequestParser(request).as_dict()
response = await client.request(**request_args)

headers = Headers(response.headers.multi_items())
headers.pop("Content-Encoding", None)
Expand Down
112 changes: 34 additions & 78 deletions scrapy_impersonate/parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,39 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from curl_cffi import CurlHttpVersion, CurlMime
from curl_cffi import CurlHttpVersion, CurlMime, CurlOpt
from scrapy.http import Request


class CurlOptionsParser:
    """Collect per-request libcurl options from a scrapy ``Request``.

    Any method decorated with ``curl_option_method`` is discovered by
    :meth:`as_dict` and given a chance to populate ``self.curl_options``
    (a mapping of ``CurlOpt`` constants to their values) from the request.
    """

    def __init__(self, request: "Request") -> None:
        self.request = request
        # CurlOpt -> value mapping, filled in by the registered option methods.
        self.curl_options: dict = {}

    # NOTE: deliberately a plain function, not ``@staticmethod``. Staticmethod
    # objects are not callable until Python 3.10, so using one as a decorator
    # inside the same class body would raise TypeError at class-creation time
    # on older interpreters.
    def _mark_curl_option(func):
        """Tag *func* so ``as_dict`` runs it when building the options."""
        func._is_curl_option = True
        return func

    # Preserve the original public name/interface as a staticmethod alias.
    curl_option_method = staticmethod(_mark_curl_option)

    @_mark_curl_option
    def _set_proxy_auth(self):
        """Add support for proxy auth headers.

        Moves a ``Proxy-Authorization`` header out of the scrapy request and
        into libcurl's ``PROXYHEADER`` option so it is sent to the proxy,
        not the target server.
        """
        if proxy_authorization := self.request.headers.pop(b"Proxy-Authorization", None):
            proxy_header = [b"Proxy-Authorization: " + proxy_authorization[0]]
            self.curl_options[CurlOpt.PROXYHEADER] = proxy_header

    def as_dict(self) -> dict:
        """Run every registered option method and return the curl options."""
        for attr_name in dir(self):
            attr = getattr(self, attr_name)
            if callable(attr) and getattr(attr, "_is_curl_option", False):
                attr()

        return self.curl_options


class RequestParser:
# TODO: Implement @request_arg_method instead of @property

def __init__(self, request: Request) -> None:
self._request = request
self._impersonate_args = request.meta.get("impersonate_args", {})
Expand All @@ -25,10 +54,6 @@ def params(self) -> Optional[Union[Dict, List, Tuple]]:
def data(self) -> Optional[Any]:
return self._request.body

@property
def json(self) -> Optional[dict]:
return self._impersonate_args.get("json")

@property
def headers(self) -> dict:
headers = self._request.headers.to_unicode_dict()
Expand All @@ -46,93 +71,24 @@ def cookies(self) -> dict:
else:
return {}

@property
def files(self) -> Optional[dict]:
return self._impersonate_args.get("files")

@property
def auth(self) -> Optional[Tuple[str, str]]:
return self._impersonate_args.get("auth")

@property
def timeout(self) -> Union[float, Tuple[float, float]]:
return self._impersonate_args.get("timeout", 30.0)

@property
def allow_redirects(self) -> bool:
return False if self._request.meta.get("dont_redirect") else True

@property
def max_redirects(self) -> int:
return self._impersonate_args.get("max_redirects", -1)

@property
def proxies(self) -> Optional[dict]:
return self._impersonate_args.get("proxies")

@property
def proxy(self) -> Optional[str]:
return self._request.meta.get("proxy")

@property
def proxy_auth(self) -> Optional[Tuple[str, str]]:
return self._impersonate_args.get("proxy_auth")

@property
def verify(self) -> Optional[bool]:
return self._impersonate_args.get("verify")

@property
def referer(self) -> Optional[str]:
return self._impersonate_args.get("referer")

@property
def accept_encoding(self) -> str:
return self._impersonate_args.get("accept_encoding", "gzip, deflate, br")

@property
def content_callback(self) -> Optional[Callable]:
return self._impersonate_args.get("content_callback")

@property
def impersonate(self) -> Optional[str]:
return self._request.meta.get("impersonate")

@property
def default_headers(self) -> Optional[bool]:
return self._impersonate_args.get("default_headers")

@property
def default_encoding(self) -> Union[str, Callable[[bytes], str]]:
return self._impersonate_args.get("default_encoding", "utf-8")

@property
def http_version(self) -> Optional[CurlHttpVersion]:
return self._impersonate_args.get("http_version")

@property
def interface(self) -> Optional[str]:
return self._impersonate_args.get("interface")

@property
def cert(self) -> Optional[Union[str, Tuple[str, str]]]:
return self._impersonate_args.get("cert")

@property
def stream(self) -> bool:
return self._impersonate_args.get("stream", False)

@property
def max_recv_speed(self) -> int:
return self._impersonate_args.get("max_recv_speed", 0)

@property
def multipart(self) -> Optional[CurlMime]:
return self._impersonate_args.get("multipart")

def as_dict(self) -> dict:
return {
request_args = {
property_name: getattr(self, property_name)
for property_name, method in self.__class__.__dict__.items()
if isinstance(method, property)
}

request_args.update(self._impersonate_args)
return request_args
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

setup(
name="scrapy-impersonate",
version="1.3.1",
version="1.4.0",
author="Jalil SA (jxlil)",
description="Scrapy download handler that can impersonate browser fingerprints",
license="MIT",
Expand Down

0 comments on commit 4f2f393

Please sign in to comment.