give up on playwright
honzajavorek committed Oct 14, 2024
1 parent e55b329 commit 69ae559
Showing 7 changed files with 6 additions and 191 deletions.
6 changes: 2 additions & 4 deletions Dockerfile
@@ -18,11 +18,9 @@ RUN echo "Python version:" \
 && poetry install --only=main --no-interaction --no-ansi \
 && rm -rf /tmp/.poetry-cache \
 && echo "All installed Python packages:" \
-&& pip freeze \
-&& echo "Installing Playwright dependencies:" \
-&& poetry run playwright install chromium --with-deps
+&& pip freeze

 RUN python3 -m compileall -q ./jg/plucker

 ENV ACTOR_PATH_IN_DOCKER_CONTEXT="${ACTOR_PATH_IN_DOCKER_CONTEXT}"
-CMD ["poetry", "run", "plucker", "--debug", "crawl", "--apify"]
+CMD ["poetry", "run", "plucker", "crawl", "--apify"]
2 changes: 1 addition & 1 deletion README.md
@@ -63,7 +63,7 @@ Maybe there is now, but the monitoring is already implemented, so…
 ## Notes on development

 - Use [Poetry](https://python-poetry.org/) for dependency management.
-  After `poetry install` run also `poetry run playwright install chromium` to enable browser scraping.
+  Run `poetry install`.
 - It is preferred to pin exact versions of dependencies, without `^`, and let GitHub's Dependabot to upgrade dependencies in Pull Requests.
   Unfortunately there is no setting in pyproject.toml, which would force this behavior, so once new dependencies are added, one needs to go and manually remove the `^` characters.
 - Run `pytest` to see if your code has any issues.
11 changes: 2 additions & 9 deletions jg/plucker/jobs_linkedin/spider.py
@@ -28,12 +28,6 @@
 class Spider(BaseSpider):
     name = "jobs-linkedin"
     download_delay = 5
-    custom_settings = {
-        "DOWNLOAD_HANDLERS": {
-            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
-            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
-        },
-    }

     search_params = {
         "f_TPR": "r2592000",  # past month
@@ -168,12 +162,11 @@ def verify_job(
     def _retry(self, url: str, request: Request | None = None) -> Request:
         if not request:
             raise ValueError(f"Request object is required to retry {url}")
-        # self.logger.warning(f"Retrying {url} using browser")
         return request.replace(
             url=url,
             dont_filter=True,
             headers=self.request_headers,
-            meta=request.meta,  # | dict(playwright=True),
+            meta=request.meta,
         )

     def _request(
@@ -188,7 +181,7 @@ def _request(
             cookies=self.lang_cookies,
             callback=callback,
             cb_kwargs=cb_kwargs or {},
-            meta=dict(max_retry_times=5),
+            meta=dict(max_retry_times=10),
         )
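With the Playwright fallback gone, a failed request can only be retried as plain HTTP, which is presumably why the per-request retry cap doubles from 5 to 10. A minimal sketch of what that meta key does, assuming Scrapy's stock RetryMiddleware is enabled; the spider and URL below are hypothetical:

```python
from scrapy import Request, Spider


class ExampleSpider(Spider):
    # Hypothetical spider illustrating the per-request retry cap.
    name = "example"

    def start_requests(self):
        yield Request(
            "https://example.com/jobs",  # placeholder URL
            callback=self.parse,
            # Scrapy's built-in RetryMiddleware reads max_retry_times from
            # request.meta, overriding the global RETRY_TIMES setting.
            meta={"max_retry_times": 10},
        )

    def parse(self, response):
        self.logger.info(f"Fetched {response.url}")
```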


66 changes: 0 additions & 66 deletions jg/plucker/scrapers.py
@@ -6,7 +6,6 @@
 from apify import Actor, Configuration
 from apify.scrapy.middlewares.apify_proxy import ApifyHttpProxyMiddleware
 from apify.scrapy.utils import apply_apify_settings
-from playwright.sync_api import Error as PlaywrightError
 from scrapy import Item, Request, Spider
 from scrapy.crawler import Crawler, CrawlerProcess
 from scrapy.settings import BaseSettings, Settings
@@ -30,15 +29,6 @@ async def run_actor(settings: Settings, spider_class: Type[Spider]) -> None:
     actor_input = await Actor.get_input() or {}
     proxy_config = actor_input.get("proxyConfig")
     settings = apply_apify_settings(settings=settings, proxy_config=proxy_config)
-
-    # use custom proxy middleware
-    priority = settings["DOWNLOADER_MIDDLEWARES"].pop(
-        "apify.scrapy.middlewares.ApifyHttpProxyMiddleware"
-    )
-    settings["DOWNLOADER_MIDDLEWARES"][
-        "jg.plucker.scrapers.PlaywrightApifyHttpProxyMiddleware"
-    ] = priority
-
     run_spider(settings, spider_class)


@@ -121,59 +111,3 @@ def raise_for_stats(stats: dict[str, Any]):
         raise StatsError(f"Scraping finished with reason {reason!r}")
     if item_count := stats.get("item_dropped_reasons_count/MissingRequiredFields"):
         raise StatsError(f"Items missing required fields: {item_count}")
-
-
-class PlaywrightApifyHttpProxyMiddleware(ApifyHttpProxyMiddleware):
-    @classmethod
-    def from_crawler(cls, crawler: Crawler) -> Self:
-        Actor.log.info("Using customized ApifyHttpProxyMiddleware.")
-        return cls(super().from_crawler(crawler)._proxy_settings)
-
-    async def process_request(self, request: Request, spider: Spider):
-        if request.meta.get("playwright"):
-            Actor.log.debug(
-                f"ApifyHttpProxyMiddleware.process_request: playwright=True, request={request}, spider={spider}"
-            )
-            url = await self._get_new_proxy_url()
-
-            if not (url.username and url.password):
-                raise ValueError(
-                    "Username and password must be provided in the proxy URL."
-                )
-
-            proxy = url.geturl()
-            proxy_hash = hashlib.sha1(proxy.encode()).hexdigest()[0:8]
-            context_name = f"proxy_{proxy_hash}"
-            Actor.log.info(f"Using Playwright context {context_name}")
-            request.meta.update(
-                {
-                    "playwright_context": f"proxy_{context_name}",
-                    "playwright_context_kwargs": {
-                        "proxy": {
-                            "server": proxy,
-                            "username": url.username,
-                            "password": url.password,
-                        },
-                    },
-                }
-            )
-            Actor.log.debug(
-                f"ApifyHttpProxyMiddleware.process_request: updated request.meta={request.meta}"
-            )
-        else:
-            await super().process_request(request, spider)
-
-    def process_exception(
-        self: ApifyHttpProxyMiddleware,
-        request: Request,
-        exception: Exception,
-        spider: Spider,
-    ) -> None | Request:
-        if request := super().process_exception(request, exception, spider):
-            return request
-        if isinstance(exception, PlaywrightError):
-            Actor.log.warning(
-                f'ApifyHttpProxyMiddleware: Playwright error occurred for request="{request}", reason="{exception}", skipping...'
-            )
-            return request
-        return None
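What survives the deletion is the monitoring hook `raise_for_stats`, which turns bad crawl statistics into exceptions. A self-contained sketch of its behavior: the two raises are visible in the hunk above, while the exact `finish_reason` comparison and the demo calls are assumptions:

```python
from typing import Any


class StatsError(Exception):
    """Crawl statistics indicate a failed or degraded run."""


def raise_for_stats(stats: dict[str, Any]) -> None:
    # Both checks appear in the hunk above; the finish_reason comparison
    # is an assumption based on Scrapy's usual "finished" value.
    if (reason := stats.get("finish_reason")) != "finished":
        raise StatsError(f"Scraping finished with reason {reason!r}")
    if item_count := stats.get("item_dropped_reasons_count/MissingRequiredFields"):
        raise StatsError(f"Items missing required fields: {item_count}")


raise_for_stats({"finish_reason": "finished"})  # passes silently

try:
    raise_for_stats({"finish_reason": "shutdown"})
except StatsError as exc:
    print(exc)  # Scraping finished with reason 'shutdown'
```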
3 changes: 0 additions & 3 deletions jg/plucker/settings.py
@@ -34,9 +34,6 @@

 TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

-PLAYWRIGHT_BROWSER_TYPE = "chromium"
-
 EXTENSIONS = {
     "scrapy.extensions.memusage.MemoryUsage": None,
-    "scrapy_playwright.memusage.ScrapyPlaywrightMemoryUsageExtension": 0,
 }
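For reference, the scrapy-playwright wiring this commit removes was spread across three places. A consolidated sketch pieced together from the hunks above; nothing here is new code, it is all being deleted:

```python
# settings.py (removed): browser choice plus the Playwright-aware
# replacement for Scrapy's memory-usage extension.
PLAYWRIGHT_BROWSER_TYPE = "chromium"
EXTENSIONS = {
    "scrapy.extensions.memusage.MemoryUsage": None,
    "scrapy_playwright.memusage.ScrapyPlaywrightMemoryUsageExtension": 0,
}

# spider.py (removed): route HTTP(S) downloads through the
# scrapy-playwright handler...
custom_settings = {
    "DOWNLOAD_HANDLERS": {
        "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    },
}

# ...and opt a retried request into a real browser via
# meta=request.meta | dict(playwright=True).
```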
108 changes: 1 addition & 107 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 0 additions & 1 deletion pyproject.toml
@@ -23,7 +23,6 @@ feedparser = "6.0.11"
 lxml = "5.3.0"
 nest-asyncio = "1.6.0"
 scrapy = "2.11.2"
-scrapy-playwright = "0.0.41"

 [tool.poetry.group.dev.dependencies]
 cookiecutter = "2.6.0"
