Skip to content

Commit

Permalink
use builtin apply_apify_settings
Browse files: browse the repository at this point in the history
  • Loading branch information
honzajavorek committed Feb 16, 2024
1 parent 76d1cc0 commit 2c6021c
Showing 1 changed file with 2 additions and 38 deletions.
40 changes: 2 additions & 38 deletions juniorguru_plucker/actors.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import nest_asyncio
from apify import Actor
from apify.scrapy.utils import apply_apify_settings
from scrapy import Item, Spider
from scrapy.settings import BaseSettings, Settings
from scrapy.spiderloader import SpiderLoader as BaseSpiderLoader
Expand All @@ -16,47 +17,10 @@ async def run_actor(settings: Settings, spider_class: Type[Spider]) -> None:
Actor.log.info(f"Spider {spider_class.name}")
actor_input = await Actor.get_input() or {}
proxy_config = actor_input.get("proxyConfig")
settings = apply_apify_settings(settings, proxy_config=proxy_config)
settings = apply_apify_settings(settings=settings, proxy_config=proxy_config)
run_spider(settings, spider_class)


def apply_apify_settings(
    settings: Settings, proxy_config: dict | None = None
) -> Settings:
    """Adjust Scrapy settings so the spider uses the Apify integration.

    The ``settings`` object is modified in place and the very same object
    is returned, so callers may use either the argument or the return value.

    Args:
        settings: Scrapy settings to adjust. ``ITEM_PIPELINES`` and
            ``DOWNLOADER_MIDDLEWARES`` must already be present and support
            item assignment.
        proxy_config: Proxy configuration, stored unchanged under
            ``APIFY_PROXY_SETTINGS``. May be ``None``.

    Returns:
        The same ``settings`` object that was passed in.
    """
    # Use ApifyScheduler as the scheduler
    settings["SCHEDULER"] = "apify.scrapy.scheduler.ApifyScheduler"

    # Add the ActorDatasetPushPipeline into the item pipelines with the
    # highest integer (1000), so it runs as the final step in the pipeline
    # sequence
    item_pipelines = settings["ITEM_PIPELINES"]
    item_pipelines["apify.scrapy.pipelines.ActorDatasetPushPipeline"] = 1000

    # Downloader middleware overrides; a value of None disables the entry
    middleware_overrides = {
        # Apify's custom scheduler already handles robots.txt
        "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": None,
        # Replace the default HttpProxyMiddleware with ApifyHttpProxyMiddleware
        "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": None,
        "apify.scrapy.middlewares.ApifyHttpProxyMiddleware": 950,
        # Replace the default RetryMiddleware with ApifyRetryMiddleware,
        # which gets the highest integer (1000)
        "scrapy.downloadermiddlewares.retry.RetryMiddleware": None,
        "apify.scrapy.middlewares.ApifyRetryMiddleware": 1000,
    }
    downloader_middlewares = settings["DOWNLOADER_MIDDLEWARES"]
    for middleware_path, order in middleware_overrides.items():
        downloader_middlewares[middleware_path] = order

    # Store the proxy configuration
    settings["APIFY_PROXY_SETTINGS"] = proxy_config

    return settings


def configure_async():
install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
nest_asyncio.apply()
Expand Down

0 comments on commit 2c6021c

Please sign in to comment.