From 5961a521c28d1ca13eea96af6e7506c43419f578 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 17 Jan 2025 18:19:31 -0800
Subject: [PATCH 1/7] set failed URL retry to 5 by default

---
 src/crawler.ts        | 2 +-
 src/util/constants.ts | 1 +
 src/util/state.ts     | 6 +++---
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index b1f12a08..63229618 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -1921,7 +1921,7 @@ self.__bx_behaviors.selectMainBehavior();
     } else if (!downloadResponse) {
       // log if not already log and rethrow, consider page failed
       if (msg !== "logged") {
-        logger.error("Page Load Failed, skipping page", {
+        logger.error("Page Load Failed, will retry", {
           msg,
           loadState: data.loadState,
           ...logDetails,
diff --git a/src/util/constants.ts b/src/util/constants.ts
index 72506c6c..0c5d6faf 100644
--- a/src/util/constants.ts
+++ b/src/util/constants.ts
@@ -27,6 +27,7 @@ export const ADD_LINK_FUNC = "__bx_addLink";
 export const FETCH_FUNC = "__bx_fetch";
 
 export const MAX_DEPTH = 1000000;
+export const MAX_RETRY_FAILED = 5;
 
 export const FETCH_HEADERS_TIMEOUT_SECS = 30;
 export const PAGE_OP_TIMEOUT_SECS = 5;
diff --git a/src/util/state.ts b/src/util/state.ts
index 9388e478..3f5ad807 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -3,7 +3,7 @@ import { v4 as uuidv4 } from "uuid";
 
 import { logger } from "./logger.js";
 
-import { MAX_DEPTH } from "./constants.js";
+import { MAX_DEPTH, MAX_RETRY_FAILED } from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 
 import { Frame } from "puppeteer-core";
@@ -170,7 +170,7 @@ export type SaveState = {
 // ============================================================================
 export class RedisCrawlState {
   redis: Redis;
-  maxRetryPending = 1;
+  maxRetryPending = MAX_RETRY_FAILED;
 
   uid: string;
   key: string;
@@ -608,7 +608,7 @@ return inx;
     }
 
     if (retryFailed) {
-      logger.debug("Retring failed URL", { url: data.url }, "state");
+      logger.debug("Retrying failed URL", { url: data.url }, "state");
     }
 
     await this.markStarted(data.url);

From 7b5ba9783d44c6287559c3fc274c6a19ee76fc26 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 18 Jan 2025 00:55:50 -0800
Subject: [PATCH 2/7] additional fixes:

- redirect: if page url is /path/ -> /path, don't add as extra seed
- proxy: don't use global dispatcher, pass dispatcher explicitly when using proxy
---
 package.json               |  2 +-
 src/crawler.ts             |  2 ++
 src/util/argParser.ts      |  3 +++
 src/util/blockrules.ts     |  4 +++-
 src/util/file_reader.ts    |  3 ++-
 src/util/originoverride.ts |  6 +++++-
 src/util/proxy.ts          | 10 ++++++++--
 src/util/recorder.ts       | 23 ++++++++++++++---------
 src/util/sitemapper.ts     |  6 +++++-
 9 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/package.json b/package.json
index 8f71b907..edcca228 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "1.5.0-beta.2",
+  "version": "1.5.0-beta.3",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
diff --git a/src/crawler.ts b/src/crawler.ts
index 63229618..a141026a 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -1498,6 +1498,7 @@ self.__bx_behaviors.selectMainBehavior();
       logger.info("crawl already finished, running post-crawl tasks", {
         state: initState,
       });
+      this.finalExit = true;
       await this.postCrawl();
       return;
     } else if (await this.crawlState.isCrawlStopped()) {
@@ -1945,6 +1946,7 @@ self.__bx_behaviors.selectMainBehavior();
       depth === 0 &&
       !isChromeError &&
       respUrl !== url.split("#")[0] &&
+      respUrl + "/" !== url &&
       !downloadResponse
     ) {
       data.seedId = await this.crawlState.addExtraSeed(
diff --git a/src/util/argParser.ts b/src/util/argParser.ts
index 4c9db398..513657ee 100644
--- a/src/util/argParser.ts
+++ b/src/util/argParser.ts
@@ -700,6 +700,9 @@ class ArgParser {
     // background behaviors to apply
     const behaviorOpts: { [key: string]: string | boolean } = {};
 
+    if (argv.blockAds) {
+      argv.behaviors.push("autoclick");
+    }
     if (argv.behaviors.length > 0) {
       argv.behaviors.forEach((x: string) => {
         if (BEHAVIOR_TYPES.includes(x)) {
diff --git a/src/util/blockrules.ts b/src/util/blockrules.ts
index 5d3238fb..0e7fb511 100644
--- a/src/util/blockrules.ts
+++ b/src/util/blockrules.ts
@@ -5,6 +5,7 @@ import { HTTPRequest, Page } from "puppeteer-core";
 
 import { Browser } from "./browser.js";
 import { fetch } from "undici";
+import { getProxyDispatcher } from "./proxy.js";
 
 const RULE_TYPES = ["block", "allowOnly"];
 
@@ -271,7 +272,7 @@ export class BlockRules {
     logDetails: Record,
   ) {
     try {
-      const res = await fetch(reqUrl);
+      const res = await fetch(reqUrl, { dispatcher: getProxyDispatcher() });
       const text = await res.text();
 
       return !!text.match(frameTextMatch);
@@ -302,6 +303,7 @@ export class BlockRules {
       method: "PUT",
       headers: { "Content-Type": "text/html" },
       body,
+      dispatcher: getProxyDispatcher(),
     });
   }
 }
diff --git a/src/util/file_reader.ts b/src/util/file_reader.ts
index 7eea4162..284f0dd8 100644
--- a/src/util/file_reader.ts
+++ b/src/util/file_reader.ts
@@ -6,6 +6,7 @@ import util from "util";
 import { exec as execCallback } from "child_process";
 
 import { logger } from "./logger.js";
+import { getProxyDispatcher } from "./proxy.js";
 
 const exec = util.promisify(execCallback);
 
@@ -85,7 +86,7 @@ async function collectOnlineBehavior(url: string): Promise {
   const behaviorFilepath = `/app/behaviors/${filename}`;
 
   try {
-    const res = await fetch(url);
+    const res = await fetch(url, { dispatcher: getProxyDispatcher() });
     const fileContents = await res.text();
     await fsp.writeFile(behaviorFilepath, fileContents);
     logger.info(
diff --git a/src/util/originoverride.ts b/src/util/originoverride.ts
index a00a2b54..1b2b8c41 100644
--- a/src/util/originoverride.ts
+++ b/src/util/originoverride.ts
@@ -3,6 +3,7 @@ import { formatErr, logger } from "./logger.js";
 import { Browser } from "./browser.js";
 
 import { fetch } from "undici";
+import { getProxyDispatcher } from "./proxy.js";
 
 export class OriginOverride {
   originOverride: { origUrl: URL; destUrl: URL }[];
@@ -45,7 +46,10 @@ export class OriginOverride {
         headers.set("origin", orig.origin);
       }
 
-      const resp = await fetch(newUrl, { headers });
+      const resp = await fetch(newUrl, {
+        headers,
+        dispatcher: getProxyDispatcher(),
+      });
 
       const body = Buffer.from(await resp.arrayBuffer());
       const respHeaders = Object.fromEntries(resp.headers);
diff --git a/src/util/proxy.ts b/src/util/proxy.ts
index cf6a3437..5b15d6e2 100644
--- a/src/util/proxy.ts
+++ b/src/util/proxy.ts
@@ -1,5 +1,5 @@
 import net from "net";
-import { Agent, Dispatcher, ProxyAgent, setGlobalDispatcher } from "undici";
+import { Agent, Dispatcher, ProxyAgent } from "undici";
 
 import child_process from "child_process";
 
@@ -13,6 +13,8 @@ const SSH_PROXY_LOCAL_PORT = 9722;
 
 const SSH_WAIT_TIMEOUT = 30000;
 
+let proxyDispatcher: Dispatcher | undefined = undefined;
+
 export function getEnvProxyUrl() {
   if (process.env.PROXY_SERVER) {
     return process.env.PROXY_SERVER;
@@ -46,10 +48,14 @@ export async function initProxy(
 
   // set global fetch() dispatcher (with proxy, if any)
   const dispatcher = createDispatcher(proxy, agentOpts);
-  setGlobalDispatcher(dispatcher);
+  proxyDispatcher = dispatcher;
   return proxy;
 }
 
+export function getProxyDispatcher() {
+  return proxyDispatcher;
+}
+
 export function createDispatcher(
   proxyUrl: string,
   opts: Agent.Options,
diff --git a/src/util/recorder.ts b/src/util/recorder.ts
index a10c8175..2e0005b6 100644
--- a/src/util/recorder.ts
+++ b/src/util/recorder.ts
@@ -8,7 +8,7 @@ import {
   isRedirectStatus,
 } from "./reqresp.js";
 
-import { fetch, getGlobalDispatcher, Response } from "undici";
+import { fetch, Response } from "undici";
 
 import {
   getCustomRewriter,
@@ -23,6 +23,7 @@ import { WARCWriter } from "./warcwriter.js";
 import { RedisCrawlState, WorkerId } from "./state.js";
 import { CDPSession, Protocol } from "puppeteer-core";
 import { Crawler } from "../crawler.js";
+import { getProxyDispatcher } from "./proxy.js";
 
 const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
 const MAX_TEXT_REWRITE_SIZE = 25_000_000;
@@ -1588,14 +1589,18 @@ class AsyncFetcher {
 
     const headers = reqresp.getRequestHeadersDict();
 
-    const dispatcher = getGlobalDispatcher().compose((dispatch) => {
-      return (opts, handler) => {
-        if (opts.headers) {
-          reqresp.requestHeaders = opts.headers as Record;
-        }
-        return dispatch(opts, handler);
-      };
-    });
+    let dispatcher = getProxyDispatcher();
+
+    if (dispatcher) {
+      dispatcher = dispatcher.compose((dispatch) => {
+        return (opts, handler) => {
+          if (opts.headers) {
+            reqresp.requestHeaders = opts.headers as Record;
+          }
+          return dispatch(opts, handler);
+        };
+      });
+    }
 
     const resp = await fetch(url!, {
       method,
diff --git a/src/util/sitemapper.ts b/src/util/sitemapper.ts
index a13eae16..3ffb40c7 100644
--- a/src/util/sitemapper.ts
+++ b/src/util/sitemapper.ts
@@ -10,6 +10,7 @@ import { DETECT_SITEMAP } from "./constants.js";
 import { sleep } from "./timing.js";
 
 import { fetch, Response } from "undici";
+import { getProxyDispatcher } from "./proxy.js";
 
 const SITEMAP_CONCURRENCY = 5;
 
@@ -65,7 +66,10 @@ export class SitemapReader extends EventEmitter {
 
   async _fetchWithRetry(url: string, message: string) {
     while (true) {
-      const resp = await fetch(url, { headers: this.headers });
+      const resp = await fetch(url, {
+        headers: this.headers,
+        dispatcher: getProxyDispatcher(),
+      });
 
       if (resp.ok) {
         return resp;

From 353975503558c9c21d3bc66ee1b2f9da35630faa Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 18 Jan 2025 02:35:33 -0800
Subject: [PATCH 3/7] ensure 'finalExit' is set if crawl is fully done!

---
 src/crawler.ts | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index a141026a..1e84d6de 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -1582,8 +1582,11 @@ self.__bx_behaviors.selectMainBehavior();
 
     await this.writeStats();
 
-    // if crawl has been stopped, mark as final exit for post-crawl tasks
-    if (await this.crawlState.isCrawlStopped()) {
+    // if crawl has been stopped or finished, mark as final exit for post-crawl tasks
+    if (
+      (await this.crawlState.isCrawlStopped()) ||
+      (await this.crawlState.isFinished())
+    ) {
       this.finalExit = true;
     }
 

From 7a0dda676336502ec613b5e22f3a79227646096c Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 18 Jan 2025 17:11:48 -0800
Subject: [PATCH 4/7] don't count retried pages multiple times

---
 src/crawler.ts    |  6 +++---
 src/util/state.ts | 24 +++++++++++++-----------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index 1e84d6de..638bcba5 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -1156,12 +1156,12 @@ self.__bx_behaviors.selectMainBehavior();
 
     // if page loaded, considered page finished successfully
     // (even if behaviors timed out)
-    const { loadState, logDetails, depth, url } = data;
+    const { loadState, logDetails, depth, url, retry } = data;
 
     if (data.loadState >= LoadState.FULL_PAGE_LOADED) {
       logger.info("Page Finished", { loadState, ...logDetails }, "pageStatus");
 
-      await this.crawlState.markFinished(url);
+      await this.crawlState.markFinished(url, retry);
 
       if (this.healthChecker) {
         this.healthChecker.resetErrors();
@@ -1171,7 +1171,7 @@ self.__bx_behaviors.selectMainBehavior();
 
       await this.checkLimits();
     } else {
-      await this.crawlState.markFailed(url);
+      await this.crawlState.markFailed(url, retry);
 
       if (this.healthChecker) {
         this.healthChecker.incError();
diff --git a/src/util/state.ts b/src/util/state.ts
index 3f5ad807..37d595d8 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -35,6 +35,7 @@ export type QueueEntry = {
   extraHops: number;
   ts?: number;
   pageid?: string;
+  retry?: number;
 };
 
 // ============================================================================
@@ -54,6 +55,7 @@ export class PageState {
   seedId: number;
   depth: number;
   extraHops: number;
+  retry: number;
 
   status: number;
 
@@ -87,6 +89,7 @@
     }
     this.pageid = redisData.pageid || uuidv4();
     this.status = 0;
+    this.retry = redisData.retry || 0;
   }
 }
 
@@ -115,13 +118,7 @@ declare module "ioredis" {
       uid: string,
     ): Result;
 
-    movefailed(
-      pkey: string,
-      fkey: string,
-      url: string,
-      value: string,
-      state: string,
-    ): Result;
+    movefailed(pkey: string, fkey: string, url: string): Result;
 
     requeuefailed(
       fkey: string,
@@ -283,7 +280,6 @@
 local json = redis.call('hget', KEYS[1], ARGV[1]);
 if json then
   local data = cjson.decode(json);
-  data[ARGV[3]] = ARGV[2];
   json = cjson.encode(data);
 
   redis.call('lpush', KEYS[2], json);
@@ -375,15 +371,21 @@
     );
   }
 
-  async markFinished(url: string) {
+  async markFinished(url: string, retry: number) {
     await this.redis.hdel(this.pkey, url);
 
+    if (retry) {
+      return 1;
+    }
     return await this.redis.incr(this.dkey);
   }
 
-  async markFailed(url: string) {
-    await this.redis.movefailed(this.pkey, this.fkey, url, "1", "failed");
+  async markFailed(url: string, retry: number) {
+    await this.redis.movefailed(this.pkey, this.fkey, url);
 
+    if (retry) {
+      return 1;
+    }
     return await this.redis.incr(this.dkey);
   }
 

From 688cb5e034b4a02698cb446ce7011f54ef49c7f9 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Tue, 21 Jan 2025 00:58:26 -0800
Subject: [PATCH 5/7] retries: move page urls that failed after all retries to
 separate :ff list

tests: add test for retry, ensure all retries are used
---
 src/util/state.ts             | 11 +++++--
 tests/test-fail-retry.test.js | 54 +++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 2 deletions(-)
 create mode 100644 tests/test-fail-retry.test.js

diff --git a/src/util/state.ts b/src/util/state.ts
index 37d595d8..9d8620d3 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -123,6 +123,7 @@ declare module "ioredis" {
     requeuefailed(
       fkey: string,
      qkey: string,
+      ffkey: string,
       maxRetryPending: number,
       maxRegularDepth: number,
     ): Result;
@@ -178,6 +179,7 @@ export class RedisCrawlState {
   skey: string;
   dkey: string;
   fkey: string;
+  ffkey: string;
   ekey: string;
   pageskey: string;
   esKey: string;
@@ -199,6 +201,8 @@ export class RedisCrawlState {
     this.dkey = this.key + ":d";
     // failed
     this.fkey = this.key + ":f";
+    // failed final, no more retry
+    this.ffkey = this.key + ":ff";
     // crawler errors
     this.ekey = this.key + ":e";
     // pages
@@ -290,19 +294,21 @@ end
       });
 
     redis.defineCommand("requeuefailed", {
-      numberOfKeys: 2,
+      numberOfKeys: 3,
       lua: `
 local json = redis.call('rpop', KEYS[1]);
 
 if json then
   local data = cjson.decode(json);
   data['retry'] = (data['retry'] or 0) + 1;
+
   if tonumber(data['retry']) <= tonumber(ARGV[1]) then
-    json = cjson.encode(data);
+    local json = cjson.encode(data);
     local score = (data['depth'] or 0) + ((data['extraHops'] or 0) * ARGV[2]);
     redis.call('zadd', KEYS[2], score, json);
     return 1;
   else
+    redis.call('lpush', KEYS[3], json);
     return 2;
   end
 end
@@ -580,6 +586,7 @@ return inx;
     const res = await this.redis.requeuefailed(
       this.fkey,
       this.qkey,
+      this.ffkey,
       this.maxRetryPending,
       MAX_DEPTH,
     );
diff --git a/tests/test-fail-retry.test.js b/tests/test-fail-retry.test.js
new file mode 100644
index 00000000..76441efa
--- /dev/null
+++ b/tests/test-fail-retry.test.js
@@ -0,0 +1,54 @@
+import { execSync, spawn } from "child_process";
+import Redis from "ioredis";
+
+const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
+
+async function sleep(time) {
+  await new Promise((resolve) => setTimeout(resolve, time));
+}
+
+test("run crawl", async () => {
+  let status = 0;
+  execSync(`docker run -d -e CRAWL_ID=test -p 36387:6379 --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --pageExtraDelay 10 --debugAccessRedis`);
+
+/*
+  async function runServer() {
+    console.log("Waiting to start server");
+    await sleep(2000);
+
+    console.log("Starting server");
+    //spawn("../../node_modules/.bin/http-server", ["-p", "31501", "--username", "user", "--password", "pass"], {cwd: "./docs/site"});
+  }
+*/
+  const redis = new Redis("redis://127.0.0.1:36387/0", { lazyConnect: true, retryStrategy: () => null });
+
+  await sleep(3000);
+
+  let numRetries = 0;
+
+  try {
+    await redis.connect({
+      maxRetriesPerRequest: 100,
+    });
+
+    //runServer();
+
+    while (true) {
+      const res = await redis.lrange("test:ff", 0, -1);
+      if (res.length) {
+        const data = JSON.parse(res);
+        if (data.retry) {
+          numRetries = data.retry;
+          break;
+        }
+      }
+      await sleep(20);
+    }
+
+  } catch (e) {
+    console.error(e);
+  } finally {
+    expect(numRetries).toBe(5);
+  }
+});
+

From d70bb12c33516b21a1eb0b9a07acf6b79fc99d8c Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Tue, 21 Jan 2025 16:18:40 -0800
Subject: [PATCH 6/7] move yaml dump into try/catch

---
 src/crawler.ts | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index 638bcba5..6119b567 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -2657,8 +2657,9 @@ self.__bx_behaviors.selectMainBehavior();
     if (this.origConfig) {
       this.origConfig.state = state;
     }
-    const res = yaml.dump(this.origConfig, { lineWidth: -1 });
+
    try {
+      const res = yaml.dump(this.origConfig, { lineWidth: -1 });
      logger.info(`Saving crawl state to: ${filename}`);
      await fsp.writeFile(filename, res);
    } catch (e) {

From ac1470e4e070687df95b53792d47a9a0d51571c6 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Tue, 21 Jan 2025 16:36:17 -0800
Subject: [PATCH 7/7] hashtag check: when subsequent page URLs are same page
 but different hashtag, do a full reload

---
 src/crawler.ts | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index 6119b567..e8444651 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -1889,12 +1889,14 @@ self.__bx_behaviors.selectMainBehavior();
       }
     };
 
-    page.on("response", waitFirstResponse);
+    const handleFirstLoadEvents = () => {
+      page.on("response", waitFirstResponse);
 
-    // store that domcontentloaded was finished
-    page.once("domcontentloaded", () => {
-      data.loadState = LoadState.CONTENT_LOADED;
-    });
+      // store that domcontentloaded was finished
+      page.once("domcontentloaded", () => {
+        data.loadState = LoadState.CONTENT_LOADED;
+      });
+    };
     const gotoOpts = data.isHTMLPage
       ? this.gotoOpts
       :
@@ -1902,9 +1904,24 @@ self.__bx_behaviors.selectMainBehavior();
     logger.info("Awaiting page load", logDetails);
 
+    const urlNoHash = url.split("#")[0];
+
+    const fullRefresh = urlNoHash === page.url().split("#")[0];
+
     try {
+      if (!fullRefresh) {
+        handleFirstLoadEvents();
+      }
       // store the page load response when page fully loads
       fullLoadedResponse = await page.goto(url, gotoOpts);
+
+      if (fullRefresh) {
+        logger.debug("Hashtag-only change, doing full page reload");
+
+        handleFirstLoadEvents();
+
+        fullLoadedResponse = await page.reload(gotoOpts);
+      }
     } catch (e) {
       if (!(e instanceof Error)) {
         throw e;
       }
@@ -1948,7 +1965,7 @@ self.__bx_behaviors.selectMainBehavior();
     if (
       depth === 0 &&
       !isChromeError &&
-      respUrl !== url.split("#")[0] &&
+      respUrl !== urlNoHash &&
       respUrl + "/" !== url &&
       !downloadResponse
     ) {