Retry support and additional fixes #743

Open · wants to merge 7 commits into base: main · showing changes from 2 commits
package.json: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "1.5.0-beta.2",
+  "version": "1.5.0-beta.3",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
src/crawler.ts: 3 additions & 1 deletion

@@ -1498,6 +1498,7 @@ self.__bx_behaviors.selectMainBehavior();
       logger.info("crawl already finished, running post-crawl tasks", {
         state: initState,
       });
+      this.finalExit = true;
       await this.postCrawl();
       return;
     } else if (await this.crawlState.isCrawlStopped()) {
@@ -1921,7 +1922,7 @@ self.__bx_behaviors.selectMainBehavior();
     } else if (!downloadResponse) {
       // log if not already log and rethrow, consider page failed
       if (msg !== "logged") {
-        logger.error("Page Load Failed, skipping page", {
+        logger.error("Page Load Failed, will retry", {
           msg,
           loadState: data.loadState,
           ...logDetails,
@@ -1945,6 +1946,7 @@ self.__bx_behaviors.selectMainBehavior();
         depth === 0 &&
         !isChromeError &&
         respUrl !== url.split("#")[0] &&
+        respUrl + "/" !== url &&
         !downloadResponse
       ) {
         data.seedId = await this.crawlState.addExtraSeed(
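The added respUrl + "/" !== url clause keeps a redirect that merely drops a trailing slash from being registered as an extra seed. A minimal illustration, with hypothetical values:

// Hypothetical values showing the case the new clause filters out:
const url = "https://example.com/path/";    // seed URL as queued
const respUrl = "https://example.com/path"; // final response URL after redirect

console.log(respUrl !== url.split("#")[0]); // true: the old check alone would add an extra seed
console.log(respUrl + "/" !== url);         // false: the new clause recognizes the same page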
src/util/argParser.ts: 3 additions & 0 deletions

@@ -700,6 +700,9 @@ class ArgParser {
 
     // background behaviors to apply
     const behaviorOpts: { [key: string]: string | boolean } = {};
+    if (argv.blockAds) {
+      argv.behaviors.push("autoclick");
+    }
     if (argv.behaviors.length > 0) {
       argv.behaviors.forEach((x: string) => {
         if (BEHAVIOR_TYPES.includes(x)) {

Review comment on lines +703 to +705 (Member): Just leaving a comment so we remember to undo this
src/util/blockrules.ts: 3 additions & 1 deletion

@@ -5,6 +5,7 @@ import { HTTPRequest, Page } from "puppeteer-core";
 import { Browser } from "./browser.js";
 
 import { fetch } from "undici";
+import { getProxyDispatcher } from "./proxy.js";
 
 const RULE_TYPES = ["block", "allowOnly"];
 
@@ -271,7 +272,7 @@ export class BlockRules {
     logDetails: Record<string, any>,
   ) {
     try {
-      const res = await fetch(reqUrl);
+      const res = await fetch(reqUrl, { dispatcher: getProxyDispatcher() });
       const text = await res.text();
 
       return !!text.match(frameTextMatch);
@@ -302,6 +303,7 @@ export class BlockRules {
       method: "PUT",
       headers: { "Content-Type": "text/html" },
       body,
+      dispatcher: getProxyDispatcher(),
     });
   }
 }
src/util/constants.ts: 1 addition & 0 deletions

@@ -27,6 +27,7 @@ export const ADD_LINK_FUNC = "__bx_addLink";
 export const FETCH_FUNC = "__bx_fetch";
 
 export const MAX_DEPTH = 1000000;
+export const MAX_RETRY_FAILED = 5;
 
 export const FETCH_HEADERS_TIMEOUT_SECS = 30;
 export const PAGE_OP_TIMEOUT_SECS = 5;
src/util/file_reader.ts: 2 additions & 1 deletion

@@ -6,6 +6,7 @@ import util from "util";
 import { exec as execCallback } from "child_process";
 
 import { logger } from "./logger.js";
+import { getProxyDispatcher } from "./proxy.js";
 
 const exec = util.promisify(execCallback);
 
@@ -85,7 +86,7 @@ async function collectOnlineBehavior(url: string): Promise<FileSources> {
   const behaviorFilepath = `/app/behaviors/${filename}`;
 
   try {
-    const res = await fetch(url);
+    const res = await fetch(url, { dispatcher: getProxyDispatcher() });
     const fileContents = await res.text();
     await fsp.writeFile(behaviorFilepath, fileContents);
     logger.info(
src/util/originoverride.ts: 5 additions & 1 deletion

@@ -3,6 +3,7 @@ import { formatErr, logger } from "./logger.js";
 import { Browser } from "./browser.js";
 
 import { fetch } from "undici";
+import { getProxyDispatcher } from "./proxy.js";
 
 export class OriginOverride {
   originOverride: { origUrl: URL; destUrl: URL }[];
@@ -45,7 +46,10 @@ export class OriginOverride {
       headers.set("origin", orig.origin);
     }
 
-    const resp = await fetch(newUrl, { headers });
+    const resp = await fetch(newUrl, {
+      headers,
+      dispatcher: getProxyDispatcher(),
+    });
 
     const body = Buffer.from(await resp.arrayBuffer());
     const respHeaders = Object.fromEntries(resp.headers);
src/util/proxy.ts: 8 additions & 2 deletions

@@ -1,5 +1,5 @@
 import net from "net";
-import { Agent, Dispatcher, ProxyAgent, setGlobalDispatcher } from "undici";
+import { Agent, Dispatcher, ProxyAgent } from "undici";
 
 import child_process from "child_process";
 
@@ -13,6 +13,8 @@ const SSH_PROXY_LOCAL_PORT = 9722;
 
 const SSH_WAIT_TIMEOUT = 30000;
 
+let proxyDispatcher: Dispatcher | undefined = undefined;
+
 export function getEnvProxyUrl() {
   if (process.env.PROXY_SERVER) {
     return process.env.PROXY_SERVER;
@@ -46,10 +48,14 @@ export async function initProxy(
 
   // set global fetch() dispatcher (with proxy, if any)
   const dispatcher = createDispatcher(proxy, agentOpts);
-  setGlobalDispatcher(dispatcher);
+  proxyDispatcher = dispatcher;
   return proxy;
 }
 
+export function getProxyDispatcher() {
+  return proxyDispatcher;
+}
+
 export function createDispatcher(
   proxyUrl: string,
   opts: Agent.Options,
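Because initProxy() no longer calls setGlobalDispatcher(), call sites that should honor the configured proxy now pass the dispatcher explicitly, as the other files in this PR do. A minimal sketch of that pattern (fetchText is a hypothetical helper; undici's fetch falls back to its default dispatcher when the option is undefined):

import { fetch } from "undici";
import { getProxyDispatcher } from "./proxy.js";

// Route a request through the crawler's proxy when one was configured;
// if initProxy() never ran, getProxyDispatcher() returns undefined and
// undici uses its default global dispatcher instead.
async function fetchText(url: string): Promise<string> {
  const res = await fetch(url, { dispatcher: getProxyDispatcher() });
  return await res.text();
}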
src/util/recorder.ts: 14 additions & 9 deletions

@@ -8,7 +8,7 @@ import {
   isRedirectStatus,
 } from "./reqresp.js";
 
-import { fetch, getGlobalDispatcher, Response } from "undici";
+import { fetch, Response } from "undici";
 
 import {
   getCustomRewriter,
@@ -23,6 +23,7 @@ import { WARCWriter } from "./warcwriter.js";
 import { RedisCrawlState, WorkerId } from "./state.js";
 import { CDPSession, Protocol } from "puppeteer-core";
 import { Crawler } from "../crawler.js";
+import { getProxyDispatcher } from "./proxy.js";
 
 const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
 const MAX_TEXT_REWRITE_SIZE = 25_000_000;
@@ -1588,14 +1589,18 @@ class AsyncFetcher {
 
     const headers = reqresp.getRequestHeadersDict();
 
-    const dispatcher = getGlobalDispatcher().compose((dispatch) => {
-      return (opts, handler) => {
-        if (opts.headers) {
-          reqresp.requestHeaders = opts.headers as Record<string, string>;
-        }
-        return dispatch(opts, handler);
-      };
-    });
+    let dispatcher = getProxyDispatcher();
+
+    if (dispatcher) {
+      dispatcher = dispatcher.compose((dispatch) => {
+        return (opts, handler) => {
+          if (opts.headers) {
+            reqresp.requestHeaders = opts.headers as Record<string, string>;
+          }
+          return dispatch(opts, handler);
+        };
+      });
+    }
 
     const resp = await fetch(url!, {
       method,
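For context on the compose() call above: an undici Dispatcher can be wrapped with an interceptor that sees every outgoing request just before dispatch, which is how AsyncFetcher captures the headers actually sent so they can be written to the WARC record. A standalone sketch of the same pattern (the plain Agent stands in for the proxy dispatcher):

import { Agent, fetch } from "undici";

// Wrap a base dispatcher to observe the fully-assembled request headers.
let capturedHeaders: Record<string, string> | undefined;

const dispatcher = new Agent().compose((dispatch) => {
  return (opts, handler) => {
    if (opts.headers) {
      capturedHeaders = opts.headers as Record<string, string>;
    }
    return dispatch(opts, handler);
  };
});

await fetch("https://example.com/", { dispatcher });
// capturedHeaders now holds the headers undici actually sent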
src/util/sitemapper.ts: 5 additions & 1 deletion

@@ -10,6 +10,7 @@ import { DETECT_SITEMAP } from "./constants.js";
 import { sleep } from "./timing.js";
 
 import { fetch, Response } from "undici";
+import { getProxyDispatcher } from "./proxy.js";
 
 const SITEMAP_CONCURRENCY = 5;
 
@@ -65,7 +66,10 @@ export class SitemapReader extends EventEmitter {
 
   async _fetchWithRetry(url: string, message: string) {
     while (true) {
-      const resp = await fetch(url, { headers: this.headers });
+      const resp = await fetch(url, {
+        headers: this.headers,
+        dispatcher: getProxyDispatcher(),
+      });
 
       if (resp.ok) {
         return resp;
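Only the fetch call changes in this hunk; the rest of _fetchWithRetry is not shown. For illustration, a plausible shape of such a loop, assuming non-OK responses sleep and retry (a hypothetical reconstruction, not the PR's code):

import { fetch, Response } from "undici";
import { getProxyDispatcher } from "./proxy.js";

const sleep = (secs: number) => new Promise((r) => setTimeout(r, secs * 1000));

async function fetchWithRetry(url: string, retries = 3): Promise<Response> {
  while (true) {
    const resp = await fetch(url, { dispatcher: getProxyDispatcher() });
    if (resp.ok) {
      return resp;
    }
    if (--retries <= 0) {
      throw new Error(`sitemap fetch failed: ${resp.status}`);
    }
    await sleep(5); // back off before the next attempt
  }
}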
src/util/state.ts: 3 additions & 3 deletions

@@ -3,7 +3,7 @@ import { v4 as uuidv4 } from "uuid";
 
 import { logger } from "./logger.js";
 
-import { MAX_DEPTH } from "./constants.js";
+import { MAX_DEPTH, MAX_RETRY_FAILED } from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { Frame } from "puppeteer-core";
 
@@ -170,7 +170,7 @@ export type SaveState = {
 // ============================================================================
 export class RedisCrawlState {
   redis: Redis;
-  maxRetryPending = 1;
+  maxRetryPending = MAX_RETRY_FAILED;
 
   uid: string;
   key: string;
 
@@ -608,7 +608,7 @@ return inx;
     }
 
     if (retryFailed) {
-      logger.debug("Retring failed URL", { url: data.url }, "state");
+      logger.debug("Retrying failed URL", { url: data.url }, "state");
    }
 
     await this.markStarted(data.url);
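Together with MAX_RETRY_FAILED = 5 in constants.ts, this raises the retry budget for failed URLs from one attempt to five. A hypothetical sketch of the counting behavior this implies (the real bookkeeping lives in RedisCrawlState's Redis logic; the names below are illustrative only):

const MAX_RETRY_FAILED = 5;

type QueuedUrl = { url: string; retry: number };

// Illustrative only: a failed URL goes back on the queue with an
// incremented retry count until its budget is spent, after which it
// is marked as permanently failed.
function onPageFailure(
  item: QueuedUrl,
  requeue: (item: QueuedUrl) => void,
  markFailed: (url: string) => void,
) {
  if (item.retry < MAX_RETRY_FAILED) {
    requeue({ url: item.url, retry: item.retry + 1 });
  } else {
    markFailed(item.url);
  }
}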