Skip to content

Commit

Permalink
retries: move page urls that failed after all retries to separate :ff list
Browse files Browse the repository at this point in the history

tests: add test for retry, ensure all retries are used
  • Loading branch information
ikreymer committed Jan 21, 2025
1 parent 7a0dda6 commit 688cb5e
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 2 deletions.
11 changes: 9 additions & 2 deletions src/util/state.ts
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ declare module "ioredis" {
requeuefailed(
fkey: string,
qkey: string,
ffkey: string,
maxRetryPending: number,
maxRegularDepth: number,
): Result<number, Context>;
Expand Down Expand Up @@ -178,6 +179,7 @@ export class RedisCrawlState {
skey: string;
dkey: string;
fkey: string;
ffkey: string;
ekey: string;
pageskey: string;
esKey: string;
Expand All @@ -199,6 +201,8 @@ export class RedisCrawlState {
this.dkey = this.key + ":d";
// failed
this.fkey = this.key + ":f";
// failed final, no more retry
this.ffkey = this.key + ":ff";
// crawler errors
this.ekey = this.key + ":e";
// pages
Expand Down Expand Up @@ -290,19 +294,21 @@ end
});

redis.defineCommand("requeuefailed", {
numberOfKeys: 2,
numberOfKeys: 3,
lua: `
local json = redis.call('rpop', KEYS[1]);
if json then
local data = cjson.decode(json);
data['retry'] = (data['retry'] or 0) + 1;
if tonumber(data['retry']) <= tonumber(ARGV[1]) then
json = cjson.encode(data);
local json = cjson.encode(data);
local score = (data['depth'] or 0) + ((data['extraHops'] or 0) * ARGV[2]);
redis.call('zadd', KEYS[2], score, json);
return 1;
else
redis.call('lpush', KEYS[3], json);
return 2;
end
end
Expand Down Expand Up @@ -580,6 +586,7 @@ return inx;
const res = await this.redis.requeuefailed(
this.fkey,
this.qkey,
this.ffkey,
this.maxRetryPending,
MAX_DEPTH,
);
Expand Down
54 changes: 54 additions & 0 deletions tests/test-fail-retry.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import { execSync, spawn } from "child_process";
import Redis from "ioredis";

const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";

// Resolve after `time` milliseconds; await this to pause the test.
function sleep(time) {
  return new Promise((resolve) => setTimeout(resolve, time));
}

test("run crawl", async () => {
  // Start the crawler in docker. Port 31501 on the docker host is never
  // served, so that URL fails every fetch and exercises the retry path;
  // --debugAccessRedis exposes the crawl state redis on localhost:36387.
  execSync(`docker run -d -e CRAWL_ID=test -p 36387:6379 --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --pageExtraDelay 10 --debugAccessRedis`);

  /*
  async function runServer() {
    console.log("Waiting to start server");
    await sleep(2000);
    console.log("Starting server");
    //spawn("../../node_modules/.bin/http-server", ["-p", "31501", "--username", "user", "--password", "pass"], {cwd: "./docs/site"});
  }
  */
  // ioredis options belong on the constructor: connect() accepts only an
  // optional callback, so passing { maxRetriesPerRequest } there is ignored.
  const redis = new Redis("redis://127.0.0.1:36387/0", {
    lazyConnect: true,
    retryStrategy: () => null,
    maxRetriesPerRequest: 100,
  });

  // Give the container time to boot before connecting to its redis.
  await sleep(3000);

  let numRetries = 0;

  try {
    await redis.connect();

    //runServer();

    // Poll the "final failed" list (test:ff) until the crawler gives up on
    // the unreachable URL; Jest's test timeout bounds this loop if the
    // entry never appears.
    while (true) {
      const res = await redis.lrange("test:ff", 0, -1);
      if (res.length) {
        // lrange returns an array of JSON strings; parse the first entry
        // (JSON.parse(res) only worked by accident via array coercion and
        // breaks as soon as the list holds more than one element).
        const data = JSON.parse(res[0]);
        if (data.retry) {
          numRetries = data.retry;
          break;
        }
      }
      await sleep(20);
    }

  } catch (e) {
    console.error(e);
  } finally {
    // The failed page must have consumed all allowed retries (maxRetryPending).
    expect(numRetries).toBe(5);
  }
});

0 comments on commit 688cb5e

Please sign in to comment.