Skip to content

Commit

Permalink
Retry Failed Pages + Ignore Hashtags in Redirect Check (#739)
Browse files Browse the repository at this point in the history
- Retry each page that is marked as failed one more time, at the end of the crawl,
in case the failure was due to a timeout
- Also, don't treat a difference in only the URL hash fragment ("#...") between the seed URL
and the URL actually loaded as a redirect (e.g. don't add it as a new seed)
  • Loading branch information
ikreymer authored Jan 16, 2025
1 parent bc4a958 commit 5d9c62e
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 6 deletions.
9 changes: 7 additions & 2 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1938,10 +1938,15 @@ self.__bx_behaviors.selectMainBehavior();
throw new Error("no response for page load, assuming failed");
}

const respUrl = resp.url();
const respUrl = resp.url().split("#")[0];
const isChromeError = page.url().startsWith("chrome-error://");

if (depth === 0 && !isChromeError && respUrl !== url && !downloadResponse) {
if (
depth === 0 &&
!isChromeError &&
respUrl !== url.split("#")[0] &&
!downloadResponse
) {
data.seedId = await this.crawlState.addExtraSeed(
this.seeds,
this.numOriginalSeeds,
Expand Down
62 changes: 58 additions & 4 deletions src/util/state.ts
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,13 @@ declare module "ioredis" {
state: string,
): Result<void, Context>;

/**
 * Type declaration for the custom "requeuefailed" Redis command
 * (registered via redis.defineCommand with two keys).
 *
 * @param fkey - key the script pops one entry from (rpop)
 * @param qkey - sorted-set key the entry is re-added to (zadd)
 * @param maxRetryPending - upper bound on the entry's "retry" counter;
 *   the entry is only re-queued while its incremented counter is <= this
 * @param maxRegularDepth - multiplier applied to the entry's "extraHops"
 *   when computing the sorted-set score
 * @returns 1 if an entry was re-queued, 2 if an entry was popped but had
 *   exceeded the retry limit (it is not re-added), 0 if nothing was popped
 */
requeuefailed(
fkey: string,
qkey: string,
maxRetryPending: number,
maxRegularDepth: number,
): Result<number, Context>;

unlockpending(
pkeyUrl: string,
uid: string,
Expand Down Expand Up @@ -283,6 +290,27 @@ if json then
redis.call('hdel', KEYS[1], ARGV[1]);
end
`,
});

// Registers the custom "requeuefailed" command: moves one entry from the
// list at KEYS[1] back into the sorted set at KEYS[2] so it can be retried.
//
// Script behavior:
//  - pops one JSON entry from the tail of KEYS[1] (rpop)
//  - increments its "retry" counter (a missing counter is treated as 0)
//  - if the counter is <= ARGV[1], re-encodes the entry and adds it to
//    KEYS[2] with score = depth + extraHops * ARGV[2], then returns 1
//  - otherwise returns 2; the popped entry is NOT put back on either key
//  - returns 0 when there was nothing to pop
//
// Note: the stored JSON changes on requeue, since it now carries the
// updated "retry" field.
redis.defineCommand("requeuefailed", {
// KEYS[1] = source list, KEYS[2] = destination sorted set
numberOfKeys: 2,
lua: `
local json = redis.call('rpop', KEYS[1]);
if json then
local data = cjson.decode(json);
data['retry'] = (data['retry'] or 0) + 1;
if tonumber(data['retry']) <= tonumber(ARGV[1]) then
json = cjson.encode(data);
local score = (data['depth'] or 0) + ((data['extraHops'] or 0) * ARGV[2]);
redis.call('zadd', KEYS[2], score, json);
return 1;
else
return 2;
end
end
return 0;
`,
});

Expand Down Expand Up @@ -543,18 +571,44 @@ return inx;
}

async nextFromQueue() {
const json = await this._getNext();
let json = await this._getNext();
let retryFailed = false;

if (!json) {
const res = await this.redis.requeuefailed(
this.fkey,
this.qkey,
this.maxRetryPending,
MAX_DEPTH,
);

switch (res) {
case 1:
json = await this._getNext();
retryFailed = true;
break;

case 2:
logger.debug("Did not retry failed, already retried", {}, "state");
return null;
}
}

if (!json) {
return null;
}

let data;

try {
data = JSON.parse(json);
} catch (e) {
logger.error("Invalid queued json", json, "redis");
logger.error("Invalid queued json", json, "state");
return null;
}

if (!data) {
return null;
if (retryFailed) {
logger.debug("Retring failed URL", { url: data.url }, "state");
}

await this.markStarted(data.url);
Expand Down

0 comments on commit 5d9c62e

Please sign in to comment.