various edge-case loading optimizations: (#709)
- rework 'should stream' logic (see the sketches after this message):
  * ensure 206 responses (or any response) larger than 25M are streamed
  * responses between 5M and 25M are read into memory if text/CSS/JS, as they may be rewritten
  * responses <5M are read into memory
  * responses of unknown size are streamed if 2xx, otherwise read into memory, on the assumption that error responses may lack a content-length but are otherwise small
- likely fix for issues in #706
- if too many range requests are being made for the same URL, skip/fail them right away to reduce load
- assume the main browser context is used for fetching, not just service workers, and always enable interception
- check for false-positive 'net-aborted' errors that may actually be OK for media, as well as for documents
- improve logging
- interrupt any pending requests (that may still be loading via the browser context) after page timeout, and log the dropped requests
---------
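
Below is a minimal TypeScript sketch of the reworked 'should stream' decision summarized in the bullets above. The function name shouldStream, its signature, and the mime-type regex are illustrative assumptions, not the crawler's actual code:

const FIVE_MB = 5 * 1024 * 1024;
const TWENTY_FIVE_MB = 25 * 1024 * 1024;

// contentLength is undefined when the response size is unknown
function shouldStream(
  status: number,
  mimeType: string,
  contentLength?: number,
): boolean {
  if (contentLength === undefined) {
    // unknown size: stream 2xx responses; error responses are assumed
    // to be small enough to read into memory
    return status >= 200 && status < 300;
  }
  if (contentLength > TWENTY_FIVE_MB) {
    // anything over 25M, including 206 partial responses, is streamed
    return true;
  }
  if (contentLength > FIVE_MB) {
    // 5M-25M: keep text/CSS/JS in memory, since it may be rewritten
    return !/text|css|javascript/i.test(mimeType);
  }
  // under 5M: read into memory
  return false;
}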
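
Likewise, a hypothetical sketch of the range-request guard described above; the counter map, the limit of 6, and the function name are all assumptions for illustration:

const rangeRequestCounts = new Map<string, number>();
const MAX_RANGE_REQUESTS_PER_URL = 6; // assumed limit

// returns true if a new range request for this URL should be
// skipped/failed right away instead of fetched
function shouldSkipRangeRequest(url: string): boolean {
  const count = (rangeRequestCounts.get(url) ?? 0) + 1;
  rangeRequestCounts.set(url, count);
  return count > MAX_RANGE_REQUESTS_PER_URL;
}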

Co-authored-by: Tessa Walsh <[email protected]>
ikreymer and tw4l authored Oct 31, 2024
1 parent 5c00bca commit e5bab8e
Showing 6 changed files with 149 additions and 80 deletions.
2 changes: 1 addition & 1 deletion package.json
@@ -30,7 +30,7 @@
   "p-queue": "^7.3.4",
   "pixelmatch": "^5.3.0",
   "pngjs": "^7.0.0",
-  "puppeteer-core": "^23.5.1",
+  "puppeteer-core": "^23.6.0",
   "sax": "^1.3.0",
   "sharp": "^0.32.6",
   "tsc": "^2.0.4",
6 changes: 4 additions & 2 deletions src/crawler.ts
@@ -175,6 +175,7 @@ export class Crawler {
   finalExit = false;
   uploadAndDeleteLocal = false;
   done = false;
+  postCrawling = false;

   textInPages = false;

@@ -1536,12 +1537,13 @@ self.__bx_behaviors.selectMainBehavior();
   }

   async postCrawl() {
+    this.postCrawling = true;
+    logger.info("Crawling done");
+
     if (this.params.combineWARC && !this.params.dryRun) {
       await this.combineWARC();
     }

-    logger.info("Crawling done");
-
     if (
       (this.params.generateCDX || this.params.generateWACZ) &&
       !this.params.dryRun
36 changes: 11 additions & 25 deletions src/util/browser.ts
@@ -6,7 +6,7 @@ import { Readable } from "node:stream";
 import os from "os";
 import path from "path";

-import { LogContext, logger } from "./logger.js";
+import { formatErr, LogContext, logger } from "./logger.js";
 import { initStorage } from "./storage.js";

 import { DISPLAY, type ServiceWorkerOpt } from "./constants.js";

@@ -126,7 +126,7 @@ export class Browser {
       ? undefined
       : (target) => this.targetFilter(target),
     };
-    await this._init(launchOpts, ondisconnect, recording);
+    await this._init(launchOpts, ondisconnect);
   }

   targetFilter(target: Target) {

@@ -392,17 +392,14 @@
     launchOpts: PuppeteerLaunchOptions,
     // eslint-disable-next-line @typescript-eslint/ban-types
     ondisconnect: Function | null = null,
-    recording: boolean,
   ) {
     this.browser = await puppeteer.launch(launchOpts);

     const target = this.browser.target();

     this.firstCDP = await target.createCDPSession();

-    if (recording) {
-      await this.serviceWorkerFetch();
-    }
+    await this.browserContextFetch();

     if (ondisconnect) {
       this.browser.on("disconnected", (err) => ondisconnect(err));

@@ -479,35 +476,24 @@
     return { page, cdp };
   }

-  async serviceWorkerFetch() {
+  async browserContextFetch() {
     if (!this.firstCDP) {
       return;
     }

     this.firstCDP.on("Fetch.requestPaused", async (params) => {
-      const { frameId, requestId, networkId, request } = params;
+      const { frameId, requestId, request } = params;
+
+      const { url } = request;

       if (!this.firstCDP) {
         throw new Error("CDP missing");
       }

-      if (networkId) {
-        try {
-          await this.firstCDP.send("Fetch.continueResponse", { requestId });
-        } catch (e) {
-          logger.warn(
-            "continueResponse failed",
-            { url: request.url },
-            "recorder",
-          );
-        }
-        return;
-      }
-
       let foundRecorder = null;

       for (const recorder of this.recorders) {
-        if (recorder.swUrls.has(request.url)) {
+        if (recorder.swUrls.has(url)) {
           recorder.swFrameIds.add(frameId);
         }

@@ -520,16 +506,16 @@
       if (!foundRecorder) {
         logger.warn(
           "Skipping URL from unknown frame",
-          { url: request.url, frameId },
+          { url, frameId },
           "recorder",
         );

         try {
           await this.firstCDP.send("Fetch.continueResponse", { requestId });
         } catch (e) {
-          logger.warn(
+          logger.debug(
             "continueResponse failed",
-            { url: request.url },
+            { url, ...formatErr(e), from: "serviceWorker" },
             "recorder",
           );
         }
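
For context on the browserContextFetch() change above, a minimal standalone sketch of wiring a Fetch.requestPaused handler to a browser-level CDP session via puppeteer-core; the executable path and the catch-all URL pattern are assumptions, not the crawler's actual configuration:

import puppeteer from "puppeteer-core";

async function main() {
  const browser = await puppeteer.launch({
    executablePath: "/usr/bin/chromium", // assumed path
  });

  // CDP session on the browser-level target, as in _init() above
  const cdp = await browser.target().createCDPSession();

  // pause responses so the handler can decide how to handle each one
  await cdp.send("Fetch.enable", {
    patterns: [{ urlPattern: "*", requestStage: "Response" }],
  });

  cdp.on("Fetch.requestPaused", async (params) => {
    const { requestId, request } = params;
    try {
      // let the response through unchanged; a recorder would inspect
      // or rewrite it here instead
      await cdp.send("Fetch.continueResponse", { requestId });
    } catch (e) {
      // the request may already be gone; log and move on
      console.debug("continueResponse failed", request.url, e);
    }
  });
}

main().catch(console.error);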
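
And a hypothetical sketch of interrupting pending requests after the page timeout, per the commit message; the pendingRequestIds set and the "Aborted" error reason are assumptions:

import type { CDPSession } from "puppeteer-core";

const pendingRequestIds = new Set<string>();

// called once the page timeout fires: fail any requests still paused
// in the Fetch domain, logging each one dropped
async function interruptPendingRequests(cdp: CDPSession) {
  for (const requestId of pendingRequestIds) {
    try {
      await cdp.send("Fetch.failRequest", {
        requestId,
        errorReason: "Aborted",
      });
      console.warn("dropped pending request", { requestId });
    } catch (e) {
      // the request may have already completed
    }
  }
  pendingRequestIds.clear();
}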
(diffs for the remaining changed files not shown)
