Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure partial responses are not written #721

Merged
merged 3 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 29 additions & 19 deletions src/util/recorder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -697,19 +697,18 @@ export class Recorder {
requestId,
};

// fetching using response stream, await here and then either call fulFill, or if not started, return false
if (contentLen < 0) {
const fetcher = new ResponseStreamAsyncFetcher(opts);
const res = await fetcher.load();
switch (res) {
case "dupe":
this.removeReqResp(networkId);
return false;

case "fetched":
streamingConsume = true;
break;
}
// fetching using response stream as first attempt,
// await here and then either call fulFill, or if dupe, return false
const fetcher = new ResponseStreamAsyncFetcher(opts);
const res = await fetcher.load();
switch (res) {
case "dupe":
this.removeReqResp(networkId);
return false;

case "fetched":
streamingConsume = true;
break;
}

// if not consumed via takeStream, attempt async loading
Expand Down Expand Up @@ -750,7 +749,12 @@ export class Recorder {

// if in browser context, and not also intercepted in page context
// serialize here, as won't be getting a loadingFinished message for it
if (isBrowserContext && !reqresp.inPageContext && reqresp.payload) {
if (
isBrowserContext &&
!reqresp.inPageContext &&
!reqresp.asyncLoading &&
reqresp.payload
) {
this.removeReqResp(networkId);
await this.serializeToWARC(reqresp);
}
Expand Down Expand Up @@ -788,7 +792,7 @@ export class Recorder {
? "document not loaded in browser, possibly other URLs missing"
: "URL not loaded in browser";

logger.debug(msg, { url, resourceType }, "recorder");
logger.debug(msg, { url, resourceType, e }, "recorder");
}

return true;
Expand All @@ -797,7 +801,11 @@ export class Recorder {
addAsyncFetch(opts: NetworkLoadAsyncFetchOptions, contentLen: number) {
let fetcher: AsyncFetcher;

if (opts.reqresp.method !== "GET" || contentLen > MAX_NETWORK_LOAD_SIZE) {
if (
opts.reqresp.method !== "GET" ||
contentLen > MAX_NETWORK_LOAD_SIZE ||
!opts.reqresp.inPageContext
) {
fetcher = new AsyncFetcher(opts);
} else {
fetcher = new NetworkLoadStreamAsyncFetcher(opts);
Expand Down Expand Up @@ -866,7 +874,7 @@ export class Recorder {

async awaitPageResources() {
for (const [requestId, reqresp] of this.pendingRequests.entries()) {
if (reqresp.payload) {
if (reqresp.payload && !reqresp.asyncLoading) {
this.removeReqResp(requestId);
await this.serializeToWARC(reqresp);
// if no url, and not fetch intercept or async loading,
Expand Down Expand Up @@ -1455,8 +1463,10 @@ class AsyncFetcher {
},
"recorder",
);
//await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url);
//return fetched;
if (status === 206) {
await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status);
return "notfetched";
}
}

const externalBuffer: TempFileBuffer =
Expand Down
4 changes: 2 additions & 2 deletions src/util/worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -438,9 +438,9 @@ export async function runWorkers(

await Promise.allSettled(workers.map((worker) => worker.run()));

await crawler.browser.close();

await closeWorkers();

await crawler.browser.close();
}

// ===========================================================================
Expand Down
Loading