Skip to content

Commit

Permalink
Ensure partial responses are not written (#721)
Browse files Browse the repository at this point in the history
various fixes for streaming, especially related to range requests
- follow up to #709
- fix: prefer streaming current response via takeStream, not only when
size is unknown
- don't serialize async responses prematurely
- don't serialize 206 responses if there is size mismatch
  • Loading branch information
ikreymer authored Nov 14, 2024
1 parent f56d650 commit 0b9cd71
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 21 deletions.
48 changes: 29 additions & 19 deletions src/util/recorder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -697,19 +697,18 @@ export class Recorder {
requestId,
};

// fetching using response stream, await here and then either call fulFill, or if not started, return false
if (contentLen < 0) {
const fetcher = new ResponseStreamAsyncFetcher(opts);
const res = await fetcher.load();
switch (res) {
case "dupe":
this.removeReqResp(networkId);
return false;

case "fetched":
streamingConsume = true;
break;
}
// fetching using response stream as first attempt,
// await here and then either call fulFill, or if dupe, return false
const fetcher = new ResponseStreamAsyncFetcher(opts);
const res = await fetcher.load();
switch (res) {
case "dupe":
this.removeReqResp(networkId);
return false;

case "fetched":
streamingConsume = true;
break;
}

// if not consumed via takeStream, attempt async loading
Expand Down Expand Up @@ -750,7 +749,12 @@ export class Recorder {

// if in browser context, and not also intercepted in page context
// serialize here, as won't be getting a loadingFinished message for it
if (isBrowserContext && !reqresp.inPageContext && reqresp.payload) {
if (
isBrowserContext &&
!reqresp.inPageContext &&
!reqresp.asyncLoading &&
reqresp.payload
) {
this.removeReqResp(networkId);
await this.serializeToWARC(reqresp);
}
Expand Down Expand Up @@ -788,7 +792,7 @@ export class Recorder {
? "document not loaded in browser, possibly other URLs missing"
: "URL not loaded in browser";

logger.debug(msg, { url, resourceType }, "recorder");
logger.debug(msg, { url, resourceType, e }, "recorder");
}

return true;
Expand All @@ -797,7 +801,11 @@ export class Recorder {
addAsyncFetch(opts: NetworkLoadAsyncFetchOptions, contentLen: number) {
let fetcher: AsyncFetcher;

if (opts.reqresp.method !== "GET" || contentLen > MAX_NETWORK_LOAD_SIZE) {
if (
opts.reqresp.method !== "GET" ||
contentLen > MAX_NETWORK_LOAD_SIZE ||
!opts.reqresp.inPageContext
) {
fetcher = new AsyncFetcher(opts);
} else {
fetcher = new NetworkLoadStreamAsyncFetcher(opts);
Expand Down Expand Up @@ -866,7 +874,7 @@ export class Recorder {

async awaitPageResources() {
for (const [requestId, reqresp] of this.pendingRequests.entries()) {
if (reqresp.payload) {
if (reqresp.payload && !reqresp.asyncLoading) {
this.removeReqResp(requestId);
await this.serializeToWARC(reqresp);
// if no url, and not fetch intercept or async loading,
Expand Down Expand Up @@ -1455,8 +1463,10 @@ class AsyncFetcher {
},
"recorder",
);
//await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url);
//return fetched;
if (status === 206) {
await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status);
return "notfetched";
}
}

const externalBuffer: TempFileBuffer =
Expand Down
4 changes: 2 additions & 2 deletions src/util/worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -438,9 +438,9 @@ export async function runWorkers(

await Promise.allSettled(workers.map((worker) => worker.run()));

await crawler.browser.close();

await closeWorkers();

await crawler.browser.close();
}

// ===========================================================================
Expand Down

0 comments on commit 0b9cd71

Please sign in to comment.