From 0b9cd71c5a74efb2ab0a6f41d76a49fdf9985769 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 13 Nov 2024 23:28:37 -0800 Subject: [PATCH] Ensure partial responses are not written (#721) various fixes for streaming, especially related to range requests - follow up to #709 - fix: prefer streaming current response via takeStream, not only when size is unknown - don't serialize async responses prematurely - don't serialize 206 responses if there is size mismatch --- src/util/recorder.ts | 48 ++++++++++++++++++++++++++------------------ src/util/worker.ts | 4 ++-- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/src/util/recorder.ts b/src/util/recorder.ts index be36cc3b..c237ce34 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -697,19 +697,18 @@ export class Recorder { requestId, }; - // fetching using response stream, await here and then either call fulFill, or if not started, return false - if (contentLen < 0) { - const fetcher = new ResponseStreamAsyncFetcher(opts); - const res = await fetcher.load(); - switch (res) { - case "dupe": - this.removeReqResp(networkId); - return false; - - case "fetched": - streamingConsume = true; - break; - } + // fetching using response stream as first attempt, + // await here and then either call fulFill, or if dupe, return false + const fetcher = new ResponseStreamAsyncFetcher(opts); + const res = await fetcher.load(); + switch (res) { + case "dupe": + this.removeReqResp(networkId); + return false; + + case "fetched": + streamingConsume = true; + break; } // if not consumed via takeStream, attempt async loading @@ -750,7 +749,12 @@ export class Recorder { // if in browser context, and not also intercepted in page context // serialize here, as won't be getting a loadingFinished message for it - if (isBrowserContext && !reqresp.inPageContext && reqresp.payload) { + if ( + isBrowserContext && + !reqresp.inPageContext && + !reqresp.asyncLoading && + reqresp.payload + ) { this.removeReqResp(networkId); await this.serializeToWARC(reqresp); } @@ -788,7 +792,7 @@ export class Recorder { ? "document not loaded in browser, possibly other URLs missing" : "URL not loaded in browser"; - logger.debug(msg, { url, resourceType }, "recorder"); + logger.debug(msg, { url, resourceType, e }, "recorder"); } return true; @@ -797,7 +801,11 @@ export class Recorder { addAsyncFetch(opts: NetworkLoadAsyncFetchOptions, contentLen: number) { let fetcher: AsyncFetcher; - if (opts.reqresp.method !== "GET" || contentLen > MAX_NETWORK_LOAD_SIZE) { + if ( + opts.reqresp.method !== "GET" || + contentLen > MAX_NETWORK_LOAD_SIZE || + !opts.reqresp.inPageContext + ) { fetcher = new AsyncFetcher(opts); } else { fetcher = new NetworkLoadStreamAsyncFetcher(opts); @@ -866,7 +874,7 @@ export class Recorder { async awaitPageResources() { for (const [requestId, reqresp] of this.pendingRequests.entries()) { - if (reqresp.payload) { + if (reqresp.payload && !reqresp.asyncLoading) { this.removeReqResp(requestId); await this.serializeToWARC(reqresp); // if no url, and not fetch intercept or async loading, @@ -1455,8 +1463,10 @@ class AsyncFetcher { }, "recorder", ); - //await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url); - //return fetched; + if (status === 206) { + await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status); + return "notfetched"; + } } const externalBuffer: TempFileBuffer = diff --git a/src/util/worker.ts b/src/util/worker.ts index abcce726..256d4241 100644 --- a/src/util/worker.ts +++ b/src/util/worker.ts @@ -438,9 +438,9 @@ export async function runWorkers( await Promise.allSettled(workers.map((worker) => worker.run())); - await crawler.browser.close(); - await closeWorkers(); + + await crawler.browser.close(); } // ===========================================================================