diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 58e05e327..8db3e795c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -60,8 +60,11 @@ jobs: - name: add http-server for tests run: yarn add -D http-server + - name: install py-wacz as root for tests + run: sudo pip install wacz + - name: run all tests as root - run: sudo DOCKER_HOST_NAME=172.17.0.1 yarn test + run: sudo DOCKER_HOST_NAME=172.17.0.1 yarn test -validate - name: run saved state + qa compare test as non-root - with volume owned by current user run: | diff --git a/Dockerfile b/Dockerfile index 41755aed4..2d85936d7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,13 +17,6 @@ EXPOSE 9222 9223 6080 WORKDIR /app -ADD requirements.txt /app/ -RUN python3 -m venv /app/python-venv && \ - /app/python-venv/bin/pip install -U setuptools && \ - /app/python-venv/bin/pip install -r requirements.txt && \ - ln -s /app/python-venv/bin/wacz /usr/bin/wacz && \ - ln -s /app/python-venv/bin/cdxj-indexer /usr/bin/cdxj-indexer - ADD package.json yarn.lock /app/ # to allow forcing rebuilds from this stage diff --git a/package.json b/package.json index 939829110..084420002 100644 --- a/package.json +++ b/package.json @@ -17,9 +17,9 @@ }, "dependencies": { "@novnc/novnc": "^1.4.0", - "@types/sax": "^1.2.7", - "@webrecorder/wabac": "^2.19.7", + "@webrecorder/wabac": "^2.19.8", "browsertrix-behaviors": "^0.6.4", + "client-zip": "^2.4.5", "fetch-socks": "^1.3.0", "get-folder-size": "^4.0.0", "husky": "^8.0.3", @@ -36,7 +36,7 @@ "tsc": "^2.0.4", "undici": "^6.18.2", "uuid": "8.3.2", - "warcio": "^2.2.1", + "warcio": "^2.3.0", "ws": "^7.4.4", "yargs": "^17.7.2" }, @@ -46,6 +46,7 @@ "@types/node": "^20.8.7", "@types/pixelmatch": "^5.2.6", "@types/pngjs": "^6.0.4", + "@types/sax": "^1.2.7", "@types/uuid": "^9.0.6", "@types/ws": "^8.5.8", "@typescript-eslint/eslint-plugin": "^6.10.0", @@ -62,5 +63,8 @@ "jest": { "transform": {}, "testTimeout": 90000 + }, + "resolutions": { + "wrap-ansi": "7.0.0" } } diff --git a/src/crawler.ts b/src/crawler.ts index bca77934b..1033df86d 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -16,6 +16,8 @@ import { parseArgs } from "./util/argParser.js"; import yaml from "js-yaml"; +import { WACZ, WACZInitOpts, mergeCDXJ } from "./util/wacz.js"; + import { HealthChecker } from "./util/healthcheck.js"; import { TextExtractViaSnapshot } from "./util/textextract.js"; import { @@ -62,7 +64,12 @@ import { import { Recorder } from "./util/recorder.js"; import { SitemapReader } from "./util/sitemapper.js"; import { ScopedSeed } from "./util/seeds.js"; -import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js"; +import { + WARCWriter, + createWARCInfo, + setWARCInfo, + streamFinish, +} from "./util/warcwriter.js"; import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js"; import { initProxy } from "./util/proxy.js"; @@ -117,7 +124,7 @@ export class Crawler { pagesFH?: WriteStream | null = null; extraPagesFH?: WriteStream | null = null; - logFH!: WriteStream; + logFH: WriteStream | null = null; crawlId: string; @@ -150,7 +157,8 @@ export class Crawler { archivesDir: string; tempdir: string; - tempCdxDir: string; + warcCdxDir: string; + indexesDir: string; screenshotWriter: WARCWriter | null; textWriter: WARCWriter | null; @@ -288,7 +296,10 @@ export class Crawler { // archives dir this.archivesDir = path.join(this.collDir, "archive"); this.tempdir = path.join(os.tmpdir(), "tmp-dl"); - this.tempCdxDir = path.join(this.collDir, "tmp-cdx"); + + // indexes dirs + this.warcCdxDir = path.join(this.collDir, "warc-cdx"); + this.indexesDir = path.join(this.collDir, "indexes"); this.screenshotWriter = null; this.textWriter = null; @@ -470,7 +481,7 @@ export class Crawler { if (!this.params.dryRun) { await fsp.mkdir(this.archivesDir, { recursive: true }); await fsp.mkdir(this.tempdir, { recursive: true }); - await fsp.mkdir(this.tempCdxDir, { recursive: true }); + await fsp.mkdir(this.warcCdxDir, { recursive: true }); } this.logFH = fs.createWriteStream(this.logFilename, { flags: "a" }); @@ -1478,36 +1489,24 @@ self.__bx_behaviors.selectMainBehavior(); await this.combineWARC(); } - if (this.params.generateCDX && !this.params.dryRun) { - logger.info("Generating CDX"); - await fsp.mkdir(path.join(this.collDir, "indexes"), { recursive: true }); - await this.crawlState.setStatus("generate-cdx"); + logger.info("Crawling done"); - const warcList = await fsp.readdir(this.archivesDir); - const warcListFull = warcList.map((filename) => - path.join(this.archivesDir, filename), + if ( + (this.params.generateCDX || this.params.generateWACZ) && + !this.params.dryRun + ) { + logger.info("Merging CDX"); + await this.crawlState.setStatus( + this.params.generateWACZ ? "generate-wacz" : "generate-cdx", ); - //const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd})); - const params = [ - "-o", - path.join(this.collDir, "indexes", "index.cdxj"), - ...warcListFull, - ]; - const indexResult = await this.awaitProcess( - child_process.spawn("cdxj-indexer", params, { cwd: this.params.cwd }), + await mergeCDXJ( + this.warcCdxDir, + this.indexesDir, + this.params.generateWACZ ? null : false, ); - if (indexResult === 0) { - logger.debug("Indexing complete, CDX successfully created"); - } else { - logger.error("Error indexing and generating CDX", { - "status code": indexResult, - }); - } } - logger.info("Crawling done"); - if ( this.params.generateWACZ && !this.params.dryRun && @@ -1543,11 +1542,9 @@ self.__bx_behaviors.selectMainBehavior(); if (!this.logFH) { return; } - try { - await new Promise((resolve) => this.logFH.close(() => resolve())); - } catch (e) { - // ignore - } + const logFH = this.logFH; + this.logFH = null; + await streamFinish(logFH); } async generateWACZ() { @@ -1577,110 +1574,67 @@ self.__bx_behaviors.selectMainBehavior(); logger.fatal("No WARC Files, assuming crawl failed"); } - logger.debug("End of log file, storing logs in WACZ"); + const waczPath = path.join(this.collDir, this.params.collection + ".wacz"); - // Build the argument list to pass to the wacz create command - const waczFilename = this.params.collection.concat(".wacz"); - const waczPath = path.join(this.collDir, waczFilename); + const streaming = !!this.storage; - const createArgs = [ - "create", - "-o", - waczPath, - "--pages", - this.seedPagesFile, - "--extra-pages", - this.otherPagesFile, - "--copy-pages", - "--log-directory", - this.logDir, - ]; + if (!streaming) { + logger.debug("WACZ will be written to disk", { path: waczPath }, "wacz"); + } else { + logger.debug("WACZ will be stream uploaded to remote storage"); + } + + logger.debug("End of log file in WACZ, storing logs to WACZ file"); + + await this.closeLog(); + + const waczOpts: WACZInitOpts = { + input: warcFileList.map((x) => path.join(this.archivesDir, x)), + output: waczPath, + pages: this.pagesDir, + logDirectory: this.logDir, + warcCdxDir: this.warcCdxDir, + indexesDir: this.indexesDir, + softwareString: this.infoString, + }; if (process.env.WACZ_SIGN_URL) { - createArgs.push("--signing-url"); - createArgs.push(process.env.WACZ_SIGN_URL); + waczOpts.signingUrl = process.env.WACZ_SIGN_URL; if (process.env.WACZ_SIGN_TOKEN) { - createArgs.push("--signing-token"); - createArgs.push(process.env.WACZ_SIGN_TOKEN); + waczOpts.signingToken = "bearer " + process.env.WACZ_SIGN_TOKEN; } } if (this.params.title) { - createArgs.push("--title"); - createArgs.push(this.params.title); + waczOpts.title = this.params.title; } if (this.params.description) { - createArgs.push("--desc"); - createArgs.push(this.params.description); - } - - createArgs.push("-f"); - - warcFileList.forEach((val) => - createArgs.push(path.join(this.archivesDir, val)), - ); - - // create WACZ - const waczResult = await this.awaitProcess( - child_process.spawn("wacz", createArgs, { detached: RUN_DETACHED }), - ); - - if (waczResult !== 0) { - logger.error("Error creating WACZ", { "status code": waczResult }); - logger.fatal("Unable to write WACZ successfully"); + waczOpts.description = this.params.description; } - logger.debug(`WACZ successfully generated and saved to: ${waczPath}`); - - // Verify WACZ - /* - const validateArgs = ["validate"]; - validateArgs.push("-f"); - validateArgs.push(waczPath); + try { + const wacz = new WACZ(waczOpts, this.collDir); + if (!streaming) { + await wacz.generateToFile(waczPath); + } - const waczVerifyResult = await this.awaitProcess(child_process.spawn("wacz", validateArgs)); + if (this.storage) { + await this.crawlState.setStatus("uploading-wacz"); + const filename = process.env.STORE_FILENAME || "@ts-@id.wacz"; + const targetFilename = interpolateFilename(filename, this.crawlId); - if (waczVerifyResult !== 0) { - console.log("validate", waczVerifyResult); - logger.fatal("Unable to verify WACZ created successfully"); - } -*/ - if (this.storage) { - await this.crawlState.setStatus("uploading-wacz"); - const filename = process.env.STORE_FILENAME || "@ts-@id.wacz"; - const targetFilename = interpolateFilename(filename, this.crawlId); + await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished); + return true; + } - await this.storage.uploadCollWACZ(waczPath, targetFilename, isFinished); - return true; + return false; + } catch (e) { + logger.error("Error creating WACZ", e); + if (!streaming) { + logger.fatal("Unable to write WACZ successfully"); + } } - - return false; - } - - awaitProcess(proc: ChildProcess) { - const stdout: string[] = []; - const stderr: string[] = []; - - proc.stdout!.on("data", (data) => { - stdout.push(data.toString()); - }); - - proc.stderr!.on("data", (data) => { - stderr.push(data.toString()); - }); - - return new Promise((resolve) => { - proc.on("close", (code) => { - if (stdout.length) { - logger.debug(stdout.join("\n")); - } - if (stderr.length && this.params.logging.includes("debug")) { - logger.debug(stderr.join("\n")); - } - resolve(code); - }); - }); } logMemory() { @@ -2604,7 +2558,7 @@ self.__bx_behaviors.selectMainBehavior(); return new WARCWriter({ archivesDir: this.archivesDir, - tempCdxDir: this.tempCdxDir, + warcCdxDir: this.warcCdxDir, filenameTemplate, rolloverSize: this.params.rolloverSize, gzip, diff --git a/src/util/argParser.ts b/src/util/argParser.ts index 2f43a49f5..414d9f7d3 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -201,7 +201,7 @@ class ArgParser { generateWACZ: { alias: ["generatewacz", "generateWacz"], - describe: "If set, generate wacz", + describe: "If set, generate WACZ on disk", type: "boolean", default: false, }, diff --git a/src/util/logger.ts b/src/util/logger.ts index 5ecd8c5c8..d33a6256a 100644 --- a/src/util/logger.ts +++ b/src/util/logger.ts @@ -51,6 +51,7 @@ export const LOG_CONTEXT_TYPES = [ "crawlStatus", "links", "sitemap", + "wacz", "replay", "proxy", ] as const; diff --git a/src/util/storage.ts b/src/util/storage.ts index 796b20a34..73f8c88de 100644 --- a/src/util/storage.ts +++ b/src/util/storage.ts @@ -14,6 +14,8 @@ import { logger } from "./logger.js"; // @ts-expect-error (incorrect types on get-folder-size) import getFolderSize from "get-folder-size"; +import { WACZ } from "./wacz.js"; + const DEFAULT_REGION = "us-east-1"; // =========================================================================== @@ -81,6 +83,32 @@ export class S3StorageSync { this.webhookUrl = webhookUrl; } + async uploadStreamingWACZ(wacz: WACZ, targetFilename: string) { + const fileUploadInfo = { + bucket: this.bucketName, + crawlId: this.crawlId, + prefix: this.objectPrefix, + targetFilename, + }; + logger.info("S3 file upload information", fileUploadInfo, "storage"); + + const waczStream = wacz.generate(); + + await this.client.putObject( + this.bucketName, + this.objectPrefix + targetFilename, + waczStream, + ); + + const hash = wacz.getHash(); + const path = targetFilename; + + const size = wacz.getSize(); + + // for backwards compatibility, keep 'bytes' + return { path, size, hash, bytes: size }; + } + async uploadFile(srcFilename: string, targetFilename: string) { const fileUploadInfo = { bucket: this.bucketName, @@ -114,11 +142,15 @@ export class S3StorageSync { } async uploadCollWACZ( - srcFilename: string, + srcOrWACZ: string | WACZ, targetFilename: string, completed = true, ) { - const resource = await this.uploadFile(srcFilename, targetFilename); + const resource = + typeof srcOrWACZ === "string" + ? await this.uploadFile(srcOrWACZ, targetFilename) + : await this.uploadStreamingWACZ(srcOrWACZ, targetFilename); + logger.info( "WACZ S3 file upload resource", { targetFilename, resource }, @@ -191,7 +223,7 @@ export async function getFileSize(filename: string) { return stats.size; } -export async function getDirSize(dir: string) { +export async function getDirSize(dir: string): Promise { const { size, errors } = await getFolderSize(dir); if (errors && errors.length) { logger.warn("Size check errors", { errors }, "storage"); @@ -234,10 +266,15 @@ export async function checkDiskUtilization( const kbTotal = parseInt(diskUsage["1K-blocks"]); let kbArchiveDirSize = Math.round(archiveDirSize / 1024); - if (params.combineWARC && params.generateWACZ) { - kbArchiveDirSize *= 4; - } else if (params.combineWARC || params.generateWACZ) { - kbArchiveDirSize *= 2; + + // assume if has STORE_ENDPOINT_URL, will be uploading to remote + // and not storing local copy of either WACZ or WARC + if (!process.env.STORE_ENDPOINT_URL) { + if (params.combineWARC && params.generateWACZ) { + kbArchiveDirSize *= 4; + } else if (params.combineWARC || params.generateWACZ) { + kbArchiveDirSize *= 2; + } } const projectedTotal = kbUsed + kbArchiveDirSize; diff --git a/src/util/wacz.ts b/src/util/wacz.ts new file mode 100644 index 000000000..fcf4eabcd --- /dev/null +++ b/src/util/wacz.ts @@ -0,0 +1,429 @@ +import path, { basename } from "node:path"; +import fs from "node:fs"; +import fsp from "node:fs/promises"; +import { Writable, Readable } from "node:stream"; +import { pipeline } from "node:stream/promises"; +import readline from "node:readline"; +import child_process from "node:child_process"; + +import { createHash, Hash } from "node:crypto"; + +import { gzip } from "node:zlib"; + +import { ReadableStream } from "node:stream/web"; + +import { makeZip, InputWithoutMeta } from "client-zip"; +import { logger, formatErr } from "./logger.js"; +import { streamFinish } from "./warcwriter.js"; +import { getDirSize } from "./storage.js"; + +const DATAPACKAGE_JSON = "datapackage.json"; +const DATAPACKAGE_DIGEST_JSON = "datapackage-digest.json"; + +const INDEX_CDXJ = "index.cdxj"; +const INDEX_IDX = "index.idx"; +const INDEX_CDX_GZ = "index.cdx.gz"; + +const LINES_PER_BLOCK = 256; + +const ZIP_CDX_MIN_SIZE = 50_000; + +// ============================================================================ +export type WACZInitOpts = { + input: string[]; + output: string; + pages: string; + warcCdxDir: string; + indexesDir: string; + logDirectory: string; + + softwareString: string; + + signingUrl?: string; + signingToken?: string; + title?: string; + description?: string; +}; + +export type WACZResourceEntry = { + name: string; + path: string; + hash: string; + bytes: number; +}; + +export type WACZDataPackage = { + resources: WACZResourceEntry[]; + created: string; + wacz_version: string; + software: string; + title?: string; + description?: string; +}; + +type WACZDigest = { + path: string; + hash: string; + signedData?: string; +}; + +class CurrZipFileMarker extends Uint8Array { + // empty array to mark start of WACZ file, also track metadata per-file + filename: string; + zipPath: string; + size: number; + hasher: Hash; + + constructor(filename: string, zipPath: string, size: number) { + super(); + this.filename = filename; + this.zipPath = zipPath; + this.size = size; + this.hasher = createHash("sha256"); + } +} + +class EndOfZipFileMarker extends Uint8Array { + // empty array to mark end of WACZ file +} + +// ============================================================================ +export class WACZ { + collDir: string; + + warcs: string[]; + + pagesDir: string; + logsDir: string; + warcCdxDir: string; + indexesDir: string; + + datapackage: WACZDataPackage; + + signingUrl: string | null; + signingToken: string | null; + + private size = 0; + private hash: string = ""; + + constructor(config: WACZInitOpts, collDir: string) { + this.warcs = config.input; + this.pagesDir = config.pages; + this.logsDir = config.logDirectory; + this.warcCdxDir = config.warcCdxDir; + this.collDir = collDir; + this.indexesDir = config.indexesDir; + + this.datapackage = { + resources: [], + // drop microseconds + created: new Date().toISOString().split(".", 1)[0] + "Z", + wacz_version: "1.1.1", + software: config.softwareString, + }; + + if (config.title) { + this.datapackage.title = config.title; + } + if (config.description) { + this.datapackage.description = config.description; + } + + this.signingUrl = config.signingUrl || null; + this.signingToken = config.signingToken || null; + } + + generate(): Readable { + const files = [ + ...this.warcs, + ...addDirFiles(this.indexesDir), + ...addDirFiles(this.pagesDir), + ...addDirFiles(this.logsDir), + ]; + + const zip = makeZip( + this.iterDirForZip(files), + ) as ReadableStream; + + const hasher = createHash("sha256"); + const resources = this.datapackage.resources; + + let size = 0; + + async function* iterWACZ(wacz: WACZ): AsyncIterable { + let currFile: CurrZipFileMarker | null = null; + + for await (const chunk of zip) { + if (chunk instanceof CurrZipFileMarker) { + currFile = chunk; + } else if (chunk instanceof EndOfZipFileMarker) { + if (currFile) { + // Frictionless data validation requires this to be lowercase + const name = basename(currFile.filename).toLowerCase(); + const path = currFile.zipPath; + const bytes = currFile.size; + const hash = "sha256:" + currFile.hasher.digest("hex"); + resources.push({ name, path, bytes, hash }); + logger.debug("Added file to WACZ", { path, bytes, hash }, "wacz"); + } + currFile = null; + } else { + yield chunk; + if (currFile) { + currFile.hasher.update(chunk); + } + hasher.update(chunk); + size += chunk.length; + } + } + + wacz.hash = hasher.digest("hex"); + wacz.size = size; + } + + return Readable.from(iterWACZ(this)); + } + + getHash() { + return this.hash; + } + + getSize() { + return this.size; + } + + async generateToFile(filename: string) { + await pipeline(this.generate(), fs.createWriteStream(filename)); + } + + async *iterDirForZip(files: string[]): AsyncGenerator { + const encoder = new TextEncoder(); + const end = new EndOfZipFileMarker(); + + async function* wrapMarkers( + start: CurrZipFileMarker, + iter: AsyncIterable, + ) { + yield start; + yield* iter; + yield end; + } + + async function* getData(data: Uint8Array) { + yield data; + } + + for (const filename of files) { + const input = fs.createReadStream(filename); + + const stat = await fsp.stat(filename); + const lastModified = stat.mtime; + const size = stat.size; + + const nameStr = filename.slice(this.collDir.length + 1); + const name = encoder.encode(nameStr); + + const currFile = new CurrZipFileMarker(filename, nameStr, size); + + yield { input: wrapMarkers(currFile, input), lastModified, name, size }; + } + + // datapackage.json + + const datapackageData = encoder.encode( + JSON.stringify(this.datapackage, null, 2), + ); + + yield { + input: getData(datapackageData), + lastModified: new Date(), + name: DATAPACKAGE_JSON, + size: datapackageData.length, + }; + + const hash = + "sha256:" + createHash("sha256").update(datapackageData).digest("hex"); + + // datapackage-digest.json + + const digest: WACZDigest = { + path: DATAPACKAGE_JSON, + hash, + }; + + // Get Signature + if (this.signingUrl) { + const body = JSON.stringify({ + hash, + created: this.datapackage.created, + }); + + const headers: Record = { + "Content-Type": "application/json", + }; + + if (this.signingToken) { + headers["Authorization"] = this.signingToken; + } + + try { + const response = await fetch(this.signingUrl, { + method: "POST", + headers, + body, + }); + digest.signedData = await response.json(); + } catch (e) { + logger.warn( + "Failed to sign WACZ, continuing w/o signature", + { ...formatErr(e) }, + "wacz", + ); + } + } + + const digestData = encoder.encode(JSON.stringify(digest, null, 2)); + + yield { + input: getData(digestData), + lastModified: new Date(), + name: DATAPACKAGE_DIGEST_JSON, + size: digestData.length, + }; + } +} + +// Merge CDX +export function addDirFiles(fullDir: string): string[] { + const files = fs.readdirSync(fullDir); + return files.map((name) => path.join(fullDir, name)); +} + +export async function mergeCDXJ( + warcCdxDir: string, + indexesDir: string, + zipped: boolean | null = null, +) { + async function* readLinesFrom(stdout: Readable): AsyncGenerator { + for await (const line of readline.createInterface({ input: stdout })) { + yield line + "\n"; + } + } + + async function* generateCompressed( + reader: AsyncGenerator, + idxFile: Writable, + ) { + let offset = 0; + + const encoder = new TextEncoder(); + + const filename = INDEX_CDX_GZ; + + let cdxLines: string[] = []; + + let key = ""; + let count = 0; + + idxFile.write( + `!meta 0 ${JSON.stringify({ + format: "cdxj-gzip-1.0", + filename: INDEX_CDX_GZ, + })}\n`, + ); + + const finishChunk = async () => { + const compressed = await new Promise((resolve) => { + gzip(encoder.encode(cdxLines.join("")), (_, result) => { + if (result) { + resolve(result); + } + }); + }); + + const length = compressed.length; + const digest = + "sha256:" + createHash("sha256").update(compressed).digest("hex"); + + const idx = + key + " " + JSON.stringify({ offset, length, digest, filename }); + + idxFile.write(idx + "\n"); + + offset += length; + + count = 1; + key = ""; + cdxLines = []; + + return compressed; + }; + + for await (const cdx of reader) { + if (!key) { + key = cdx.split(" {", 1)[0]; + } + + if (++count === LINES_PER_BLOCK) { + yield await finishChunk(); + } + cdxLines.push(cdx); + } + + if (key) { + yield await finishChunk(); + } + } + + await fsp.mkdir(indexesDir, { recursive: true }); + + const removeIndexFile = async (filename: string) => { + try { + await fsp.unlink(path.join(indexesDir, filename)); + } catch (e) { + // ignore + } + }; + + const cdxFiles = addDirFiles(warcCdxDir); + + if (!cdxFiles.length) { + logger.info("No CDXJ files to merge"); + return; + } + + if (zipped === null) { + const tempCdxSize = await getDirSize(warcCdxDir); + + // if CDX size is at least this size, use compressed version + zipped = tempCdxSize >= ZIP_CDX_MIN_SIZE; + } + + const proc = child_process.spawn("sort", cdxFiles, { + env: { LC_ALL: "C" }, + }); + + if (!zipped) { + const output = fs.createWriteStream(path.join(indexesDir, INDEX_CDXJ)); + + await pipeline(Readable.from(readLinesFrom(proc.stdout)), output); + + await removeIndexFile(INDEX_IDX); + await removeIndexFile(INDEX_CDX_GZ); + } else { + const output = fs.createWriteStream(path.join(indexesDir, INDEX_CDX_GZ)); + + const outputIdx = fs.createWriteStream(path.join(indexesDir, INDEX_IDX), { + encoding: "utf-8", + }); + + await pipeline( + Readable.from(generateCompressed(readLinesFrom(proc.stdout), outputIdx)), + output, + ); + + await streamFinish(outputIdx); + + await removeIndexFile(INDEX_CDXJ); + } +} diff --git a/src/util/warcwriter.ts b/src/util/warcwriter.ts index 2a14552f7..2cd62a89b 100644 --- a/src/util/warcwriter.ts +++ b/src/util/warcwriter.ts @@ -24,7 +24,7 @@ export type ResourceRecordData = { // ================================================================= export class WARCWriter implements IndexerOffsetLength { archivesDir: string; - tempCdxDir?: string; + warcCdxDir?: string; filenameTemplate: string; filename?: string; gzip: boolean; @@ -45,23 +45,21 @@ export class WARCWriter implements IndexerOffsetLength { constructor({ archivesDir, - tempCdxDir, + warcCdxDir, filenameTemplate, rolloverSize = DEFAULT_ROLLOVER_SIZE, gzip, logDetails, }: { archivesDir: string; - tempCdxDir?: string; + warcCdxDir?: string; filenameTemplate: string; rolloverSize?: number; gzip: boolean; logDetails: Record; }) { this.archivesDir = archivesDir; - this.tempCdxDir = tempCdxDir; - // for now, disabling CDX - this.tempCdxDir = undefined; + this.warcCdxDir = warcCdxDir; this.logDetails = logDetails; this.gzip = gzip; this.rolloverSize = rolloverSize; @@ -77,7 +75,7 @@ export class WARCWriter implements IndexerOffsetLength { this.offset = 0; this.recordLength = 0; - if (this.tempCdxDir) { + if (this.warcCdxDir) { this.indexer = new CDXIndexer({ format: "cdxj" }); } @@ -112,14 +110,19 @@ export class WARCWriter implements IndexerOffsetLength { flags: "a", }); } - if (!this.cdxFH && this.tempCdxDir) { + if (!this.cdxFH && this.warcCdxDir) { this.cdxFH = fs.createWriteStream( - path.join(this.tempCdxDir, this.filename + ".cdx"), + path.join(this.warcCdxDir, this.filename + ".cdx"), { flags: "a" }, ); } - fh.write(await createWARCInfo(this.filename)); + const buffer = await createWARCInfo(this.filename); + fh.write(buffer); + + // account for size of warcinfo record, (don't index as warcinfo never added to cdx) + this.recordLength = buffer.length; + this.offset += buffer.length; return fh; } diff --git a/tests/basic_crawl.test.js b/tests/basic_crawl.test.js index 89c8bd5d8..6fc00af66 100644 --- a/tests/basic_crawl.test.js +++ b/tests/basic_crawl.test.js @@ -3,17 +3,22 @@ import fs from "fs"; import path from "path"; import md5 from "md5"; +const doValidate = process.argv.filter((x) => x.startsWith('-validate'))[0]; +const testIf = (condition, ...args) => condition ? test(...args) : test.skip(...args); + test("ensure basic crawl run with docker run passes", async () => { child_process.execSync( 'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --warcPrefix custom-prefix', ); child_process.execSync( - "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz", + "unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz", ); +}); +testIf(doValidate, "validate wacz", () => { child_process.execSync( - "unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz", + "wacz validate --file ./test-crawls/collections/wr-net/wr-net.wacz", ); }); diff --git a/tests/mult_url_crawl_with_favicon.test.js b/tests/mult_url_crawl_with_favicon.test.js index c477623a5..8be9e1bfc 100644 --- a/tests/mult_url_crawl_with_favicon.test.js +++ b/tests/mult_url_crawl_with_favicon.test.js @@ -1,13 +1,18 @@ import child_process from "child_process"; import fs from "fs"; +const doValidate = process.argv.filter((x) => x.startsWith('-validate'))[0]; +const testIf = (condition, ...args) => condition ? test(...args) : test.skip(...args); + test("ensure multi url crawl run with docker run passes", async () => { child_process.execSync( 'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --pages 2 --limit 2', ); +}); +testIf(doValidate, "validate multi url crawl wacz", () => { child_process.execSync( - "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/advanced/advanced.wacz", + "wacz validate --file ./test-crawls/collections/advanced/advanced.wacz", ); }); diff --git a/tests/pageinfo-records.test.js b/tests/pageinfo-records.test.js index 60da85101..0221e697f 100644 --- a/tests/pageinfo-records.test.js +++ b/tests/pageinfo-records.test.js @@ -127,7 +127,7 @@ function validateResourcesIndex(json) { mime: "image/vnd.microsoft.icon", type: "other", }, - "https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2F&d=webrecorder.net": + "https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2F&d=webrecorder.net&r=null": { status: 202, mime: "text/plain", type: "xhr" }, }); } @@ -172,7 +172,7 @@ function validateResourcesAbout(json) { { status: 200, mime: "font/woff2", type: "font" }, "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": { status: 200, mime: "font/woff2", type: "font" }, - "https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2Fabout&d=webrecorder.net": + "https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2Fabout&d=webrecorder.net&r=null": { status: 0, type: "xhr", diff --git a/tests/redis_crawl_state.js b/tests/redis_crawl_state.js deleted file mode 100644 index 2bc7767ab..000000000 --- a/tests/redis_crawl_state.js +++ /dev/null @@ -1,19 +0,0 @@ -import child_process from "child_process"; - -test("ensure crawl run with redis passes", async () => { - const redis = child_process.spawn( - "docker run -d --name test-crawl-redis -p 6379:6379 redis", - ); - - child_process.execSync( - "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection redis-crawl --redisStoreUrl redis://127.0.0.1:6379 --workers 2", - ); - - redis.kill("SIGINT"); -}); - -test("check that wacz created is valid", () => { - child_process.execSync( - "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/redis-crawl/redis-crawl.wacz", - ); -}); diff --git a/tsconfig.json b/tsconfig.json index 20d5422a1..1ca75332e 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -102,7 +102,7 @@ /* Completeness */ // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ - "skipLibCheck": true /* Skip type checking all .d.ts files. */ + "skipLibCheck": false /* Skip type checking all .d.ts files. */ }, "include": ["src/**/*"] diff --git a/yarn.lock b/yarn.lock index 85f7ed8a9..2e276aecb 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1148,6 +1148,11 @@ dependencies: undici-types "~5.25.1" +"@types/pako@^1.0.7": + version "1.0.7" + resolved "https://registry.yarnpkg.com/@types/pako/-/pako-1.0.7.tgz#aa0e4af9855d81153a29ff84cc44cce25298eda9" + integrity sha512-YBtzT2ztNF6R/9+UXj2wTGFnC9NklAnASt3sC0h2m1bbH7G6FyBIkt4AN8ThZpNfxUo1b2iMVO0UawiJymEt8A== + "@types/pixelmatch@^5.2.6": version "5.2.6" resolved "https://registry.yarnpkg.com/@types/pixelmatch/-/pixelmatch-5.2.6.tgz#fba6de304ac958495f27d85989f5c6bb7499a686" @@ -1179,6 +1184,13 @@ resolved "https://registry.yarnpkg.com/@types/stack-utils/-/stack-utils-2.0.0.tgz#7036640b4e21cc2f259ae826ce843d277dad8cff" integrity sha512-RJJrrySY7A8havqpGObOB4W92QXKJo63/jFLLgpvOtsGUqbQZ9Sbgl35KMm1DjC6j7AvmmU2bIno+3IyEaemaw== +"@types/stream-buffers@^3.0.7": + version "3.0.7" + resolved "https://registry.yarnpkg.com/@types/stream-buffers/-/stream-buffers-3.0.7.tgz#0b719fa1bd2ca2cc0908205a440e5e569e1aa21e" + integrity sha512-azOCy05sXVXrO+qklf0c/B07H/oHaIuDDAiHPVwlk3A9Ek+ksHyTeMajLZl3r76FxpPpxem//4Te61G1iW3Giw== + dependencies: + "@types/node" "*" + "@types/uuid@^9.0.6": version "9.0.6" resolved "https://registry.yarnpkg.com/@types/uuid/-/uuid-9.0.6.tgz#c91ae743d8344a54b2b0c691195f5ff5265f6dfb" @@ -1300,15 +1312,15 @@ resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406" integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ== -"@webrecorder/wabac@^2.19.7": - version "2.19.7" - resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.7.tgz#3afe48f79752bcd189cffd5d5e6a8dbe4f394053" - integrity sha512-X9UFxWCww1KWDnAaEjg7vpg6SznBov5a88FPxbOvo5yCT/UkJcQHaa0qo1L52l46sIAUnSbsYz1ur9yMd6ygVA== +"@webrecorder/wabac@^2.19.8": + version "2.19.8" + resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.8.tgz#302ade200953a1c76f0b355983ae4081428fb933" + integrity sha512-WjyfsGK8JWKeeDsGrOIT8ZLjMcOOAN93OMnRLO214jSV18SHEOY4JRvXzFOLF+OWYC5kJIMjl05gurTLq18jOA== dependencies: "@peculiar/asn1-ecc" "^2.3.4" "@peculiar/asn1-schema" "^2.3.3" "@peculiar/x509" "^1.9.2" - "@webrecorder/wombat" "^3.7.14" + "@webrecorder/wombat" "^3.8.0" acorn "^8.10.0" auto-js-ipfs "^2.1.1" base64-js "^1.5.1" @@ -1327,14 +1339,14 @@ path-parser "^6.1.0" process "^0.11.10" stream-browserify "^3.0.0" - warcio "^2.2.1" + warcio "^2.3.0" -"@webrecorder/wombat@^3.7.14": - version "3.7.14" - resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.14.tgz#3779e4cadb256755bbbfd2960805965ec4daacd8" - integrity sha512-sDNH+c8WstQrK91y8kIPJh1XAC2WXLU5rC8wztANzK1mVzA7v6XB5gk3Yp7OIAn4bn1XuGRVjubhKhmxVVZ9kg== +"@webrecorder/wombat@^3.8.0": + version "3.8.0" + resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.0.tgz#63ed3df199f11223b23c9ce66202590b8511ae2a" + integrity sha512-MpzNu9+ClCHjOER9XCrsEIsJk15L6qGO+PxeBPiOtaFJmNUiz0auMT5AQwiPqJgKEAniZTlPx1O4kNCVJu9f2Q== dependencies: - warcio "^2.2.0" + warcio "^2.3.0" "@zxing/text-encoding@0.9.0": version "0.9.0" @@ -1380,6 +1392,11 @@ ansi-escapes@^4.2.1: dependencies: type-fest "^0.21.3" +ansi-regex@^4.1.0: + version "4.1.1" + resolved "https://registry.yarnpkg.com/ansi-regex/-/ansi-regex-4.1.1.tgz#164daac87ab2d6f6db3a29875e2d1766582dabed" + integrity sha512-ILlv4k/3f6vfQ4OoP2AGvirOktlQ98ZEL1k9FaQjxa3L1abBgbuTDAdPOpvbGncC0BTVQrl+OM8xZGK6tWXt7g== + ansi-regex@^5.0.0: version "5.0.0" resolved "https://registry.yarnpkg.com/ansi-regex/-/ansi-regex-5.0.0.tgz#388539f55179bf39339c81af30a654d69f87cb75" @@ -1820,6 +1837,11 @@ cjs-module-lexer@^1.0.0: resolved "https://registry.yarnpkg.com/cjs-module-lexer/-/cjs-module-lexer-1.2.2.tgz#9f84ba3244a512f3a54e5277e8eef4c489864e40" integrity sha512-cOU9usZw8/dXIXKtwa8pM0OTJQuJkxMN6w30csNRUerHfeQ5R6U3kkU/FtJeIf3M202OHfY2U8ccInBG7/xogA== +client-zip@^2.4.5: + version "2.4.5" + resolved "https://registry.yarnpkg.com/client-zip/-/client-zip-2.4.5.tgz#c9b6190abca57b8b4d6dcfd21c3a1f4d4ab3bc68" + integrity sha512-4y4d5ZeTH/szIAMQeC8ju67pxtvj+3u20wMGwOFrZk+pegy3aSEA2JkwgC8XVDTXP/Iqn1gyqNQXmkyBp4KLEQ== + cliui@^8.0.1: version "8.0.1" resolved "https://registry.yarnpkg.com/cliui/-/cliui-8.0.1.tgz#0c04b075db02cbfe60dc8e6cf2f5486b1a3608aa" @@ -4816,16 +4838,16 @@ string-length@^4.0.1: char-regex "^1.0.2" strip-ansi "^6.0.0" -string-width@^4.1.0, string-width@^4.2.0: - version "4.2.2" - resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.2.tgz#dafd4f9559a7585cfba529c6a0a4f73488ebd4c5" - integrity sha512-XBJbT3N4JhVumXE0eoLU9DCjcaF92KLNqTmFCnG1pf8duUxFGwtP6AD6nkjw9a3IdiRtL3E2w3JDiE/xi3vOeA== +string-width@^4.1.0: + version "4.1.0" + resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.1.0.tgz#ba846d1daa97c3c596155308063e075ed1c99aff" + integrity sha512-NrX+1dVVh+6Y9dnQ19pR0pP4FiEIlUvdTGn8pw6CKTNq5sgib2nIhmUNT5TAmhWmvKr3WcxBcP3E8nWezuipuQ== dependencies: emoji-regex "^8.0.0" is-fullwidth-code-point "^3.0.0" - strip-ansi "^6.0.0" + strip-ansi "^5.2.0" -string-width@^4.2.3: +string-width@^4.2.0, string-width@^4.2.3: version "4.2.3" resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010" integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g== @@ -4870,6 +4892,13 @@ string_decoder@^1.1.1: dependencies: safe-buffer "~5.2.0" +strip-ansi@^5.2.0: + version "5.2.0" + resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-5.2.0.tgz#8c9a536feb6afc962bdfa5b104a5091c1ad9c0ae" + integrity sha512-DuRs1gKbBqsMKIZlrffwlug8MHkcnpjs5VPmL1PAh+mA30U0DTotfDZ0d2UUsXpPmPmMMJ6W773MaA3J+lbiWA== + dependencies: + ansi-regex "^4.1.0" + strip-ansi@^6.0.0: version "6.0.0" resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.0.tgz#0b1571dd7669ccd4f3e06e14ef1eed26225ae532" @@ -4878,6 +4907,7 @@ strip-ansi@^6.0.0: ansi-regex "^5.0.0" strip-ansi@^6.0.1: + name strip-ansi-cjs version "6.0.1" resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9" integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A== @@ -5247,11 +5277,13 @@ walker@^1.0.8: dependencies: makeerror "1.0.12" -warcio@^2.2.0, warcio@^2.2.1: - version "2.2.1" - resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.2.1.tgz#3619728fde716291c9b364744c276362a94bacec" - integrity sha512-KPLoz3aFtdTjexG+QQaubMyuLiNANzvcadGMyNKdpcmhl0k6lBHQQVpxZw3Hx9+4pbyqDXyiF4cr/h2tS8kvcw== +warcio@^2.3.0: + version "2.3.0" + resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.3.0.tgz#a655df9b5986a53e5d05aa68cda51bfefdfa8347" + integrity sha512-PCHcZ/fDE5+QECOFe/n/vzyDmAITJ1mvLx1jVONJ0uaV9OwcTbIWoh7Z0+OQwQdq8Wr1Nnb2hwhtHJ7J+9rHIQ== dependencies: + "@types/pako" "^1.0.7" + "@types/stream-buffers" "^3.0.7" base32-encode "^2.0.0" hash-wasm "^4.9.0" pako "^1.0.11" @@ -5305,7 +5337,7 @@ which@^2.0.1: dependencies: isexe "^2.0.0" -wrap-ansi@^7.0.0: +wrap-ansi@7.0.0, wrap-ansi@^7.0.0: version "7.0.0" resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43" integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==