Skip to content

Commit

Permalink
Autoclick Support (#729)
Browse files Browse the repository at this point in the history
Adds support for autoclick behavior:
- Adds new `autoclick` behavior option to `--behaviors`, but does not
enable it by default
- Adds support for new exposed function `__bx_addSet` which allows
autoclick behavior to persist state about links that have already been
clicked to avoid duplicates, only used if link has an href
- Adds a new `pageBlockUnload` flag on the worker state, set while the page's post-load actions are running.
- Adds an on('dialog') handler to reject onbeforeunload page navigations
while behaviors are running (page not finished), but accept them once the page is finished -
to allow navigation away only when behaviors are done
- Update to browsertrix-behaviors 0.7.0, which supports autoclick
- Add --clickSelector option to customize elements that will be clicked,
defaulting to `a`.
- Add --linkSelector as alias for --selectLinks for consistency
- Unknown options for --behaviors are now printed as warnings instead of causing a hard
exit, for forward compatibility with new behavior types in the future

Fixes #728, also #216, #665, #31
  • Loading branch information
ikreymer authored Jan 16, 2025
1 parent 8714907 commit b7150f1
Show file tree
Hide file tree
Showing 14 changed files with 259 additions and 108 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ARG BROWSER_VERSION=1.73.104
ARG BROWSER_VERSION=1.74.48
ARG BROWSER_IMAGE_BASE=webrecorder/browsertrix-browser-base:brave-${BROWSER_VERSION}

FROM ${BROWSER_IMAGE_BASE}
Expand Down Expand Up @@ -39,7 +39,7 @@ ADD config/ /app/

ADD html/ /app/html/

ARG RWP_VERSION=2.2.4
ARG RWP_VERSION=2.2.5
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz
Expand Down
1 change: 1 addition & 0 deletions behaviors.js

Large diffs are not rendered by default.

29 changes: 20 additions & 9 deletions docs/docs/user-guide/cli-options.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,14 @@ Options:
e-page-application crawling or when
different hashtags load dynamic cont
ent
--selectLinks one or more selectors for extracting
--selectLinks, --linkSelector One or more selectors for extracting
links, in the format [css selector]
->[property to use],[css selector]->
@[attribute to use]
[array] [default: ["a[href]->href"]]
--clickSelector Selector for elements to click when
using the autoclick behavior
[string] [default: "a"]
--blockRules Additional rules for blocking certai
n URLs from being loaded, by URL reg
ex and optionally via text match in
Expand All @@ -75,7 +78,8 @@ Options:
[string] [default: "crawl-@ts"]
--headless Run in headless mode, otherwise star
t xvfb [boolean] [default: false]
--driver JS driver for the crawler [string]
--driver Custom driver for the crawler, if an
y [string]
--generateCDX, --generatecdx, --gene If set, generate index (CDXJ) for us
rateCdx e with pywb after crawl is done
[boolean] [default: false]
Expand Down Expand Up @@ -142,8 +146,7 @@ Options:
o crawl working directory) [string]
--behaviors Which background behaviors to enable
on each page
[array] [choices: "autoplay", "autofetch", "autoscroll", "siteSpecific"] [defa
ult: ["autoplay","autofetch","autoscroll","siteSpecific"]]
[array] [default: ["autoplay","autofetch","autoscroll","siteSpecific"]]
--behaviorTimeout If >0, timeout (in seconds) for in-p
age behavior will run on each page.
If 0, a behavior can run until finis
Expand All @@ -163,8 +166,10 @@ Options:
hich contains the browser profile di
rectory [string]
--screenshot Screenshot options for crawler, can
include: view, thumbnail, fullPage
[array] [choices: "view", "thumbnail", "fullPage"] [default: []]
include: view, thumbnail, fullPage,
fullPageFinal
[array] [choices: "view", "thumbnail", "fullPage", "fullPageFinal"] [default:
[]]
--screencastPort If set to a non-zero value, starts a
n HTTP server with screencast access
ible on this port
Expand Down Expand Up @@ -251,9 +256,15 @@ Options:
failing due to non-200 responses
[boolean] [default: false]
--customBehaviors Custom behavior files to inject. Val
ues can be URLs, paths to individual
behavior files, or paths to a direc
tory of behavior files
id values: URL to file, path to file
, path to directory of behaviors, UR
L to Git repo of behaviors (prefixed
with git+, optionally specify branc
h and relative path to a directory w
ithin repo as branch and path query
parameters, e.g. --customBehaviors "
git+https://git.example.com/repo.git
?branch=dev&path=some/dir"
[array] [default: []]
--debugAccessRedis if set, runs internal redis without
protected mode to allow external acc
Expand Down
6 changes: 3 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.4.2",
"version": "1.5.0-beta.2",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
Expand All @@ -18,7 +18,7 @@
"dependencies": {
"@novnc/novnc": "1.4.0",
"@webrecorder/wabac": "^2.20.8",
"browsertrix-behaviors": "^0.6.6",
"browsertrix-behaviors": "^0.7.0",
"client-zip": "^2.4.5",
"css-selector-parser": "^3.0.5",
"fetch-socks": "^1.3.0",
Expand All @@ -31,7 +31,7 @@
"p-queue": "^7.3.4",
"pixelmatch": "^5.3.0",
"pngjs": "^7.0.0",
"puppeteer-core": "^23.7.1",
"puppeteer-core": "^24.1.0",
"sax": "^1.3.0",
"sharp": "^0.32.6",
"tsc": "^2.0.4",
Expand Down
106 changes: 91 additions & 15 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,7 @@ import { ScreenCaster, WSTransport } from "./util/screencaster.js";
import { Screenshots } from "./util/screenshots.js";
import { initRedis } from "./util/redis.js";
import { logger, formatErr, LogDetails } from "./util/logger.js";
import {
WorkerOpts,
WorkerState,
closeWorkers,
runWorkers,
} from "./util/worker.js";
import { WorkerState, closeWorkers, runWorkers } from "./util/worker.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";

Expand Down Expand Up @@ -689,14 +684,9 @@ export class Crawler {
return !!seed.isIncluded(url, depth, extraHops, logDetails);
}

async setupPage({
page,
cdp,
workerid,
callbacks,
recorder,
frameIdToExecId,
}: WorkerOpts) {
async setupPage(opts: WorkerState) {
const { page, cdp, workerid, callbacks, frameIdToExecId, recorder } = opts;

await this.browser.setupPage({ page, cdp });

await this.setupExecContextEvents(cdp, frameIdToExecId);
Expand Down Expand Up @@ -775,6 +765,87 @@ self.__bx_behaviors.selectMainBehavior();

await this.browser.addInitScript(page, initScript);
}

// Autoclick-only setup: the handlers below are registered only when the
// "autoclick" behavior was requested via --behaviors.
if (this.params.behaviors.includes("autoclick")) {
  // Ensure off-page navigation is canceled while behavior is running:
  // - "beforeunload" dialogs are dismissed while opts.pageBlockUnload is set,
  //   and accepted otherwise
  // - every other dialog type is accepted
  page.on("dialog", async (dialog) => {
    let accepted = true;
    if (dialog.type() === "beforeunload") {
      if (opts.pageBlockUnload) {
        accepted = false;
        await dialog.dismiss();
      } else {
        await dialog.accept();
      }
    } else {
      await dialog.accept();
    }
    logger.debug("JS Dialog", {
      accepted,
      blockingUnload: opts.pageBlockUnload,
      message: dialog.message(),
      type: dialog.type(),
      page: page.url(),
      workerid,
    });
  });

  // Close any windows opened during navigation from autoclick.
  // Target discovery is enabled first so Target.targetCreated events are delivered.
  await cdp.send("Target.setDiscoverTargets", { discover: true });

  cdp.on("Target.targetCreated", async (params) => {
    const { targetInfo } = params;
    const { type, openerFrameId, targetId } = targetInfo;

    try {
      // Only close "page" targets whose opener frame is tracked in
      // opts.frameIdToExecId (presumably the frames of this worker's page —
      // confirm against setupExecContextEvents); anything else is left open.
      if (
        type === "page" &&
        openerFrameId &&
        opts.frameIdToExecId.has(openerFrameId)
      ) {
        await cdp.send("Target.closeTarget", { targetId });
      } else {
        logger.warn("Extra target not closed", { targetInfo });
      }

      // Counterpart of waitForDebuggerOnStart in the Target.setAutoAttach call below
      await cdp.send("Runtime.runIfWaitingForDebugger");
    } catch (e) {
      // Best-effort: the target has likely already been closed. Log at debug
      // level instead of discarding the error so failures can be diagnosed.
      logger.debug("Error handling new target", {
        targetId,
        type,
        error: e instanceof Error ? e.message : String(e),
        workerid,
      });
    }
  });

  // Fire-and-forget (result intentionally not awaited): auto-attach to new
  // targets and have them wait for the debugger on start.
  void cdp.send("Target.setAutoAttach", {
    autoAttach: true,
    waitForDebuggerOnStart: true,
    flatten: false,
  });

  if (this.recording) {
    await cdp.send("Page.enable");

    // When recording, pass the URL of each window the page tries to open to
    // queueInScopeUrls, using the current page's seedId / depth / extraHops.
    cdp.on("Page.windowOpen", async (params) => {
      const { seedId, depth, extraHops = 0, url } = opts.data;

      const logDetails = { page: url, workerid };

      await this.queueInScopeUrls(
        seedId,
        [params.url],
        depth,
        extraHops,
        false,
        logDetails,
      );
    });
  }
}

await page.exposeFunction("__bx_addSet", (data: string) =>
this.crawlState.addToUserSet(data),
);

// await page.exposeFunction("__bx_hasSet", (data: string) => this.crawlState.hasUserSet(data));
}

async setupExecContextEvents(
Expand Down Expand Up @@ -932,6 +1003,7 @@ self.__bx_behaviors.selectMainBehavior();
}

opts.markPageUsed();
opts.pageBlockUnload = false;

if (auth) {
await page.setExtraHTTPHeaders({ Authorization: auth });
Expand All @@ -955,8 +1027,12 @@ self.__bx_behaviors.selectMainBehavior();
);
data.favicon = await this.getFavicon(page, logDetails);

opts.pageBlockUnload = true;

await this.doPostLoadActions(opts);

opts.pageBlockUnload = false;

await this.awaitPageExtraDelay(opts);
}

Expand Down Expand Up @@ -1111,7 +1187,7 @@ self.__bx_behaviors.selectMainBehavior();
}
}

async teardownPage({ workerid }: WorkerOpts) {
async teardownPage({ workerid }: WorkerState) {
if (this.screencaster) {
await this.screencaster.stopById(workerid);
}
Expand Down
22 changes: 22 additions & 0 deletions src/create-login-profile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";
import { getInfoString } from "./util/file_reader.js";
import { DISPLAY } from "./util/constants.js";
import { initProxy } from "./util/proxy.js";
//import { sleep } from "./util/timing.js";

const profileHTML = fs.readFileSync(
new URL("../html/createProfile.html", import.meta.url),
Expand Down Expand Up @@ -437,6 +438,27 @@ class InteractiveBrowser {

// attempt to keep everything to initial tab if headless
if (this.params.headless) {
void cdp.send("Target.setDiscoverTargets", { discover: true });

// Close any page target that was opened from another frame (popup / new tab),
// as part of keeping everything in the initial tab.
cdp.on("Target.targetCreated", async (params) => {
  const { targetInfo } = params;
  const { type, openerFrameId, targetId } = targetInfo;

  // A rejected cdp.send() inside this async listener would otherwise go
  // unhandled (e.g. when the target is already gone), so guard both calls.
  try {
    // only "page" targets that have an opener frame are closed
    if (type === "page" && openerFrameId) {
      await cdp.send("Target.closeTarget", { targetId });
    }

    // Counterpart of waitForDebuggerOnStart in the Target.setAutoAttach call below
    await cdp.send("Runtime.runIfWaitingForDebugger");
  } catch (e) {
    // best-effort: report and continue, the interactive session keeps running
    logger.warn("Error handling new target", e);
  }
});

void cdp.send("Target.setAutoAttach", {
autoAttach: true,
waitForDebuggerOnStart: true,
flatten: false,
});

cdp.send("Page.enable").catch((e) => logger.warn("Page.enable error", e));

cdp.on("Page.windowOpen", async (resp) => {
Expand Down
4 changes: 2 additions & 2 deletions src/replaycrawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { Crawler } from "./crawler.js";
import { ReplayServer } from "./util/replayserver.js";
import { sleep } from "./util/timing.js";
import { logger, formatErr } from "./util/logger.js";
import { WorkerOpts, WorkerState } from "./util/worker.js";
import { WorkerState } from "./util/worker.js";
import { PageState } from "./util/state.js";
import { PageInfoRecord, PageInfoValue, Recorder } from "./util/recorder.js";

Expand Down Expand Up @@ -718,7 +718,7 @@ export class ReplayCrawler extends Crawler {
return text;
}

async teardownPage(opts: WorkerOpts) {
async teardownPage(opts: WorkerState) {
const { page } = opts;
await this.processPageInfo(page);
await super.teardownPage(opts);
Expand Down
23 changes: 21 additions & 2 deletions src/util/argParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import {
EXTRACT_TEXT_TYPES,
SERVICE_WORKER_OPTS,
DEFAULT_SELECTORS,
BEHAVIOR_TYPES,
ExtractSelector,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
Expand Down Expand Up @@ -165,13 +166,21 @@ class ArgParser {
},

selectLinks: {
alias: "linkSelector",
describe:
"One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]",
type: "array",
default: ["a[href]->href"],
coerce,
},

clickSelector: {
describe:
"Selector for elements to click when using the autoclick behavior",
type: "string",
default: "a",
},

blockRules: {
describe:
"Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
Expand Down Expand Up @@ -351,7 +360,6 @@ class ArgParser {
describe: "Which background behaviors to enable on each page",
type: "array",
default: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
choices: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
coerce,
},

Expand Down Expand Up @@ -693,9 +701,20 @@ class ArgParser {
// background behaviors to apply
const behaviorOpts: { [key: string]: string | boolean } = {};
if (argv.behaviors.length > 0) {
argv.behaviors.forEach((x: string) => (behaviorOpts[x] = true));
argv.behaviors.forEach((x: string) => {
if (BEHAVIOR_TYPES.includes(x)) {
behaviorOpts[x] = true;
} else {
logger.warn(
"Unknown behavior specified, ignoring",
{ behavior: x },
"behavior",
);
}
});
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
behaviorOpts.startEarly = true;
behaviorOpts.clickSelector = argv.clickSelector;
argv.behaviorOpts = JSON.stringify(behaviorOpts);
} else {
argv.behaviorOpts = "";
Expand Down
Loading

0 comments on commit b7150f1

Please sign in to comment.