Skip to content

Commit

Permalink
feat(platform): support configurable userAgent header via `SECUTILS_WEB_SCRAPER_USER_AGENT` envvar
Browse files Browse the repository at this point in the history
  • Loading branch information
azasypkin committed Jan 14, 2024
1 parent 47aeda2 commit 030c8d9
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 10 deletions.
12 changes: 10 additions & 2 deletions src/api/web_page/content/get.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import type { Browser } from 'playwright/index.js';

import { registerWebPageContentGetRoutes } from './get.js';
import type { WebPageContext } from './web_page_context.js';
import { configure } from '../../../config.js';
import {
createBrowserContextMock,
createBrowserMock,
Expand Down Expand Up @@ -133,7 +134,10 @@ await test('[/api/web_page/content] can inject content extractor', async (t) =>

const browserMock = createBrowserMock(browserContextMock);
const response = await registerWebPageContentGetRoutes(
createMock({ browser: browserMock as unknown as Browser }),
createMock({
browser: browserMock as unknown as Browser,
config: { ...configure(), userAgent: 'secutils/1.0.0' },
}),
).inject({
method: 'POST',
url: '/api/web_page/content',
Expand Down Expand Up @@ -162,7 +166,11 @@ await test('[/api/web_page/content] can inject content extractor', async (t) =>

assert.strictEqual(browserMock.newContext.mock.callCount(), 1);
assert.deepEqual(browserMock.newContext.mock.calls[0].arguments, [
{ extraHTTPHeaders: { Cookie: 'my-cookie' }, bypassCSP: false },
{
extraHTTPHeaders: { Cookie: 'my-cookie' },
bypassCSP: false,
userAgent: 'secutils/1.0.0',
},
]);
assert.strictEqual(browserContextMock.newPage.mock.callCount(), 1);

Expand Down
12 changes: 9 additions & 3 deletions src/api/web_page/content/get.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import jsBeautify from 'js-beautify';
import type { Browser, JSHandle, Page, Response } from 'playwright';

import type { WebPageContext } from './web_page_context.js';
import type { Config } from '../../../config.js';
import { createObjectHash } from '../../../utilities/index.js';
import type { ApiResult } from '../../api_result.js';
import type { ApiRouteParams } from '../../api_route_params.js';
Expand Down Expand Up @@ -72,7 +73,7 @@ interface OutputBodyType {
content: string;
}

export function registerWebPageContentGetRoutes({ server, cache, acquireBrowser }: ApiRouteParams) {
export function registerWebPageContentGetRoutes({ server, cache, acquireBrowser, config }: ApiRouteParams) {
return server.post<{ Body: InputBodyParamsType }>(
'/api/web_page/content',
{
Expand Down Expand Up @@ -111,7 +112,7 @@ export function registerWebPageContentGetRoutes({ server, cache, acquireBrowser
const browser = await acquireBrowser();
const log = server.log.child({ provider: 'web_page_content_get' });
try {
const result = await getContent(browser, log, request.body);
const result = await getContent(config, browser, log, request.body);
if (result.type === 'client-error') {
log.error(`Cannot retrieve content for page "${request.body.url}" due to client error: ${result.error}`);
await Diagnostics.screenshot(log, browser);
Expand All @@ -135,6 +136,7 @@ export function registerWebPageContentGetRoutes({ server, cache, acquireBrowser
}

async function getContent(
config: Config,
browser: Browser,
log: FastifyBaseLogger,
{
Expand All @@ -147,7 +149,11 @@ async function getContent(
headers,
}: InputBodyParamsType,
): Promise<ApiResult<OutputBodyType>> {
const context = await browser.newContext({ extraHTTPHeaders: headers, bypassCSP: false });
const context = await browser.newContext({
extraHTTPHeaders: headers,
bypassCSP: false,
userAgent: config.userAgent,
});
const page = await context.newPage();

// Disable browser cache.
Expand Down
8 changes: 6 additions & 2 deletions src/api/web_page/resources/list.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import type { Browser } from 'playwright/index.js';

import type { WebPageResourceWithRawData } from './list.js';
import { registerWebPageResourcesListRoutes } from './list.js';
import { configure } from '../../../config.js';
import {
createBrowserContextMock,
createBrowserMock,
Expand Down Expand Up @@ -275,7 +276,10 @@ await test('[/api/web_page/resources] can inject resource filters', async (t) =>

const browserMock = createBrowserMock(browserContextMock);
const response = await registerWebPageResourcesListRoutes(
createMock({ browser: browserMock as unknown as Browser }),
createMock({
browser: browserMock as unknown as Browser,
config: { ...configure(), userAgent: 'secutils/1.0.0' },
}),
).inject({
method: 'POST',
url: '/api/web_page/resources',
Expand Down Expand Up @@ -321,7 +325,7 @@ await test('[/api/web_page/resources] can inject resource filters', async (t) =>

assert.strictEqual(browserMock.newContext.mock.callCount(), 1);
assert.deepEqual(browserMock.newContext.mock.calls[0].arguments, [
{ extraHTTPHeaders: { Cookie: 'my-cookie' }, bypassCSP: false },
{ extraHTTPHeaders: { Cookie: 'my-cookie' }, bypassCSP: false, userAgent: 'secutils/1.0.0' },
]);

// Make sure we didn't wait for a selector since it wasn't specified.
Expand Down
12 changes: 9 additions & 3 deletions src/api/web_page/resources/list.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import type { FastifyBaseLogger } from 'fastify';
import type { Browser, JSHandle } from 'playwright';

import type { WebPageResource, WebPageResourceContent, WebPageResourceContentData } from './web_page_resource.js';
import type { Config } from '../../../config.js';
import { createObjectHash } from '../../../utilities/index.js';
import type { ApiResult } from '../../api_result.js';
import type { ApiRouteParams } from '../../api_route_params.js';
Expand Down Expand Up @@ -91,7 +92,7 @@ const RESOURCES_SCHEMA = {
},
};

export function registerWebPageResourcesListRoutes({ server, cache, acquireBrowser }: ApiRouteParams) {
export function registerWebPageResourcesListRoutes({ server, cache, acquireBrowser, config }: ApiRouteParams) {
return server.post<{ Body: InputBodyParamsType }>(
'/api/web_page/resources',
{
Expand Down Expand Up @@ -130,7 +131,7 @@ export function registerWebPageResourcesListRoutes({ server, cache, acquireBrows
const log = server.log.child({ provider: 'web_page_resources_list' });

try {
const result = await getResourcesList(browser, log, request.body);
const result = await getResourcesList(config, browser, log, request.body);
if (result.type === 'client-error') {
log.error(`Cannot retrieve resources for page "${request.body.url}" due to client error: ${result.error}`);
await Diagnostics.screenshot(log, browser);
Expand All @@ -154,11 +155,16 @@ export function registerWebPageResourcesListRoutes({ server, cache, acquireBrows
}

async function getResourcesList(
config: Config,
browser: Browser,
log: FastifyBaseLogger,
{ url, waitSelector, timeout = DEFAULT_TIMEOUT_MS, delay = DEFAULT_DELAY_MS, scripts, headers }: InputBodyParamsType,
): Promise<ApiResult<OutputBodyType>> {
const context = await browser.newContext({ extraHTTPHeaders: headers, bypassCSP: false });
const context = await browser.newContext({
extraHTTPHeaders: headers,
bypassCSP: false,
userAgent: config.userAgent,
});
const page = await context.newPage();

// Disable browser cache.
Expand Down
2 changes: 2 additions & 0 deletions src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ export interface Config {
port: number;
cacheTTLSec: number;
browserTTLSec: number;
userAgent?: string;
}

export function configure(): Config {
Expand All @@ -17,5 +18,6 @@ export function configure(): Config {
port: +(process.env.SECUTILS_WEB_SCRAPER_PORT ?? 0) || 7272,
cacheTTLSec: +(process.env.SECUTILS_WEB_SCRAPER_CACHE_TTL_SEC ?? 0) || 20 * 60,
browserTTLSec: +(process.env.SECUTILS_WEB_SCRAPER_BROWSER_TTL_SEC ?? 0) || 10 * 60,
userAgent: process.env.SECUTILS_WEB_SCRAPER_USER_AGENT,
};
}
1 change: 1 addition & 0 deletions src/mocks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ export function createBrowserContextMock(
return {
newCDPSession: mock.fn(() => Promise.resolve(cdpSessionMock ?? createCDPSessionMock())),
newPage: mock.fn(() => Promise.resolve(pageMock ?? createPageMock())),
close: mock.fn(),
};
}

Expand Down

0 comments on commit 030c8d9

Please sign in to comment.