Skip to content

Commit

Permalink
feat: impit-based HttpClient implementation (#2787)
Browse files Browse the repository at this point in the history
Adds a new `@crawlee`-scope package, which exports an `HttpClient`
implementation based on
[`impit`](https://github.com/apify/impit).
  • Loading branch information
barjin authored Jan 22, 2025
1 parent 030886d commit 61d7ffa
Show file tree
Hide file tree
Showing 14 changed files with 541 additions and 1 deletion.
32 changes: 32 additions & 0 deletions packages/impit-client/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# @crawlee/impit-client

This package provides a Crawlee-compliant `HttpClient` interface for the [`impit`](https://www.npmjs.com/package/impit) package.

To use the `impit` package directly without Crawlee, check out [`impit`](https://www.npmjs.com/package/impit) on NPM.

## Example usage

Simply pass the `ImpitHttpClient` instance to the `httpClient` option of the crawler constructor:

```typescript
import { CheerioCrawler } from '@crawlee/cheerio';
import { ImpitHttpClient, Browser } from '@crawlee/impit-client';

const crawler = new CheerioCrawler({
    // Route all HTTP traffic of the crawler through `impit`.
    httpClient: new ImpitHttpClient({
        browser: Browser.Firefox,
        http3: true,
        ignoreTlsErrors: true,
    }),
    async requestHandler({ $, request }) {
        // Extract the title of the page.
        const title = $('title').text();
        console.log(`Title of the page ${request.url}: ${title}`);
    },
});

// `run()` returns a promise; await it so that crawl errors are not silently dropped.
await crawler.run([
    'http://www.example.com/page-1',
    'http://www.example.com/page-2',
]);
```
65 changes: 65 additions & 0 deletions packages/impit-client/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
{
"name": "@crawlee/impit-client",
"version": "3.12.1",
"description": "impit-based HTTP client implementation for Crawlee. Impersonates browser requests to avoid bot detection.",
"engines": {
"node": ">=16.0.0"
},
"main": "./dist/index.js",
"module": "./dist/index.mjs",
"types": "./dist/index.d.ts",
"exports": {
".": {
"import": "./dist/index.mjs",
"require": "./dist/index.js",
"types": "./dist/index.d.ts"
},
"./package.json": "./package.json"
},
"keywords": [
"apify",
"headless",
"chrome",
"puppeteer",
"crawler",
"scraper"
],
"author": {
"name": "Apify",
"email": "support@apify.com",
"url": "https://apify.com"
},
"contributors": [
"Jan Curn <jan@apify.com>",
"Marek Trunkat <marek@apify.com>",
"Ondra Urban <ondra@apify.com>"
],
"license": "Apache-2.0",
"repository": {
"type": "git",
"url": "git+https://github.com/apify/crawlee"
},
"bugs": {
"url": "https://github.com/apify/crawlee/issues"
},
"homepage": "https://crawlee.dev",
"scripts": {
"build": "yarn clean && yarn compile && yarn copy",
"clean": "rimraf ./dist",
"compile": "tsc -p tsconfig.build.json && gen-esm-wrapper ./dist/index.js ./dist/index.mjs",
"copy": "tsx ../../scripts/copy.ts"
},
"publishConfig": {
"access": "public"
},
"peerDependencies": {
"@crawlee/core": "^3.12.1"
},
"devDependencies": {
"@crawlee/core": "^3.12.1"
},
"dependencies": {
"impit": "^0.1.3"
},
"packageManager": "[email protected]"
}
188 changes: 188 additions & 0 deletions packages/impit-client/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
import { Readable } from 'stream';
import { isTypedArray } from 'util/types';

import type { HttpRequest, HttpResponse, ResponseTypes, StreamingHttpResponse, BaseHttpClient } from '@crawlee/core';
import { type ImpitOptions, type HttpMethod, Impit } from 'impit';

export { Browser } from 'impit';

/**
 * A HTTP client implementation based on the `impit` library.
 *
 * A fresh `Impit` instance is created for every request so that the per-request
 * `proxyUrl` can be applied. Redirects are followed by this class itself (the
 * underlying `Impit` instance is always created with `followRedirects: false`),
 * which lets it record every visited URL in `redirectUrls`.
 */
export class ImpitHttpClient implements BaseHttpClient {
    private impitOptions: ImpitOptions;
    private maxRedirects: number;
    private followRedirects: boolean;

    /**
     * @param options Options forwarded to every `Impit` instance, plus `maxRedirects`
     * (defaults to 10). `followRedirects` defaults to `true`.
     */
    constructor(options?: Omit<ImpitOptions, 'proxyUrl'> & { maxRedirects?: number }) {
        this.impitOptions = options ?? {};

        this.maxRedirects = options?.maxRedirects ?? 10;
        this.followRedirects = options?.followRedirects ?? true;
    }

    /**
     * Converts the body of a `HttpRequest` to a format that can be passed to `impit`.
     *
     * Strings and typed arrays are passed through untouched. A `ReadableStream` is
     * drained completely and its chunks are concatenated into a single `Uint8Array`.
     *
     * @param body Request body to convert.
     * @returns The body as a string or as raw bytes.
     * @throws If the body is of any other type.
     */
    private async intoImpitBody<TResponseType extends keyof ResponseTypes>(
        body: Exclude<HttpRequest<TResponseType>['body'], undefined>,
    ): Promise<string | Uint8Array> {
        if (typeof body === 'string' || isTypedArray(body)) {
            return body;
        }

        if (body instanceof ReadableStream) {
            const reader = body.getReader();
            const chunks: Uint8Array[] = [];
            let totalLength = 0;

            // The total size is unknown up front, so collect the chunks first
            // and allocate the final buffer once the stream is exhausted.
            let result = await reader.read();
            while (!result.done) {
                const chunk: Uint8Array =
                    typeof result.value === 'string' ? new TextEncoder().encode(result.value) : result.value;

                chunks.push(chunk);
                totalLength += chunk.length;
                result = await reader.read();
            }

            const buffer = new Uint8Array(totalLength);
            let offset = 0;

            for (const chunk of chunks) {
                buffer.set(chunk, offset);
                offset += chunk.length;
            }

            return buffer;
        }

        throw new Error('Unsupported body type.');
    }

    /**
     * Flattens the headers of a `HttpRequest` to a format that can be passed to `impit`.
     *
     * Headers with an `undefined` value are dropped. For multi-valued headers
     * only the first value is kept.
     *
     * @param headers `SimpleHeaders` object
     * @returns `Record<string, string>` object
     */
    private flattenHeaders<TResponseType extends keyof ResponseTypes>(
        headers: Exclude<HttpRequest<TResponseType>['headers'], undefined>,
    ): Record<string, string> {
        const result: Record<string, string> = {};

        for (const headerName of Object.keys(headers)) {
            const headerValue = headers[headerName];

            if (headerValue === undefined) continue;

            if (Array.isArray(headerValue)) {
                result[headerName] = headerValue[0];
                continue;
            }

            result[headerName] = headerValue;
        }

        return result;
    }

    /**
     * Common implementation for `sendRequest` and `stream` methods.
     *
     * Sends the request and, when redirect following is enabled, recursively
     * follows 3xx responses while tracking the visited URLs.
     *
     * @param request `HttpRequest` object
     * @param redirects Redirect bookkeeping carried across recursive calls.
     * @returns `HttpResponse` object
     * @throws If the redirect limit is exceeded, a 3xx response lacks a `location`
     * header, or `request.responseType` is not `text`, `json` or `buffer`.
     */
    private async performRequest<TResponseType extends keyof ResponseTypes>(
        request: HttpRequest<TResponseType>,
        redirects?: {
            redirectCount?: number;
            redirectUrls?: URL[];
        },
    ): Promise<HttpResponse<TResponseType>> {
        const redirectCount = redirects?.redirectCount ?? 0;

        if (redirectCount > this.maxRedirects) {
            throw new Error(`Too many redirects, maximum is ${this.maxRedirects}.`);
        }

        const url = typeof request.url === 'string' ? request.url : request.url.href;
        const headers = request.headers !== undefined ? this.flattenHeaders(request.headers) : undefined;
        const body = request.body !== undefined ? await this.intoImpitBody(request.body) : undefined;

        const impit = new Impit({
            ...this.impitOptions,
            proxyUrl: request.proxyUrl,
            // Redirects are handled below so that every hop can be recorded.
            followRedirects: false,
        });

        const response = await impit.fetch(url, {
            method: request.method as HttpMethod,
            headers,
            body: body as string,
        });

        if (this.followRedirects && response.status >= 300 && response.status < 400) {
            const location = response.headers.location;

            if (!location) {
                throw new Error('Redirect response missing location header.');
            }

            // `location` may be relative (e.g. `/login`), so resolve it against the current URL.
            const redirectUrl = new URL(location, url);

            return this.performRequest(
                {
                    ...request,
                    url: redirectUrl.href,
                },
                {
                    redirectCount: redirectCount + 1,
                    redirectUrls: [...(redirects?.redirectUrls ?? []), redirectUrl],
                },
            );
        }

        let responseBody;

        switch (request.responseType) {
            case 'text':
                responseBody = response.text();
                break;
            case 'json':
                responseBody = response.json();
                break;
            case 'buffer':
                responseBody = response.bytes();
                break;
            default:
                throw new Error('Unsupported response type.');
        }

        return {
            headers: response.headers,
            statusCode: response.status,
            url,
            request,
            redirectUrls: redirects?.redirectUrls ?? [],
            trailers: {},
            body: responseBody,
            complete: true,
        };
    }

    /**
     * @inheritDoc
     */
    async sendRequest<TResponseType extends keyof ResponseTypes>(
        request: HttpRequest<TResponseType>,
    ): Promise<HttpResponse<TResponseType>> {
        return this.performRequest(request);
    }

    /**
     * @inheritDoc
     *
     * The response is fetched in full first and then exposed through a `Readable`,
     * so the returned stream is already complete when this method resolves.
     */
    async stream(request: HttpRequest): Promise<StreamingHttpResponse> {
        const response = await this.performRequest(request);

        const stream = new Readable();
        stream.push(response.body);
        stream.push(null);

        return {
            request,
            url: response.url,
            ip: response.ipAddress,
            statusCode: response.statusCode,
            stream,
            complete: true,
            downloadProgress: { percent: 100, transferred: response.body.length },
            uploadProgress: { percent: 100, transferred: 0 },
            redirectUrls: response.redirectUrls,
            headers: response.headers,
            trailers: {},
        };
    }
}
7 changes: 7 additions & 0 deletions packages/impit-client/tsconfig.build.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"extends": "../../tsconfig.build.json",
"compilerOptions": {
"outDir": "./dist"
},
"include": ["src/**/*"]
}
4 changes: 4 additions & 0 deletions packages/impit-client/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"extends": "../../tsconfig.json",
"include": ["src/**/*"]
}
7 changes: 7 additions & 0 deletions test/e2e/cheerio-impit-ts/actor/.actor/actor.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"actorSpecification": 1,
"name": "test-cheerio-impit-ts",
"version": "0.0",
"buildTag": "latest",
"env": null
}
13 changes: 13 additions & 0 deletions test/e2e/cheerio-impit-ts/actor/.eslintrc.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"root": true,
"extends": "../../.eslintrc.json",
"parserOptions": {
"project": "./test/e2e/cheerio-impit-ts/actor/tsconfig.json",
"ecmaVersion": 2022
},
"rules": {
"no-empty-function": "off",
"@typescript-eslint/no-explicit-any": "off",
"no-constant-condition": "off"
}
}
11 changes: 11 additions & 0 deletions test/e2e/cheerio-impit-ts/actor/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
.idea
.DS_Store
node_modules
package-lock.json
apify_storage
crawlee_storage
storage
main.d.ts
main.d.ts.map
main.js
main.js.map
28 changes: 28 additions & 0 deletions test/e2e/cheerio-impit-ts/actor/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Multi-stage build: dev dependencies are needed to compile the TypeScript
# sources, but must not end up in the final image.
FROM apify/actor-node:20-beta AS builder

# Copy all files, install all dependencies (including dev deps) and build the project.
COPY . ./
RUN npm install --include=dev \
    && npm run build

# Create the final image.
FROM apify/actor-node:20-beta
# Copy only the files needed at runtime from the builder stage.
COPY --from=builder /usr/src/app/packages ./packages
COPY --from=builder /usr/src/app/package.json ./
COPY --from=builder /usr/src/app/main.js ./

# Install production dependencies only and print the resulting environment.
# `--omit=dev` replaces the deprecated `--only=prod` and matches the `npm list` call below.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --no-audit \
    && npm update --no-audit \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# Run the compiled code. Exec form avoids wrapping the process in an extra shell.
CMD ["npm", "run", "start:prod"]
Loading

0 comments on commit 61d7ffa

Please sign in to comment.