-
Notifications
You must be signed in to change notification settings - Fork 754
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adds a new `@crawlee`-scope package, which exports an `HttpClient` implementation based on [`impit`](https://github.com/apify/impit).
- Loading branch information
Showing
14 changed files
with
541 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# @crawlee/impit-client | ||
|
||
This package provides a Crawlee-compliant `HttpClient` interface for the [`impit`](https://www.npmjs.com/package/impit) package. | ||
|
||
To use the `impit` package directly without Crawlee, check out [`impit`](https://www.npmjs.com/package/impit) on NPM. | ||
|
||
## Example usage | ||
|
||
Simply pass the `ImpitHttpClient` instance to the `httpClient` option of the crawler constructor: | ||
|
||
```typescript | ||
import { CheerioCrawler } from '@crawlee/cheerio';
import { ImpitHttpClient, Browser } from '@crawlee/impit-client';

// Create a crawler that sends its HTTP requests through `impit` instead of the default HTTP client.
const crawler = new CheerioCrawler({
    httpClient: new ImpitHttpClient({
        // Browser whose requests should be impersonated.
        browser: Browser.Firefox,
        http3: true,
        // NOTE: presumably disables TLS certificate validation - confirm in the `impit` docs
        // and avoid enabling it against sites you do not trust.
        ignoreTlsErrors: true,
    }),
    async requestHandler({ $, request }) {
        // Extract the title of the page.
        const title = $('title').text();
        console.log(`Title of the page ${request.url}: ${title}`);
    },
});

crawler.run([
    'http://www.example.com/page-1',
    'http://www.example.com/page-2',
]);
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
{ | ||
"name": "@crawlee/impit-client", | ||
"version": "3.12.1", | ||
"description": "impit-based HTTP client implementation for Crawlee. Impersonates browser requests to avoid bot detection.", | ||
"engines": { | ||
"node": ">=16.0.0" | ||
}, | ||
"main": "./dist/index.js", | ||
"module": "./dist/index.mjs", | ||
"types": "./dist/index.d.ts", | ||
"exports": { | ||
".": { | ||
"import": "./dist/index.mjs", | ||
"require": "./dist/index.js", | ||
"types": "./dist/index.d.ts" | ||
}, | ||
"./package.json": "./package.json" | ||
}, | ||
"keywords": [ | ||
"apify", | ||
"headless", | ||
"chrome", | ||
"puppeteer", | ||
"crawler", | ||
"scraper" | ||
], | ||
"author": { | ||
"name": "Apify", | ||
"email": "[email protected]", | ||
"url": "https://apify.com" | ||
}, | ||
"contributors": [ | ||
"Jan Curn <[email protected]>", | ||
"Marek Trunkat <[email protected]>", | ||
"Ondra Urban <[email protected]>" | ||
], | ||
"license": "Apache-2.0", | ||
"repository": { | ||
"type": "git", | ||
"url": "git+https://github.com/apify/crawlee" | ||
}, | ||
"bugs": { | ||
"url": "https://github.com/apify/crawlee/issues" | ||
}, | ||
"homepage": "https://crawlee.dev", | ||
"scripts": { | ||
"build": "yarn clean && yarn compile && yarn copy", | ||
"clean": "rimraf ./dist", | ||
"compile": "tsc -p tsconfig.build.json && gen-esm-wrapper ./dist/index.js ./dist/index.mjs", | ||
"copy": "tsx ../../scripts/copy.ts" | ||
}, | ||
"publishConfig": { | ||
"access": "public" | ||
}, | ||
"peerDependencies": { | ||
"@crawlee/core": "^3.12.1" | ||
}, | ||
"devDependencies": { | ||
"@crawlee/core": "^3.12.1" | ||
}, | ||
"dependencies": { | ||
"impit": "^0.1.3" | ||
}, | ||
"packageManager": "[email protected]" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
import { Readable } from 'stream'; | ||
import { isTypedArray } from 'util/types'; | ||
|
||
import type { HttpRequest, HttpResponse, ResponseTypes, StreamingHttpResponse, BaseHttpClient } from '@crawlee/core'; | ||
import { type ImpitOptions, type HttpMethod, Impit } from 'impit'; | ||
|
||
export { Browser } from 'impit'; | ||
|
||
/**
 * An HTTP client implementation based on the `impit` library.
 *
 * Redirects are followed by this class itself (every underlying `Impit` instance is created with
 * `followRedirects: false`), which allows the chain of visited URLs to be reported in `redirectUrls`.
 */
export class ImpitHttpClient implements BaseHttpClient {
    private impitOptions: ImpitOptions;
    private maxRedirects: number;
    private followRedirects: boolean;

    /**
     * @param options Options passed to every `Impit` instance created by this client.
     *   `maxRedirects` (default `10`) limits how many redirects are followed per request,
     *   `followRedirects` (default `true`) toggles redirect following altogether.
     */
    constructor(options?: Omit<ImpitOptions, 'proxyUrl'> & { maxRedirects?: number }) {
        this.impitOptions = options ?? {};

        this.maxRedirects = options?.maxRedirects ?? 10;
        this.followRedirects = options?.followRedirects ?? true;
    }

    /**
     * Converts the body of a `HttpRequest` to a format that can be passed to `impit`.
     *
     * Strings and typed arrays are returned as they are. A web `ReadableStream` is read to its end
     * and its chunks are concatenated into a single `Uint8Array`.
     *
     * @param body Request body to convert
     * @returns The body as a string or as raw bytes
     * @throws Error if the body is of any other type
     */
    private async intoImpitBody<TResponseType extends keyof ResponseTypes>(
        body: Exclude<HttpRequest<TResponseType>['body'], undefined>,
    ): Promise<string | Uint8Array> {
        if (typeof body === 'string' || isTypedArray(body)) {
            return body;
        }

        if (body instanceof ReadableStream) {
            const reader = body.getReader();
            const chunks: Uint8Array[] = [];
            let totalLength = 0;

            while (true) {
                const { done, value } = await reader.read();

                if (done) break;

                // String chunks are encoded as UTF-8 so that everything can be merged as bytes.
                const chunk: Uint8Array = typeof value === 'string' ? new TextEncoder().encode(value) : value;
                chunks.push(chunk);
                totalLength += chunk.length;
            }

            // A typed array cannot grow, so the result is allocated only once the total size is known.
            const buffer = new Uint8Array(totalLength);
            let offset = 0;

            for (const chunk of chunks) {
                buffer.set(chunk, offset);
                offset += chunk.length;
            }

            return buffer;
        }

        throw new Error('Unsupported body type.');
    }

    /**
     * Flattens the headers of a `HttpRequest` to a format that can be passed to `impit`.
     *
     * Headers with an `undefined` value (or an empty array value) are omitted. Multiple values of
     * one header are merged into a single comma-separated value (`Cookie` pairs are joined with `; `).
     *
     * @param headers `SimpleHeaders` object
     * @returns `Record<string, string>` object
     */
    private flattenHeaders<TResponseType extends keyof ResponseTypes>(
        headers: Exclude<HttpRequest<TResponseType>['headers'], undefined>,
    ): Record<string, string> {
        const result: Record<string, string> = {};

        for (const headerName of Object.keys(headers)) {
            const headerValue = headers[headerName];

            if (headerValue === undefined) continue;

            if (Array.isArray(headerValue)) {
                if (headerValue.length === 0) continue;

                const separator = headerName.toLowerCase() === 'cookie' ? '; ' : ', ';
                result[headerName] = headerValue.join(separator);
                continue;
            }

            result[headerName] = headerValue;
        }

        return result;
    }

    /**
     * Common implementation for `sendRequest` and `stream` methods.
     *
     * Sends the request with a fresh `Impit` instance and, if enabled, follows redirects by calling
     * itself recursively with the redirect target.
     *
     * @param request `HttpRequest` object
     * @param redirects Bookkeeping of the redirects followed so far (only passed by recursive calls)
     * @returns `HttpResponse` object
     * @throws Error if there are too many redirects, a redirect response has no `location` header,
     *   or the requested response type is not supported
     */
    private async performRequest<TResponseType extends keyof ResponseTypes>(
        request: HttpRequest<TResponseType>,
        redirects?: {
            redirectCount?: number;
            redirectUrls?: URL[];
        },
    ): Promise<HttpResponse<TResponseType>> {
        const redirectCount = redirects?.redirectCount ?? 0;
        const redirectUrls = redirects?.redirectUrls ?? [];

        if (redirectCount > this.maxRedirects) {
            throw new Error(`Too many redirects, maximum is ${this.maxRedirects}.`);
        }

        const url = typeof request.url === 'string' ? request.url : request.url.href;
        const headers = request.headers !== undefined ? this.flattenHeaders(request.headers) : undefined;
        const body = request.body !== undefined ? await this.intoImpitBody(request.body) : undefined;

        // The proxy URL is a per-request setting, hence a new `Impit` instance for every request.
        const impit = new Impit({
            ...this.impitOptions,
            proxyUrl: request.proxyUrl,
            followRedirects: false,
        });

        const response = await impit.fetch(url, {
            method: request.method as HttpMethod,
            headers,
            body: body as string,
        });

        // `304 Not Modified` falls into the 3xx range, but it is not a redirect and has no `location`.
        const isRedirect = response.status >= 300 && response.status < 400 && response.status !== 304;

        if (this.followRedirects && isRedirect) {
            const location = response.headers.location;

            if (!location) {
                throw new Error('Redirect response missing location header.');
            }

            // The `location` header may hold a relative reference, so resolve it against the current URL.
            const nextUrl = new URL(location, url);

            // A `303 See Other` response has to be followed with `GET` and without the request body.
            const method = request.method?.toUpperCase();
            const switchToGet =
                response.status === 303 && method !== undefined && method !== 'GET' && method !== 'HEAD';

            return this.performRequest(
                {
                    ...request,
                    url: nextUrl.href,
                    ...(switchToGet ? { method: 'GET' as const, body: undefined } : {}),
                },
                {
                    redirectCount: redirectCount + 1,
                    redirectUrls: [...redirectUrls, nextUrl],
                },
            );
        }

        let responseBody;

        // The body getters are awaited so that a promise never leaks into the response object
        // (awaiting is harmless in case they return the value synchronously).
        switch (request.responseType) {
            case 'text':
                responseBody = await response.text();
                break;
            case 'json':
                responseBody = await response.json();
                break;
            case 'buffer':
                responseBody = await response.bytes();
                break;
            default:
                throw new Error('Unsupported response type.');
        }

        return {
            headers: response.headers,
            statusCode: response.status,
            url,
            request,
            redirectUrls,
            trailers: {},
            body: responseBody,
            complete: true,
        };
    }

    /**
     * @inheritDoc
     */
    async sendRequest<TResponseType extends keyof ResponseTypes>(
        request: HttpRequest<TResponseType>,
    ): Promise<HttpResponse<TResponseType>> {
        return this.performRequest(request);
    }

    /**
     * @inheritDoc
     *
     * The whole response is downloaded first and only then exposed as a stream.
     */
    async stream(request: HttpRequest): Promise<StreamingHttpResponse> {
        // Always ask for raw bytes - a byte stream cannot carry a parsed JSON value.
        const response = await this.performRequest({ ...request, responseType: 'buffer' as const });

        const stream = new Readable();
        stream.push(response.body);
        stream.push(null);

        return {
            request,
            url: response.url,
            // NOTE(review): `performRequest` does not populate any IP address, so this is always `undefined`.
            ip: response.ipAddress,
            statusCode: response.statusCode,
            stream,
            complete: true,
            downloadProgress: { percent: 100, transferred: response.body.length },
            uploadProgress: { percent: 100, transferred: 0 },
            redirectUrls: response.redirectUrls,
            headers: response.headers,
            trailers: {},
        };
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
{ | ||
"extends": "../../tsconfig.build.json", | ||
"compilerOptions": { | ||
"outDir": "./dist" | ||
}, | ||
"include": ["src/**/*"] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
{ | ||
"extends": "../../tsconfig.json", | ||
"include": ["src/**/*"] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
{ | ||
"actorSpecification": 1, | ||
"name": "test-cheerio-impit-ts", | ||
"version": "0.0", | ||
"buildTag": "latest", | ||
"env": null | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
{ | ||
"root": true, | ||
"extends": "../../.eslintrc.json", | ||
"parserOptions": { | ||
"project": "./test/e2e/cheerio-impit-ts/actor/tsconfig.json", | ||
"ecmaVersion": 2022 | ||
}, | ||
"rules": { | ||
"no-empty-function": "off", | ||
"@typescript-eslint/no-explicit-any": "off", | ||
"no-constant-condition": "off" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
.idea | ||
.DS_Store | ||
node_modules | ||
package-lock.json | ||
apify_storage | ||
crawlee_storage | ||
storage | ||
main.d.ts | ||
main.d.ts.map | ||
main.js | ||
main.js.map |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# using multistage build, as we need dev deps to build the TS source code
FROM apify/actor-node:20-beta AS builder

# copy all files, install all dependencies (including dev deps) and build the project
COPY . ./
RUN npm install --include=dev \
    && npm run build

# create final image
FROM apify/actor-node:20-beta
# copy only necessary files
COPY --from=builder /usr/src/app/packages ./packages
COPY --from=builder /usr/src/app/package.json ./
COPY --from=builder /usr/src/app/main.js ./

# install only prod deps
# (`--omit=dev` replaces the deprecated `--only=prod` and matches the `npm list` call below)
RUN npm --quiet set progress=false \
    && npm install --omit=dev --no-audit \
    && npm update --no-audit \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# run compiled code
CMD npm run start:prod
Oops, something went wrong.