From b5fd3a9e3f6b189b86c0fb89a37b66c08ff3fe5d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?=
Date: Mon, 26 Aug 2024 11:17:22 +0200
Subject: [PATCH] feat: `globs` & `regexps` for `SitemapRequestList` (#2631)

---
 .../core/src/storages/sitemap_request_list.ts | 101 +++++++++++++++++-
 test/core/sitemap_request_list.test.ts        |  39 +++++++
 2 files changed, 138 insertions(+), 2 deletions(-)

diff --git a/packages/core/src/storages/sitemap_request_list.ts b/packages/core/src/storages/sitemap_request_list.ts
index c30642e9541f..a02e9e0fd22b 100644
--- a/packages/core/src/storages/sitemap_request_list.ts
+++ b/packages/core/src/storages/sitemap_request_list.ts
@@ -2,17 +2,58 @@ import { Transform } from 'node:stream';
 
 import defaultLog from '@apify/log';
 import { type ParseSitemapOptions, parseSitemap } from '@crawlee/utils';
+import { minimatch } from 'minimatch';
 import ow from 'ow';
 
 import { KeyValueStore } from './key_value_store';
 import type { IRequestList } from './request_list';
 import { purgeDefaultStorages } from './utils';
+import type { GlobInput, RegExpInput, UrlPatternObject } from '../enqueue_links';
+import { constructGlobObjectsFromGlobs, constructRegExpObjectsFromRegExps } from '../enqueue_links';
 import { Request } from '../request';
 
 /** @internal */
 const STATE_PERSISTENCE_KEY = 'SITEMAP_REQUEST_LIST_STATE';
 
-export interface SitemapRequestListOptions {
+interface UrlConstraints {
+    /**
+     * An array of glob pattern strings or plain objects
+     * containing glob pattern strings matching the URLs to be enqueued.
+     *
+     * The plain objects must include at least the `glob` property, which holds the glob pattern string.
+     *
+     * The matching is always case-insensitive.
+     * If you need case-sensitive matching, use `regexps` property directly.
+     *
+     * If `globs` is an empty array or `undefined`, and `regexps` are also not defined, then the `SitemapRequestList`
+     * includes all the URLs from the sitemap.
+     */
+    globs?: readonly GlobInput[];
+
+    /**
+     * An array of glob pattern strings, regexp patterns or plain objects
+     * containing patterns matching URLs that will **never** be included.
+     *
+     * The plain objects must include either the `glob` property or the `regexp` property.
+     *
+     * Glob matching is always case-insensitive.
+     * If you need case-sensitive matching, provide a regexp.
+     */
+    exclude?: readonly (GlobInput | RegExp)[];
+
+    /**
+     * An array of regular expressions or plain objects
+     * containing regular expressions matching the URLs to be enqueued.
+     *
+     * The plain objects must include at least the `regexp` property, which holds the regular expression.
+     *
+     * If `regexps` is an empty array or `undefined`, and `globs` are also not defined, then the `SitemapRequestList`
+     * includes all the URLs from the sitemap.
+     */
+    regexps?: readonly RegExpInput[];
+}
+
+export interface SitemapRequestListOptions extends UrlConstraints {
     /**
      * List of sitemap URLs to parse.
      */
@@ -138,6 +179,9 @@ export class SitemapRequestList implements IRequestList {
      */
     private log = defaultLog.child({ prefix: 'SitemapRequestList' });
 
+    private urlExcludePatternObjects: UrlPatternObject[] = [];
+    private urlPatternObjects: UrlPatternObject[] = [];
+
     /** @internal */
     private constructor(options: SitemapRequestListOptions) {
         ow(
@@ -150,9 +194,34 @@ export class SitemapRequestList implements IRequestList {
                 timeoutMillis: ow.optional.number,
                 maxBufferSize: ow.optional.number,
                 parseSitemapOptions: ow.optional.object,
+                globs: ow.optional.array.ofType(ow.any(ow.string, ow.object.hasKeys('glob'))),
+                exclude: ow.optional.array.ofType(
+                    ow.any(ow.string, ow.regExp, ow.object.hasKeys('glob'), ow.object.hasKeys('regexp')),
+                ),
+                regexps: ow.optional.array.ofType(ow.any(ow.regExp, ow.object.hasKeys('regexp'))),
             }),
         );
 
+        const { globs, exclude, regexps } = options;
+
+        if (exclude?.length) {
+            for (const excl of exclude) {
+                if (typeof excl === 'string' || 'glob' in excl) {
+                    this.urlExcludePatternObjects.push(...constructGlobObjectsFromGlobs([excl]));
+                } else if (excl instanceof RegExp || 'regexp' in excl) {
+                    this.urlExcludePatternObjects.push(...constructRegExpObjectsFromRegExps([excl]));
+                }
+            }
+        }
+
+        if (globs?.length) {
+            this.urlPatternObjects.push(...constructGlobObjectsFromGlobs(globs));
+        }
+
+        if (regexps?.length) {
+            this.urlPatternObjects.push(...constructRegExpObjectsFromRegExps(regexps));
+        }
+
         this.persistStateKey = options.persistStateKey;
         this.proxyUrl = options.proxyUrl;
 
@@ -165,6 +234,34 @@ export class SitemapRequestList implements IRequestList {
         this.sitemapParsingProgress.pendingSitemapUrls = new Set(options.sitemapUrls);
     }
 
+    /**
+     * Returns a function that checks whether the provided pattern matches the closure URL.
+     * @param url URL to be checked.
+     * @returns A matcher function that checks whether the pattern matches the closure URL.
+     */
+    private matchesUrl(url: string): (patternObject: UrlPatternObject) => boolean {
+        return (patternObject) => {
+            const { regexp, glob } = patternObject;
+
+            const matchesRegex = (regexp && url.match(regexp)) || false;
+            const matchesGlob = (glob && minimatch(url, glob, { nocase: true })) || false;
+
+            return Boolean(matchesRegex || matchesGlob);
+        };
+    }
+
+    /**
+     * Checks whether the URL matches the `globs` / `regexps` / `exclude` provided in the `options`.
+     * @param url URL to be checked.
+     * @returns `true` if the URL matches the patterns, `false` otherwise.
+     */
+    private isUrlMatchingPatterns(url: string): boolean {
+        return (
+            !this.urlExcludePatternObjects.some(this.matchesUrl(url)) &&
+            (this.urlPatternObjects.length === 0 || this.urlPatternObjects.some(this.matchesUrl(url)))
+        );
+    }
+
     /**
      * Adds a URL to the queue of parsed URLs.
      *
@@ -172,7 +269,7 @@ export class SitemapRequestList implements IRequestList {
      */
     private async pushNextUrl(url: string | null) {
         return new Promise((resolve) => {
-            if (this.closed) {
+            if (this.closed || (url && !this.isUrlMatchingPatterns(url))) {
                 return resolve();
             }
 
diff --git a/test/core/sitemap_request_list.test.ts b/test/core/sitemap_request_list.test.ts
index 0a2b915a67ea..ed8b8589fba7 100644
--- a/test/core/sitemap_request_list.test.ts
+++ b/test/core/sitemap_request_list.test.ts
@@ -278,6 +278,45 @@ describe('SitemapRequestList', () => {
         expect(list.fetchNextRequest()).resolves.toBe(null);
     });
 
+    test('globs filtering works', async () => {
+        const list = await SitemapRequestList.open({
+            sitemapUrls: [`${url}/sitemap.xml`],
+            globs: ['http://not-exists.com/catalog**'],
+        });
+
+        for await (const request of list) {
+            await list.markRequestHandled(request);
+        }
+
+        expect(list.handledCount()).toBe(4);
+    });
+
+    test('regexps filtering works', async () => {
+        const list = await SitemapRequestList.open({
+            sitemapUrls: [`${url}/sitemap.xml`],
+            regexps: [/desc=vacation_new.+/],
+        });
+
+        for await (const request of list) {
+            await list.markRequestHandled(request);
+        }
+
+        expect(list.handledCount()).toBe(2);
+    });
+
+    test('exclude filtering works', async () => {
+        const list = await SitemapRequestList.open({
+            sitemapUrls: [`${url}/sitemap.xml`],
+            exclude: [/desc=vacation_new/],
+        });
+
+        for await (const request of list) {
+            await list.markRequestHandled(request);
+        }
+
+        expect(list.handledCount()).toBe(3);
+    });
+
     test('draining the request list between sitemaps', async () => {
         const list = await SitemapRequestList.open({ sitemapUrls: [`${url}/sitemap-index.xml`] });
 