diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/Crawler.java b/crawler/core/src/main/java/com/norconex/crawler/core/Crawler.java index 6c58bace9..b85f80d56 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/Crawler.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/Crawler.java @@ -96,7 +96,7 @@ public class Crawler { ? extends FetchRequest, ? extends FetchResponse> fetcher; private final Class docContextType; private CrawlerState state; - // TODO remove stopper listener when we are fully using a table? + // TODO remove stopper listener when we are fully using an accessible store? private CrawlerStopper stopper = new FileBasedStopper(); // --- Set in init --- @@ -127,8 +127,8 @@ public class Crawler { doc -> new UpsertRequest( doc.getReference(), doc.getMetadata(), - doc.getInputStream())) // Closed by - // caller + // InputStream closed by caller + doc.getInputStream())) .deleteRequestBuilder( doc -> new DeleteRequest( doc.getReference(), @@ -194,12 +194,11 @@ public void fire(String eventName) { } public void fire(String eventName, Object subject) { - fire( - CrawlerEvent.builder() - .name(eventName) - .source(this) - .subject(subject) - .build()); + fire(CrawlerEvent.builder() + .name(eventName) + .source(this) + .subject(subject) + .build()); } @Override @@ -224,31 +223,26 @@ && getState().isExecutionLocked()) { getState().setStopping(true); LOG.info("Stopping the crawler."); } else { - LOG.info( - "CANNOT STOP: the targetted crawler does not appear " - + "to be running on on this host."); + LOG.info("CANNOT STOP: the targetted crawler does not appear " + + "to be running on on this host."); } } public void exportDataStore(Path exportDir) { - executeCommand( - new CommandExecution(this, "STORE_EXPORT") - .failableCommand( - () -> DataStoreExporter.exportDataStore( - this, - exportDir)) - .lock(true) - .logIntro(true)); + executeCommand(new CommandExecution(this, "STORE_EXPORT") + .failableCommand(() -> DataStoreExporter.exportDataStore( + this, + exportDir)) + .lock(true) + .logIntro(true)); } public void importDataStore(Path file) { - executeCommand( - new CommandExecution(this, "STORE_IMPORT") - .failableCommand( - () -> DataStoreImporter - .importDataStore(this, file)) - .lock(true) - .logIntro(true)); + executeCommand(new CommandExecution(this, "STORE_IMPORT") + .failableCommand( + () -> DataStoreImporter.importDataStore(this, file)) + .lock(true) + .logIntro(true)); } /** @@ -256,14 +250,13 @@ public void importDataStore(Path file) { * the crawler was run for the first time. 
*/ public void clean() { - executeCommand( - new CommandExecution(this, "CLEAN") - .failableCommand(() -> { - getServices().getCommitterService().clean(); - dataStoreEngine.clean(); - FileUtils.deleteDirectory(getWorkDir().toFile()); - }) - .lock(true) - .logIntro(true)); + executeCommand(new CommandExecution(this, "CLEAN") + .failableCommand(() -> { + getServices().getCommitterService().clean(); + dataStoreEngine.clean(); + FileUtils.deleteDirectory(getWorkDir().toFile()); + }) + .lock(true) + .logIntro(true)); } } diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerBuilder.java b/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerBuilder.java index 33c2437ca..db8a8ebb3 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerBuilder.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerBuilder.java @@ -29,13 +29,11 @@ import lombok.Setter; import lombok.experimental.Accessors; -//TODO move to its own class? //TODO document the optional ones and their default values @Accessors(fluent = true) @Setter @Getter @NonNull -@SuppressWarnings("javadoc") public class CrawlerBuilder { private CrawlerConfig configuration = new CrawlerConfig(); private DocPipelines docPipelines; @@ -47,8 +45,6 @@ public class CrawlerBuilder { /** * The exact type of {@link CrawlDocContext} if your crawler is subclassing * it. Defaults to {@link CrawlDocContext} class. - * @param docContextType crawl doc brief class - * @return doc brief class */ private Class docContextType = CrawlDocContext.class; @@ -56,9 +52,6 @@ public class CrawlerBuilder { /** * Provides a required fetcher implementation, responsible for obtaining * resources being crawled. - * - * @param fetcherProvider fetcher provider function - * @return a function returning a fetcher to associate with a given crawler. */ private Function> fetcherProvider; diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerCallbacks.java b/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerCallbacks.java index 7d0580dbb..87c788e7f 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerCallbacks.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerCallbacks.java @@ -23,7 +23,6 @@ import lombok.Builder; import lombok.Getter; -@SuppressWarnings("javadoc") @Builder @Getter public class CrawlerCallbacks { @@ -35,9 +34,6 @@ public class CrawlerCallbacks { * This method is different than the {@link #initCrawler()} method, * which is invoked for any type of actions where as this one is only * invoked before an effective request for crawling. - * @param beforeCrawlerExecution bi-consumer accepting a crawler and - * a "resume" indicator. - * @return bi-consumer accepting a crawler and a "resume" indicator */ Consumer beforeCrawlerExecution; @@ -48,17 +44,15 @@ public class CrawlerCallbacks { * Invoked right after {@link CrawlerEvent#CRAWLER_STOP_END} or * {@link CrawlerEvent#CRAWLER_RUN_END} (depending which of the two is * triggered). - * @param afterCrawlerExecution consumer accepting a crawler - * @return consumer accepting a crawler */ Consumer afterCrawlerExecution; - //TODO are those used? Should they be? + //MAYBE: are those used? Should they be? // Add those that are missing to ReferencesProcessor BiConsumer beforeDocumentProcessing; BiConsumer afterDocumentProcessing; - // need those, or we can replace beforeDocumentFinalizing + //MAYBE: need those, or we can replace beforeDocumentFinalizing // (the only one used) with after processing? 
BiConsumer beforeDocumentFinalizing; BiConsumer afterDocumentFinalizing; diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerConfig.java index 37e3f8739..1df0c7d6a 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerConfig.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerConfig.java @@ -39,6 +39,7 @@ import com.norconex.crawler.core.doc.operations.spoil.SpoiledReferenceStrategizer; import com.norconex.crawler.core.doc.operations.spoil.impl.GenericSpoiledReferenceStrategizer; import com.norconex.crawler.core.doc.pipelines.queue.ReferencesProvider; +import com.norconex.crawler.core.event.listeners.StopCrawlerOnMaxEventListener; import com.norconex.crawler.core.fetch.FetchDirectiveSupport; import com.norconex.crawler.core.fetch.Fetcher; import com.norconex.crawler.core.store.DataStoreEngine; @@ -53,135 +54,14 @@ /** *

* Base Crawler configuration. Crawlers usually read this configuration upon - * starting up. Once execution has started, it should not be changed - * to avoid unexpected behaviors. + * starting up. While not always enforced, once execution has started, it + * should be considered immutable to avoid unexpected behaviors. *

- * *

* Concrete implementations inherit the following XML configuration * options (typically within a <crawler> tag): *

- * - * {@nx.xml #init - * - * (maximum number of threads) - * - * (maximum number of documents to crawl per session, resuming on next - * sessions where it last ended, if crawling was not complete) - * - * (maximum depth the crawler should go) - * (thread inactivity timeout) - * - * (minimum frequency at which progress is logged) - * - * [PROCESS|IGNORE|DELETE] - * - * - * - * (fully qualified class name of a an exception) - * - * - * - * - * - * - * - * - * } - * - * {@nx.xml #start-refs - * - * - * (a reference) - * (local path to a file containing references) - * - * - * } - * - * {@nx.xml #fetchers - * - * - * - * - * } - * - * {@nx.xml #pipeline-queue - * - * - * - * - * } - * - * {@nx.xml #pipeline-import - * - * - * - * - * - * - * - * - * - * } - * - * {@nx.xml #import - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * } - * - * {@nx.xml #directive-meta - * [DISABLED|REQUIRED|OPTIONAL] - * } - * {@nx.xml #directive-doc - * [REQUIRED|DISABLED|OPTIONAL] - * } - * - * {@nx.xml #checksum-meta - * - * } - * - * {@nx.xml #dedup-meta - * [false|true] - * } - * - * {@nx.xml #checksum-doc - * - * } - * - * {@nx.xml #dedup-doc - * [false|true] - * } - * - * {@nx.xml #pipeline-committer - * - * - * - * - * - * } */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) @FieldNameConstants @@ -222,8 +102,6 @@ public enum OrphansStrategy { * crawlers in the same crawl session. On top of avoiding conflicts, * it facilitates integration with different systems and facilitates * tracking. - * @param id unique identifier - * @return unique identifier */ @JsonProperty(required = true) private String id; @@ -239,19 +117,14 @@ public enum OrphansStrategy { /** * The base directory location where files generated during execution * will reside. When null the collector will use - * ./work at runtime. - * @param workDir working directory path - * @return working directory path + * ./work, relative to the execution "current" directory. */ private Path workDir = DEFAULT_WORKDIR; /** * Maximum number of bytes used for memory caching of all reusable streams - * at any given time, for faster processing. Defaults to 1 GB. + * combined, at any given time, for faster processing. Defaults to 1 GB. * File-caching is used when the maximum is reached. - * @param streamCachePoolSize - * maximum number of bytes for all reusable streams combined - * @return maximum number of bytes for all reusable streams combined */ private long maxStreamCachePoolSize = ImporterConfig.DEFAULT_MAX_STREAM_CACHE_POOL_SIZE; @@ -260,50 +133,38 @@ public enum OrphansStrategy { * Maximum number of bytes used for memory caching of a single reusable * stream, for faster processing. Defaults to 100 MB. File-caching is * used when this maximum is reached for a single file, or when the - * pool size has been reached. - * @param streamCacheSize - * maximum number of bytes for a single reusable streams - * @return maximum number of bytes for a single reusable stream + * pool maximum size has been reached. */ private long maxStreamCacheSize = ImporterConfig.DEFAULT_MAX_STREAM_CACHE_SIZE; /** - * The amount of time to defer the collector shutdown when it is + * The amount of time to defer the crawler shutdown when it is * done executing. This is useful for giving external processes * with polling intervals enough time to grab the latest state of * the collector before it shuts down. Default is zero (does not * wait to shutdown after completion). 
- * @param deferredShutdownDuration duration - * @return duration */ private Duration deferredShutdownDuration = Duration.ZERO; /** - * The crawl data store factory. - * @param dataStoreEngine crawl data store factory. - * @return crawl data store factory. + * The data store engine. */ private DataStoreEngine dataStoreEngine = new MvStoreDataStoreEngine(); /** * Whether the start references should be loaded asynchronously. When * true, the crawler will start processing the start - * references in a separate thread as they are added to the queue - * (as opposed to wait for queue initialization to be complete). + * references in one or more separate threads as they are added to the + * queue (as opposed to wait for queue initialization to be complete). * While this may speed up crawling, it may have an unexpected effect on * accuracy of {@link CrawlDocMetadata#DEPTH}. Use of this option is only * recommended when start references take a significant time to load. - * @param startReferencesAsync true if initialized - * asynchronously - * @return true if initialized asynchronously */ private boolean startReferencesAsync; /** * The maximum number of threads a crawler can use. Default is 2. - * @param numThreads number of threads - * @return number of threads */ @Min(1) private int numThreads = 2; @@ -322,21 +183,24 @@ public enum OrphansStrategy { * its documents actively being processed before stopping. *

*

- * Reaching the maximum value does not terminate the crawl session but - * rather pauses it. On next run, the crawler will resume the same session, + * Reaching the maximum value stops the crawler, but the crawler session + * is not considered "complete"; it is considered + * "paused". On next run, the crawler will resume the same session, * processing an additional number of documents up to the maximum * specified. * This maximum allows crawling one or more sources * in chunks, processing a maximum number of documents each time. * When the session fully completes, the next run will start a new - * crawl session. To prevent resuming an partial crawl session, - * explicitly clean the crawl session. + * crawl session. To prevent resuming a partial crawl session, + * explicitly clean the crawl store first. + *

+ *

* For more control over which events may stop the crawler, consider + * configuring a {@link StopCrawlerOnMaxEventListener}. *

*

* Default is -1 (unlimited). *

- * @param maxDocuments maximum number of documents that can be processed - * @return maximum number of documents that can be processed */ private int maxDocuments = -1; @@ -345,8 +209,6 @@ public enum OrphansStrategy { * is crawler-specific. Examples: levels of sub-directories, * number of URL clicks to reach a page, etc. Refer to specific crawler * implementation for details. Default is -1 (unlimited). - * @param maxDepth maximum depth or -1 for unlimited depth - * @return maximum depth or -1 for unlimited depth */ private int maxDepth = -1; @@ -358,8 +220,6 @@ public enum OrphansStrategy { * is also considered "inactive". Default is * {@value #DEFAULT_IDLE_PROCESSING_TIMEOUT}. A null * value means no timeouts. - * @param idleTimeout time to wait for a document to be processed - * @return time to wait for a document to be processed */ private Duration idleTimeout; @@ -369,9 +229,6 @@ public enum OrphansStrategy { * Default value is {@value #DEFAULT_MIN_PROGRESS_LOGGING_INTERVAL}. * A null value disables progress logging. Minimum value * is 1 second. - * @param minProgressLoggingInterval time to wait between each logging - * of crawling progress - * @return time to wait between each logging of crawling progress */ private Duration minProgressLoggingInterval; @@ -381,13 +238,13 @@ public enum OrphansStrategy { * current run. In other words, they are leftovers from a previous run * that were not re-encountered in the current. *

- * Unless explicitly stated otherwise by an implementing class, the default - * strategy is to PROCESS orphans. + * Unless explicitly stated otherwise by a crawler implementation, the + * default strategy is to PROCESS orphans. * Setting a null value is the same as setting * IGNORE. *

* Be careful: Setting the orphan strategy to DELETE - * is NOT recommended in most cases. With some collectors, a temporary + * is NOT recommended in most cases. There are times when a temporary * failure such as a network outage or a web page timing out, may cause * some documents not to be crawled. When this happens, unreachable * documents would be considered "orphans" and be deleted while under @@ -395,8 +252,6 @@ public enum OrphansStrategy { * (default), is usually the safest approach to confirm they still * exist before deleting or updating them. *

- * @param orphansStrategy orphans strategy - * @return orphans strategy */ private OrphansStrategy orphansStrategy = OrphansStrategy.PROCESS; @@ -414,15 +269,11 @@ public enum OrphansStrategy { /** * The metadata checksummer. * Metadata checksum generation is disabled when null. - * @param metadataChecksummer metadata checksummer - * @return metadata checksummer or null when disabled */ private MetadataChecksummer metadataChecksummer; /** * The Importer module configuration. - * @param importerConfig Importer module configuration - * @return Importer module configuration */ @JsonProperty("importer") private ImporterConfig importerConfig = new ImporterConfig(); @@ -433,42 +284,35 @@ public enum OrphansStrategy { private final List committers = new ArrayList<>(); /** - * Whether to turn on deduplication based on metadata checksum. + * Whether to turn ON deduplication based on metadata checksum. * To enable, {@link #getMetadataChecksummer()} must not return * null. * Not recommended unless you know for sure your metadata * checksum is acceptably unique. - * @param metadataDeduplicate true to turn on - * metadata-based deduplication - * @return whether to turn on metadata-based deduplication */ private boolean metadataDeduplicate; /** - * Whether to turn on deduplication based on document checksum. + * Whether to turn ON deduplication based on document checksum. * To enable, {@link #getDocumentChecksummer()} must not return * null. * Not recommended unless you know for sure your document * checksum is acceptably unique. - * @param documentDeduplicate true to turn on - * document-based deduplication - * @return whether to turn on document-based deduplication */ private boolean documentDeduplicate; /** - * The document checksummer. - * Document checksum generation is disabled when null. - * @param documentChecksummer document checksummer - * @return document checksummer or null when disabled + * The document checksummer. Document checksum generation is disabled + * when null. */ private DocumentChecksummer documentChecksummer = new Md5DocumentChecksummer(); /** - * The spoiled state strategy resolver. - * @param spoiledReferenceStrategizer spoiled state strategy resolver - * @return spoiled state strategy resolver + * The spoiled state strategy resolver. A spoiled document is one that + * was crawled properly before but on a subsequent crawl, it can no longer + * be crawled for whatever reason (not found, bad status, server error, + * etc.). */ private SpoiledReferenceStrategizer spoiledReferenceStrategizer = new GenericSpoiledReferenceStrategizer(); @@ -483,16 +327,12 @@ public enum OrphansStrategy { /** * One or more fetchers responsible for obtaining documents and their * metadata from a source. - * @param fetchers one or more fetchers - * @return one or more fetchers */ private final List> fetchers = new ArrayList<>(); /** * The maximum number of times a fetcher will re-attempt fetching * a resource in case of failures. Default is zero (won't retry). - * @param fetchersMaxRetries maximum number of retries - * @return maximum number of retries */ private int fetchersMaxRetries; @@ -500,8 +340,6 @@ public enum OrphansStrategy { * How long to wait before a failing fetcher re-attempts fetching * a resource in case of failures (in milliseconds). * Default is zero (no delay). - * @param fetchersRetryDelay retry delay - * @return retry delay */ private Duration fetchersRetryDelay; @@ -518,6 +356,7 @@ public List getStartReferences() { /** * Sets the references to initiate crawling from. 
* @param startReferences start references + * @return this */ public CrawlerConfig setStartReferences(List startReferences) { CollectionUtil.setAll(this.startReferences, startReferences); @@ -541,6 +380,7 @@ public List getStartReferencesFiles() { * Blank lines and lines starting with # (comment) are ignored. * @param startReferencesFiles file paths of seed files containing * references + * @return this */ public CrawlerConfig setStartReferencesFiles( List startReferencesFiles) { @@ -563,6 +403,7 @@ public List getStartReferencesProviders() { * Use this approach when references need to be provided * dynamically at launch time. * @param startReferencesProviders start references provider + * @return this */ public CrawlerConfig setStartReferencesProviders( List startReferencesProviders) { @@ -598,6 +439,7 @@ public List> getStopOnExceptions() { * should catch them all). * @param stopOnExceptions exceptions that will stop the crawler when * encountered + * @return this */ public CrawlerConfig setStopOnExceptions( List> stopOnExceptions) { @@ -616,6 +458,7 @@ public List getReferenceFilters() { /** * Sets reference filters. * @param referenceFilters the referenceFilters to set + * @return this */ public CrawlerConfig setReferenceFilters( List referenceFilters) { @@ -634,6 +477,7 @@ public List getDocumentFilters() { /** * Sets document filters. * @param documentFilters document filters + * @return this */ public CrawlerConfig setDocumentFilters( List documentFilters) { @@ -652,6 +496,7 @@ public List getMetadataFilters() { /** * Sets metadata filters. * @param metadataFilters metadata filters + * @return this */ public CrawlerConfig setMetadataFilters( List metadataFilters) { @@ -672,6 +517,7 @@ public List getCommitters() { * Sets Committers responsible for persisting information * to a target location/repository. * @param committers list of Committers + * @return this */ public CrawlerConfig setCommitters(List committers) { CollectionUtil.setAll(this.committers, committers); @@ -693,6 +539,7 @@ public List> getEventListeners() { * Those are considered additions to automatically * detected configuration objects implementing {@link EventListener}. * @param eventListeners event listeners. + * @return this */ public CrawlerConfig setEventListeners( List> eventListeners) { @@ -705,6 +552,7 @@ public CrawlerConfig setEventListeners( * Those are considered additions to automatically * detected configuration objects implementing {@link EventListener}. * @param eventListeners event listeners. + * @return this */ public CrawlerConfig addEventListeners( List> eventListeners) { @@ -717,6 +565,7 @@ public CrawlerConfig addEventListeners( * Those are considered additions to automatically * detected configuration objects implementing {@link EventListener}. * @param eventListener event listener. + * @return this */ public CrawlerConfig addEventListener(EventListener eventListener) { eventListeners.add(eventListener); @@ -737,6 +586,7 @@ public boolean removeEventListener(EventListener eventListener) { * Clears all event listeners. The automatically * detected configuration objects implementing {@link EventListener} * are not cleared. + * @return this */ public CrawlerConfig clearEventListeners() { eventListeners.clear(); @@ -754,6 +604,7 @@ public List getPreImportConsumers() { /** * Sets pre-import consumers. 
* @param preImportConsumers pre-import consumers + * @return this */ public CrawlerConfig setPreImportConsumers( List preImportConsumers) { @@ -773,6 +624,7 @@ public List getPostImportConsumers() { /** * Sets post-import consumers. * @param postImportConsumers post-import consumers + * @return this */ public CrawlerConfig setPostImportConsumers( List postImportConsumers) { @@ -800,6 +652,7 @@ public CrawlerConfig setPostImportConsumers( * be invoked in their defined order, until the first one that accepts and * successfully process a reference (others are not invoked). * @param fetchers one or more fetchers + * @return this */ public CrawlerConfig setFetchers(List> fetchers) { CollectionUtil.setAll(this.fetchers, fetchers); diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/checksum/AbstractDocumentChecksummer.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/checksum/AbstractDocumentChecksummer.java index e23b8de57..cc3ade8c9 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/checksum/AbstractDocumentChecksummer.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/checksum/AbstractDocumentChecksummer.java @@ -35,16 +35,7 @@ *

* Implementors do not need to store the checksum themselves, this abstract * class does it. - *

- * Implementors should offer this XML configuration usage: *

- * {@nx.xml #usage - * - * } *

* toField is ignored unless the keep * attribute is set to true. diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/checksum/AbstractMetadataChecksummer.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/checksum/AbstractMetadataChecksummer.java index f7c038c33..18fba1dd9 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/checksum/AbstractMetadataChecksummer.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/checksum/AbstractMetadataChecksummer.java @@ -34,16 +34,7 @@ *

* Implementors do not need to store the checksum themselves, this abstract * class does it. - *

- * Implementors should offer this XML configuration usage: *

- * - * {@nx.xml #usage - * - * keep="[false|true]" - * toField="(optional metadata field to store the checksum)" - * onSet="[append|prepend|replace|optional]" /> - * } *

* toField is ignored unless the keep * attribute is set to true. diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilter.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilter.java index a66bd825e..3c3709e5a 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilter.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilter.java @@ -39,24 +39,6 @@ * Extensions are typically the last characters of a file name, after the * last dot. *

- * - * {@nx.xml.usage - * - * (comma-separated list of extensions) - * - * } - * - * {@nx.xml.example - * - * html,htm,php,asp - * - * } - *

- * The above example will only accept references with the following - * extensions: .html, .htm, .php, and .asp. - *

*/ @EqualsAndHashCode @ToString diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilterConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilterConfig.java index bce5e5265..f8a175073 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilterConfig.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilterConfig.java @@ -16,7 +16,6 @@ import java.util.Collections; import java.util.HashSet; -import java.util.List; import java.util.Set; import com.norconex.commons.lang.collection.CollectionUtil; @@ -31,24 +30,6 @@ * Extensions are typically the last characters of a file name, after the * last dot. *

- * - * {@nx.xml.usage - * - * (comma-separated list of extensions) - * - * } - * - * {@nx.xml.example - * - * html,htm,php,asp - * - * } - *

- * The above example will only accept references with the following - * extensions: .html, .htm, .php, and .asp. - *

*/ @Data @Accessors(chain = true) @@ -63,7 +44,7 @@ public Set getExtensions() { } public ExtensionReferenceFilterConfig setExtensions( - List extensions) { + Set extensions) { CollectionUtil.setAll(this.extensions, extensions); return this; } diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericMetadataFilter.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericMetadataFilter.java index 91d27503d..e6946ae92 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericMetadataFilter.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericMetadataFilter.java @@ -34,32 +34,7 @@ * Accepts or rejects a reference based on whether one or more * metadata field values are matching. *

- * - * {@nx.xml.usage - * - * - * (Expression matching one or more fields to evaluate.) - * - * - * (Expression matching one or more values from matching fields.) - * - * - * } - * - * {@nx.xml.example - * - * Content-Type - * application/zip - * - * } - *

- * Used in a web context, the above example filters out Zip documents base - * on a "Content-Type" metadata field. - *

- * */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString public class GenericMetadataFilter implements diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericMetadataFilterConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericMetadataFilterConfig.java index 10d49b2ad..8e1de1ae2 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericMetadataFilterConfig.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericMetadataFilterConfig.java @@ -25,32 +25,7 @@ * Accepts or rejects a reference based on whether one or more * metadata field values are matching. *

- * - * {@nx.xml.usage - * - * - * (Expression matching one or more fields to evaluate.) - * - * - * (Expression matching one or more values from matching fields.) - * - * - * } - * - * {@nx.xml.example - * - * Content-Type - * application/zip - * - * } - *

- * Used in a web context, the above example filters out Zip documents base - * on a "Content-Type" metadata field. - *

- * */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class GenericMetadataFilterConfig { diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericReferenceFilter.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericReferenceFilter.java index e1de39af0..b829e39ee 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericReferenceFilter.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericReferenceFilter.java @@ -35,27 +35,8 @@ *

* Filters URL based on a matching expression. *

- * - * {@nx.xml.usage - * - * - * (Expression matching the document reference.) - * - * - * } - * - * {@nx.xml.example - * - * .*/login/.* - * - * } - *

- * The above will reject documents having "/login/" in their reference. - *

* @see Pattern */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString public class GenericReferenceFilter implements diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericReferenceFilterConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericReferenceFilterConfig.java index 9611a5088..4fe018183 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericReferenceFilterConfig.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericReferenceFilterConfig.java @@ -26,27 +26,8 @@ *

* Filters URL based on a matching expression. *

- * - * {@nx.xml.usage - * - * - * (Expression matching the document reference.) - * - * - * } - * - * {@nx.xml.example - * - * .*/login/.* - * - * } - *

- * The above will reject documents having "/login/" in their reference. - *

* @see Pattern */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class GenericReferenceFilterConfig { diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/spoil/impl/GenericSpoiledReferenceStrategizer.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/spoil/impl/GenericSpoiledReferenceStrategizer.java index d963bf942..690d82368 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/spoil/impl/GenericSpoiledReferenceStrategizer.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/spoil/impl/GenericSpoiledReferenceStrategizer.java @@ -41,28 +41,6 @@ * BAD_STATUSGRACE_ONCE * ERRORGRACE_ONCE * - * - * {@nx.xml.usage - * - * - * (repeat mapping tag as needed) - * - * } - * - * {@nx.xml.example - * - * - * - * - * - * } - *

- * The above example indicates we should ignore (do nothing) errors processing - * documents, and send a deletion request if they are not found or have - * resulted in a bad status. - *

*/ @EqualsAndHashCode @ToString @@ -83,8 +61,8 @@ public SpoiledReferenceStrategy resolveSpoiledReferenceStrategy( strategy = configuration.getFallbackStrategy(); } if (strategy == null) { - strategy = - GenericSpoiledReferenceStrategizerConfig.DEFAULT_FALLBACK_STRATEGY; + strategy = GenericSpoiledReferenceStrategizerConfig + .DEFAULT_FALLBACK_STRATEGY; } return strategy; } diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/spoil/impl/GenericSpoiledReferenceStrategizerConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/spoil/impl/GenericSpoiledReferenceStrategizerConfig.java index f4da05074..0eea56850 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/spoil/impl/GenericSpoiledReferenceStrategizerConfig.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/spoil/impl/GenericSpoiledReferenceStrategizerConfig.java @@ -19,7 +19,6 @@ import com.fasterxml.jackson.annotation.JsonIgnore; import com.norconex.crawler.core.doc.CrawlDocState; -import com.norconex.crawler.core.doc.operations.spoil.SpoiledReferenceStrategizer; import com.norconex.crawler.core.doc.operations.spoil.SpoiledReferenceStrategy; import lombok.Data; @@ -27,43 +26,7 @@ /** *

- * Generic implementation of {@link SpoiledReferenceStrategizer} that - * offers a simple mapping between the crawl state of references that have - * turned "bad" and the strategy to adopt for each. - * Whenever a crawl state does not have a strategy associated, the fall-back - * strategy is used (default being DELETE). - *

- *

- * The mappings defined by default are as follow: - *

- * - * - * - * - * - * - *
Crawl stateStrategy
NOT_FOUNDDELETE
BAD_STATUSGRACE_ONCE
ERRORGRACE_ONCE
- * - * {@nx.xml.usage - * - * - * (repeat mapping tag as needed) - * - * } - * - * {@nx.xml.example - * - * - * - * - * - * } - *

- * The above example indicates we should ignore (do nothing) errors processing - * documents, and send a deletion request if they are not found or have - * resulted in a bad status. + * Configuration for {@link GenericSpoiledReferenceStrategizer}. *

*/ @Data diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/pipelines/queue/QueuePipeline.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/pipelines/queue/QueuePipeline.java index 8e2fcdb52..34edc4f16 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/pipelines/queue/QueuePipeline.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/pipelines/queue/QueuePipeline.java @@ -67,6 +67,7 @@ public boolean isQueueInitialized() { * Otherwise the method returns when initial queuing has completed. * If no queue initializer was provided, returns right away with * true (initialized). + * @param crawler the crawler * @return queue initialization completion status */ public MutableBoolean initializeQueue(Crawler crawler) { diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/DeleteRejectedEventListener.java b/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/DeleteRejectedEventListener.java index a49eaa42c..6ec5099c5 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/DeleteRejectedEventListener.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/DeleteRejectedEventListener.java @@ -46,7 +46,7 @@ * REJECTED_. To avoid performance issues when dealing with * too many deletion requests, it is recommended you can change this behavior * to match exactly the events you are interested in with - * {@link #setEventMatcher(TextMatcher)}. + * {@link DeleteRejectedEventListenerConfig#setEventMatcher(TextMatcher)}. * Keep limiting events to "rejected" ones to avoid unexpected results. *

* @@ -68,30 +68,7 @@ * references. Be aware this can cause issues if you are using rules in your * committer (e.g., to route requests) based on metadata. *

- * - * {@nx.xml.usage - * - * - * (event name-matching expression) - * - * - * } - * - * {@nx.xml.example - * - * REJECTED_NOTFOUND,REJECTED_FILTER - * - * } - *

- * The above example will send deletion requests whenever a reference is not - * found (e.g., a 404 response from a web server) or if it was filtered out - * by the crawler. - *

- * */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString @Slf4j diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/DeleteRejectedEventListenerConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/DeleteRejectedEventListenerConfig.java index e6af97659..b79da8b0a 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/DeleteRejectedEventListenerConfig.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/DeleteRejectedEventListenerConfig.java @@ -15,71 +15,15 @@ package com.norconex.crawler.core.event.listeners; import com.norconex.commons.lang.text.TextMatcher; -import com.norconex.crawler.core.event.CrawlerEvent; import lombok.Data; import lombok.experimental.Accessors; /** *

- * Provides the ability to send deletion requests to your configured - * committer(s) whenever a reference is rejected, regardless whether it was - * encountered in a previous crawling session or not. + * Configuration for {@link DeleteRejectedEventListener}. *

- * - *

Supported events

- *

- * By default this listener will send deletion requests for all references - * associated with a {@link CrawlerEvent} name starting with - * REJECTED_. To avoid performance issues when dealing with - * too many deletion requests, it is recommended you can change this behavior - * to match exactly the events you are interested in with - * {@link #setEventMatcher(TextMatcher)}. - * Keep limiting events to "rejected" ones to avoid unexpected results. - *

- * - *

Deletion requests sent once

- *

- * This class tries to handles each reference for "rejected" events only once. - * To do so it will queue all such references and wait until normal - * crawler completion to send them. Waiting for completion also gives this - * class a chance to listen for deletion requests sent to your committer as - * part of the crawler regular execution (typically on subsequent crawls). - * This helps ensure you do not get duplicate deletion requests for the same - * reference. - *

- * - *

Only references

- *

- * Since several rejection events are triggered before document are processed, - * we can't assume there is any metadata attached with rejected - * references. Be aware this can cause issues if you are using rules in your - * committer (e.g., to route requests) based on metadata. - *

- * - * {@nx.xml.usage - * - * - * (event name-matching expression) - * - * - * } - * - * {@nx.xml.example - * - * REJECTED_NOTFOUND,REJECTED_FILTER - * - * } - *

- * The above example will send deletion requests whenever a reference is not - * found (e.g., a 404 response from a web server) or if it was filtered out - * by the crawler. - *

- * */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class DeleteRejectedEventListenerConfig { diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/StopCrawlerOnMaxEventListener.java b/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/StopCrawlerOnMaxEventListener.java index 7decc2d8b..8f01b34d4 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/StopCrawlerOnMaxEventListener.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/StopCrawlerOnMaxEventListener.java @@ -77,30 +77,7 @@ * have reached the maximum. * * - * - * {@nx.xml.usage - * - * - * (event name-matching expression) - * - * - * } - * - * {@nx.xml.example - * - * DOCUMENT_COMMITTED_UPSERT,DOCUMENT_COMMITTED_DELETE - * - * } - *

- * The above example will stop the crawler when the sum of committed documents - * (upserts + deletions) reaches 100. - *

*/ -@SuppressWarnings("javadoc") @Slf4j @EqualsAndHashCode @ToString diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/StopCrawlerOnMaxEventListenerConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/StopCrawlerOnMaxEventListenerConfig.java index 785838bc4..4d83f4dce 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/StopCrawlerOnMaxEventListenerConfig.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/StopCrawlerOnMaxEventListenerConfig.java @@ -15,80 +15,15 @@ package com.norconex.crawler.core.event.listeners; import com.norconex.commons.lang.text.TextMatcher; -import com.norconex.crawler.core.CrawlerConfig; import lombok.Data; import lombok.experimental.Accessors; /** *

- * Alternative to {@link CrawlerConfig#setMaxDocuments(int)} for stopping - * the crawler upon reaching specific event counts. The event counts are only - * kept for a crawling session. They are reset to zero upon restarting - * the crawler. - *

- *

- * Not specifying any maximum or events has no effect. - *

- * - *

Difference with "maxDocuments"

- *

- * The "maxDocuments" option deals with "processed" documents. Those are - * documents that were initially queued for crawling and crawling was attempted - * on them, whether that exercise what successful or not. That is, - * "maxDocuments" will not count documents that were sent to your committer - * for additions or deletions, but also documents that were rejected - * by your Importer configuration, produced errors, etc. - * This class gives you more control over what should trigger a crawler to stop. - *

- *

- * Note that for this class to take effect, make sure that "maxDocuments" has - * a high enough number or is set -1 (unlimited). - *

- * - *

Combining events

- *

- * If your event matcher matches more than one event, you can decide what - * should be the expected behavior. Options are: - *

- *
    - *
  • - * any: Stop the crawler when any of the matching event count - * reaches the specified maximum. - *
  • - *
  • - * all: Stop the crawler when all of the matching event counts - * have reached the maximum. - *
  • - *
  • - * sum: Stop the crawler when the sum of all matching event counts - * have reached the maximum. - *
  • - *
- * - * {@nx.xml.usage - * - * - * (event name-matching expression) - * - * - * } - * - * {@nx.xml.example - * - * DOCUMENT_COMMITTED_UPSERT,DOCUMENT_COMMITTED_DELETE - * - * } - *

- * The above example will stop the crawler when the sum of committed documents - * (upserts + deletions) reaches 100. + * Configuration for {@link StopCrawlerOnMaxEventListener}. *

*/ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class StopCrawlerOnMaxEventListenerConfig { diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/fetch/AbstractFetcher.java b/crawler/core/src/main/java/com/norconex/crawler/core/fetch/AbstractFetcher.java index abaa6121c..81228e306 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/fetch/AbstractFetcher.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/fetch/AbstractFetcher.java @@ -19,7 +19,6 @@ import com.norconex.commons.lang.config.Configurable; import com.norconex.commons.lang.event.Event; import com.norconex.commons.lang.event.EventListener; -import com.norconex.commons.lang.xml.XmlConfigurable; import com.norconex.crawler.core.Crawler; import com.norconex.crawler.core.doc.operations.filter.FilterGroupResolver; import com.norconex.crawler.core.doc.operations.filter.ReferenceFilter; @@ -43,33 +42,6 @@ * It also offers methods to overwrite in order to react to crawler * startup and shutdown events. *

- *

XML configuration usage:

- * Subclasses inherit this {@link XmlConfigurable} configuration: - * - * {@nx.xml.usage #referenceFilters - * - * - * - * (Restrict usage of this fetcher to matching reference filters. - * Refer to the documentation for the ReferenceFilter implementation - * you are using here for usage details.) - * - * - * } - * - *

Usage example:

- *

- * This XML snippet is an example of filter that restricts the application of - * this Fetcher to references ending with ".pdf". - *

- * - * {@nx.xml.example - * - * - * .*\.pdf$ - * - * - * } * * @param fetcher request type * @param fetcher response type diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/fetch/BaseFetcherConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/fetch/BaseFetcherConfig.java index d891f747c..ede5609cf 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/fetch/BaseFetcherConfig.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/fetch/BaseFetcherConfig.java @@ -19,7 +19,6 @@ import java.util.List; import com.norconex.commons.lang.collection.CollectionUtil; -import com.norconex.commons.lang.xml.XmlConfigurable; import com.norconex.crawler.core.doc.operations.filter.ReferenceFilter; import lombok.Data; @@ -36,33 +35,6 @@ * It also offers methods to overwrite in order to react to crawler * startup and shutdown events. *

- *

XML configuration usage:

- * Subclasses inherit this {@link XmlConfigurable} configuration: - * - * {@nx.xml.usage #referenceFilters - * - * - * - * (Restrict usage of this fetcher to matching reference filters. - * Refer to the documentation for the ReferenceFilter implementation - * you are using here for usage details.) - * - * - * } - * - *

Usage example:

- *

- * This XML snippet is an example of filter that restricts the application of - * this Fetcher to references ending with ".pdf". - *

- * - * {@nx.xml.example - * - * - * .*\.pdf$ - * - * - * } */ @Data @Accessors(chain = true) diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/fetch/FetchException.java b/crawler/core/src/main/java/com/norconex/crawler/core/fetch/FetchException.java index 12546546c..31c3e6615 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/fetch/FetchException.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/fetch/FetchException.java @@ -1,4 +1,4 @@ -/* Copyright 2022-2022 Norconex Inc. +/* Copyright 2022-2024 Norconex Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/fetch/FetchRequest.java b/crawler/core/src/main/java/com/norconex/crawler/core/fetch/FetchRequest.java index c61773344..c9fbccea7 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/fetch/FetchRequest.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/fetch/FetchRequest.java @@ -1,4 +1,4 @@ -/* Copyright 2022-2022 Norconex Inc. +/* Copyright 2022-2024 Norconex Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/fetch/Fetcher.java b/crawler/core/src/main/java/com/norconex/crawler/core/fetch/Fetcher.java index ff902f897..08ebd019a 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/fetch/Fetcher.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/fetch/Fetcher.java @@ -1,4 +1,4 @@ -/* Copyright 2022-2022 Norconex Inc. +/* Copyright 2022-2024 Norconex Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,12 +19,6 @@ * @param fetch request type * @param fetch response type */ -//public interface Fetcher { -// -// boolean accept(FetchRequest fetchRequest); -// -// FetchResponse fetch(FetchRequest fetchRequest) throws FetchException; -//} public interface Fetcher { boolean accept(T fetchRequest); diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/fetch/MultiFetchResponse.java b/crawler/core/src/main/java/com/norconex/crawler/core/fetch/MultiFetchResponse.java index aaa4b0132..6ea5c9d80 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/fetch/MultiFetchResponse.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/fetch/MultiFetchResponse.java @@ -58,32 +58,16 @@ public String getReasonPhrase() { FetchResponse::getReasonPhrase).orElse(null); } - // @Override - // public String getUserAgent() { - // return lastResponse().map( - // FetchResponse::getUserAgent).orElse(null); - // } - // @Override @Override public Exception getException() { return getLastFetchResponse().map( FetchResponse::getException).orElse(null); } - // @Override - // public String getRedirectTarget() { - // return lastResponse().map( - // FetchResponse::getRedirectTarget).orElse(null); - // } public List getFetchResponses() { return Collections.unmodifiableList(fetchResponses); } - // @Override - // public void addFetchResponse(T resp, Fetcher fetcher) { - // fetchResponses.add(0, new ImmutablePair<>(resp, fetcher)); - // } - protected Optional getLastFetchResponse() { if (fetchResponses.isEmpty()) { return Optional.empty(); @@ -103,8 +87,6 @@ public String toString() { var r = op.get(); var b = new StringBuilder( r.getStatusCode() + " " + r.getReasonPhrase()); - // lastFetcher().ifPresent(f -> b.append( - // " - " + f.getClass().getSimpleName())); return b.toString(); } } diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/fetch/MultiFetcher.java b/crawler/core/src/main/java/com/norconex/crawler/core/fetch/MultiFetcher.java index cb6339693..277c5d495 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/fetch/MultiFetcher.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/fetch/MultiFetcher.java @@ -36,9 +36,6 @@ * @param fetcher request type * @param fetcher response type */ -//TODO make it THE fetcher handling multi fetchers transparently and eliminate -// the concept of many vs single.... there is only 1 many in core to rule them all. - @Slf4j public class MultiFetcher implements Fetcher { @@ -47,26 +44,11 @@ public class MultiFetcher private final ResponseListAdapter responseListAdapter; private final UnsuccessfulResponseFactory unsuccessfulResponseFactory; - // + @Getter private final int maxRetries; @Getter private final Duration retryDelay; - // - // @FunctionalInterface - // public interface MultiResponseFactory - // { - // //TODO Document that responses are ordered from first to last - // MultiFetchResponse adapt(Map> responses); - // } - // - // @FunctionalInterface - // public interface UnsuccessfulResponseFactory { - // R adapt(CrawlDocState crawlState, String message, Exception e); - // } - - //TODO drop above two interfaces in favor of a single one and let - // this class handle collections. 
@FunctionalInterface public interface UnsuccessfulResponseFactory { @@ -118,7 +100,6 @@ public R fetch(T fetchRequest) { var doc = fetchRequest.getDoc(); - // Map> allResponses = new ListOrderedMap<>(); List allResponses = new ArrayList<>(); var accepted = false; for (Fetcher fetcher : fetchers) { @@ -133,8 +114,6 @@ public R fetch(T fetchRequest) { accepted = true; for (var retryCount = 0; retryCount <= maxRetries; retryCount++) { var fetchResponse = doFetch(fetcher, fetchRequest, retryCount); - - // allResponses.put(fetchResponse, fetcher); allResponses.add(fetchResponse); doc.getMetadata().add( @@ -161,13 +140,12 @@ public R fetch(T fetchRequest) { + "' for fetch request: " + fetchRequest, null)); - LOG.debug( - """ - No fetcher accepted to fetch this\s\ - reference: "{}".\s\ - For generic reference filtering it is highly recommended you\s\ - use a regular reference filtering options, such as reference\s\ - filters.""", + LOG.debug(""" + No fetcher accepted to fetch this\s\ + reference: "{}".\s\ + For generic reference filtering it is highly recommended you\s\ + use a regular reference filtering options, such as reference\s\ + filters.""", doc.getReference()); } return responseListAdapter.adapt(allResponses); diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/services/DocTrackerService.java b/crawler/core/src/main/java/com/norconex/crawler/core/services/DocTrackerService.java index bc76e0efc..4f8dc78ac 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/services/DocTrackerService.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/services/DocTrackerService.java @@ -54,11 +54,8 @@ * */ @Slf4j -//TODO rename DocProcessingTracker or else? public class DocTrackerService implements Closeable { - // private static final String PROP_STAGE = "processingStage"; - // new ones private DataStore queue; private DataStore active; @@ -95,30 +92,11 @@ public void init() { resuming = !isQueueEmpty() || !isActiveEmpty(); crawler.getState().setResuming(resuming); - - // XXXXXXXXXXXXXX - - //TODO do not do resume/non-resume activities when exporting/importing - // do it only on start() - - // Way to do it.. have open just open and maybe return if - // resuming or not but do nothing - // then, add a new init() or prepare() method that will only be called - // by crawler start() - - // or maybe, move below code out of here. - - // XXXXXXXXXXXXXX - } - //MAYBE: Move elsewhere since only used once, when starting crawler? - // prepare for processing start public void prepareForCrawl() { - // var resuming = !isQueueEmpty() || !isActiveEmpty(); - if (resuming) { // Active -> Queued diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/store/DataStoreEngine.java b/crawler/core/src/main/java/com/norconex/crawler/core/store/DataStoreEngine.java index 4bcee491d..c1cffe65c 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/store/DataStoreEngine.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/store/DataStoreEngine.java @@ -19,27 +19,76 @@ import java.util.Set; import com.fasterxml.jackson.annotation.JsonIgnore; +import com.norconex.committer.core.Committer; import com.norconex.crawler.core.Crawler; +/** + * Database of some kind used by the crawler to store minimum information + * required to operate (e.g., tracking its progress, remember previously + * crawled documents, ...). Not to be confused with {@link Committer}, which + * is used as the target destination for crawled content. 
+ */ public interface DataStoreEngine extends Closeable { + /** + * Initializes the data store. Called once per crawler execution, before + * crawling begins. + * @param crawler the crawler instance + */ void init(Crawler crawler); + /** + * Wipe out all stores in the data store. Same as invoking + * {@link #dropStore(String)} for each of the existing stores. + * @return true if there were stores to clean + */ boolean clean(); + /** + * Closes the data store. Called once after crawling completes. + */ @Override void close(); + /** + * Opens (and creates as needed) a store of the given name to store + * and retrieve objects of the given type. + * @param class type of the object kept in the store + * @param name store name + * @param type a class for type of the object kept in the store + * @return a data store + */ DataStore openStore(String name, Class type); + /** + * Eliminates the store matching the given name. + * @param name the name of the store to delete + * @return true if there was a store to delete + */ boolean dropStore(String name); - // returns true if target was deleted + /** + * Rename a data store. + * @param dataStore the data store to rename + * @param newName the new name + * @return true if a store already exist with the new name and + * had to first be deleted + */ boolean renameStore(DataStore dataStore, String newName); + /** + * Gets the name of all stores in the engine. + * @return a set of stores + */ @JsonIgnore Set getStoreNames(); + /** + * Gets the type of an existing store, or empty if there are no stores + * of that name. + * @param name store name + * @return optional with the store type + */ @JsonIgnore Optional> getStoreType(String name); } diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/store/DataStoreExporter.java b/crawler/core/src/main/java/com/norconex/crawler/core/store/DataStoreExporter.java index 3f3e00164..f0cf834b4 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/store/DataStoreExporter.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/store/DataStoreExporter.java @@ -83,7 +83,7 @@ private static void exportStore( Class type) throws IOException { var writer = SerialUtil.jsonGenerator(out); - //TODO add "nice" option? + //MAYBE add "nice" option? //writer.setIndent(" "); var qty = store.count(); diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/store/DataStoreImporter.java b/crawler/core/src/main/java/com/norconex/crawler/core/store/DataStoreImporter.java index 1984b6b45..3329f396d 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/store/DataStoreImporter.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/store/DataStoreImporter.java @@ -51,9 +51,8 @@ public static void importDataStore(Crawler crawler, Path inFile) var zipEntry = zipIn.getNextEntry(); //NOSONAR while (zipEntry != null) { if (!importStore(crawler, zipIn)) { - LOG.debug( - "Input file \"{}\" not matching crawler " - + "\"{}\". Skipping.", + LOG.debug("Input file \"{}\" not matching crawler " + + "\"{}\". 
Skipping.", inFile, crawler.getId()); } zipIn.closeEntry(); @@ -119,8 +118,7 @@ private static boolean importStore( private static void logProgress(long cnt, boolean done) { if (LOG.isInfoEnabled() && (cnt % 10000 == 0 ^ done)) { - LOG.info( - "{} imported.", + LOG.info("{} imported.", NumberFormat.getIntegerInstance().format(cnt)); } } diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/jdbc/JdbcDataStoreEngine.java b/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/jdbc/JdbcDataStoreEngine.java index 0963d6283..1165920bf 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/jdbc/JdbcDataStoreEngine.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/jdbc/JdbcDataStoreEngine.java @@ -59,54 +59,17 @@ * Refer to * * Hikari's documentation for all configuration options. The Hikari options - * are passed as-is, via datasource properties as shown below. + * are passed as-is, via + * {@link JdbcDataStoreEngineConfig#setProperties(java.util.Properties)}. *

*

 * <h3>Data types</h3>
 * <p>
 * This class only use a few data types to store its data in a generic way.
 * It will try to detect what data type to use for your database. If you
 * get errors related to field data types not being supported, you have
- * the option to redefined them.
- * </p>
- *
- * {@nx.xml.usage ... } (removed XML usage snippet; it described a property
- * value placeholder and an optional table prefix: default was the collector
- * id plus the crawler id, each followed by an underscore, with spaces
- * converted to underscores, unsupported characters stripped, and supported
- * characters being alphanumeric, period, and underscore)
- *
- * {@nx.xml.example ... } (removed MySQL example with the values
- * jdbc:mysql://localhost:33060/sample, dbuser, dbpwd, and 1000)
- * </p>
- * The above example contains basic settings for creating a MySQL data source.
+ * the option to provide your own SQL statements for creating tables
+ * and doing "upsert" requests. Refer to {@link JdbcDialect} source code
+ * for SQL examples.
 * </p>

*/ @Slf4j @@ -122,7 +85,7 @@ public class JdbcDataStoreEngine // table id field is store name private JdbcDataStore storeTypes; private JdbcDialect dialect; - private String tableSessionPrefix; + private String resolvedTablePrefix; @Getter private JdbcDataStoreEngineConfig configuration = @@ -130,11 +93,10 @@ public class JdbcDataStoreEngine @Override public void init(Crawler crawler) { - tableSessionPrefix = safeTableName( - isBlank( - configuration.getTablePrefix()) - ? crawler.getId() + "_" - : configuration.getTablePrefix()); + resolvedTablePrefix = safeTableName( + isBlank(configuration.getTablePrefix()) + ? crawler.getId() + "_" + : configuration.getTablePrefix()); // create data source datasource = new HikariDataSource( @@ -231,10 +193,10 @@ public Set getStoreNames() { while (rs.next()) { var tableName = rs.getString(3); if (startsWithIgnoreCase( - tableName, tableSessionPrefix)) { + tableName, resolvedTablePrefix)) { // only add if not the table holding store types var storeName = removeStartIgnoreCase( - tableName, tableSessionPrefix); + tableName, resolvedTablePrefix); if (!STORE_TYPES_NAME.equalsIgnoreCase(storeName)) { names.add(storeName); } @@ -276,7 +238,7 @@ Connection getConnection() { } String toTableName(String storeName) { - return tableSessionPrefix + safeTableName(storeName); + return resolvedTablePrefix + safeTableName(storeName); } boolean tableExist(String tableName) { diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/jdbc/JdbcDataStoreEngineConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/jdbc/JdbcDataStoreEngineConfig.java index b87f5b4d5..503c0a1da 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/jdbc/JdbcDataStoreEngineConfig.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/jdbc/JdbcDataStoreEngineConfig.java @@ -21,83 +21,46 @@ /** *

- * Data store engine using a JDBC-compatible database for storing
- * crawl data.
- * </p>
- *
- * <h3>Database JDBC driver</h3>
- * <p>
- * To use this data store engine, you need its JDBC database driver
- * on the classpath.
- * </p>
- *
- * <h3>Database datasource configuration</h3>
- * <p>
- * This JDBC data store engine uses Hikari as the JDBC
- * datasource implementation, which provides efficient connection-pooling.
- * Refer to Hikari's documentation for all configuration options. The Hikari
- * options are passed as-is, via datasource properties as shown below.
- * </p>
- *
- * <h3>Data types</h3>
- * <p>
- * This class only use a few data types to store its data in a generic way.
- * It will try to detect what data type to use for your database. If you
- * get errors related to field data types not being supported, you have
- * the option to redefined them.
- * </p>
- *
- * {@nx.xml.usage ... } and {@nx.xml.example ... } (removed XML snippets,
- * duplicating the ones removed from JdbcDataStoreEngine above)
- * </p>
- * The above example contains basic settings for creating a MySQL data source.
+ * Configuration for {@link JdbcDataStoreEngine}.
 * </p>

*/ @Data @Accessors(chain = true) public class JdbcDataStoreEngineConfig { + /** + * Optional prefix used for table creation. Default is the crawler id, + * followed by an underscore character. + * The value is first modified to convert spaces to underscores, and + * to strip unsupported characters. The supported + * characters are: alphanumeric, period, and underscore. + */ private String tablePrefix; + + /** + * Explicitly set the JDBC dialect to use when generating SQL. + * By default, an attempt is made to detect it automatically. + * Not really useful if you provide your own SQL statements for table + * creation and upserts. + */ private JdbcDialect dialect; + + /** + * For advanced use. Optional SQL to create new tables. + * Refer to {@link JdbcDialect} source code for examples. + */ private String createTableSql; + /** + * For advanced use. Optional SQL used for "upsert" requests. + * Refer to {@link JdbcDialect} source code for examples. + */ private String upsertSql; + /** + * Connection properties as per + * + * Hikari's documentation. At a minimum, you need to provide + * either a "dataSourceClassName" or a "jdbcUrl". + */ private Properties properties = new Properties(); - // private String varcharType; - // private String timestampType; - // private String textType; }
diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mongodb/MongoDataStoreEngine.java b/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mongodb/MongoDataStoreEngine.java index ecf8538e4..9d80dac50 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mongodb/MongoDataStoreEngine.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mongodb/MongoDataStoreEngine.java @@ -51,9 +51,6 @@ *

* Data store engine using MongoDB for storing crawl data. *

- * {@nx.xml.usage - * (MongoDB connection string.) - * } */ @Slf4j @EqualsAndHashCode diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mongodb/MongoDataStoreEngineConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mongodb/MongoDataStoreEngineConfig.java index bb1a707b6..a9609a7d0 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mongodb/MongoDataStoreEngineConfig.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mongodb/MongoDataStoreEngineConfig.java @@ -18,12 +18,8 @@ /** *

- * Data store engine using MongoDB for storing crawl data. + * Configuration for {@link MongoDataStoreEngine}. *

- * - * {@nx.xml.usage (MongoDB - * connection string.) } - * */ @Data @Accessors(chain = true) diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mvstore/MvStoreDataStoreEngine.java b/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mvstore/MvStoreDataStoreEngine.java index cd90a0a21..622b24a9b 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mvstore/MvStoreDataStoreEngine.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mvstore/MvStoreDataStoreEngine.java @@ -37,6 +37,13 @@ import lombok.ToString; import lombok.extern.slf4j.Slf4j; +/** + *

+ * Embedded data store relying on MvStore. + *

+ * @since 1.10.0 + * @author Pascal Essiembre + */ @EqualsAndHashCode @ToString @Slf4j @@ -114,13 +121,12 @@ public void init(Crawler crawler) { try { mvstore = builder.open(); } catch (MVStoreException e) { - LOG.warn( - """ - An exception occurred while trying to open the store engine.\s\ - This could happen due to an abnormal shutdown on a previous\s\ - execution of the crawler. An attempt will be made to recover.\s\ - It is advised to back-up the store engine if you want to\s\ - preserve the crawl history.""", + LOG.warn(""" + An exception occurred while trying to open the store engine.\s\ + This could happen due to an abnormal shutdown on a previous\s\ + execution of the crawler. An attempt will be made to recover.\s\ + It is advised to back-up the store engine if you want to\s\ + preserve the crawl history.""", e); builder.recoveryMode(); mvstore = builder.open(); @@ -169,8 +175,8 @@ public synchronized void close() { if (mvstore != null && !mvstore.isClosed()) { LOG.info("Compacting data store..."); mvstore.commit(); - //TODO method dropped from MVStore. Any replacemetn? - //mvstore.compactMoveChunks(); + //NOTE: this method was dropped from MVStore. Any replacement? + // mvstore.compactMoveChunks(); mvstore.close(); } mvstore = null; @@ -224,33 +230,4 @@ public Set getStoreNames() { public Optional> getStoreType(String name) { return Optional.ofNullable(storeTypes.get(name)); } - // - // @Override - // public void loadFromXML(XML xml) { - // cfg.setPageSplitSize( - // xml.getDataSize(Fields.pageSplitSize, cfg.getPageSplitSize())); - // cfg.setCompress(xml.getInteger(Fields.compress, cfg.getCompress())); - // cfg.setCacheConcurrency(xml.getInteger( - // Fields.cacheConcurrency, cfg.getCacheConcurrency())); - // cfg.setCacheSize(xml.getDataSize(Fields.cacheSize, cfg.getCacheSize())); - // cfg.setAutoCompactFillRate(xml.getInteger( - // Fields.autoCompactFillRate, cfg.getAutoCompactFillRate())); - // cfg.setAutoCommitBufferSize(xml.getDataSize( - // Fields.autoCommitBufferSize, cfg.getAutoCommitBufferSize())); - // cfg.setAutoCommitDelay(xml.getDurationMillis( - // Fields.autoCommitDelay, cfg.getAutoCommitDelay())); - // cfg.setEphemeral(xml.getBoolean(Fields.ephemeral, cfg.isEphemeral())); - // } - // - // @Override - // public void saveToXML(XML xml) { - // xml.addElement(Fields.pageSplitSize, cfg.getPageSplitSize()); - // xml.addElement(Fields.compress, cfg.getCompress()); - // xml.addElement(Fields.cacheConcurrency, cfg.getCacheConcurrency()); - // xml.addElement(Fields.cacheSize, cfg.getCacheSize()); - // xml.addElement(Fields.autoCompactFillRate, cfg.getAutoCompactFillRate()); - // xml.addElement(Fields.autoCommitBufferSize, cfg.getAutoCommitBufferSize()); - // xml.addElement(Fields.autoCommitDelay, cfg.getAutoCommitDelay()); - // xml.addElement(Fields.ephemeral, cfg.isEphemeral()); - // } } diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mvstore/MvStoreDataStoreEngineConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mvstore/MvStoreDataStoreEngineConfig.java index d1a9c5a3a..18e55cb09 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mvstore/MvStoreDataStoreEngineConfig.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mvstore/MvStoreDataStoreEngineConfig.java @@ -19,24 +19,29 @@ import lombok.experimental.FieldNameConstants; /** -*

-* MVStore configuration parameters. For advanced use only. -* Differences from MVStore defaults: -* All data size values are expected to be set in bytes. -* Light compression is enabled by default (compress = 1) -*

-*

-* For more info: -*

-* -* @since 1.10.0 -* @author Pascal Essiembre -*/ + *

+ * Configuration for {@link MvStoreDataStoreEngine}.
+ * </p>
+ * <p>
+ * Changing default values is for advanced use only. Differences from MVStore
+ * defaults:
+ * </p>
+ * <ul>
+ *   <li>All data size values are expected to be set in bytes.</li>
+ *   <li>Light compression is enabled by default (compress = 1)</li>
+ * </ul>
+ * <p>
+ * For more info:
+ * </p>
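As a quick illustration of the values described above (sizes in bytes, light compression by default), here is a hedged sketch of building this configuration through the Lombok-generated chained setters. How the resulting object is handed to MvStoreDataStoreEngine is not shown in this diff.

    // Illustrative sketch only; all size values are in bytes.
    var cfg = new MvStoreDataStoreEngineConfig()
            .setCacheSize(64L * 1024 * 1024)   // 64 MB read cache
            .setAutoCommitDelay(2_000L)        // auto-commit every 2 seconds
            .setCompress(1)                    // light compression (already the default)
            .setEphemeral(false);              // persist data (normal behavior)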

+ * + * @since 1.10.0 + * @author Pascal Essiembre + */ @Data @Accessors(chain = true) @FieldNameConstants @@ -45,10 +50,7 @@ public class MvStoreDataStoreEngineConfig { /** * The max memory page size in bytes before splitting it. * Defaults to 4KB for memory, and 16KB for disk. - * @param pageSplitSize split size - * @return page size */ - @SuppressWarnings("javadoc") private Long pageSplitSize; /** @@ -58,52 +60,34 @@ public class MvStoreDataStoreEngineConfig { *
  • 1: Low compression (default)
  • 2: High compression
  • * - * @param compress level of compression - * @return level of compression */ - @SuppressWarnings("javadoc") private Integer compress = 1; /** * The maximum number of concurrent operations when reading from * the store cache. Default is 16. - * @param cacheConcurrency maximum number of concurrent operations - * @return maximum number of concurrent operations */ - @SuppressWarnings("javadoc") private Integer cacheConcurrency = 16; /** * The read cache size in bytes. Default is 16MB. - * @param cacheSize read cache size - * @return read cache size */ - @SuppressWarnings("javadoc") private Long cacheSize; /** * The auto-compact target fill rate, in percentage. Default is 90%. - * @param autoCompactFillRate auto compact fill rate - * @return auto compact fill rate */ - @SuppressWarnings("javadoc") private Integer autoCompactFillRate; /** * The size of the write buffer. Defaults to 1024KB. - * @param autoCommitBufferSize size of the write buffer - * @return size of the write buffer */ - @SuppressWarnings("javadoc") private Long autoCommitBufferSize; /** * The maximum delay in milliseconds to auto-commit changes. Defaults * to 1000ms (1 second). - * @param autoCommitDelay maximum delay to auto-commit changes - * @return maximum delay to auto-commit changes */ - @SuppressWarnings("javadoc") private Long autoCommitDelay; /** @@ -112,9 +96,6 @@ public class MvStoreDataStoreEngineConfig { * normal behavior). * Not recommended for regular use. * Useful for testing and troubleshooting, or if you know what your doing. - * @param ephemeral whether to persist store data or keep it all in memory - * @return true if only using memory (data is not persisted) */ - @SuppressWarnings("javadoc") private boolean ephemeral; } \ No newline at end of file diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/util/About.java b/crawler/core/src/main/java/com/norconex/crawler/core/util/About.java index de4b4979a..b01a4a1e3 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/util/About.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/util/About.java @@ -28,33 +28,31 @@ import com.norconex.committer.core.Committer; import com.norconex.commons.lang.PackageManifest; -import com.norconex.crawler.core.Crawler; import com.norconex.crawler.core.CrawlerConfig; +import lombok.NonNull; + public final class About { /** Simple ASCI art of Norconex. */ - public static final String NORCONEX_ASCII = - """ - _ _ ___ ____ ____ ___ _ _ _______ __ - | \\ | |/ _ \\| _ \\ / ___/ _ \\| \\ | | ____\\ \\/ / - | \\| | | | | |_) | | | | | | \\| | _| \\ /\s - | |\\ | |_| | _ <| |__| |_| | |\\ | |___ / \\\s - |_| \\_|\\___/|_| \\_\\\\____\\___/|_| \\_|_____/_/\\_\\ + public static final String NORCONEX_ASCII = """ + _ _ ___ ____ ____ ___ _ _ _______ __ + | \\ | |/ _ \\| _ \\ / ___/ _ \\| \\ | | ____\\ \\/ / + | \\| | | | | |_) | | | | | | \\| | _| \\ /\s + | |\\ | |_| | _ <| |__| |_| | |\\ | |___ / \\\s + |_| \\_|\\___/|_| \\_\\\\____\\___/|_| \\_|_____/_/\\_\\ - ================ C R A W L E R ================ - """; + ================ C R A W L E R ================ + """; private About() { } - public static String about(CrawlerConfig config) { + public static String about(@NonNull CrawlerConfig config) { try (var sw = new StringWriter(); var w = new PrintWriter(sw, true)) { w.println(NORCONEX_ASCII); - // version - // w.println("Version: " + releaseVersion(Crawler.class)); //TODO pass class from crawler impl so we have the name ? 
(web vs file) - w.println("Version:\n " + releaseVersion(Crawler.class)); //TODO pass class from crawler impl so we have the name ? (web vs file) + w.println("Version:\n " + releaseVersion(config.getClass())); // committer var committerClasses = configuredCommitters(config); @@ -67,16 +65,6 @@ public static String about(CrawlerConfig config) { w.println(" "); } - // var prefix = "Committers: "; - // if (CollectionUtils.isNotEmpty(committerClasses)) { - // for (Class cls : committerClasses) { - // w.println(prefix + committerName(cls)); - // prefix = " "; - // } - // } else { - // w.println(" "); - // } - // runtime w.println("Runtime:"); w.println(" Name: " + SystemUtils.JAVA_RUNTIME_NAME); diff --git a/crawler/core/src/test/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilterTest.java b/crawler/core/src/test/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilterTest.java index 6c12591c3..cf36fba8f 100644 --- a/crawler/core/src/test/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilterTest.java +++ b/crawler/core/src/test/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilterTest.java @@ -17,7 +17,7 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatNoException; -import java.util.List; +import java.util.Set; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -31,7 +31,7 @@ class ExtensionReferenceFilterTest { @Test void testOnlyDetectExtensionsInLastPathSegment() { - var filter = initFilter(List.of("com", "xml")); + var filter = initFilter(Set.of("com", "xml")); Assertions.assertFalse( filter.acceptReference("http://example.com")); @@ -96,7 +96,7 @@ void testEmpty() { // true (record is accepted) f = new ExtensionReferenceFilter(); f.getConfiguration() - .setExtensions(List.of("blah")) + .setExtensions(Set.of("blah")) .setOnMatch(OnMatch.EXCLUDE); assertThat(f.acceptReference("")).isTrue(); } @@ -105,7 +105,7 @@ void testEmpty() { void testDocumentAndMetadata() { var f = new ExtensionReferenceFilter(); f.getConfiguration() - .setExtensions(List.of("pdf")); + .setExtensions(Set.of("pdf")); assertThat( f.acceptDocument( CrawlDocStubs.crawlDoc( @@ -116,7 +116,7 @@ void testDocumentAndMetadata() { .isTrue(); } - private ExtensionReferenceFilter initFilter(List extensions) { + private ExtensionReferenceFilter initFilter(Set extensions) { var filter = new ExtensionReferenceFilter(); filter.getConfiguration().setExtensions(extensions); return filter; @@ -127,7 +127,7 @@ void testWriteRead() { var f = new ExtensionReferenceFilter(); f.getConfiguration() .setIgnoreCase(true) - .setExtensions(List.of("com", "pdf")) + .setExtensions(Set.of("com", "pdf")) .setOnMatch(OnMatch.EXCLUDE); assertThatNoException().isThrownBy( () -> BeanMapper.DEFAULT.assertWriteRead(f)); diff --git a/crawler/core/src/test/java/com/norconex/crawler/core/mocks/MockNoopDataStore.java b/crawler/core/src/test/java/com/norconex/crawler/core/mocks/MockNoopDataStore.java index 4428e4752..534bc5e28 100644 --- a/crawler/core/src/test/java/com/norconex/crawler/core/mocks/MockNoopDataStore.java +++ b/crawler/core/src/test/java/com/norconex/crawler/core/mocks/MockNoopDataStore.java @@ -24,6 +24,7 @@ /** * Empty store that does nothing. 
+ * @param data store type */ @EqualsAndHashCode @ToString diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/DocImageHandlerConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/DocImageHandlerConfig.java index 0c6774ce2..4b5cfc8e7 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/DocImageHandlerConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/DocImageHandlerConfig.java @@ -66,15 +66,12 @@ public enum DirStructure { protected static final List DEFAULT_TYPES = List.of(Target.DIRECTORY); - // @ToString.Exclude - // @EqualsAndHashCode.Exclude private final List targets = new ArrayList<>(DEFAULT_TYPES); private Path targetDir; private String targetDirField; private DirStructure targetDirStructure = DirStructure.DATETIME; private String targetMetaField; private String imageFormat = DEFAULT_IMAGE_FORMAT; - // private final ImageTransformer imgTransformer = new ImageTransformer(); public List getTargets() { return Collections.unmodifiableList(targets); @@ -84,26 +81,4 @@ public DocImageHandlerConfig setTargets(List targets) { CollectionUtil.setAll(this.targets, targets); return this; } - - // - // @Override - // public void loadFromXML(XML xml) { - // setTargets(xml.getDelimitedEnumList("targets", Target.class, targets)); - // setTargetDir(xml.getPath("targetDir", targetDir)); - // setTargetDirStructure(xml.getEnum("targetDir/@structure", - // DirStructure.class, targetDirStructure)); - // setTargetDirField(xml.getString("targetDir/@field", targetDirField)); - // setTargetMetaField(xml.getString("targetMetaField", targetMetaField)); - // setImageFormat(xml.getString("imageFormat", imageFormat)); - // } - // - // @Override - // public void saveToXML(XML xml) { - // xml.addDelimitedElementList("targets", targets); - // xml.addElement("targetDir", targetDir) - // .setAttribute("structure", targetDirStructure) - // .setAttribute("field", targetDirField); - // xml.addElement("targetMetaField", targetMetaField); - // xml.addElement("imageFormat", imageFormat); - // } } diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/cases/feature/PostImportLinksTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/cases/feature/PostImportLinksTest.java index 411e4ddca..5b1629b23 100644 --- a/crawler/web/src/test/java/com/norconex/crawler/web/cases/feature/PostImportLinksTest.java +++ b/crawler/web/src/test/java/com/norconex/crawler/web/cases/feature/PostImportLinksTest.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.nio.file.Path; import java.util.List; +import java.util.Set; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; @@ -66,12 +67,11 @@ void testPostImportLinksURL(ClientAndServer client) throws IOException { cfg.setPostImportLinksKeep(true); // Keep only the test PDF. cfg.setDocumentFilters( - List.of( - Configurable.configure( - new ExtensionReferenceFilter(), - c -> c - .setExtensions(List.of("pdf")) - .setOnMatch(OnMatch.INCLUDE)))); + List.of(Configurable.configure( + new ExtensionReferenceFilter(), + c -> c + .setExtensions(Set.of("pdf")) + .setOnMatch(OnMatch.INCLUDE)))); // Create a field with post-import PDF URLs. var tagger = new UrlExtractorTransformer(); tagger.getConfiguration().setToField("myPostImportURLs");
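To close, a hedged sketch mirroring the List-to-Set change exercised by the tests above; the extensions and the reference are arbitrary sample values, not anything mandated by this patch.

    // Illustrative sketch: extensions are now supplied as a Set.
    var filter = new ExtensionReferenceFilter();
    filter.getConfiguration()
            .setIgnoreCase(true)
            .setExtensions(Set.of("pdf", "html"))
            .setOnMatch(OnMatch.INCLUDE);
    boolean accepted = filter.acceptReference("http://example.com/doc.pdf"); // true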