diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/Crawler.java b/crawler/core/src/main/java/com/norconex/crawler/core/Crawler.java
index 6c58bace9..b85f80d56 100644
--- a/crawler/core/src/main/java/com/norconex/crawler/core/Crawler.java
+++ b/crawler/core/src/main/java/com/norconex/crawler/core/Crawler.java
@@ -96,7 +96,7 @@ public class Crawler {
? extends FetchRequest, ? extends FetchResponse> fetcher;
private final Class<? extends CrawlDocContext> docContextType;
private CrawlerState state;
- // TODO remove stopper listener when we are fully using a table?
+ // TODO remove stopper listener when we are fully using an accessible store?
private CrawlerStopper stopper = new FileBasedStopper();
// --- Set in init ---
@@ -127,8 +127,8 @@ public class Crawler {
doc -> new UpsertRequest(
doc.getReference(),
doc.getMetadata(),
- doc.getInputStream())) // Closed by
- // caller
+ // InputStream closed by caller
+ doc.getInputStream()))
.deleteRequestBuilder(
doc -> new DeleteRequest(
doc.getReference(),
@@ -194,12 +194,11 @@ public void fire(String eventName) {
}
public void fire(String eventName, Object subject) {
- fire(
- CrawlerEvent.builder()
- .name(eventName)
- .source(this)
- .subject(subject)
- .build());
+ fire(CrawlerEvent.builder()
+ .name(eventName)
+ .source(this)
+ .subject(subject)
+ .build());
}
@Override
@@ -224,31 +223,26 @@ && getState().isExecutionLocked()) {
getState().setStopping(true);
LOG.info("Stopping the crawler.");
} else {
- LOG.info(
- "CANNOT STOP: the targetted crawler does not appear "
- + "to be running on on this host.");
+ LOG.info("CANNOT STOP: the targetted crawler does not appear "
+ + "to be running on on this host.");
}
}
public void exportDataStore(Path exportDir) {
- executeCommand(
- new CommandExecution(this, "STORE_EXPORT")
- .failableCommand(
- () -> DataStoreExporter.exportDataStore(
- this,
- exportDir))
- .lock(true)
- .logIntro(true));
+ executeCommand(new CommandExecution(this, "STORE_EXPORT")
+ .failableCommand(() -> DataStoreExporter.exportDataStore(
+ this,
+ exportDir))
+ .lock(true)
+ .logIntro(true));
}
public void importDataStore(Path file) {
- executeCommand(
- new CommandExecution(this, "STORE_IMPORT")
- .failableCommand(
- () -> DataStoreImporter
- .importDataStore(this, file))
- .lock(true)
- .logIntro(true));
+ executeCommand(new CommandExecution(this, "STORE_IMPORT")
+ .failableCommand(
+ () -> DataStoreImporter.importDataStore(this, file))
+ .lock(true)
+ .logIntro(true));
}
/**
@@ -256,14 +250,13 @@ public void importDataStore(Path file) {
* the crawler was run for the first time.
*/
public void clean() {
- executeCommand(
- new CommandExecution(this, "CLEAN")
- .failableCommand(() -> {
- getServices().getCommitterService().clean();
- dataStoreEngine.clean();
- FileUtils.deleteDirectory(getWorkDir().toFile());
- })
- .lock(true)
- .logIntro(true));
+ executeCommand(new CommandExecution(this, "CLEAN")
+ .failableCommand(() -> {
+ getServices().getCommitterService().clean();
+ dataStoreEngine.clean();
+ FileUtils.deleteDirectory(getWorkDir().toFile());
+ })
+ .lock(true)
+ .logIntro(true));
}
}
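The reorganized command wrappers above (STORE_EXPORT, STORE_IMPORT, CLEAN) all run through CommandExecution with locking and an intro log. As a rough, non-authoritative sketch of how calling code might use these maintenance entry points — how the Crawler instance is obtained is outside this patch, and the paths are placeholders:

```java
import java.nio.file.Path;

import com.norconex.crawler.core.Crawler;

// Illustrative helper built only on the methods visible in this patch.
public final class CrawlerMaintenance {

    private CrawlerMaintenance() {
    }

    // Dumps the crawl data store into the given directory (STORE_EXPORT).
    static void backup(Crawler crawler, Path backupDir) {
        crawler.exportDataStore(backupDir);
    }

    // Wipes committers, the data store engine and the working directory
    // (CLEAN), then reloads a previously exported store file (STORE_IMPORT).
    static void resetFromBackup(Crawler crawler, Path exportedFile) {
        crawler.clean();
        crawler.importDataStore(exportedFile);
    }
}
```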
diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerBuilder.java b/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerBuilder.java
index 33c2437ca..db8a8ebb3 100644
--- a/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerBuilder.java
+++ b/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerBuilder.java
@@ -29,13 +29,11 @@
import lombok.Setter;
import lombok.experimental.Accessors;
-//TODO move to its own class?
//TODO document the optional ones and their default values
@Accessors(fluent = true)
@Setter
@Getter
@NonNull
-@SuppressWarnings("javadoc")
public class CrawlerBuilder {
private CrawlerConfig configuration = new CrawlerConfig();
private DocPipelines docPipelines;
@@ -47,8 +45,6 @@ public class CrawlerBuilder {
/**
* The exact type of {@link CrawlDocContext} if your crawler is subclassing
* it. Defaults to {@link CrawlDocContext} class.
- * @param docContextType crawl doc brief class
- * @return doc brief class
*/
private Class<? extends CrawlDocContext> docContextType =
CrawlDocContext.class;
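Because the builder relies on Lombok fluent accessors (see the @Accessors(fluent = true) annotation above), assignments can be chained. A hedged sketch of setting the fields documented in this hunk — the builder's public constructor, the CrawlDocContext import path, and the subclass are assumptions, not something this patch confirms:

```java
import com.norconex.crawler.core.CrawlerBuilder;
import com.norconex.crawler.core.CrawlerConfig;
import com.norconex.crawler.core.doc.CrawlDocContext;

public class CrawlerBuilderSketch {

    // Hypothetical crawler-specific doc context.
    public static class MyDocContext extends CrawlDocContext {
    }

    // Fluent setters are generated by Lombok and return the builder,
    // so calls can be chained.
    static CrawlerBuilder newBuilder(CrawlerConfig config) {
        return new CrawlerBuilder()
                .configuration(config)
                .docContextType(MyDocContext.class);
    }
}
```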
@@ -56,9 +52,6 @@ public class CrawlerBuilder {
/**
* Provides a required fetcher implementation, responsible for obtaining
* resources being crawled.
- *
- * @param fetcherProvider fetcher provider function
- * @return a function returning a fetcher to associate with a given crawler.
*/
private Function<Crawler, ? extends Fetcher<? extends FetchRequest, ? extends FetchResponse>> fetcherProvider;
diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/CrawlerConfig.java
/**
* Base Crawler configuration. Crawlers usually read this configuration upon
- * starting up. Once execution has started, it should not be changed
- * to avoid unexpected behaviors.
+ * starting up. While not always enforced, once execution has started, it
+ * should be considered immutable to avoid unexpected behaviors.
*
* Concrete implementations inherit the following XML configuration
* options (typically within a <crawler> tag):
* null the collector will use
- * ./work at runtime.
- * @param workDir working directory path
- * @return working directory path
+ * ./work, relative to the execution "current" directory.
*/
private Path workDir = DEFAULT_WORKDIR;
/**
* Maximum number of bytes used for memory caching of all reusable streams
- * at any given time, for faster processing. Defaults to 1 GB.
+ * combined, at any given time, for faster processing. Defaults to 1 GB.
* File-caching is used when the maximum is reached.
- * @param streamCachePoolSize
- * maximum number of bytes for all reusable streams combined
- * @return maximum number of bytes for all reusable streams combined
*/
private long maxStreamCachePoolSize =
ImporterConfig.DEFAULT_MAX_STREAM_CACHE_POOL_SIZE;
@@ -260,50 +133,38 @@ public enum OrphansStrategy {
* Maximum number of bytes used for memory caching of a single reusable
* stream, for faster processing. Defaults to 100 MB. File-caching is
* used when this maximum is reached for a single file, or when the
- * pool size has been reached.
- * @param streamCacheSize
- * maximum number of bytes for a single reusable streams
- * @return maximum number of bytes for a single reusable stream
+ * pool maximum size has been reached.
*/
private long maxStreamCacheSize =
ImporterConfig.DEFAULT_MAX_STREAM_CACHE_SIZE;
/**
- * The amount of time to defer the collector shutdown when it is
+ * The amount of time to defer the crawler shutdown when it is
* done executing. This is useful for giving external processes
* with polling intervals enough time to grab the latest state of
* the collector before it shuts down. Default is zero (does not
* wait to shutdown after completion).
- * @param deferredShutdownDuration duration
- * @return duration
*/
private Duration deferredShutdownDuration = Duration.ZERO;
/**
- * The crawl data store factory.
- * @param dataStoreEngine crawl data store factory.
- * @return crawl data store factory.
+ * The data store engine.
*/
private DataStoreEngine dataStoreEngine = new MvStoreDataStoreEngine();
/**
* Whether the start references should be loaded asynchronously. When
* true, the crawler will start processing the start
- * references in a separate thread as they are added to the queue
- * (as opposed to wait for queue initialization to be complete).
+ * references in one or more separate threads as they are added to the
+ * queue (as opposed to wait for queue initialization to be complete).
* While this may speed up crawling, it may have an unexpected effect on
* accuracy of {@link CrawlDocMetadata#DEPTH}. Use of this option is only
* recommended when start references take a significant time to load.
- * @param startReferencesAsync true if initialized
- * asynchronously
- * @return true if initialized asynchronously
*/
private boolean startReferencesAsync;
/**
* The maximum number of threads a crawler can use. Default is 2.
- * @param numThreads number of threads
- * @return number of threads
*/
@Min(1)
private int numThreads = 2;
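A hedged illustration of the options documented so far. Conventional bean setters are assumed (consistent with the CrawlerConfig#setMaxDocuments(int) accessor referenced later in this change set); the values are arbitrary:

```java
import java.nio.file.Path;
import java.time.Duration;

import com.norconex.crawler.core.CrawlerConfig;

public class CrawlerConfigBasics {

    static CrawlerConfig baseConfig() {
        var cfg = new CrawlerConfig();
        // Where generated files are written (defaults to ./work).
        cfg.setWorkDir(Path.of("./work/my-crawler"));
        // Crawl with 4 threads instead of the default 2.
        cfg.setNumThreads(4);
        // Give polling monitors 5 seconds before the crawler exits.
        cfg.setDeferredShutdownDuration(Duration.ofSeconds(5));
        // Process start references while they are still being queued
        // (only worthwhile when start references are slow to load).
        cfg.setStartReferencesAsync(true);
        return cfg;
    }
}
```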
@@ -322,21 +183,24 @@ public enum OrphansStrategy {
* its documents actively being processed before stopping.
*
- * Reaching the maximum value does not terminate the crawl session but
- * rather pauses it. On next run, the crawler will resume the same session,
+ * Reaching the maximum value will stop the crawler but it will not
+ * otherwise consider the crawler session "complete", but rather
+ * on "pause". On next run, the crawler will resume the same session,
* processing an additional number of documents up to the maximum
* specified.
* This maximum allows crawling one or more sources
* in chunks, processing a maximum number of documents each time.
* When the session fully completes, the next run will start a new
- * crawl session. To prevent resuming an partial crawl session,
- * explicitly clean the crawl session.
+ * crawl session. To prevent resuming a partial crawl session,
+ * explicitly clean the crawl store first.
+ *
+ *
+ * For more control on what events may stop the crawler, consider
+ * configuring a {@link StopCrawlerOnMaxEventListener}.
*
*
* Default is -1 (unlimited).
*
- * @param maxDocuments maximum number of documents that can be processed
- * @return maximum number of documents that can be processed
*/
private int maxDocuments = -1;
@@ -345,8 +209,6 @@ public enum OrphansStrategy {
* is crawler-specific. Examples: levels of sub-directories,
* number of URL clicks to reach a page, etc. Refer to specific crawler
* implementation for details. Default is -1 (unlimited).
- * @param maxDepth maximum depth or -1 for unlimited depth
- * @return maximum depth or -1 for unlimited depth
*/
private int maxDepth = -1;
@@ -358,8 +220,6 @@ public enum OrphansStrategy {
* is also considered "inactive". Default is
* {@value #DEFAULT_IDLE_PROCESSING_TIMEOUT}. A null
* value means no timeouts.
- * @param idleTimeout time to wait for a document to be processed
- * @return time to wait for a document to be processed
*/
private Duration idleTimeout;
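A minimal sketch of the "crawl in chunks" behavior described for maxDocuments; setMaxDocuments is referenced elsewhere in this patch, while setMaxDepth is assumed to follow the same bean convention:

```java
import com.norconex.crawler.core.CrawlerConfig;

public class ChunkedCrawlSketch {

    static CrawlerConfig chunkedConfig() {
        var cfg = new CrawlerConfig();
        // Pause the session after roughly 10,000 processed documents;
        // the next run resumes where this one left off.
        cfg.setMaxDocuments(10_000);
        // Keep the crawl shallow while experimenting.
        cfg.setMaxDepth(3);
        return cfg;
    }
}
```

For stopping on other conditions (e.g., committed documents), the StopCrawlerOnMaxEventListener referenced above is the more flexible option.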
@@ -369,9 +229,6 @@ public enum OrphansStrategy {
* Default value is {@value #DEFAULT_MIN_PROGRESS_LOGGING_INTERVAL}.
* A null value disables progress logging. Minimum value
* is 1 second.
- * @param minProgressLoggingInterval time to wait between each logging
- * of crawling progress
- * @return time to wait between each logging of crawling progress
*/
private Duration minProgressLoggingInterval;
@@ -381,13 +238,13 @@ public enum OrphansStrategy {
* current run. In other words, they are leftovers from a previous run
* that were not re-encountered in the current.
*
- * Unless explicitly stated otherwise by an implementing class, the default
- * strategy is to PROCESS orphans.
+ * Unless explicitly stated otherwise by a crawler implementation, the
+ * default strategy is to PROCESS orphans.
* Setting a null value is the same as setting
* IGNORE.
*
* Be careful: Setting the orphan strategy to DELETE
- * is NOT recommended in most cases. With some collectors, a temporary
+ * is NOT recommended in most cases. There are times when a temporary
* failure such as a network outage or a web page timing out, may cause
* some documents not to be crawled. When this happens, unreachable
* documents would be considered "orphans" and be deleted while under
@@ -395,8 +252,6 @@ public enum OrphansStrategy {
* (default), is usually the safest approach to confirm they still
* exist before deleting or updating them.
*
* null.
- * @param metadataChecksummer metadata checksummer
- * @return metadata checksummer or null when disabled
*/
private MetadataChecksummer metadataChecksummer;
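To make the orphan behavior explicit rather than relying on the default, something along these lines could be used; the nested OrphansStrategy location and the setter name are assumptions drawn from the hunk context:

```java
import com.norconex.crawler.core.CrawlerConfig;
import com.norconex.crawler.core.CrawlerConfig.OrphansStrategy;

public class OrphanHandlingSketch {

    static void keepOrphansSafe(CrawlerConfig cfg) {
        // PROCESS (the documented default) re-crawls orphans so their
        // existence is confirmed before they are deleted or updated.
        cfg.setOrphansStrategy(OrphansStrategy.PROCESS);
    }
}
```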
/**
* The Importer module configuration.
- * @param importerConfig Importer module configuration
- * @return Importer module configuration
*/
@JsonProperty("importer")
private ImporterConfig importerConfig = new ImporterConfig();
@@ -433,42 +284,35 @@ public enum OrphansStrategy {
private final List
* To enable, {@link #getMetadataChecksummer()} must not return
* null.
* Not recommended unless you know for sure your metadata
* checksum is acceptably unique.
- * @param metadataDeduplicate true to turn on
- * metadata-based deduplication
- * @return whether to turn on metadata-based deduplication
*/
private boolean metadataDeduplicate;
/**
- * Whether to turn on deduplication based on document checksum.
+ * Whether to turn ON deduplication based on document checksum.
* To enable, {@link #getDocumentChecksummer()} must not return
* null.
* Not recommended unless you know for sure your document
* checksum is acceptably unique.
- * @param documentDeduplicate true to turn on
- * document-based deduplication
- * @return whether to turn on document-based deduplication
*/
private boolean documentDeduplicate;
/**
- * The document checksummer.
- * Document checksum generation is disabled when null.
- * @param documentChecksummer document checksummer
- * @return document checksummer or null when disabled
+ * The document checksummer. Document checksum generation is disabled
+ * when null.
*/
private DocumentChecksummer documentChecksummer =
new Md5DocumentChecksummer();
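A small sketch of checksum-based deduplication as documented above. The Md5DocumentChecksummer package and the setters are assumptions not confirmed by this diff:

```java
import com.norconex.crawler.core.CrawlerConfig;
import com.norconex.crawler.core.doc.operations.checksum.impl.Md5DocumentChecksummer;

public class DeduplicationSketch {

    static void enableDocumentDeduplication(CrawlerConfig cfg) {
        // Checksum the document content (MD5 is the documented default)...
        cfg.setDocumentChecksummer(new Md5DocumentChecksummer());
        // ...and skip documents whose checksum was already seen. Only do
        // this if the checksum is acceptably unique.
        cfg.setDocumentDeduplicate(true);
    }
}
```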
/**
- * The spoiled state strategy resolver.
- * @param spoiledReferenceStrategizer spoiled state strategy resolver
- * @return spoiled state strategy resolver
+ * The spoiled state strategy resolver. A spoiled document is one that
+ * was crawled properly before but on a subsequent crawl, it can no longer
+ * be crawled for whatever reason (not found, bad status, server error,
+ * etc.).
*/
private SpoiledReferenceStrategizer spoiledReferenceStrategizer =
new GenericSpoiledReferenceStrategizer();
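For completeness, a minimal sketch that states the spoiled-reference handling explicitly; only the class and its package appear in this patch, the setter name is assumed:

```java
import com.norconex.crawler.core.CrawlerConfig;
import com.norconex.crawler.core.doc.operations.spoil.impl.GenericSpoiledReferenceStrategizer;

public class SpoiledReferenceSketch {

    static void useDefaultSpoiledHandling(CrawlerConfig cfg) {
        // Maps "spoiled" states (not found, bad status, error) to a
        // strategy such as DELETE or GRACE_ONCE, with a fallback default.
        cfg.setSpoiledReferenceStrategizer(
                new GenericSpoiledReferenceStrategizer());
    }
}
```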
@@ -483,16 +327,12 @@ public enum OrphansStrategy {
/**
* One or more fetchers responsible for obtaining documents and their
* metadata from a source.
- * @param fetchers one or more fetchers
- * @return one or more fetchers
*/
private final List
* Implementors do not need to store the checksum themselves, this abstract
* class does it.
- *
- * Implementors should offer this XML configuration usage:
- *
- * {@nx.xml #usage
- *
is ignored unless the keep
* attribute is set to true
.
diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/checksum/AbstractMetadataChecksummer.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/checksum/AbstractMetadataChecksummer.java
index f7c038c33..18fba1dd9 100644
--- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/checksum/AbstractMetadataChecksummer.java
+++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/checksum/AbstractMetadataChecksummer.java
@@ -34,16 +34,7 @@
*
* Implementors do not need to store the checksum themselves, this abstract * class does it. - *
- * Implementors should offer this XML configuration usage: *
- * - * {@nx.xml #usage - *
* toField
is ignored unless the keep
* attribute is set to true
.
diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilter.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilter.java
index a66bd825e..3c3709e5a 100644
--- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilter.java
+++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilter.java
@@ -39,24 +39,6 @@
* Extensions are typically the last characters of a file name, after the
* last dot.
*
- * The above example will only accept references with the following
- * extensions: .html, .htm, .php, and .asp.
- *
*/
@EqualsAndHashCode
@ToString
diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilterConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilterConfig.java
index bce5e5265..f8a175073 100644
--- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilterConfig.java
+++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/ExtensionReferenceFilterConfig.java
@@ -16,7 +16,6 @@
import java.util.Collections;
import java.util.HashSet;
-import java.util.List;
import java.util.Set;
import com.norconex.commons.lang.collection.CollectionUtil;
@@ -31,24 +30,6 @@
* Extensions are typically the last characters of a file name, after the
* last dot.
*
- *
- * {@nx.xml.usage
- *
- * The above example will only accept references with the following
- * extensions: .html, .htm, .php, and .asp.
- *
*/ @Data @Accessors(chain = true) @@ -63,7 +44,7 @@ public Set- * Used in a web context, the above example filters out Zip documents base - * on a "Content-Type" metadata field. - *
- * */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString public class GenericMetadataFilter implements diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericMetadataFilterConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericMetadataFilterConfig.java index 10d49b2ad..8e1de1ae2 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericMetadataFilterConfig.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericMetadataFilterConfig.java @@ -25,32 +25,7 @@ * Accepts or rejects a reference based on whether one or more * metadata field values are matching. * - * - * {@nx.xml.usage - *- * Used in a web context, the above example filters out Zip documents base - * on a "Content-Type" metadata field. - *
- * */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class GenericMetadataFilterConfig { diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericReferenceFilter.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericReferenceFilter.java index e1de39af0..b829e39ee 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericReferenceFilter.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericReferenceFilter.java @@ -35,27 +35,8 @@ ** Filters URL based on a matching expression. *
- * - * {@nx.xml.usage - *- * The above will reject documents having "/login/" in their reference. - *
* @see Pattern */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString public class GenericReferenceFilter implements diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericReferenceFilterConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericReferenceFilterConfig.java index 9611a5088..4fe018183 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericReferenceFilterConfig.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/filter/impl/GenericReferenceFilterConfig.java @@ -26,27 +26,8 @@ ** Filters URL based on a matching expression. *
- * - * {@nx.xml.usage - *- * The above will reject documents having "/login/" in their reference. - *
* @see Pattern */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class GenericReferenceFilterConfig { diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/spoil/impl/GenericSpoiledReferenceStrategizer.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/spoil/impl/GenericSpoiledReferenceStrategizer.java index d963bf942..690d82368 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/spoil/impl/GenericSpoiledReferenceStrategizer.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/spoil/impl/GenericSpoiledReferenceStrategizer.java @@ -41,28 +41,6 @@ *- * The above example indicates we should ignore (do nothing) errors processing - * documents, and send a deletion request if they are not found or have - * resulted in a bad status. - *
*/ @EqualsAndHashCode @ToString @@ -83,8 +61,8 @@ public SpoiledReferenceStrategy resolveSpoiledReferenceStrategy( strategy = configuration.getFallbackStrategy(); } if (strategy == null) { - strategy = - GenericSpoiledReferenceStrategizerConfig.DEFAULT_FALLBACK_STRATEGY; + strategy = GenericSpoiledReferenceStrategizerConfig + .DEFAULT_FALLBACK_STRATEGY; } return strategy; } diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/spoil/impl/GenericSpoiledReferenceStrategizerConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/spoil/impl/GenericSpoiledReferenceStrategizerConfig.java index f4da05074..0eea56850 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/spoil/impl/GenericSpoiledReferenceStrategizerConfig.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/operations/spoil/impl/GenericSpoiledReferenceStrategizerConfig.java @@ -19,7 +19,6 @@ import com.fasterxml.jackson.annotation.JsonIgnore; import com.norconex.crawler.core.doc.CrawlDocState; -import com.norconex.crawler.core.doc.operations.spoil.SpoiledReferenceStrategizer; import com.norconex.crawler.core.doc.operations.spoil.SpoiledReferenceStrategy; import lombok.Data; @@ -27,43 +26,7 @@ /** *
- * Generic implementation of {@link SpoiledReferenceStrategizer} that
- * offers a simple mapping between the crawl state of references that have
- * turned "bad" and the strategy to adopt for each.
- * Whenever a crawl state does not have a strategy associated, the fall-back
- * strategy is used (default being DELETE
).
- *
- * The mappings defined by default are as follow: - *
- * - *Crawl state | Strategy |
NOT_FOUND | DELETE |
BAD_STATUS | GRACE_ONCE |
ERROR | GRACE_ONCE |
- * The above example indicates we should ignore (do nothing) errors processing - * documents, and send a deletion request if they are not found or have - * resulted in a bad status. + * Configuration for {@link GenericSpoiledReferenceStrategizer}. *
*/ @Data diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/pipelines/queue/QueuePipeline.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/pipelines/queue/QueuePipeline.java index 8e2fcdb52..34edc4f16 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/pipelines/queue/QueuePipeline.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/pipelines/queue/QueuePipeline.java @@ -67,6 +67,7 @@ public boolean isQueueInitialized() { * Otherwise the method returns when initial queuing has completed. * If no queue initializer was provided, returns right away with *true
(initialized).
+ * @param crawler the crawler
* @return queue initialization completion status
*/
public MutableBoolean initializeQueue(Crawler crawler) {
diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/DeleteRejectedEventListener.java b/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/DeleteRejectedEventListener.java
index a49eaa42c..6ec5099c5 100644
--- a/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/DeleteRejectedEventListener.java
+++ b/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/DeleteRejectedEventListener.java
@@ -46,7 +46,7 @@
* REJECTED_
. To avoid performance issues when dealing with
* too many deletion requests, it is recommended you can change this behavior
* to match exactly the events you are interested in with
- * {@link #setEventMatcher(TextMatcher)}.
+ * {@link DeleteRejectedEventListenerConfig#setEventMatcher(TextMatcher)}.
* Keep limiting events to "rejected" ones to avoid unexpected results.
*
*
@@ -68,30 +68,7 @@
* references. Be aware this can cause issues if you are using rules in your
* committer (e.g., to route requests) based on metadata.
*
- *
- * {@nx.xml.usage
- *
- * The above example will send deletion requests whenever a reference is not - * found (e.g., a 404 response from a web server) or if it was filtered out - * by the crawler. - *
- * */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString @Slf4j diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/DeleteRejectedEventListenerConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/DeleteRejectedEventListenerConfig.java index e6af97659..b79da8b0a 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/DeleteRejectedEventListenerConfig.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/DeleteRejectedEventListenerConfig.java @@ -15,71 +15,15 @@ package com.norconex.crawler.core.event.listeners; import com.norconex.commons.lang.text.TextMatcher; -import com.norconex.crawler.core.event.CrawlerEvent; import lombok.Data; import lombok.experimental.Accessors; /** *- * Provides the ability to send deletion requests to your configured - * committer(s) whenever a reference is rejected, regardless whether it was - * encountered in a previous crawling session or not. + * Configuration for {@link DeleteRejectedEventListener}. *
- * - *
- * By default this listener will send deletion requests for all references
- * associated with a {@link CrawlerEvent} name starting with
- * REJECTED_
. To avoid performance issues when dealing with
- * too many deletion requests, it is recommended you can change this behavior
- * to match exactly the events you are interested in with
- * {@link #setEventMatcher(TextMatcher)}.
- * Keep limiting events to "rejected" ones to avoid unexpected results.
- *
- * This class tries to handles each reference for "rejected" events only once. - * To do so it will queue all such references and wait until normal - * crawler completion to send them. Waiting for completion also gives this - * class a chance to listen for deletion requests sent to your committer as - * part of the crawler regular execution (typically on subsequent crawls). - * This helps ensure you do not get duplicate deletion requests for the same - * reference. - *
- * - *- * Since several rejection events are triggered before document are processed, - * we can't assume there is any metadata attached with rejected - * references. Be aware this can cause issues if you are using rules in your - * committer (e.g., to route requests) based on metadata. - *
- *
- * {@nx.xml.usage
- *
- * The above example will send deletion requests whenever a reference is not - * found (e.g., a 404 response from a web server) or if it was filtered out - * by the crawler. - *
- * */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class DeleteRejectedEventListenerConfig { diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/StopCrawlerOnMaxEventListener.java b/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/StopCrawlerOnMaxEventListener.java index 7decc2d8b..8f01b34d4 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/StopCrawlerOnMaxEventListener.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/StopCrawlerOnMaxEventListener.java @@ -77,30 +77,7 @@ * have reached the maximum. * * - * - * {@nx.xml.usage - *- * The above example will stop the crawler when the sum of committed documents - * (upserts + deletions) reaches 100. - *
*/ -@SuppressWarnings("javadoc") @Slf4j @EqualsAndHashCode @ToString diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/StopCrawlerOnMaxEventListenerConfig.java b/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/StopCrawlerOnMaxEventListenerConfig.java index 785838bc4..4d83f4dce 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/StopCrawlerOnMaxEventListenerConfig.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/event/listeners/StopCrawlerOnMaxEventListenerConfig.java @@ -15,80 +15,15 @@ package com.norconex.crawler.core.event.listeners; import com.norconex.commons.lang.text.TextMatcher; -import com.norconex.crawler.core.CrawlerConfig; import lombok.Data; import lombok.experimental.Accessors; /** *- * Alternative to {@link CrawlerConfig#setMaxDocuments(int)} for stopping - * the crawler upon reaching specific event counts. The event counts are only - * kept for a crawling session. They are reset to zero upon restarting - * the crawler. - *
- *- * Not specifying any maximum or events has no effect. - *
- * - *- * The "maxDocuments" option deals with "processed" documents. Those are - * documents that were initially queued for crawling and crawling was attempted - * on them, whether that exercise what successful or not. That is, - * "maxDocuments" will not count documents that were sent to your committer - * for additions or deletions, but also documents that were rejected - * by your Importer configuration, produced errors, etc. - * This class gives you more control over what should trigger a crawler to stop. - *
- *
- * Note that for this class to take effect, make sure that "maxDocuments" has
- * a high enough number or is set -1
(unlimited).
- *
- * If your event matcher matches more than one event, you can decide what - * should be the expected behavior. Options are: - *
- *- * The above example will stop the crawler when the sum of committed documents - * (upserts + deletions) reaches 100. + * Configuration for {@link StopCrawlerOnMaxEventListener}. *
*/ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class StopCrawlerOnMaxEventListenerConfig { diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/fetch/AbstractFetcher.java b/crawler/core/src/main/java/com/norconex/crawler/core/fetch/AbstractFetcher.java index abaa6121c..81228e306 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/fetch/AbstractFetcher.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/fetch/AbstractFetcher.java @@ -19,7 +19,6 @@ import com.norconex.commons.lang.config.Configurable; import com.norconex.commons.lang.event.Event; import com.norconex.commons.lang.event.EventListener; -import com.norconex.commons.lang.xml.XmlConfigurable; import com.norconex.crawler.core.Crawler; import com.norconex.crawler.core.doc.operations.filter.FilterGroupResolver; import com.norconex.crawler.core.doc.operations.filter.ReferenceFilter; @@ -43,33 +42,6 @@ * It also offers methods to overwrite in order to react to crawler * startup and shutdown events. * - *- * This XML snippet is an example of filter that restricts the application of - * this Fetcher to references ending with ".pdf". - *
- * - * {@nx.xml.example - *- * This XML snippet is an example of filter that restricts the application of - * this Fetcher to references ending with ".pdf". - *
- * - * {@nx.xml.example - *true
if there were stores to clean
+ */
boolean clean();
+ /**
+ * Closes the data store. Called once after crawling completes.
+ */
@Override
void close();
+ /**
+ * Opens (and creates as needed) a store of the given name to store
+ * and retrieve objects of the given type.
+ * @param true
if there was a store to delete
+ */
boolean dropStore(String name);
- // returns true if target was deleted
+ /**
+ * Rename a data store.
+ * @param dataStore the data store to rename
+ * @param newName the new name
+ * @return true
if a store already exist with the new name and
+ * had to first be deleted
+ */
boolean renameStore(DataStore> dataStore, String newName);
+ /**
+ * Gets the name of all stores in the engine.
+ * @return a set of stores
+ */
@JsonIgnore
Setdatasource
properties as shown below.
+ * are passed as-is, via
+ * {@link JdbcDataStoreEngineConfig#setProperties(java.util.Properties)}.
*
* * This class only use a few data types to store its data in a generic way. * It will try to detect what data type to use for your database. If you * get errors related to field data types not being supported, you have - * the option to redefined them. - *
- * - * {@nx.xml.usage - *- * The above example contains basic settings for creating a MySQL data source. + * the option to provide your own SQL statements for creating tables + * and doing "upsert" requests. Refer to {@link JdbcDialect} source code + * for SQL examples. *
*/ @Slf4j @@ -122,7 +85,7 @@ public class JdbcDataStoreEngine // table id field is store name private JdbcDataStore- * Data store engine using a JDBC-compatible database for storing - * crawl data. - *
- *- * To use this data store engine, you need its JDBC database driver - * on the classpath. - *
- *
- * This JDBC data store engine uses
- * Hikari as the JDBC
- * datasource implementation, which provides efficient connection-pooling.
- * Refer to
- *
- * Hikari's documentation for all configuration options. The Hikari options
- * are passed as-is, via datasource
properties as shown below.
- *
- * This class only use a few data types to store its data in a generic way. - * It will try to detect what data type to use for your database. If you - * get errors related to field data types not being supported, you have - * the option to redefined them. - *
- * - * {@nx.xml.usage - *- * The above example contains basic settings for creating a MySQL data source. + * Configuration for {@link JdbcDataStoreEngine}. *
*/ @Data @Accessors(chain = true) public class JdbcDataStoreEngineConfig { + /** + * Optional prefix used for table creation. Default is the crawler id, + * followed by an underscore character. + * The value is first modified to convert spaces to underscores, and + * to strip unsupported characters. The supported + * characters are: alphanumeric, period, and underscore. + */ private String tablePrefix; + + /** + * Explicitly set the JDBC dialect to use when generating SQL. + * By default, an attempt is made to detect it automatically. + * Not really useful if you provide your own SQLs for table creation + * and upserts. + */ private JdbcDialect dialect; + + /** + * For advanced use. Optional SQL to create new tables. + * Refer to {@link JdbcDialect} source code for examples. + */ private String createTableSql; + /** + * For advanced use. Optional SQL to create upsert SQLs. + * Refer to {@link JdbcDialect} source code for examples. + */ private String upsertSql; + /** + * Connection properties as per + * + * Hikari's documentation. At a minimum, you need to provide + * either a "dataSourceClassName" or a "jdbcUrl". + */ private Properties properties = new Properties(); - // private String varcharType; - // private String timestampType; - // private String textType; } diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mongodb/MongoDataStoreEngine.java b/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mongodb/MongoDataStoreEngine.java index ecf8538e4..9d80dac50 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mongodb/MongoDataStoreEngine.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/store/impl/mongodb/MongoDataStoreEngine.java @@ -51,9 +51,6 @@ ** Data store engine using MongoDB for storing crawl data. *
- * {@nx.xml.usage- * Data store engine using MongoDB for storing crawl data. + * Configuration for {@link MongoDataStoreEngine}. *
- * - * {@nx.xml.usage+ * Embedded data store relying on MvStore. + *
+ * @since 1.10.0 + * @author Pascal Essiembre + */ @EqualsAndHashCode @ToString @Slf4j @@ -114,13 +121,12 @@ public void init(Crawler crawler) { try { mvstore = builder.open(); } catch (MVStoreException e) { - LOG.warn( - """ - An exception occurred while trying to open the store engine.\s\ - This could happen due to an abnormal shutdown on a previous\s\ - execution of the crawler. An attempt will be made to recover.\s\ - It is advised to back-up the store engine if you want to\s\ - preserve the crawl history.""", + LOG.warn(""" + An exception occurred while trying to open the store engine.\s\ + This could happen due to an abnormal shutdown on a previous\s\ + execution of the crawler. An attempt will be made to recover.\s\ + It is advised to back-up the store engine if you want to\s\ + preserve the crawl history.""", e); builder.recoveryMode(); mvstore = builder.open(); @@ -169,8 +175,8 @@ public synchronized void close() { if (mvstore != null && !mvstore.isClosed()) { LOG.info("Compacting data store..."); mvstore.commit(); - //TODO method dropped from MVStore. Any replacemetn? - //mvstore.compactMoveChunks(); + //NOTE: this method was dropped from MVStore. Any replacement? + // mvstore.compactMoveChunks(); mvstore.close(); } mvstore = null; @@ -224,33 +230,4 @@ public Set-* MVStore configuration parameters. For advanced use only. -* Differences from MVStore defaults: -* All data size values are expected to be set in bytes. -* Light compression is enabled by default (compress = 1) -*
-*-* For more info: -*
-*+ * Configuration for {@link MvStoreDataStoreEngine}. + *
+ *+ * Changing default values is for advanced use only. Differences from MVStore + * defaults: + *
+ *+ * For more info: + *
+ *true
if only using memory (data is not persisted)
*/
- @SuppressWarnings("javadoc")
private boolean ephemeral;
}
\ No newline at end of file
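Pulling the data store documentation together, a hedged sketch of switching from the default MVStore engine to the JDBC one. setProperties(Properties) and the "jdbcUrl" Hikari property come from the javadoc in this patch; getConfiguration(), the CrawlerConfig setter, and the connection values are assumptions or placeholders:

```java
import java.util.Properties;

import com.norconex.crawler.core.CrawlerConfig;
import com.norconex.crawler.core.store.impl.jdbc.JdbcDataStoreEngine;

public class JdbcStoreSketch {

    static void useJdbcStore(CrawlerConfig cfg) {
        var props = new Properties();
        // Hikari connection settings, passed through as-is.
        props.setProperty("jdbcUrl", "jdbc:mysql://localhost:3306/crawler");
        props.setProperty("username", "crawler");
        props.setProperty("password", "changeme");

        var engine = new JdbcDataStoreEngine();
        engine.getConfiguration().setProperties(props);
        cfg.setDataStoreEngine(engine);
    }
}
```

Left unset, the crawler keeps the embedded MvStoreDataStoreEngine, which needs no external database.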
diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/util/About.java b/crawler/core/src/main/java/com/norconex/crawler/core/util/About.java
index de4b4979a..b01a4a1e3 100644
--- a/crawler/core/src/main/java/com/norconex/crawler/core/util/About.java
+++ b/crawler/core/src/main/java/com/norconex/crawler/core/util/About.java
@@ -28,33 +28,31 @@
import com.norconex.committer.core.Committer;
import com.norconex.commons.lang.PackageManifest;
-import com.norconex.crawler.core.Crawler;
import com.norconex.crawler.core.CrawlerConfig;
+import lombok.NonNull;
+
public final class About {
/** Simple ASCI art of Norconex. */
- public static final String NORCONEX_ASCII =
- """
- _ _ ___ ____ ____ ___ _ _ _______ __
- | \\ | |/ _ \\| _ \\ / ___/ _ \\| \\ | | ____\\ \\/ /
- | \\| | | | | |_) | | | | | | \\| | _| \\ /\s
- | |\\ | |_| | _ <| |__| |_| | |\\ | |___ / \\\s
- |_| \\_|\\___/|_| \\_\\\\____\\___/|_| \\_|_____/_/\\_\\
+ public static final String NORCONEX_ASCII = """
+ _ _ ___ ____ ____ ___ _ _ _______ __
+ | \\ | |/ _ \\| _ \\ / ___/ _ \\| \\ | | ____\\ \\/ /
+ | \\| | | | | |_) | | | | | | \\| | _| \\ /\s
+ | |\\ | |_| | _ <| |__| |_| | |\\ | |___ / \\\s
+ |_| \\_|\\___/|_| \\_\\\\____\\___/|_| \\_|_____/_/\\_\\
- ================ C R A W L E R ================
- """;
+ ================ C R A W L E R ================
+ """;
private About() {
}
- public static String about(CrawlerConfig config) {
+ public static String about(@NonNull CrawlerConfig config) {
try (var sw = new StringWriter(); var w = new PrintWriter(sw, true)) {
w.println(NORCONEX_ASCII);
- // version
- // w.println("Version: " + releaseVersion(Crawler.class)); //TODO pass class from crawler impl so we have the name ? (web vs file)
- w.println("Version:\n " + releaseVersion(Crawler.class)); //TODO pass class from crawler impl so we have the name ? (web vs file)
+ w.println("Version:\n " + releaseVersion(config.getClass()));
// committer
var committerClasses = configuredCommitters(config);
@@ -67,16 +65,6 @@ public static String about(CrawlerConfig config) {
w.println("