Merge pull request #1057 from Norconex/feature/CU-86888fmx3/crawler-stability

Feature/cu 86888fmx3/crawler stability
essiembre authored Sep 6, 2024
2 parents 1769e86 + c6e9fb8 commit 31ba4b2
Showing 41 changed files with 255 additions and 1,027 deletions.
65 changes: 29 additions & 36 deletions crawler/core/src/main/java/com/norconex/crawler/core/Crawler.java
@@ -96,7 +96,7 @@ public class Crawler {
? extends FetchRequest, ? extends FetchResponse> fetcher;
private final Class<? extends CrawlDocContext> docContextType;
private CrawlerState state;
- // TODO remove stopper listener when we are fully using a table?
+ // TODO remove stopper listener when we are fully using an accessible store?
private CrawlerStopper stopper = new FileBasedStopper();

// --- Set in init ---
@@ -127,8 +127,8 @@ public class Crawler {
doc -> new UpsertRequest(
doc.getReference(),
doc.getMetadata(),
- doc.getInputStream())) // Closed by
-                        // caller
+ // InputStream closed by caller
+ doc.getInputStream()))
.deleteRequestBuilder(
doc -> new DeleteRequest(
doc.getReference(),
@@ -194,12 +194,11 @@ public void fire(String eventName) {
}

public void fire(String eventName, Object subject) {
- fire(
-     CrawlerEvent.builder()
-         .name(eventName)
-         .source(this)
-         .subject(subject)
-         .build());
+ fire(CrawlerEvent.builder()
+     .name(eventName)
+     .source(this)
+     .subject(subject)
+     .build());
}
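
Both fire overloads end up on the same path: the overload above simply wraps its arguments in a CrawlerEvent before delegating. A minimal call-site sketch, assuming a Crawler instance named "crawler" and a placeholder payload object (both hypothetical names):

    // Equivalent to crawler.fire("MY_EVENT", payload), spelled out by hand.
    // "crawler" and "payload" are hypothetical names for illustration.
    crawler.fire(CrawlerEvent.builder()
            .name("MY_EVENT")   // free-form event name
            .source(crawler)    // the crawler emitting the event
            .subject(payload)   // optional object the event is about
            .build());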

@Override
@@ -224,46 +223,40 @@ && getState().isExecutionLocked()) {
getState().setStopping(true);
LOG.info("Stopping the crawler.");
} else {
- LOG.info(
-     "CANNOT STOP: the targetted crawler does not appear "
-         + "to be running on on this host.");
+ LOG.info("CANNOT STOP: the targetted crawler does not appear "
+     + "to be running on on this host.");
}
}
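
As the messages above indicate, stop() only takes effect when the targeted crawler is running on the current host (i.e., this process holds the execution lock); otherwise the request is logged and ignored. A hedged sketch, with "crawler" as a placeholder instance:

    // Ask a running crawler to stop gracefully. A no-op (with a log message)
    // if this host does not hold the crawler's execution lock.
    crawler.stop();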

public void exportDataStore(Path exportDir) {
- executeCommand(
-     new CommandExecution(this, "STORE_EXPORT")
-         .failableCommand(
-             () -> DataStoreExporter.exportDataStore(
-                 this,
-                 exportDir))
-         .lock(true)
-         .logIntro(true));
+ executeCommand(new CommandExecution(this, "STORE_EXPORT")
+     .failableCommand(() -> DataStoreExporter.exportDataStore(
+         this,
+         exportDir))
+     .lock(true)
+     .logIntro(true));
}

public void importDataStore(Path file) {
- executeCommand(
-     new CommandExecution(this, "STORE_IMPORT")
-         .failableCommand(
-             () -> DataStoreImporter
-                 .importDataStore(this, file))
-         .lock(true)
-         .logIntro(true));
+ executeCommand(new CommandExecution(this, "STORE_IMPORT")
+     .failableCommand(
+         () -> DataStoreImporter.importDataStore(this, file))
+     .lock(true)
+     .logIntro(true));
}
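
Both methods wrap the store operation in a CommandExecution that takes the crawler's execution lock, so they cannot overlap a running crawl. A usage sketch, assuming a Crawler instance named "crawler"; the directory and file name are placeholders, not the library's documented output layout:

    // Export the crawler's data store to a directory, then import it back later.
    // Paths below are hypothetical.
    Path exportDir = Path.of("/tmp/crawl-store-backup");
    crawler.exportDataStore(exportDir);
    // ... later, on this or an equivalently configured crawler:
    crawler.importDataStore(exportDir.resolve("store-export.zip"));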

/**
* Cleans the crawler cache information, leading to the next run being as if
* the crawler was run for the first time.
*/
public void clean() {
- executeCommand(
-     new CommandExecution(this, "CLEAN")
-         .failableCommand(() -> {
-             getServices().getCommitterService().clean();
-             dataStoreEngine.clean();
-             FileUtils.deleteDirectory(getWorkDir().toFile());
-         })
-         .lock(true)
-         .logIntro(true));
+ executeCommand(new CommandExecution(this, "CLEAN")
+     .failableCommand(() -> {
+         getServices().getCommitterService().clean();
+         dataStoreEngine.clean();
+         FileUtils.deleteDirectory(getWorkDir().toFile());
+     })
+     .lock(true)
+     .logIntro(true));
}
}
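
Per the Javadoc above, clean() clears committer state, wipes the data store engine, and deletes the working directory, so the next run behaves like a first run. A minimal sketch, with "crawler" as a placeholder instance:

    // Reset all cached crawl state; the next start re-crawls everything
    // as if the crawler had never run.
    crawler.clean();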
@@ -29,13 +29,11 @@
import lombok.Setter;
import lombok.experimental.Accessors;

- //TODO move to its own class?
//TODO document the optional ones and their default values
@Accessors(fluent = true)
@Setter
@Getter
@NonNull
- @SuppressWarnings("javadoc")
public class CrawlerBuilder {
private CrawlerConfig configuration = new CrawlerConfig();
private DocPipelines docPipelines;
@@ -47,18 +45,13 @@ public class CrawlerBuilder {
/**
* The exact type of {@link CrawlDocContext} if your crawler is subclassing
* it. Defaults to {@link CrawlDocContext} class.
- * @param docContextType crawl doc brief class
- * @return doc brief class
*/
private Class<? extends CrawlDocContext> docContextType =
CrawlDocContext.class;

/**
* Provides a required fetcher implementation, responsible for obtaining
* resources being crawled.
- *
- * @param fetcherProvider fetcher provider function
- * @return a function returning a fetcher to associate with a given crawler.
*/
private Function<Crawler, ? extends Fetcher<? extends FetchRequest,
? extends FetchResponse>> fetcherProvider;
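
Tying the builder fields together: @Accessors(fluent = true) plus @Setter generate chainable fluent setters named after each field. A hedged construction sketch; MyFetcher and direct instantiation of CrawlerBuilder are assumptions for illustration, not necessarily the library's intended entry point:

    // Hypothetical wiring; MyFetcher stands in for a real Fetcher implementation.
    CrawlerBuilder builder = new CrawlerBuilder()
            .configuration(new CrawlerConfig())    // optional; defaults to a new CrawlerConfig
            .docContextType(CrawlDocContext.class) // optional; already the default
            .fetcherProvider(crawler -> new MyFetcher()); // required fetcher supplier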
@@ -23,7 +23,6 @@
import lombok.Builder;
import lombok.Getter;

- @SuppressWarnings("javadoc")
@Builder
@Getter
public class CrawlerCallbacks {
@@ -35,9 +34,6 @@ public class CrawlerCallbacks {
* This method is different than the {@link #initCrawler()} method,
* which is invoked for any type of actions where as this one is only
* invoked before an effective request for crawling.
- * @param beforeCrawlerExecution bi-consumer accepting a crawler and
- *     a "resume" indicator.
- * @return bi-consumer accepting a crawler and a "resume" indicator
*/
Consumer<Crawler> beforeCrawlerExecution;

@@ -48,17 +44,15 @@
* Invoked right after {@link CrawlerEvent#CRAWLER_STOP_END} or
* {@link CrawlerEvent#CRAWLER_RUN_END} (depending which of the two is
* triggered).
- * @param afterCrawlerExecution consumer accepting a crawler
- * @return consumer accepting a crawler
*/
Consumer<Crawler> afterCrawlerExecution;

- //TODO are those used? Should they be?
+ //MAYBE: are those used? Should they be?
// Add those that are missing to ReferencesProcessor
BiConsumer<Crawler, CrawlDoc> beforeDocumentProcessing;
BiConsumer<Crawler, CrawlDoc> afterDocumentProcessing;

- // need those, or we can replace beforeDocumentFinalizing
+ //MAYBE: need those, or we can replace beforeDocumentFinalizing
// (the only one used) with after processing?
BiConsumer<Crawler, CrawlDoc> beforeDocumentFinalizing;
BiConsumer<Crawler, CrawlDoc> afterDocumentFinalizing;
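
Because CrawlerCallbacks is a lombok @Builder class, the hooks above are typically supplied as lambdas. A hedged sketch; the lambda bodies are placeholders, and only the builder and field names come from the code shown:

    // Hypothetical callback wiring via the lombok-generated builder.
    CrawlerCallbacks callbacks = CrawlerCallbacks.builder()
            // invoked once per crawl request, just before crawling effectively starts
            .beforeCrawlerExecution(crawler -> System.out.println("Crawl starting"))
            // invoked right after CRAWLER_STOP_END or CRAWLER_RUN_END
            .afterCrawlerExecution(crawler -> System.out.println("Crawl ended"))
            // per-document hook (see the MAYBE notes above on its status)
            .beforeDocumentFinalizing(
                    (crawler, doc) -> System.out.println("Finalizing " + doc.getReference()))
            .build();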