Feature/cu 86888fmx3/crawler stability #1057

Merged · 9 commits · Sep 6, 2024
65 changes: 29 additions & 36 deletions crawler/core/src/main/java/com/norconex/crawler/core/Crawler.java
@@ -96,7 +96,7 @@ public class Crawler {
             ? extends FetchRequest, ? extends FetchResponse> fetcher;
     private final Class<? extends CrawlDocContext> docContextType;
     private CrawlerState state;
-    // TODO remove stopper listener when we are fully using a table?
+    // TODO remove stopper listener when we are fully using an accessible store?
     private CrawlerStopper stopper = new FileBasedStopper();

     // --- Set in init ---
@@ -127,8 +127,8 @@ public class Crawler {
                         doc -> new UpsertRequest(
                                 doc.getReference(),
                                 doc.getMetadata(),
-                                doc.getInputStream())) // Closed by
-                                                       // caller
+                                // InputStream closed by caller
+                                doc.getInputStream()))
                 .deleteRequestBuilder(
                         doc -> new DeleteRequest(
                                 doc.getReference(),
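The relocated comment clarifies the ownership contract: the stream handed to `UpsertRequest` is closed by the caller, not by the request or the committer. A minimal sketch of that contract, assuming the `UpsertRequest(reference, metadata, inputStream)` constructor shown in the diff and the Norconex `Properties` class (the class and document values here are illustrative):

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import com.norconex.committer.core.UpsertRequest;
import com.norconex.commons.lang.map.Properties;

public class StreamOwnershipSketch {
    public static void main(String[] args) throws IOException {
        Properties meta = new Properties();
        meta.set("title", "Sample document");

        // The caller opens the stream, hands it to the request, and is
        // responsible for closing it once the request has been consumed.
        try (InputStream content = new ByteArrayInputStream(
                "body text".getBytes(StandardCharsets.UTF_8))) {
            UpsertRequest req = new UpsertRequest(
                    "https://example.com/doc", meta, content);
            // committer.upsert(req); // hypothetical hand-off to a committer
        } // <- stream closed here, by the caller, as the comment states
    }
}
```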
@@ -194,12 +194,11 @@ public void fire(String eventName) {
     }

     public void fire(String eventName, Object subject) {
-        fire(
-                CrawlerEvent.builder()
-                        .name(eventName)
-                        .source(this)
-                        .subject(subject)
-                        .build());
+        fire(CrawlerEvent.builder()
+                .name(eventName)
+                .source(this)
+                .subject(subject)
+                .build());
     }

     @Override
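For reference, the convenience overload is equivalent to building the event inline, as the condensed body shows. A minimal usage sketch (the `crawler` and `document` variables and the event name are illustrative, not names defined by this PR):

```java
// Two equivalent ways to fire an event carrying a subject:
crawler.fire("MY_CUSTOM_EVENT", document);   // the convenience overload above

crawler.fire(CrawlerEvent.builder()          // what the overload expands to
        .name("MY_CUSTOM_EVENT")
        .source(crawler)
        .subject(document)
        .build());
```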
@@ -224,46 +223,40 @@ && getState().isExecutionLocked()) {
             getState().setStopping(true);
             LOG.info("Stopping the crawler.");
         } else {
-            LOG.info(
-                    "CANNOT STOP: the targeted crawler does not appear "
-                            + "to be running on this host.");
+            LOG.info("CANNOT STOP: the targeted crawler does not appear "
+                    + "to be running on this host.");
         }
     }

     public void exportDataStore(Path exportDir) {
-        executeCommand(
-                new CommandExecution(this, "STORE_EXPORT")
-                        .failableCommand(
-                                () -> DataStoreExporter.exportDataStore(
-                                        this,
-                                        exportDir))
-                        .lock(true)
-                        .logIntro(true));
+        executeCommand(new CommandExecution(this, "STORE_EXPORT")
+                .failableCommand(() -> DataStoreExporter.exportDataStore(
+                        this,
+                        exportDir))
+                .lock(true)
+                .logIntro(true));
     }

     public void importDataStore(Path file) {
-        executeCommand(
-                new CommandExecution(this, "STORE_IMPORT")
-                        .failableCommand(
-                                () -> DataStoreImporter
-                                        .importDataStore(this, file))
-                        .lock(true)
-                        .logIntro(true));
+        executeCommand(new CommandExecution(this, "STORE_IMPORT")
+                .failableCommand(
+                        () -> DataStoreImporter.importDataStore(this, file))
+                .lock(true)
+                .logIntro(true));
     }

     /**
      * Cleans the crawler cache information, leading to the next run being as if
      * the crawler was run for the first time.
      */
     public void clean() {
-        executeCommand(
-                new CommandExecution(this, "CLEAN")
-                        .failableCommand(() -> {
-                            getServices().getCommitterService().clean();
-                            dataStoreEngine.clean();
-                            FileUtils.deleteDirectory(getWorkDir().toFile());
-                        })
-                        .lock(true)
-                        .logIntro(true));
+        executeCommand(new CommandExecution(this, "CLEAN")
+                .failableCommand(() -> {
+                    getServices().getCommitterService().clean();
+                    dataStoreEngine.clean();
+                    FileUtils.deleteDirectory(getWorkDir().toFile());
+                })
+                .lock(true)
+                .logIntro(true));
     }
 }
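All three commands funnel through the same `executeCommand` wrapper, acquiring the execution lock and logging an intro banner before running. A caller-side sketch of the three operations (the paths are illustrative, not a format this PR defines):

```java
import java.nio.file.Path;

// Given an initialized Crawler instance:
crawler.exportDataStore(Path.of("backups"));            // writes a store export under ./backups
crawler.importDataStore(Path.of("backups/export-file")); // loads a previously exported store file
crawler.clean(); // clears committer state and the data store, then deletes the work directory
```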
crawler/core/src/main/java/com/norconex/crawler/core/CrawlerBuilder.java
@@ -29,13 +29,11 @@
 import lombok.Setter;
 import lombok.experimental.Accessors;

-//TODO move to its own class?
 //TODO document the optional ones and their default values
 @Accessors(fluent = true)
 @Setter
 @Getter
 @NonNull
-@SuppressWarnings("javadoc")
 public class CrawlerBuilder {
     private CrawlerConfig configuration = new CrawlerConfig();
     private DocPipelines docPipelines;
@@ -47,18 +45,13 @@ public class CrawlerBuilder {
     /**
      * The exact type of {@link CrawlDocContext} if your crawler is subclassing
      * it. Defaults to {@link CrawlDocContext} class.
-     * @param docContextType crawl doc brief class
-     * @return doc brief class
      */
     private Class<? extends CrawlDocContext> docContextType =
             CrawlDocContext.class;

     /**
      * Provides a required fetcher implementation, responsible for obtaining
      * resources being crawled.
-     *
-     * @param fetcherProvider fetcher provider function
-     * @return a function returning a fetcher to associate with a given crawler.
      */
     private Function<Crawler, ? extends Fetcher<? extends FetchRequest,
             ? extends FetchResponse>> fetcherProvider;
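With the `@param`/`@return` tags trimmed, the remaining Javadoc still documents both fields, and usage is unchanged: `@Accessors(fluent = true)` plus `@Setter` generate chainable field-named setters. A hedged configuration sketch (`MyFetcher` is a hypothetical `Fetcher` implementation, not part of this PR):

```java
CrawlerBuilder builder = new CrawlerBuilder()
        // Optional: defaults to CrawlDocContext.class; override only when
        // the crawler subclasses the doc context.
        .docContextType(CrawlDocContext.class)
        // Required: a function returning the fetcher for a given crawler.
        .fetcherProvider(crawler -> new MyFetcher());
```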
crawler/core/src/main/java/com/norconex/crawler/core/CrawlerCallbacks.java
@@ -23,7 +23,6 @@
 import lombok.Builder;
 import lombok.Getter;

-@SuppressWarnings("javadoc")
 @Builder
 @Getter
 public class CrawlerCallbacks {
@@ -35,9 +34,6 @@ public class CrawlerCallbacks {
      * This method is different than the {@link #initCrawler()} method,
      * which is invoked for any type of action, whereas this one is only
      * invoked before an effective request for crawling.
-     * @param beforeCrawlerExecution bi-consumer accepting a crawler and
-     * a "resume" indicator.
-     * @return bi-consumer accepting a crawler and a "resume" indicator
      */
     Consumer<Crawler> beforeCrawlerExecution;

@@ -48,17 +44,15 @@ public class CrawlerCallbacks {
      * Invoked right after {@link CrawlerEvent#CRAWLER_STOP_END} or
      * {@link CrawlerEvent#CRAWLER_RUN_END} (depending which of the two is
      * triggered).
-     * @param afterCrawlerExecution consumer accepting a crawler
-     * @return consumer accepting a crawler
      */
     Consumer<Crawler> afterCrawlerExecution;

-    //TODO are those used? Should they be?
+    //MAYBE: are those used? Should they be?
     // Add those that are missing to ReferencesProcessor
     BiConsumer<Crawler, CrawlDoc> beforeDocumentProcessing;
     BiConsumer<Crawler, CrawlDoc> afterDocumentProcessing;

-    // need those, or we can replace beforeDocumentFinalizing
+    //MAYBE: need those, or we can replace beforeDocumentFinalizing
     // (the only one used) with after processing?
     BiConsumer<Crawler, CrawlDoc> beforeDocumentFinalizing;
     BiConsumer<Crawler, CrawlDoc> afterDocumentFinalizing;
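Since `@Builder` remains on the class, callbacks are still supplied through the Lombok-generated builder. A minimal sketch of wiring the two documented execution callbacks (the handler bodies are illustrative):

```java
CrawlerCallbacks callbacks = CrawlerCallbacks.builder()
        // Invoked just before an effective crawl request.
        .beforeCrawlerExecution(
                crawler -> System.out.println("Starting crawl session..."))
        // Invoked right after CRAWLER_STOP_END or CRAWLER_RUN_END.
        .afterCrawlerExecution(
                crawler -> System.out.println("Crawl session ended."))
        .build();
```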