Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Configurable database names #426

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -88,16 +88,28 @@ public class CrawlController {

/**
 * Creates a controller with a default parser and no TLD list, using the
 * default database names for the docid, pending-URL, and in-process stores.
 *
 * @param config          the crawl configuration (validated by the delegate)
 * @param pageFetcher     the fetcher used to download pages
 * @param robotstxtServer the robots.txt policy server
 * @throws Exception if the delegated constructor fails, e.g. on an
 *         invalid configuration or a storage-folder error
 */
public CrawlController(CrawlConfig config, PageFetcher pageFetcher,
        RobotstxtServer robotstxtServer) throws Exception {
    // null db names select the built-in default database names.
    this(config, pageFetcher, null, robotstxtServer, null, null, null, null);
}

/**
 * Creates a controller with a default parser and an explicit TLD list,
 * using the default database names for all underlying stores.
 *
 * @param config          the crawl configuration (validated by the delegate)
 * @param pageFetcher     the fetcher used to download pages
 * @param robotstxtServer the robots.txt policy server
 * @param tldList         top-level-domain list used by URL canonicalization
 * @throws Exception if the delegated constructor fails
 */
public CrawlController(CrawlConfig config, PageFetcher pageFetcher,
        RobotstxtServer robotstxtServer, TLDList tldList) throws Exception {
    // null db names select the built-in default database names.
    this(config, pageFetcher, null, robotstxtServer, tldList, null, null, null);
}

/**
 * Creates a controller with an explicit parser and TLD list, using the
 * default database names for all underlying stores.
 *
 * @param config          the crawl configuration (validated by the delegate)
 * @param pageFetcher     the fetcher used to download pages
 * @param parser          the page parser; may be null for the default parser
 * @param robotstxtServer the robots.txt policy server
 * @param tldList         top-level-domain list used by URL canonicalization
 * @throws Exception if the delegated constructor fails
 */
public CrawlController(CrawlConfig config, PageFetcher pageFetcher, Parser parser,
        RobotstxtServer robotstxtServer, TLDList tldList) throws Exception {
    // null db names select the built-in default database names.
    this(config, pageFetcher, parser, robotstxtServer, tldList, null, null, null);
}

/**
 * Creates a controller with configurable names for the docid and
 * pending-URL databases; the in-process-pages database keeps its default
 * name.
 *
 * @param config          the crawl configuration (validated by the delegate)
 * @param pageFetcher     the fetcher used to download pages
 * @param parser          the page parser; may be null for the default parser
 * @param robotstxtServer the robots.txt policy server
 * @param tldList         top-level-domain list used by URL canonicalization
 * @param docIdDbName     name for the docid database, or null for the default
 * @param pendingDbName   name for the pending-URL database, or null for the default
 * @throws Exception if the delegated constructor fails
 */
public CrawlController(CrawlConfig config, PageFetcher pageFetcher, Parser parser,
        RobotstxtServer robotstxtServer, TLDList tldList,
        String docIdDbName, String pendingDbName) throws Exception {
    // BUG FIX: the previous delegation passed (null, null, null), silently
    // discarding both docIdDbName and pendingDbName. Forward them so the
    // caller-supplied names actually take effect.
    this(config, pageFetcher, parser, robotstxtServer, tldList,
            docIdDbName, pendingDbName, null);
}

public CrawlController(CrawlConfig config, PageFetcher pageFetcher, Parser parser,
RobotstxtServer robotstxtServer, TLDList tldList,
String docIdDbName, String pendingDbName, String inProcessDbName) throws Exception {
config.validate();
this.config = config;

Expand Down Expand Up @@ -140,8 +152,8 @@ public CrawlController(CrawlConfig config, PageFetcher pageFetcher, Parser parse
}

env = new Environment(envHome, envConfig);
docIdServer = new DocIDServer(env, config);
frontier = new Frontier(env, config);
docIdServer = new DocIDServer(env, config, docIdDbName);
frontier = new Frontier(env, config, pendingDbName, inProcessDbName);

this.pageFetcher = pageFetcher;
this.parser = parser == null ? new Parser(config, tldList) : parser;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,18 @@ public class DocIDServer {
private CrawlConfig config;
private int lastDocID;

public DocIDServer(Environment env, CrawlConfig config) {
public DocIDServer(Environment env, CrawlConfig config, String dbName) {
this.config = config;
DatabaseConfig dbConfig = new DatabaseConfig();
dbConfig.setAllowCreate(true);
dbConfig.setTransactional(config.isResumableCrawling());
dbConfig.setDeferredWrite(!config.isResumableCrawling());
lastDocID = 0;
docIDsDB = env.openDatabase(null, DATABASE_NAME, dbConfig);
if (dbName == null) {
docIDsDB = env.openDatabase(null, DATABASE_NAME, dbConfig);
} else {
docIDsDB = env.openDatabase(null, dbName, dbConfig);
}
if (config.isResumableCrawling()) {
int docCount = getDocCount();
if (docCount > 0) {
Expand All @@ -62,6 +66,10 @@ public DocIDServer(Environment env, CrawlConfig config) {
}
}

/**
 * Backwards-compatible constructor that opens the docid database under its
 * default name, by delegating with a null database name.
 *
 * @param env    the Berkeley DB environment the database is opened in
 * @param config the crawl configuration (controls transactional/deferred-write
 *               behavior and resumable-crawling setup in the delegate)
 */
public DocIDServer(Environment env, CrawlConfig config) {
    this(env, config, null);
}

/**
* Returns the docid of an already seen url.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,29 @@ public class Frontier {
protected Counters counters;

/**
 * Backwards-compatible constructor that uses the default database name for
 * the pending-URL work queue (and, transitively, the default name for the
 * in-process-pages database).
 *
 * @param env    the Berkeley DB environment the queues are opened in
 * @param config the crawl configuration
 */
public Frontier(Environment env, CrawlConfig config) {
    this(env, config, null);
}

/**
 * Creates a frontier whose pending-URL work queue uses the given database
 * name, while the in-process-pages database keeps its default name.
 *
 * @param env    the Berkeley DB environment the queues are opened in
 * @param config the crawl configuration
 * @param dbName name for the pending-URL database, or null for the default
 */
public Frontier(Environment env, CrawlConfig config, String dbName) {
    this(env, config, dbName, null);
}

public Frontier(Environment env, CrawlConfig config, String dbName, String inProcessDbName) {
this.config = config;
this.counters = new Counters(env, config);
try {
workQueues = new WorkQueues(env, DATABASE_NAME, config.isResumableCrawling());
if (dbName == null) {
workQueues = new WorkQueues(env, DATABASE_NAME, config.isResumableCrawling());
} else {
workQueues = new WorkQueues(env, dbName, config.isResumableCrawling());
}
if (config.isResumableCrawling()) {
scheduledPages = counters.getValue(Counters.ReservedCounterNames.SCHEDULED_PAGES);
inProcessPages = new InProcessPagesDB(env);
if (inProcessDbName == null) {
inProcessPages = new InProcessPagesDB(env);
} else {
inProcessPages = new InProcessPagesDB(env, inProcessDbName);
}
long numPreviouslyInProcessPages = inProcessPages.getLength();
if (numPreviouslyInProcessPages > 0) {
logger.info("Rescheduling {} URLs from previous crawl.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,11 @@ public class InProcessPagesDB extends WorkQueues {
private static final String DATABASE_NAME = "InProcessPagesDB";

/**
 * Backwards-compatible constructor that opens the in-process-pages work
 * queue under its default database name, delegating to the name-taking
 * constructor.
 *
 * @param env the Berkeley DB environment the database is opened in
 */
public InProcessPagesDB(Environment env) {
    this(env, DATABASE_NAME);
}

public InProcessPagesDB(Environment env, String dbName) {
super(env, dbName, true);
long docCount = getLength();
if (docCount > 0) {
logger.info("Loaded {} URLs that have been in process in the previous crawl.",
Expand Down