Skip to content

Commit

Permalink
Allow to run solr-indexing without an existing collection #289
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Oct 20, 2023
1 parent 9863645 commit 5f7554b
Show file tree
Hide file tree
Showing 8 changed files with 47 additions and 29 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# QA Catalogue Changelog

## v0.8.0

- [\#326](https://github.com/pkiraly/metadata-qa-marc/issues/326) Use GETOPT for the command line scripts

## v0.5.0

### New features
Expand Down
36 changes: 23 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,11 @@ total
1192536
```

* `issue-by-category.csv`: the counts of issues by categories
* `issue-by-category.csv`: the counts of issues by categories. Columns:
* `id` the identifier of the error category
* `category` the name of the category
* `instances` the number of instances of errors within the category (one record might have multiple instances of the same error)
* `records` the number of records having at least one of the errors within the category

```csv
id,category,instances,records
Expand Down Expand Up @@ -821,22 +825,28 @@ If the data is _not_ grouped by libraries (no `--groupBy <path>` parameter), it
structure and import some of the CSV files into it:

`issue_summary` table for the `issue-summary.csv`:

Each row represents a particular type of error.
```
id INTEGER,
MarcPath TEXT,
categoryId INTEGER,
typeId INTEGER,
type TEXT,
message TEXT,
url TEXT,
instances INTEGER,
records INTEGER
id INTEGER, -- identifier of the error
MarcPath TEXT, -- the location of the error in the bibliographic record
categoryId INTEGER, -- the identifier of the category of the error
typeId INTEGER, -- the identifier of the type of the error
type TEXT, -- the description of the type
message TEXT, -- extra contextual information
url TEXT, -- the url of the definition of the data element
instances INTEGER, -- the number of times this error occurred
records INTEGER -- the number of records in which this error occurred
```

`issue_details` table for the `issue-details.csv`:

Each row represents how many instances of an error occur in a particular bibliographic record

```
id TEXT,
errorId INTEGER,
instances INTEGER
id TEXT, -- the record identifier
errorId INTEGER, -- the error identifier (-> issue_summary.id)
instances INTEGER -- the number of instances of an error in the record
```

##### Union catalogue for multiple libraries
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,10 @@ private void initialize() {
: new MarcSolrClient(parameters.getSolrUrl());
client.setTrimId(parameters.getTrimId());
client.indexWithTokenizedField(parameters.indexWithTokenizedField());
if (parameters.getValidationUrl() != null) {
if (parameters.getSolrForScoresUrl() != null) {
validationClient = parameters.useEmbedded()
? new MarcSolrClient(parameters.getValidationClient())
: new MarcSolrClient(parameters.getValidationUrl());
: new MarcSolrClient(parameters.getSolrForScoresUrl());
validationClient.setTrimId(parameters.getTrimId());
}
readyToProcess = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ public class CommonParameters implements Serializable {
private SchemaType schemaType = SchemaType.MARC21;
private String groupBy;
private String groupListFile;
private String solrForScoresUrl;

protected void setOptions() {
if (!isOptionSet) {
Expand Down Expand Up @@ -98,6 +99,7 @@ protected void setOptions() {
options.addOption("c", "allowableRecords", true, "allow records for the analysis");
options.addOption("e", "groupBy", true, "group the results by the value of this data element (e.g. the ILN of library)");
options.addOption("3", "groupListFile", true, "the file which contains a list of ILN codes");
options.addOption("4", "solrForScoresUrl", true, "the URL of the Solr server used to store scores");

isOptionSet = true;
}
Expand Down Expand Up @@ -140,6 +142,7 @@ public CommonParameters(String[] arguments) throws ParseException {
readPicaRecordType();
readGroupBy();
readGroupListFile();
readSolrForScoresUrl();

args = cmd.getArgs();
}
Expand All @@ -154,7 +157,6 @@ private void readPicaRecordType() {
picaRecordTypeField = cmd.getOptionValue("picaRecordType");
}


private void readGroupBy() {
if (cmd.hasOption("groupBy"))
groupBy = cmd.getOptionValue("groupBy");
Expand All @@ -165,6 +167,11 @@ private void readGroupListFile() {
groupListFile = cmd.getOptionValue("groupListFile");
}

/**
 * Picks up the {@code solrForScoresUrl} command line option.
 *
 * <p>When the option was not supplied, the field keeps whatever value it
 * already had.
 */
private void readSolrForScoresUrl() {
  if (!cmd.hasOption("solrForScoresUrl")) {
    return;
  }
  solrForScoresUrl = cmd.getOptionValue("solrForScoresUrl");
}

private void readPicaSubfieldSeparator() {
if (cmd.hasOption("picaSubfieldSeparator"))
picaSubfieldSeparator = cmd.getOptionValue("picaSubfieldSeparator");
Expand Down Expand Up @@ -544,6 +551,10 @@ public String getGroupListFile() {
return groupListFile;
}

/**
 * Returns the value given for the {@code solrForScoresUrl} option, described
 * in its help text as the URL of the Solr server used to store scores.
 *
 * @return the configured URL, or {@code null} when the option was not set
 */
public String getSolrForScoresUrl() {
  return this.solrForScoresUrl;
}

public String formatParameters() {
String text = "";
text += String.format("schemaType: %s%n", schemaType);
Expand Down Expand Up @@ -574,6 +585,7 @@ public String formatParameters() {
}
text += String.format("groupBy: %s%n", groupBy);
text += String.format("groupListFile: %s%n", groupListFile);
text += String.format("solrForScoresUrl: %s%n", solrForScoresUrl);

return text;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ public class MarcToSolrParameters extends CommonParameters {
private String solrUrl = null;
private boolean doCommit = false;
private SolrFieldType solrFieldType = SolrFieldType.MARC;
private String validationUrl = null;
private SolrClient mainClient = null;
private SolrClient validationClient = null;
private boolean indexWithTokenizedField = false;
Expand All @@ -24,7 +23,6 @@ protected void setOptions() {
options.addOption("c", "doCommit", false, "send commits to Solr regularly");
options.addOption("t", "solrFieldType", true,
"type of Solr fields, could be one of 'marc-tags', 'human-readable', or 'mixed'");
options.addOption("A", "validationUrl", true, "the URL of the Solr server used in validation");
options.addOption("B", "useEmbedded", false, "use embedded Solr server (used in tests only)");
options.addOption("C", "indexWithTokenizedField", false, "index data elements as tokenized field as well");
isOptionSet = true;
Expand All @@ -43,9 +41,6 @@ public MarcToSolrParameters(String[] arguments) throws ParseException {
if (cmd.hasOption("solrFieldType"))
solrFieldType = SolrFieldType.byCode(cmd.getOptionValue("solrFieldType"));

if (cmd.hasOption("validationUrl"))
validationUrl = cmd.getOptionValue("validationUrl");

if (cmd.hasOption("useEmbedded"))
useEmbedded = true;

Expand All @@ -65,10 +60,6 @@ public SolrFieldType getSolrFieldType() {
return solrFieldType;
}

public String getValidationUrl() {
return validationUrl;
}

public SolrClient getMainClient() {
return mainClient;
}
Expand Down Expand Up @@ -99,7 +90,6 @@ public String formatParameters() {
text += String.format("solrUrl: %s%n", solrUrl);
text += String.format("doCommit: %s%n", doCommit);
text += String.format("solrFieldType: %s%n", solrFieldType);
text += String.format("validationUrl: %s%n", validationUrl);
text += String.format("indexWithTokenizedField: %s%n", indexWithTokenizedField);
return text;
}
Expand Down
4 changes: 2 additions & 2 deletions src/test/java/de/gwdg/metadataqa/marc/cli/MarcToSolrTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,11 @@ public void run0() {
"--solrFieldType", "MIXED",
"--useEmbedded",
"--solrUrl", "http://localhost:8983/solr/k10plus_pica_grouped_dev",
"--validationUrl", "http://localhost:8983/solr/k10plus_pica_grouped_validation",
"--solrForScoresUrl", "http://localhost:8983/solr/k10plus_pica_grouped_scores",
getPath("src/test/resources/pica/pica-with-holdings-info.dat")
});
EmbeddedSolrServer mainClient = EmbeddedSolrClientFactory.getClient(coreFromUrl(params.getSolrUrl()));
EmbeddedSolrServer validationClient = EmbeddedSolrClientFactory.getClient(coreFromUrl(params.getValidationUrl()));
EmbeddedSolrServer validationClient = EmbeddedSolrClientFactory.getClient(coreFromUrl(params.getSolrForScoresUrl()));
params.setMainClient(mainClient);
params.setValidationClient(validationClient);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,8 @@ public void formatParameters() {
"defaultEncoding: null\n" +
"alephseqLineType: null\n" +
"groupBy: null\n" +
"groupListFile: null\n";
"groupListFile: null\n" +
"solrForScoresUrl: null\n";
assertEquals(expected, parameters.formatParameters());
} catch (ParseException e) {
logger.log(Level.WARNING, "error in formatParameters()", e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ public void formatParameters() {
"alephseqLineType: null\n" +
"groupBy: null\n" +
"groupListFile: null\n" +
"solrForScoresUrl: null\n" +
"shaclConfigurationFile: shacl.cnf\n" +
"shaclOutputFile: shacl.csv\n" +
"shaclOutputType: STATUS\n",
Expand Down

0 comments on commit 5f7554b

Please sign in to comment.