Skip to content

Commit

Permalink
Add solr field with number of times a field is used in a record #342: add subfield count
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Dec 18, 2024
1 parent d7044a2 commit 9e228f0
Show file tree
Hide file tree
Showing 16 changed files with 137 additions and 38 deletions.
3 changes: 2 additions & 1 deletion catalogues/k10plus_pica_grouped.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ TYPE_PARAMS="$TYPE_PARAMS --allowableRecords base64:"$(echo '[email protected] !~ "^L" && 0
# TYPE_PARAMS="$TYPE_PARAMS --solrUrl http://localhost:8983/solr/k10plus_pica_grouped"
TYPE_PARAMS="$TYPE_PARAMS --solrForScoresUrl http://localhost:8983/solr/k10plus_pica_grouped_validation"
TYPE_PARAMS="$TYPE_PARAMS --indexWithTokenizedField"
TYPE_PARAMS="$TYPE_PARAMS --indexFieldCounts"
TYPE_PARAMS="$TYPE_PARAMS --indexFieldCounts --indexSubfieldCounts"
TYPE_PARAMS="$TYPE_PARAMS --fieldPrefix bib"
# MASK=sample.pica
# =kxp-title_2022-09-30-groupped.dat.gz
MASK=${MASK:=pica-with-holdings-info-1K.dat} # if not set in setdir.sh
Expand Down
5 changes: 4 additions & 1 deletion catalogues/nls.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
. ./setdir.sh

NAME=nls
TYPE_PARAMS="--marcxml --emptyLargeCollectors --indexWithTokenizedField --indexFieldCounts --solrForScoresUrl http://localhost:8983/solr/nls_validation"
TYPE_PARAMS="--marcxml --emptyLargeCollectors --indexWithTokenizedField --doCommit"
# TYPE_PARAMS="${TYPE_PARAMS} --offset 180000"
TYPE_PARAMS="${TYPE_PARAMS} --indexFieldCounts --indexSubfieldCounts"
TYPE_PARAMS="${TYPE_PARAMS} --solrForScoresUrl http://localhost:8983/solr/nls_validation"
MASK=NBS_v2_validated_marcxml.xml.gz

. ./common-script
13 changes: 13 additions & 0 deletions cli-parameter-definitions.json
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,12 @@
"hasArg": false,
"description": "index the count of field instances"
},
{
"short": "G",
"long": "indexSubfieldCounts",
"hasArg": false,
"description": "index the count of subfield instances"
},
{
"short": "F",
"long": "fieldPrefix",
Expand Down Expand Up @@ -426,6 +432,12 @@
"long": "fileName",
"hasArg": true,
"description": "output file (default: extracted.csv)"
},
{
"short": "A",
"long": "ids",
"hasArg": true,
"description": "list of identifiers separated by comma"
}
],
"functional-analysis": [
Expand Down Expand Up @@ -462,6 +474,7 @@
"description": "action: 'primary' (default), 'pairing'"
}
],
"marc-history": [],
"record-patterns": [
{
"short": "R",
Expand Down
6 changes: 4 additions & 2 deletions index
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ options:
-C, --indexWithTokenizedField index data elements as tokenized field as well
-D, --commitAt <arg> commit index after this number of records
-E, --indexFieldCounts index the count of field instances
-G, --indexSubfieldCounts index the count of subfield instances
-F, --fieldPrefix <arg> field prefix
-Z, --core <arg> The index name (core)
-Y, --file-path <arg> File path
Expand All @@ -74,8 +75,8 @@ if [ $# -eq 0 ]; then
show_usage
fi

SHORT_OPTIONS="m:hnl:o:i:d:qabpxyt:rz:v:f:s:g:1:2:u:j:w:k:c:e:3:4:S:AT:BCD:EF:Z:Y:X:WVU"
LONG_OPTIONS="marcVersion:,help,nolog,limit:,offset:,id:,defaultRecordType:,fixAlephseq,fixAlma,fixKbr,alephseq,marcxml,lineSeparated,outputDir:,trimId,ignorableFields:,ignorableRecords:,marcFormat:,dataSource:,defaultEncoding:,alephseqLineType:,picaIdField:,picaSubfieldSeparator:,picaSchemaFile:,schemaType:,picaRecordType:,allowableRecords:,groupBy:,groupListFile:,solrForScoresUrl:,solrUrl:,doCommit,solrFieldType:,useEmbedded,indexWithTokenizedField,commitAt:,indexFieldCounts,fieldPrefix:,core:,file-path:,file-mask:,purge,status,no-delete"
SHORT_OPTIONS="m:hnl:o:i:d:qabpxyt:rz:v:f:s:g:1:2:u:j:w:k:c:e:3:4:S:AT:BCD:EF:GZ:Y:X:WVU"
LONG_OPTIONS="marcVersion:,help,nolog,limit:,offset:,id:,defaultRecordType:,fixAlephseq,fixAlma,fixKbr,alephseq,marcxml,lineSeparated,outputDir:,trimId,ignorableFields:,ignorableRecords:,marcFormat:,dataSource:,defaultEncoding:,alephseqLineType:,picaIdField:,picaSubfieldSeparator:,picaSchemaFile:,schemaType:,picaRecordType:,allowableRecords:,groupBy:,groupListFile:,solrForScoresUrl:,solrUrl:,doCommit,solrFieldType:,useEmbedded,indexWithTokenizedField,commitAt:,indexFieldCounts,fieldPrefix:,indexSubfieldCounts,core:,file-path:,file-mask:,purge,status,no-delete"

GETOPT=$(getopt \
-o ${SHORT_OPTIONS} \
Expand Down Expand Up @@ -132,6 +133,7 @@ while true ; do
-C|--indexWithTokenizedField) PARAMS="$PARAMS --indexWithTokenizedField" ; shift ;;
-D|--commitAt) PARAMS="$PARAMS --commitAt $2" ; shift 2 ;;
-E|--indexFieldCounts) PARAMS="$PARAMS --indexFieldCounts" ; shift ;;
-G|--indexSubfieldCounts) PARAMS="$PARAMS --indexSubfieldCounts" ; shift ;;
-F|--fieldPrefix) PARAMS="$PARAMS --fieldPrefix $2" ; shift 2 ;;
-Z|--core) CORE="$2" ; shift 2 ;;
-Y|--file-path) FILE_PATH="$2" ; shift 2 ;;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,13 @@
package de.gwdg.metadataqa.marc.analysis.contextual.classification;

import de.gwdg.metadataqa.marc.MarcSubfield;
import de.gwdg.metadataqa.marc.Utils;
import de.gwdg.metadataqa.marc.analysis.contextual.ContextualAnalyzer;
import de.gwdg.metadataqa.marc.cli.parameters.ClassificationParameters;
import de.gwdg.metadataqa.marc.cli.utils.Schema;
import de.gwdg.metadataqa.marc.dao.DataField;
import de.gwdg.metadataqa.marc.dao.record.BibliographicRecord;
import de.gwdg.metadataqa.marc.definition.general.indexer.subject.ClassificationSchemes;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import static de.gwdg.metadataqa.marc.Utils.count;
Expand Down Expand Up @@ -50,7 +40,7 @@ protected void increaseCounters(int total) {
// Add this record as an example for the obtained number of classifications
statistics.getFrequencyExamples().computeIfAbsent(total, s -> bibliographicRecord.getId(true));

if (parameters == null || !parameters.doCollectCollocations()) {
if (parameters == null || !parameters.isDoCollectCollocations()) {
return;
}
logger.info("Collecting collocations");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,8 @@ private void validateSubfields(List<MarcSubfield> subfields) {
SubfieldDefinition subfieldDefinition = entry.getKey();
long count = entry.getValue();
if (count > 1 && subfieldDefinition.getCardinality().equals(Cardinality.Nonrepeatable)) {
addError(subfieldDefinition, SUBFIELD_NONREPEATABLE, String.format("there are %d instances", count));
// addError(subfieldDefinition, SUBFIELD_NONREPEATABLE, String.format("there are %d instances", count));
addError(subfieldDefinition, SUBFIELD_NONREPEATABLE, String.format("there are multiple instances", count));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,8 @@ private ValidationError createNonRepeatableFieldError(RepetitionDao dao, Integer
DataFieldDefinition fieldDefinition = dao.getFieldDefinition();
return new ValidationError(bibliographicRecord.getId(), fieldDefinition.getExtendedTag(),
ValidationErrorType.FIELD_NONREPEATABLE,
String.format("there are %d instances", count),
// String.format("there are %d instances", count),
String.format("there are multiple instances", count),
fieldDefinition.getDescriptionUrl()
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ public void afterIteration(int numberOfprocessedRecords, long duration) {
printClassificationsHistogram();
printFrequencyExamples();
printSchemaSubfieldsStatistics();
if (parameters.doCollectCollocations())
if (parameters.isDoCollectCollocations())
printClassificationsCollocation();
copySchemaFileToOutputDir();
saveParameters("classifications.params.json", parameters, Map.of("numberOfprocessedRecords", numberOfprocessedRecords, "duration", duration));
Expand Down
63 changes: 51 additions & 12 deletions src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package de.gwdg.metadataqa.marc.cli;

import de.gwdg.metadataqa.marc.MarcSubfield;
import de.gwdg.metadataqa.marc.cli.parameters.CommonParameters;
import de.gwdg.metadataqa.marc.cli.parameters.MarcToSolrParameters;
import de.gwdg.metadataqa.marc.cli.processor.BibliographicInputProcessor;
Expand Down Expand Up @@ -66,18 +67,18 @@ public MarcToSolr(MarcToSolrParameters parameters) {
private void initialize() {
options = parameters.getOptions();

client = parameters.useEmbedded()
client = parameters.isUseEmbedded()
? new MarcSolrClient(parameters.getMainClient())
: new MarcSolrClient(parameters.getSolrUrl());
client.setTrimId(parameters.getTrimId());
client.indexWithTokenizedField(parameters.indexWithTokenizedField());
client.indexWithTokenizedField(parameters.isIndexWithTokenizedField());

if (parameters.getFieldPrefix() != null) {
client.setFieldPrefix(parameters.getFieldPrefix());
}

if (parameters.getSolrForScoresUrl() != null) {
validationClient = parameters.useEmbedded()
validationClient = parameters.isUseEmbedded()
? new MarcSolrClient(parameters.getValidationClient())
: new MarcSolrClient(parameters.getSolrForScoresUrl());
validationClient.setTrimId(parameters.getTrimId());
Expand Down Expand Up @@ -144,23 +145,34 @@ public void processRecord(BibliographicRecord bibliographicRecord, int recordNum
// Add the record itself as a field to the index
keyValuePairs.put("record_sni", Collections.singletonList(bibliographicRecord.asJson()));

// logger.info(bibliographicRecord.getId());
SolrInputDocument solrDocument = client.createSolrDoc(bibliographicRecord.getId(), keyValuePairs);
if (validationClient != null) {
indexValidationResults(bibliographicRecord, solrDocument);
}

if (parameters.indexFieldCounts()) {
if (parameters.isIndexFieldCounts() || parameters.isIndexSubfieldCounts()) {
indexFieldCounts(bibliographicRecord, solrDocument);
}

client.index(solrDocument);
try {
client.index(solrDocument);
} catch (Exception e) {
logger.severe(() -> "ERROR while index." + e.getLocalizedMessage());
}

if (recordNumber % parameters.getCommitAt() != 0) {
return;
}

if (parameters.doCommit()) {
if (parameters.isDoCommit()) {
logger.info("do commit @" + recordNumber);
client.commit();
long indexedRecordCount = client.getCount();
if (recordNumber != indexedRecordCount) {
logger.severe(String.format("recordNumber: %d != indexedRecordCount: %d", recordNumber, indexedRecordCount));
}
logger.info("/do commit @" + recordNumber);
}

String logMessage = String.format(
Expand All @@ -183,8 +195,17 @@ private void indexValidationResults(BibliographicRecord bibliographicRecord, Sol
}
}

private void indexFieldCounts(BibliographicRecord bibliographicRecord, SolrInputDocument document) {
Counter<String> fieldCounter = new Counter<>();
/**
* Index field and subfield counts. The solr field will look like <tag>_count_i and
* <tag><subfield code>_count_i, the value will be the number of times this element is
* available in the record.
*
* @param bibliographicRecord The bibliographic record
* @param document The Solr document
*/
private void indexFieldCounts(BibliographicRecord bibliographicRecord,
SolrInputDocument document) {
Counter<String> counter = new Counter<>();
boolean isPica = bibliographicRecord.getSchemaType().equals(SchemaType.PICA);
for (DataField field : bibliographicRecord.getDatafields()) {
String tag;
Expand All @@ -196,10 +217,20 @@ private void indexFieldCounts(BibliographicRecord bibliographicRecord, SolrInput
tag = field.getTag();
}
String safeTag = escape(tag);
fieldCounter.count(safeTag);
if (parameters.isIndexFieldCounts())
counter.count(safeTag);

if (parameters.isIndexSubfieldCounts()) {
for (MarcSubfield subfield : field.getSubfields()) {
String safeSubfieldCode = DataFieldKeyGenerator.escape(subfield.getCode());
counter.count(safeTag + safeSubfieldCode);
}
}
}
for (Map.Entry<String, Integer> entry : fieldCounter.entrySet()) {
document.addField(String.format("%s_count_i", entry.getKey()), entry.getValue());
for (Map.Entry<String, Integer> entry : counter.entrySet()) {
document.addField(String.format(
"%s%s_count_i",
parameters.getFieldPrefix(), entry.getKey()), entry.getValue());
}
}

Expand Down Expand Up @@ -228,7 +259,15 @@ public void fileProcessed() {
@Override
public void afterIteration(int numberOfprocessedRecords, long duration) {
client.commit();
saveParameters("marctosolr.params.json", parameters, Map.of("numberOfprocessedRecords", numberOfprocessedRecords, "duration", duration));
logger.info(parameters.toString());
saveParameters(
"marctosolr.params.json",
parameters,
Map.of(
"numberOfprocessedRecords", numberOfprocessedRecords,
"duration", duration
)
);
}

@Override
Expand Down
1 change: 1 addition & 0 deletions src/main/java/de/gwdg/metadataqa/marc/cli/QACli.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ protected void saveParameters(String fileName, T parameters, Map<String, Object>
ObjectMapper mapper = new ObjectMapper();
try {
String json = mapper.writeValueAsString(parameters);
logger.info("json: " + json);
Map<String, Object> configuration = mapper.readValue(json, new TypeReference<>(){});
configuration.put("mqaf.version", de.gwdg.metadataqa.api.cli.Version.getVersion());
configuration.put("qa-catalogue.version", de.gwdg.metadataqa.marc.cli.Version.getVersion());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public ClassificationParameters(String[] arguments) throws ParseException {
}


public boolean doCollectCollocations() {
/**
 * Whether collocation statistics should be collected during the analysis.
 *
 * @return the value of the {@code collectCollocations} flag
 */
public boolean isDoCollectCollocations() {
  return this.collectCollocations;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ public class MarcToSolrParameters extends CommonParameters {
private boolean isOptionSet = false;
private int commitAt = DEFAULT_COMMIT_AT;
private boolean indexFieldCounts = false;
private boolean indexSubfieldCounts = false;
private String fieldPrefix = null;

@Override
Expand All @@ -32,6 +33,7 @@ protected void setOptions() {
options.addOption("C", "indexWithTokenizedField", false, "index data elements as tokenized field as well");
options.addOption("D", "commitAt", true, "commit index after this number of records");
options.addOption("E", "indexFieldCounts", false, "index the count of field instances");
options.addOption("G", "indexSubfieldCounts", false, "index the count of subfield instances");
options.addOption("F", "fieldPrefix", true, "field prefix");
isOptionSet = true;
}
Expand Down Expand Up @@ -65,6 +67,9 @@ public MarcToSolrParameters(String[] arguments) throws ParseException {
if (cmd.hasOption("indexFieldCounts"))
indexFieldCounts = true;

if (cmd.hasOption("indexSubfieldCounts"))
indexSubfieldCounts = true;

if (cmd.hasOption("fieldPrefix"))
fieldPrefix = cmd.getOptionValue("fieldPrefix");
}
Expand All @@ -73,7 +78,7 @@ public String getSolrUrl() {
return solrUrl;
}

public boolean doCommit() {
/**
 * Whether an explicit Solr commit was requested.
 *
 * @return the value of the {@code doCommit} flag
 */
public boolean isDoCommit() {
  return this.doCommit;
}

Expand All @@ -97,22 +102,26 @@ public void setValidationClient(SolrClient validationClient) {
this.validationClient = validationClient;
}

public boolean useEmbedded() {
/**
 * Whether an embedded Solr server should be used instead of a remote URL.
 *
 * @return the value of the {@code useEmbedded} flag
 */
public boolean isUseEmbedded() {
  return this.useEmbedded;
}

public boolean indexWithTokenizedField() {
/**
 * Whether data elements should also be indexed as tokenized fields.
 *
 * @return the value of the {@code indexWithTokenizedField} flag
 */
public boolean isIndexWithTokenizedField() {
  return this.indexWithTokenizedField;
}

/**
 * The record interval at which the index should be committed
 * (defaults to {@code DEFAULT_COMMIT_AT}).
 *
 * @return the commit interval
 */
public int getCommitAt() {
  return this.commitAt;
}

public boolean indexFieldCounts() {
/**
 * Whether the count of field instances should be indexed
 * (enabled by {@code --indexFieldCounts}).
 *
 * @return the value of the {@code indexFieldCounts} flag
 */
public boolean isIndexFieldCounts() {
  return this.indexFieldCounts;
}

/**
 * Whether the count of subfield instances should be indexed
 * (enabled by {@code --indexSubfieldCounts}).
 *
 * @return the value of the {@code indexSubfieldCounts} flag
 */
public boolean isIndexSubfieldCounts() {
  return this.indexSubfieldCounts;
}

/**
 * The prefix prepended to generated Solr field names
 * (set by {@code --fieldPrefix}; may be {@code null} if unset).
 *
 * @return the field prefix, or {@code null}
 */
public String getFieldPrefix() {
  return this.fieldPrefix;
}
Expand All @@ -130,6 +139,7 @@ public String formatParameters() {
text += String.format("indexWithTokenizedField: %s%n", indexWithTokenizedField);
text += String.format("commitAt: %s%n", commitAt);
text += String.format("indexFieldCounts: %s%n", indexFieldCounts);
text += String.format("indexSubfieldCounts: %s%n", indexSubfieldCounts);
text += String.format("fieldPrefix: %s%n", fieldPrefix);
return text;
}
Expand Down
Loading

0 comments on commit 9e228f0

Please sign in to comment.