Add solr field with number of times a field is used in a record #342

pkiraly committed Nov 2, 2023
1 parent c594b3e commit 3add92c
Showing 5 changed files with 67 additions and 22 deletions.
1 change: 1 addition & 0 deletions catalogues/k10plus_pica_grouped.sh
@@ -11,6 +11,7 @@ TYPE_PARAMS="$TYPE_PARAMS --ignorableFields 001@,001E,001L,001U,001U,001X,001X,0
TYPE_PARAMS="$TYPE_PARAMS --allowableRecords base64:"$(echo '002@.0 !~ "^L" && 002@.0 !~ "^..[iktN]" && (002@.0 !~ "^.v" || 021A.a?)' | base64 -w 0)
TYPE_PARAMS="$TYPE_PARAMS --solrForScoresUrl http://localhost:8983/solr/k10plus_pica_grouped_validation"
TYPE_PARAMS="$TYPE_PARAMS --indexWithTokenizedField"
TYPE_PARAMS="$TYPE_PARAMS --countFields"
# MASK=sample.pica
# =kxp-title_2022-09-30-groupped.dat.gz
MASK=${MASK:=pica-with-holdings-info-1K.dat} # if not set in setdir.sh
26 changes: 13 additions & 13 deletions common-script
@@ -35,7 +35,7 @@ run() {
do_validate() {
GENERAL_PARAMS="--details --trimId --summary --format csv --defaultRecordType BOOKS"
OUTPUT_PARAMS="--outputDir ${OUTPUT_DIR} --detailsFileName issue-details.csv --summaryFileName issue-summary.csv"
- PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--indexWithTokenizedField//g')
+ PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--(indexWithTokenizedField|countFields)//g')
run validate
./validate ${GENERAL_PARAMS} ${OUTPUT_PARAMS} ${PARAMS} ${MARC_DIR}/$MASK 2> ${PREFIX}/validate.log
}
@@ -72,7 +72,7 @@ do_postprocess_solr() {
}

do_completeness() {
- PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--indexWithTokenizedField//g')
+ PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|countFields)//g')
run completeness
./completeness --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${PREFIX}/completeness.log
}
@@ -86,20 +86,20 @@ do_completeness_sqlite() {
}

do_classifications() {
- PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--indexWithTokenizedField//g')
+ PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|countFields)//g')
run classifications
./classifications --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${PREFIX}/classifications.log
Rscript scripts/classifications/classifications-type.R ${OUTPUT_DIR}
}

do_authorities() {
- PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--indexWithTokenizedField//g')
+ PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts)//g')
run authorities
./authorities --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${PREFIX}/authorities.log
}

do_tt_completeness() {
- PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--indexWithTokenizedField//g')
+ PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts)//g')
run tt-completeness
./tt-completeness --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ --trimId ${MARC_DIR}/${MASK} 2> ${PREFIX}/tt-completeness.log
Rscript scripts/tt-histogram/tt-histogram.R ${OUTPUT_DIR} &>> ${PREFIX}/tt-completeness.log
@@ -109,7 +109,7 @@ do_tt_completeness() {
}

do_shelf_ready_completeness() {
- PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--indexWithTokenizedField//g')
+ PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts)//g')
run shelf-ready-completeness
./shelf-ready-completeness \
--defaultRecordType BOOKS \
@@ -124,7 +124,7 @@ do_shelf_ready_completeness() {
}

do_bl_classification() {
- PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--indexWithTokenizedField//g')
+ PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts)//g')
run bk-classification
./bl-classification \
--defaultRecordType BOOKS \
@@ -134,7 +134,7 @@ do_bl_classification() {
}

do_serial_score() {
- PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--indexWithTokenizedField//g')
+ PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts)//g')
run serial-score
./serial-score --defaultRecordType BOOKS \
${PARAMS} \
@@ -150,15 +150,15 @@ do_format() {
}

do_functional_analysis() {
- PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--indexWithTokenizedField//g')
+ PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts)//g')
run functional-analysis
./functional-analysis --defaultRecordType BOOKS \
${PARAMS} \
--outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${PREFIX}/functional-analysis.log
}

do_network_analysis() {
- PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--indexWithTokenizedField//g')
+ PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts)//g')
run network-analysis
./network-analysis --defaultRecordType BOOKS \
${PARAMS} \
@@ -214,7 +214,7 @@ do_marc_history() {
else
SELECTOR="008~7-10;008~0-5"
fi
- PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--indexWithTokenizedField//g')
+ PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts)//g')

run marc-history
./formatter --selector "$SELECTOR" --defaultRecordType BOOKS ${PARAMS} --separator "," \
@@ -233,7 +233,7 @@ do_marc_history() {
}

do_record_patterns() {
- PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--indexWithTokenizedField//g')
+ PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts)//g')

run record-patterns
Rscript scripts/record-patterns/top-fields.R ${OUTPUT_DIR} &>> ${PREFIX}/top-fields.log
@@ -427,7 +427,7 @@ do_export_schema_files() {

do_shacl4bib() {
# note: SHACL specific parameters are missing here --shaclConfigurationFile, --shaclOutputType, --shaclOutputFile
- PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--indexWithTokenizedField//g')
+ PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts)//g')
run shacl4bib
echo " ./shacl4bib --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${PREFIX}/shacl4bib.log"
./shacl4bib --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${PREFIX}/shacl4bib.log
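A note on the edit repeated through common-script: the analysis CLIs (completeness, authorities, and so on) do not accept indexing-only switches, so every do_* function strips them out of TYPE_PARAMS with sed before invoking its tool. A minimal sketch of the stripping under an invented flag set (the --db value is illustrative, not taken from these scripts):

    # illustrative flag set; only the indexing-only switches are removed
    TYPE_PARAMS="--db k10plus --indexWithTokenizedField --countFields"
    PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--(indexWithTokenizedField|countFields)//g')
    echo "$PARAMS"    # prints: --db k10plus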
12 changes: 9 additions & 3 deletions index
@@ -19,6 +19,8 @@ usage:
[-d|--defaultRecordType] [-v|--marcVersion]
[-A|--alephseqLineType] [-B|--validationCore] [-C|--outputDir]
[-C|--indexWithTokenizedField]
+ [-D|--commitAt]
+ [-E|--indexFieldCounts]
[-p|--purge] [-s|--status] [-h|--help]
-b, --db <name> name of the database
@@ -46,7 +48,9 @@ usage:
-B, --validationCore the Solr collection used in the validation task
-t, --outputDir the directory to write the file listing the parameters
-3, --groupListFile the file which contains a list of ILN codes
- -C, --indexWithTokenizedField
+ -C, --indexWithTokenizedField index data elements as tokenized field as well
+ -D, --commitAt commit index after this number of records
+ -E, --indexFieldCounts index the count of field instances
-s, --status status information
-p, --purge delete all records from a core
-h, --help this help
@@ -78,8 +82,8 @@ validationCore=""
outputDir=""
PARAMS=""

- GETOPT=$(getopt -o b:p:m:ws::xard:hSpv:l:i:g:A:F:f:z:J:B:t:C:3:c:4: \
-   --long db:,file-path:,file-mask:,no-delete,solrFieldType:,marcxml,alephseq,trimId,defaultRecordType,help,status,purge,marcVersion:,limit:,ignorableRecords:,defaultEncoding:,alephseqLineType:,schemaType:,marcFormat:,ignorableFields:,groupBy:,validationCore:,outputDir:,outputDir,indexWithTokenizedField,groupListFile:,allowableRecords:,solrForScoresUrl: \
+ GETOPT=$(getopt -o b:p:m:ws::xard:hSpv:l:i:g:A:F:f:z:J:B:t:C:3:c:4:D:E \
+   --long db:,file-path:,file-mask:,no-delete,solrFieldType:,marcxml,alephseq,trimId,defaultRecordType,help,status,purge,marcVersion:,limit:,ignorableRecords:,defaultEncoding:,alephseqLineType:,schemaType:,marcFormat:,ignorableFields:,groupBy:,validationCore:,outputDir:,outputDir,indexWithTokenizedField,groupListFile:,allowableRecords:,solrForScoresUrl:,commitAt:,indexFieldCounts \
-n ${ME} -- "$@")
eval set -- "$GETOPT"

@@ -109,6 +113,8 @@ while true ; do
-c|--allowableRecords) PARAMS="$PARAMS --allowableRecords $2" ; shift 2 ;;
-C|--indexWithTokenizedField) PARAMS="$PARAMS --indexWithTokenizedField" ; shift ;;
-4|--solrForScoresUrl) PARAMS="$PARAMS --solrForScoresUrl $2" ; shift 2 ;;
+ -D|--commitAt) PARAMS="$PARAMS --commitAt $2" ; shift 2 ;;
+ -E|--indexFieldCounts) PARAMS="$PARAMS --indexFieldCounts" ; shift ;;
-S|--status) status ; shift ;;
-p|--purge) purge_and_exit $DB ; shift ;;
-h|--help) show_usage ; shift ;;
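With -D/--commitAt and -E/--indexFieldCounts wired into both the getopt declaration and the case branches, the index script now passes the new options straight through in PARAMS. A hypothetical invocation (the --db, --file-path and --file-mask values are placeholders):

    # hypothetical call; database name, path and mask are placeholders
    ./index --db k10plus_pica_grouped \
      --file-path /data/marc \
      --file-mask 'pica-with-holdings-info-*.dat' \
      --commitAt 10000 \
      --indexFieldCounts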
39 changes: 33 additions & 6 deletions src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java
@@ -11,6 +11,8 @@
import de.gwdg.metadataqa.marc.definition.bibliographic.SchemaType;
import de.gwdg.metadataqa.marc.definition.general.indexer.FieldIndexer;
import de.gwdg.metadataqa.marc.model.validation.ValidationError;
+ import de.gwdg.metadataqa.marc.utils.Counter;
+ import de.gwdg.metadataqa.marc.utils.pica.PicaFieldDefinition;
import de.gwdg.metadataqa.marc.utils.pica.PicaGroupIndexer;
import de.gwdg.metadataqa.marc.utils.pica.path.PicaPath;
import org.apache.commons.cli.HelpFormatter;
@@ -126,14 +128,14 @@ public void processRecord(BibliographicRecord bibliographicRecord, int recordNum
parameters.getSolrFieldType(), true, parameters.getMarcVersion()
);
map.put("record_sni", Arrays.asList(bibliographicRecord.asJson()));
- SolrInputDocument document = client.createSolrDoc(bibliographicRecord.getId(), map);
+ SolrInputDocument solrDocument = client.createSolrDoc(bibliographicRecord.getId(), map);
if (validationClient != null) {
-   SolrDocument validationValues = validationClient.get(bibliographicRecord.getId());
-   if (validationValues != null && !validationValues.isEmpty())
-     for (String field : validationValues.getFieldNames())
-       document.addField(field, validationValues.getFieldValues(field));
+   indexValidationResults(bibliographicRecord, solrDocument);
}
- client.index(document);
+ if (parameters.indexFieldCounts()) {
+   indexFieldCounts(bibliographicRecord, solrDocument);
+ }
+ client.index(solrDocument);

if (recordNumber % parameters.getCommitAt() == 0) {
if (parameters.doCommit())
@@ -149,6 +151,31 @@ public void processRecord(BibliographicRecord bibliographicRecord, int recordNum
}
}

+ private void indexValidationResults(BibliographicRecord bibliographicRecord, SolrInputDocument document) {
+   SolrDocument validationValues = validationClient.get(bibliographicRecord.getId());
+   if (validationValues != null && !validationValues.isEmpty())
+     for (String field : validationValues.getFieldNames())
+       document.addField(field, validationValues.getFieldValues(field));
+ }
+
+ private static void indexFieldCounts(BibliographicRecord bibliographicRecord, SolrInputDocument document) {
+   Counter<String> fieldCounter = new Counter<>();
+   for (DataField field : bibliographicRecord.getDatafields()) {
+     String tag = null;
+     if (field.getDefinition() != null) {
+       tag = bibliographicRecord.getSchemaType().equals(SchemaType.PICA)
+         ? ((PicaFieldDefinition) field.getDefinition()).getId()
+         : field.getDefinition().getTag();
+     } else {
+       tag = field.getTag();
+     }
+     fieldCounter.count(tag);
+   }
+   for (Map.Entry<String, Integer> entry : fieldCounter.entrySet()) {
+     document.addField(String.format("%s_count_i", entry.getKey()), entry.getValue());
+   }
+ }

@Override
public void beforeIteration() {
logger.info(parameters.formatParameters());
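Given the String.format("%s_count_i", ...) naming in indexFieldCounts, each distinct tag yields one integer field per record, e.g. 245_count_i for MARC or 044K_count_i for PICA. Assuming the target core maps the *_i suffix to an integer type through a dynamic field, as Solr's default configset does, no schema change is needed and the counts are immediately queryable. A hypothetical query for records that repeat a tag at least twice (core name and tag are placeholders):

    # hypothetical query; core name and tag are placeholders
    curl 'http://localhost:8983/solr/k10plus_pica_grouped/select' \
      --data-urlencode 'q=044K_count_i:[2 TO *]' \
      --data-urlencode 'fl=id,044K_count_i' \
      --data-urlencode 'rows=5'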
11 changes: 11 additions & 0 deletions src/main/java/de/gwdg/metadataqa/marc/cli/parameters/MarcToSolrParameters.java
@@ -17,6 +17,7 @@ public class MarcToSolrParameters extends CommonParameters {

private boolean isOptionSet = false;
private int commitAt = DEFAULT_COMMIT_AT;
+ private boolean indexFieldCounts = false;

protected void setOptions() {
if (!isOptionSet) {
@@ -28,6 +29,7 @@ protected void setOptions() {
options.addOption("B", "useEmbedded", false, "use embedded Solr server (used in tests only)");
options.addOption("C", "indexWithTokenizedField", false, "index data elements as tokenized field as well");
options.addOption("D", "commitAt", true, "commit index after this number of records");
options.addOption("E", "indexFieldCounts", false, "index the count of field instances");
isOptionSet = true;
}
}
@@ -52,6 +54,9 @@ public MarcToSolrParameters(String[] arguments) throws ParseException {

if (cmd.hasOption("commitAt"))
commitAt = Integer.valueOf(cmd.getOptionValue("commitAt"));

+ if (cmd.hasOption("indexFieldCounts"))
+   indexFieldCounts = true;
}

public String getSolrUrl() {
@@ -94,13 +99,19 @@ public int getCommitAt() {
return commitAt;
}

+ public boolean indexFieldCounts() {
+   return indexFieldCounts;
+ }

@Override
public String formatParameters() {
String text = super.formatParameters();
text += String.format("solrUrl: %s%n", solrUrl);
text += String.format("doCommit: %s%n", doCommit);
text += String.format("solrFieldType: %s%n", solrFieldType);
text += String.format("indexWithTokenizedField: %s%n", indexWithTokenizedField);
text += String.format("indexFieldCounts: %s%n", indexFieldCounts);
return text;
}

}
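Because formatParameters() now reports the flag, and MarcToSolr's beforeIteration() logs parameters.formatParameters() at startup, a run's configuration can be verified from the head of the log. Given the format strings above, the relevant excerpt would look roughly like this (values illustrative):

    solrUrl: http://localhost:8983/solr/k10plus_pica_grouped
    doCommit: true
    solrFieldType: MIXED
    indexWithTokenizedField: false
    indexFieldCounts: true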
