Skip to content

Commit

Permalink
Add solr field with number of times a field is used in a record #342: add subfield count
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Dec 18, 2024
1 parent d7044a2 commit 9e228f0
Show file tree
Hide file tree
Showing 16 changed files with 137 additions and 38 deletions.
3 changes: 2 additions & 1 deletion catalogues/k10plus_pica_grouped.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ TYPE_PARAMS="$TYPE_PARAMS --allowableRecords base64:"$(echo '[email protected] !~ "^L" && 0
# TYPE_PARAMS="$TYPE_PARAMS --solrUrl http://localhost:8983/solr/k10plus_pica_grouped"
TYPE_PARAMS="$TYPE_PARAMS --solrForScoresUrl http://localhost:8983/solr/k10plus_pica_grouped_validation"
TYPE_PARAMS="$TYPE_PARAMS --indexWithTokenizedField"
TYPE_PARAMS="$TYPE_PARAMS --indexFieldCounts"
TYPE_PARAMS="$TYPE_PARAMS --indexFieldCounts --indexSubfieldCounts"
TYPE_PARAMS="$TYPE_PARAMS --fieldPrefix bib"
# MASK=sample.pica
# =kxp-title_2022-09-30-groupped.dat.gz
MASK=${MASK:=pica-with-holdings-info-1K.dat} # if not set in setdir.sh
Expand Down
5 changes: 4 additions & 1 deletion catalogues/nls.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
. ./setdir.sh

NAME=nls
TYPE_PARAMS="--marcxml --emptyLargeCollectors --indexWithTokenizedField --indexFieldCounts --solrForScoresUrl http://localhost:8983/solr/nls_validation"
TYPE_PARAMS="--marcxml --emptyLargeCollectors --indexWithTokenizedField --doCommit"
# TYPE_PARAMS="${TYPE_PARAMS} --offset 180000"
TYPE_PARAMS="${TYPE_PARAMS} --indexFieldCounts --indexSubfieldCounts"
TYPE_PARAMS="${TYPE_PARAMS} --solrForScoresUrl http://localhost:8983/solr/nls_validation"
MASK=NBS_v2_validated_marcxml.xml.gz

. ./common-script
13 changes: 13 additions & 0 deletions cli-parameter-definitions.json
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,12 @@
"hasArg": false,
"description": "index the count of field instances"
},
{
"short": "G",
"long": "indexSubfieldCounts",
"hasArg": false,
"description": "index the count of subfield instances"
},
{
"short": "F",
"long": "fieldPrefix",
Expand Down Expand Up @@ -426,6 +432,12 @@
"long": "fileName",
"hasArg": true,
"description": "output file (default: extracted.csv)"
},
{
"short": "A",
"long": "ids",
"hasArg": true,
"description": "list of identifiers separated by comma"
}
],
"functional-analysis": [
Expand Down Expand Up @@ -462,6 +474,7 @@
"description": "action: 'primary' (default), 'pairing'"
}
],
"marc-history": [],
"record-patterns": [
{
"short": "R",
Expand Down
6 changes: 4 additions & 2 deletions index
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ options:
-C, --indexWithTokenizedField index data elements as tokenized field as well
-D, --commitAt <arg> commit index after this number of records
-E, --indexFieldCounts index the count of field instances
-G, --indexSubfieldCounts index the count of subfield instances
-F, --fieldPrefix <arg> field prefix
-Z, --core <arg> The index name (core)
-Y, --file-path <arg> File path
Expand All @@ -74,8 +75,8 @@ if [ $# -eq 0 ]; then
show_usage
fi

SHORT_OPTIONS="m:hnl:o:i:d:qabpxyt:rz:v:f:s:g:1:2:u:j:w:k:c:e:3:4:S:AT:BCD:EF:Z:Y:X:WVU"
LONG_OPTIONS="marcVersion:,help,nolog,limit:,offset:,id:,defaultRecordType:,fixAlephseq,fixAlma,fixKbr,alephseq,marcxml,lineSeparated,outputDir:,trimId,ignorableFields:,ignorableRecords:,marcFormat:,dataSource:,defaultEncoding:,alephseqLineType:,picaIdField:,picaSubfieldSeparator:,picaSchemaFile:,schemaType:,picaRecordType:,allowableRecords:,groupBy:,groupListFile:,solrForScoresUrl:,solrUrl:,doCommit,solrFieldType:,useEmbedded,indexWithTokenizedField,commitAt:,indexFieldCounts,fieldPrefix:,core:,file-path:,file-mask:,purge,status,no-delete"
SHORT_OPTIONS="m:hnl:o:i:d:qabpxyt:rz:v:f:s:g:1:2:u:j:w:k:c:e:3:4:S:AT:BCD:EF:GZ:Y:X:WVU"
LONG_OPTIONS="marcVersion:,help,nolog,limit:,offset:,id:,defaultRecordType:,fixAlephseq,fixAlma,fixKbr,alephseq,marcxml,lineSeparated,outputDir:,trimId,ignorableFields:,ignorableRecords:,marcFormat:,dataSource:,defaultEncoding:,alephseqLineType:,picaIdField:,picaSubfieldSeparator:,picaSchemaFile:,schemaType:,picaRecordType:,allowableRecords:,groupBy:,groupListFile:,solrForScoresUrl:,solrUrl:,doCommit,solrFieldType:,useEmbedded,indexWithTokenizedField,commitAt:,indexFieldCounts,fieldPrefix:,indexSubfieldCounts,core:,file-path:,file-mask:,purge,status,no-delete"

GETOPT=$(getopt \
-o ${SHORT_OPTIONS} \
Expand Down Expand Up @@ -132,6 +133,7 @@ while true ; do
-C|--indexWithTokenizedField) PARAMS="$PARAMS --indexWithTokenizedField" ; shift ;;
-D|--commitAt) PARAMS="$PARAMS --commitAt $2" ; shift 2 ;;
-E|--indexFieldCounts) PARAMS="$PARAMS --indexFieldCounts" ; shift ;;
-G|--indexSubfieldCounts) PARAMS="$PARAMS --indexSubfieldCounts" ; shift ;;
-F|--fieldPrefix) PARAMS="$PARAMS --fieldPrefix $2" ; shift 2 ;;
-Z|--core) CORE="$2" ; shift 2 ;;
-Y|--file-path) FILE_PATH="$2" ; shift 2 ;;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,13 @@
package de.gwdg.metadataqa.marc.analysis.contextual.classification;

import de.gwdg.metadataqa.marc.MarcSubfield;
import de.gwdg.metadataqa.marc.Utils;
import de.gwdg.metadataqa.marc.analysis.contextual.ContextualAnalyzer;
import de.gwdg.metadataqa.marc.cli.parameters.ClassificationParameters;
import de.gwdg.metadataqa.marc.cli.utils.Schema;
import de.gwdg.metadataqa.marc.dao.DataField;
import de.gwdg.metadataqa.marc.dao.record.BibliographicRecord;
import de.gwdg.metadataqa.marc.definition.general.indexer.subject.ClassificationSchemes;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import static de.gwdg.metadataqa.marc.Utils.count;
Expand Down Expand Up @@ -50,7 +40,7 @@ protected void increaseCounters(int total) {
// Add this record as an example for the obtained number of classifications
statistics.getFrequencyExamples().computeIfAbsent(total, s -> bibliographicRecord.getId(true));

if (parameters == null || !parameters.doCollectCollocations()) {
if (parameters == null || !parameters.isDoCollectCollocations()) {
return;
}
logger.info("Collecting collocations");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,8 @@ private void validateSubfields(List<MarcSubfield> subfields) {
SubfieldDefinition subfieldDefinition = entry.getKey();
long count = entry.getValue();
if (count > 1 && subfieldDefinition.getCardinality().equals(Cardinality.Nonrepeatable)) {
addError(subfieldDefinition, SUBFIELD_NONREPEATABLE, String.format("there are %d instances", count));
// addError(subfieldDefinition, SUBFIELD_NONREPEATABLE, String.format("there are %d instances", count));
addError(subfieldDefinition, SUBFIELD_NONREPEATABLE, String.format("there are multiple instances", count));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,8 @@ private ValidationError createNonRepeatableFieldError(RepetitionDao dao, Integer
DataFieldDefinition fieldDefinition = dao.getFieldDefinition();
return new ValidationError(bibliographicRecord.getId(), fieldDefinition.getExtendedTag(),
ValidationErrorType.FIELD_NONREPEATABLE,
String.format("there are %d instances", count),
// String.format("there are %d instances", count),
String.format("there are multiple instances", count),
fieldDefinition.getDescriptionUrl()
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ public void afterIteration(int numberOfprocessedRecords, long duration) {
printClassificationsHistogram();
printFrequencyExamples();
printSchemaSubfieldsStatistics();
if (parameters.doCollectCollocations())
if (parameters.isDoCollectCollocations())
printClassificationsCollocation();
copySchemaFileToOutputDir();
saveParameters("classifications.params.json", parameters, Map.of("numberOfprocessedRecords", numberOfprocessedRecords, "duration", duration));
Expand Down
63 changes: 51 additions & 12 deletions src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package de.gwdg.metadataqa.marc.cli;

import de.gwdg.metadataqa.marc.MarcSubfield;
import de.gwdg.metadataqa.marc.cli.parameters.CommonParameters;
import de.gwdg.metadataqa.marc.cli.parameters.MarcToSolrParameters;
import de.gwdg.metadataqa.marc.cli.processor.BibliographicInputProcessor;
Expand Down Expand Up @@ -66,18 +67,18 @@ public MarcToSolr(MarcToSolrParameters parameters) {
private void initialize() {
options = parameters.getOptions();

client = parameters.useEmbedded()
client = parameters.isUseEmbedded()
? new MarcSolrClient(parameters.getMainClient())
: new MarcSolrClient(parameters.getSolrUrl());
client.setTrimId(parameters.getTrimId());
client.indexWithTokenizedField(parameters.indexWithTokenizedField());
client.indexWithTokenizedField(parameters.isIndexWithTokenizedField());

if (parameters.getFieldPrefix() != null) {
client.setFieldPrefix(parameters.getFieldPrefix());
}

if (parameters.getSolrForScoresUrl() != null) {
validationClient = parameters.useEmbedded()
validationClient = parameters.isUseEmbedded()
? new MarcSolrClient(parameters.getValidationClient())
: new MarcSolrClient(parameters.getSolrForScoresUrl());
validationClient.setTrimId(parameters.getTrimId());
Expand Down Expand Up @@ -144,23 +145,34 @@ public void processRecord(BibliographicRecord bibliographicRecord, int recordNum
// Add the record itself as a field to the index
keyValuePairs.put("record_sni", Collections.singletonList(bibliographicRecord.asJson()));

// logger.info(bibliographicRecord.getId());
SolrInputDocument solrDocument = client.createSolrDoc(bibliographicRecord.getId(), keyValuePairs);
if (validationClient != null) {
indexValidationResults(bibliographicRecord, solrDocument);
}

if (parameters.indexFieldCounts()) {
if (parameters.isIndexFieldCounts() || parameters.isIndexSubfieldCounts()) {
indexFieldCounts(bibliographicRecord, solrDocument);
}

client.index(solrDocument);
try {
client.index(solrDocument);
} catch (Exception e) {
logger.severe(() -> "ERROR while index." + e.getLocalizedMessage());
}

if (recordNumber % parameters.getCommitAt() != 0) {
return;
}

if (parameters.doCommit()) {
if (parameters.isDoCommit()) {
logger.info("do commit @" + recordNumber);
client.commit();
long indexedRecordCount = client.getCount();
if (recordNumber != indexedRecordCount) {
logger.severe(String.format("recordNumber: %d != indexedRecordCount: %d", recordNumber, indexedRecordCount));
}
logger.info("/do commit @" + recordNumber);
}

String logMessage = String.format(
Expand All @@ -183,8 +195,17 @@ private void indexValidationResults(BibliographicRecord bibliographicRecord, Sol
}
}

private void indexFieldCounts(BibliographicRecord bibliographicRecord, SolrInputDocument document) {
Counter<String> fieldCounter = new Counter<>();
/**
* Index field and subfield counts. The solr field will look like <tag>_count_i and
* <tag><subfield code>_count_i, the value will be the number of times this element is
* available in the record.
*
* @param bibliographicRecord The bibliographic record
* @param document The Solr document
*/
private void indexFieldCounts(BibliographicRecord bibliographicRecord,
SolrInputDocument document) {
Counter<String> counter = new Counter<>();
boolean isPica = bibliographicRecord.getSchemaType().equals(SchemaType.PICA);
for (DataField field : bibliographicRecord.getDatafields()) {
String tag;
Expand All @@ -196,10 +217,20 @@ private void indexFieldCounts(BibliographicRecord bibliographicRecord, SolrInput
tag = field.getTag();
}
String safeTag = escape(tag);
fieldCounter.count(safeTag);
if (parameters.isIndexFieldCounts())
counter.count(safeTag);

if (parameters.isIndexSubfieldCounts()) {
for (MarcSubfield subfield : field.getSubfields()) {
String safeSubfieldCode = DataFieldKeyGenerator.escape(subfield.getCode());
counter.count(safeTag + safeSubfieldCode);
}
}
}
for (Map.Entry<String, Integer> entry : fieldCounter.entrySet()) {
document.addField(String.format("%s_count_i", entry.getKey()), entry.getValue());
for (Map.Entry<String, Integer> entry : counter.entrySet()) {
document.addField(String.format(
"%s%s_count_i",
parameters.getFieldPrefix(), entry.getKey()), entry.getValue());
}
}

Expand Down Expand Up @@ -228,7 +259,15 @@ public void fileProcessed() {
@Override
public void afterIteration(int numberOfprocessedRecords, long duration) {
client.commit();
saveParameters("marctosolr.params.json", parameters, Map.of("numberOfprocessedRecords", numberOfprocessedRecords, "duration", duration));
logger.info(parameters.toString());
saveParameters(
"marctosolr.params.json",
parameters,
Map.of(
"numberOfprocessedRecords", numberOfprocessedRecords,
"duration", duration
)
);
}

@Override
Expand Down
1 change: 1 addition & 0 deletions src/main/java/de/gwdg/metadataqa/marc/cli/QACli.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ protected void saveParameters(String fileName, T parameters, Map<String, Object>
ObjectMapper mapper = new ObjectMapper();
try {
String json = mapper.writeValueAsString(parameters);
logger.info("json: " + json);
Map<String, Object> configuration = mapper.readValue(json, new TypeReference<>(){});
configuration.put("mqaf.version", de.gwdg.metadataqa.api.cli.Version.getVersion());
configuration.put("qa-catalogue.version", de.gwdg.metadataqa.marc.cli.Version.getVersion());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public ClassificationParameters(String[] arguments) throws ParseException {
}


public boolean doCollectCollocations() {
/**
 * Whether collocation statistics should be collected during the analysis.
 *
 * @return the value of the {@code collectCollocations} flag
 */
public boolean isDoCollectCollocations() {
  return this.collectCollocations;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ public class MarcToSolrParameters extends CommonParameters {
private boolean isOptionSet = false;
private int commitAt = DEFAULT_COMMIT_AT;
private boolean indexFieldCounts = false;
private boolean indexSubfieldCounts = false;
private String fieldPrefix = null;

@Override
Expand All @@ -32,6 +33,7 @@ protected void setOptions() {
options.addOption("C", "indexWithTokenizedField", false, "index data elements as tokenized field as well");
options.addOption("D", "commitAt", true, "commit index after this number of records");
options.addOption("E", "indexFieldCounts", false, "index the count of field instances");
options.addOption("G", "indexSubfieldCounts", false, "index the count of subfield instances");
options.addOption("F", "fieldPrefix", true, "field prefix");
isOptionSet = true;
}
Expand Down Expand Up @@ -65,6 +67,9 @@ public MarcToSolrParameters(String[] arguments) throws ParseException {
if (cmd.hasOption("indexFieldCounts"))
indexFieldCounts = true;

if (cmd.hasOption("indexSubfieldCounts"))
indexSubfieldCounts = true;

if (cmd.hasOption("fieldPrefix"))
fieldPrefix = cmd.getOptionValue("fieldPrefix");
}
Expand All @@ -73,7 +78,7 @@ public String getSolrUrl() {
return solrUrl;
}

public boolean doCommit() {
/**
 * Whether an explicit Solr commit was requested.
 *
 * @return the value of the {@code doCommit} flag
 */
public boolean isDoCommit() {
  return this.doCommit;
}

Expand All @@ -97,22 +102,26 @@ public void setValidationClient(SolrClient validationClient) {
this.validationClient = validationClient;
}

public boolean useEmbedded() {
/**
 * Whether an embedded Solr server should be used instead of a remote URL.
 *
 * @return the value of the {@code useEmbedded} flag
 */
public boolean isUseEmbedded() {
  return this.useEmbedded;
}

public boolean indexWithTokenizedField() {
/**
 * Whether data elements should also be indexed as tokenized fields.
 *
 * @return the value of the {@code indexWithTokenizedField} flag
 */
public boolean isIndexWithTokenizedField() {
  return this.indexWithTokenizedField;
}

/**
 * The record interval at which the index should be committed
 * (defaults to {@code DEFAULT_COMMIT_AT}).
 *
 * @return the commit interval
 */
public int getCommitAt() {
  return this.commitAt;
}

public boolean indexFieldCounts() {
/**
 * Whether the count of field instances should be indexed
 * (enabled by {@code --indexFieldCounts}).
 *
 * @return the value of the {@code indexFieldCounts} flag
 */
public boolean isIndexFieldCounts() {
  return this.indexFieldCounts;
}

/**
 * Whether the count of subfield instances should be indexed
 * (enabled by {@code --indexSubfieldCounts}).
 *
 * @return the value of the {@code indexSubfieldCounts} flag
 */
public boolean isIndexSubfieldCounts() {
  return this.indexSubfieldCounts;
}

/**
 * The prefix prepended to generated Solr field names
 * (set by {@code --fieldPrefix}; may be {@code null} if unset).
 *
 * @return the field prefix, or {@code null}
 */
public String getFieldPrefix() {
  return this.fieldPrefix;
}
Expand All @@ -130,6 +139,7 @@ public String formatParameters() {
text += String.format("indexWithTokenizedField: %s%n", indexWithTokenizedField);
text += String.format("commitAt: %s%n", commitAt);
text += String.format("indexFieldCounts: %s%n", indexFieldCounts);
text += String.format("indexSubfieldCounts: %s%n", indexSubfieldCounts);
text += String.format("fieldPrefix: %s%n", fieldPrefix);
return text;
}
Expand Down
Loading

0 comments on commit 9e228f0

Please sign in to comment.