From 0dc4c927e38c6778247f45c08c6eec6049fb8f87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Kir=C3=A1ly?= Date: Thu, 2 Nov 2023 18:46:40 +0100 Subject: [PATCH] Add solr field with number of times a field is used in a record #342 --- catalogues/k10plus_pica_grouped.sh | 2 +- .../gwdg/metadataqa/marc/cli/MarcToSolr.java | 22 ++++++++++++++----- .../keygenerator/DataFieldKeyGenerator.java | 2 +- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/catalogues/k10plus_pica_grouped.sh b/catalogues/k10plus_pica_grouped.sh index 07f567d72..b32c375f5 100755 --- a/catalogues/k10plus_pica_grouped.sh +++ b/catalogues/k10plus_pica_grouped.sh @@ -11,7 +11,7 @@ TYPE_PARAMS="$TYPE_PARAMS --ignorableFields 001@,001E,001L,001U,001U,001X,001X,0 TYPE_PARAMS="$TYPE_PARAMS --allowableRecords base64:"$(echo '002@.0 !~ "^L" && 002@.0 !~ "^..[iktN]" && (002@.0 !~ "^.v" || 021A.a?)' | base64 -w 0) TYPE_PARAMS="$TYPE_PARAMS --solrForScoresUrl http://localhost:8983/solr/k10plus_pica_grouped_validation" TYPE_PARAMS="$TYPE_PARAMS --indexWithTokenizedField" -TYPE_PARAMS="$TYPE_PARAMS --countFields" +TYPE_PARAMS="$TYPE_PARAMS --indexFieldCounts" # MASK=sample.pica # =kxp-title_2022-09-30-groupped.dat.gz MASK=${MASK:=pica-with-holdings-info-1K.dat} # if not set in setdir.sh diff --git a/src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java b/src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java index dc710c161..37a1d381c 100644 --- a/src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java +++ b/src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java @@ -12,6 +12,7 @@ import de.gwdg.metadataqa.marc.definition.general.indexer.FieldIndexer; import de.gwdg.metadataqa.marc.model.validation.ValidationError; import de.gwdg.metadataqa.marc.utils.Counter; +import de.gwdg.metadataqa.marc.utils.keygenerator.DataFieldKeyGenerator; import de.gwdg.metadataqa.marc.utils.pica.PicaFieldDefinition; import de.gwdg.metadataqa.marc.utils.pica.PicaGroupIndexer; import de.gwdg.metadataqa.marc.utils.pica.path.PicaPath; @@ -28,6 +29,7 @@ import java.nio.file.Path; import java.text.DecimalFormat; import java.util.Arrays; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.logging.Logger; @@ -51,6 +53,7 @@ public class MarcToSolr extends QACli implements Bibliogra private boolean readyToProcess; private DecimalFormat decimalFormat = new DecimalFormat(); private FieldIndexer groupIndexer; + private Map escapedTagCache = new HashMap<>(); public MarcToSolr(String[] args) throws ParseException { parameters = new MarcToSolrParameters(args); @@ -158,22 +161,29 @@ private void indexValidationResults(BibliographicRecord bibliographicRecord, Sol document.addField(field, validationValues.getFieldValues(field)); } - private static void indexFieldCounts(BibliographicRecord bibliographicRecord, SolrInputDocument document) { - Counter fieldCOunter = new Counter<>(); + private void indexFieldCounts(BibliographicRecord bibliographicRecord, SolrInputDocument document) { + Counter fieldCounter = new Counter<>(); + boolean isPica = bibliographicRecord.getSchemaType().equals(SchemaType.PICA); for (DataField field : bibliographicRecord.getDatafields()) { String tag = null; if (field.getDefinition() != null) { - tag = bibliographicRecord.getSchemaType().equals(SchemaType.PICA) + tag = isPica ? ((PicaFieldDefinition)field.getDefinition()).getId() : field.getDefinition().getTag(); } else { tag = field.getTag(); } - fieldCOunter.count(tag); + fieldCounter.count(escape(tag)); } - for (Map.Entry entry : fieldCOunter.entrySet()) { + for (Map.Entry entry : fieldCounter.entrySet()) document.addField(String.format("%s_count_i", entry.getKey()), entry.getValue()); - } + } + + private String escape(String tag) { + if (!escapedTagCache.containsKey(tag)) + escapedTagCache.put(tag, DataFieldKeyGenerator.escape(tag)); + + return escapedTagCache.get(tag); } @Override diff --git a/src/main/java/de/gwdg/metadataqa/marc/utils/keygenerator/DataFieldKeyGenerator.java b/src/main/java/de/gwdg/metadataqa/marc/utils/keygenerator/DataFieldKeyGenerator.java index b40fadae9..5f5554769 100644 --- a/src/main/java/de/gwdg/metadataqa/marc/utils/keygenerator/DataFieldKeyGenerator.java +++ b/src/main/java/de/gwdg/metadataqa/marc/utils/keygenerator/DataFieldKeyGenerator.java @@ -152,7 +152,7 @@ else if (!tag.equals(indexTag) && codeForIndex.equals("_" + code)) return key; } - private String escape(String tag) { + public static String escape(String tag) { List safe = new ArrayList<>(); for (int i = 0; i < tag.length(); i++) { String code = tag.substring(i, i+1);