PICA: Extend classification/subject headings schemes from config file #…
pkiraly committed Nov 6, 2023
1 parent 0b6e8d9 commit fb36749
Showing 11 changed files with 211 additions and 12 deletions.
@@ -8,6 +8,7 @@
import de.gwdg.metadataqa.marc.cli.utils.Schema;
import de.gwdg.metadataqa.marc.definition.bibliographic.SchemaType;
import de.gwdg.metadataqa.marc.definition.general.indexer.subject.ClassificationSchemes;
+import de.gwdg.metadataqa.marc.utils.pica.PicaSubjectManager;
import de.gwdg.metadataqa.marc.utils.pica.PicaVocabularyManager;
import de.gwdg.metadataqa.marc.utils.pica.VocabularyEntry;

@@ -41,6 +42,7 @@ public class ClassificationAnalyzer {
private ClassificationParameters parameters = null;
private BibliographicRecord marcRecord;
private List<Schema> schemasInRecord;
+private static List<FieldWithScheme> picaFieldsWithScheme = PicaSubjectManager.readFieldsWithScheme();

private static final List<String> fieldsWithIndicator1AndSubfield2 = Arrays.asList(
"052", // Geographic Classification
@@ -149,7 +151,7 @@ public int process() {
total = processFieldsWithoutSource(total);
total = processFieldsWithScheme(total, MARC21_FIELD_WITH_SCHEMES);
} else if (marcRecord.getSchemaType().equals(SchemaType.PICA)) {
-total = processFieldsWithSchemePica(total, PICA_FIELDS_WITH_SCHEME);
+total = processFieldsWithSchemePica(total, picaFieldsWithScheme);
}

increaseCounters(total);
@@ -180,12 +182,25 @@ private int processFieldsWithScheme(int total, List<FieldWithScheme> fieldsWithScheme) {

private int processFieldsWithSchemePica(int total, List<FieldWithScheme> fieldsWithScheme) {
int count = total;
-for (VocabularyEntry entry : manager.getAll()) {
-if (!marcRecord.hasDatafield(entry.getPica()))
+// for (VocabularyEntry entry : manager.getAll()) {
+for (FieldWithScheme entry : fieldsWithScheme) {
+/*
+String tag = entry.getPica();
+String schema = entry.getLabel();
+String voc = entry.getVoc();
+*/
+String tag = entry.getTag();
+String schema = entry.getSchemaName();
+String voc = tag;
+try {
+voc = classificationSchemes.resolve(schema);
+} catch (IllegalArgumentException e) {
+
+}
+if (!marcRecord.hasDatafield(tag))
continue;

-String schema = entry.getLabel();
-List<DataField> fields = marcRecord.getDatafield(entry.getPica());
+List<DataField> fields = marcRecord.getDatafield(tag);
List<Schema> schemas = new ArrayList<>();
for (DataField field : fields) {
String firstSubfield = null;
@@ -201,7 +216,7 @@ private int processFieldsWithSchemePica(int total, List<FieldWithScheme> fieldsWithScheme) {
}
}
if (firstSubfield != null) {
-var currentSchema = new Schema(field.getTagWithOccurrence(), firstSubfield, entry.getVoc(), schema);
+var currentSchema = new Schema(field.getTagWithOccurrence(), firstSubfield, voc, schema);
schemas.add(currentSchema);
updateSchemaSubfieldStatistics(field, currentSchema);
count++;
@@ -11,6 +11,7 @@
import de.gwdg.metadataqa.marc.cli.utils.RecordIterator;
import de.gwdg.metadataqa.marc.cli.utils.Schema;
import de.gwdg.metadataqa.marc.model.validation.ValidationError;
+import de.gwdg.metadataqa.marc.utils.pica.PicaSubjectManager;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
@@ -21,7 +22,6 @@
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
-import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
@@ -134,9 +134,21 @@ public void afterIteration(int numberOfprocessedRecords, long duration) {
printSchemaSubfieldsStatistics();
if (parameters.doCollectCollocations())
printClassificationsCollocation();
+copySchemaFileToOutputDir();
saveParameters("classifications.params.json", parameters, Map.of("numberOfprocessedRecords", numberOfprocessedRecords, "duration", duration));
}

+private void copySchemaFileToOutputDir() {
+if (parameters.isPica()) {
+File source = new File(PicaSubjectManager.getSchemaFile());
+try {
+FileUtils.copyFileToDirectory(source, new File(parameters.getOutputDir()));
+} catch (IOException e) {
+logger.warning(e.getLocalizedMessage());
+}
+}
+}
+
private void printClassificationsCollocation() {
Path path;
path = Paths.get(parameters.getOutputDir(), "classifications-collocations.csv");
8 changes: 4 additions & 4 deletions src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java
@@ -132,12 +132,12 @@ public void processRecord(BibliographicRecord bibliographicRecord, int recordNumber
);
map.put("record_sni", Arrays.asList(bibliographicRecord.asJson()));
SolrInputDocument solrDocument = client.createSolrDoc(bibliographicRecord.getId(), map);
-if (validationClient != null) {
+if (validationClient != null)
indexValidationResults(bibliographicRecord, solrDocument);
-}
-if (parameters.indexFieldCounts()) {
+
+if (parameters.indexFieldCounts())
indexFieldCounts(bibliographicRecord, solrDocument);
-}
+
client.index(solrDocument);

if (recordNumber % parameters.getCommitAt() == 0) {
@@ -6,6 +6,7 @@
import de.gwdg.metadataqa.marc.analysis.ThompsonTraillFields;
import de.gwdg.metadataqa.marc.dao.DataField;
import de.gwdg.metadataqa.marc.definition.bibliographic.SchemaType;
+import de.gwdg.metadataqa.marc.utils.pica.PicaSubjectManager;
import de.gwdg.metadataqa.marc.utils.pica.crosswalk.Crosswalk;
import de.gwdg.metadataqa.marc.utils.pica.crosswalk.PicaMarcCrosswalkReader;

@@ -139,9 +140,12 @@ private static void initializeAuthorityTags() {
skippableAuthoritySubfields.put("033H", Utils.listToMap(Arrays.asList("9", "V", "7", "3", "w")));
skippableAuthoritySubfields.put("033J", Utils.listToMap(Arrays.asList("9", "V", "7", "3", "w")));

+/*
List<String> subjectTags = Arrays.asList(
"045A", "045B", "045F", "045R", "045C", "045E", "045G"
);
+*/
+List<String> subjectTags = PicaSubjectManager.getTags();
subjectTagIndex = Utils.listToMap(subjectTags);
skippableSubjectSubfields = new HashMap<>();
skippableSubjectSubfields.put("022A", Utils.listToMap(Arrays.asList("9", "V", "7", "3", "w")));
@@ -64,6 +64,10 @@ private void initialize() {
schemes.put("DDC-Notation", "ddc");
schemes.put("Notation – Beziehung", "ddc");
schemes.put("This mixes multiple systems used in DNB before 2004", "dnbsgr");

schemes.put("LoC Subject Headings", "lcsh0");
schemes.put("Regensburger Verbundklassifikation (RVK)", "rvk");
schemes.put("Medical Subject Headings (MeSH)", "mesh");
}

public String resolve(String key) {
@@ -0,0 +1,51 @@
package de.gwdg.metadataqa.marc.utils.pica;

import de.gwdg.metadataqa.marc.analysis.FieldWithScheme;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

public class PicaSubjectManager {
private static List<FieldWithScheme> fields;
private static List<String> tags;
private static final String schemaFile = Paths.get("src/main/resources/pica/k10plus-subjects.tsv").toAbsolutePath().toString();

public static List<FieldWithScheme> readFieldsWithScheme() {
if (fields == null)
read();
return fields;
}

public static List<String> getTags() {
if (tags == null)
read();
return tags;
}

public static String getSchemaFile() {
return schemaFile;
}

private static void read() {
fields = new ArrayList<>();
tags = new ArrayList<>();
try (BufferedReader br = new BufferedReader(new FileReader(schemaFile))) {
String line;
while ((line = br.readLine()) != null) {
String[] parts = line.split("\\t");
String tag = parts[0];
if (!parts[1].equals(""))
tag += "/" + parts[1];
tags.add(tag);
fields.add(new FieldWithScheme(tag, parts[2]));
}
} catch (IOException e) {
e.getLocalizedMessage();
}
}

}
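A minimal usage sketch of the class above (the demo class is hypothetical, not part of the commit, and it assumes the working directory is the project root, since the TSV path is resolved relative to it). The expected values follow from k10plus-subjects.tsv below and from PicaSubjectManagerTest at the end of this diff.

import de.gwdg.metadataqa.marc.analysis.FieldWithScheme;
import de.gwdg.metadataqa.marc.utils.pica.PicaSubjectManager;

import java.util.List;

public class PicaSubjectManagerDemo {
  public static void main(String[] args) {
    // The TSV is parsed once and cached; both accessors share the result.
    List<FieldWithScheme> fields = PicaSubjectManager.readFieldsWithScheme();
    System.out.println(fields.size());                 // 40
    System.out.println(fields.get(0).getTag());        // 041A/00-99
    System.out.println(fields.get(0).getSchemaName()); // Schlagwortfolgen (DNB und Verbünde)

    // Plain tag list (one entry per TSV row), used for the subject tag index.
    List<String> tags = PicaSubjectManager.getTags();
    System.out.println(tags.size());                   // 40
  }
}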
40 changes: 40 additions & 0 deletions src/main/resources/pica/k10plus-subjects.tsv
@@ -0,0 +1,40 @@
041A 00-99 Schlagwortfolgen (DNB und Verbünde)
044A LoC Subject Headings
044C Medical Subject Headings (MeSH)
044H Erschließung von Musikalien nach Besetzung und Form/Gattung
044K 00-09 Schlagwortfolgen (GBV, SWB, K10plus)
044L 00-09 Einzelschlagwörter (Projekte)
044N Schlagwörter aus einem Thesaurus und freie Schlagwörter
044S Gattungsbegriffe bei Alten Drucken
044Z 00-99 Lokale Schlagwörter auf bibliografischer Ebene
045A LCC-Notation
045B 00 Allgemeine Systematik für Bibliotheken (ASB)
045B 01 Systematik der Stadtbibliothek Duisburg (SSD)
045B 02 Systematik für Bibliotheken (SfB)
045B 03 Klassifikation für Allgemeinbibliotheken (KAB)
045B 04 Systematiken der ekz
045B 05 Gattungsbegriffe (DNB)
045C Klassifikation der National Library of Medicine (NLM)
045D 00-29 STW-Schlagwörter
045D 30-39 STW-Schlagwörter - automatisierte verbale Sacherschließung
045D 40-48 STW-Schlagwörter - Platzhalter
045D 49 ZBW-Schlagwörter - Veröffentlichungsart
045D 50 Vorläufige Schlagwörter (STW)
045D 60 FIV-Schlagwörter (Themen)
045D 70 FIV-Schlagwörter (Aspekte)
045E Sachgruppen der Deutschen Nationalbibliografie bis 2003
045F DDC-Notation
045G Sachgruppen der Deutschen Nationalbibliografie ab 2004
045H 00-99 DDC-Notation: Vollständige Notation
045M 00-99 Lokale Notationen auf bibliografischer Ebene
045N FIV-Regionalklassifikation
045N 01 FIV-Sachklassifikation
045N 02 Sonstige Notation des FIV
045Q 01 Basisklassifikation
045R Regensburger Verbundklassifikation (RVK)
045S Deutsche Bibliotheksstatistik (DBS)
045T Nicht mehr gültige Notationen der Regensburger Verbundklassifikation (RVK)
045V SSG-Nummer/FID-Kennzeichen
045W SSG-Angabe für thematische OLC-Ausschnitte
045X Notation eines Klassifikationssystems
045Y SSG-Angabe für Fachkataloge
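The second column above holds an occurrence or occurrence range; PicaSubjectManager appends it to the tag with a slash (for example 041A/00-99 or 045B/00), which is also why the ASB entry in vocabularies.json below now points at 045B/00 instead of 045B.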
11 changes: 11 additions & 0 deletions src/main/resources/pica/update-avram-k10plus-subjects.sh
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
#--------------------------------
# retrieve subjects from K10plus
#--------------------------------

curl -s https://format.k10plus.de/avram.pl?profile=k10plus-title \
| jq -r '.fields[] | select(.tag | match("04[45]|041A")) | [.tag, .occurrence, .label] | @tsv' -r \
> k10plus-subjects.tsv
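
For reference: the jq filter keeps the Avram field definitions whose tag contains 044, 045, or 041A, and emits tag, occurrence, and label as tab-separated columns, the exact three-column layout that PicaSubjectManager expects.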


2 changes: 1 addition & 1 deletion src/main/resources/pica/vocabularies.json
@@ -11,7 +11,7 @@
},
{
"ID": "^a(.+)",
"PICA": "045B",
"PICA": "045B/00",
"SRC": "^A(.+)",
"VOC": "asb",
"namespace": "http://uri.gbv.de/terminology/asb/",
@@ -228,48 +228,90 @@ public void pica() throws IOException {
output = new File(outputDir, "classifications-by-schema.csv");
assertTrue(output.exists());
actual = Files.readString(output.toPath());
+/*
assertEquals(
"id,field,location,scheme,abbreviation,abbreviation4solr,recordcount,instancecount,type\n" +
"4,045A,$a,\"Library of Congress Classification\",\"lcc\",lcc,1,1,CLASSIFICATION_SCHEME\n" +
"1,045E,$a,\"This mixes multiple systems used in DNB before 2004\",\"dnbsgr\",dnbsgr,2,2,UNKNOWN\n" +
"3,045F,$a,\"Dewey-Dezimalklassifikation\",\"ddc\",ddc,1,1,CLASSIFICATION_SCHEME\n" +
"2,045R,$a,\"Regensburger Verbundklassifikation\",\"rvk\",rvk,1,1,CLASSIFICATION_SCHEME\n",
actual);
+*/
+assertEquals(
+"id,field,location,scheme,abbreviation,abbreviation4solr,recordcount,instancecount,type\n" +
+"4,044A,$a,\"LoC Subject Headings\",\"lcsh0\",lcsh0,1,2,SUBJECT_HEADING\n" +
+"5,045A,$a,\"LCC-Notation\",\"lcc\",lcc,1,1,CLASSIFICATION_SCHEME\n" +
+"1,045E,$a,\"Sachgruppen der Deutschen Nationalbibliografie bis 2003\",\"sdnb\",sdnb,2,2,UNKNOWN\n" +
+"6,045F,$a,\"DDC-Notation\",\"ddc\",ddc,1,1,CLASSIFICATION_SCHEME\n" +
+"2,045R,$a,\"Regensburger Verbundklassifikation (RVK)\",\"rvk\",rvk,1,1,CLASSIFICATION_SCHEME\n" +
+"3,045V,$a,\"SSG-Nummer/FID-Kennzeichen\",\"045V\",045v,1,1,UNKNOWN\n",
+actual);

output = new File(outputDir, "classifications-by-schema-subfields.csv");
assertTrue(output.exists());
actual = Files.readString(output.toPath());
+/*
assertEquals(
"id,subfields,count\n" +
"4,a,1\n" +
"1,a+,2\n" +
"3,a+,1\n" +
"2,V;a;j;k+;3;7;9,1\n", actual);
+*/
+
+assertEquals(
+"id,subfields,count\n" +
+"4,a,1\n" +
+"4,a+,1\n" +
+"5,a,1\n" +
+"1,a+,2\n" +
+"6,a+,1\n" +
+"2,V;a;j;k+;3;7;9,1\n" +
+"3,a,1\n", actual);

output = new File(outputDir, "classifications-collocations.csv");
assertTrue(output.exists());
actual = Files.readString(output.toPath());
+/*
assertEquals("abbreviations,recordcount,percent\n" +
"dnbsgr;rvk,1,50.00%\n" +
"ddc;dnbsgr;lcc,1,50.00%\n", actual);
+*/
+assertEquals("abbreviations,recordcount,percent\n" +
+"ddc;lcc;lcsh0;sdnb,1,50.00%\n" +
+"045V;rvk;sdnb,1,50.00%\n", actual);

output = new File(outputDir, "classifications-histogram.csv");
assertTrue(output.exists());
actual = Files.readString(output.toPath());
+/*
assertEquals(
"count,frequency\n" +
"0,4\n" +
"2,1\n" +
"3,1\n", actual);
+*/
+assertEquals(
+"count,frequency\n" +
+"0,4\n" +
+"3,1\n" +
+"5,1\n", actual);

output = new File(outputDir, "classifications-frequency-examples.csv");
assertTrue(output.exists());
actual = Files.readString(output.toPath());
+/*
assertEquals(
"count,id\n" +
"0,010000011\n" +
"2,010000054\n" +
"3,010000070\n", actual);
+*/
+assertEquals(
+"count,id\n" +
+"0,010000011\n" +
+"3,010000054\n" +
+"5,010000070\n", actual);

clearOutput(outputDir, outputFiles);
}
@@ -0,0 +1,20 @@
package de.gwdg.metadataqa.marc.utils.pica;

import de.gwdg.metadataqa.marc.analysis.FieldWithScheme;
import org.junit.Test;

import java.util.List;

import static org.junit.Assert.*;

public class PicaSubjectManagerTest {

@Test
public void readFieldsWithScheme() {
List<FieldWithScheme> fields = PicaSubjectManager.readFieldsWithScheme();
assertEquals(40, fields.size());
assertEquals(FieldWithScheme.class, fields.get(0).getClass());
assertEquals("041A/00-99", fields.get(0).getTag());
assertEquals("Schlagwortfolgen (DNB und Verbünde)", fields.get(0).getSchemaName());
}
}
