From fb36749b25c981ae043cc37f808bbfe8d0e961fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Kir=C3=A1ly?= Date: Mon, 6 Nov 2023 23:58:49 +0100 Subject: [PATCH] PICA: Extend classification/subject headings schemes from config file #190 --- .../marc/analysis/ClassificationAnalyzer.java | 27 +++++++--- .../marc/cli/ClassificationAnalysis.java | 14 ++++- .../gwdg/metadataqa/marc/cli/MarcToSolr.java | 8 +-- .../marc/dao/record/PicaRecord.java | 4 ++ .../subject/ClassificationSchemes.java | 4 ++ .../marc/utils/pica/PicaSubjectManager.java | 51 +++++++++++++++++++ src/main/resources/pica/k10plus-subjects.tsv | 40 +++++++++++++++ .../pica/update-avram-k10plus-subjects.sh | 11 ++++ src/main/resources/pica/vocabularies.json | 2 +- .../marc/cli/ClassificationAnalysisTest.java | 42 +++++++++++++++ .../utils/pica/PicaSubjectManagerTest.java | 20 ++++++++ 11 files changed, 211 insertions(+), 12 deletions(-) create mode 100644 src/main/java/de/gwdg/metadataqa/marc/utils/pica/PicaSubjectManager.java create mode 100644 src/main/resources/pica/k10plus-subjects.tsv create mode 100755 src/main/resources/pica/update-avram-k10plus-subjects.sh create mode 100644 src/test/java/de/gwdg/metadataqa/marc/utils/pica/PicaSubjectManagerTest.java diff --git a/src/main/java/de/gwdg/metadataqa/marc/analysis/ClassificationAnalyzer.java b/src/main/java/de/gwdg/metadataqa/marc/analysis/ClassificationAnalyzer.java index c4e47829f..a9cc2c749 100644 --- a/src/main/java/de/gwdg/metadataqa/marc/analysis/ClassificationAnalyzer.java +++ b/src/main/java/de/gwdg/metadataqa/marc/analysis/ClassificationAnalyzer.java @@ -8,6 +8,7 @@ import de.gwdg.metadataqa.marc.cli.utils.Schema; import de.gwdg.metadataqa.marc.definition.bibliographic.SchemaType; import de.gwdg.metadataqa.marc.definition.general.indexer.subject.ClassificationSchemes; +import de.gwdg.metadataqa.marc.utils.pica.PicaSubjectManager; import de.gwdg.metadataqa.marc.utils.pica.PicaVocabularyManager; import 
de.gwdg.metadataqa.marc.utils.pica.VocabularyEntry; @@ -41,6 +42,7 @@ public class ClassificationAnalyzer { private ClassificationParameters parameters = null; private BibliographicRecord marcRecord; private List schemasInRecord; + private static List picaFieldsWithScheme = PicaSubjectManager.readFieldsWithScheme(); private static final List fieldsWithIndicator1AndSubfield2 = Arrays.asList( "052", // Geographic Classification @@ -149,7 +151,7 @@ public int process() { total = processFieldsWithoutSource(total); total = processFieldsWithScheme(total, MARC21_FIELD_WITH_SCHEMES); } else if (marcRecord.getSchemaType().equals(SchemaType.PICA)) { - total = processFieldsWithSchemePica(total, PICA_FIELDS_WITH_SCHEME); + total = processFieldsWithSchemePica(total, picaFieldsWithScheme); } increaseCounters(total); @@ -180,12 +182,25 @@ private int processFieldsWithScheme(int total, List fieldsWithS private int processFieldsWithSchemePica(int total, List fieldsWithScheme) { int count = total; - for (VocabularyEntry entry : manager.getAll()) { - if (!marcRecord.hasDatafield(entry.getPica())) + // for (VocabularyEntry entry : manager.getAll()) { + for (FieldWithScheme entry : fieldsWithScheme) { + /* + String tag = entry.getPica(); + String schema = entry.getLabel(); + String voc = entry.getVoc(); + */ + String tag = entry.getTag(); + String schema = entry.getSchemaName(); + String voc = tag; + try { + voc = classificationSchemes.resolve(schema); + } catch (IllegalArgumentException e) { + + } + if (!marcRecord.hasDatafield(tag)) continue; - String schema = entry.getLabel(); - List fields = marcRecord.getDatafield(entry.getPica()); + List fields = marcRecord.getDatafield(tag); List schemas = new ArrayList<>(); for (DataField field : fields) { String firstSubfield = null; @@ -201,7 +216,7 @@ private int processFieldsWithSchemePica(int total, List fieldsW } } if (firstSubfield != null) { - var currentSchema = new Schema(field.getTagWithOccurrence(), firstSubfield, entry.getVoc(), 
schema); + var currentSchema = new Schema(field.getTagWithOccurrence(), firstSubfield, voc, schema); schemas.add(currentSchema); updateSchemaSubfieldStatistics(field, currentSchema); count++; diff --git a/src/main/java/de/gwdg/metadataqa/marc/cli/ClassificationAnalysis.java b/src/main/java/de/gwdg/metadataqa/marc/cli/ClassificationAnalysis.java index 7e0dd9de0..1ddf94570 100644 --- a/src/main/java/de/gwdg/metadataqa/marc/cli/ClassificationAnalysis.java +++ b/src/main/java/de/gwdg/metadataqa/marc/cli/ClassificationAnalysis.java @@ -11,6 +11,7 @@ import de.gwdg.metadataqa.marc.cli.utils.RecordIterator; import de.gwdg.metadataqa.marc.cli.utils.Schema; import de.gwdg.metadataqa.marc.model.validation.ValidationError; +import de.gwdg.metadataqa.marc.utils.pica.PicaSubjectManager; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.io.FileUtils; @@ -21,7 +22,6 @@ import java.io.File; import java.io.IOException; import java.io.Serializable; -import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -134,9 +134,27 @@ public void afterIteration(int numberOfprocessedRecords, long duration) { printSchemaSubfieldsStatistics(); if (parameters.doCollectCollocations()) printClassificationsCollocation(); + copySchemaFileToOutputDir(); saveParameters("classifications.params.json", parameters, Map.of("numberOfprocessedRecords", numberOfprocessedRecords, "duration", duration)); }
+  /**
+   * Copies the file returned by PicaSubjectManager.getSchemaFile() into the output directory.
+   * Does nothing unless the parameters say the input is PICA; a failed copy is only logged as a warning.
+   */
+  private void copySchemaFileToOutputDir() {
+    if (!parameters.isPica())
+      return;
+
+    File schemaFile = new File(PicaSubjectManager.getSchemaFile());
+    File targetDir = new File(parameters.getOutputDir());
+    try {
+      FileUtils.copyFileToDirectory(schemaFile, targetDir);
+    } catch (IOException e) {
+      logger.warning(e.getLocalizedMessage());
+    }
+  }
+
private void printClassificationsCollocation() { Path path; path = Paths.get(parameters.getOutputDir(), "classifications-collocations.csv"); diff --git 
a/src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java b/src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java index 37a1d381c..9c8facbe3 100644 --- a/src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java +++ b/src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java @@ -132,12 +132,12 @@ public void processRecord(BibliographicRecord bibliographicRecord, int recordNum ); map.put("record_sni", Arrays.asList(bibliographicRecord.asJson())); SolrInputDocument solrDocument = client.createSolrDoc(bibliographicRecord.getId(), map); - if (validationClient != null) { + if (validationClient != null) indexValidationResults(bibliographicRecord, solrDocument); - } - if (parameters.indexFieldCounts()) { + + if (parameters.indexFieldCounts()) indexFieldCounts(bibliographicRecord, solrDocument); - } + client.index(solrDocument); if (recordNumber % parameters.getCommitAt() == 0) { diff --git a/src/main/java/de/gwdg/metadataqa/marc/dao/record/PicaRecord.java b/src/main/java/de/gwdg/metadataqa/marc/dao/record/PicaRecord.java index 2d3fd4ddb..ca88ae830 100644 --- a/src/main/java/de/gwdg/metadataqa/marc/dao/record/PicaRecord.java +++ b/src/main/java/de/gwdg/metadataqa/marc/dao/record/PicaRecord.java @@ -6,6 +6,7 @@ import de.gwdg.metadataqa.marc.analysis.ThompsonTraillFields; import de.gwdg.metadataqa.marc.dao.DataField; import de.gwdg.metadataqa.marc.definition.bibliographic.SchemaType; +import de.gwdg.metadataqa.marc.utils.pica.PicaSubjectManager; import de.gwdg.metadataqa.marc.utils.pica.crosswalk.Crosswalk; import de.gwdg.metadataqa.marc.utils.pica.crosswalk.PicaMarcCrosswalkReader; @@ -139,9 +140,12 @@ private static void initializeAuthorityTags() { skippableAuthoritySubfields.put("033H", Utils.listToMap(Arrays.asList("9", "V", "7", "3", "w"))); skippableAuthoritySubfields.put("033J", Utils.listToMap(Arrays.asList("9", "V", "7", "3", "w"))); + /* List subjectTags = Arrays.asList( "045A", "045B", "045F", "045R", "045C", "045E", "045G" ); + */ + List subjectTags = 
PicaSubjectManager.getTags(); subjectTagIndex = Utils.listToMap(subjectTags); skippableSubjectSubfields = new HashMap<>(); skippableSubjectSubfields.put("022A", Utils.listToMap(Arrays.asList("9", "V", "7", "3", "w"))); diff --git a/src/main/java/de/gwdg/metadataqa/marc/definition/general/indexer/subject/ClassificationSchemes.java b/src/main/java/de/gwdg/metadataqa/marc/definition/general/indexer/subject/ClassificationSchemes.java index 184493abc..8538ca260 100644 --- a/src/main/java/de/gwdg/metadataqa/marc/definition/general/indexer/subject/ClassificationSchemes.java +++ b/src/main/java/de/gwdg/metadataqa/marc/definition/general/indexer/subject/ClassificationSchemes.java @@ -64,6 +64,10 @@ private void initialize() { schemes.put("DDC-Notation", "ddc"); schemes.put("Notation – Beziehung", "ddc"); schemes.put("This mixes multiple systems used in DNB before 2004", "dnbsgr"); + + schemes.put("LoC Subject Headings", "lcsh0"); + schemes.put("Regensburger Verbundklassifikation (RVK)", "rvk"); + schemes.put("Medical Subject Headings (MeSH)", "mesh"); } public String resolve(String key) { diff --git a/src/main/java/de/gwdg/metadataqa/marc/utils/pica/PicaSubjectManager.java b/src/main/java/de/gwdg/metadataqa/marc/utils/pica/PicaSubjectManager.java new file mode 100644 index 000000000..568207af2 --- /dev/null +++ b/src/main/java/de/gwdg/metadataqa/marc/utils/pica/PicaSubjectManager.java @@ -0,0 +1,82 @@
+package de.gwdg.metadataqa.marc.utils.pica;
+
+import de.gwdg.metadataqa.marc.analysis.FieldWithScheme;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * Reads the PICA subject heading and classification fields from a tab separated file
+ * (columns: tag, occurrence, label) and caches them in static lists.
+ */
+public class PicaSubjectManager {
+  private static final Logger logger = Logger.getLogger(PicaSubjectManager.class.getCanonicalName());
+  private static final String schemaFile = Paths.get("src/main/resources/pica/k10plus-subjects.tsv").toAbsolutePath().toString();
+  private static List<FieldWithScheme> fields;
+  private static List<String> tags;
+
+  /**
+   * Returns the fields listed in the schema file; the file is read on the first call.
+   *
+   * @return one entry per well-formed line of the schema file, or an empty list if the file could not be read
+   */
+  public static List<FieldWithScheme> readFieldsWithScheme() {
+    if (fields == null)
+      read();
+    return fields;
+  }
+
+  /**
+   * Returns the tags listed in the schema file; the file is read on the first call.
+   *
+   * @return the tags in file order, each followed by "/" and the occurrence if the line has one
+   */
+  public static List<String> getTags() {
+    if (tags == null)
+      read();
+    return tags;
+  }
+
+  /**
+   * @return the absolute path of the schema file
+   */
+  public static String getSchemaFile() {
+    return schemaFile;
+  }
+
+  /**
+   * Fills both lists from the schema file. Malformed lines are skipped and an unreadable
+   * file leaves the lists empty; both cases are logged instead of being silently ignored.
+   */
+  private static void read() {
+    fields = new ArrayList<>();
+    tags = new ArrayList<>();
+    // the labels contain non-ASCII characters, so do not rely on the platform default charset
+    try (BufferedReader br = new BufferedReader(new FileReader(schemaFile, StandardCharsets.UTF_8))) {
+      String line;
+      while ((line = br.readLine()) != null) {
+        // limit -1 keeps trailing empty columns, which split() would drop otherwise
+        String[] parts = line.split("\\t", -1);
+        if (parts.length < 3) {
+          if (!line.isBlank())
+            logger.log(Level.WARNING, "Skipping malformed line in {0}: {1}", new Object[]{schemaFile, line});
+          continue;
+        }
+        String tag = parts[0];
+        if (!parts[1].isEmpty())
+          tag += "/" + parts[1];
+        tags.add(tag);
+        fields.add(new FieldWithScheme(tag, parts[2]));
+      }
+    } catch (IOException e) {
+      logger.log(Level.WARNING, "Unable to read the PICA subject schema file " + schemaFile, e);
+    }
+  }
+}
diff --git a/src/main/resources/pica/k10plus-subjects.tsv b/src/main/resources/pica/k10plus-subjects.tsv new file mode 100644 index 000000000..2ee2bb7ff --- /dev/null +++ b/src/main/resources/pica/k10plus-subjects.tsv @@ -0,0 +1,40 @@ +041A 00-99 Schlagwortfolgen (DNB und Verbünde) +044A LoC Subject Headings +044C Medical Subject Headings (MeSH) +044H Erschließung von Musikalien nach Besetzung und Form/Gattung +044K 00-09 Schlagwortfolgen (GBV, SWB, K10plus) +044L 00-09 Einzelschlagwörter (Projekte) +044N Schlagwörter aus einem Thesaurus und freie Schlagwörter +044S Gattungsbegriffe bei Alten Drucken +044Z 00-99 Lokale Schlagwörter auf bibliografischer Ebene +045A LCC-Notation +045B 00 Allgemeine Systematik für Bibliotheken (ASB) +045B 01 Systematik der Stadtbibliothek Duisburg (SSD) +045B 02 Systematik für Bibliotheken (SfB) +045B 03 Klassifikation für Allgemeinbibliotheken (KAB) +045B 04 Systematiken der ekz +045B 05 Gattungsbegriffe (DNB) +045C Klassifikation der National Library of Medicine (NLM) +045D 00-29 STW-Schlagwörter +045D 30-39 STW-Schlagwörter - automatisierte verbale Sacherschließung +045D 40-48 STW-Schlagwörter - Platzhalter +045D 49 ZBW-Schlagwörter - Veröffentlichungsart +045D 50 Vorläufige Schlagwörter (STW) +045D 60 FIV-Schlagwörter (Themen) +045D 70 FIV-Schlagwörter (Aspekte) +045E Sachgruppen der 
Deutschen Nationalbibliografie bis 2003 +045F DDC-Notation +045G Sachgruppen der Deutschen Nationalbibliografie ab 2004 +045H 00-99 DDC-Notation: Vollständige Notation +045M 00-99 Lokale Notationen auf bibliografischer Ebene +045N FIV-Regionalklassifikation +045N 01 FIV-Sachklassifikation +045N 02 Sonstige Notation des FIV +045Q 01 Basisklassifikation +045R Regensburger Verbundklassifikation (RVK) +045S Deutsche Bibliotheksstatistik (DBS) +045T Nicht mehr gültige Notationen der Regensburger Verbundklassifikation (RVK) +045V SSG-Nummer/FID-Kennzeichen +045W SSG-Angabe für thematische OLC-Ausschnitte +045X Notation eines Klassifikationssystems +045Y SSG-Angabe für Fachkataloge diff --git a/src/main/resources/pica/update-avram-k10plus-subjects.sh b/src/main/resources/pica/update-avram-k10plus-subjects.sh new file mode 100755 index 000000000..1c6e904a8 --- /dev/null +++ b/src/main/resources/pica/update-avram-k10plus-subjects.sh @@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+#
+#--------------------------------
+# retrieve subjects from K10plus
+#--------------------------------
+#
+# Downloads the K10plus title profile and writes the tag, occurrence and label of the
+# fields whose tag starts with 041A, 044 or 045 to k10plus-subjects.tsv in the current directory.
+
+# stop at the first failing command, also inside the pipeline
+set -euo pipefail
+
+URL='https://format.k10plus.de/avram.pl?profile=k10plus-title'
+OUTPUT=k10plus-subjects.tsv
+
+# remove the partial download if anything fails (after a successful mv there is nothing left to remove)
+trap 'rm -f "$OUTPUT.tmp"' EXIT
+
+# The URL is quoted because an unquoted '?' is a glob character for the shell.
+# --fail makes curl exit with an error on HTTP failures instead of feeding an error page to jq.
+curl -sS --fail "$URL" \
+  | jq -r '.fields[] | select(.tag | test("^(04[45]|041A)")) | [.tag, .occurrence, .label] | @tsv' \
+  > "$OUTPUT.tmp"
+
+# replace the existing file only after a complete, successful download
+mv "$OUTPUT.tmp" "$OUTPUT"
diff --git a/src/main/resources/pica/vocabularies.json b/src/main/resources/pica/vocabularies.json index aeb101a7b..83a30a0dc 100644 --- a/src/main/resources/pica/vocabularies.json +++ b/src/main/resources/pica/vocabularies.json @@ -11,7 +11,7 @@ }, { "ID": "^a(.+)", - "PICA": "045B", + "PICA": "045B/00", "SRC": "^A(.+)", "VOC": "asb", "namespace": "http://uri.gbv.de/terminology/asb/", diff --git a/src/test/java/de/gwdg/metadataqa/marc/cli/ClassificationAnalysisTest.java b/src/test/java/de/gwdg/metadataqa/marc/cli/ClassificationAnalysisTest.java index 3bfd67f38..cb0fa28e4 100644 --- a/src/test/java/de/gwdg/metadataqa/marc/cli/ClassificationAnalysisTest.java +++ 
b/src/test/java/de/gwdg/metadataqa/marc/cli/ClassificationAnalysisTest.java @@ -228,6 +228,7 @@ public void pica() throws IOException { output = new File(outputDir, "classifications-by-schema.csv"); assertTrue(output.exists()); actual = Files.readString(output.toPath()); + /* assertEquals( "id,field,location,scheme,abbreviation,abbreviation4solr,recordcount,instancecount,type\n" + "4,045A,$a,\"Library of Congress Classification\",\"lcc\",lcc,1,1,CLASSIFICATION_SCHEME\n" + @@ -235,41 +236,82 @@ public void pica() throws IOException { "3,045F,$a,\"Dewey-Dezimalklassifikation\",\"ddc\",ddc,1,1,CLASSIFICATION_SCHEME\n" + "2,045R,$a,\"Regensburger Verbundklassifikation\",\"rvk\",rvk,1,1,CLASSIFICATION_SCHEME\n", actual); + */ + assertEquals( + "id,field,location,scheme,abbreviation,abbreviation4solr,recordcount,instancecount,type\n" + + "4,044A,$a,\"LoC Subject Headings\",\"lcsh0\",lcsh0,1,2,SUBJECT_HEADING\n" + + "5,045A,$a,\"LCC-Notation\",\"lcc\",lcc,1,1,CLASSIFICATION_SCHEME\n" + + "1,045E,$a,\"Sachgruppen der Deutschen Nationalbibliografie bis 2003\",\"sdnb\",sdnb,2,2,UNKNOWN\n" + + "6,045F,$a,\"DDC-Notation\",\"ddc\",ddc,1,1,CLASSIFICATION_SCHEME\n" + + "2,045R,$a,\"Regensburger Verbundklassifikation (RVK)\",\"rvk\",rvk,1,1,CLASSIFICATION_SCHEME\n" + + "3,045V,$a,\"SSG-Nummer/FID-Kennzeichen\",\"045V\",045v,1,1,UNKNOWN\n", + actual); output = new File(outputDir, "classifications-by-schema-subfields.csv"); assertTrue(output.exists()); actual = Files.readString(output.toPath()); + /* assertEquals( "id,subfields,count\n" + "4,a,1\n" + "1,a+,2\n" + "3,a+,1\n" + "2,V;a;j;k+;3;7;9,1\n", actual); + */ + + assertEquals( + "id,subfields,count\n" + + "4,a,1\n" + + "4,a+,1\n" + + "5,a,1\n" + + "1,a+,2\n" + + "6,a+,1\n" + + "2,V;a;j;k+;3;7;9,1\n" + + "3,a,1\n", actual); output = new File(outputDir, "classifications-collocations.csv"); assertTrue(output.exists()); actual = Files.readString(output.toPath()); + /* assertEquals("abbreviations,recordcount,percent\n" + 
"dnbsgr;rvk,1,50.00%\n" + "ddc;dnbsgr;lcc,1,50.00%\n", actual); + */ + assertEquals("abbreviations,recordcount,percent\n" + + "ddc;lcc;lcsh0;sdnb,1,50.00%\n" + + "045V;rvk;sdnb,1,50.00%\n", actual); output = new File(outputDir, "classifications-histogram.csv"); assertTrue(output.exists()); actual = Files.readString(output.toPath()); + /* assertEquals( "count,frequency\n" + "0,4\n" + "2,1\n" + "3,1\n", actual); + */ + assertEquals( + "count,frequency\n" + + "0,4\n" + + "3,1\n" + + "5,1\n", actual); output = new File(outputDir, "classifications-frequency-examples.csv"); assertTrue(output.exists()); actual = Files.readString(output.toPath()); + /* assertEquals( "count,id\n" + "0,010000011\n" + "2,010000054\n" + "3,010000070\n", actual); + */ + assertEquals( + "count,id\n" + + "0,010000011\n" + + "3,010000054\n" + + "5,010000070\n", actual); clearOutput(outputDir, outputFiles); } diff --git a/src/test/java/de/gwdg/metadataqa/marc/utils/pica/PicaSubjectManagerTest.java b/src/test/java/de/gwdg/metadataqa/marc/utils/pica/PicaSubjectManagerTest.java new file mode 100644 index 000000000..1352b5a8e --- /dev/null +++ b/src/test/java/de/gwdg/metadataqa/marc/utils/pica/PicaSubjectManagerTest.java @@ -0,0 +1,27 @@
+package de.gwdg.metadataqa.marc.utils.pica;
+
+import de.gwdg.metadataqa.marc.analysis.FieldWithScheme;
+import org.junit.Test;
+
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Checks the list of fields that PicaSubjectManager reads from its schema file.
+ */
+public class PicaSubjectManagerTest {
+
+  @Test
+  public void readFieldsWithScheme() {
+    // typed list: with a raw List the getTag()/getSchemaName() calls below do not compile
+    List<FieldWithScheme> fields = PicaSubjectManager.readFieldsWithScheme();
+    assertEquals(40, fields.size());
+
+    // the expected tag of the first entry carries an occurrence range after "/"
+    FieldWithScheme first = fields.get(0);
+    assertEquals(FieldWithScheme.class, first.getClass());
+    assertEquals("041A/00-99", first.getTag());
+    assertEquals("Schlagwortfolgen (DNB und Verbünde)", first.getSchemaName());
+  }
+}