PICA: Extend classification/subject headings schemes from config file #…
pkiraly committed Nov 6, 2023
1 parent 0b6e8d9 commit fb36749
Showing 11 changed files with 211 additions and 12 deletions.
@@ -8,6 +8,7 @@
import de.gwdg.metadataqa.marc.cli.utils.Schema;
import de.gwdg.metadataqa.marc.definition.bibliographic.SchemaType;
import de.gwdg.metadataqa.marc.definition.general.indexer.subject.ClassificationSchemes;
+import de.gwdg.metadataqa.marc.utils.pica.PicaSubjectManager;
import de.gwdg.metadataqa.marc.utils.pica.PicaVocabularyManager;
import de.gwdg.metadataqa.marc.utils.pica.VocabularyEntry;

@@ -41,6 +42,7 @@ public class ClassificationAnalyzer {
private ClassificationParameters parameters = null;
private BibliographicRecord marcRecord;
private List<Schema> schemasInRecord;
+private static List<FieldWithScheme> picaFieldsWithScheme = PicaSubjectManager.readFieldsWithScheme();

private static final List<String> fieldsWithIndicator1AndSubfield2 = Arrays.asList(
"052", // Geographic Classification
@@ -149,7 +151,7 @@ public int process() {
total = processFieldsWithoutSource(total);
total = processFieldsWithScheme(total, MARC21_FIELD_WITH_SCHEMES);
} else if (marcRecord.getSchemaType().equals(SchemaType.PICA)) {
-total = processFieldsWithSchemePica(total, PICA_FIELDS_WITH_SCHEME);
+total = processFieldsWithSchemePica(total, picaFieldsWithScheme);
}

increaseCounters(total);
@@ -180,12 +182,25 @@ private int processFieldsWithScheme(int total, List<FieldWithScheme> fieldsWithScheme) {

private int processFieldsWithSchemePica(int total, List<FieldWithScheme> fieldsWithScheme) {
int count = total;
-for (VocabularyEntry entry : manager.getAll()) {
-if (!marcRecord.hasDatafield(entry.getPica()))
+// for (VocabularyEntry entry : manager.getAll()) {
+for (FieldWithScheme entry : fieldsWithScheme) {
+/*
+String tag = entry.getPica();
+String schema = entry.getLabel();
+String voc = entry.getVoc();
+*/
+String tag = entry.getTag();
+String schema = entry.getSchemaName();
+String voc = tag;
+try {
+voc = classificationSchemes.resolve(schema);
+} catch (IllegalArgumentException e) {
+
+}
+if (!marcRecord.hasDatafield(tag))
continue;

-String schema = entry.getLabel();
-List<DataField> fields = marcRecord.getDatafield(entry.getPica());
+List<DataField> fields = marcRecord.getDatafield(tag);
List<Schema> schemas = new ArrayList<>();
for (DataField field : fields) {
String firstSubfield = null;
@@ -201,7 +216,7 @@ private int processFieldsWithSchemePica(int total, List<FieldWithScheme> fieldsWithScheme) {
}
}
if (firstSubfield != null) {
-var currentSchema = new Schema(field.getTagWithOccurrence(), firstSubfield, entry.getVoc(), schema);
+var currentSchema = new Schema(field.getTagWithOccurrence(), firstSubfield, voc, schema);
schemas.add(currentSchema);
updateSchemaSubfieldStatistics(field, currentSchema);
count++;
@@ -11,6 +11,7 @@
import de.gwdg.metadataqa.marc.cli.utils.RecordIterator;
import de.gwdg.metadataqa.marc.cli.utils.Schema;
import de.gwdg.metadataqa.marc.model.validation.ValidationError;
+import de.gwdg.metadataqa.marc.utils.pica.PicaSubjectManager;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
@@ -21,7 +22,6 @@
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
-import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
@@ -134,9 +134,21 @@ public void afterIteration(int numberOfprocessedRecords, long duration) {
printSchemaSubfieldsStatistics();
if (parameters.doCollectCollocations())
printClassificationsCollocation();
+copySchemaFileToOutputDir();
saveParameters("classifications.params.json", parameters, Map.of("numberOfprocessedRecords", numberOfprocessedRecords, "duration", duration));
}

+private void copySchemaFileToOutputDir() {
+if (parameters.isPica()) {
+File source = new File(PicaSubjectManager.getSchemaFile());
+try {
+FileUtils.copyFileToDirectory(source, new File(parameters.getOutputDir()));
+} catch (IOException e) {
+logger.warning(e.getLocalizedMessage());
+}
+}
+}
+
private void printClassificationsCollocation() {
Path path;
path = Paths.get(parameters.getOutputDir(), "classifications-collocations.csv");
8 changes: 4 additions & 4 deletions src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java
@@ -132,12 +132,12 @@ public void processRecord(BibliographicRecord bibliographicRecord, int recordNumber
);
map.put("record_sni", Arrays.asList(bibliographicRecord.asJson()));
SolrInputDocument solrDocument = client.createSolrDoc(bibliographicRecord.getId(), map);
-if (validationClient != null) {
+if (validationClient != null)
indexValidationResults(bibliographicRecord, solrDocument);
-}
-if (parameters.indexFieldCounts()) {
+
+if (parameters.indexFieldCounts())
indexFieldCounts(bibliographicRecord, solrDocument);
-}
+
client.index(solrDocument);

if (recordNumber % parameters.getCommitAt() == 0) {
@@ -6,6 +6,7 @@
import de.gwdg.metadataqa.marc.analysis.ThompsonTraillFields;
import de.gwdg.metadataqa.marc.dao.DataField;
import de.gwdg.metadataqa.marc.definition.bibliographic.SchemaType;
+import de.gwdg.metadataqa.marc.utils.pica.PicaSubjectManager;
import de.gwdg.metadataqa.marc.utils.pica.crosswalk.Crosswalk;
import de.gwdg.metadataqa.marc.utils.pica.crosswalk.PicaMarcCrosswalkReader;

@@ -139,9 +140,12 @@ private static void initializeAuthorityTags() {
skippableAuthoritySubfields.put("033H", Utils.listToMap(Arrays.asList("9", "V", "7", "3", "w")));
skippableAuthoritySubfields.put("033J", Utils.listToMap(Arrays.asList("9", "V", "7", "3", "w")));

+/*
List<String> subjectTags = Arrays.asList(
"045A", "045B", "045F", "045R", "045C", "045E", "045G"
);
+*/
+List<String> subjectTags = PicaSubjectManager.getTags();
subjectTagIndex = Utils.listToMap(subjectTags);
skippableSubjectSubfields = new HashMap<>();
skippableSubjectSubfields.put("022A", Utils.listToMap(Arrays.asList("9", "V", "7", "3", "w")));
@@ -64,6 +64,10 @@ private void initialize() {
schemes.put("DDC-Notation", "ddc");
schemes.put("Notation – Beziehung", "ddc");
schemes.put("This mixes multiple systems used in DNB before 2004", "dnbsgr");

schemes.put("LoC Subject Headings", "lcsh0");
schemes.put("Regensburger Verbundklassifikation (RVK)", "rvk");
schemes.put("Medical Subject Headings (MeSH)", "mesh");
}

public String resolve(String key) {
@@ -0,0 +1,51 @@
package de.gwdg.metadataqa.marc.utils.pica;

import de.gwdg.metadataqa.marc.analysis.FieldWithScheme;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

public class PicaSubjectManager {
private static List<FieldWithScheme> fields;
private static List<String> tags;
private static final String schemaFile = Paths.get("src/main/resources/pica/k10plus-subjects.tsv").toAbsolutePath().toString();

public static List<FieldWithScheme> readFieldsWithScheme() {
if (fields == null)
read();
return fields;
}

public static List<String> getTags() {
if (tags == null)
read();
return tags;
}

public static String getSchemaFile() {
return schemaFile;
}

private static void read() {
fields = new ArrayList<>();
tags = new ArrayList<>();
try (BufferedReader br = new BufferedReader(new FileReader(schemaFile))) {
String line;
while ((line = br.readLine()) != null) {
String[] parts = line.split("\\t");
String tag = parts[0];
if (!parts[1].equals(""))
tag += "/" + parts[1];
tags.add(tag);
fields.add(new FieldWithScheme(tag, parts[2]));
}
} catch (IOException e) {
e.getLocalizedMessage();
}
}

}
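A minimal usage sketch of the class above (the demo class is hypothetical, not part of the commit, and it assumes the working directory is the project root, since the TSV path is resolved relative to it). The expected values follow from k10plus-subjects.tsv below and from PicaSubjectManagerTest at the end of this diff.

import de.gwdg.metadataqa.marc.analysis.FieldWithScheme;
import de.gwdg.metadataqa.marc.utils.pica.PicaSubjectManager;

import java.util.List;

public class PicaSubjectManagerDemo {
  public static void main(String[] args) {
    // The TSV is parsed once and cached; both accessors share the result.
    List<FieldWithScheme> fields = PicaSubjectManager.readFieldsWithScheme();
    System.out.println(fields.size());                 // 40
    System.out.println(fields.get(0).getTag());        // 041A/00-99
    System.out.println(fields.get(0).getSchemaName()); // Schlagwortfolgen (DNB und Verbünde)

    // Plain tag list (one entry per TSV row), used for the subject tag index.
    List<String> tags = PicaSubjectManager.getTags();
    System.out.println(tags.size());                   // 40
  }
}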
40 changes: 40 additions & 0 deletions src/main/resources/pica/k10plus-subjects.tsv
@@ -0,0 +1,40 @@
041A 00-99 Schlagwortfolgen (DNB und Verbünde)
044A LoC Subject Headings
044C Medical Subject Headings (MeSH)
044H Erschließung von Musikalien nach Besetzung und Form/Gattung
044K 00-09 Schlagwortfolgen (GBV, SWB, K10plus)
044L 00-09 Einzelschlagwörter (Projekte)
044N Schlagwörter aus einem Thesaurus und freie Schlagwörter
044S Gattungsbegriffe bei Alten Drucken
044Z 00-99 Lokale Schlagwörter auf bibliografischer Ebene
045A LCC-Notation
045B 00 Allgemeine Systematik für Bibliotheken (ASB)
045B 01 Systematik der Stadtbibliothek Duisburg (SSD)
045B 02 Systematik für Bibliotheken (SfB)
045B 03 Klassifikation für Allgemeinbibliotheken (KAB)
045B 04 Systematiken der ekz
045B 05 Gattungsbegriffe (DNB)
045C Klassifikation der National Library of Medicine (NLM)
045D 00-29 STW-Schlagwörter
045D 30-39 STW-Schlagwörter - automatisierte verbale Sacherschließung
045D 40-48 STW-Schlagwörter - Platzhalter
045D 49 ZBW-Schlagwörter - Veröffentlichungsart
045D 50 Vorläufige Schlagwörter (STW)
045D 60 FIV-Schlagwörter (Themen)
045D 70 FIV-Schlagwörter (Aspekte)
045E Sachgruppen der Deutschen Nationalbibliografie bis 2003
045F DDC-Notation
045G Sachgruppen der Deutschen Nationalbibliografie ab 2004
045H 00-99 DDC-Notation: Vollständige Notation
045M 00-99 Lokale Notationen auf bibliografischer Ebene
045N FIV-Regionalklassifikation
045N 01 FIV-Sachklassifikation
045N 02 Sonstige Notation des FIV
045Q 01 Basisklassifikation
045R Regensburger Verbundklassifikation (RVK)
045S Deutsche Bibliotheksstatistik (DBS)
045T Nicht mehr gültige Notationen der Regensburger Verbundklassifikation (RVK)
045V SSG-Nummer/FID-Kennzeichen
045W SSG-Angabe für thematische OLC-Ausschnitte
045X Notation eines Klassifikationssystems
045Y SSG-Angabe für Fachkataloge
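The second column above holds an occurrence or occurrence range; PicaSubjectManager appends it to the tag with a slash (for example 041A/00-99 or 045B/00), which is also why the ASB entry in vocabularies.json below now points at 045B/00 instead of 045B.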
11 changes: 11 additions & 0 deletions src/main/resources/pica/update-avram-k10plus-subjects.sh
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
#--------------------------------
# retrieve subjects from K10plus
#--------------------------------

curl -s https://format.k10plus.de/avram.pl?profile=k10plus-title \
| jq -r '.fields[] | select(.tag | match("04[45]|041A")) | [.tag, .occurrence, .label] | @tsv' -r \
> k10plus-subjects.tsv
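
For reference: the jq filter keeps the Avram field definitions whose tag contains 044, 045, or 041A, and emits tag, occurrence, and label as tab-separated columns, the exact three-column layout that PicaSubjectManager expects.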


2 changes: 1 addition & 1 deletion src/main/resources/pica/vocabularies.json
@@ -11,7 +11,7 @@
},
{
"ID": "^a(.+)",
"PICA": "045B",
"PICA": "045B/00",
"SRC": "^A(.+)",
"VOC": "asb",
"namespace": "http://uri.gbv.de/terminology/asb/",
@@ -228,48 +228,90 @@ public void pica() throws IOException {
output = new File(outputDir, "classifications-by-schema.csv");
assertTrue(output.exists());
actual = Files.readString(output.toPath());
+/*
assertEquals(
"id,field,location,scheme,abbreviation,abbreviation4solr,recordcount,instancecount,type\n" +
"4,045A,$a,\"Library of Congress Classification\",\"lcc\",lcc,1,1,CLASSIFICATION_SCHEME\n" +
"1,045E,$a,\"This mixes multiple systems used in DNB before 2004\",\"dnbsgr\",dnbsgr,2,2,UNKNOWN\n" +
"3,045F,$a,\"Dewey-Dezimalklassifikation\",\"ddc\",ddc,1,1,CLASSIFICATION_SCHEME\n" +
"2,045R,$a,\"Regensburger Verbundklassifikation\",\"rvk\",rvk,1,1,CLASSIFICATION_SCHEME\n",
actual);
+*/
+assertEquals(
+"id,field,location,scheme,abbreviation,abbreviation4solr,recordcount,instancecount,type\n" +
+"4,044A,$a,\"LoC Subject Headings\",\"lcsh0\",lcsh0,1,2,SUBJECT_HEADING\n" +
+"5,045A,$a,\"LCC-Notation\",\"lcc\",lcc,1,1,CLASSIFICATION_SCHEME\n" +
+"1,045E,$a,\"Sachgruppen der Deutschen Nationalbibliografie bis 2003\",\"sdnb\",sdnb,2,2,UNKNOWN\n" +
+"6,045F,$a,\"DDC-Notation\",\"ddc\",ddc,1,1,CLASSIFICATION_SCHEME\n" +
+"2,045R,$a,\"Regensburger Verbundklassifikation (RVK)\",\"rvk\",rvk,1,1,CLASSIFICATION_SCHEME\n" +
+"3,045V,$a,\"SSG-Nummer/FID-Kennzeichen\",\"045V\",045v,1,1,UNKNOWN\n",
+actual);

output = new File(outputDir, "classifications-by-schema-subfields.csv");
assertTrue(output.exists());
actual = Files.readString(output.toPath());
+/*
assertEquals(
"id,subfields,count\n" +
"4,a,1\n" +
"1,a+,2\n" +
"3,a+,1\n" +
"2,V;a;j;k+;3;7;9,1\n", actual);
+*/
+
+assertEquals(
+"id,subfields,count\n" +
+"4,a,1\n" +
+"4,a+,1\n" +
+"5,a,1\n" +
+"1,a+,2\n" +
+"6,a+,1\n" +
+"2,V;a;j;k+;3;7;9,1\n" +
+"3,a,1\n", actual);

output = new File(outputDir, "classifications-collocations.csv");
assertTrue(output.exists());
actual = Files.readString(output.toPath());
+/*
assertEquals("abbreviations,recordcount,percent\n" +
"dnbsgr;rvk,1,50.00%\n" +
"ddc;dnbsgr;lcc,1,50.00%\n", actual);
+*/
+assertEquals("abbreviations,recordcount,percent\n" +
+"ddc;lcc;lcsh0;sdnb,1,50.00%\n" +
+"045V;rvk;sdnb,1,50.00%\n", actual);

output = new File(outputDir, "classifications-histogram.csv");
assertTrue(output.exists());
actual = Files.readString(output.toPath());
+/*
assertEquals(
"count,frequency\n" +
"0,4\n" +
"2,1\n" +
"3,1\n", actual);
+*/
+assertEquals(
+"count,frequency\n" +
+"0,4\n" +
+"3,1\n" +
+"5,1\n", actual);

output = new File(outputDir, "classifications-frequency-examples.csv");
assertTrue(output.exists());
actual = Files.readString(output.toPath());
+/*
assertEquals(
"count,id\n" +
"0,010000011\n" +
"2,010000054\n" +
"3,010000070\n", actual);
+*/
+assertEquals(
+"count,id\n" +
+"0,010000011\n" +
+"3,010000054\n" +
+"5,010000070\n", actual);

clearOutput(outputDir, outputFiles);
}
@@ -0,0 +1,20 @@
package de.gwdg.metadataqa.marc.utils.pica;

import de.gwdg.metadataqa.marc.analysis.FieldWithScheme;
import org.junit.Test;

import java.util.List;

import static org.junit.Assert.*;

public class PicaSubjectManagerTest {

@Test
public void readFieldsWithScheme() {
List<FieldWithScheme> fields = PicaSubjectManager.readFieldsWithScheme();
assertEquals(40, fields.size());
assertEquals(FieldWithScheme.class, fields.get(0).getClass());
assertEquals("041A/00-99", fields.get(0).getTag());
assertEquals("Schlagwortfolgen (DNB und Verbünde)", fields.get(0).getSchemaName());
}
}
