Skip to content

Commit

Permalink
PICA Fields with occurrences are not properly indexed #294
Browse files (browse the repository at this point in the history)
  • Loading branch information
pkiraly committed Oct 24, 2023
1 parent 178fd6a commit 07580b7
Show file tree
Hide file tree
Showing 11 changed files with 62 additions and 36 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
- [\#332](https://github.com/pkiraly/qa-catalogue/issues/332) Implementing MARC Update No. 35 (December 2022)
- [\#333](https://github.com/pkiraly/qa-catalogue/issues/333) Implementing MARC Update No. 36 (June 2023)

### Improvements

## v0.5.0

Expand Down
10 changes: 5 additions & 5 deletions src/main/java/de/gwdg/metadataqa/marc/MarcSubfield.java
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ public void setMarcRecord(BibliographicRecord marcRecord) {
public String getCodeForIndex() {
if (codeForIndex == null) {
codeForIndex = "_" + code;
if (definition != null && definition.getCodeForIndex() != null) {
codeForIndex = definition.getCodeForIndex();
if (definition != null && definition.getCodeForIndex(marcRecord.getSchemaType()) != null) {
codeForIndex = definition.getCodeForIndex(marcRecord.getSchemaType());
}
}
return codeForIndex;
Expand All @@ -128,10 +128,10 @@ public Map<String, List<String>> getKeyValuePairs(DataFieldKeyGenerator keyGener
prefixCache = new HashMap<>();
}

String tag = this.getField().getTag();
String tagForCache = this.getField().getTag();
if (this.getField().getOccurrence() != null)
tag += "/" + this.getField().getOccurrence();
String cacheKey = String.format("%s$%s-%s-%s", tag, code, keyGenerator.getType().getType(), keyGenerator.getMarcVersion());
tagForCache += "/" + this.getField().getOccurrence();
String cacheKey = String.format("%s$%s-%s-%s", tagForCache, code, keyGenerator.getType().getType(), keyGenerator.getMarcVersion());
if (!prefixCache.containsKey(cacheKey))
prefixCache.put(cacheKey, keyGenerator.forSubfield(this));
String prefix = prefixCache.get(cacheKey);
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,8 @@ public void processRecord(BibliographicRecord bibliographicRecord, int recordNum
return;

if (bibliographicRecord.getSchemaType().equals(SchemaType.PICA) && doGroups())
for (DataField field : bibliographicRecord.getDatafield(((PicaPath) groupBy).getTag()))
field.addFieldIndexer(groupIndexer);
for (DataField groupField : bibliographicRecord.getDatafield(((PicaPath) groupBy).getTag()))
groupField.addFieldIndexer(groupIndexer);

Map<String, List<String>> map = bibliographicRecord.getKeyValuePairs(
parameters.getSolrFieldType(), true, parameters.getMarcVersion()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package de.gwdg.metadataqa.marc.cli.utils;

import de.gwdg.metadataqa.marc.definition.bibliographic.SchemaType;
import de.gwdg.metadataqa.marc.definition.controlpositions.Control006Positions;
import de.gwdg.metadataqa.marc.definition.controlpositions.Control007Positions;
import de.gwdg.metadataqa.marc.definition.controlpositions.Control008Positions;
Expand Down Expand Up @@ -135,7 +136,7 @@ private static void tagToHtml(DataFieldDefinition tag) {
for (SubfieldDefinition subfield : tag.getSubfields()) {
text.append(row(
String.format("%s$%s", tag.getTag(), subfield.getCode()),
String.format("%s%s", tag.getIndexTag(), subfield.getCodeForIndex()),
String.format("%s%s", tag.getIndexTag(), subfield.getCodeForIndex(SchemaType.MARC21)),
subfield.getLabel()
));
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package de.gwdg.metadataqa.marc.cli.utils;

import de.gwdg.metadataqa.marc.definition.bibliographic.SchemaType;
import de.gwdg.metadataqa.marc.definition.structure.DataFieldDefinition;
import de.gwdg.metadataqa.marc.definition.structure.SubfieldDefinition;
import de.gwdg.metadataqa.marc.utils.MarcTagLister;
Expand Down Expand Up @@ -56,7 +57,7 @@ private static void tagToMarkDown(DataFieldDefinition tag) {
for (SubfieldDefinition subfield : tag.getSubfields()) {
System.out.printf("| `%s$%s` | `%s%s` | %s |%n",
tag.getTag(), subfield.getCode(),
tag.getIndexTag(), subfield.getCodeForIndex(),
tag.getIndexTag(), subfield.getCodeForIndex(SchemaType.MARC21),
subfield.getLabel());
}
}
Expand Down
16 changes: 13 additions & 3 deletions src/main/java/de/gwdg/metadataqa/marc/dao/DataField.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import de.gwdg.metadataqa.marc.model.SolrFieldType;
import de.gwdg.metadataqa.marc.model.validation.ErrorsCollector;
import de.gwdg.metadataqa.marc.utils.keygenerator.DataFieldKeyGenerator;
import de.gwdg.metadataqa.marc.utils.pica.PicaFieldDefinition;
import org.apache.commons.lang3.StringUtils;

import java.io.Serializable;
Expand Down Expand Up @@ -386,9 +387,18 @@ public Map<String, List<String>> getKeyValuePairs(SolrFieldType type,
MarcVersion marcVersion) {
Map<String, List<String>> pairs = new HashMap<>();

String tag = getTag();
if (getOccurrence() != null)
tag += "_" + getOccurrence();
if (marcRecord != null && marcRecord.getSchemaType().equals(SchemaType.PICA) && definition != null) {
PicaFieldDefinition picaDefinition = (PicaFieldDefinition) definition;
tag = picaDefinition.getTag();
if (picaDefinition.getCounter() != null)
tag += "_" + picaDefinition.getCounter();
else if (picaDefinition.getOccurrence() != null)
tag += "_" + picaDefinition.getOccurrence();
} else {
tag = getTag();
if (getOccurrence() != null)
tag += "/" + getOccurrence();
}

SchemaType schemaType = marcRecord != null ? marcRecord.getSchemaType() : SchemaType.MARC21;
DataFieldKeyGenerator keyGenerator = new DataFieldKeyGenerator(definition, type, tag, schemaType);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import de.gwdg.metadataqa.marc.definition.FRBRFunction;
import de.gwdg.metadataqa.marc.definition.MarcVersion;
import de.gwdg.metadataqa.marc.definition.bibliographic.BibliographicFieldDefinition;
import de.gwdg.metadataqa.marc.definition.bibliographic.SchemaType;
import de.gwdg.metadataqa.marc.definition.general.codelist.CodeList;
import de.gwdg.metadataqa.marc.definition.general.parser.SubfieldContentParser;
import de.gwdg.metadataqa.marc.definition.general.validator.SubfieldValidator;
Expand Down Expand Up @@ -49,7 +50,7 @@ public class SubfieldDefinition implements Serializable {
private List<MarcVersion> disallowedIn;
private MarcVersion marcVersion = null;

public String getCodeForIndex() {
public String getCodeForIndex(SchemaType schemaType) {
if (codeForIndex == null) {
if (mqTag != null) {
if (mqTag.equals("rdf:value"))
Expand All @@ -70,7 +71,7 @@ else if (code.equals("*"))
else if (code.equals("@"))
codeForIndex = "_at";
else
codeForIndex = "_" + code;
codeForIndex = schemaType.equals(SchemaType.PICA) ? code : "_" + code;
}
}
return codeForIndex;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ public class DataFieldKeyGenerator {
private String indexTag;
public static final Pattern nonValidSubfieldCode = Pattern.compile("[^0-9a-zA-Z]");
private MarcVersion marcVersion;
private SchemaType schemaType;

public DataFieldKeyGenerator(DataFieldDefinition definition, SolrFieldType type) {
this.definition = definition;
Expand All @@ -33,9 +34,10 @@ public DataFieldKeyGenerator(DataFieldDefinition definition,
SchemaType schemaType) {
this.definition = definition;
this.type = type;
this.schemaType = schemaType;
if (schemaType.equals(SchemaType.PICA)) {
this.tag = tag;
this.indexTag = tag;
this.indexTag = escape(tag);
} else {
if (definition != null) {
this.tag = definition.getTag();
Expand Down Expand Up @@ -101,14 +103,14 @@ public String forSubfield(MarcSubfield subfield) {
SubfieldDefinition subfieldDefinition = subfield.getDefinition();
if (subfieldDefinition == null && definition != null)
subfieldDefinition = definition.getVersionSpecificSubfield(marcVersion, code);
String codeForIndex = (subfieldDefinition != null) ? subfieldDefinition.getCodeForIndex() : code;
String codeForIndex = (subfieldDefinition != null) ? subfieldDefinition.getCodeForIndex(schemaType) : code;
String key = forSubfield(code, codeForIndex);

return addVersion(subfieldDefinition, key);
}

public String forSubfield(SubfieldDefinition subfield) {
String key = forSubfield(subfield.getCode(), subfield.getCodeForIndex());
String key = forSubfield(subfield.getCode(), subfield.getCodeForIndex(schemaType));
return addVersion(subfield, key);
}

Expand All @@ -120,18 +122,19 @@ private String addVersion(SubfieldDefinition subfieldDefinition, String key) {

private String forSubfield(String code, String codeForIndex) {
String safeTag = nonValidSubfieldCode.matcher(tag).find() ? escape(tag) : tag;
if (nonValidSubfieldCode.matcher(code).matches())
if (nonValidSubfieldCode.matcher(code).matches()) {
code = String.format("x%x", (int) code.charAt(0));

}

String key = "";
switch (type) {
case HUMAN:
key = String.format("%s%s", indexTag, codeForIndex); break;
key = String.format("%s%s", indexTag, codeForIndex);
break;
case MIXED:
if (!tag.equals(indexTag) && !codeForIndex.equals("_" + code))
if ((schemaType == null || !schemaType.equals(SchemaType.PICA)) && !tag.equals(indexTag) && !codeForIndex.equals("_" + code))
key = String.format("%s%s_%s%s", safeTag, code, indexTag, codeForIndex);
else if (!tag.equals(indexTag) && codeForIndex.equals("_" + code))
else if ((schemaType == null || !schemaType.equals(SchemaType.PICA)) && !tag.equals(indexTag) && codeForIndex.equals("_" + code))
key = String.format("%s%s_%s", safeTag, code, indexTag);
else
key = String.format("%s%s", safeTag, code);
Expand All @@ -149,8 +152,9 @@ private String escape(String tag) {
List<String> safe = new ArrayList<>();
for (int i = 0; i < tag.length(); i++) {
String code = tag.substring(i, i+1);
if (nonValidSubfieldCode.matcher(code).matches())
code = String.format("x%x", (int) code.charAt(0));
if (nonValidSubfieldCode.matcher(code).matches()) {
code = "_"; // code = String.format("x%x", (int) code.charAt(0));
}
safe.add(code);
}
return StringUtils.join(safe, "");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ public class PicaFieldDefinition extends DataFieldDefinition {
private String occurrence;
private PicaRange range;
private String id;
private String counter;

private PicaFieldDefinition(){};

Expand All @@ -24,6 +25,7 @@ public PicaFieldDefinition(PicaTagDefinition picaTagDefinition) {
occurrence = picaTagDefinition.getOccurrence();
id = picaTagDefinition.getId();
range = picaTagDefinition.getRange();
counter = picaTagDefinition.getCounter();
indexSubfields();
}

Expand All @@ -39,6 +41,10 @@ public String getOccurrence() {
return occurrence;
}

public String getCounter() {
return counter;
}

public PicaRange getRange() {
return range;
}
Expand Down Expand Up @@ -81,6 +87,7 @@ public PicaFieldDefinition copyWithChangesId() {
other.modified = getModified();
other.pica3 = getPica3();
other.occurrence = getOccurrence();
other.counter = getCounter();
other.range = getRange();
other.indexSubfields();

Expand Down
6 changes: 3 additions & 3 deletions src/test/java/de/gwdg/metadataqa/marc/MarcFactoryTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -460,9 +460,9 @@ public void marc2Test() throws IOException, URISyntaxException {
}
}

assertEquals(Arrays.asList("English"), marcRecord
.getKeyValuePairs(SolrFieldType.HUMAN)
.get("AdminMetadata_languageOfCataloging"));
Map<String, List<String>> map = marcRecord.getKeyValuePairs(SolrFieldType.HUMAN);
System.err.println(map);
assertEquals(Arrays.asList("English"), map.get("AdminMetadata_languageOfCataloging"));
}

@Test
Expand Down
19 changes: 10 additions & 9 deletions src/test/java/de/gwdg/metadataqa/marc/cli/MarcToSolrTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,9 @@ public void pica() throws Exception {
Record record = reader.next();
BibliographicRecord marcRecord = MarcFactory.createPicaFromMarc4j(record, schema);
Map<String, List<String>> map = marcRecord.getKeyValuePairs(SolrFieldType.HUMAN, true, MarcVersion.MARC21);
// System.err.println(map.keySet());
System.err.println(map.keySet());
assertTrue(marcRecord.asJson().contains("036E/01"));
assertTrue(map.containsKey("036E_01_a"));
assertTrue(map.containsKey("036E_00_09a"));
}

@Test
Expand All @@ -82,13 +82,14 @@ public void pica_extra() throws Exception {
field.addFieldIndexer(groupIndexer);

Map<String, List<String>> map = bibliographicRecord.getKeyValuePairs(SolrFieldType.MIXED, true, MarcVersion.MARC21);
assertTrue(map.containsKey("001x400"));
assertEquals(5, map.get("001x400").size());
assertEquals("20,70,77,2035", map.get("001x400").get(0));
assertEquals("20", map.get("001x400").get(1));
assertEquals("70", map.get("001x400").get(2));
assertEquals("77", map.get("001x400").get(3));
assertEquals("2035", map.get("001x400").get(4));
System.err.println(map.keySet());
assertTrue(map.containsKey("001_0"));
assertEquals(5, map.get("001_0").size());
assertEquals("20,70,77,2035", map.get("001_0").get(0));
assertEquals("20", map.get("001_0").get(1));
assertEquals("70", map.get("001_0").get(2));
assertEquals("77", map.get("001_0").get(3));
assertEquals("2035", map.get("001_0").get(4));
}

@Test
Expand Down

0 comments on commit 07580b7

Please sign in to comment.