-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
18 changed files
with
730 additions
and
133 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Calling MARC record-patterns
# Sourcing common-variables is presumably what defines $JAR -- TODO confirm.
. ./common-variables

# Both expansions are quoted so that a jar path or command line arguments
# containing spaces or glob characters reach the JVM as single, unmodified words.
java -cp "$JAR" de.gwdg.metadataqa.marc.cli.DataElements "$@"
93 changes: 93 additions & 0 deletions
93
src/main/java/de/gwdg/metadataqa/marc/analysis/DataElementCounter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
package de.gwdg.metadataqa.marc.analysis; | ||
|
||
import de.gwdg.metadataqa.marc.MarcSubfield; | ||
import de.gwdg.metadataqa.marc.dao.DataField; | ||
import de.gwdg.metadataqa.marc.dao.MarcRecord; | ||
import org.apache.commons.io.FileUtils; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.LinkedHashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.logging.Logger; | ||
|
||
public class DataElementCounter { | ||
|
||
public enum Basis{EXISTENCE, OCCURENCE} | ||
|
||
private static final Logger logger = Logger.getLogger(DataElementCounter.class.getCanonicalName()); | ||
|
||
List<DataElement> elements = new ArrayList<>(); | ||
Map<String, List<DataElement>> tags = new LinkedHashMap<>(); | ||
private final String header; | ||
private final Basis basis; | ||
|
||
public DataElementCounter(String dir, String fileName, Basis basis) { | ||
this.basis = basis; | ||
File file = new File(dir, fileName); | ||
String _header = ""; | ||
try { | ||
List<String> lines = FileUtils.readLines(file, "utf-8"); | ||
_header = lines.get(0); | ||
String[] topFields = _header.split(","); | ||
for (String field : topFields) { | ||
String[] parts = field.split("\\$"); | ||
DataElement element = new DataElement(parts[0], parts[1]); | ||
elements.add(element); | ||
tags.computeIfAbsent(element.field, s -> new ArrayList<>()); | ||
tags.get(element.field).add(element); | ||
} | ||
} catch (IOException e) { | ||
e.printStackTrace(); | ||
} | ||
this.header = _header; | ||
} | ||
|
||
public List<Integer> count(MarcRecord marcRecord) { | ||
List<Integer> counts = new ArrayList<>(); | ||
for (Map.Entry<String, List<DataElement>> entry : tags.entrySet()) { | ||
List<DataField> instances = marcRecord.getDatafield(entry.getKey()); | ||
if (instances == null || instances.isEmpty()) { | ||
for (DataElement element : entry.getValue()) { | ||
counts.add(0); | ||
} | ||
} else { | ||
Map<String, Integer> result = new LinkedHashMap<>(); | ||
for (DataField instance : instances) { | ||
for (DataElement element : entry.getValue()) { | ||
result.computeIfAbsent(element.subfield, s -> 0); | ||
List<MarcSubfield> subfields = instance.getSubfield(element.subfield); | ||
if (subfields != null && !subfields.isEmpty()) { | ||
result.put(element.subfield, result.get(element.subfield) + subfields.size()); | ||
} | ||
} | ||
} | ||
for (DataElement element : entry.getValue()) { | ||
int score = result.get(element.subfield); | ||
if (basis.equals(Basis.EXISTENCE) && score >= 1) | ||
score = 1; | ||
counts.add(score); | ||
} | ||
} | ||
} | ||
return counts; | ||
} | ||
|
||
public String getHeader() { | ||
return header; | ||
} | ||
|
||
class DataElement { | ||
String field; | ||
String subfield; | ||
String key; | ||
|
||
public DataElement(String field, String subfield) { | ||
this.field = field; | ||
this.subfield = subfield; | ||
this.key = field + "$" + subfield; | ||
} | ||
} | ||
} |
164 changes: 164 additions & 0 deletions
164
src/main/java/de/gwdg/metadataqa/marc/cli/DataElements.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
package de.gwdg.metadataqa.marc.cli; | ||
|
||
import de.gwdg.metadataqa.marc.MarcSubfield; | ||
import de.gwdg.metadataqa.marc.Utils; | ||
import de.gwdg.metadataqa.marc.analysis.DataElementCounter; | ||
import de.gwdg.metadataqa.marc.cli.parameters.CommonParameters; | ||
import de.gwdg.metadataqa.marc.cli.parameters.CompletenessParameters; | ||
import de.gwdg.metadataqa.marc.cli.processor.MarcFileProcessor; | ||
import de.gwdg.metadataqa.marc.cli.utils.RecordIterator; | ||
import de.gwdg.metadataqa.marc.dao.DataField; | ||
import de.gwdg.metadataqa.marc.dao.MarcControlField; | ||
import de.gwdg.metadataqa.marc.dao.MarcPositionalControlField; | ||
import de.gwdg.metadataqa.marc.dao.MarcRecord; | ||
import de.gwdg.metadataqa.marc.definition.ControlValue; | ||
import de.gwdg.metadataqa.marc.definition.tags.TagCategory; | ||
import de.gwdg.metadataqa.marc.model.validation.ValidationErrorFormat; | ||
import de.gwdg.metadataqa.marc.utils.BasicStatistics; | ||
import de.gwdg.metadataqa.marc.utils.TagHierarchy; | ||
import org.apache.commons.cli.HelpFormatter; | ||
import org.apache.commons.cli.Options; | ||
import org.apache.commons.cli.ParseException; | ||
import org.apache.commons.io.FileUtils; | ||
import org.apache.commons.lang3.StringUtils; | ||
import org.marc4j.marc.Record; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.io.Serializable; | ||
import java.nio.charset.Charset; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.nio.file.Paths; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.HashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.TreeMap; | ||
import java.util.logging.Level; | ||
import java.util.logging.Logger; | ||
import java.util.regex.Pattern; | ||
|
||
import static de.gwdg.metadataqa.marc.Utils.createRow; | ||
import static de.gwdg.metadataqa.marc.Utils.quote; | ||
|
||
public class DataElements implements MarcFileProcessor, Serializable { | ||
|
||
private static final Logger logger = Logger.getLogger(DataElements.class.getCanonicalName()); | ||
private static final Pattern dataFieldPattern = Pattern.compile("^(\\d\\d\\d)\\$(.*)$"); | ||
|
||
private final Options options; | ||
private CompletenessParameters parameters; | ||
private Map<String, Integer> library003Counter = new TreeMap<>(); | ||
private Map<String, Integer> libraryCounter = new TreeMap<>(); | ||
private Map<String, Map<String, Integer>> packageCounter = new TreeMap<>(); | ||
private Map<String, Map<String, Integer>> elementCardinality = new TreeMap<>(); | ||
private Map<String, Map<String, Integer>> elementFrequency = new TreeMap<>(); | ||
private Map<String, Map<Integer, Integer>> fieldHistogram = new HashMap<>(); | ||
private boolean readyToProcess; | ||
private DataElementCounter dataElementCounter; | ||
private File outputFile; | ||
|
||
public DataElements(String[] args) throws ParseException { | ||
parameters = new CompletenessParameters(args); | ||
options = parameters.getOptions(); | ||
readyToProcess = true; | ||
} | ||
|
||
public static void main(String[] args) { | ||
MarcFileProcessor processor = null; | ||
try { | ||
processor = new DataElements(args); | ||
} catch (ParseException e) { | ||
System.err.println("ERROR. " + e.getLocalizedMessage()); | ||
System.exit(0); | ||
} | ||
if (processor.getParameters().getArgs().length < 1) { | ||
System.err.println("Please provide a MARC file name!"); | ||
processor.printHelp(processor.getParameters().getOptions()); | ||
System.exit(0); | ||
} | ||
if (processor.getParameters().doHelp()) { | ||
processor.printHelp(processor.getParameters().getOptions()); | ||
System.exit(0); | ||
} | ||
RecordIterator iterator = new RecordIterator(processor); | ||
iterator.start(); | ||
} | ||
|
||
@Override | ||
public CommonParameters getParameters() { | ||
return parameters; | ||
} | ||
|
||
@Override | ||
public void processRecord(Record marc4jRecord, int recordNumber) throws IOException { | ||
// do nothing | ||
} | ||
|
||
@Override | ||
public void processRecord(MarcRecord marcRecord, int recordNumber) throws IOException { | ||
if (parameters.getIgnorableRecords().isIgnorable(marcRecord)) | ||
return; | ||
|
||
printToFile(outputFile, StringUtils.join(dataElementCounter.count(marcRecord), ",") + "\n"); | ||
} | ||
|
||
@Override | ||
public void beforeIteration() { | ||
logger.info(parameters.formatParameters()); | ||
elementCardinality.put("all", new TreeMap<>()); | ||
elementFrequency.put("all", new TreeMap<>()); | ||
packageCounter.put("all", new TreeMap<>()); | ||
dataElementCounter = new DataElementCounter(parameters.getOutputDir(), "top-fields.txt", DataElementCounter.Basis.EXISTENCE); | ||
outputFile = new File(parameters.getOutputDir(), "record-patterns.csv"); | ||
if (outputFile.exists()) | ||
outputFile.delete(); | ||
printToFile(outputFile, dataElementCounter.getHeader() + "\n"); | ||
} | ||
|
||
@Override | ||
public void fileOpened(Path file) { | ||
// do nothing | ||
} | ||
|
||
@Override | ||
public void fileProcessed() { | ||
// do nothing | ||
} | ||
|
||
@Override | ||
public void afterIteration(int numberOfprocessedRecords) { | ||
// do nothing | ||
} | ||
|
||
private void printToFile(File file, String message) { | ||
try { | ||
FileUtils.writeStringToFile(file, message, Charset.defaultCharset(), true); | ||
} catch (IOException e) { | ||
if (parameters.doLog()) | ||
logger.log(Level.SEVERE, "printToFile", e); | ||
} | ||
} | ||
|
||
private char getSeparator(ValidationErrorFormat format) { | ||
if (format.equals(ValidationErrorFormat.TAB_SEPARATED)) { | ||
return '\t'; | ||
} else { | ||
return ','; | ||
} | ||
} | ||
|
||
@Override | ||
public void printHelp(Options options) { | ||
HelpFormatter formatter = new HelpFormatter(); | ||
String message = String.format("java -cp metadata-qa-marc.jar %s [options] [file]", this.getClass().getCanonicalName()); | ||
formatter.printHelp(message, options); | ||
} | ||
|
||
@Override | ||
public boolean readyToProcess() { | ||
return readyToProcess; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.