Skip to content

Commit

Permalink
Implementing record patterns
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Jun 3, 2021
1 parent 3053f64 commit ed15bf0
Show file tree
Hide file tree
Showing 18 changed files with 730 additions and 133 deletions.
21 changes: 21 additions & 0 deletions common-script
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,24 @@ do_marc_history() {
Rscript scripts/marc-history.R ${OUTPUT_DIR} &>> ${PREFIX}/marc-history.log
}

# Produces the record-patterns report inside ${OUTPUT_DIR}:
#   1. runs scripts/top-fields.R against ${OUTPUT_DIR},
#   2. runs ./record-patterns over ${MARC_DIR}/${MASK}, which writes
#      ${OUTPUT_DIR}/record-patterns.csv,
#   3. collapses identical rows of that CSV into
#      ${OUTPUT_DIR}/record-patterns-groupped.csv, prefixed with a "count" column
#      and ordered by descending count.
# Reads the globals OUTPUT_DIR, PREFIX, TYPE_PARAMS, MARC_DIR and MASK; sets PARAMS.
do_record_patterns() {
  printf "%s %s> [record-patterns]\n" $(date +"%F %T")
  printf "%s %s> Rscript scripts/top-fields.R ${OUTPUT_DIR} &>> ${PREFIX}/top-fields.log\n" $(date +"%F %T")
  Rscript scripts/top-fields.R "${OUTPUT_DIR}" &>> "${PREFIX}/top-fields.log"

  # Strip the options that must not reach ./record-patterns. The 'g' flag matters:
  # without it only the first match is removed, so a TYPE_PARAMS holding both
  # options would still pass one of them on.
  PARAMS=$(echo "${TYPE_PARAMS}" | sed -r 's/--emptyLargeCollectors|--with-delete//g')
  printf "%s %s> ./record-patterns --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} &> ${PREFIX}/record-patterns.log\n" $(date +"%F %T")
  # PARAMS and the MARC_DIR/MASK path stay unquoted on purpose: the former must be
  # word-split into separate options, the latter may be a glob that has to expand.
  ./record-patterns --defaultRecordType BOOKS ${PARAMS} --outputDir "${OUTPUT_DIR}/" ${MARC_DIR}/${MASK} &> "${PREFIX}/record-patterns.log"

  # Header row of the grouped file: the original header with "count," prepended.
  head -1 "${OUTPUT_DIR}/record-patterns.csv" | sed -e 's/^/count,/' > "${OUTPUT_DIR}/record-patterns-groupped.csv"
  # Data rows: drop every line containing a literal '$' (presumably just the
  # header, whose column names look like tag$subfield), count identical rows,
  # sort by descending count and turn uniq's "  N row" prefix into "N,row".
  grep -v '\$' "${OUTPUT_DIR}/record-patterns.csv" \
    | sort \
    | uniq -c \
    | sort -n -r \
    | sed -r 's/^ *([0-9]+) /\1,/' >> "${OUTPUT_DIR}/record-patterns-groupped.csv"
}

do_version_link() {
printf "%s %s> [version-link]\n" $(date +"%F %T")
if [[ "$VERSION" != "" ]]; then
Expand Down Expand Up @@ -255,6 +273,9 @@ case "$1" in
marc-history)
do_marc_history
;;
record-patterns)
do_record_patterns
;;
all-analyses)
do_all_analyses
;;
Expand Down
4 changes: 4 additions & 0 deletions record-patterns
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Calling MARC record-patterns
# Sources ./common-variables (expected to define $JAR) and forwards every
# command line argument unchanged to the DataElements CLI.
. ./common-variables

# "$@" (quoted) keeps each argument intact; a bare $@ would re-split arguments
# containing spaces and expand any wildcard characters in them.
java -cp "$JAR" de.gwdg.metadataqa.marc.cli.DataElements "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
package de.gwdg.metadataqa.marc.analysis;

import de.gwdg.metadataqa.marc.MarcSubfield;
import de.gwdg.metadataqa.marc.dao.DataField;
import de.gwdg.metadataqa.marc.dao.MarcRecord;
import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Scores a fixed list of data elements ({@code field$subfield} pairs) against MARC records.
 *
 * <p>The list of data elements is taken from the first line of a file, which must be a
 * comma separated list of {@code field$subfield} entries. That same line is exposed
 * unchanged through {@link #getHeader()}, and {@link #count(MarcRecord)} returns one score
 * per data element.
 */
public class DataElementCounter {

  /**
   * Scoring mode: {@code EXISTENCE} caps every score at 1 (present / absent),
   * {@code OCCURENCE} reports the number of matching subfields.
   */
  public enum Basis{EXISTENCE, OCCURENCE}

  private static final Logger logger = Logger.getLogger(DataElementCounter.class.getCanonicalName());

  /** All data elements, in the order they appear in the header line. */
  List<DataElement> elements = new ArrayList<>();
  /** The same data elements grouped by field tag; tags keep their first-seen order. */
  Map<String, List<DataElement>> tags = new LinkedHashMap<>();
  private final String header;
  private final Basis basis;

  /**
   * Reads the data element list from the first line of {@code dir/fileName}.
   *
   * <p>If the file cannot be read, is empty, or its first line is blank, the problem is
   * logged and the counter is left without data elements and with an empty header.
   *
   * @param dir directory that contains the file
   * @param fileName name of the file holding the comma separated {@code field$subfield} list
   * @param basis scoring mode used by {@link #count(MarcRecord)}
   * @throws IllegalArgumentException if an entry of the header line has no {@code $subfield} part
   */
  public DataElementCounter(String dir, String fileName, Basis basis) {
    this.basis = basis;
    File file = new File(dir, fileName);
    String firstLine = "";
    try {
      List<String> lines = FileUtils.readLines(file, "utf-8");
      if (lines.isEmpty() || lines.get(0).trim().isEmpty()) {
        // Nothing to parse: keep the empty header instead of failing on a missing index.
        logger.warning("No data elements found in the first line of " + file);
      } else {
        firstLine = lines.get(0);
        for (String field : firstLine.split(",")) {
          String[] parts = field.split("\\$");
          if (parts.length < 2) {
            // Skipping the entry would shift the counts against the header columns,
            // so a malformed header is reported instead of being tolerated.
            throw new IllegalArgumentException(
              "Malformed data element '" + field + "' in " + file + ": expected field$subfield");
          }
          DataElement element = new DataElement(parts[0], parts[1]);
          elements.add(element);
          tags.computeIfAbsent(element.field, s -> new ArrayList<>()).add(element);
        }
      }
    } catch (IOException e) {
      logger.log(Level.SEVERE, "Unable to read data elements from " + file, e);
    }
    this.header = firstLine;
  }

  /**
   * Scores every known data element for one record.
   *
   * <p>Scores are emitted tag by tag in the order of {@code tags}, and within a tag in the
   * order of that tag's data elements. A data element whose field is missing from the
   * record scores 0. Otherwise its score is the number of matching subfields summed over
   * all instances of the field, capped at 1 when the basis is {@code EXISTENCE}.
   *
   * @param marcRecord the record to inspect
   * @return one score per data element
   */
  public List<Integer> count(MarcRecord marcRecord) {
    List<Integer> counts = new ArrayList<>(elements.size());
    for (Map.Entry<String, List<DataElement>> entry : tags.entrySet()) {
      List<DataElement> tagElements = entry.getValue();
      List<DataField> instances = marcRecord.getDatafield(entry.getKey());
      if (instances == null || instances.isEmpty()) {
        // The field is absent: every data element of this tag scores zero.
        for (int i = 0; i < tagElements.size(); i++) {
          counts.add(0);
        }
        continue;
      }

      // Sum the subfield occurrences over all instances of the field.
      Map<String, Integer> occurrences = new LinkedHashMap<>();
      for (DataField instance : instances) {
        for (DataElement element : tagElements) {
          List<MarcSubfield> subfields = instance.getSubfield(element.subfield);
          int found = (subfields == null) ? 0 : subfields.size();
          occurrences.merge(element.subfield, found, Integer::sum);
        }
      }

      for (DataElement element : tagElements) {
        int score = occurrences.get(element.subfield);
        if (basis == Basis.EXISTENCE && score >= 1) {
          score = 1;
        }
        counts.add(score);
      }
    }
    return counts;
  }

  /**
   * @return the first line of the source file as it was read, or an empty string if it
   *         could not be read or was blank
   */
  public String getHeader() {
    return header;
  }

  /** A single data element: a field tag, a subfield code and their {@code field$subfield} key. */
  class DataElement {
    String field;
    String subfield;
    String key;

    public DataElement(String field, String subfield) {
      this.field = field;
      this.subfield = subfield;
      this.key = field + "$" + subfield;
    }
  }
}
164 changes: 164 additions & 0 deletions src/main/java/de/gwdg/metadataqa/marc/cli/DataElements.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
package de.gwdg.metadataqa.marc.cli;

import de.gwdg.metadataqa.marc.MarcSubfield;
import de.gwdg.metadataqa.marc.Utils;
import de.gwdg.metadataqa.marc.analysis.DataElementCounter;
import de.gwdg.metadataqa.marc.cli.parameters.CommonParameters;
import de.gwdg.metadataqa.marc.cli.parameters.CompletenessParameters;
import de.gwdg.metadataqa.marc.cli.processor.MarcFileProcessor;
import de.gwdg.metadataqa.marc.cli.utils.RecordIterator;
import de.gwdg.metadataqa.marc.dao.DataField;
import de.gwdg.metadataqa.marc.dao.MarcControlField;
import de.gwdg.metadataqa.marc.dao.MarcPositionalControlField;
import de.gwdg.metadataqa.marc.dao.MarcRecord;
import de.gwdg.metadataqa.marc.definition.ControlValue;
import de.gwdg.metadataqa.marc.definition.tags.TagCategory;
import de.gwdg.metadataqa.marc.model.validation.ValidationErrorFormat;
import de.gwdg.metadataqa.marc.utils.BasicStatistics;
import de.gwdg.metadataqa.marc.utils.TagHierarchy;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.marc4j.marc.Record;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import static de.gwdg.metadataqa.marc.Utils.createRow;
import static de.gwdg.metadataqa.marc.Utils.quote;

/**
 * Command line processor that writes one CSV row per MARC record to
 * {@code record-patterns.csv} in the output directory.
 *
 * <p>The columns are the data elements listed in {@code top-fields.txt} (read from the same
 * output directory); each cell is the score {@link DataElementCounter} computes for that
 * data element with the {@code EXISTENCE} basis.
 */
public class DataElements implements MarcFileProcessor, Serializable {

  private static final Logger logger = Logger.getLogger(DataElements.class.getCanonicalName());

  private final Options options;
  private CompletenessParameters parameters;
  // The counters below are never read in this class. They are kept because the class is
  // Serializable and instance fields are part of its serialized form.
  private Map<String, Integer> library003Counter = new TreeMap<>();
  private Map<String, Integer> libraryCounter = new TreeMap<>();
  private Map<String, Map<String, Integer>> packageCounter = new TreeMap<>();
  private Map<String, Map<String, Integer>> elementCardinality = new TreeMap<>();
  private Map<String, Map<String, Integer>> elementFrequency = new TreeMap<>();
  private Map<String, Map<Integer, Integer>> fieldHistogram = new HashMap<>();
  private boolean readyToProcess;
  private DataElementCounter dataElementCounter;
  private File outputFile;

  /**
   * @param args the command line arguments, parsed as {@link CompletenessParameters}
   * @throws ParseException if the arguments cannot be parsed
   */
  public DataElements(String[] args) throws ParseException {
    parameters = new CompletenessParameters(args);
    options = parameters.getOptions();
    readyToProcess = true;
  }

  /**
   * Entry point. Exits with status 1 when the arguments are invalid or no input file is
   * given, prints the usage and exits with status 0 when help is requested, and otherwise
   * iterates over the records of the given files.
   */
  public static void main(String[] args) {
    final MarcFileProcessor processor;
    try {
      processor = new DataElements(args);
    } catch (ParseException e) {
      System.err.println("ERROR. " + e.getLocalizedMessage());
      // A non-zero status lets the calling script detect the failure.
      System.exit(1);
      return; // not reached; tells the compiler that processor is always assigned below
    }
    // Help is checked first so that asking for help without a file is not treated as an error.
    if (processor.getParameters().doHelp()) {
      processor.printHelp(processor.getParameters().getOptions());
      System.exit(0);
    }
    if (processor.getParameters().getArgs().length < 1) {
      System.err.println("Please provide a MARC file name!");
      processor.printHelp(processor.getParameters().getOptions());
      System.exit(1);
    }
    RecordIterator iterator = new RecordIterator(processor);
    iterator.start();
  }

  @Override
  public CommonParameters getParameters() {
    return parameters;
  }

  @Override
  public void processRecord(Record marc4jRecord, int recordNumber) throws IOException {
    // do nothing
  }

  /** Appends the comma separated scores of the record to the output file, unless the record is ignorable. */
  @Override
  public void processRecord(MarcRecord marcRecord, int recordNumber) throws IOException {
    if (parameters.getIgnorableRecords().isIgnorable(marcRecord)) {
      return;
    }

    printToFile(outputFile, StringUtils.join(dataElementCounter.count(marcRecord), ",") + "\n");
  }

  /** Loads the data element list and starts a fresh output file holding only the header row. */
  @Override
  public void beforeIteration() {
    logger.info(parameters.formatParameters());
    elementCardinality.put("all", new TreeMap<>());
    elementFrequency.put("all", new TreeMap<>());
    packageCounter.put("all", new TreeMap<>());
    dataElementCounter = new DataElementCounter(parameters.getOutputDir(), "top-fields.txt", DataElementCounter.Basis.EXISTENCE);
    outputFile = new File(parameters.getOutputDir(), "record-patterns.csv");
    // printToFile() appends, so a leftover file from a previous run has to go first.
    if (outputFile.exists() && !outputFile.delete()) {
      logger.warning("Could not delete existing " + outputFile + "; new rows will be appended to it");
    }
    printToFile(outputFile, dataElementCounter.getHeader() + "\n");
  }

  @Override
  public void fileOpened(Path file) {
    // do nothing
  }

  @Override
  public void fileProcessed() {
    // do nothing
  }

  @Override
  public void afterIteration(int numberOfprocessedRecords) {
    // do nothing
  }

  /**
   * Appends {@code message} to {@code file} using the platform default charset.
   * An I/O failure is only logged, and only when logging is enabled in the parameters.
   */
  private void printToFile(File file, String message) {
    try {
      FileUtils.writeStringToFile(file, message, Charset.defaultCharset(), true);
    } catch (IOException e) {
      if (parameters.doLog()) {
        logger.log(Level.SEVERE, "printToFile", e);
      }
    }
  }

  @Override
  public void printHelp(Options options) {
    HelpFormatter formatter = new HelpFormatter();
    String message = String.format("java -cp metadata-qa-marc.jar %s [options] [file]", this.getClass().getCanonicalName());
    formatter.printHelp(message, options);
  }

  @Override
  public boolean readyToProcess() {
    return readyToProcess;
  }
}
8 changes: 5 additions & 3 deletions src/main/java/de/gwdg/metadataqa/marc/cli/Validator.java
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,14 @@ public class Validator implements MarcFileProcessor, Serializable {
private int vErrorId = 1;

public Validator(String[] args) throws ParseException {
parameters = new ValidatorParameters(args);
this(new ValidatorParameters(args));
}

public Validator(ValidatorParameters parameters) throws ParseException {
this.parameters = parameters;
options = parameters.getOptions();
// errorCounter = new TreeMap<>();
readyToProcess = true;
counter = 0;

}

public static void main(String[] args) {
Expand Down
Loading

0 comments on commit ed15bf0

Please sign in to comment.