Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial conference paper bibliography consistency check #10778

Merged
merged 4 commits into from
Jan 27, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
package org.jabref.logic.quality.consistency;

import java.nio.file.Path;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SequencedCollection;
import java.util.Set;

import org.jabref.model.database.BibDatabaseMode;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.BibEntryTypesManager;
import org.jabref.model.entry.field.Field;
import org.jabref.model.entry.types.EntryType;

public class PaperConsistencyCheck {

public record Result(Map<EntryType, EntryTypeResult> entryTypeToResultMap) {
}

public record EntryTypeResult(Collection<Field> fields, SequencedCollection<BibEntry> sortedEntries) {
}

/**
* Checks the consistency of the given entries by looking at the present and absent fields.
* <p>
* Computation takes place grouped by each entryType.
* Computes the fields set in all entries. In case entries of the same type has more fields defined, it is output.
* <p>
* This class <em>does not</em> check whether all required fields are present or if the fields are valid for the entry type.
* That result can a) be retrieved by using the JabRef UI and b) by checking the CSV output of {@link PaperConsistencyCheckResultCsvWriter#writeFindingsAsCsv(Result, Path, BibEntryTypesManager, BibDatabaseMode)}
*
* @implNote This class does not implement {@link org.jabref.logic.integrity.DatabaseChecker}, because it returns a list of {@link org.jabref.logic.integrity.IntegrityMessage}, which are too fine grained.
*/
public Result check(List<BibEntry> entries) {
Map<EntryType, Set<Field>> entryTypeToFieldsInAnyEntryMap = new HashMap<>();
Map<EntryType, Set<Field>> entryTypeToFieldsInAllEntriesMap = new HashMap<>();

Map<EntryType, Set<BibEntry>> entryTypeToEntriesMap = new HashMap<>();

entries.forEach(entry -> {
EntryType entryType = entry.getType();

Set<Field> fieldsInAnyEntry = entryTypeToFieldsInAnyEntryMap.computeIfAbsent(entryType, k -> new HashSet<>());
fieldsInAnyEntry.addAll(entry.getFields());

Set<Field> fieldsInAllEntries = entryTypeToFieldsInAllEntriesMap.computeIfAbsent(entryType, k -> new HashSet<>(entry.getFields()));
fieldsInAllEntries.retainAll(entry.getFields());

Set<BibEntry> entriesOfType = entryTypeToEntriesMap.computeIfAbsent(entryType, k -> new HashSet<>());
entriesOfType.add(entry);
});

Map<EntryType, EntryTypeResult> resultMap = new HashMap<>();

entryTypeToFieldsInAnyEntryMap.forEach((entryType, fields) -> {
Set<Field> commonFields = entryTypeToFieldsInAllEntriesMap.get(entryType);
assert commonFields != null;
Set<Field> uniqueFields = new HashSet<>(fields);
uniqueFields.removeAll(commonFields);

if (uniqueFields.isEmpty()) {
return;
}

List<BibEntry> sortedEntries = entryTypeToEntriesMap
.get(entryType).stream()
.filter(entry -> !entry.getFields().equals(commonFields))
.sorted((e1, e2) -> {
int first = e1.getFields().size() - e2.getFields().size();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Extract this to a comparator method stuff

if (first != 0) {
return first;
}
Iterator<String> it1 = e1.getFields().stream().map(Field::getName).sorted().iterator();
Iterator<String> it2 = e2.getFields().stream().map(Field::getName).sorted().iterator();
while (it1.hasNext() && it2.hasNext()) {
int fieldComparison = it1.next().compareTo(it2.next());
if (fieldComparison != 0) {
return fieldComparison;
}
}
assert !it1.hasNext() && !it2.hasNext();
return it1.hasNext() ? 1 : it2.hasNext() ? -1 : 0;
}).toList();
resultMap.put(entryType, new EntryTypeResult(uniqueFields, sortedEntries));
});

return new Result(resultMap);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package org.jabref.logic.quality.consistency;

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
import java.util.SequencedCollection;
import java.util.Set;
import java.util.stream.Collectors;

import org.jabref.model.database.BibDatabaseMode;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.BibEntryType;
import org.jabref.model.entry.BibEntryTypesManager;
import org.jabref.model.entry.field.BibField;
import org.jabref.model.entry.field.Field;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.jooq.lambda.Unchecked;

public class PaperConsistencyCheckResultCsvWriter {
public static void writeFindingsAsCsv(PaperConsistencyCheck.Result result, Path path) throws IOException {
writeFindingsAsCsv(result, path, new BibEntryTypesManager(), BibDatabaseMode.BIBTEX);
}

/**
* Outputs the findings as CSV.
* <p>
* Following symbols are used:
*
* <ul>
* <li><code>x</code> - required field is present</li>
* <li><code>o</code> - optional field is present</li>
* <li><code>?</code> - unknown field is present</li>
* </ul>
* <p>
* Note that this classification is based on JabRef's definition and might not match the publisher's definition.
*
* @implNote We could have implemented a <code>PaperConsistencyCheckResultFormatter</code>, but that would have been too much effort.
*/
public static void writeFindingsAsCsv(PaperConsistencyCheck.Result result, Path path, BibEntryTypesManager entryTypesManager, BibDatabaseMode bibDatabaseMode) throws IOException {
try (
OutputStreamWriter writer = new OutputStreamWriter(Files.newOutputStream(path), StandardCharsets.UTF_8);
CSVPrinter csvPrinter = new CSVPrinter(writer, CSVFormat.DEFAULT)
) {
List<Field> allFields = result.entryTypeToResultMap().values().stream()
.flatMap(entryTypeResult -> entryTypeResult.fields().stream())
.sorted(Comparator.comparing(Field::getName))
.distinct()
.toList();
int columnCount = allFields.size() + 2;

// heading
List<String> theHeading = new ArrayList(columnCount);
theHeading.add("entry type");
theHeading.add("citation key");
allFields.forEach(field -> {
theHeading.add(field.getDisplayName());
});
csvPrinter.printRecord(theHeading);

// content
result.entryTypeToResultMap().entrySet().stream()
.sorted(Comparator.comparing(entry -> entry.getKey().getName()))
.forEach(Unchecked.consumer(mapEntry -> {
String entryType = mapEntry.getKey().getDisplayName();

Optional<BibEntryType> bibEntryType = entryTypesManager.enrich(mapEntry.getKey(), bibDatabaseMode);
Set<Field> requiredFields = bibEntryType
.map(BibEntryType::getRequiredFields)
.stream()
.flatMap(orFieldsCollection -> orFieldsCollection.stream())
.flatMap(orFields -> orFields.getFields().stream())
.collect(Collectors.toSet());
Set<Field> optionalFields = bibEntryType
.map(BibEntryType::getOptionalFields)
.stream()
.flatMap(bibFieldSet -> bibFieldSet.stream())
.map(BibField::field)
.collect(Collectors.toSet());

PaperConsistencyCheck.EntryTypeResult entries = mapEntry.getValue();
SequencedCollection<BibEntry> bibEntries = entries.sortedEntries();

bibEntries.forEach(Unchecked.consumer(bibEntry -> {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Refactor to own method

List<String> theRecord = new ArrayList(columnCount);
theRecord.add(entryType);
theRecord.add(bibEntry.getCitationKey().orElse(""));
allFields.forEach(field -> {
theRecord.add(bibEntry.getField(field).map(value -> {
if (requiredFields.contains(field)) {
return "x";
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

make this constants

} else if (optionalFields.contains(field)) {
return "o";
} else {
return "?";
}
}).orElse("-"));
});
csvPrinter.printRecord(theRecord);
}));
}));
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
package org.jabref.logic.quality.consistency;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.fileformat.BibtexImporter;
import org.jabref.model.database.BibDatabaseContext;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.field.UnknownField;
import org.jabref.model.entry.types.StandardEntryType;
import org.jabref.model.util.DummyFileUpdateMonitor;

import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.Answers;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.mock;

/**
 * Tests {@link PaperConsistencyCheck} together with the CSV output of {@link PaperConsistencyCheckResultCsvWriter}.
 */
class PaperConsistencyCheckTest {

    // Only used by the disabled manual test
    private final BibtexImporter importer = new BibtexImporter(mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS), new DummyFileUpdateMonitor());

    @Test
    void checkSimpleLibrary(@TempDir Path tempDir) throws Exception {
        BibEntry first = new BibEntry(StandardEntryType.Article, "first")
                .withField(StandardField.AUTHOR, "Author One")
                .withField(StandardField.PAGES, "some pages");
        BibEntry second = new BibEntry(StandardEntryType.Article, "second")
                .withField(StandardField.AUTHOR, "Author One")
                .withField(StandardField.PUBLISHER, "publisher");
        PaperConsistencyCheck.Result result = new PaperConsistencyCheck().check(List.of(first, second));

        PaperConsistencyCheck.EntryTypeResult entryTypeResult = new PaperConsistencyCheck.EntryTypeResult(Set.of(StandardField.PAGES, StandardField.PUBLISHER), List.of(first, second));
        PaperConsistencyCheck.Result expected = new PaperConsistencyCheck.Result(Map.of(StandardEntryType.Article, entryTypeResult));
        assertEquals(expected, result);

        Path csvFile = tempDir.resolve("checkSimpleLibrary-result.csv");
        PaperConsistencyCheckResultCsvWriter.writeFindingsAsCsv(result, csvFile);
        assertEquals("""
                entry type,citation key,Pages,Publisher
                Article,first,o,-
                Article,second,-,?
                """, Files.readString(csvFile).replace("\r\n", "\n"));
    }

    @Test
    void checkDifferentOutputSymbols(@TempDir Path tempDir) throws Exception {
        UnknownField customField = new UnknownField("custom");
        BibEntry first = new BibEntry(StandardEntryType.Article, "first")
                .withField(StandardField.AUTHOR, "Author One") // required
                .withField(StandardField.TITLE, "Title") // required
                .withField(StandardField.PAGES, "some pages") // optional
                .withField(customField, "custom"); // unknown
        BibEntry second = new BibEntry(StandardEntryType.Article, "second")
                .withField(StandardField.AUTHOR, "Author One");
        PaperConsistencyCheck.Result result = new PaperConsistencyCheck().check(List.of(first, second));

        PaperConsistencyCheck.EntryTypeResult entryTypeResult = new PaperConsistencyCheck.EntryTypeResult(Set.of(StandardField.PAGES, StandardField.TITLE, customField), List.of(first));
        PaperConsistencyCheck.Result expected = new PaperConsistencyCheck.Result(Map.of(StandardEntryType.Article, entryTypeResult));
        assertEquals(expected, result);

        Path csvFile = tempDir.resolve("checkDifferentOutputSymbols-result.csv");
        PaperConsistencyCheckResultCsvWriter.writeFindingsAsCsv(result, csvFile);
        assertEquals("""
                entry type,citation key,Custom,Pages,Title
                Article,first,?,o,x
                """, Files.readString(csvFile).replace("\r\n", "\n"));
    }

    @Test
    void checkComplexLibrary(@TempDir Path tempDir) throws Exception {
        BibEntry first = new BibEntry(StandardEntryType.Article, "first")
                .withField(StandardField.AUTHOR, "Author One")
                .withField(StandardField.PAGES, "some pages");
        BibEntry second = new BibEntry(StandardEntryType.Article, "second")
                .withField(StandardField.AUTHOR, "Author One")
                .withField(StandardField.PUBLISHER, "publisher");

        BibEntry third = new BibEntry(StandardEntryType.InProceedings, "third")
                .withField(StandardField.AUTHOR, "Author One")
                .withField(StandardField.LOCATION, "location")
                .withField(StandardField.YEAR, "2024")
                .withField(StandardField.PAGES, "some pages");
        BibEntry fourth = new BibEntry(StandardEntryType.InProceedings, "fourth")
                .withField(StandardField.AUTHOR, "Author One")
                .withField(StandardField.YEAR, "2024")
                .withField(StandardField.PUBLISHER, "publisher");
        BibEntry fifth = new BibEntry(StandardEntryType.InProceedings, "fifth")
                .withField(StandardField.AUTHOR, "Author One")
                .withField(StandardField.YEAR, "2024");

        PaperConsistencyCheck.Result result = new PaperConsistencyCheck().check(List.of(first, second, third, fourth, fifth));

        PaperConsistencyCheck.EntryTypeResult articleResult = new PaperConsistencyCheck.EntryTypeResult(Set.of(StandardField.PAGES, StandardField.PUBLISHER), List.of(first, second));
        PaperConsistencyCheck.EntryTypeResult inProceedingsResult = new PaperConsistencyCheck.EntryTypeResult(Set.of(StandardField.PAGES, StandardField.PUBLISHER, StandardField.LOCATION), List.of(fourth, third));
        PaperConsistencyCheck.Result expected = new PaperConsistencyCheck.Result(Map.of(
                StandardEntryType.Article, articleResult,
                StandardEntryType.InProceedings, inProceedingsResult
        ));
        assertEquals(expected, result);

        // File is named after this test, like in the other tests
        Path csvFile = tempDir.resolve("checkComplexLibrary-result.csv");
        PaperConsistencyCheckResultCsvWriter.writeFindingsAsCsv(result, csvFile);
        assertEquals("""
                entry type,citation key,Location,Pages,Publisher
                Article,first,-,o,-
                Article,second,-,-,?
                InProceedings,fourth,-,-,o
                InProceedings,third,?,o,-
                """, Files.readString(csvFile).replace("\r\n", "\n"));
    }

    @Test
    void checkLibraryWithoutIssues(@TempDir Path tempDir) throws Exception {
        BibEntry first = new BibEntry(StandardEntryType.Article, "first")
                .withField(StandardField.AUTHOR, "Author One")
                .withField(StandardField.PAGES, "some pages");
        BibEntry second = new BibEntry(StandardEntryType.Article, "second")
                .withField(StandardField.AUTHOR, "Author One")
                .withField(StandardField.PAGES, "some pages");
        PaperConsistencyCheck.Result result = new PaperConsistencyCheck().check(List.of(first, second));

        PaperConsistencyCheck.Result expected = new PaperConsistencyCheck.Result(Map.of());
        assertEquals(expected, result);

        Path csvFile = tempDir.resolve("checkLibraryWithoutIssues-result.csv");
        PaperConsistencyCheckResultCsvWriter.writeFindingsAsCsv(result, csvFile);
        assertEquals("""
                entry type,citation key
                """, Files.readString(csvFile).replace("\r\n", "\n"));
    }

    @Test
    @Disabled("This test is only for manual generation of a report")
    void checkManualInput() throws Exception {
        // Machine-specific input path; adapt before enabling the test locally
        Path file = Path.of("C:\\TEMP\\JabRef\\biblio-anon.bib");
        Path csvFile = file.resolveSibling("biblio-cited.csv");
        BibDatabaseContext databaseContext = importer.importDatabase(file).getDatabaseContext();
        PaperConsistencyCheck.Result result = new PaperConsistencyCheck().check(databaseContext.getEntries());
        PaperConsistencyCheckResultCsvWriter.writeFindingsAsCsv(result, csvFile);
    }
}