-
-
Notifications
You must be signed in to change notification settings - Fork 2.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Initial conference paper bibliography consistency check #10778
Merged
Merged
Changes from 2 commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
93 changes: 93 additions & 0 deletions
93
src/main/java/org/jabref/logic/quality/consistency/PaperConsistencyCheck.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
package org.jabref.logic.quality.consistency; | ||
|
||
import java.nio.file.Path; | ||
import java.util.Collection; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.Iterator; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.SequencedCollection; | ||
import java.util.Set; | ||
|
||
import org.jabref.model.database.BibDatabaseMode; | ||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.entry.BibEntryTypesManager; | ||
import org.jabref.model.entry.field.Field; | ||
import org.jabref.model.entry.types.EntryType; | ||
|
||
public class PaperConsistencyCheck { | ||
|
||
public record Result(Map<EntryType, EntryTypeResult> entryTypeToResultMap) { | ||
} | ||
|
||
public record EntryTypeResult(Collection<Field> fields, SequencedCollection<BibEntry> sortedEntries) { | ||
} | ||
|
||
/** | ||
* Checks the consistency of the given entries by looking at the present and absent fields. | ||
* <p> | ||
* Computation takes place grouped by each entryType. | ||
* Computes the fields set in all entries. In case entries of the same type has more fields defined, it is output. | ||
* <p> | ||
* This class <em>does not</em> check whether all required fields are present or if the fields are valid for the entry type. | ||
* That result can a) be retrieved by using the JabRef UI and b) by checking the CSV output of {@link PaperConsistencyCheckResultCsvWriter#writeFindingsAsCsv(Result, Path, BibEntryTypesManager, BibDatabaseMode)} | ||
* | ||
* @implNote This class does not implement {@link org.jabref.logic.integrity.DatabaseChecker}, because it returns a list of {@link org.jabref.logic.integrity.IntegrityMessage}, which are too fine grained. | ||
*/ | ||
public Result check(List<BibEntry> entries) { | ||
Map<EntryType, Set<Field>> entryTypeToFieldsInAnyEntryMap = new HashMap<>(); | ||
Map<EntryType, Set<Field>> entryTypeToFieldsInAllEntriesMap = new HashMap<>(); | ||
|
||
Map<EntryType, Set<BibEntry>> entryTypeToEntriesMap = new HashMap<>(); | ||
|
||
entries.forEach(entry -> { | ||
EntryType entryType = entry.getType(); | ||
|
||
Set<Field> fieldsInAnyEntry = entryTypeToFieldsInAnyEntryMap.computeIfAbsent(entryType, k -> new HashSet<>()); | ||
fieldsInAnyEntry.addAll(entry.getFields()); | ||
|
||
Set<Field> fieldsInAllEntries = entryTypeToFieldsInAllEntriesMap.computeIfAbsent(entryType, k -> new HashSet<>(entry.getFields())); | ||
fieldsInAllEntries.retainAll(entry.getFields()); | ||
|
||
Set<BibEntry> entriesOfType = entryTypeToEntriesMap.computeIfAbsent(entryType, k -> new HashSet<>()); | ||
entriesOfType.add(entry); | ||
}); | ||
|
||
Map<EntryType, EntryTypeResult> resultMap = new HashMap<>(); | ||
|
||
entryTypeToFieldsInAnyEntryMap.forEach((entryType, fields) -> { | ||
Set<Field> commonFields = entryTypeToFieldsInAllEntriesMap.get(entryType); | ||
assert commonFields != null; | ||
Set<Field> uniqueFields = new HashSet<>(fields); | ||
uniqueFields.removeAll(commonFields); | ||
|
||
if (uniqueFields.isEmpty()) { | ||
return; | ||
} | ||
|
||
List<BibEntry> sortedEntries = entryTypeToEntriesMap | ||
.get(entryType).stream() | ||
.filter(entry -> !entry.getFields().equals(commonFields)) | ||
.sorted((e1, e2) -> { | ||
int first = e1.getFields().size() - e2.getFields().size(); | ||
if (first != 0) { | ||
return first; | ||
} | ||
Iterator<String> it1 = e1.getFields().stream().map(Field::getName).sorted().iterator(); | ||
Iterator<String> it2 = e2.getFields().stream().map(Field::getName).sorted().iterator(); | ||
while (it1.hasNext() && it2.hasNext()) { | ||
int fieldComparison = it1.next().compareTo(it2.next()); | ||
if (fieldComparison != 0) { | ||
return fieldComparison; | ||
} | ||
} | ||
assert !it1.hasNext() && !it2.hasNext(); | ||
return it1.hasNext() ? 1 : it2.hasNext() ? -1 : 0; | ||
}).toList(); | ||
resultMap.put(entryType, new EntryTypeResult(uniqueFields, sortedEntries)); | ||
}); | ||
|
||
return new Result(resultMap); | ||
} | ||
} |
111 changes: 111 additions & 0 deletions
111
src/main/java/org/jabref/logic/quality/consistency/PaperConsistencyCheckResultCsvWriter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
package org.jabref.logic.quality.consistency; | ||
|
||
import java.io.IOException; | ||
import java.io.OutputStreamWriter; | ||
import java.nio.charset.StandardCharsets; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.util.ArrayList; | ||
import java.util.Comparator; | ||
import java.util.List; | ||
import java.util.Optional; | ||
import java.util.SequencedCollection; | ||
import java.util.Set; | ||
import java.util.stream.Collectors; | ||
|
||
import org.jabref.model.database.BibDatabaseMode; | ||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.entry.BibEntryType; | ||
import org.jabref.model.entry.BibEntryTypesManager; | ||
import org.jabref.model.entry.field.BibField; | ||
import org.jabref.model.entry.field.Field; | ||
|
||
import org.apache.commons.csv.CSVFormat; | ||
import org.apache.commons.csv.CSVPrinter; | ||
import org.jooq.lambda.Unchecked; | ||
|
||
public class PaperConsistencyCheckResultCsvWriter { | ||
public static void writeFindingsAsCsv(PaperConsistencyCheck.Result result, Path path) throws IOException { | ||
writeFindingsAsCsv(result, path, new BibEntryTypesManager(), BibDatabaseMode.BIBTEX); | ||
} | ||
|
||
/** | ||
* Outputs the findings as CSV. | ||
* <p> | ||
* Following symbols are used: | ||
* | ||
* <ul> | ||
* <li><code>x</code> - required field is present</li> | ||
* <li><code>o</code> - optional field is present</li> | ||
* <li><code>?</code> - unknown field is present</li> | ||
* </ul> | ||
* <p> | ||
* Note that this classification is based on JabRef's definition and might not match the publisher's definition. | ||
* | ||
* @implNote We could have implemented a <code>PaperConsistencyCheckResultFormatter</code>, but that would have been too much effort. | ||
*/ | ||
public static void writeFindingsAsCsv(PaperConsistencyCheck.Result result, Path path, BibEntryTypesManager entryTypesManager, BibDatabaseMode bibDatabaseMode) throws IOException { | ||
try ( | ||
OutputStreamWriter writer = new OutputStreamWriter(Files.newOutputStream(path), StandardCharsets.UTF_8); | ||
CSVPrinter csvPrinter = new CSVPrinter(writer, CSVFormat.DEFAULT) | ||
) { | ||
List<Field> allFields = result.entryTypeToResultMap().values().stream() | ||
.flatMap(entryTypeResult -> entryTypeResult.fields().stream()) | ||
.sorted(Comparator.comparing(Field::getName)) | ||
.distinct() | ||
.toList(); | ||
int columnCount = allFields.size() + 2; | ||
|
||
// heading | ||
List<String> theHeading = new ArrayList(columnCount); | ||
theHeading.add("entry type"); | ||
theHeading.add("citation key"); | ||
allFields.forEach(field -> { | ||
theHeading.add(field.getDisplayName()); | ||
}); | ||
csvPrinter.printRecord(theHeading); | ||
|
||
// content | ||
result.entryTypeToResultMap().entrySet().stream() | ||
.sorted(Comparator.comparing(entry -> entry.getKey().getName())) | ||
.forEach(Unchecked.consumer(mapEntry -> { | ||
String entryType = mapEntry.getKey().getDisplayName(); | ||
|
||
Optional<BibEntryType> bibEntryType = entryTypesManager.enrich(mapEntry.getKey(), bibDatabaseMode); | ||
Set<Field> requiredFields = bibEntryType | ||
.map(BibEntryType::getRequiredFields) | ||
.stream() | ||
.flatMap(orFieldsCollection -> orFieldsCollection.stream()) | ||
.flatMap(orFields -> orFields.getFields().stream()) | ||
.collect(Collectors.toSet()); | ||
Set<Field> optionalFields = bibEntryType | ||
.map(BibEntryType::getOptionalFields) | ||
.stream() | ||
.flatMap(bibFieldSet -> bibFieldSet.stream()) | ||
.map(BibField::field) | ||
.collect(Collectors.toSet()); | ||
|
||
PaperConsistencyCheck.EntryTypeResult entries = mapEntry.getValue(); | ||
SequencedCollection<BibEntry> bibEntries = entries.sortedEntries(); | ||
|
||
bibEntries.forEach(Unchecked.consumer(bibEntry -> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Refactor to own method |
||
List<String> theRecord = new ArrayList(columnCount); | ||
theRecord.add(entryType); | ||
theRecord.add(bibEntry.getCitationKey().orElse("")); | ||
allFields.forEach(field -> { | ||
theRecord.add(bibEntry.getField(field).map(value -> { | ||
if (requiredFields.contains(field)) { | ||
return "x"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. make this constants |
||
} else if (optionalFields.contains(field)) { | ||
return "o"; | ||
} else { | ||
return "?"; | ||
} | ||
}).orElse("-")); | ||
}); | ||
csvPrinter.printRecord(theRecord); | ||
})); | ||
})); | ||
} | ||
} | ||
} |
149 changes: 149 additions & 0 deletions
149
src/test/java/org/jabref/logic/quality/consistency/PaperConsistencyCheckTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
package org.jabref.logic.quality.consistency; | ||
|
||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Set; | ||
|
||
import org.jabref.logic.importer.ImportFormatPreferences; | ||
import org.jabref.logic.importer.fileformat.BibtexImporter; | ||
import org.jabref.model.database.BibDatabaseContext; | ||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.entry.field.StandardField; | ||
import org.jabref.model.entry.field.UnknownField; | ||
import org.jabref.model.entry.types.StandardEntryType; | ||
import org.jabref.model.util.DummyFileUpdateMonitor; | ||
|
||
import org.junit.jupiter.api.Disabled; | ||
import org.junit.jupiter.api.Test; | ||
import org.junit.jupiter.api.io.TempDir; | ||
import org.mockito.Answers; | ||
|
||
import static org.junit.jupiter.api.Assertions.assertEquals; | ||
import static org.mockito.Mockito.mock; | ||
|
||
class PaperConsistencyCheckTest { | ||
|
||
private BibtexImporter importer = new BibtexImporter(mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS), new DummyFileUpdateMonitor()); | ||
|
||
@Test | ||
void checkSimpleLibrary(@TempDir Path tempDir) throws Exception { | ||
BibEntry first = new BibEntry(StandardEntryType.Article, "first") | ||
.withField(StandardField.AUTHOR, "Author One") | ||
.withField(StandardField.PAGES, "some pages"); | ||
BibEntry second = new BibEntry(StandardEntryType.Article, "second") | ||
.withField(StandardField.AUTHOR, "Author One") | ||
.withField(StandardField.PUBLISHER, "publisher"); | ||
PaperConsistencyCheck.Result result = new PaperConsistencyCheck().check(List.of(first, second)); | ||
|
||
PaperConsistencyCheck.EntryTypeResult entryTypeResult = new PaperConsistencyCheck.EntryTypeResult(Set.of(StandardField.PAGES, StandardField.PUBLISHER), List.of(first, second)); | ||
PaperConsistencyCheck.Result expected = new PaperConsistencyCheck.Result(Map.of(StandardEntryType.Article, entryTypeResult)); | ||
assertEquals(expected, result); | ||
|
||
Path csvFile = tempDir.resolve("checkSimpleLibrary-result.csv"); | ||
PaperConsistencyCheckResultCsvWriter.writeFindingsAsCsv(result, csvFile); | ||
assertEquals(""" | ||
entry type,citation key,Pages,Publisher | ||
Article,first,o,- | ||
Article,second,-,? | ||
""", Files.readString(csvFile).replace("\r\n", "\n")); | ||
} | ||
|
||
@Test | ||
void checkDifferentOutputSymbols(@TempDir Path tempDir) throws Exception { | ||
UnknownField customField = new UnknownField("custom"); | ||
BibEntry first = new BibEntry(StandardEntryType.Article, "first") | ||
.withField(StandardField.AUTHOR, "Author One") // required | ||
.withField(StandardField.TITLE, "Title") // required | ||
.withField(StandardField.PAGES, "some pages") // optional | ||
.withField(customField, "custom"); // unknown | ||
BibEntry second = new BibEntry(StandardEntryType.Article, "second") | ||
.withField(StandardField.AUTHOR, "Author One"); | ||
PaperConsistencyCheck.Result result = new PaperConsistencyCheck().check(List.of(first, second)); | ||
|
||
PaperConsistencyCheck.EntryTypeResult entryTypeResult = new PaperConsistencyCheck.EntryTypeResult(Set.of(StandardField.PAGES, StandardField.TITLE, customField), List.of(first)); | ||
PaperConsistencyCheck.Result expected = new PaperConsistencyCheck.Result(Map.of(StandardEntryType.Article, entryTypeResult)); | ||
assertEquals(expected, result); | ||
|
||
Path csvFile = tempDir.resolve("checkDifferentOutputSymbols-result.csv"); | ||
PaperConsistencyCheckResultCsvWriter.writeFindingsAsCsv(result, csvFile); | ||
assertEquals(""" | ||
entry type,citation key,Custom,Pages,Title | ||
Article,first,?,o,x | ||
""", Files.readString(csvFile).replace("\r\n", "\n")); | ||
} | ||
|
||
@Test | ||
void checkComplexLibrary(@TempDir Path tempDir) throws Exception { | ||
BibEntry first = new BibEntry(StandardEntryType.Article, "first") | ||
.withField(StandardField.AUTHOR, "Author One") | ||
.withField(StandardField.PAGES, "some pages"); | ||
BibEntry second = new BibEntry(StandardEntryType.Article, "second") | ||
.withField(StandardField.AUTHOR, "Author One") | ||
.withField(StandardField.PUBLISHER, "publisher"); | ||
|
||
BibEntry third = new BibEntry(StandardEntryType.InProceedings, "third") | ||
.withField(StandardField.AUTHOR, "Author One") | ||
.withField(StandardField.LOCATION, "location") | ||
.withField(StandardField.YEAR, "2024") | ||
.withField(StandardField.PAGES, "some pages"); | ||
BibEntry fourth = new BibEntry(StandardEntryType.InProceedings, "fourth") | ||
.withField(StandardField.AUTHOR, "Author One") | ||
.withField(StandardField.YEAR, "2024") | ||
.withField(StandardField.PUBLISHER, "publisher"); | ||
BibEntry fifth = new BibEntry(StandardEntryType.InProceedings, "fifth") | ||
.withField(StandardField.AUTHOR, "Author One") | ||
.withField(StandardField.YEAR, "2024"); | ||
|
||
PaperConsistencyCheck.Result result = new PaperConsistencyCheck().check(List.of(first, second, third, fourth, fifth)); | ||
|
||
PaperConsistencyCheck.EntryTypeResult articleResult = new PaperConsistencyCheck.EntryTypeResult(Set.of(StandardField.PAGES, StandardField.PUBLISHER), List.of(first, second)); | ||
PaperConsistencyCheck.EntryTypeResult inProceedingsResult = new PaperConsistencyCheck.EntryTypeResult(Set.of(StandardField.PAGES, StandardField.PUBLISHER, StandardField.LOCATION), List.of(fourth, third)); | ||
PaperConsistencyCheck.Result expected = new PaperConsistencyCheck.Result(Map.of( | ||
StandardEntryType.Article, articleResult, | ||
StandardEntryType.InProceedings, inProceedingsResult | ||
)); | ||
assertEquals(expected, result); | ||
|
||
Path csvFile = tempDir.resolve("checkSimpleLibrary-result.csv"); | ||
PaperConsistencyCheckResultCsvWriter.writeFindingsAsCsv(result, csvFile); | ||
assertEquals(""" | ||
entry type,citation key,Location,Pages,Publisher | ||
Article,first,-,o,- | ||
Article,second,-,-,? | ||
InProceedings,fourth,-,-,o | ||
InProceedings,third,?,o,- | ||
""", Files.readString(csvFile).replace("\r\n", "\n")); | ||
} | ||
|
||
@Test | ||
void checkLibraryWithoutIssues(@TempDir Path tempDir) throws Exception { | ||
BibEntry first = new BibEntry(StandardEntryType.Article, "first") | ||
.withField(StandardField.AUTHOR, "Author One") | ||
.withField(StandardField.PAGES, "some pages"); | ||
BibEntry second = new BibEntry(StandardEntryType.Article, "second") | ||
.withField(StandardField.AUTHOR, "Author One") | ||
.withField(StandardField.PAGES, "some pages"); | ||
PaperConsistencyCheck.Result result = new PaperConsistencyCheck().check(List.of(first, second)); | ||
|
||
PaperConsistencyCheck.Result expected = new PaperConsistencyCheck.Result(Map.of()); | ||
assertEquals(expected, result); | ||
|
||
Path csvFile = tempDir.resolve("checkLibraryWithoutIssues-result.csv"); | ||
PaperConsistencyCheckResultCsvWriter.writeFindingsAsCsv(result, csvFile); | ||
assertEquals(""" | ||
entry type,citation key | ||
""", Files.readString(csvFile).replace("\r\n", "\n")); | ||
} | ||
|
||
@Test | ||
@Disabled("This test is only for manual generation of a report") | ||
void checkManualInput() throws Exception { | ||
Path file = Path.of("C:\\TEMP\\JabRef\\biblio-anon.bib"); | ||
Path csvFile = file.resolveSibling("biblio-cited.csv"); | ||
BibDatabaseContext databaseContext = importer.importDatabase(file).getDatabaseContext(); | ||
PaperConsistencyCheck.Result result = new PaperConsistencyCheck().check(databaseContext.getEntries()); | ||
PaperConsistencyCheckResultCsvWriter.writeFindingsAsCsv(result, csvFile); | ||
} | ||
} |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Extract this into a separate comparator method.