Skip to content

Commit

Permalink
Updates to institution citation keys (#7210)
Browse files Browse the repository at this point in the history
* Add test cases

* Fix overly broad regexp

The regexp is still broad, but unless there are further complaints
 perhaps it is enough.

* Fix case-sensitivity in test case

* Fix inline abbreviation for institutes

* Drop test for short author

* Add test case

* Add test case

* Fix test case for single word author

* Fix un-escaped backslash in test case

* Fix unbalanced brackets in test cases

* Fix institute author abbreviations

* Readability modifications

* Add log output for generating university key

When generating a key from a university name it should contain at least
two parts, "university" and the university's name. If it does not it is
likely that the name contained latex that could not be resolved
correctly.

* Fix JavaDoc

* Update CHANGELOG.md

* Add log message on mis-parsed LaTeX

* Change fields to final

* Fix institute abbreviation with special characters

Some characters will be converted into a more BibTeX-friendly form during
citation key generation. Øresund Science Region should be abbreviated to
 OSR but instead becomes OeSR.

* Drop out-of-scope test case

* Codestyle change

* Update src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java

Co-authored-by: Christoph <[email protected]>

* Update src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java

Co-authored-by: Christoph <[email protected]>

* Update src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java

Co-authored-by: Christoph <[email protected]>

* Update src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java

Co-authored-by: Christoph <[email protected]>

* Removes unnecessary checked exception

* Fix missed NFC normalization

* Add test case

Co-authored-by: Christoph <[email protected]>
  • Loading branch information
k3KAW8Pnf7mkmdSMPHz27 and Siedlerchr authored Dec 28, 2020
1 parent a6749ed commit 78b08b5
Show file tree
Hide file tree
Showing 7 changed files with 128 additions and 125 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
- We fixed an issue where the password for a shared SQL database was not remembered [#6869](https://github.com/JabRef/jabref/issues/6869)
- We fixed an issue where newly added entries were not synced to a shared SQL database [#7176](https://github.com/JabRef/jabref/issues/7176)
- We fixed an issue where the PDF-Content importer threw an exception when no DOI number is present at the first page of the PDF document [#7203](https://github.com/JabRef/jabref/issues/7203)
- We fixed an issue where authors that only have last names were incorrectly identified as institutes when generating citation keys [#7199](https://github.com/JabRef/jabref/issues/7199)
- We fixed an issue where institutes were incorrectly identified as universities when generating citation keys [#6942](https://github.com/JabRef/jabref/issues/6942)

### Removed

Expand Down
1 change: 1 addition & 0 deletions src/main/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
requires reactfx;
requires commons.cli;
requires com.github.tomtung.latex2unicode;
requires fastparse;
requires jbibtex;
requires citeproc.java;
requires antlr.runtime;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.jabref.logic.citationkeypattern;

import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
Expand Down Expand Up @@ -31,6 +32,7 @@
import org.jabref.model.entry.field.InternalField;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.strings.LatexToUnicodeAdapter;
import org.jabref.model.strings.StringUtil;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -59,13 +61,14 @@ public class BracketedPattern {
*/
private static final Pattern NOT_CAPITAL_CHARACTER = Pattern.compile("[^A-Z]");
/**
* Matches with "({[A-Z]}+)", which should be used to abbreviate the name of an institution
* Matches uppercase english letters between "({" and "})", which should be used to abbreviate the name of an institution
*/
private static final Pattern ABBREVIATIONS = Pattern.compile(".*\\(\\{[A-Z]+}\\).*");
private static final Pattern INLINE_ABBREVIATION = Pattern.compile("(?<=\\(\\{)[A-Z]+(?=}\\))");
/**
* Matches with "dep"/"dip", case insensitive
*/
private static final Pattern DEPARTMENTS = Pattern.compile("^d[ei]p.*", Pattern.CASE_INSENSITIVE);
private static final Pattern WHITESPACE = Pattern.compile("\\p{javaWhitespace}");

private enum Institution {
SCHOOL,
Expand All @@ -74,9 +77,9 @@ private enum Institution {
TECHNOLOGY;

/**
* Matches "uni" at the start of a string or after a space, case insensitive
* Matches "uni" followed by "v" or "b", at the start of a string or after a space, case insensitive
*/
private static final Pattern UNIVERSITIES = Pattern.compile("^uni.*", Pattern.CASE_INSENSITIVE);
private static final Pattern UNIVERSITIES = Pattern.compile("^uni(v|b|$).*", Pattern.CASE_INSENSITIVE);
/**
* Matches with "tech", case insensitive
*/
Expand Down Expand Up @@ -492,9 +495,9 @@ private static AuthorList createAuthorList(String unparsedAuthors) {
for (Author author : AuthorList.parse(unparsedAuthors).getAuthors()) {
// If the author is an institution, use an institution key instead of the full name
String lastName = author.getLast()
.map(LatexToUnicodeAdapter::format)
.map(isInstitution(author) ?
BracketedPattern::generateInstitutionKey : Function.identity())
.map(lastPart -> isInstitution(author) ?
generateInstitutionKey(lastPart) :
LatexToUnicodeAdapter.format(lastPart))
.orElse(null);
authorList.addAuthor(
author.getFirst().map(LatexToUnicodeAdapter::format).orElse(null),
Expand All @@ -508,14 +511,15 @@ private static AuthorList createAuthorList(String unparsedAuthors) {
}

/**
* Checks if an author is an institution by verifying that only the last name is present.
* Checks if an author is an institution which can get a citation key from {@link #generateInstitutionKey(String)}.
*
* @param author the checked author
* @return true if only the last name is present
* @return true if only the last name is present and it contains at least one whitespace character.
*/
private static boolean isInstitution(Author author) {
return author.getFirst().isEmpty() && author.getFirstAbbr().isEmpty() && author.getJr().isEmpty()
&& author.getVon().isEmpty() && author.getLast().isPresent();
&& author.getVon().isEmpty() && author.getLast().isPresent()
&& WHITESPACE.matcher(author.getLast().get()).find();
}

/**
Expand Down Expand Up @@ -658,52 +662,31 @@ public static String camelizeSignificantWordsInTitle(String title) {
}

public static String removeSmallWords(String title) {
StringJoiner stringJoiner = new StringJoiner(" ");
String formattedTitle = formatTitle(title);

try (Scanner titleScanner = new Scanner(formattedTitle)) {
mainl:
while (titleScanner.hasNext()) {
String word = titleScanner.next();

for (String smallWord : Word.SMALLER_WORDS) {
if (word.equalsIgnoreCase(smallWord)) {
continue mainl;
}
}

stringJoiner.add(word);
}
return titleScanner.tokens()
.filter(Predicate.not(
Word::isSmallerWord))
.collect(Collectors.joining(" "));
}

return stringJoiner.toString();
}

private static String getTitleWordsWithSpaces(int number, String title) {
StringJoiner stringJoiner = new StringJoiner(" ");
String formattedTitle = formatTitle(title);
int words = 0;

try (Scanner titleScanner = new Scanner(formattedTitle)) {
while (titleScanner.hasNext() && (words < number)) {
String word = titleScanner.next();

stringJoiner.add(word);
words++;
}
return titleScanner.tokens()
.limit(number)
.collect(Collectors.joining(" "));
}

return stringJoiner.toString();
}

private static String keepLettersAndDigitsOnly(String in) {
StringBuilder stringBuilder = new StringBuilder();
for (int i = 0; i < in.length(); i++) {
if (Character.isLetterOrDigit(in.charAt(i))) {
stringBuilder.append(in.charAt(i));
}
}
return stringBuilder.toString();
return in.codePoints()
.filter(Character::isLetterOrDigit)
.collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
.toString();
}

/**
Expand Down Expand Up @@ -1131,51 +1114,6 @@ protected static List<String> parseFieldAndModifiers(String arg) {
return parts;
}

/**
* Will remove diacritics from the content.
* <ul>
* <li>Replaces umlaut: \"x with xe, e.g. \"o -> oe, \"u -> ue, etc.</li>
* <li>Removes all other diacritics: \?x -> x, e.g. \'a -> a, etc.</li>
* </ul>
*
* @param content The content.
* @return The content without diacritics.
*/
private static String removeDiacritics(String content) {
if (content.isEmpty()) {
return content;
}

String result = content;
// Replace umlaut with '?e'
result = result.replaceAll("\\{\\\\\"([a-zA-Z])\\}", "$1e");
result = result.replaceAll("\\\\\"\\{([a-zA-Z])\\}", "$1e");
result = result.replaceAll("\\\\\"([a-zA-Z])", "$1e");
// Remove diacritics
result = result.replaceAll("\\{\\\\.([a-zA-Z])\\}", "$1");
result = result.replaceAll("\\\\.\\{([a-zA-Z])\\}", "$1");
result = result.replaceAll("\\\\.([a-zA-Z])", "$1");
return result;
}

/**
* Unifies umlauts.
* <ul>
* <li>Replaces: $\ddot{\mathrm{X}}$ (an alternative umlaut) with: {\"X}</li>
* <li>Replaces: \?{X} and \?X with {\?X}, where ? is a diacritic symbol</li>
* </ul>
*
* @param content The content.
* @return The content with unified diacritics.
*/
private static String unifyDiacritics(String content) {
return content.replaceAll(
"\\$\\\\ddot\\{\\\\mathrm\\{([^\\}])\\}\\}\\$",
"{\\\"$1}").replaceAll(
"(\\\\[^\\-a-zA-Z])\\{?([a-zA-Z])\\}?",
"{$1$2}");
}

/**
* <p>
* An author or editor may be an institution, not a person. In that case the key generator builds very long keys,
Expand Down Expand Up @@ -1248,15 +1186,20 @@ private static String generateInstitutionKey(String content) {
return "";
}

String result = content;
result = unifyDiacritics(result);
result = result.replaceAll("^\\{", "").replaceAll("}$", "");
Matcher matcher = ABBREVIATIONS.matcher(result);
if (matcher.matches()) {
return matcher.group(1);
Matcher matcher = INLINE_ABBREVIATION.matcher(content);
if (matcher.find()) {
return LatexToUnicodeAdapter.format(matcher.group());
}

result = removeDiacritics(result);
Optional<String> unicodeFormattedName = LatexToUnicodeAdapter.parse(content);
if (unicodeFormattedName.isEmpty()) {
LOGGER.warn("{} could not be converted to unicode. This can result in an incorrect or missing institute citation key", content);
}
String result = unicodeFormattedName.orElse(Normalizer.normalize(content, Normalizer.Form.NFC));

// Special characters can't be allowed past this point because the citation key generator might replace them with multiple mixed-case characters
result = StringUtil.replaceSpecialCharacters(result);

String[] institutionNameTokens = result.split(",");

// Key parts
Expand Down Expand Up @@ -1335,7 +1278,6 @@ private static String generateInstitutionKey(String content) {
* institution keyword and has an uppercase first letter, except univ/tech key word.
*
* @param word to check
* @return
*/
private static boolean noOtherInstitutionKeyWord(String word) {
return !DEPARTMENTS.matcher(word).matches()
Expand Down
45 changes: 24 additions & 21 deletions src/main/java/org/jabref/logic/formatter/casechanger/Word.java
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
package org.jabref.logic.formatter.casechanger;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

/**
* Represents a word in a title of a bibtex entry.
* <p>
* A word can have protected chars (enclosed in '{' '}') and may be a small (a, an, the, ...) word.
*/
public final class Word {
/**
* Set containing common lowercase function words
*/
public static final Set<String> SMALLER_WORDS;
private final char[] chars;
private final boolean[] protectedChars;

static {
Set<String> smallerWords = new HashSet<>();
Expand All @@ -26,12 +31,11 @@ public final class Word {
smallerWords.addAll(Arrays.asList("and", "but", "for", "nor", "or", "so", "yet"));

// unmodifiable for thread safety
SMALLER_WORDS = Collections.unmodifiableSet(smallerWords);
SMALLER_WORDS = smallerWords.stream()
.map(word -> word.toLowerCase(Locale.ROOT))
.collect(Collectors.toUnmodifiableSet());
}

private final char[] chars;
private final boolean[] protectedChars;

public Word(char[] chars, boolean[] protectedChars) {
this.chars = Objects.requireNonNull(chars);
this.protectedChars = Objects.requireNonNull(protectedChars);
Expand All @@ -41,16 +45,21 @@ public Word(char[] chars, boolean[] protectedChars) {
}
}

/**
* Case-insensitive check against {@link Word#SMALLER_WORDS}. Checks for common function words.
*
* @param word the word to look up; it is lower-cased with {@link Locale#ROOT} before the lookup
* @return {@code true} if the lower-cased word is contained in {@link Word#SMALLER_WORDS}
*/
public static boolean isSmallerWord(String word) {
return SMALLER_WORDS.contains(word.toLowerCase(Locale.ROOT));
}

/**
* Only change letters of the word that are unprotected to upper case.
*/
public void toUpperCase() {
for (int i = 0; i < chars.length; i++) {
if (protectedChars[i]) {
continue;
if (!protectedChars[i]) {
chars[i] = Character.toUpperCase(chars[i]);
}

chars[i] = Character.toUpperCase(chars[i]);
}
}

Expand All @@ -59,24 +68,18 @@ public void toUpperCase() {
*/
public void toLowerCase() {
for (int i = 0; i < chars.length; i++) {
if (protectedChars[i]) {
continue;
if (!protectedChars[i]) {
chars[i] = Character.toLowerCase(chars[i]);
}

chars[i] = Character.toLowerCase(chars[i]);
}
}

public void toUpperFirst() {
for (int i = 0; i < chars.length; i++) {
if (protectedChars[i]) {
continue;
}

if (i == 0) {
chars[i] = Character.toUpperCase(chars[i]);
} else {
chars[i] = Character.toLowerCase(chars[i]);
if (!protectedChars[i]) {
chars[i] = (i == 0) ?
Character.toUpperCase(chars[i]) :
Character.toLowerCase(chars[i]);
}
}
}
Expand Down
35 changes: 29 additions & 6 deletions src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,49 @@

import java.text.Normalizer;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Pattern;

import com.github.tomtung.latex2unicode.LaTeX2Unicode;
import fastparse.core.Parsed;

/**
* Adapter class for the latex2unicode lib. This is an alternative to our LatexToUnicode class
*/
public class LatexToUnicodeAdapter {

private static Pattern underscoreMatcher = Pattern.compile("_(?!\\{)");
private static final Pattern UNDERSCORE_MATCHER = Pattern.compile("_(?!\\{)");

private static String replacementChar = "\uFFFD";
private static final String REPLACEMENT_CHAR = "\uFFFD";

private static Pattern underscorePlaceholderMatcher = Pattern.compile(replacementChar);
private static final Pattern UNDERSCORE_PLACEHOLDER_MATCHER = Pattern.compile(REPLACEMENT_CHAR);

/**
* Attempts to resolve all LaTeX in the String.
*
* @param inField a String containing LaTeX; must not be {@code null}
* @return a String with LaTeX resolved into Unicode, or the NFC-normalized original String if the LaTeX could not be parsed
* @throws NullPointerException if {@code inField} is {@code null}
*/
public static String format(String inField) {
Objects.requireNonNull(inField);
// When parsing fails the fallback is the input itself, only normalized to NFC
return parse(inField).orElse(Normalizer.normalize(inField, Normalizer.Form.NFC));
}

String toFormat = underscoreMatcher.matcher(inField).replaceAll(replacementChar);
toFormat = Normalizer.normalize(LaTeX2Unicode.convert(toFormat), Normalizer.Form.NFC);
return underscorePlaceholderMatcher.matcher(toFormat).replaceAll("_");
/**
* Attempts to resolve all LaTeX in the String.
* <p>
* Underscores that are not directly followed by "{" are replaced by U+FFFD before parsing and every U+FFFD in the
* parsed text is turned back into an underscore afterwards. The parsed text is normalized to NFC.
*
* @param inField a String containing LaTeX; must not be {@code null}
* @return an {@code Optional<String>} with LaTeX resolved into Unicode or {@code empty} on failure.
* @throws NullPointerException if {@code inField} is {@code null}
*/
public static Optional<String> parse(String inField) {
Objects.requireNonNull(inField);
// Hide plain underscores behind a placeholder (presumably so the parser does not read them as subscripts - TODO confirm)
String toFormat = UNDERSCORE_MATCHER.matcher(inField).replaceAll(REPLACEMENT_CHAR);
var parsingResult = LaTeX2Unicode.parse(toFormat);
if (parsingResult instanceof Parsed.Success) {
String text = parsingResult.get().value();
toFormat = Normalizer.normalize(text, Normalizer.Form.NFC);
// Restore the underscores; note that a U+FFFD already present in the input is turned into "_" as well
return Optional.of(UNDERSCORE_PLACEHOLDER_MATCHER.matcher(toFormat).replaceAll("_"));
}
// Anything other than a successful parse is reported as empty
return Optional.empty();
}
}
Loading

0 comments on commit 78b08b5

Please sign in to comment.