diff --git a/CHANGELOG.md b/CHANGELOG.md index 81f5987028b..79024670c0a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -90,6 +90,8 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve - We fixed an issue where the password for a shared SQL database was not remembered [#6869](https://github.com/JabRef/jabref/issues/6869) - We fixed an issue where newly added entries were not synced to a shared SQL database [#7176](https://github.com/JabRef/jabref/issues/7176) - We fixed an issue where the PDF-Content importer threw an exception when no DOI number is present at the first page of the PDF document [#7203](https://github.com/JabRef/jabref/issues/7203) +- We fixed an issue where authors that only have last names were incorrectly identified as institutes when generating citation keys [#7199](https://github.com/JabRef/jabref/issues/7199) +- We fixed an issue where institutes were incorrectly identified as universities when generating citation keys [#6942](https://github.com/JabRef/jabref/issues/6942) ### Removed diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java index 2394dbfc2cf..cf4223b686d 100644 --- a/src/main/java/module-info.java +++ b/src/main/java/module-info.java @@ -57,6 +57,7 @@ requires reactfx; requires commons.cli; requires com.github.tomtung.latex2unicode; + requires fastparse; requires jbibtex; requires citeproc.java; requires antlr.runtime; diff --git a/src/main/java/org/jabref/logic/citationkeypattern/BracketedPattern.java b/src/main/java/org/jabref/logic/citationkeypattern/BracketedPattern.java index a3c93dfd898..0e5a7d4a518 100644 --- a/src/main/java/org/jabref/logic/citationkeypattern/BracketedPattern.java +++ b/src/main/java/org/jabref/logic/citationkeypattern/BracketedPattern.java @@ -1,5 +1,6 @@ package org.jabref.logic.citationkeypattern; +import java.text.Normalizer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -31,6 +32,7 @@ import 
org.jabref.model.entry.field.InternalField; import org.jabref.model.entry.field.StandardField; import org.jabref.model.strings.LatexToUnicodeAdapter; +import org.jabref.model.strings.StringUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -59,13 +61,14 @@ public class BracketedPattern { */ private static final Pattern NOT_CAPITAL_CHARACTER = Pattern.compile("[^A-Z]"); /** - * Matches with "({[A-Z]}+)", which should be used to abbreviate the name of an institution + * Matches uppercase English letters between "({" and "})", which should be used to abbreviate the name of an institution */ - private static final Pattern ABBREVIATIONS = Pattern.compile(".*\\(\\{[A-Z]+}\\).*"); + private static final Pattern INLINE_ABBREVIATION = Pattern.compile("(?<=\\(\\{)[A-Z]+(?=}\\))"); /** * Matches with "dep"/"dip", case insensitive */ private static final Pattern DEPARTMENTS = Pattern.compile("^d[ei]p.*", Pattern.CASE_INSENSITIVE); + private static final Pattern WHITESPACE = Pattern.compile("\\p{javaWhitespace}"); private enum Institution { SCHOOL, @@ -74,9 +77,9 @@ private enum Institution { TECHNOLOGY; /** - * Matches "uni" at the start of a string or after a space, case insensitive + * Matches "uni" followed by "v" or "b", or "uni" on its own, at the start of a string, case insensitive */ - private static final Pattern UNIVERSITIES = Pattern.compile("^uni.*", Pattern.CASE_INSENSITIVE); + private static final Pattern UNIVERSITIES = Pattern.compile("^uni(v|b|$).*", Pattern.CASE_INSENSITIVE); /** * Matches with "tech", case insensitive */ @@ -492,9 +495,9 @@ private static AuthorList createAuthorList(String unparsedAuthors) { for (Author author : AuthorList.parse(unparsedAuthors).getAuthors()) { // If the author is an institution, use an institution key instead of the full name String lastName = author.getLast() - .map(LatexToUnicodeAdapter::format) - .map(isInstitution(author) ? 
- BracketedPattern::generateInstitutionKey : Function.identity()) + .map(lastPart -> isInstitution(author) ? + generateInstitutionKey(lastPart) : + LatexToUnicodeAdapter.format(lastPart)) .orElse(null); authorList.addAuthor( author.getFirst().map(LatexToUnicodeAdapter::format).orElse(null), @@ -508,14 +511,15 @@ private static AuthorList createAuthorList(String unparsedAuthors) { } /** - * Checks if an author is an institution by verifying that only the last name is present. + * Checks if an author is an institution which can get a citation key from {@link #generateInstitutionKey(String)}. * * @param author the checked author - * @return true if only the last name is present + * @return true if only the last name is present and it contains at least one whitespace character. */ private static boolean isInstitution(Author author) { return author.getFirst().isEmpty() && author.getFirstAbbr().isEmpty() && author.getJr().isEmpty() - && author.getVon().isEmpty() && author.getLast().isPresent(); + && author.getVon().isEmpty() && author.getLast().isPresent() + && WHITESPACE.matcher(author.getLast().get()).find(); } /** @@ -658,52 +662,31 @@ public static String camelizeSignificantWordsInTitle(String title) { } public static String removeSmallWords(String title) { - StringJoiner stringJoiner = new StringJoiner(" "); String formattedTitle = formatTitle(title); try (Scanner titleScanner = new Scanner(formattedTitle)) { - mainl: - while (titleScanner.hasNext()) { - String word = titleScanner.next(); - - for (String smallWord : Word.SMALLER_WORDS) { - if (word.equalsIgnoreCase(smallWord)) { - continue mainl; - } - } - - stringJoiner.add(word); - } + return titleScanner.tokens() + .filter(Predicate.not( + Word::isSmallerWord)) + .collect(Collectors.joining(" ")); } - - return stringJoiner.toString(); } private static String getTitleWordsWithSpaces(int number, String title) { - StringJoiner stringJoiner = new StringJoiner(" "); String formattedTitle = formatTitle(title); - int 
words = 0; try (Scanner titleScanner = new Scanner(formattedTitle)) { - while (titleScanner.hasNext() && (words < number)) { - String word = titleScanner.next(); - - stringJoiner.add(word); - words++; - } + return titleScanner.tokens() + .limit(number) + .collect(Collectors.joining(" ")); } - - return stringJoiner.toString(); } private static String keepLettersAndDigitsOnly(String in) { - StringBuilder stringBuilder = new StringBuilder(); - for (int i = 0; i < in.length(); i++) { - if (Character.isLetterOrDigit(in.charAt(i))) { - stringBuilder.append(in.charAt(i)); - } - } - return stringBuilder.toString(); + return in.codePoints() + .filter(Character::isLetterOrDigit) + .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) + .toString(); } /** @@ -1131,51 +1114,6 @@ protected static List parseFieldAndModifiers(String arg) { return parts; } - /** - * Will remove diacritics from the content. - * - * - * @param content The content. - * @return The content without diacritics. - */ - private static String removeDiacritics(String content) { - if (content.isEmpty()) { - return content; - } - - String result = content; - // Replace umlaut with '?e' - result = result.replaceAll("\\{\\\\\"([a-zA-Z])\\}", "$1e"); - result = result.replaceAll("\\\\\"\\{([a-zA-Z])\\}", "$1e"); - result = result.replaceAll("\\\\\"([a-zA-Z])", "$1e"); - // Remove diacritics - result = result.replaceAll("\\{\\\\.([a-zA-Z])\\}", "$1"); - result = result.replaceAll("\\\\.\\{([a-zA-Z])\\}", "$1"); - result = result.replaceAll("\\\\.([a-zA-Z])", "$1"); - return result; - } - - /** - * Unifies umlauts. - * - * - * @param content The content. - * @return The content with unified diacritics. - */ - private static String unifyDiacritics(String content) { - return content.replaceAll( - "\\$\\\\ddot\\{\\\\mathrm\\{([^\\}])\\}\\}\\$", - "{\\\"$1}").replaceAll( - "(\\\\[^\\-a-zA-Z])\\{?([a-zA-Z])\\}?", - "{$1$2}"); - } - /** *

* An author or editor may be an institution, not a person. In that case the key generator builds very long keys, @@ -1248,15 +1186,20 @@ private static String generateInstitutionKey(String content) { return ""; } - String result = content; - result = unifyDiacritics(result); - result = result.replaceAll("^\\{", "").replaceAll("}$", ""); - Matcher matcher = ABBREVIATIONS.matcher(result); - if (matcher.matches()) { - return matcher.group(1); + Matcher matcher = INLINE_ABBREVIATION.matcher(content); + if (matcher.find()) { + return LatexToUnicodeAdapter.format(matcher.group()); } - result = removeDiacritics(result); + Optional unicodeFormattedName = LatexToUnicodeAdapter.parse(content); + if (unicodeFormattedName.isEmpty()) { + LOGGER.warn("{} could not be converted to unicode. This can result in an incorrect or missing institute citation key", content); + } + String result = unicodeFormattedName.orElse(Normalizer.normalize(content, Normalizer.Form.NFC)); + + // Special characters can't be allowed past this point because the citation key generator might replace them with multiple mixed-case characters + result = StringUtil.replaceSpecialCharacters(result); + String[] institutionNameTokens = result.split(","); // Key parts @@ -1335,7 +1278,6 @@ private static String generateInstitutionKey(String content) { * institution keyword and has an uppercase first letter, except univ/tech key word. 
* * @param word to check - * @return */ private static boolean noOtherInstitutionKeyWord(String word) { return !DEPARTMENTS.matcher(word).matches() diff --git a/src/main/java/org/jabref/logic/formatter/casechanger/Word.java b/src/main/java/org/jabref/logic/formatter/casechanger/Word.java index 65474d3d2fd..bf5eee51bb6 100644 --- a/src/main/java/org/jabref/logic/formatter/casechanger/Word.java +++ b/src/main/java/org/jabref/logic/formatter/casechanger/Word.java @@ -1,11 +1,11 @@ package org.jabref.logic.formatter.casechanger; import java.util.Arrays; -import java.util.Collections; import java.util.HashSet; import java.util.Locale; import java.util.Objects; import java.util.Set; +import java.util.stream.Collectors; /** * Represents a word in a title of a bibtex entry. @@ -13,7 +13,12 @@ * A word can have protected chars (enclosed in '{' '}') and may be a small (a, an, the, ...) word. */ public final class Word { + /** + * Set containing common lowercase function words + */ public static final Set SMALLER_WORDS; + private final char[] chars; + private final boolean[] protectedChars; static { Set smallerWords = new HashSet<>(); @@ -26,12 +31,11 @@ public final class Word { smallerWords.addAll(Arrays.asList("and", "but", "for", "nor", "or", "so", "yet")); // unmodifiable for thread safety - SMALLER_WORDS = Collections.unmodifiableSet(smallerWords); + SMALLER_WORDS = smallerWords.stream() + .map(word -> word.toLowerCase(Locale.ROOT)) + .collect(Collectors.toUnmodifiableSet()); } - private final char[] chars; - private final boolean[] protectedChars; - public Word(char[] chars, boolean[] protectedChars) { this.chars = Objects.requireNonNull(chars); this.protectedChars = Objects.requireNonNull(protectedChars); @@ -41,16 +45,21 @@ public Word(char[] chars, boolean[] protectedChars) { } } + /** + * Case-insensitive check against {@link Word#SMALLER_WORDS}. Checks for common function words. 
+ */ + public static boolean isSmallerWord(String word) { + return SMALLER_WORDS.contains(word.toLowerCase(Locale.ROOT)); + } + /** * Only change letters of the word that are unprotected to upper case. */ public void toUpperCase() { for (int i = 0; i < chars.length; i++) { - if (protectedChars[i]) { - continue; + if (!protectedChars[i]) { + chars[i] = Character.toUpperCase(chars[i]); } - - chars[i] = Character.toUpperCase(chars[i]); } } @@ -59,24 +68,18 @@ public void toUpperCase() { */ public void toLowerCase() { for (int i = 0; i < chars.length; i++) { - if (protectedChars[i]) { - continue; + if (!protectedChars[i]) { + chars[i] = Character.toLowerCase(chars[i]); } - - chars[i] = Character.toLowerCase(chars[i]); } } public void toUpperFirst() { for (int i = 0; i < chars.length; i++) { - if (protectedChars[i]) { - continue; - } - - if (i == 0) { - chars[i] = Character.toUpperCase(chars[i]); - } else { - chars[i] = Character.toLowerCase(chars[i]); + if (!protectedChars[i]) { + chars[i] = (i == 0) ? + Character.toUpperCase(chars[i]) : + Character.toLowerCase(chars[i]); } } } diff --git a/src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java b/src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java index c1eb8dc4248..c8bfb0a3a6b 100644 --- a/src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java +++ b/src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java @@ -2,26 +2,49 @@ import java.text.Normalizer; import java.util.Objects; +import java.util.Optional; import java.util.regex.Pattern; import com.github.tomtung.latex2unicode.LaTeX2Unicode; +import fastparse.core.Parsed; /** * Adapter class for the latex2unicode lib. 
This is an alternative to our LatexToUnicode class */ public class LatexToUnicodeAdapter { - private static Pattern underscoreMatcher = Pattern.compile("_(?!\\{)"); + private static final Pattern UNDERSCORE_MATCHER = Pattern.compile("_(?!\\{)"); - private static String replacementChar = "\uFFFD"; + private static final String REPLACEMENT_CHAR = "\uFFFD"; - private static Pattern underscorePlaceholderMatcher = Pattern.compile(replacementChar); + private static final Pattern UNDERSCORE_PLACEHOLDER_MATCHER = Pattern.compile(REPLACEMENT_CHAR); + /** + * Attempts to resolve all LaTeX in the String. + * + * @param inField a String containing LaTeX + * @return a String with LaTeX resolved into Unicode, or the NFC-normalized original String if the LaTeX could not be parsed + */ public static String format(String inField) { Objects.requireNonNull(inField); + return parse(inField).orElse(Normalizer.normalize(inField, Normalizer.Form.NFC)); + } - String toFormat = underscoreMatcher.matcher(inField).replaceAll(replacementChar); - toFormat = Normalizer.normalize(LaTeX2Unicode.convert(toFormat), Normalizer.Form.NFC); - return underscorePlaceholderMatcher.matcher(toFormat).replaceAll("_"); + /** + * Attempts to resolve all LaTeX in the String. + * + * @param inField a String containing LaTeX + * @return an {@code Optional} with LaTeX resolved into Unicode or {@code empty} on failure. 
+ */ + public static Optional parse(String inField) { + Objects.requireNonNull(inField); + String toFormat = UNDERSCORE_MATCHER.matcher(inField).replaceAll(REPLACEMENT_CHAR); + var parsingResult = LaTeX2Unicode.parse(toFormat); + if (parsingResult instanceof Parsed.Success) { + String text = parsingResult.get().value(); + toFormat = Normalizer.normalize(text, Normalizer.Form.NFC); + return Optional.of(UNDERSCORE_PLACEHOLDER_MATCHER.matcher(toFormat).replaceAll("_")); + } + return Optional.empty(); } } diff --git a/src/test/java/org/jabref/logic/citationkeypattern/BracketedPatternTest.java b/src/test/java/org/jabref/logic/citationkeypattern/BracketedPatternTest.java index c7899d32c0b..d42754fef16 100644 --- a/src/test/java/org/jabref/logic/citationkeypattern/BracketedPatternTest.java +++ b/src/test/java/org/jabref/logic/citationkeypattern/BracketedPatternTest.java @@ -292,4 +292,36 @@ void expandBracketsEmptyStringFromEmptyBrackets() { assertEquals("", BracketedPattern.expandBrackets("[]", null, bibEntry, null)); } + + @Test + void expandBracketsInstitutionAbbreviationFromProvidedAbbreviation() { + BibEntry bibEntry = new BibEntry() + .withField(StandardField.AUTHOR, "{European Union Aviation Safety Agency ({EUASABRACKET})}"); + + assertEquals("EUASABRACKET", BracketedPattern.expandBrackets("[auth]", null, bibEntry, null)); + } + + @Test + void expandBracketsInstitutionAbbreviationForAuthorContainingUnion() { + BibEntry bibEntry = new BibEntry() + .withField(StandardField.AUTHOR, "{European Union Aviation Safety Agency}"); + + assertEquals("EUASA", BracketedPattern.expandBrackets("[auth]", null, bibEntry, null)); + } + + @Test + void expandBracketsLastNameForAuthorStartingWithOnlyLastNameStartingWithLowerCase() { + BibEntry bibEntry = new BibEntry() + .withField(StandardField.AUTHOR, "{eBay}"); + + assertEquals("eBay", BracketedPattern.expandBrackets("[auth]", null, bibEntry, null)); + } + + @Test + void expandBracketsLastNameWithChineseCharacters() { + BibEntry 
bibEntry = new BibEntry() + .withField(StandardField.AUTHOR, "杨秀群"); + + assertEquals("杨秀群", BracketedPattern.expandBrackets("[auth]", null, bibEntry, null)); + } } diff --git a/src/test/java/org/jabref/logic/citationkeypattern/CitationKeyGeneratorTest.java b/src/test/java/org/jabref/logic/citationkeypattern/CitationKeyGeneratorTest.java index 23eb958cb2c..82498577af2 100644 --- a/src/test/java/org/jabref/logic/citationkeypattern/CitationKeyGeneratorTest.java +++ b/src/test/java/org/jabref/logic/citationkeypattern/CitationKeyGeneratorTest.java @@ -377,7 +377,7 @@ void testcrossrefUniversity() { .withField(StandardField.CROSSREF, "entry2"); BibEntry entry2 = new BibEntry() .withCitationKey("entry2") - .withField(StandardField.AUTHOR, "{Link{\\\"{o}}ping University}}"); + .withField(StandardField.AUTHOR, "{Link{\\\"{o}}ping University}"); database.insertEntry(entry1); database.insertEntry(entry2); @@ -403,7 +403,7 @@ void testcrossrefDepartment() { .withField(StandardField.CROSSREF, "entry2"); BibEntry entry2 = new BibEntry() .withCitationKey("entry2") - .withField(StandardField.AUTHOR, "{Link{\\\"{o}}ping University, Department of Electrical Engineering}}"); + .withField(StandardField.AUTHOR, "{Link{\\\"{o}}ping University, Department of Electrical Engineering}"); database.insertEntry(entry1); database.insertEntry(entry2); @@ -449,7 +449,7 @@ void testcrossrefSchool() { .withField(StandardField.CROSSREF, "entry2"); BibEntry entry2 = new BibEntry() .withCitationKey("entry2") - .withField(StandardField.AUTHOR, "{Link{\\\"{o}}ping University, School of Computer Engineering}}"); + .withField(StandardField.AUTHOR, "{Link{\\\"{o}}ping University, School of Computer Engineering}"); database.insertEntry(entry1); database.insertEntry(entry2);