Skip to content

Commit

Permalink
fix: [#528] Handle capital greek letters in PubMed authors to pass au…
Browse files Browse the repository at this point in the history
…thor field validation (#529)
  • Loading branch information
ursjoss authored Jun 25, 2023
1 parent e4ea8d1 commit ead1c69
Show file tree
Hide file tree
Showing 11 changed files with 125 additions and 14 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ NOTE: References to user stories are in the form Iteration/Story-Number.

.Added

////

.Changed

Expand All @@ -81,10 +80,11 @@ NOTE: References to user stories are in the form Iteration/Story-Number.
.Removed

.Fixed
- {url-issues}528[#528] Handle authors from PubMed containing Greek capital letters that would otherwise fail author string validation

.Security

////

[[v1.8.9]]
== {url-tree}1.8.9[1.8.9] -- 2023-04-29

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public class Paper extends IdScipamatoEntity<Long> implements CodeBoxAware, News
private static final String RE_S_WW = "\\s" + RE_WW;

/**
* Regex verifying the correctness of an Author string. Comprises of:
* Regex verifying the correctness of an Author string. Consists of:
* <ol>
* <li>a single author, made up of one or more "name words", each made up of
* <ul>
Expand Down Expand Up @@ -87,7 +87,7 @@ public class Paper extends IdScipamatoEntity<Long> implements CodeBoxAware, News

/**
* Regex to validate DOIs. Does not capture the full range of possible DOIs, but
* nearly all of the likely ones.
* nearly all the likely ones.
*
* <ol>
* <li>starting with {@literal 10} followed by a period</li>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,22 @@ internal class PaperTest : Jsr303ValidatedEntityTest<Paper>(Paper::class.java) {
verifyFailedAuthorValidation(p, invalidValue)
}

@Test
fun validatingPaper_withGreekCapitalAlphaInAuthor_insteadOfRegularA_fails() {
    // Sanity check: the regular latin capital letter A has character code 65.
    "A".first().code shouldBeEqualTo 65

    // Author string as found in PM ID 35469927: the character following the "O"
    // is a greek capital alpha (code 913), not a latin "A".
    val authorWithGreekAlpha = "Sindosi OΑ."
    val initials = authorWithGreekAlpha.split(" ")[1]
    initials.drop(1).first().code shouldBeEqualTo 913

    val paper = newValidEntity().also { it.authors = authorWithGreekAlpha }

    verifyFailedAuthorValidation(paper, authorWithGreekAlpha)
}

@Test
fun validatingPaper_withSingleAuthorWithoutFirstname_withoutPeriod_fails() {
val invalidValue = "Turner"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import org.amshove.kluent.shouldBeEqualTo
import org.amshove.kluent.shouldBeInstanceOf
import org.amshove.kluent.shouldBeNull
import org.amshove.kluent.shouldContain
import org.amshove.kluent.shouldContainAll
import org.amshove.kluent.shouldContainSame
import org.junit.jupiter.api.Test

internal class PubmedAuthorParserTest {
Expand Down Expand Up @@ -65,7 +65,7 @@ internal class PubmedAuthorParserTest {
p = PubmedAuthorParser("Ln FN 1st, Ln FN 2nd, Ln FN 3rd, Ln FN 4th, Ln FN 5th, Ln FN 100th, Ln FN.")
p.firstAuthor shouldBeEqualTo "Ln"
p.authors.map { it.lastName } shouldContain "Ln"
p.authors.map { it.firstName } shouldContainAll listOf(
p.authors.map { it.firstName } shouldContainSame listOf(
"FN 1st", "FN 2nd", "FN 3rd", "FN 4th", "FN 5th", "FN 100th", "FN"
)
}
Expand Down Expand Up @@ -96,17 +96,17 @@ internal class PubmedAuthorParserTest {
p = PubmedAuthorParser(
"Turner MC, Cohen A, Jerret M, Gapstur SM, Driver WR, Krewsky D, Beckermann BS, Samet JM."
)
p.authors.map { it.lastName } shouldContainAll listOf(
p.authors.map { it.lastName } shouldContainSame listOf(
"Turner", "Cohen", "Jerret", "Gapstur", "Driver", "Krewsky", "Beckermann", "Samet"
)
p.authors.map { it.firstName } shouldContainAll listOf("MC", "A", "M", "SM", "WR", "D", "BS", "JM")
p.authors.map { it.firstName } shouldContainSame listOf("MC", "A", "M", "SM", "WR", "D", "BS", "JM")
}

@Test
fun canDoUmlaute() {
p = PubmedAuthorParser("Flückiger P, Bäni HU.")
p.authors.map { it.lastName } shouldContainAll listOf("Flückiger", "Bäni")
p.authors.map { it.firstName } shouldContainAll listOf("P", "HU")
p.authors.map { it.lastName } shouldContainSame listOf("Flückiger", "Bäni")
p.authors.map { it.firstName } shouldContainSame listOf("P", "HU")
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package ch.difty.scipamato.core.pubmed

/**
 * Replaces capital Greek letters that look similar or even identical to regular
 * Latin capital letters with those Latin letters.
 *
 * Needed for processing Author strings from Pubmed that would otherwise not pass the
 * validation constraint implemented in the Paper class.
 *
 * Identified in PM ID 35469927 where one of the authors contains a Greek capital alpha.
 */
object GreekLetterTranslator {

    /**
     * Maps each supported Greek capital letter to its Latin look-alike.
     * Characters that are not a key of this map are never modified.
     */
    private val latinByGreek: Map<Char, Char> = mapOf(
        '\u0391' to 'A', // Alpha
        '\u0392' to 'B', // Beta
        '\u0395' to 'E', // Epsilon
        '\u0396' to 'Z', // Zeta
        '\u0397' to 'H', // Eta
        '\u0399' to 'I', // Iota
        '\u039A' to 'K', // Kappa
        '\u039C' to 'M', // Mu
        '\u039D' to 'N', // Nu
        '\u039F' to 'O', // Omicron
        '\u03A1' to 'P', // Rho
        '\u03A4' to 'T', // Tau
        '\u03A5' to 'Y', // Upsilon
        '\u03A7' to 'X', // Chi
    )

    /**
     * Returns a copy of [original] in which every Greek capital letter that has a
     * Latin look-alike is replaced by that Latin letter.
     *
     * All other characters (including lowercase Greek letters and Greek capitals
     * without a Latin look-alike) are kept as they are.
     *
     * @param original the string to translate, e.g. an author string
     * @return the translated string, having the same length as [original]
     */
    fun replaceGreekLetters(original: String): String = buildString(original.length) {
        // Single pass over the input: look each character up once instead of
        // scanning the whole string once per replaceable letter.
        for (char in original) append(latinByGreek[char] ?: char)
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ private val reasonRegex =
""".+Reason: <strong>([^<]+)</strong>+.+""".toRegex(RegexOption.DOT_MATCHES_ALL)

/**
* Data Class providing the [PubmedArticleFacade] or an error specific message
* Provides the [PubmedArticleFacade] or an error-specific message
* describing the problem that prevented the retrieval of the article.
*/
class PubmedArticleResult(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ public PubmedArticleSet unmarshal(@NotNull final String xmlString) throws IOExce
* <ul>
* <li>via API, e.g.
* {@code https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=25395026&retmode=xml}</li>
* <li>through the Web UI (e.g. https://www.ncbi.nlm.nih.gov/pubmed/25395026)
* <li>through the Web UI (e.g. <a href="https://www.ncbi.nlm.nih.gov/pubmed/25395026">...</a>)
* when sending to {@code file} in format {@code XML}</li>
* </ul>
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ class ScipamatoPubmedArticle extends AbstractPubmedArticleFacade {
setPmId(pmid.getvalue());
final AuthorList authorList = article.getAuthorList();
if (authorList != null) {
setAuthors(getAuthorsFrom(authorList));
setFirstAuthor(getFirstAuthorFrom(authorList));
setAuthors(GreekLetterTranslator.INSTANCE.replaceGreekLetters(getAuthorsFrom(authorList)));
setFirstAuthor(GreekLetterTranslator.INSTANCE.replaceGreekLetters(getFirstAuthorFrom(authorList)));
}
final boolean isAheadOfPrint = "aheadofprint".equals(pubmedArticle.getPubmedData() != null ?
pubmedArticle
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package ch.difty.scipamato.core.pubmed

import org.amshove.kluent.shouldBeEqualTo
import org.amshove.kluent.shouldBeFalse
import org.amshove.kluent.shouldContain
import org.amshove.kluent.shouldNotBeEqualTo
import org.amshove.kluent.shouldNotContain
import org.amshove.kluent.shouldStartWith
import org.junit.jupiter.api.Test

/**
 * Verifies that [GreekLetterTranslator] swaps capital greek letters for their
 * visually similar latin counterparts.
 */
class GreekLetterTranslatorTest {

    private val translator = GreekLetterTranslator

    @Test
    fun canParsePmId35469927_resulting_in_validA() {
        val greekAlpha = Char(913) // Capital Greek Letter Alpha
        val latinA = Char(65) // Regular Capital A
        val authors = "Markozannes G, Pantavou K, Rizos EC, Sindosi OΑ, Tagkas C, " +
            "Seyfried M, Saldanha IJ, Hatzianastassiou N, Nikolopoulos GK, Ntzani E."

        // Precondition: the fourth author carries a capital greek alpha (visually nearly identical to "A")
        val sindosi = authors.split(", ")[3]
        sindosi shouldStartWith "Sindosi"
        sindosi shouldContain greekAlpha

        val translated = translator.replaceGreekLetters(authors)

        translated shouldNotContain greekAlpha
        translated shouldContain latinA
    }

    @Test
    fun replacesSeveralGreekLetters_with_visuallySimilarLetters() {
        val wGreeks = "ΑΒΕΖΗΙΚΜΝΟΡΤΥΧ Α."
        val woGreek = "ABEZHIKMNOPTYX A."

        // Guard against accidentally comparing two identical (latin) strings
        assertLettersAreNotSame(wGreeks, woGreek)
        @Suppress("KotlinConstantConditions")
        (wGreeks == woGreek).shouldBeFalse()

        translator.replaceGreekLetters(wGreeks) shouldBeEqualTo woGreek
    }

    /**
     * Asserts that [one] and [two] - ignoring blanks and periods - have the same
     * length but differ in every single character position.
     */
    @Suppress("SameParameterValue")
    private fun assertLettersAreNotSame(one: String, two: String) {
        val ignored = setOf(' ', '.')
        val lettersOfOne = one.filterNot { it in ignored }
        val lettersOfTwo = two.filterNot { it in ignored }
        lettersOfOne.length shouldBeEqualTo lettersOfTwo.length
        lettersOfOne.zip(lettersOfTwo).forEach { (left, right) -> left shouldNotBeEqualTo right }
    }
}
3 changes: 3 additions & 0 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@
run app='core':
./gradlew :{{app}}-web:bootRun


# The JDWP agent is configured with socket transport, server mode, suspend=n (no wait for a
# debugger at startup) and address=*:5005, so a remote debugger can attach on port 5005.
# NOTE(review): org.gradle.jvmargs presumably configures the JVM running Gradle itself;
# confirm the agent actually reaches the application JVM started by bootRun.
# NOTE(review): address=*:5005 listens on all network interfaces - confirm this is intended.
# Runs the `:{{app}}-web:bootRun` Gradle task (app defaults to 'core') with a JDWP debug agent.
debug app='core':
./gradlew -Dorg.gradle.jvmargs="-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005" :{{app}}-web:bootRun
1 change: 1 addition & 0 deletions public/public-persistence-jooq/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ val props = file("src/integration-test/resources/application.properties").asProp

testing {
suites {
@Suppress("UNUSED_VARIABLE")
val integrationTest by existing {
dependencies {
implementation(libs.bundles.dbTest)
Expand Down

0 comments on commit ead1c69

Please sign in to comment.