-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: [#528] Handle capital greek letters in PubMed authors to pass author field validation (#529)
- Loading branch information
Showing
11 changed files
with
125 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
39 changes: 39 additions & 0 deletions
39
core/core-pubmed-api/src/main/java/ch/difty/scipamato/core/pubmed/GreekLetterTranslator.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
package ch.difty.scipamato.core.pubmed | ||
|
||
/**
 * Replaces capital greek letters that look similar or even identical to regular latin
 * capital letters with those latin letters.
 *
 * Needed for processing author strings from PubMed that would otherwise not pass the
 * validation constraint implemented in the Paper class.
 *
 * Identified in PM ID 35469927 where one of the authors contains a greek capital alpha.
 */
object GreekLetterTranslator {

    /**
     * Maps each handled greek capital letter (key) to the latin capital letter (value)
     * it is replaced with. The greek letters are written as unicode escapes on purpose:
     * typed literally they would be indistinguishable from the latin letters next to them.
     */
    private val greekLetterReplacement: Map<Char, Char> = mapOf(
        '\u0391' to 'A', // Alpha
        '\u0392' to 'B', // Beta
        '\u0395' to 'E', // Epsilon
        '\u0396' to 'Z', // Zeta
        '\u0397' to 'H', // Eta
        '\u0399' to 'I', // Iota
        '\u039A' to 'K', // Kappa
        '\u039C' to 'M', // Mu
        '\u039D' to 'N', // Nu
        '\u039F' to 'O', // Omicron
        '\u03A1' to 'P', // Rho
        '\u03A4' to 'T', // Tau
        '\u03A5' to 'Y', // Upsilon
        '\u03A7' to 'X', // Chi
    )

    /**
     * Returns a copy of [original] in which every greek capital letter listed in
     * [greekLetterReplacement] is replaced with its latin look-alike. All other
     * characters are copied unchanged.
     *
     * The input is traversed exactly once, regardless of how many letters are mapped.
     *
     * @param original the text to clean up, e.g. an author string
     * @return the text with the mapped greek capitals replaced
     */
    fun replaceGreekLetters(original: String): String = buildString(original.length) {
        for (char in original) {
            // Characters without a mapping fall through untouched.
            append(greekLetterReplacement[char] ?: char)
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
52 changes: 52 additions & 0 deletions
52
...re-pubmed-api/src/test/kotlin/ch/difty/scipamato/core/pubmed/GreekLetterTranslatorTest.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
package ch.difty.scipamato.core.pubmed | ||
|
||
import org.amshove.kluent.shouldBeEqualTo | ||
import org.amshove.kluent.shouldBeFalse | ||
import org.amshove.kluent.shouldContain | ||
import org.amshove.kluent.shouldNotBeEqualTo | ||
import org.amshove.kluent.shouldNotContain | ||
import org.amshove.kluent.shouldStartWith | ||
import org.junit.jupiter.api.Test | ||
|
||
/**
 * Checks that [GreekLetterTranslator] swaps capital greek letters for the latin capitals
 * they resemble.
 */
class GreekLetterTranslatorTest {

    private val translator = GreekLetterTranslator

    @Test
    fun canParsePmId35469927_resulting_in_validA() {
        val greekAlpha = Char(913) // Capital Greek Letter Alpha
        val latinA = Char(65) // Regular Capital A
        val authors = "Markozannes G, Pantavou K, Rizos EC, Sindosi OΑ, Tagkas C, " +
            "Seyfried M, Saldanha IJ, Hatzianastassiou N, Nikolopoulos GK, Ntzani E."

        // Precondition: the fourth author really carries a greek alpha (nearly identical to "A").
        val sindosi = authors.split(", ")[3]
        sindosi.shouldStartWith("Sindosi")
        sindosi.shouldContain(greekAlpha)

        val translated = translator.replaceGreekLetters(authors)

        translated.shouldNotContain(greekAlpha)
        translated.shouldContain(latinA)
    }

    @Test
    fun replacesSeveralGreekLetters_with_visuallySimilarLetters() {
        val greekInput = "ΑΒΕΖΗΙΚΜΝΟΡΤΥΧ Α."
        val latinExpected = "ABEZHIKMNOPTYX A."

        // Guard against the two fixtures accidentally being the same characters.
        assertLettersAreNotSame(greekInput, latinExpected)
        @Suppress("KotlinConstantConditions")
        (greekInput == latinExpected).shouldBeFalse()

        translator.replaceGreekLetters(greekInput).shouldBeEqualTo(latinExpected)
    }

    /**
     * Asserts that [one] and [two], with spaces and dots removed, have the same length
     * but differ in every single position.
     */
    @Suppress("SameParameterValue")
    private fun assertLettersAreNotSame(one: String, two: String) {
        val ignored = setOf(' ', '.')
        val left = one.filter { it !in ignored }
        val right = two.filter { it !in ignored }
        left.length.shouldBeEqualTo(right.length)
        for (position in left.indices) {
            left[position].shouldNotBeEqualTo(right[position])
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters