diff --git a/scripts/common/name_normalisation.sh b/scripts/common/name_normalisation.sh index 0960ef11..f3241b2f 100644 --- a/scripts/common/name_normalisation.sh +++ b/scripts/common/name_normalisation.sh @@ -1,199 +1,5 @@ #!/bin/bash -TRANSLITERATION_API_URI="http://hmlendea-translit.duckdns.org:9584/Transliteration" - -function get-transliteration() { - RAW_TEXT="${1}" - LANGUAGE="${2}" - ENCODED_TEXT=$(echo "${RAW_TEXT}" | python -c "import urllib.parse, sys; print(urllib.parse.quote(sys.stdin.read()))") - TRANSLITERATION_API_ENDPOINT="${TRANSLITERATION_API_URI}?text=${ENCODED_TEXT}&language=${LANGUAGE}" - - curl --silent --insecure --location "${TRANSLITERATION_API_ENDPOINT}" --request GET -} - -function transliterate-name() { - LANGUAGE_CODE="${1}" && shift - RAW_NAME=$(echo "$*" | \ - sed 's/^"\(.*\)"$/\1/g' | \ - sed 's/^null//g' | \ - sed 's/%0A$//g') - LATIN_NAME="${RAW_NAME}" - - [ -z "${RAW_NAME}" ] && return - - [ "${LANGUAGE_CODE}" == "ary" ] && LANGUAGE_CODE="ar" - [ "${LANGUAGE_CODE}" == "arz" ] && LANGUAGE_CODE="ar" - [ "${LANGUAGE_CODE}" == "be-tarask" ] && LANGUAGE_CODE="be" - [ "${LANGUAGE_CODE}" == "pnt" ] && LANGUAGE_CODE="grc" - - if [ "${LANGUAGE_CODE}" == "ab" ] \ - || [ "${LANGUAGE_CODE}" == "ady" ] \ - || [ "${LANGUAGE_CODE}" == "ar" ] \ - || [ "${LANGUAGE_CODE}" == "ba" ] \ - || [ "${LANGUAGE_CODE}" == "be" ] \ - || [ "${LANGUAGE_CODE}" == "bg" ] \ - || [ "${LANGUAGE_CODE}" == "bn" ] \ - || [ "${LANGUAGE_CODE}" == "cv" ] \ - || [ "${LANGUAGE_CODE}" == "cu" ] \ - || [ "${LANGUAGE_CODE}" == "el" ] \ - || [ "${LANGUAGE_CODE}" == "grc" ] \ - || [ "${LANGUAGE_CODE}" == "gu" ] \ - || [ "${LANGUAGE_CODE}" == "he" ] \ - || [ "${LANGUAGE_CODE}" == "hi" ] \ - || [ "${LANGUAGE_CODE}" == "hy" ] \ - || [ "${LANGUAGE_CODE}" == "hyw" ] \ - || [ "${LANGUAGE_CODE}" == "iu" ] \ - || [ "${LANGUAGE_CODE}" == "ja" ] \ - || [ "${LANGUAGE_CODE}" == "ka" ] \ - || [ "${LANGUAGE_CODE}" == "kk" ] \ - || [ "${LANGUAGE_CODE}" == "kn" ] \ - || [ "${LANGUAGE_CODE}" == "ko" ] \ - || [ "${LANGUAGE_CODE}" == "ky" ] \ - || [ "${LANGUAGE_CODE}" == "mk" ] \ - || [ "${LANGUAGE_CODE}" == "ml" ] \ - || [ "${LANGUAGE_CODE}" == "mn" ] \ - || [ "${LANGUAGE_CODE}" == "mr" ] \ - || [ "${LANGUAGE_CODE}" == "os" ] \ - || [ "${LANGUAGE_CODE}" == "ru" ] \ - || [ "${LANGUAGE_CODE}" == "sa" ] \ - || [ "${LANGUAGE_CODE}" == "si" ] \ - || [ "${LANGUAGE_CODE}" == "sr" ] \ - || [ "${LANGUAGE_CODE}" == "ta" ] \ - || [ "${LANGUAGE_CODE}" == "te" ] \ - || [ "${LANGUAGE_CODE}" == "th" ] \ - || [ "${LANGUAGE_CODE}" == "udm" ] \ - || [ "${LANGUAGE_CODE}" == "uk" ] \ - || [ "${LANGUAGE_CODE}" == "zh" ] \ - || [ "${LANGUAGE_CODE}" == "zh-hans" ]; then - LATIN_NAME=$(get-transliteration "${RAW_NAME}" "${LANGUAGE_CODE}") - fi - - echo "${LATIN_NAME}" -} - -function normalise-name() { - local LANGUAGE_CODE="${1}" && shift - local NAME=$(echo "$*" | \ - sed 's/^\"\(.*\)\"$/\1/g' | \ - awk -F" - " '{print $1}' | \ - awk -F"/" '{print $1}' | \ - awk -F"(" '{print $1}' | \ - awk -F"," '{print $1}' | \ - sed \ - -e 's/\s*\([^<]*\).*/\1/g' -} - -function get-name-from-wikidata-label() { - local LANGUAGE_CODE="${1}" - - echo "${WIKIDATA_DATA}" | jq '.entities.'"${WIKIDATA_ID}"'.labels.'"\""${LANGUAGE_CODE}"\""'.value' -} - -function get-name-from-wikidata-sitelink() { - local LANGUAGE_CODE="${1}" - local SITELINK_TITLE="" - local NAME="" - - LANGUAGE_CODE="$(echo "${LANGUAGE_CODE}" | sed 's/-/_/g')" - SITELINK_TITLE=$(echo "${WIKIDATA_DATA}" | jq '.entities.'"${WIKIDATA_ID}"'.sitelinks.'"\""${LANGUAGE_CODE}wiki"\""'.title') - - echo "${SITELINK_TITLE}" -} - -function get-name-for-comparison() { - echo "${@}" | tr '[:upper:]' '[:lower:]' -} - -if ${GEONAMES_ENABLED}; then - echo "Getting the GeoNames default name..." - GEONAMES_DEFAULT_NAME=$(echo "${GEONAMES_DATA}" | sed 's/%NL%\s*/\n/g' | grep "" | sed 's/\s*\([^<]*\).*/\1/g') - GEONAMES_DEFAULT_NAME_FOR_COMPARISON="$(echo "${GEONAMES_DEFAULT_NAME}" | tr '[:upper:]' '[:lower:]')" -fi - -if ${WIKIDATA_ENABLED}; then - echo "Getting the WikiData default name..." - WIKIDATA_DEFAULT_NAME_RAW="$(get-name-from-wikidata-label "en")" - WIKIDATA_DEFAULT_NAME=$(normalise-name "en" "${WIKIDATA_DEFAULT_NAME_RAW}") - WIKIDATA_DEFAULT_NAME_FOR_COMPARISON="$(echo "${WIKIDATA_DEFAULT_NAME}" | tr '[:upper:]' '[:lower:]')" -fi +EXONYMSAPI_ENDPOINT="${EXONYMSAPI_URL}?geoNamesId=${GEONAMES_ID}&wikiDataId=${WIKIDATA_ID}" -MAIN_DEFAULT_NAME="${WIKIDATA_DEFAULT_NAME}" +echo "Fetching ${EXONYMSAPI_ENDPOINT}..." +EXONYMSAPI_RESPONSE=$(curl -s "${EXONYMSAPI_ENDPOINT}") -[ -z "${MAIN_DEFAULT_NAME}" ] && MAIN_DEFAULT_NAME="${GEONAMES_DEFAULT_NAME}" - -function isNameUsable() { - local LANGUAGE_CODE="${1}" - local NAME_RAW="${2}" - local NAME="" - local NAME_FOR_COMPARISON="" - - NAME=$(normalise-name "${LANGUAGE_CODE}" "${NAME_RAW}") - - if [ -z "${NAME}" ] || [ "${NAME}" == "null" ] || [ "${NAME}" == "Null" ]; then - return 1 # false - fi - - NAME_FOR_COMPARISON="$(get-name-for-comparison "${NAME}")" - - if [ "${LANGUAGE_CODE}" != "en" ]; then - if [ "${NAME_FOR_COMPARISON}" == "${GEONAMES_DEFAULT_NAME_FOR_COMPARISON}" ] || - [ "${NAME_FOR_COMPARISON}" == "${GEONAMES_DEFAULT_NAME_FOR_COMPARISON}'" ] || - [ "${NAME_FOR_COMPARISON}" == "${WIKIDATA_DEFAULT_NAME_FOR_COMPARISON}" ] || - [ "${NAME_FOR_COMPARISON}" == "${WIKIDATA_DEFAULT_NAME_FOR_COMPARISON}'" ]; then - return 1 # false - fi - fi - - return 0 # true -} - -function get-raw-name-for-language() { - local LANGUAGE_CODE="${1}" - local NAME="" - - if ${WIKIDATA_ENABLED}; then - NAME=$(get-name-from-wikidata-label "${LANGUAGE_CODE}") - - if ! isNameUsable "${LANGUAGE_CODE}" "${NAME}"; then - NAME=$(get-name-from-wikidata-sitelink "${LANGUAGE_CODE}") - fi - fi - - if ${GEONAMES_ENABLED}; then - if ! isNameUsable "${LANGUAGE_CODE}" "${NAME}"; then - NAME=$(get-name-from-geonames "${LANGUAGE_CODE}") - fi - fi - - if ! isNameUsable "${LANGUAGE_CODE}" "${NAME}"; then - NAME="" - fi - - echo "${NAME}" -} +MAIN_DEFAULT_NAME=$(echo "${EXONYMSAPI_RESPONSE}" | jq -r '.defaultName') function get-name-for-language() { local LANGUAGE_CODE="${1}" local NAME="" - NAME=$(get-raw-name-for-language "${LANGUAGE_CODE}") - - [ -z "${NAME}" ] && return + LANGUAGE_CODE=$(echo "${LANGUAGE_CODE}" | sed -E 's/([^\.]+)/"\1"/g; s/\./\./g') + NAME=$(echo "${EXONYMSAPI_RESPONSE}" | jq -r '.names.'"${LANGUAGE_CODE}") - NAME=$(normalise-name "${LANGUAGE_CODE}" "${NAME}") + [ "${NAME}" == "null" ] && return echo "${NAME}" } @@ -192,15 +90,13 @@ function get-name-line-2codes() { local LANGUAGE2_CODE="${3}" local LANGUAGE1_NAME=$(get-name-for-language "${LANGUAGE1_CODE}") - local LANGUAGE2_NAME_RAW="" local LANGUAGE2_NAME="" if [ -n "${LANGUAGE1_NAME}" ]; then get-name-line "${LANGUAGE_MCN_ID}" "${LANGUAGE1_CODE}" else if [ "${LANGUAGE1_CODE}" == "grc" ]; then - LANGUAGE2_NAME_RAW=$(get-raw-name-for-language "${LANGUAGE2_CODE}") - LANGUAGE2_NAME=$(normalise-name "${LANGUAGE1_CODE}" "${LANGUAGE2_NAME_RAW}") + LANGUAGE2_NAME=$(get-name-for-language "${LANGUAGE2_CODE}") [ -n "${LANGUAGE2_NAME}" ] && echo " " else get-name-line "${LANGUAGE_MCN_ID}" "${LANGUAGE2_CODE}"