Skip to content

Commit

Permalink
Merge pull request #348 from hmlendea/exonymsapi
Browse files Browse the repository at this point in the history
Use ExonymsAPI for gathering location names
  • Loading branch information
hmlendea authored May 25, 2023
2 parents fe685e0 + 11bb3f7 commit cbe7042
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 310 deletions.
194 changes: 0 additions & 194 deletions scripts/common/name_normalisation.sh
Original file line number Diff line number Diff line change
@@ -1,199 +1,5 @@
#!/bin/bash

TRANSLITERATION_API_URI="http://hmlendea-translit.duckdns.org:9584/Transliteration"

# Asks the transliteration web service to transliterate a piece of text and
# prints the raw response.
#
# Globals:   TRANSLITERATION_API_URI (read) - base URI of the service
# Arguments: $1 - the text to transliterate
#            $2 - language code, forwarded as the `language` query parameter
# Outputs:   the service response, on stdout
# Returns:   the exit status of curl
function get-transliteration() {
    local RAW_TEXT="${1}"
    local LANGUAGE="${2}"
    local ENCODED_TEXT
    local TRANSLITERATION_API_ENDPOINT

    # The text is passed as an argument instead of being piped through `echo`,
    # so no trailing newline gets percent-encoded (%0A) into the query string.
    ENCODED_TEXT=$(python -c "import urllib.parse, sys; print(urllib.parse.quote(sys.argv[1]))" "${RAW_TEXT}")
    TRANSLITERATION_API_ENDPOINT="${TRANSLITERATION_API_URI}?text=${ENCODED_TEXT}&language=${LANGUAGE}"

    curl --silent --insecure --location "${TRANSLITERATION_API_ENDPOINT}" --request GET
}

# Prints the Latin-script form of a name.
#
# The name is first cleaned up (surrounding double quotes, a leading "null"
# and a trailing "%0A" are removed). Names in one of the languages listed
# below are sent to get-transliteration; every other name is printed as is.
#
# Arguments: $1   - language code of the name
#            $2.. - the name (all remaining arguments, joined by spaces)
# Outputs:   the (possibly transliterated) name on stdout; nothing when the
#            cleaned-up name is empty
function transliterate-name() {
    # Everything is local so that calling this function directly does not
    # overwrite same-named variables of the caller.
    local LANGUAGE_CODE="${1}" && shift
    local RAW_NAME
    local LATIN_NAME

    RAW_NAME=$(printf '%s\n' "$*" | sed \
        -e 's/^"\(.*\)"$/\1/g' \
        -e 's/^null//g' \
        -e 's/%0A$//g')
    LATIN_NAME="${RAW_NAME}"

    [ -z "${RAW_NAME}" ] && return 0

    # Language variants that are transliterated using another language's code.
    case "${LANGUAGE_CODE}" in
        ary|arz)   LANGUAGE_CODE="ar" ;;
        be-tarask) LANGUAGE_CODE="be" ;;
        pnt)       LANGUAGE_CODE="grc" ;;
    esac

    # Languages whose names are sent to the transliteration service.
    case "${LANGUAGE_CODE}" in
        ab|ady|ar|ba|be|bg|bn|cv|cu|el|grc|gu|he|hi|hy|hyw|iu|ja|ka|kk|kn|ko|ky|mk|ml|mn|mr|os|ru|sa|si|sr|ta|te|th|udm|uk|zh|zh-hans)
            LATIN_NAME=$(get-transliteration "${RAW_NAME}" "${LANGUAGE_CODE}")
            ;;
    esac

    printf '%s\n' "${LATIN_NAME}"
}

function normalise-name() {
local LANGUAGE_CODE="${1}" && shift
local NAME=$(echo "$*" | \
sed 's/^\"\(.*\)\"$/\1/g' | \
awk -F" - " '{print $1}' | \
awk -F"/" '{print $1}' | \
awk -F"(" '{print $1}' | \
awk -F"," '{print $1}' | \
sed \
-e 's/\s*<alternateName .*$//g' \
-e 's/[…]//g' \
-e 's/^\s*//g' \
-e 's/\s*$//g')

local P_ABBEY="[AaOo][bp][abd]\([ae][z]*[iy][ae]*\|ij\|tstv[oí]\)\|Benediktinerabtei"
local P_AGENCY="[Aa]gen[cț][ijy][a]*"
local P_ANCIENT="[Aa]ncient\|Antiikin [Aa]nti[i]*[ck]\(a\|in\)*\|Ar[c]*ha[ií][ac]"
local P_AUTONOMOUS_GOVERNMENT="[Aa][uv]tonom\(e|\noye\|ous\) \([Gg]overnment\|[Pp]ravitel’stvo\|[Rr]egering\)\|[Gg]obierno [Aa]ut[oó]nomo\|[Öö]zerk [Hh]ükümeti"
local P_CANTON="[CcKk][’]*[hy]*[aāe][i]*[nṇ][tṭ][’]*[aoóuū]n\(a\|i\|o\|s\|u[l]*\)*"
local P_CASTLE="[CcGgKk]a[i]*[sz][lt][ei]*[aál][il]*[eoulmn]*[a]*\|[Cc]h[aâ]teau\|Dvorac\|[KkQq]al[ae]s[iı]\|Z[aá]m[aeo][gk][y]*"
local P_CATHEDRAL="[CcKk]at[h]*[eé]dr[ai][kl][aeoó]*[s]*"
local P_CHURCH="[Bb]iserica\|[Cc]hiesa\|[Cc]hurch\|[Éé]glise\|[Ii]greja\|[Kk]yōkai"
local P_CITY="[Cc]iud[aá][dt]*\|[Cc]ivitas\|[CcSs]\(ee\|i\)[tṭ]\+[aàeiy]\|Nagara\|Oraș\(ul\)*\|Śahara\|Sich’i\|[Ss]tadt"
local P_COMMUNE="[CcKk]om[m]*un[ae]*\|[Kk]özség"
local P_COUNCIL="[Cc]o[u]*n[cs][ei]l[l]*\(iul\)\|[Cc]omhairle"
local P_COUNTRY="[Nn]egeri"
local P_COUNTY="[Cc]o[u]*[mn]t\(a\(do\|t\)\|y\)\|Landgra[a]*fs\(cha\(ft\|p\)\|tvo\)"
local P_DEPARTMENT="[DdḌḍ][eéi]p[’]*[aā][i]*r[tṭ][’]*[aei]*m[aeēi][e]*[nṇ]*[gtṭ]*[’]*\(as\|i\|o\|u\(l\|va\)*\)*\|Ilākhe\|Penbiran\|Tuṟai\|Vibhaaga\|Zhang Wàt"
local P_DESERT="Anapat\|[Aa]nialwch\|Çölü\|[Dd][i]*[eè]*[sșz][iy]*er[tz]\(h\|o\|ul\)*\|Eḍāri\|Gaineamhh\|Gurun\|Hoang\|Maru[bs]h\(tal\|ūmi\)\|[Mm]ortua\|Pālaivaṉam\|Pustynia\|Raṇa\|Sa[bm]ak[u]*\|Se wedhi\|shāmò\|Tá Laēy Saāi\|Vaalvnt"
local P_DIOCESE="[Dd]io[eít]*[cks][eēi][sz][eēi]*[s]*"
local P_DISTRICT="[Aa]pygarda\|[Bb]arrutia\|[Bb]ucağı\|Ḍāḥīẗ\|[Dd][h]*[iy]str[eiy][ckt]*[akt][eouy]*[als]*\|[Iiİi̇]l[cç]esi\|járás\|Jil[lh]*[aāeo][a]*\|Koān\|Māvaṭṭam\|[Pp]asuni\|[Pp]irrâdâh\|Qu\(ận\)*\|[Rr]a[iy]on[iu]\|sum"
local P_DUCHY="bǎijué\|[Dd][uü][ck]\([aá][dt]*[otu][l]*\|h[éy]\|lüğü\)\|Hertogdom\|Kadipaten"
local P_EMIRATE="Aēy Mí Raēy Dtà\|[ĀāEeÉéƏəIiYy]m[aāi]r[l]*[aàāẗhi][dğty]*\([aeiou][l]*\)*\|qiúcháng\|Saamiro\|Tiểu vương quốc\|T’ohuguk"
local P_FORT="\([CcKk][aá]str[aou][lm]*\|Festung\|[Ff]ort\(e\(tsya\)*\|ul\)*\|[Ff]ort\(ale[sz]a\|[e]*ress[e]*\)\|[Ff]ort[r]*e[t]*s[s]*[y]*[ae]*\|[Kk]repost\|[Tv]rdina\|[Yy]ōsai\|[Zz]amogy\)\( \(roman\|royale\)\)*"
local P_GMINA="[Gg][e]*m[e]*[ij]n[d]*[ae]"
local P_HUNDRED="[Hh][äe]r[r]*[ae]d\|[Hh]undred\|[Kk]ihlakunta"
local P_ISLAND="[Aa]raly\|Đảo\|[Ǧǧ]zīrẗ\|[Ii]l[hl]a\|[Ii]nsula\|[Ii]sl[ae]\|[Ii]sland\|[Îî]le\|[Nn][eḗ]sos\|Ostr[io]v\|Sŏm"
local P_KINGDOM="guó\|Irācciyam\|[Kk][eoö]ni[n]*[gk]r[e]*[iy][cej]*[hk]\|K[io]ng[e]*d[oø]m\(met\)*\|[Kk]irályság\|[Kk][o]*r[oa]l\(ev\)*stvo\|Ōkoku\|Rājy[a]*\|[Rr]egatul\|[Rr][eo][giy][an][eolu][m]*[e]*\|[Rr]īce\|[Tt]eyrnas"
local P_LAKE="Gölü\|[Ll]a\(c\|cul\|go\|ke\)\|[Nn][uú][u]*r\|[Oo]zero"
local P_LANGUAGE="[Bb][h]*[aā][a]*[sṣ][h]*[aā][a]*\|[Ll][l]*[aeií][mn][g]*[buv]*[ao]\(ge\)*"
local P_MOUNTAIN="\([Gg]e\)*[Bb]i[e]*rge[r]*\|[Dd]ağları\|[GgHh][ao]ra\|Ǧibāl\|[Mm][ouū][u]*n[tț[[aei]*\([gi]*[ln][es]\|ii\|s\)*\|[Pp]arvata[ṁ]*\|[Ss]hānmài"
local P_MONASTERY="[Kk]l[aáo][o]*[sš]t[eo]r\(is\)*\|\(\(R[eo][y]*al\|[BV]asilikó\) \)*[Mm][ăo]n[aăe]st[eèḗiíy]r\(e[a]*\|i\|io[a]*\|o\|y\)*\|[Mm]onaĥejo\|[Mm]osteiro\|[Ss]hu[u]*dōin"
local P_MUNICIPIUM="[Bb]elediyesi\|Chibang Chach’ije\|Chū-tī\|Đô thị tự trị\|[Kk]ong-[Ss]iā\|[Kk]otamadya\|[Mm]eūang\|[Mm][y]*un[i]*[t]*[cs]ip[’]*\([aā]*l[i]*[dtṭ][’]*\(a[ds]\|é\|et’i\|[iī]\|y\)\|i[ou][lm]*\)\|[Nn]agara [Ss]abhāva\|[Nn]a[gk][a]*r[aā]\(pālika\|ṭci\)\|[Pp]ašvaldība\|[Pp][a]*urasabh[āe]\|[Ss]avivaldybė"
local P_MUNICIPALITY="Bwrdeistref\|D[ḗií]mos\|O[bp]\([cćčš]\|s[hj]\)[t]*ina"
local P_NATIONAL_PARK="[Nn]ational [Pp]ark\|Par[cq]u[el] Na[ctț]ional\|[Vv]ườn [Qq]uốc"
local P_OASIS="[aā]l-[Ww]āḥāt\|[OoÓóŌō][syẏ]*[aáāeē][sz][h]*[aiīeėē][ans]*[uŭ]*\|Oūh Aēy Sít"
local P_PENINSULA="[Bb][aá]n[ ]*[dđ][aả]o\|[Dd]uoninsulo\|[Hh]antō\|[Ll]edenez\|[Nn]iemimaa\|[Pp][ao][luŭ][ouv]ostr[ao][uŭv]\|[Pp][eé]n[iíì][n]*[t]*[csz][ou][lł][aāe]\|[Pp]enrhyn\|Poàn-tó\|[Ss]emenanjung\|Tīpakaṟpam\|[Yy]arim [Oo]roli\|[Yy]arımadası\|[Žž]arym [Aa]raly"
local P_PLATEAU="Alt[io]p[il]*[aà]\(no\)*\|Àrd-thìr\|Daichi\|gāoyuán\|Hḍbẗ\|ordokia\|[Pp][’]*lat[’]*[e]*\([aå][nu]\(et\)*\|o\(s[iu]\)*\)\|[Pp]lošina\|[Pp]lynaukštė"
local P_PREFECTURE="[Pp]r[aäeé][e]*fe[ckt]t[uúū]r[ae]*"
local P_PROVINCE="eanangoddi\|[Ee]par[ck]hía\|[Ll]alawigan\|[Mm]ākāṇam\|Mḥāfẓẗ\|Mkoa\|Mqāṭʿẗ\|[Pp][’]*r[aāou][bpvw][ëií][nñ][t]*[csz]*[eėiíjoy]*[aeėnsz]*\|Pradēśa\|Pr[aā][a]*nt[y]*[a]*\|Rát\|[Ss][h]*[éě]ng\|Shuu\|suyu\|[Tt]alaith\|[VvWw]il[ao][jy][ae][ht][i]*"
local P_REGION="[Aa]ñcala\|[Bb]ölgesi\|[Ee]skualdea\|Gobolka\|[Kk]alāpaya\|Khu vực\|[Kk]shetr\|Kwáāen\|[Pp]akuti\|[Pp]aḷāta\|[Pp]eri\(f\|ph\)[eéē]r[e]*i[j]*a\|[Pp]iirkond\|[Pp]r[a]*desh[a]*\|[Pp]rāntaṁ\|[Rr][eé][gģhx][ij]*\([ãoóu][ou]*n*[ei]*[as]*\|st[aā]n\)\|[Rr]ijn"
local P_REPUBLIC="Cộng hòa\|[DdTt][aáä][aä]*[ʹ]*s[s]*[ei]*v[aäá][ʹ]*ld[di]\|[Dd][eēi]mokr[h]*atía\|gōnghé\|[Gg]weriniaeth\|[Jj]anarajaya\|Khiung-fò-koet\|Kongwaguk\|Köztársaság\|Kyōwa\( Koku\)*\|Olómìnira\|Praj[aā][a]*[s]*t[t]*a[a]*\(k\|ntra\)\|[Rr][eéi][s]*[ ]*p[’]*[aāuüùúy][ā’]*b[ba]*l[eií][’]*[cgkq][ck]*[’]*\([ai]\|as[ıy]\|en\|[hḥ]y\|i\|ue\)*\|[Ss]ăā-taā-rá-ná-rát\|[Tt]a[sz][ao]val[dt]\(a\|kund\)"
local P_RIVER="Abhainn\|Afon\|[Ff][il]u\(me\|viul\)\|Gawa\|Nadī\|Nhr\|[Rr]âu[l]*\|[Rr]iver\|Sungai"
local P_RUIN="[Rr]uin[ae]*"
local P_STATE="Bang\|[EeÉéIi]*[SsŜŝŜŝŠšŞş]*[h]*[tṭ][’]*[aeē][dtṭu][’]*[aeiıosu]*[l]*\|[Oo]sariik\|[Oo]st[’]*an[ıi]\|Ūlāīẗ\|[Uu]stoni\|valstija*"
local P_TEMPLE="[Dd]ēvālaya\(mu\)*\|[Kk]ōvil\|[Mm][a]*ndir[a]*\|Ná Tiān\|[Pp]agoda\|[Tt]emp[e]*l[eou]*[l]*"
local P_TOWNSHIP="[CcKk]anton[ae]*\(mendua\)*\|[Tt]ownship"
local P_UNIVERSITY="[Dd]aigaku\|\(Lā \)*[BbVv]i[sś][h]*[vw]\+\(a[bv]\)*idyāla[yẏ][a]*[ṁ]*\|[Oo]llscoil\|[Uu]niversit\(ate[a]a*\|y\)\|[Vv]idyaapith"
local P_VOIVODESHIP="V[éo][i]*[e]*vod[ae]*\(s\(hip\|tv[ií]\)\|t\(e\|ul\)\)"

local P_OF="\([AaĀā]p[h]*[a]*\|[Dd]\|[Dd][aeio][ls]*\|gia\|[Oo]f\|[Mm]ạc\|ng\|[Tt]a\|t[ēi]s\|[Tt]o[uy]\|van\|w\|[Yy]r\)[ \'\"’']"

local COMMON_PATTERNS="${P_ABBEY}\|${P_AGENCY}\|${P_ANCIENT}\|${P_AUTONOMOUS_GOVERNMENT}\|${P_CANTON}\|${P_CASTLE}\|${P_CATHEDRAL}"
COMMON_PATTERNS="${COMMON_PATTERNS}\|${P_CHURCH}\|${P_CITY}\|${P_COMMUNE}\|${P_COUNCIL}\|${P_COUNTRY}\|${P_COUNTY}\|${P_DESERT}"
COMMON_PATTERNS="${COMMON_PATTERNS}\|${P_DEPARTMENT}\|${P_DIOCESE}\|${P_DISTRICT}\|${P_DUCHY}\|${P_EMIRATE}\|${P_FORT}\|${P_GMINA}"
COMMON_PATTERNS="${COMMON_PATTERNS}\|${P_HUNDRED}\|${P_ISLAND}\|${P_KINGDOM}\|${P_LAKE}\|${P_LANGUAGE}\|${P_MONASTERY}\|${P_MOUNTAIN}"
COMMON_PATTERNS="${COMMON_PATTERNS}\|${P_MUNICIPIUM}\|${P_MUNICIPALITY}\|${P_NATIONAL_PARK}\|${P_OASIS}\|${P_PENINSULA}\|${P_PLATEAU}"
COMMON_PATTERNS="${COMMON_PATTERNS}\|${P_PREFECTURE}\|${P_PROVINCE}\|${P_REGION}\|${P_REPUBLIC}\|${P_RIVER}\|${P_RUIN}\|${P_STATE}"
COMMON_PATTERNS="${COMMON_PATTERNS}\|${P_TEMPLE}\|${P_TOWNSHIP}\|${P_UNIVERSITY}\|${P_VOIVODESHIP}"

local TRANSLITERATED_NAME=$(transliterate-name "${LANGUAGE_CODE}" "${NAME}")
local NORMALISED_NAME=$(echo "${TRANSLITERATED_NAME}" | \
perl -p0e 's/\r*\n/ /g' | \
awk -F" - " '{print $1}' | \
awk -F"/" '{print $1}' | \
awk -F"(" '{print $1}' | \
awk -F"," '{print $1}' | \
sed \
-e 's/^"\(.*\)"$/\1/g' \
-e 's/^\s*//g' \
-e 's/\s*$//g' \
-e 's/^ẖ/H̱/g' \
\
-e 's/ AG$//g' \
\
-e 's/P‍/P/g' \
-e 's/T‍/T/g' \
-e 's/p‍/p/g' \
-e 's/t‍/t/g')

NORMALISED_NAME=$(echo "${NORMALISED_NAME}" | \
sed \
-e 's/^\('"${COMMON_PATTERNS}"'\)\s\+\('"${P_OF}"'\)*//g' \
-e 's/[ ’-]\('"${COMMON_PATTERNS}"'\)$//g' \
\
-e 's/\([^\s]\)-\s*/\1-/g' \
-e 's/[·]//g' \
-e 's/\(.\)\1\1/\1\1/g' \
-e 's/^\s*//g' \
-e 's/\s*$//g' \
-e 's/\s\s*/ /g')

if [ "${LANGUAGE_CODE}" == "ko" ]; then
NORMALISED_NAME=$(sed 's/[’\"]//g' <<< "${NORMALISED_NAME}")
fi

if [ "${LANGUAGE_CODE}" != "ar" ] && \
[ "${LANGUAGE_CODE}" != "ga" ] && \
[ "${LANGUAGE_CODE}" != "jam" ] && \
[ "${LANGUAGE_CODE}" != "jbo" ]; then
NORMALISED_NAME=$(echo "${NORMALISED_NAME}" | sed 's/^\([a-z]\)/\U\1/g')
fi

[ "${LANGUAGE_CODE}" == "kaa" ] && NORMALISED_NAME=$(echo "${NORMALISED_NAME}" | sed "s/U'/Ú/g")
[ "${LANGUAGE_CODE}" == "lt" ] && NORMALISED_NAME=$(echo "${NORMALISED_NAME}" | sed 's/^Šv\./Šventasis/g')
[ "${LANGUAGE_CODE}" == "zh" ] && NORMALISED_NAME=$(echo "${NORMALISED_NAME}" | sed 's/-//g')
[ "${LANGUAGE_CODE}" == "ang" ] && NORMALISED_NAME=$(echo "${NORMALISED_NAME}" | sed 's/enrice$/e/g')

NORMALISED_NAME=$(echo "${NORMALISED_NAME}" | sed 's/^L'"'"'/l'"'"'/g')

echo "${NORMALISED_NAME}"
}

function nameToLocationId() {
local NAME="${1}"
local LOCATION_ID=""
Expand Down
128 changes: 12 additions & 116 deletions scripts/gather-names.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ fi

source "scripts/common/name_normalisation.sh"

GEONAMES_ENABLED=false
EXONYMSAPI_URL="http://hmlendea-exonyms.duckdns.org:8263/Exonyms"
GEONAMES_API_URL="http://api.geonames.org"
GEONAMES_USERNAME="geonamesfreeaccountt"
WIKIDATA_API_URL="https://www.wikidata.org"

GEONAMES_ENABLED=false
WIKIDATA_ENABLED=false
WIKIDATA_API_URL="https://www.wikidata.org"

while true; do
if [ "${1}" == "--geonamesid" ] || \
Expand All @@ -32,13 +32,11 @@ while true; do
fi
done

if ${WIKIDATA_ENABLED}; then
if ! ${GEONAMES_ENABLED} && ${WIKIDATA_ENABLED}; then
WIKIDATA_ENDPOINT="${WIKIDATA_API_URL}/wiki/Special:EntityData/${WIKIDATA_ID}.json"
echo "Fetching ${WIKIDATA_ENDPOINT}..."
WIKIDATA_DATA=$(curl -s "${WIKIDATA_ENDPOINT}")
fi

if ! ${GEONAMES_ENABLED} && ${WIKIDATA_ENABLED}; then
WIKIDATA_GEONAMES_IDS_COUNT=$(jq '.entities.'"${WIKIDATA_ID}"'.claims.P1566' <<< "${WIKIDATA_DATA}" | grep -c "external-id")

if [ "${WIKIDATA_GEONAMES_IDS_COUNT}" == "1" ]; then
Expand All @@ -57,121 +55,21 @@ if ! ${GEONAMES_ENABLED} && ${WIKIDATA_ENABLED}; then
fi
fi

if ${GEONAMES_ENABLED}; then
GEONAMES_ENDPOINT="${GEONAMES_API_URL}/get?username=${GEONAMES_USERNAME}&geonameId=${GEONAMES_ID}"
echo "Fetching ${GEONAMES_ENDPOINT}..."
GEONAMES_DATA=$(curl -s "${GEONAMES_ENDPOINT}" | perl -p0e 's/\r*//g' | perl -p0e 's/\n/%NL%/g')
fi

# Prints the GeoNames alternate name(s) recorded for a language.
#
# Globals:   GEONAMES_DATA (read) - GeoNames XML with newlines stored as %NL%
# Arguments: $1 - language code to look up
# Outputs:   one line per matching <alternateName> element
function get-name-from-geonames() {
    local lang="${1}"

    # Restore the line breaks, keep the <alternateName> lines of the wanted
    # language, then reduce each line to the element's text.
    sed 's/%NL%\s*/\n/g' <<< "${GEONAMES_DATA}" \
        | grep "<alternateName " \
        | grep "lang=\"${lang}\"" \
        | sed \
            -e 's/isPreferredName=\"[^\"]*\"\s*//g' \
            -e 's/\s*<alternateName lang=\"'"${lang}"'\">\([^<]*\).*/\1/g'
}

# Prints the Wikidata label of the current entity for a language, exactly as
# jq emits it (a JSON string, or `null` when there is no such label).
#
# Globals:   WIKIDATA_DATA (read) - the entity JSON document
#            WIKIDATA_ID   (read) - key of the entity inside `.entities`
# Arguments: $1 - language code of the wanted label
function get-name-from-wikidata-label() {
    local LANGUAGE_CODE="${1}"

    # The id and language are passed with --arg instead of being pasted into
    # the jq program, so they are never word-split or parsed as jq syntax.
    jq --arg id "${WIKIDATA_ID}" --arg lang "${LANGUAGE_CODE}" \
        '.entities[$id].labels[$lang].value' <<< "${WIKIDATA_DATA}"
}

# Prints the title of the current entity's sitelink for a language, exactly
# as jq emits it (a JSON string, or `null` when there is no such sitelink).
#
# The sitelink key is the language code with every "-" replaced by "_",
# followed by "wiki".
#
# Globals:   WIKIDATA_DATA (read) - the entity JSON document
#            WIKIDATA_ID   (read) - key of the entity inside `.entities`
# Arguments: $1 - language code of the wanted sitelink
function get-name-from-wikidata-sitelink() {
    local LANGUAGE_CODE="${1}"
    local SITELINK_KEY="${LANGUAGE_CODE//-/_}wiki"
    local SITELINK_TITLE=""

    # The id and key are passed with --arg instead of being pasted into the
    # jq program, so they are never word-split or parsed as jq syntax.
    SITELINK_TITLE=$(jq --arg id "${WIKIDATA_ID}" --arg site "${SITELINK_KEY}" \
        '.entities[$id].sitelinks[$site].title' <<< "${WIKIDATA_DATA}")

    printf '%s\n' "${SITELINK_TITLE}"
}

# Prints the arguments, joined by spaces, with upper-case letters converted
# to lower case by `tr`, so that names can be compared case-insensitively.
function get-name-for-comparison() {
    tr '[:upper:]' '[:lower:]' < <(echo "${@}")
}

if ${GEONAMES_ENABLED}; then
echo "Getting the GeoNames default name..."
GEONAMES_DEFAULT_NAME=$(echo "${GEONAMES_DATA}" | sed 's/%NL%\s*/\n/g' | grep "<name>" | sed 's/\s*<name>\([^<]*\).*/\1/g')
GEONAMES_DEFAULT_NAME_FOR_COMPARISON="$(echo "${GEONAMES_DEFAULT_NAME}" | tr '[:upper:]' '[:lower:]')"
fi

if ${WIKIDATA_ENABLED}; then
echo "Getting the WikiData default name..."
WIKIDATA_DEFAULT_NAME_RAW="$(get-name-from-wikidata-label "en")"
WIKIDATA_DEFAULT_NAME=$(normalise-name "en" "${WIKIDATA_DEFAULT_NAME_RAW}")
WIKIDATA_DEFAULT_NAME_FOR_COMPARISON="$(echo "${WIKIDATA_DEFAULT_NAME}" | tr '[:upper:]' '[:lower:]')"
fi
EXONYMSAPI_ENDPOINT="${EXONYMSAPI_URL}?geoNamesId=${GEONAMES_ID}&wikiDataId=${WIKIDATA_ID}"

MAIN_DEFAULT_NAME="${WIKIDATA_DEFAULT_NAME}"
echo "Fetching ${EXONYMSAPI_ENDPOINT}..."
EXONYMSAPI_RESPONSE=$(curl -s "${EXONYMSAPI_ENDPOINT}")

[ -z "${MAIN_DEFAULT_NAME}" ] && MAIN_DEFAULT_NAME="${GEONAMES_DEFAULT_NAME}"

# Decides whether a raw name is worth keeping for a language.
#
# The name is normalised first. It is rejected when the result is empty or
# "null"/"Null", and - for every language except "en" - when its lower-cased
# form equals one of the default names (optionally followed by an apostrophe).
#
# Globals:   GEONAMES_DEFAULT_NAME_FOR_COMPARISON (read)
#            WIKIDATA_DEFAULT_NAME_FOR_COMPARISON (read)
# Arguments: $1 - language code
#            $2 - raw name
# Returns:   0 when the name is usable, 1 otherwise
function isNameUsable() {
    local lang="${1}"
    local raw="${2}"
    local candidate=""
    local comparable=""

    candidate=$(normalise-name "${lang}" "${raw}")

    case "${candidate}" in
        ""|"null"|"Null")
            return 1 # false
            ;;
    esac

    comparable="$(get-name-for-comparison "${candidate}")"

    # English names are allowed to coincide with the default names.
    if [ "${lang}" == "en" ]; then
        return 0 # true
    fi

    case "${comparable}" in
        "${GEONAMES_DEFAULT_NAME_FOR_COMPARISON}" | \
        "${GEONAMES_DEFAULT_NAME_FOR_COMPARISON}'" | \
        "${WIKIDATA_DEFAULT_NAME_FOR_COMPARISON}" | \
        "${WIKIDATA_DEFAULT_NAME_FOR_COMPARISON}'")
            return 1 # false
            ;;
    esac

    return 0 # true
}

# Prints the first usable raw name found for a language, or an empty line.
#
# Sources are tried in this order, each only when the previous result is not
# usable according to isNameUsable: Wikidata label, Wikidata sitelink (both
# only when WIKIDATA_ENABLED), GeoNames alternate name (only when
# GEONAMES_ENABLED).
#
# Globals:   WIKIDATA_ENABLED, GEONAMES_ENABLED (read, executed as commands)
# Arguments: $1 - language code
function get-raw-name-for-language() {
    local lang="${1}"
    local found=""

    if ${WIKIDATA_ENABLED}; then
        found=$(get-name-from-wikidata-label "${lang}")
        isNameUsable "${lang}" "${found}" || found=$(get-name-from-wikidata-sitelink "${lang}")
    fi

    if ${GEONAMES_ENABLED} && ! isNameUsable "${lang}" "${found}"; then
        found=$(get-name-from-geonames "${lang}")
    fi

    # Whatever the last source returned is discarded when it is not usable.
    isNameUsable "${lang}" "${found}" || found=""

    echo "${found}"
}
MAIN_DEFAULT_NAME=$(echo "${EXONYMSAPI_RESPONSE}" | jq -r '.defaultName')

function get-name-for-language() {
local LANGUAGE_CODE="${1}"
local NAME=""

NAME=$(get-raw-name-for-language "${LANGUAGE_CODE}")

[ -z "${NAME}" ] && return
LANGUAGE_CODE=$(echo "${LANGUAGE_CODE}" | sed -E 's/([^\.]+)/"\1"/g; s/\./\./g')
NAME=$(echo "${EXONYMSAPI_RESPONSE}" | jq -r '.names.'"${LANGUAGE_CODE}")

NAME=$(normalise-name "${LANGUAGE_CODE}" "${NAME}")
[ "${NAME}" == "null" ] && return

echo "${NAME}"
}
Expand All @@ -192,15 +90,13 @@ function get-name-line-2codes() {
local LANGUAGE2_CODE="${3}"

local LANGUAGE1_NAME=$(get-name-for-language "${LANGUAGE1_CODE}")
local LANGUAGE2_NAME_RAW=""
local LANGUAGE2_NAME=""

if [ -n "${LANGUAGE1_NAME}" ]; then
get-name-line "${LANGUAGE_MCN_ID}" "${LANGUAGE1_CODE}"
else
if [ "${LANGUAGE1_CODE}" == "grc" ]; then
LANGUAGE2_NAME_RAW=$(get-raw-name-for-language "${LANGUAGE2_CODE}")
LANGUAGE2_NAME=$(normalise-name "${LANGUAGE1_CODE}" "${LANGUAGE2_NAME_RAW}")
LANGUAGE2_NAME=$(get-name-for-language "${LANGUAGE2_CODE}")
[ -n "${LANGUAGE2_NAME}" ] && echo " <Name language=\"${LANGUAGE_MCN_ID}\" value=\"${LANGUAGE2_NAME}\" />"
else
get-name-line "${LANGUAGE_MCN_ID}" "${LANGUAGE2_CODE}"
Expand Down

0 comments on commit cbe7042

Please sign in to comment.