Skip to content

Commit

Permalink
Use geonames capital city as last-resort location lookup
Browse files Browse the repository at this point in the history
  • Loading branch information
nickynicolson committed Aug 24, 2022
1 parent 4bdaa43 commit e2f52d6
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 27 deletions.
8 changes: 7 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ wcvp_dist_url=https://www.dropbox.com/s/9vefyzzp978m2f1/wcvp_distribution.txt?dl
# GBIF backbone taxonomy, current snapshot (zip archive)
gbif_taxonomy_url=https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip
# TDWG WGSRPD level-3 regions as GeoJSON
tdwg_wgsrpd_l3_url=https://github.com/jiacona/tdwg-geojson/raw/master/tdwg-level3.geojson
# Index Herbariorum institutions modified since 2000-01-01 (%3E is URL-encoded ">")
ih_url="http://sweetgum.nybg.org/science/api/v1/institutions/search?dateModified=%3E01/01/2000&download=yes"
# geonames "cities15000" dump; capital cities (feature code PPLC) are
# filtered from it downstream by types2publisherlocations.py
geonames_capital_cities_url=http://download.geonames.org/export/dump/cities15000.zip

# Default interpreter launcher...
python_launch_cmd=python
# ...immediately overridden: with `=` the later assignment wins, so the
# effective value is always `winpty python` (needed for MSYS/Git-Bash on
# Windows). NOTE(review): non-Windows users must comment this line out;
# consider `?=` or an OS check so the first line can actually take effect.
python_launch_cmd=winpty python
Expand Down Expand Up @@ -45,6 +46,11 @@ downloads/ih.txt:
mkdir -p downloads
wget -O $@ $(ih_url)

# Download the geonames "cities15000" dump (capital cities — feature code
# PPLC — are filtered from it later by types2publisherlocations.py).
# Fetch to a temp file and rename on success: a failed or interrupted wget
# would otherwise leave a truncated $@ that make then treats as up to date.
downloads/cities15000.zip:
	mkdir -p $(@D)
	wget -O $@.tmp $(geonames_capital_cities_url)
	mv $@.tmp $@

dl: downloads/wcvp.txt downloads/wcvp_dist.txt downloads/gbif-taxonomy.zip downloads/tdwg_wgsrpd_l3.json

# Extract taxon file from GBIF backbone taxonomy
Expand Down Expand Up @@ -82,7 +88,7 @@ data/gbif-types.zip: data/gbif-type-download.id
wget -O $@ $(download_link)

# Process GBIF type data to add details of publishing organisation
data/gbif-typesloc.zip: types2publisherlocations.py data/gbif-types.zip downloads/ih.txt
# Process GBIF type data to add publishing-organisation locations.
# $^ expands to the prerequisites in order: script, GBIF types zip, IH dump,
# geonames dump — types2publisherlocations.py reads these as positional args
# (inputfile, inputfile_ih, inputfile_geonames), so prerequisite order matters.
data/gbif-typesloc.zip: types2publisherlocations.py data/gbif-types.zip downloads/ih.txt downloads/cities15000.zip
	$(python_launch_cmd) $^ $(limit_args) $@

# Analyse how many taxa have type material in GBIF
Expand Down
82 changes: 56 additions & 26 deletions types2publisherlocations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,26 @@
import argparse
from pygbif import registry

# Column headers for the geonames dump, which ships as a headerless TSV;
# passed to pandas.read_csv(names=...) when loading the file.
GEONAMES_COLUMNS = (
    'geonameid,name,asciiname,alternatenames,latitude,longitude,'
    'feature class,feature code,country code,cc2,'
    'admin1 code,admin2 code,admin3 code,admin4 code,'
    'population,elevation,dem,timezone,modification date'
).split(',')

def getOrganizationData(key):
organization_data = None
try:
Expand All @@ -18,6 +38,9 @@ def main():
parser.add_argument('--delimiter_gbif', type=str, default='\t')
parser.add_argument("inputfile_ih", type=str)
parser.add_argument('--delimiter_ih', type=str, default=',')
parser.add_argument("inputfile_geonames", type=str)
parser.add_argument('--delimiter_geonames', type=str, default='\t')

parser.add_argument("outputfile", type=str)
args = parser.parse_args()

Expand All @@ -33,6 +56,12 @@ def main():
df_ih = pd.read_csv(args.inputfile_ih, sep=args.delimiter_ih, nrows=args.limit,error_bad_lines=False)
print('Read {} IH lines from: {}'.format(len(df_ih), args.inputfile_ih))

# 1.3 Read geonames data file ===========================================================
df_gn = pd.read_csv(args.inputfile_geonames, sep=args.delimiter_geonames, nrows=args.limit,error_bad_lines=False, names=GEONAMES_COLUMNS)
print('Read {} geonames lines from: {}'.format(len(df_gn), args.inputfile_geonames))
df_gn.drop(df_gn[df_gn['feature code']!='PPLC'].index,inplace=True)
print('Retained {} geonames capital city lines'.format(len(df_gn)))

###########################################################################
# 2. Process publishingOrgKey and join
###########################################################################
Expand All @@ -49,40 +78,41 @@ def main():
,suffixes=['','_org'])

###########################################################################
# 3. Fill any gaps (those without lat/long in the GBIF registry) by doing
# a name lookup to IH
# 3. Fill any gaps (those without lat/long in the GBIF registry)
###########################################################################

#
# 3.1 Using IH - first on title, then on city =============================
for (local_column, ih_column) in {'title':'organization','city':'physicalCity'}.items():
location_mapper = dict()
# Establish a mask to find records with no lat/long
coordinates_missing_mask=df.latitude.isnull()&df.longitude.isnull()
# Loop over records with missing lat/long, try to find matches in IH on link_column:
for local_value in df[coordinates_missing_mask][local_column].unique():
mask = (df_ih[ih_column]==local_value)
if len(df_ih[mask]) > 0:
# Save in mapper data structure
location_mapper[local_value] = (df_ih[mask].head(n=1).latitude.iloc[0],df_ih[mask].head(n=1).longitude.iloc[0])
# Map IH derived lat/long data to temporary column
df.loc[coordinates_missing_mask,'location_temp']=df[coordinates_missing_mask][local_column].map(location_mapper)
# Read values from temp column into permanent lat/long home
coordinates_missing_mask = df.location_temp.notnull()
df.loc[coordinates_missing_mask,'latitude']=df[coordinates_missing_mask].location_temp.apply(lambda x: x[0])
df.loc[coordinates_missing_mask,'longitude']=df[coordinates_missing_mask].location_temp.apply(lambda x: x[1])
# Drop temporary column
df.drop(columns=['location_temp'],inplace=True)

coordinates_missing_mask=(df.latitude.isnull()&df.longitude.isnull())
print(df[coordinates_missing_mask].groupby(['publishingOrgKey','title','city','province','country']).size())
print(df[coordinates_missing_mask][['publishingOrgKey','title','city','province','country']].drop_duplicates())
print(df[coordinates_missing_mask].groupby('title').size().sum())
print(df[~coordinates_missing_mask].groupby('title').size().sum())
df = mapLocation(df, local_column, df_ih, ih_column)
#
# 3.2 Using geonames to get lat/long of capital city of country============
df = mapLocation(df, 'country', df_gn, 'country code')

###########################################################################
# 4. Output
###########################################################################
print('Outputting {} rows to {}'.format(len(df), args.outputfile))
df.to_csv(args.outputfile,sep='\t',index=False)

def mapLocation(df, local_column, df_lookup, lookup_column, lat_column='latitude', long_column='longitude'):
    """
    Fill missing latitude/longitude values in df by matching df[local_column]
    against df_lookup[lookup_column] and copying coordinates from the first
    matching lookup row.

    Only rows where BOTH latitude and longitude are null are candidates.
    df is modified in place and also returned for chaining.
    """
    # Candidate rows: both coordinates absent
    missing = df.latitude.isnull() & df.longitude.isnull()

    # Build value -> (lat, long) from the first matching row in the lookup table
    mapper = {}
    for value in df.loc[missing, local_column].unique():
        matches = df_lookup[df_lookup[lookup_column] == value]
        if not matches.empty:
            first = matches.iloc[0]
            mapper[value] = (first[lat_column], first[long_column])

    # Stage looked-up coordinate pairs in a scratch column (NaN where unmatched)
    df.loc[missing, 'location_temp'] = df.loc[missing, local_column].map(mapper)

    # Copy staged pairs into the permanent coordinate columns
    filled = df.location_temp.notnull()
    df.loc[filled, 'latitude'] = df.loc[filled, 'location_temp'].apply(lambda pair: pair[0])
    df.loc[filled, 'longitude'] = df.loc[filled, 'location_temp'].apply(lambda pair: pair[1])

    # Remove the scratch column before returning
    df.drop(columns=['location_temp'], inplace=True)
    return df

# Script entry point: run the CLI pipeline only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()

0 comments on commit e2f52d6

Please sign in to comment.