diff --git a/Makefile b/Makefile index 14ab6ef..b1efc4f 100644 --- a/Makefile +++ b/Makefile @@ -120,7 +120,7 @@ data/taxa2gbiftypeavailability.csv data/taxa2gbiftypeavailability.yaml: taxa2gbi # Analyse how many taxa have type material published from within native range data/taxa2nativerangetypeavailability.csv data/taxa2nativerangetypeavailability.yaml: taxa2nativerangetypeavailability.py data/gbif2wcvp.csv downloads/wcvp_distribution.txt data/gbif-types.zip data/gbif-typesloc.zip downloads/gadm_410-levels.gpkg downloads/tdwg_wgsrpd_l3.json - $(python_launch_cmd) $^ $(limit_args) data/taxa2nativerangetypeavailability.csv data/taxa2nativerangetypeavailability.yaml + $(python_launch_cmd) $^ $(limit_args) --output_spatial_debug_info data/taxa2nativerangetypeavailability.csv data/taxa2nativerangetypeavailability.yaml ############################################################################### # Post-CBD diff --git a/taxa2nativerangetypeavailability.py b/taxa2nativerangetypeavailability.py index bd551e8..74b73f8 100644 --- a/taxa2nativerangetypeavailability.py +++ b/taxa2nativerangetypeavailability.py @@ -21,6 +21,8 @@ def main(): parser.add_argument('--delimiter_publ', type=str, default='\t') parser.add_argument('gadm_geopackage_file', type=str, help='Path to GADM geopackage file') parser.add_argument("inputfile_tdwg_wgsrpd_l3_json", type=str) + parser.add_argument("--output_spatial_debug_info", type=bool, default=False, action='store_true') + parser.add_argument('--output_spatial_debug_dir', type=str, default='data/') parser.add_argument("outputfile_data", type=str) parser.add_argument("outputfile_yaml", type=str) args = parser.parse_args() @@ -59,7 +61,7 @@ def main(): print('Dropped taxonomy outside date range ({}-date), retained {} lines'.format(args.year_min, len(df_tax))) # 1.4 Publishing organisation locations (GBIF) ============================ - df_publ = pd.read_csv(args.inputfile_publ, sep=args.delimiter_publ, nrows=args.limit, usecols=['publishingOrgKey','latitude','longitude','country']) + df_publ = pd.read_csv(args.inputfile_publ, sep=args.delimiter_publ, nrows=args.limit, usecols=['publishingOrgKey','latitude','longitude','country', 'title']) df_publ.drop_duplicates(inplace=True) print('Read {} GBIF publishing organisation lines from: {}'.format(len(df_publ), args.inputfile_publ)) @@ -70,9 +72,11 @@ def main(): # # 2.1 Convert publishing organization locations to points ================== # Make a geodataframe "df_gbif_point" where the geometry is a point built form the lat/long values - df_gbif_point = gpd.GeoDataFrame(df_publ[['publishingOrgKey','latitude','longitude']].drop_duplicates(), geometry=gpd.points_from_xy(df_publ['longitude'], df_publ['latitude'])) + df_gbif_point = gpd.GeoDataFrame(df_publ[['publishingOrgKey','latitude','longitude','country', 'title']].drop_duplicates(), geometry=gpd.points_from_xy(df_publ['longitude'], df_publ['latitude'])) print('Number of points requiring assignment to TDWG regions:', len(df_gbif_point)) - + df_gbif_point['geometry_original_point'] = df_gbif_point.geometry + df_gbif_point.rename(columns={'country':'country_gbif','title':'title_gbif'}, inplace=True) + # 2.2 Read GADM level 1 geojson format shape file ======================== df_gadm_l1 = gpd.read_file(args.gadm_geopackage_file,layer="ADM_1") df_gadm_l1['geometry_gadm_l1'] = df_gadm_l1.geometry @@ -88,6 +92,7 @@ def main(): # 2.4 Read TDWG WGSRPD L3 geojson format shape file ======================== df_tdwg_poly = gpd.read_file(args.inputfile_tdwg_wgsrpd_l3_json) + df_tdwg_poly['geometry_tdwg_l3'] = df_tdwg_poly.geometry print('Read {} TDWG WGSRPD l3 shapes from {}'.format(len(df_tdwg_poly), args.inputfile_tdwg_wgsrpd_l3_json)) # # 2.5 Determine intersection between the GADM representative point and the TDWG L3 polygon @@ -97,6 +102,9 @@ def main(): df_intersect.rename(columns={'index_left':'index_left_legacy','index_right':'index_right_legacy'}, inplace=True) df_intersect = df_intersect.sjoin(df_tdwg_poly, how="left") + if args.output_spatial_debug_info: + generateSpatialDebugInfo(df_intersect, outputdir=args.output_spatial_debug_dir) + ########################################################################### # 3. Integrate taxonomy (df_tax), occurrences (df_occ) and TDWG WGSRPD L3 # (df_intersect) in which the publisher is located @@ -170,5 +178,38 @@ def main(): # print('Outputting {} rows to {}'.format(len(df), args.outputfile_data)) # df.to_csv(args.outputfile_data,sep='\t',index=False) +import matplotlib.pyplot as plt +def generateSpatialDebugInfo(df, outputdir, orig_point_geometry_column_name='geometry_original_point', first_poly_geometry_column_name='geometry_gadm_l1', repr_point_geometry_column_name='geometry_gadm_l1_repr_point', final_poly_geometry_column_name='geometry_tdwg_l3'): + df['geometry_safe'] = df['geometry'] + for i, row in df.iterrows(): + # Create a figure with two subplots + fig, ax = plt.plot(figsize=(8,10)) + + # Plot original point + df.iloc[i]['geometry'] = df.iloc[i][orig_point_geometry_column_name] + df.iloc[i].plot(ax=ax, marker='x', color='red', markersize=5) + + # Plot GADM poly + df.iloc[i]['geometry'] = df.iloc[i][first_poly_geometry_column_name] + df.iloc[i].plot(ax=ax,color='red',alpha=0.1) + + # Plot representative point + df.iloc[i]['geometry'] = df.iloc[i][repr_point_geometry_column_name] + df.iloc[i].plot(ax=ax, marker='x', color='blue', markersize=5) + + # Plot TDWG poly + df.iloc[i]['geometry'] = df.iloc[i][final_poly_geometry_column_name] + df.iloc[i].plot(ax=ax,color='green',alpha=0.1) + + title = '{org_title} ({country})'.format(row['title_gbif'], row['country_gbif']) + plt.title(title) + + # Save the plot + figname = '{outputdir}/{id}.png'.format(outputdir = outputdir, id = i) + plt.savefig(figname) + + df['geometry'] = df['geometry_safe'] + df.drop_columns(['geometry_safe'], axis=1) + if __name__ == '__main__': main()