From d049e56f4e747ac52655f1b147392eb14e7105ee Mon Sep 17 00:00:00 2001 From: Nicky Nicolson Date: Mon, 5 Dec 2022 14:31:21 +0000 Subject: [PATCH] Switch to output YAML --- Makefile | 30 ++++++++++++++--------------- taxa2gbiftypeavailability.py | 17 ++++++++++------ taxa2nativerangetypeavailability.py | 20 +++++++++++++------ 3 files changed, 40 insertions(+), 27 deletions(-) diff --git a/Makefile b/Makefile index 113deb9..0987e85 100644 --- a/Makefile +++ b/Makefile @@ -102,12 +102,12 @@ data/gbif-typesloc.zip: types2publisherlocations.py data/gbif-types.zip download # All types # Analyse how many taxa have type material in GBIF -data/taxa2gbiftypeavailability.csv data/taxa2gbiftypeavailability.md: taxa2gbiftypeavailability.py data/gbif2wcvp.csv data/gbif-types.zip - $(python_launch_cmd) $^ $(limit_args) data/taxa2gbiftypeavailability.csv data/taxa2gbiftypeavailability.md +data/taxa2gbiftypeavailability.csv data/taxa2gbiftypeavailability.yaml: taxa2gbiftypeavailability.py data/gbif2wcvp.csv data/gbif-types.zip + $(python_launch_cmd) $^ $(limit_args) data/taxa2gbiftypeavailability.csv data/taxa2gbiftypeavailability.yaml # Analyse how many taxa have type material published from within native range -data/taxa2nativerangetypeavailability.csv data/taxa2nativerangetypeavailability.md: taxa2nativerangetypeavailability.py data/gbif2wcvp.csv downloads/wcvp_dist.txt data/gbif-types.zip data/gbif-typesloc.zip downloads/tdwg_wgsrpd_l3.json - $(python_launch_cmd) $^ $(limit_args) data/taxa2nativerangetypeavailability.csv data/taxa2nativerangetypeavailability.md +data/taxa2nativerangetypeavailability.csv data/taxa2nativerangetypeavailability.yaml: taxa2nativerangetypeavailability.py data/gbif2wcvp.csv downloads/wcvp_dist.txt data/gbif-types.zip data/gbif-typesloc.zip downloads/tdwg_wgsrpd_l3.json + $(python_launch_cmd) $^ $(limit_args) data/taxa2nativerangetypeavailability.csv data/taxa2nativerangetypeavailability.yaml ############################################################################### # Post-CBD @@ -115,12 +115,12 @@ data/taxa2nativerangetypeavailability.csv data/taxa2nativerangetypeavailability. cbd_impl_year:=1992 # Analyse how many taxa have type material in GBIF -data/taxa2gbiftypeavailability-cbd.csv data/taxa2gbiftypeavailability-cbd.md: taxa2gbiftypeavailability.py data/gbif2wcvp.csv data/gbif-types.zip - $(python_launch_cmd) $^ $(limit_args) --year_min=$(cbd_impl_year) data/taxa2gbiftypeavailability-cbd.csv data/taxa2gbiftypeavailability-cbd.md +data/taxa2gbiftypeavailability-cbd.csv data/taxa2gbiftypeavailability-cbd.yaml: taxa2gbiftypeavailability.py data/gbif2wcvp.csv data/gbif-types.zip + $(python_launch_cmd) $^ $(limit_args) --year_min=$(cbd_impl_year) data/taxa2gbiftypeavailability-cbd.csv data/taxa2gbiftypeavailability-cbd.yaml # Analyse how many taxa have type material published from within native range -data/taxa2nativerangetypeavailability-cbd.csv data/taxa2nativerangetypeavailability-cbd.md: taxa2nativerangetypeavailability.py data/gbif2wcvp.csv downloads/wcvp_dist.txt data/gbif-types.zip data/gbif-typesloc.zip downloads/tdwg_wgsrpd_l3.json - $(python_launch_cmd) $^ $(limit_args) --year_min=$(cbd_impl_year) data/taxa2nativerangetypeavailability-cbd.csv data/taxa2nativerangetypeavailability-cbd.md +data/taxa2nativerangetypeavailability-cbd.csv data/taxa2nativerangetypeavailability-cbd.yaml: taxa2nativerangetypeavailability.py data/gbif2wcvp.csv downloads/wcvp_dist.txt data/gbif-types.zip data/gbif-typesloc.zip downloads/tdwg_wgsrpd_l3.json + $(python_launch_cmd) $^ $(limit_args) --year_min=$(cbd_impl_year) data/taxa2nativerangetypeavailability-cbd.csv data/taxa2nativerangetypeavailability-cbd.yaml ############################################################################### # Post-Nagoya @@ -128,23 +128,23 @@ data/taxa2nativerangetypeavailability-cbd.csv data/taxa2nativerangetypeavailabil nagoya_impl_year:=2014 # Analyse how many taxa have type material in GBIF -data/taxa2gbiftypeavailability-nagoya.csv data/taxa2gbiftypeavailability-nagoya.md: taxa2gbiftypeavailability.py data/gbif2wcvp.csv data/gbif-types.zip - $(python_launch_cmd) $^ $(limit_args) --year_min=$(nagoya_impl_year) data/taxa2gbiftypeavailability-nagoya.csv data/taxa2gbiftypeavailability-nagoya.md +data/taxa2gbiftypeavailability-nagoya.csv data/taxa2gbiftypeavailability-nagoya.yaml: taxa2gbiftypeavailability.py data/gbif2wcvp.csv data/gbif-types.zip + $(python_launch_cmd) $^ $(limit_args) --year_min=$(nagoya_impl_year) data/taxa2gbiftypeavailability-nagoya.csv data/taxa2gbiftypeavailability-nagoya.yaml # Analyse how many taxa have type material published from within native range -data/taxa2nativerangetypeavailability-nagoya.csv data/taxa2nativerangetypeavailability-nagoya.md: taxa2nativerangetypeavailability.py data/gbif2wcvp.csv downloads/wcvp_dist.txt data/gbif-types.zip data/gbif-typesloc.zip downloads/tdwg_wgsrpd_l3.json - $(python_launch_cmd) $^ $(limit_args) --year_min=$(nagoya_impl_year) data/taxa2nativerangetypeavailability-nagoya.csv data/taxa2nativerangetypeavailability-nagoya.md +data/taxa2nativerangetypeavailability-nagoya.csv data/taxa2nativerangetypeavailability-nagoya.yaml: taxa2nativerangetypeavailability.py data/gbif2wcvp.csv downloads/wcvp_dist.txt data/gbif-types.zip data/gbif-typesloc.zip downloads/tdwg_wgsrpd_l3.json + $(python_launch_cmd) $^ $(limit_args) --year_min=$(nagoya_impl_year) data/taxa2nativerangetypeavailability-nagoya.csv data/taxa2nativerangetypeavailability-nagoya.yaml -all: data/taxa2gbiftypeavailability.md data/taxa2nativerangetypeavailability.md data/taxa2gbiftypeavailability-cbd.md data/taxa2nativerangetypeavailability-cbd.md data/taxa2gbiftypeavailability-nagoya.md data/taxa2nativerangetypeavailability-nagoya.md +all: data/taxa2gbiftypeavailability.yaml data/taxa2nativerangetypeavailability.yaml data/taxa2gbiftypeavailability-cbd.yaml data/taxa2nativerangetypeavailability-cbd.yaml data/taxa2gbiftypeavailability-nagoya.yaml data/taxa2nativerangetypeavailability-nagoya.yaml data_archive_zip:=$(shell basename $(CURDIR))-data.zip downloads_archive_zip:=$(shell basename $(CURDIR))-downloads.zip -archive: data/taxa2gbiftypeavailability.md data/taxa2nativerangetypeavailability.md data/taxa2gbiftypeavailability-cbd.md data/taxa2nativerangetypeavailability-cbd.md data/taxa2gbiftypeavailability-nagoya.md data/taxa2nativerangetypeavailability-nagoya.md +archive: data/taxa2gbiftypeavailability.yaml data/taxa2nativerangetypeavailability.yaml data/taxa2gbiftypeavailability-cbd.yaml data/taxa2nativerangetypeavailability-cbd.yaml data/taxa2gbiftypeavailability-nagoya.yaml data/taxa2nativerangetypeavailability-nagoya.yaml mkdir -p archive echo "Archived on $(date_formatted)" >> data/archive-info.txt - zip archive/$(data_archive_zip) data/*.md -r + zip archive/$(data_archive_zip) data/*.yaml -r echo "Archived on $(date_formatted)" >> downloads/archive-info.txt zip archive/$(downloads_archive_zip) downloads/* -r diff --git a/taxa2gbiftypeavailability.py b/taxa2gbiftypeavailability.py index 267b5b0..2cc9a6b 100644 --- a/taxa2gbiftypeavailability.py +++ b/taxa2gbiftypeavailability.py @@ -4,6 +4,7 @@ from unidecode import unidecode import re from pygbif import registry +import yaml def main(): parser = argparse.ArgumentParser() @@ -14,7 +15,7 @@ def main(): parser.add_argument('--delimiter_occ', type=str, default='\t') parser.add_argument('--year_min', type=int, default=None) parser.add_argument("outputfile_data", type=str) - parser.add_argument("outputfile_md", type=str) + parser.add_argument("outputfile_yaml", type=str) args = parser.parse_args() ########################################################################### @@ -62,11 +63,15 @@ def main(): mask = (df.typeStatus.notnull()) type_status_available_count = df[mask].accepted_id.nunique() total_taxa_count = df.accepted_id.nunique() - with open(args.outputfile_md,mode='w') as f: - summary_message = '{:.2%} taxa have type material available ({} of {})'.format(type_status_available_count/total_taxa_count, type_status_available_count, total_taxa_count) - print('Writing {} to {}'.format(summary_message, args.outputfile_md)) - f.write(summary_message) - + analysis_variables = dict() + analysis_variables['taxon_count'] = total_taxa_count + analysis_variables['taxa_with_types_available_count'] = type_status_available_count + analysis_variables['taxa_with_types_available_pc'] = round((type_status_available_count/total_taxa_count)*100) + output_variables = dict() + output_variables['taxa2gbiftypeavailability']=analysis_variables + with open(args.outputfile_yaml, 'w') as f: + yaml.dump(output_variables, f) + ########################################################################### # 4. Output ########################################################################### diff --git a/taxa2nativerangetypeavailability.py b/taxa2nativerangetypeavailability.py index 3ee5aa9..613f800 100644 --- a/taxa2nativerangetypeavailability.py +++ b/taxa2nativerangetypeavailability.py @@ -5,6 +5,7 @@ import re from pygbif import registry import numpy as np +import yaml def main(): parser = argparse.ArgumentParser() @@ -20,7 +21,7 @@ def main(): parser.add_argument('--delimiter_publ', type=str, default='\t') parser.add_argument("inputfile_tdwg_wgsrpd_l3_json", type=str) parser.add_argument("outputfile_data", type=str) - parser.add_argument("outputfile_md", type=str) + parser.add_argument("outputfile_yaml", type=str) args = parser.parse_args() ########################################################################### @@ -121,22 +122,29 @@ def main(): wgsrpd_columns = {'continent_code_l1':'publishingOrg_continent_code_l1', 'region_code_l2':'publishingOrg_region_code_l2', 'area_code_l3':'publishingOrg_area_code_l3'} + analysis_variables = dict() + analysis_variables['taxon_count'] = accepted_id_count summary_message="" for (distribution_loc, publishing_org_loc) in wgsrpd_columns.items(): mask=(df[distribution_loc] == df[publishing_org_loc]) accepted_id_served_from_within_native_range_count = df[mask].accepted_id.nunique() accepted_id_count = df.accepted_id.nunique() summary_message += ('- {:.2%} taxa ({} of {}) are represented by type material served from within their native range in {}\n'.format(accepted_id_served_from_within_native_range_count/accepted_id_count, accepted_id_served_from_within_native_range_count, accepted_id_count, distribution_loc)) - print(summary_message) + current_level_variables = dict() + current_level_variables['taxon_represented_total']=accepted_id_served_from_within_native_range_count + current_level_variables['taxon_represented_pc']=round((accepted_id_served_from_within_native_range_count/accepted_id_count)*100) + analysis_variables[distribution_loc] = current_level_variables + + output_variables = dict() + output_variables['taxa2nativerangetypeavailability'] = analysis_variables # ########################################################################### # # 4. Output # ########################################################################### # - # 4.1 markdown format statement - with open(args.outputfile_md, 'w') as f: - print(summary_message) - f.write(summary_message) + # 4.1 YAML format data variables + with open(args.outputfile_yaml, 'w') as f: + yaml.dump(output_variables, f) # 4.2 Data # TBC