diff --git a/.github/workflows/pangolin.yml b/.github/workflows/pangolin.yml index ff3a4b1..c153092 100644 --- a/.github/workflows/pangolin.yml +++ b/.github/workflows/pangolin.yml @@ -55,4 +55,13 @@ jobs: run: pangolin --update-data 2>&1 | tee pangolin_update_data.log - name: Run pangolin verbose mode run: pangolin --verbose pangolin/test/test_seqs.fasta 2>&1 | tee pangolin_verbose.log - + - name: Add assignment cache + run: pangolin --add-assignment-cache + - name: Test use-assignment-cache + run: pangolin --use-assignment-cache pangolin/test/test_seqs.fasta 2>&1 | grep 'Using pangolin-assignment cache' + - name: remove assignment cache + run: pip uninstall -y pangolin-assignment + - name: Add assignment cache to datadir + run: mkdir ac && pangolin --add-assignment-cache --datadir ac + - name: Test use-assignment-cache with datadir + run: pangolin --use-assignment-cache --datadir ac pangolin/test/test_seqs.fasta 2>&1 | grep 'Using pangolin-assignment cache' diff --git a/pangolin/command.py b/pangolin/command.py index 2bd6abb..4cbc810 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -4,17 +4,17 @@ from pangolin.utils import data_checks try: import pangolin_data -except: +except ImportError: data_checks.install_error("pangolin_data", "https://github.com/cov-lineages/pangolin-data.git") try: import scorpio -except: +except ImportError: data_checks.install_error("scorpio", "https://github.com/cov-lineages/scorpio.git") try: import constellations -except: +except ImportError: data_checks.install_error("constellations", "https://github.com/cov-lineages/constellations.git") import os @@ -81,6 +81,7 @@ def main(sysargs = sys.argv[1:]): d_group.add_argument('--add-assignment-cache', action='store_true', dest="add_assignment_cache", default=False, help="Install the pangolin-assignment repository for use with --use-assignment-cache. 
This makes updates slower and makes pangolin slower for small numbers of input sequences but much faster for large numbers of input sequences.") d_group.add_argument('--use-assignment-cache', action='store_true', dest="use_assignment_cache", default=False, help="Use assignment cache from optional pangolin-assignment repository. NOTE: the repository must be installed by --add-assignment-cache before using --use-assignment-cache.") d_group.add_argument('-d', '--datadir', action='store',dest="datadir",help="Data directory minimally containing the pangoLEARN model, header files and UShER tree. Default: Installed pangolin-data package.") + d_group.add_argument('--use-old-datadir', action='store_true', default=False, help="Use the data from data directory even if older than data installed via Python packages. Default: False") d_group.add_argument('--usher-tree', action='store', dest='usher_protobuf', help="UShER Mutation Annotated Tree protobuf file to use instead of default from pangolin-data repository or --datadir.") d_group.add_argument('--assignment-cache', action='store', dest='assignment_cache', help="Cached precomputed assignment file to use instead of default from pangolin-assignment repository. Does not require installation of pangolin-assignment.") @@ -107,23 +108,25 @@ def main(sysargs = sys.argv[1:]): if args.usher: sys.stderr.write(cyan(f"--usher is a pangolin v3 option and is deprecated in pangolin v4. UShER is now the default analysis mode. 
Use --analysis-mode to explicitly set mode.\n")) - setup_data(args.datadir,config[KEY_ANALYSIS_MODE], config) + setup_data(args.datadir,config[KEY_ANALYSIS_MODE], config, args.use_old_datadir) if args.add_assignment_cache: - update.install_pangolin_assignment() + update.install_pangolin_assignment(config[KEY_PANGOLIN_ASSIGNMENT_VERSION], args.datadir) if args.update: version_dictionary = {'pangolin': __version__, 'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION], 'constellations': config[KEY_CONSTELLATIONS_VERSION], 'scorpio': config[KEY_SCORPIO_VERSION]} - update.add_pangolin_assignment_if_installed(version_dictionary) + if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None: + version_dictionary['pangolin-assignment'] = config[KEY_PANGOLIN_ASSIGNMENT_VERSION] update.update(version_dictionary) if args.update_data: version_dictionary = {'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION], 'constellations': config[KEY_CONSTELLATIONS_VERSION]} - update.add_pangolin_assignment_if_installed(version_dictionary) + if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None: + version_dictionary['pangolin-assignment'] = config[KEY_PANGOLIN_ASSIGNMENT_VERSION] update.update(version_dictionary, args.datadir) # install_pangolin_assignment doesn't exit so that --update/--update-data can be given at the diff --git a/pangolin/utils/config.py b/pangolin/utils/config.py index 1f0d0be..9f10f8e 100644 --- a/pangolin/utils/config.py +++ b/pangolin/utils/config.py @@ -40,6 +40,8 @@ KEY_PANGOLIN_VERSION="pangolin_version" KEY_CONSTELLATIONS_VERSION="constellation_version" KEY_SCORPIO_VERSION="scorpio_version" +KEY_PANGOLIN_ASSIGNMENT_VERSION="pangolin_assignment_version" +KEY_PANGOLIN_ASSIGNMENT_PATH="pangolin_assignment_path" KEY_VERBOSE="verbose" KEY_LOG_API = "log_api" diff --git a/pangolin/utils/data_checks.py b/pangolin/utils/data_checks.py index 05278b5..b62ca2e 100644 --- a/pangolin/utils/data_checks.py +++ b/pangolin/utils/data_checks.py @@ -79,9 +79,8 @@ def install_error(package, 
url): def get_assignment_cache(cache_file, config): cache = "" - try: - import pangolin_assignment - pangolin_assignment_dir = pangolin_assignment.__path__[0] + if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None: + pangolin_assignment_dir = config[KEY_PANGOLIN_ASSIGNMENT_PATH] for r, d, f in os.walk(pangolin_assignment_dir): for fn in f: if fn == cache_file and cache == "": @@ -89,15 +88,15 @@ def get_assignment_cache(cache_file, config): if not os.path.exists(cache): sys.stderr.write(cyan(f'Error: cannot find assignment cache file {cache_file} in pangolin_assignment\n')) sys.exit(-1) - except: + else: sys.stderr.write(cyan('\nError: "pangolin --add-assignment-cache" is required before ' '"pangolin --use-assignment-cache", in order to install optional ' 'pangolin-assignment repository (that will make future data updates slower).\n')) sys.exit(-1) # Check versions of pangolin-data and pangolin-assignment to make sure they are consistent. - if pangolin_assignment.__version__.lstrip('v') != config[KEY_PANGOLIN_DATA_VERSION].lstrip('v'): - print(cyan(f'Error: pangolin_assignment cache version {pangolin_assignment.__version__} ' + if config[KEY_PANGOLIN_ASSIGNMENT_VERSION].lstrip('v') != config[KEY_PANGOLIN_DATA_VERSION].lstrip('v'): + print(cyan(f'Error: pangolin_assignment cache version {config[KEY_PANGOLIN_ASSIGNMENT_VERSION]} ' f'does not match pangolin_data version {config[KEY_PANGOLIN_DATA_VERSION]}. 
' 'Run "pangolin --update-data" to fetch latest versions of both.')) sys.exit(-1) @@ -115,5 +114,13 @@ def get_assignment_cache(cache_file, config): sys.exit(-1) return cache +def get_constellation_files(path): + constellation_files = [] + for r, _, f in os.walk(path): + for fn in f: + if (r.endswith('/constellations') or r.endswith('/constellations/definitions')) and fn.endswith('.json'): + constellation_files.append(os.path.join(r, fn)) + return constellation_files + # config={} # check_install() diff --git a/pangolin/utils/initialising.py b/pangolin/utils/initialising.py index f272449..e504da6 100644 --- a/pangolin/utils/initialising.py +++ b/pangolin/utils/initialising.py @@ -13,6 +13,14 @@ from pangolin import __version__ import pangolin_data +class PangolinAssignmentWrapper(): + __version__ = None + __path__ = [None] +try: + import pangolin_assignment +except ImportError: + # if we can't import the module, replace it with a mock object that has suitable attributes + pangolin_assignment = PangolinAssignmentWrapper() import scorpio import constellations @@ -54,6 +62,8 @@ def setup_config_dict(cwd): KEY_PANGOLIN_DATA_VERSION: pangolin_data.__version__, KEY_SCORPIO_VERSION: scorpio.__version__, KEY_CONSTELLATIONS_VERSION: constellations.__version__, + KEY_PANGOLIN_ASSIGNMENT_VERSION: pangolin_assignment.__version__, + KEY_PANGOLIN_ASSIGNMENT_PATH: pangolin_assignment.__path__[0], KEY_VERBOSE: False, KEY_LOG_API: "", @@ -118,67 +128,36 @@ def version_from_init(init_file): break return version -def setup_data(datadir_arg,analysis_mode, config): - +def setup_data(datadir_arg, analysis_mode, config, use_old_data): datadir = check_datadir(datadir_arg) - pangolin_data_dir = pangolin_data.__path__[0] - constellations_dir = constellations.__path__[0] - constellation_files = [] - - data_locations = [os.walk(constellations_dir)] - - if datadir: - data_locations.append(os.walk(datadir)) - - # the logic of this is to search the "built-in" constellations - # path 
first and then if as custom datadir is passed, follow up with those, so that - # any files found in the datadir supercede the "built-in" modules. The assumption - # here is that the datadir contains newer (user updated) data - for r, _, f in itertools.chain.from_iterable(data_locations): - if r.endswith('/constellations') or r.endswith('/constellations/definitions'): - constellation_files = [] # only collect the constellations from the last directory found - for fn in f: - if r.endswith('/constellations') and fn == '__init__.py': - constellations_version = version_from_init(os.path.join(r, fn)) - elif (r.endswith('/constellations') or r.endswith('/constellations/definitions')) and fn.endswith('.json'): - constellation_files.append(os.path.join(r, fn)) - - pangolin_data_version = pangolin_data.__version__ - use_datadir = False - datadir_too_old = False + config[KEY_PANGOLIN_DATA_VERSION] = pangolin_data.__version__ + config[KEY_DATADIR] = pangolin_data.__path__[0] + config[KEY_CONSTELLATIONS_VERSION] = constellations.__version__ + config[KEY_CONSTELLATION_FILES] = get_constellation_files(constellations.__path__[0]) + config[KEY_PANGOLIN_ASSIGNMENT_VERSION] = pangolin_assignment.__version__ + config[KEY_PANGOLIN_ASSIGNMENT_PATH] = pangolin_assignment.__path__[0] + if datadir: - version = "Unknown" - for r,d,f in os.walk(datadir): - for fn in f: - # pangolin-data/__init__.py not constellations/__init__.py: - if r.endswith('data') and fn == "__init__.py": - # print("Found " + os.path.join(r, fn)) - version = version_from_init(os.path.join(r, fn)) - if not version: - continue - - if LooseVersion(version) >= LooseVersion(pangolin_data.__version__): - # only use this if the version is >= than what we already have - pangolin_data_version = version - use_datadir = True - else: - datadir_too_old = True - sys.stderr.write(cyan(f"Warning: Ignoring specified datadir {datadir} - it contains pangoLEARN model files older ({version}) than those installed 
({pangolin_data.__version__})\n")) - - if use_datadir == False: - # we haven't got a viable datadir from searching args.datadir - if datadir and not datadir_too_old: - sys.stderr.write(cyan( - f"Warning: Ignoring specified datadir {datadir} - could not find __init__.py file to check versions \n")) - - pangolin_data_dir = pangolin_data.__path__[0] - datadir = os.path.join(pangolin_data_dir,"data") - - config[KEY_PANGOLIN_DATA_VERSION] = pangolin_data_version - config[KEY_CONSTELLATIONS_VERSION] = constellations_version - config[KEY_DATADIR] = datadir - config[KEY_CONSTELLATION_FILES] = constellation_files + for module_name in ('constellations', 'pangolin_data', 'pangolin_assignment'): + for r, _, f in os.walk(datadir): + for fn in f: + if r.endswith('/' + module_name) and fn == '__init__.py': + version = version_from_init(os.path.join(r, fn)) + # module_name has been imported so exists in global namespace + current_version = getattr(globals()[module_name], '__version__', '0') + if use_old_data or current_version is None or LooseVersion(version) >= LooseVersion(current_version): + if module_name == "pangolin_data": + config[KEY_PANGOLIN_DATA_VERSION] = version + config[KEY_DATADIR] = os.path.join(datadir, r) + elif module_name == "pangolin_assignment": + config[KEY_PANGOLIN_ASSIGNMENT_VERSION] = version + config[KEY_PANGOLIN_ASSIGNMENT_PATH] = os.path.join(datadir, r) + elif module_name == "constellations": + config[KEY_CONSTELLATIONS_VERSION] = version + config[KEY_CONSTELLATION_FILES] = get_constellation_files(r) + else: + sys.stderr.write(cyan(f"Warning: Ignoring {module_name} in specified datadir {datadir} - it contains {module_name} with older ({version}) than those installed ({current_version})\n")) def parse_qc_thresholds(maxambig, minlen, reference_fasta, config): @@ -207,11 +186,10 @@ def parse_qc_thresholds(maxambig, minlen, reference_fasta, config): print(green(f"Maximum ambiguity allowed is {config[KEY_MAXAMBIG]}.\n****")) - def 
print_ram_warning(analysis_mode): if analysis_mode == "pangolearn": print(cyan("Warning: pangoLEARN mode may use a significant amount of RAM, be aware that it will not suit every system.")) - + def print_alias_file_exit(alias_file): with open(alias_file, 'r') as handle: for line in handle: @@ -242,11 +220,8 @@ def print_versions_exit(config): f"constellations: {config[KEY_CONSTELLATIONS_VERSION]}\n" f"scorpio: {config[KEY_SCORPIO_VERSION]}") # Report pangolin_assignment version if it is installed, otherwise ignore - try: - import pangolin_assignment - print(f"pangolin-assignment: {pangolin_assignment.__version__}") - except: - pass + if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None: + print(f"pangolin-assignment: {config[KEY_PANGOLIN_ASSIGNMENT_VERSION]}") # Print versions of other important tools used by pangolin print_conda_version(['usher', 'ucsc-fatovcf', 'gofasta', 'minimap2']) sys.exit(0) diff --git a/pangolin/utils/update.py b/pangolin/utils/update.py index 721a9eb..87ca292 100644 --- a/pangolin/utils/update.py +++ b/pangolin/utils/update.py @@ -64,43 +64,33 @@ def git_lfs_install(): sys.stderr.write(cyan(f"Error: {e}:\n{stderr}\n")) sys.exit(-1) -def pip_install_dep(dependency, release): +def pip_install_dep(dependency, release, datadir=None): """ Use pip install to install a cov-lineages repository with the specificed release """ url = f"git+https://github.com/cov-lineages/{dependency}.git@{release}" - subprocess.run([sys.executable, '-m', 'pip', 'install', '--upgrade', url], + pip_command = [sys.executable, '-m', 'pip', 'install', '--upgrade'] + if datadir is not None: + pip_command.extend(['--target', datadir]) + pip_command.append(url) + subprocess.run(pip_command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) -def install_pangolin_assignment(): +def install_pangolin_assignment(pangolin_assignment_version, datadir=None): """ If the pangolin-assignment repo has not been installed already then install the latest release. 
""" - try: - import pangolin_assignment - print(f"pangolin-assignment already installed with version {pangolin_assignment.__version__}; use --update or --update-data if you wish to update it.", file=sys.stderr) - - except: + if pangolin_assignment_version is not None: + print(f"pangolin-assignment already installed with version {pangolin_assignment_version}; use --update or --update-data if you wish to update it.", file=sys.stderr) + else: git_lfs_install() latest_release, tarball = get_latest_release('pangolin-assignment') - pip_install_dep('pangolin-assignment', latest_release) + pip_install_dep('pangolin-assignment', latest_release, datadir) print(f"pangolin-assignment installed with latest release ({latest_release})") - -def add_pangolin_assignment_if_installed(version_dictionary): - """ - If pangolin_assignment has been installed then add it to version_dictionary, else ignore. - """ - try: - import pangolin_assignment - version_dictionary["pangolin-assignment"] = pangolin_assignment.__version__ - except: - pass - - def update(version_dictionary, data_dir=None): """ Using the github releases API check for the latest current release @@ -154,23 +144,7 @@ def update(version_dictionary, data_dir=None): version = LooseVersion(version) if version < latest_release_tidied: - if data_dir is not None: - # this path only gets followed when the user has --update_data and they - # have also specified a --datadir - with TemporaryDirectory() as tempdir: - dependency_package = package_names.get(dependency, dependency) - tarball_path = os.path.join(tempdir, 'tarball.tgz') - open(tarball_path, 'wb').write(request.urlopen(latest_release_tarball).read()) - tf = tarfile.open(tarball_path) - extracted_dir = tf.next().name - tf.extractall(path=tempdir) - tf.close() - destination_directory = os.path.join(data_dir, dependency_package) - if os.path.isdir(destination_directory): - shutil.rmtree(destination_directory) - shutil.move(os.path.join(tempdir, extracted_dir, dependency_package), 
destination_directory) - else: - pip_install_dep(dependency, latest_release) + pip_install_dep(dependency, latest_release, data_dir) print(f"{dependency} updated to {latest_release}", file=sys.stderr) elif version > latest_release_tidied: print(f"{dependency} ({version}) is newer than latest stable "