Skip to content

Commit

Permalink
Allow assignment cache to be saved to and loaded from datadir (#444)
Browse files Browse the repository at this point in the history
* Support downloading the assignment cache to datadir and using it from datadir
* Switch to using pip for all installs (thanks for tip from Wolfgang Maier)
* Rewrite setup_data() to be less repetitive
* Add --use-old-datadir option
  • Loading branch information
pvanheus authored May 7, 2022
1 parent 764706d commit bafc1d8
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 117 deletions.
11 changes: 10 additions & 1 deletion .github/workflows/pangolin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,13 @@ jobs:
run: pangolin --update-data 2>&1 | tee pangolin_update_data.log
- name: Run pangolin verbose mode
run: pangolin --verbose pangolin/test/test_seqs.fasta 2>&1 | tee pangolin_verbose.log

- name: Add assignment cache
run: pangolin --add-assignment-cache
- name: Test use-assignment-cache
run: pangolin --use-assignment-cache pangolin/test/test_seqs.fasta 2>&1 | grep 'Using pangolin-assignment cache'
- name: remove assignment cache
run: pip uninstall -y pangolin-assignment
- name: Add assignment cache to datadir
run: mkdir ac && pangolin --add-assignment-cache --datadir ac
- name: Test use-assignment-cache with datadir
run: pangolin --use-assignment-cache --datadir ac pangolin/test/test_seqs.fasta 2>&1 | grep 'Using pangolin-assignment cache'
17 changes: 10 additions & 7 deletions pangolin/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,17 @@
from pangolin.utils import data_checks
try:
import pangolin_data
except:
except ImportError:
data_checks.install_error("pangolin_data", "https://github.com/cov-lineages/pangolin-data.git")

try:
import scorpio
except:
except ImportError:
data_checks.install_error("scorpio", "https://github.com/cov-lineages/scorpio.git")

try:
import constellations
except:
except ImportError:
data_checks.install_error("constellations", "https://github.com/cov-lineages/constellations.git")

import os
Expand Down Expand Up @@ -81,6 +81,7 @@ def main(sysargs = sys.argv[1:]):
d_group.add_argument('--add-assignment-cache', action='store_true', dest="add_assignment_cache", default=False, help="Install the pangolin-assignment repository for use with --use-assignment-cache. This makes updates slower and makes pangolin slower for small numbers of input sequences but much faster for large numbers of input sequences.")
d_group.add_argument('--use-assignment-cache', action='store_true', dest="use_assignment_cache", default=False, help="Use assignment cache from optional pangolin-assignment repository. NOTE: the repository must be installed by --add-assignment-cache before using --use-assignment-cache.")
d_group.add_argument('-d', '--datadir', action='store',dest="datadir",help="Data directory minimally containing the pangoLEARN model, header files and UShER tree. Default: Installed pangolin-data package.")
d_group.add_argument('--use-old-datadir', action='store_true', default=False, help="Use the data from data directory even if older than data installed via Python packages. Default: False")
d_group.add_argument('--usher-tree', action='store', dest='usher_protobuf', help="UShER Mutation Annotated Tree protobuf file to use instead of default from pangolin-data repository or --datadir.")
d_group.add_argument('--assignment-cache', action='store', dest='assignment_cache', help="Cached precomputed assignment file to use instead of default from pangolin-assignment repository. Does not require installation of pangolin-assignment.")

Expand All @@ -107,23 +108,25 @@ def main(sysargs = sys.argv[1:]):
if args.usher:
sys.stderr.write(cyan(f"--usher is a pangolin v3 option and is deprecated in pangolin v4. UShER is now the default analysis mode. Use --analysis-mode to explicitly set mode.\n"))

setup_data(args.datadir,config[KEY_ANALYSIS_MODE], config)
setup_data(args.datadir,config[KEY_ANALYSIS_MODE], config, args.use_old_datadir)

if args.add_assignment_cache:
update.install_pangolin_assignment()
update.install_pangolin_assignment(config[KEY_PANGOLIN_ASSIGNMENT_VERSION], args.datadir)

if args.update:
version_dictionary = {'pangolin': __version__,
'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION],
'constellations': config[KEY_CONSTELLATIONS_VERSION],
'scorpio': config[KEY_SCORPIO_VERSION]}
update.add_pangolin_assignment_if_installed(version_dictionary)
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None:
version_dictionary['pangolin-assignment'] = config[KEY_PANGOLIN_ASSIGNMENT_VERSION]
update.update(version_dictionary)

if args.update_data:
version_dictionary = {'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION],
'constellations': config[KEY_CONSTELLATIONS_VERSION]}
update.add_pangolin_assignment_if_installed(version_dictionary)
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None:
version_dictionary['pangolin-assignment'] = config[KEY_PANGOLIN_ASSIGNMENT_VERSION]
update.update(version_dictionary, args.datadir)

# install_pangolin_assignment doesn't exit so that --update/--update-data can be given at the
Expand Down
2 changes: 2 additions & 0 deletions pangolin/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
KEY_PANGOLIN_VERSION="pangolin_version"
KEY_CONSTELLATIONS_VERSION="constellation_version"
KEY_SCORPIO_VERSION="scorpio_version"
KEY_PANGOLIN_ASSIGNMENT_VERSION="pangolin_assignment_version"
KEY_PANGOLIN_ASSIGNMENT_PATH="pangolin_assignment_path"

KEY_VERBOSE="verbose"
KEY_LOG_API = "log_api"
Expand Down
19 changes: 13 additions & 6 deletions pangolin/utils/data_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,25 +79,24 @@ def install_error(package, url):

def get_assignment_cache(cache_file, config):
cache = ""
try:
import pangolin_assignment
pangolin_assignment_dir = pangolin_assignment.__path__[0]
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None:
pangolin_assignment_dir = config[KEY_PANGOLIN_ASSIGNMENT_PATH]
for r, d, f in os.walk(pangolin_assignment_dir):
for fn in f:
if fn == cache_file and cache == "":
cache = os.path.join(r, fn)
if not os.path.exists(cache):
sys.stderr.write(cyan(f'Error: cannot find assignment cache file {cache_file} in pangolin_assignment\n'))
sys.exit(-1)
except:
else:
sys.stderr.write(cyan('\nError: "pangolin --add-assignment-cache" is required before '
'"pangolin --use-assignment-cache", in order to install optional '
'pangolin-assignment repository (that will make future data updates slower).\n'))
sys.exit(-1)

# Check versions of pangolin-data and pangolin-assignment to make sure they are consistent.
if pangolin_assignment.__version__.lstrip('v') != config[KEY_PANGOLIN_DATA_VERSION].lstrip('v'):
print(cyan(f'Error: pangolin_assignment cache version {pangolin_assignment.__version__} '
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION].lstrip('v') != config[KEY_PANGOLIN_DATA_VERSION].lstrip('v'):
print(cyan(f'Error: pangolin_assignment cache version {config[KEY_PANGOLIN_ASSIGNMENT_VERSION]} '
f'does not match pangolin_data version {config[KEY_PANGOLIN_DATA_VERSION]}. '
'Run "pangolin --update-data" to fetch latest versions of both.'))
sys.exit(-1)
Expand All @@ -115,5 +114,13 @@ def get_assignment_cache(cache_file, config):
sys.exit(-1)
return cache

def get_constellation_files(path):
    """Collect constellation definition JSON files found beneath ``path``.

    The tree rooted at ``path`` is walked and every ``*.json`` file that sits
    directly inside a directory named ``constellations``, or inside a
    ``definitions`` directory whose parent is named ``constellations``, is
    collected.

    Directory names are compared as normalised path components rather than
    as string suffixes, so the match also works when ``path`` is itself the
    constellations directory given as a relative path or with a trailing
    separator, and it does not depend on ``/`` being the path separator.

    Args:
        path: Directory to search.

    Returns:
        A list of paths (each the walked directory joined with the file
        name) in ``os.walk`` order; empty if nothing matches or ``path``
        does not exist.
    """
    constellation_files = []
    for root, _, filenames in os.walk(path):
        # normpath drops any trailing separator so the last component is
        # always the directory's own name.
        parts = os.path.normpath(root).split(os.sep)
        in_constellations = parts[-1] == 'constellations'
        in_definitions = parts[-2:] == ['constellations', 'definitions']
        if not (in_constellations or in_definitions):
            continue
        constellation_files.extend(
            os.path.join(root, fn) for fn in filenames if fn.endswith('.json')
        )
    return constellation_files

# config={}
# check_install()
105 changes: 40 additions & 65 deletions pangolin/utils/initialising.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@
from pangolin import __version__

import pangolin_data
class PangolinAssignmentWrapper():
__version__ = None
__path__ = [None]
try:
import pangolin_assignment
except ImportError:
# if we can't import the module, replace it with a stand-in object that has the attributes we read (__version__ and __path__)
pangolin_assignment = PangolinAssignmentWrapper()
import scorpio
import constellations

Expand Down Expand Up @@ -54,6 +62,8 @@ def setup_config_dict(cwd):
KEY_PANGOLIN_DATA_VERSION: pangolin_data.__version__,
KEY_SCORPIO_VERSION: scorpio.__version__,
KEY_CONSTELLATIONS_VERSION: constellations.__version__,
KEY_PANGOLIN_ASSIGNMENT_VERSION: pangolin_assignment.__version__,
KEY_PANGOLIN_ASSIGNMENT_PATH: pangolin_assignment.__path__[0],

KEY_VERBOSE: False,
KEY_LOG_API: "",
Expand Down Expand Up @@ -118,67 +128,36 @@ def version_from_init(init_file):
break
return version

def setup_data(datadir_arg,analysis_mode, config):

def setup_data(datadir_arg, analysis_mode, config, use_old_data):
datadir = check_datadir(datadir_arg)

pangolin_data_dir = pangolin_data.__path__[0]
constellations_dir = constellations.__path__[0]
constellation_files = []

data_locations = [os.walk(constellations_dir)]

if datadir:
data_locations.append(os.walk(datadir))

# The logic here is to search the "built-in" constellations path first and then,
# if a custom datadir is passed, follow up with that, so that any files found in
# the datadir supersede the "built-in" modules. The assumption here is that the
# datadir contains newer (user-updated) data.
for r, _, f in itertools.chain.from_iterable(data_locations):
if r.endswith('/constellations') or r.endswith('/constellations/definitions'):
constellation_files = [] # only collect the constellations from the last directory found
for fn in f:
if r.endswith('/constellations') and fn == '__init__.py':
constellations_version = version_from_init(os.path.join(r, fn))
elif (r.endswith('/constellations') or r.endswith('/constellations/definitions')) and fn.endswith('.json'):
constellation_files.append(os.path.join(r, fn))

pangolin_data_version = pangolin_data.__version__
use_datadir = False
datadir_too_old = False
config[KEY_PANGOLIN_DATA_VERSION] = pangolin_data.__version__
config[KEY_DATADIR] = pangolin_data.__path__[0]
config[KEY_CONSTELLATIONS_VERSION] = constellations.__version__
config[KEY_CONSTELLATION_FILES] = get_constellation_files(constellations.__path__[0])
config[KEY_PANGOLIN_ASSIGNMENT_VERSION] = pangolin_assignment.__version__
config[KEY_PANGOLIN_ASSIGNMENT_PATH] = pangolin_assignment.__path__[0]

if datadir:
version = "Unknown"
for r,d,f in os.walk(datadir):
for fn in f:
# pangolin-data/__init__.py not constellations/__init__.py:
if r.endswith('data') and fn == "__init__.py":
# print("Found " + os.path.join(r, fn))
version = version_from_init(os.path.join(r, fn))
if not version:
continue

if LooseVersion(version) >= LooseVersion(pangolin_data.__version__):
# only use this if the version is >= than what we already have
pangolin_data_version = version
use_datadir = True
else:
datadir_too_old = True
sys.stderr.write(cyan(f"Warning: Ignoring specified datadir {datadir} - it contains pangoLEARN model files older ({version}) than those installed ({pangolin_data.__version__})\n"))

if use_datadir == False:
# we haven't got a viable datadir from searching args.datadir
if datadir and not datadir_too_old:
sys.stderr.write(cyan(
f"Warning: Ignoring specified datadir {datadir} - could not find __init__.py file to check versions \n"))

pangolin_data_dir = pangolin_data.__path__[0]
datadir = os.path.join(pangolin_data_dir,"data")

config[KEY_PANGOLIN_DATA_VERSION] = pangolin_data_version
config[KEY_CONSTELLATIONS_VERSION] = constellations_version
config[KEY_DATADIR] = datadir
config[KEY_CONSTELLATION_FILES] = constellation_files
for module_name in ('constellations', 'pangolin_data', 'pangolin_assignment'):
for r, _, f in os.walk(datadir):
for fn in f:
if r.endswith('/' + module_name) and fn == '__init__.py':
version = version_from_init(os.path.join(r, fn))
# module_name has been imported so exists in global namespace
current_version = getattr(globals()[module_name], '__version__', '0')
if use_old_data or current_version is None or LooseVersion(version) >= LooseVersion(current_version):
if module_name == "pangolin_data":
config[KEY_PANGOLIN_DATA_VERSION] = version
config[KEY_DATADIR] = os.path.join(datadir, r)
elif module_name == "pangolin_assignment":
config[KEY_PANGOLIN_ASSIGNMENT_VERSION] = version
config[KEY_PANGOLIN_ASSIGNMENT_PATH] = os.path.join(datadir, r)
elif module_name == "constellations":
config[KEY_CONSTELLATIONS_VERSION] = version
config[KEY_CONSTELLATION_FILES] = get_constellation_files(r)
else:
sys.stderr.write(cyan(f"Warning: Ignoring {module_name} in specified datadir {datadir} - it contains {module_name} with older ({version}) than those installed ({current_version})\n"))

def parse_qc_thresholds(maxambig, minlen, reference_fasta, config):

Expand Down Expand Up @@ -207,11 +186,10 @@ def parse_qc_thresholds(maxambig, minlen, reference_fasta, config):

print(green(f"Maximum ambiguity allowed is {config[KEY_MAXAMBIG]}.\n****"))


def print_ram_warning(analysis_mode):
if analysis_mode == "pangolearn":
print(cyan("Warning: pangoLEARN mode may use a significant amount of RAM, be aware that it will not suit every system."))

def print_alias_file_exit(alias_file):
with open(alias_file, 'r') as handle:
for line in handle:
Expand Down Expand Up @@ -242,11 +220,8 @@ def print_versions_exit(config):
f"constellations: {config[KEY_CONSTELLATIONS_VERSION]}\n"
f"scorpio: {config[KEY_SCORPIO_VERSION]}")
# Report pangolin_assignment version if it is installed, otherwise ignore
try:
import pangolin_assignment
print(f"pangolin-assignment: {pangolin_assignment.__version__}")
except:
pass
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None:
print(f"pangolin-assignment: {config[KEY_PANGOLIN_ASSIGNMENT_VERSION]}")
# Print versions of other important tools used by pangolin
print_conda_version(['usher', 'ucsc-fatovcf', 'gofasta', 'minimap2'])
sys.exit(0)
Expand Down
50 changes: 12 additions & 38 deletions pangolin/utils/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,43 +64,33 @@ def git_lfs_install():
sys.stderr.write(cyan(f"Error: {e}:\n{stderr}\n"))
sys.exit(-1)

def pip_install_dep(dependency, release):
def pip_install_dep(dependency, release, datadir=None):
"""
Use pip install to install a cov-lineages repository with the specified release
"""
url = f"git+https://github.com/cov-lineages/{dependency}.git@{release}"
subprocess.run([sys.executable, '-m', 'pip', 'install', '--upgrade', url],
pip_command = [sys.executable, '-m', 'pip', 'install', '--upgrade']
if datadir is not None:
pip_command.extend(['--target', datadir])
pip_command.append(url)
subprocess.run(pip_command,
check=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL)


def install_pangolin_assignment():
def install_pangolin_assignment(pangolin_assignment_version, datadir=None):
"""
If the pangolin-assignment repo has not been installed already then install the latest release.
"""
try:
import pangolin_assignment
print(f"pangolin-assignment already installed with version {pangolin_assignment.__version__}; use --update or --update-data if you wish to update it.", file=sys.stderr)

except:
if pangolin_assignment_version is not None:
print(f"pangolin-assignment already installed with version {pangolin_assignment_version}; use --update or --update-data if you wish to update it.", file=sys.stderr)
else:
git_lfs_install()
latest_release, tarball = get_latest_release('pangolin-assignment')
pip_install_dep('pangolin-assignment', latest_release)
pip_install_dep('pangolin-assignment', latest_release, datadir)
print(f"pangolin-assignment installed with latest release ({latest_release})")


def add_pangolin_assignment_if_installed(version_dictionary):
"""
If pangolin_assignment has been installed then add it to version_dictionary, else ignore.
"""
try:
import pangolin_assignment
version_dictionary["pangolin-assignment"] = pangolin_assignment.__version__
except:
pass


def update(version_dictionary, data_dir=None):
"""
Using the github releases API check for the latest current release
Expand Down Expand Up @@ -154,23 +144,7 @@ def update(version_dictionary, data_dir=None):
version = LooseVersion(version)

if version < latest_release_tidied:
if data_dir is not None:
# this path only gets followed when the user has --update_data and they
# have also specified a --datadir
with TemporaryDirectory() as tempdir:
dependency_package = package_names.get(dependency, dependency)
tarball_path = os.path.join(tempdir, 'tarball.tgz')
open(tarball_path, 'wb').write(request.urlopen(latest_release_tarball).read())
tf = tarfile.open(tarball_path)
extracted_dir = tf.next().name
tf.extractall(path=tempdir)
tf.close()
destination_directory = os.path.join(data_dir, dependency_package)
if os.path.isdir(destination_directory):
shutil.rmtree(destination_directory)
shutil.move(os.path.join(tempdir, extracted_dir, dependency_package), destination_directory)
else:
pip_install_dep(dependency, latest_release)
pip_install_dep(dependency, latest_release, data_dir)
print(f"{dependency} updated to {latest_release}", file=sys.stderr)
elif version > latest_release_tidied:
print(f"{dependency} ({version}) is newer than latest stable "
Expand Down

0 comments on commit bafc1d8

Please sign in to comment.