From d03d6412d39a2d4db24f42f1a82f9e5c10a8cf19 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 18 Apr 2019 13:04:20 -0400 Subject: [PATCH 01/61] first pass at outputs --- looper/const.py | 16 +++++----- looper/exceptions.py | 6 ++++ looper/looper.py | 2 +- looper/project.py | 76 ++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 88 insertions(+), 12 deletions(-) diff --git a/looper/const.py b/looper/const.py index acf5ced17..56a7f96e2 100644 --- a/looper/const.py +++ b/looper/const.py @@ -4,14 +4,9 @@ __email__ = "vreuter@virginia.edu" -__all__ = ["RESULTS_SUBDIR_KEY", "SUBMISSION_SUBDIR_KEY", "TEMPLATES_DIRNAME", "APPEARANCE_BY_FLAG", - "NO_DATA_PLACEHOLDER"] +__all__ = ["APPEARANCE_BY_FLAG", "NO_DATA_PLACEHOLDER", "OUTKEY", + "RESULTS_SUBDIR_KEY", "SUBMISSION_SUBDIR_KEY", "TEMPLATES_DIRNAME"] - -RESULTS_SUBDIR_KEY = "results_subdir" -SUBMISSION_SUBDIR_KEY = "submission_subdir" -TEMPLATES_DIRNAME = "jinja_templates" -NO_DATA_PLACEHOLDER = "NA" APPEARANCE_BY_FLAG = { "completed": { "button_class": "table-success", @@ -33,4 +28,9 @@ "button_class": "table-info", "flag": "Waiting" } -} \ No newline at end of file +} +NO_DATA_PLACEHOLDER = "NA" +OUTKEY = "outputs" +RESULTS_SUBDIR_KEY = "results_subdir" +SUBMISSION_SUBDIR_KEY = "submission_subdir" +TEMPLATES_DIRNAME = "jinja_templates" diff --git a/looper/exceptions.py b/looper/exceptions.py index b480ada95..0d85a4b22 100644 --- a/looper/exceptions.py +++ b/looper/exceptions.py @@ -23,6 +23,12 @@ class LooperError(Exception): __metaclass__ = ABCMeta +class DuplicatePipelineKeyException(LooperError): + """ Duplication of pipeline identifier precludes unique pipeline ref. """ + def __init__(self, key): + super(DuplicatePipelineKeyException, self).__init__(key) + + class InvalidResourceSpecificationException(LooperError): """ Pipeline interface resources--if present--needs default. """ def __init__(self, reason): diff --git a/looper/looper.py b/looper/looper.py index 7d599e0d8..c41ac4d14 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -804,7 +804,7 @@ def main(): determine_config_path(args.config_file), subproject=args.subproject, file_checks=args.file_checks, compute_env_file=getattr(args, 'env', None)) except yaml.parser.ParserError as e: - print("Project config parse failed -- {}".format(e)) + _LOGGER.error("Project config parse failed -- {}".format(e)) sys.exit(1) if hasattr(args, "compute"): diff --git a/looper/project.py b/looper/project.py index f6953ecac..8689f5314 100644 --- a/looper/project.py +++ b/looper/project.py @@ -8,13 +8,16 @@ import peppy from peppy.utils import is_command_callable from .const import * -from .pipeline_interface import PipelineInterface +from .exceptions import DuplicatePipelineKeyException +from .pipeline_interface import PipelineInterface, PROTOMAP_KEY from .utils import get_logger, partition __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" +__all__ = ["Project", "process_pipeline_interfaces"] + _LOGGER = get_logger(__name__) @@ -201,6 +204,69 @@ def build_submission_bundles(self, protocol, priority=True): else: return list(itertools.chain(*job_submission_bundles)) + def get_outputs(self): + """ + Map pipeline identifier to collection of output specifications. + + This method leverages knowledge of two collections of different kinds + of entities that meet in the manifestation of a Project. The first + is a collection of samples, which is known even in peppy.Project. 
The + second is a mapping from protocol/assay/library strategy to a collection + of pipeline interfaces, in which kinds of output may be declared. + + Knowledge of these two items is here harnessed to map the identifier + for each pipeline about which this Project is aware to a collection of + pairs of identifier for a kind of output and the collection of + this Project's samples for which it's applicable (i.e., those samples + with protocol that maps to the corresponding pipeline). + + :return Mapping[str, Mapping[str, namedtuple]]: collection of bindings + between identifier for pipeline and collection of bindings between + name for a kind of output and pair in which first component is a + path template and the second component is a collection of + sample names + """ + prots_data_pairs = \ + _gather_ifaces(*itertools.chain(*self.interfaces_by_protocol.values())) + m = {} + for name, (prots, data) in prots_data_pairs.items(): + snames = [s.name for s in self.samples if s.protocol in prots] + if not snames: + _LOGGER.debug("No samples with protocol: {}".format(p)) + continue + try: + outs = data[OUTKEY] + except KeyError: + _LOGGER.debug("No {} declared for pipeline: {}". + format(OUTKEY, name)) + else: + m[name] = {path_key: (path_val, snames) + for path_key, path_val in outs.items()} + return m + + +def _gather_ifaces(*ifaces): + specs = {} + for pi in ifaces: + protos_by_name = {} + for p, names in pi[PROTOMAP_KEY].items(): + if isinstance(names, str): + names = [names] + for n in names: + protos_by_name.setdefault(n, set()).add(p) + for k, dat in pi.iterpipes(): + name = dat.get("name") or k + if name in specs: + old_dat, old_prots = specs[name] + if dat != old_dat: + raise DuplicatePipelineKeyException(name) + else: + old_prots = set() + new_prots = protos_by_name.get(name, set()) | \ + protos_by_name.get(k, set()) + specs[name] = (old_prots | new_prots, dat) + return specs + def process_pipeline_interfaces(pipeline_interface_locations): """ @@ -216,8 +282,9 @@ def process_pipeline_interfaces(pipeline_interface_locations): interface_by_protocol = defaultdict(list) for pipe_iface_location in pipeline_interface_locations: if not os.path.exists(pipe_iface_location): - _LOGGER.warning("Ignoring nonexistent pipeline interface " - "location: '%s'", pipe_iface_location) + _LOGGER.warning( + "Ignoring nonexistent pipeline interface location: '%s'", + pipe_iface_location) continue pipe_iface = PipelineInterface(pipe_iface_location) for proto_name in pipe_iface.protocol_mapping: @@ -226,6 +293,9 @@ def process_pipeline_interfaces(pipeline_interface_locations): return interface_by_protocol +OutputGroup = namedtuple("OutputGroup", field_names=["path", "samples"]) + + # Collect PipelineInterface, Sample type, pipeline path, and script with flags. 
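# (OutputGroup above likewise pairs a path template with the names of the
# samples to which it applies; see get_outputs() for how those pairs are built.)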
SubmissionBundle = namedtuple( "SubmissionBundle", From 95747c6a28dc5116721f0de97ced4d883855ee25 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 18 Apr 2019 15:15:05 -0400 Subject: [PATCH 02/61] development utilities --- looper/_devtools.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 looper/_devtools.py diff --git a/looper/_devtools.py b/looper/_devtools.py new file mode 100644 index 000000000..16cc2d81a --- /dev/null +++ b/looper/_devtools.py @@ -0,0 +1,25 @@ +""" Utility functions for internal, developmental use """ + +import copy +from logmuse import setup_logger + +__author__ = "Vince Reuter" +__email__ = "vreuter@virginia.edu" + +__all__ = ["est_log"] + + +def est_log(**kwargs): + """ + Establish logging, e.g. for an interactive session. + + :param dict kwargs: keyword arguments for logger setup. + :return logging.Logger: looper logger + """ + kwds = copy.copy(kwargs) + if "name" in kwds: + print("Ignoring {} and setting fixed values for logging names". + format(kwds["name"])) + del kwds["name"] + setup_logger(name="peppy", **kwds) + return setup_logger(name="looper", **kwds) From 0698a2d28691338590b4f98a3b32b7ac86534582 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 18 Apr 2019 15:16:39 -0400 Subject: [PATCH 03/61] piface data gathering --- looper/exceptions.py | 6 ++++-- looper/project.py | 26 +++++++++++++++++++------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/looper/exceptions.py b/looper/exceptions.py index 0d85a4b22..d26f5b80b 100644 --- a/looper/exceptions.py +++ b/looper/exceptions.py @@ -13,8 +13,10 @@ from collections.abc import Iterable -_all__ = ["InvalidResourceSpecificationException", "JobSubmissionException", - "LooperError", "MissingPipelineConfigurationException", +_all__ = ["DuplicatePipelineKeyException", + "InvalidResourceSpecificationException", + "JobSubmissionException", "LooperError", + "MissingPipelineConfigurationException", "PipelineInterfaceConfigError"] diff --git a/looper/project.py b/looper/project.py index 8689f5314..f71d5a888 100644 --- a/looper/project.py +++ b/looper/project.py @@ -227,12 +227,13 @@ def get_outputs(self): sample names """ prots_data_pairs = \ - _gather_ifaces(*itertools.chain(*self.interfaces_by_protocol.values())) + _gather_ifaces(itertools.chain(*self.interfaces_by_protocol.values())) m = {} for name, (prots, data) in prots_data_pairs.items(): snames = [s.name for s in self.samples if s.protocol in prots] if not snames: - _LOGGER.debug("No samples with protocol: {}".format(p)) + _LOGGER.debug("No samples matching protocol(s): {}". + format(", ".join(prots))) continue try: outs = data[OUTKEY] @@ -245,7 +246,17 @@ def get_outputs(self): return m -def _gather_ifaces(*ifaces): +def _gather_ifaces(ifaces): + """ + For each pipeline map identifier to protocols and interface data. + + :param Iterable[looper.PipelineInterface] ifaces: + :return Mapping[str, (set[str], attmap.AttMap)]: collection of bindings + between pipeline identifier and pair in which first component is + collection of associated protocol names, and second component is a + collection of interface data for pipeline identified by the key + :raise looper. 
+ """ specs = {} for pi in ifaces: protos_by_name = {} @@ -256,12 +267,13 @@ def _gather_ifaces(*ifaces): protos_by_name.setdefault(n, set()).add(p) for k, dat in pi.iterpipes(): name = dat.get("name") or k - if name in specs: - old_dat, old_prots = specs[name] + try: + old_prots, old_dat = specs[name] + except KeyError: + old_prots = set() + else: if dat != old_dat: raise DuplicatePipelineKeyException(name) - else: - old_prots = set() new_prots = protos_by_name.get(name, set()) | \ protos_by_name.get(k, set()) specs[name] = (old_prots | new_prots, dat) From 91597a22de973664f7b99b884454468dcbf3d5ab Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 18 Apr 2019 17:44:31 -0400 Subject: [PATCH 04/61] stubs and initial test --- looper/exceptions.py | 7 +- tests/integration/test_project_get_outputs.py | 94 +++++++++++++++++++ 2 files changed, 96 insertions(+), 5 deletions(-) create mode 100644 tests/integration/test_project_get_outputs.py diff --git a/looper/exceptions.py b/looper/exceptions.py index d26f5b80b..bb3fd0c4e 100644 --- a/looper/exceptions.py +++ b/looper/exceptions.py @@ -1,10 +1,5 @@ """ Exceptions for specific looper issues. """ - -__author__ = "Vince Reuter" -__email__ = "vreuter@virginia.edu" - - from abc import ABCMeta import sys if sys.version_info < (3, 3): @@ -12,6 +7,8 @@ else: from collections.abc import Iterable +__author__ = "Vince Reuter" +__email__ = "vreuter@virginia.edu" _all__ = ["DuplicatePipelineKeyException", "InvalidResourceSpecificationException", diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py new file mode 100644 index 000000000..ee9f41cbf --- /dev/null +++ b/tests/integration/test_project_get_outputs.py @@ -0,0 +1,94 @@ +""" Tests for interaction between Project and PipelineInterface """ + +import pytest +import yaml +from looper import Project as LP +from peppy.const import * + + +__author__ = "Vince Reuter" +__email__ = "vreuter@virginia.edu" + + +MAIN_META_KEY = "main_meta" +SUBS_META_KEY = "subs_meta" +SECTION_BY_FIXTURE = { + MAIN_META_KEY: METADATA_KEY, SUBS_META_KEY: SUBPROJECTS_SECTION} + + +def get_conf_data(req): + """ + Get Project config data for a test case. + + :param pytest.FixtureRequest req: test case requesting Project config data + :return dict: Project config data + """ + m = {key: req.getfixturevalue(fix) for fix, key + in SECTION_BY_FIXTURE.items() if fix in req.fixturenames} + return m + + +@pytest.fixture(scope="function") +def prj(request, tmpdir): + """ Provide a test case with a Project instance. 
""" + conf_data = get_conf_data(request) + conf_file = tmpdir.join("pc.yaml").strpath + with open(conf_file, 'w') as f: + yaml.dump(conf_data, f) + return LP(conf_file) + + +@pytest.mark.parametrize(MAIN_META_KEY, [{OUTDIR_KEY: "arbitrary"}]) +def test_no_pifaces(prj, main_meta): + """ No pipeline interfaces --> the outputs data mapping is empty.""" + assert {} == prj.get_outputs() + + +@pytest.mark.skip("not implemented") +def test_no_outputs(): + pass + + +@pytest.mark.skip("not implemented") +def test_malformed_outputs(): + pass + + +@pytest.mark.skip("not implemented") +def test_only_subproject_has_pifaces(): + pass + + +@pytest.mark.skip("not implemented") +def test_only_subproject_has_outputs(): + pass + + +@pytest.mark.skip("not implemented") +def test_main_project_and_subproject_have_outputs(): + pass + + +@pytest.mark.skip("not implemented") +def test_no_samples_match_protocols_with_outputs(): + pass + + +@pytest.mark.skip("not implemented") +def test_pipeline_identifier_collision_same_data(): + pass + + +@pytest.mark.skip("not implemented") +def test_pipeline_identifier_collision_different_data(): + pass + + +@pytest.mark.skip("not implemented") +def test_sample_collection_accuracy(): + pass + + +@pytest.mark.skip("not implemented") +def test_protocol_collection_accuracy(): + pass From 9f6fc021ae91b031f0c843acfbb0ab13304ef66c Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 19 Apr 2019 12:57:24 -0400 Subject: [PATCH 05/61] tidy and share values --- looper/looper.py | 9 +++-- looper/pipeline_interface.py | 10 ++--- looper/project.py | 2 +- tests/integration/test_project_get_outputs.py | 1 + tests/models/conftest.py | 24 ++---------- tests/models/test_PipelineInterface.py | 38 ++++++++++--------- tests/test_submission_scripts.py | 8 ++-- 7 files changed, 41 insertions(+), 51 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index c41ac4d14..8a0c18abf 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -29,13 +29,15 @@ from .const import * from .exceptions import JobSubmissionException from .html_reports import HTMLReportBuilder +from .pipeline_interface import RESOURCES_KEY from .project import Project from .utils import determine_config_path, fetch_flag_files, sample_folder from logmuse import setup_logger -from peppy import ProjectContext, SAMPLE_EXECUTION_TOGGLE +from peppy import ProjectContext, METADATA_KEY, SAMPLE_EXECUTION_TOGGLE +PIPELINE_INTERFACES_KEY = "pipeline_interfaces" SUBMISSION_FAILURE_MESSAGE = "Cluster resource failure" @@ -288,7 +290,7 @@ def __call__(self, args, remaining_args, rerun=False, **compute_kwargs): """ if not self.prj.interfaces_by_protocol: - pipe_locs = getattr(self.prj.metadata, "pipeline_interfaces", []) + pipe_locs = getattr(self.prj[METADATA_KEY], PIPELINE_INTERFACES_KEY, []) # TODO: should these cases be handled as equally exceptional? # That is, should they either both raise errors, or both log errors? 
if len(pipe_locs) == 0: @@ -820,7 +822,8 @@ def main(): if args.command in ["run", "rerun"]: run = Runner(prj) try: - compute_kwargs = _proc_resources_spec(getattr(args, "resources", "")) + compute_kwargs = _proc_resources_spec( + getattr(args, RESOURCES_KEY, "")) run(args, remaining_args, rerun=(args.command == "rerun"), **compute_kwargs) except IOError: diff --git a/looper/pipeline_interface.py b/looper/pipeline_interface.py index 589637ba5..032fc345c 100644 --- a/looper/pipeline_interface.py +++ b/looper/pipeline_interface.py @@ -27,6 +27,7 @@ PL_KEY = "pipelines" PROTOMAP_KEY = "protocol_mapping" +RESOURCES_KEY = "resources" SUBTYPE_MAPPING_SECTION = "sample_subtypes" @@ -113,21 +114,20 @@ def notify(msg): except KeyError: notify("No compute settings") - res_key = "resources" try: - resources = universal_compute[res_key] + resources = universal_compute[RESOURCES_KEY] except KeyError: try: - resources = pl[res_key] + resources = pl[RESOURCES_KEY] except KeyError: notify("No resources") return {} else: - if res_key in pl: + if RESOURCES_KEY in pl: _LOGGER.warning( "{rk} section found in both {c} section and top-level " "pipelines section of pipeline interface; {c} section " - "version will be used".format(rk=res_key, c=compute_key)) + "version will be used".format(rk=RESOURCES_KEY, c=compute_key)) # Require default resource package specification. try: diff --git a/looper/project.py b/looper/project.py index f71d5a888..d47492c52 100644 --- a/looper/project.py +++ b/looper/project.py @@ -316,5 +316,5 @@ def process_pipeline_interfaces(pipeline_interface_locations): def _is_member(item, items): - """ Determine whether an iterm is a member of a collection. """ + """ Determine whether an item is a member of a collection. """ return item in items diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py index ee9f41cbf..165e267c9 100644 --- a/tests/integration/test_project_get_outputs.py +++ b/tests/integration/test_project_get_outputs.py @@ -46,6 +46,7 @@ def test_no_pifaces(prj, main_meta): @pytest.mark.skip("not implemented") def test_no_outputs(): + """ """ pass diff --git a/tests/models/conftest.py b/tests/models/conftest.py index f672dce10..2d4b2b80a 100644 --- a/tests/models/conftest.py +++ b/tests/models/conftest.py @@ -13,6 +13,7 @@ import pytest import yaml +from looper.pipeline_interface import RESOURCES_KEY from peppy import DEFAULT_COMPUTE_RESOURCES_NAME, METADATA_KEY, \ NAME_TABLE_ATTR, SAMPLE_NAME_COLNAME @@ -21,7 +22,6 @@ __email__ = "vreuter@virginia.edu" - ATAC_PROTOCOL_NAME = "ATAC" CONFIG_FILENAME = "test-proj-conf.yaml" @@ -56,7 +56,6 @@ "time": "30-00:00:00", "partition": "longq"} - def pytest_generate_tests(metafunc): """ Conditional customization of test cases in this directory. """ try: @@ -95,14 +94,12 @@ def pytest_generate_tests(metafunc): } - @pytest.fixture(scope="function") def atac_pipe_name(): """ Oft-used as filename for pipeline module and PipelineInterface key. 
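    :return str: name used both as pipeline module filename and as
        PipelineInterface key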
""" return "ATACSeq.py" - @pytest.fixture(scope="function") def atacseq_iface_with_resources(resources): """ @@ -114,11 +111,10 @@ def atacseq_iface_with_resources(resources): of the base sections plus resources section """ iface_data = copy.deepcopy(ATACSEQ_IFACE_WITHOUT_RESOURCES) - iface_data["resources"] = copy.deepcopy(resources) + iface_data[RESOURCES_KEY] = copy.deepcopy(resources) return iface_data - @pytest.fixture(scope="function") def atacseq_piface_data(atac_pipe_name): """ @@ -131,7 +127,6 @@ def atacseq_piface_data(atac_pipe_name): return {atac_pipe_name: copy.deepcopy(ATACSEQ_IFACE_WITHOUT_RESOURCES)} - @pytest.fixture(scope="function") def basic_data_raw(): return copy.deepcopy( @@ -139,7 +134,6 @@ def basic_data_raw(): "Sample": {SAMPLE_NAME_COLNAME: "arbitrary-sample"}}) - @pytest.fixture(scope="function") def basic_instance_data(request, instance_raw_data): """ @@ -161,14 +155,12 @@ def basic_instance_data(request, instance_raw_data): return transformation_by_class[which_class](instance_raw_data) - @pytest.fixture(scope="function") def default_resources(): """ Provide test case with default PipelineInterface resources section. """ return copy.deepcopy(DEFAULT_RESOURCES) - @pytest.fixture(scope="function") def env_config_filepath(tmpdir): """ Write default project/compute environment file for Project ctor. """ @@ -177,14 +169,12 @@ def env_config_filepath(tmpdir): return conf_file.strpath - @pytest.fixture(scope="function") def huge_resources(): """ Provide non-default resources spec. section for PipelineInterface. """ return copy.deepcopy(HUGE_RESOURCES) - @pytest.fixture(scope="function") def instance_raw_data(request, basic_data_raw, atacseq_piface_data): """ Supply the raw data for a basic model instance as a fixture. """ @@ -249,7 +239,6 @@ def path_config_file(request, tmpdir, atac_pipe_name): conf_data=conf_data, dirpath=tmpdir.strpath) - @pytest.fixture(scope="function") def path_proj_conf_file(tmpdir, proj_conf): """ Write basic project configuration data and provide filepath. """ @@ -259,7 +248,6 @@ def path_proj_conf_file(tmpdir, proj_conf): return conf_path - @pytest.fixture(scope="function") def path_anns_file(request, tmpdir, sample_sheet): """ Write basic annotations, optionally using a different delimiter. """ @@ -273,7 +261,6 @@ def path_anns_file(request, tmpdir, sample_sheet): return filepath - @pytest.fixture(scope="function") def piface_config_bundles(request, resources): """ @@ -300,14 +287,13 @@ def piface_config_bundles(request, resources): raise TypeError("Expected mapping or list collection of " "PipelineInterface data: {} ({})".format( iface_config_datas, type(iface_config_datas))) - resource_specification = request.getfixturevalue("resources") \ - if "resources" in request.fixturenames else resources + resource_specification = request.getfixturevalue(RESOURCES_KEY) \ + if RESOURCES_KEY in request.fixturenames else resources for config_bundle in data_bundles: config_bundle.update(resource_specification) return iface_config_datas - @pytest.fixture(scope="function") def resources(): """ Basic PipelineInterface compute resources data. """ @@ -315,7 +301,6 @@ def resources(): "huge": copy.copy(HUGE_RESOURCES)} - def write_config_data(protomap, conf_data, dirpath): """ Write PipelineInterface data to (temp)file. @@ -334,7 +319,6 @@ def write_config_data(protomap, conf_data, dirpath): return filepath - def _write_config(data, request, filename): """ Write configuration data to file. 
diff --git a/tests/models/test_PipelineInterface.py b/tests/models/test_PipelineInterface.py index fdcb0ab25..486578138 100644 --- a/tests/models/test_PipelineInterface.py +++ b/tests/models/test_PipelineInterface.py @@ -13,12 +13,14 @@ import yaml from attmap import PathExAttMap -from looper.pipeline_interface import PipelineInterface, PL_KEY, PROTOMAP_KEY +from looper.looper import PIPELINE_INTERFACES_KEY +from looper.pipeline_interface import PipelineInterface, PL_KEY, PROTOMAP_KEY, \ + RESOURCES_KEY from looper.project import Project from looper.exceptions import InvalidResourceSpecificationException, \ MissingPipelineConfigurationException, PipelineInterfaceConfigError -from peppy import Project, Sample, \ - DEFAULT_COMPUTE_RESOURCES_NAME, SAMPLE_ANNOTATIONS_KEY, SAMPLE_NAME_COLNAME +from peppy import Project, Sample +from peppy.const import * from .conftest import ATAC_PROTOCOL_NAME, write_config_data from tests.helpers import powerset @@ -77,7 +79,7 @@ def pi_with_resources(request, bundled_piface, resources): rp_data[file_size_name] = size pipe_iface_config = PipelineInterface(bundled_piface) for pipe_data in pipe_iface_config.pipelines.values(): - pipe_data["resources"] = resources + pipe_data[RESOURCES_KEY] = resources return pipe_iface_config @@ -182,7 +184,7 @@ def test_unconfigured_pipeline_exception( if not use_resources: for pipeline in pi.pipelines.values(): try: - del pipeline["resources"][DEFAULT_COMPUTE_RESOURCES_NAME] + del pipeline[RESOURCES_KEY][DEFAULT_COMPUTE_RESOURCES_NAME] except KeyError: # Already no default resource package. pass @@ -256,11 +258,11 @@ def test_requires_default( pi = pi_with_resources for name, pipeline in pi.iterpipes(): try: - del pipeline["resources"][DEFAULT_COMPUTE_RESOURCES_NAME] + del pipeline[RESOURCES_KEY][DEFAULT_COMPUTE_RESOURCES_NAME] except KeyError: # Already no default resource package. pass - assert "default" not in pipeline["resources"] + assert "default" not in pipeline[RESOURCES_KEY] with pytest.raises(InvalidResourceSpecificationException): pi.choose_resource_package( name, file_size=huge_resources["file_size"] + 1) @@ -281,7 +283,7 @@ def test_resources_not_required( """ Compute resource specification is optional. """ pi = pi_with_resources for pipe_data in pi.pipelines.values(): - del pipe_data["resources"] + del pipe_data[RESOURCES_KEY] for pipe_name in pi.pipeline_names: assert {} == pi.choose_resource_package(pipe_name, int(file_size)) assert {} == pi.choose_resource_package(pipe_name, float(file_size)) @@ -295,13 +297,13 @@ def test_selects_proper_resource_package( file_size, expected_package_name, midsize_resources): """ Minimal resource package sufficient for pipeline and file size. """ for pipe_data in pi_with_resources.pipelines.values(): - pipe_data["resources"].update( + pipe_data[RESOURCES_KEY].update( {"midsize": copy.deepcopy(midsize_resources)}) for pipe_name, pipe_data in pi_with_resources.iterpipes(): observed_package = pi_with_resources.choose_resource_package( pipe_name, file_size) expected_package = copy.deepcopy( - pipe_data["resources"][expected_package_name]) + pipe_data[RESOURCES_KEY][expected_package_name]) assert expected_package == observed_package def test_negative_file_size_prohibited( @@ -309,7 +311,7 @@ def test_negative_file_size_prohibited( """ Negative min file size in resource package spec is prohibited. 
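        :param bool use_new_file_size: whether to exercise the newer
            'min_file_size' key rather than the original 'file_size'
        :param pi_with_resources: PipelineInterface with resources data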
""" file_size_attr = "min_file_size" if use_new_file_size else "file_size" for pipe_data in pi_with_resources.pipelines.values(): - for package_data in pipe_data["resources"].values(): + for package_data in pipe_data[RESOURCES_KEY].values(): package_data[file_size_attr] = -5 * random.random() for pipe_name in pi_with_resources.pipeline_names: file_size_request = random.randrange(1, 11) @@ -342,13 +344,13 @@ def clear_file_size(resource_package): # Add resource package spec data and create PipelineInterface. pipe_iface_data = copy.deepcopy(bundled_piface) for pipe_data in pipe_iface_data[PL_KEY].values(): - pipe_data["resources"] = resources_data + pipe_data[RESOURCES_KEY] = resources_data pi = PipelineInterface(pipe_iface_data) # We should always get default resource package for mini file. for pipe_name, pipe_data in pi.iterpipes(): default_resource_package = \ - pipe_data["resources"][DEFAULT_COMPUTE_RESOURCES_NAME] + pipe_data[RESOURCES_KEY][DEFAULT_COMPUTE_RESOURCES_NAME] clear_file_size(default_resource_package) assert default_resource_package == \ pi.choose_resource_package(pipe_name, 0.001) @@ -361,7 +363,7 @@ def test_default_package_new_name_zero_size( for pipe_name, pipe_data in pi_with_resources.iterpipes(): # Establish faulty default package setting for file size. - default_resource_package = pipe_data["resources"]["default"] + default_resource_package = pipe_data[RESOURCES_KEY]["default"] if use_new_file_size: if "file_size" in default_resource_package: del default_resource_package["file_size"] @@ -403,7 +405,7 @@ def test_file_size_spec_required_for_non_default_packages( # Create the PipelineInterface. for pipe_data in bundled_piface[PL_KEY].values(): - pipe_data["resources"] = resource_package_data + pipe_data[RESOURCES_KEY] = resource_package_data pi = PipelineInterface(bundled_piface) # Attempt to select resource package should fail for each pipeline, @@ -625,7 +627,7 @@ class GenericProtocolMatchTests: @pytest.fixture def prj_data(self): """ Provide basic Project data. """ - return {"metadata": {"output_dir": "output", + return {METADATA_KEY: {"output_dir": "output", "results_subdir": "results_pipeline", "submission_subdir": "submission"}} @@ -649,8 +651,8 @@ def iface_paths(self, tmpdir): @pytest.fixture def prj(self, tmpdir, prj_data, anns_file, iface_paths): """ Provide basic Project. 
""" - prj_data["pipeline_interfaces"] = iface_paths - prj_data["metadata"][SAMPLE_ANNOTATIONS_KEY] = anns_file + prj_data[PIPELINE_INTERFACES_KEY] = iface_paths + prj_data[METADATA_KEY][SAMPLE_ANNOTATIONS_KEY] = anns_file prj_file = tmpdir.join("pconf.yaml").strpath with open(prj_file, 'w') as f: yaml.dump(prj_data, f) diff --git a/tests/test_submission_scripts.py b/tests/test_submission_scripts.py index e5547d98c..f0f0b3bac 100644 --- a/tests/test_submission_scripts.py +++ b/tests/test_submission_scripts.py @@ -14,7 +14,8 @@ import looper from looper.const import * from looper.looper import Project -from looper.utils import fetch_sample_flags, sample_folder +from looper.pipeline_interface import RESOURCES_KEY +from looper.utils import fetch_sample_flags, sample_folder from peppy import ASSAY_KEY, SAMPLE_ANNOTATIONS_KEY, SAMPLE_NAME_COLNAME, \ SAMPLE_SUBANNOTATIONS_KEY @@ -26,7 +27,6 @@ ATAC_PIPE = "pepatac.py" PIPE_NAME_KEY = "name" PIPE_PATH_KEY = "path" -PIPE_RESOURCES_KEY = "resources" SAMPLE_METADATA_HEADER = [SAMPLE_NAME_COLNAME, ASSAY_KEY] ASSAYS = ["WGBS", "WGBS", "ATAC", "ATAC"] SAMPLE_METADATA_RECORDS = [("sample" + str(i), p) for i, p in enumerate(ASSAYS)] @@ -35,11 +35,11 @@ DEFAULT_RESOURCES_KEY = "default" ATAC_SPEC = { PIPE_NAME_KEY: "PEPATAC", PIPE_PATH_KEY: ATAC_PIPE, - PIPE_RESOURCES_KEY: {DEFAULT_RESOURCES_KEY: DEFAULT_RESOURCES} + RESOURCES_KEY: {DEFAULT_RESOURCES_KEY: DEFAULT_RESOURCES} } WGBS_SPEC = { PIPE_NAME_KEY: "WGBS", PIPE_PATH_KEY: WGBS_PIPE, - PIPE_RESOURCES_KEY: {DEFAULT_RESOURCES_KEY: DEFAULT_RESOURCES} + RESOURCES_KEY: {DEFAULT_RESOURCES_KEY: DEFAULT_RESOURCES} } PIPE_SPECS = {"pepatac.py": ATAC_SPEC, "wgbs.py": WGBS_SPEC} PLIFACE_DATA = { From aa9e4b0b6d1aac7ec6fba77fcea3faf32aa21938 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 19 Apr 2019 13:08:21 -0400 Subject: [PATCH 06/61] more var sharing --- looper/project.py | 12 ++++++------ tests/models/conftest.py | 19 ++++++++----------- tests/models/test_PipelineInterface.py | 11 ++++++++--- tests/test_submission_scripts.py | 24 +++++++++++------------- 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/looper/project.py b/looper/project.py index d47492c52..3fce09735 100644 --- a/looper/project.py +++ b/looper/project.py @@ -6,6 +6,7 @@ import os import peppy +from peppy import OUTDIR_KEY from peppy.utils import is_command_callable from .const import * from .exceptions import DuplicatePipelineKeyException @@ -41,12 +42,12 @@ def __init__(self, config_file, subproject=None, **kwargs): @property def project_folders(self): """ Keys for paths to folders to ensure exist. """ - return ["output_dir", RESULTS_SUBDIR_KEY, SUBMISSION_SUBDIR_KEY] + return [OUTDIR_KEY, RESULTS_SUBDIR_KEY, SUBMISSION_SUBDIR_KEY] @property def required_metadata(self): """ Which metadata attributes are required. 
""" - return ["output_dir"] + return [OUTDIR_KEY] def build_submission_bundles(self, protocol, priority=True): """ @@ -294,13 +295,12 @@ def process_pipeline_interfaces(pipeline_interface_locations): interface_by_protocol = defaultdict(list) for pipe_iface_location in pipeline_interface_locations: if not os.path.exists(pipe_iface_location): - _LOGGER.warning( - "Ignoring nonexistent pipeline interface location: '%s'", - pipe_iface_location) + _LOGGER.warning("Ignoring nonexistent pipeline interface location: " + "{}".format(pipe_iface_location)) continue pipe_iface = PipelineInterface(pipe_iface_location) for proto_name in pipe_iface.protocol_mapping: - _LOGGER.whisper("Adding protocol name: '%s'", proto_name) + _LOGGER.whisper("Adding protocol name: {}".format(proto_name)) interface_by_protocol[proto_name].append(pipe_iface) return interface_by_protocol diff --git a/tests/models/conftest.py b/tests/models/conftest.py index 2d4b2b80a..4f6b382a8 100644 --- a/tests/models/conftest.py +++ b/tests/models/conftest.py @@ -8,12 +8,10 @@ from collections import Iterable, Mapping else: from collections.abc import Iterable, Mapping - import pandas as pd import pytest import yaml - -from looper.pipeline_interface import RESOURCES_KEY +from looper.pipeline_interface import PROTOMAP_KEY, RESOURCES_KEY from peppy import DEFAULT_COMPUTE_RESOURCES_NAME, METADATA_KEY, \ NAME_TABLE_ATTR, SAMPLE_NAME_COLNAME @@ -72,7 +70,6 @@ def pytest_generate_tests(metafunc): {"name": "sans-path"})]) - ATACSEQ_IFACE_WITHOUT_RESOURCES = { "name": "ATACseq", "looper_args": True, @@ -148,8 +145,8 @@ def basic_instance_data(request, instance_raw_data): # Cleanup is free with _write_config, using request's temp folder. transformation_by_class = { "PathExAttMap": lambda data: data, - "PipelineInterface": lambda data: - _write_config(data, request, "pipeline_interface.yaml"), + "PipelineInterface": lambda data: _write_config( + data, request, "pipeline_interface.yaml"), "Sample": lambda data: pd.Series(data)} which_class = request.getfixturevalue("class_name") return transformation_by_class[which_class](instance_raw_data) @@ -284,11 +281,11 @@ def piface_config_bundles(request, resources): elif isinstance(iface_config_datas, Iterable): data_bundles = iface_config_datas else: - raise TypeError("Expected mapping or list collection of " - "PipelineInterface data: {} ({})".format( - iface_config_datas, type(iface_config_datas))) + raise TypeError( + "Expected mapping or list collection of PipelineInterface data: {} " + "({})".format(iface_config_datas, type(iface_config_datas))) resource_specification = request.getfixturevalue(RESOURCES_KEY) \ - if RESOURCES_KEY in request.fixturenames else resources + if RESOURCES_KEY in request.fixturenames else resources for config_bundle in data_bundles: config_bundle.update(resource_specification) return iface_config_datas @@ -312,7 +309,7 @@ def write_config_data(protomap, conf_data, dirpath): file to write :return str: path to the (temp)file written """ - full_conf_data = {"protocol_mapping": protomap, "pipelines": conf_data} + full_conf_data = {PROTOMAP_KEY: protomap, "pipelines": conf_data} filepath = os.path.join(dirpath, "pipeline_interface.yaml") with open(filepath, 'w') as conf_file: yaml.safe_dump(full_conf_data, conf_file) diff --git a/tests/models/test_PipelineInterface.py b/tests/models/test_PipelineInterface.py index 486578138..1fae26740 100644 --- a/tests/models/test_PipelineInterface.py +++ b/tests/models/test_PipelineInterface.py @@ -13,6 +13,7 @@ import yaml from attmap import 
PathExAttMap +from looper.const import * from looper.looper import PIPELINE_INTERFACES_KEY from looper.pipeline_interface import PipelineInterface, PL_KEY, PROTOMAP_KEY, \ RESOURCES_KEY @@ -627,9 +628,13 @@ class GenericProtocolMatchTests: @pytest.fixture def prj_data(self): """ Provide basic Project data. """ - return {METADATA_KEY: {"output_dir": "output", - "results_subdir": "results_pipeline", - "submission_subdir": "submission"}} + return { + METADATA_KEY: { + OUTDIR_KEY: "output", + RESULTS_SUBDIR_KEY: "results_pipeline", + SUBMISSION_SUBDIR_KEY: "submission" + } + } @pytest.fixture def sheet_lines(self): diff --git a/tests/test_submission_scripts.py b/tests/test_submission_scripts.py index f0f0b3bac..6dbe21204 100644 --- a/tests/test_submission_scripts.py +++ b/tests/test_submission_scripts.py @@ -2,7 +2,6 @@ from collections import OrderedDict import copy -from functools import partial import glob import itertools import os @@ -10,11 +9,11 @@ import pytest import yaml -from peppy import FLAGS +from peppy import FLAGS, METADATA_KEY, OUTDIR_KEY import looper from looper.const import * -from looper.looper import Project -from looper.pipeline_interface import RESOURCES_KEY +from looper.looper import Project, PIPELINE_INTERFACES_KEY +from looper.pipeline_interface import PROTOMAP_KEY, RESOURCES_KEY from looper.utils import fetch_sample_flags, sample_folder from peppy import ASSAY_KEY, SAMPLE_ANNOTATIONS_KEY, SAMPLE_NAME_COLNAME, \ SAMPLE_SUBANNOTATIONS_KEY @@ -43,7 +42,7 @@ } PIPE_SPECS = {"pepatac.py": ATAC_SPEC, "wgbs.py": WGBS_SPEC} PLIFACE_DATA = { - "protocol_mapping": {"ATAC": ATAC_PIPE, "WGBS": WGBS_PIPE}, + PROTOMAP_KEY: {"ATAC": ATAC_PIPE, "WGBS": WGBS_PIPE}, "pipelines": PIPE_SPECS } @@ -100,10 +99,10 @@ def prj(request, tmpdir): yaml.dump(PLIFACE_DATA, f) _touch_pipe_files(outdir, PLIFACE_DATA) metadata = {SAMPLE_ANNOTATIONS_KEY: anns, - "output_dir": outdir, "pipeline_interfaces": pipe_iface_path} + OUTDIR_KEY: outdir, PIPELINE_INTERFACES_KEY: pipe_iface_path} if subanns: metadata[SAMPLE_SUBANNOTATIONS_KEY] = subanns - prjdat = {"metadata": metadata} + prjdat = {METADATA_KEY: metadata} with open(conf_path, 'w') as f: yaml.dump(prjdat, f) @@ -134,7 +133,6 @@ def validate_submission_count(project, conductors): "Expected {} submissions but tallied {}".format(num_exp, num_obs) - def validate_submission_scripts(project, _): """ Check bijection between a project's samples and its submission scripts. 
@@ -274,7 +272,7 @@ def test_ignoring_flags(prj, flag_name, flagged_sample_names, validate): assert len(flagged_sample_names) == len(preexisting) assert set(flag_files_made) == set(itertools.chain(*preexisting.values())) conductors, pipe_keys = process_protocols( - prj, set(PLIFACE_DATA["protocol_mapping"].keys()), ignore_flags=True) + prj, set(PLIFACE_DATA[PROTOMAP_KEY].keys()), ignore_flags=True) assert all(map(lambda c: c.ignore_flags, conductors.values())), \ "Failed to establish precondition, that flags are to be ignored" for s in prj.samples: @@ -302,18 +300,18 @@ def test_convergent_protocol_mapping_keys(tmpdir): with open(anns_path, 'w') as f: f.write(os.linesep.join(sep.join(r) for r in records)) - pliface_data = {"protocol_mapping": dict(protomap), "pipelines": PIPE_SPECS} + pliface_data = {PROTOMAP_KEY: dict(protomap), "pipelines": PIPE_SPECS} pliface_filepath = os.path.join(outdir, "pipes.yaml") with open(pliface_filepath, 'w') as f: yaml.dump(pliface_data, f) - metadata = {"output_dir": outdir, SAMPLE_ANNOTATIONS_KEY: anns_path, + metadata = {OUTDIR_KEY: outdir, SAMPLE_ANNOTATIONS_KEY: anns_path, "pipeline_interfaces": pliface_filepath} _touch_pipe_files(tmpdir.strpath, pliface_data) - prjdat = {"metadata": metadata} + prjdat = {METADATA_KEY: metadata} pcfg = tmpdir.join("prj.yaml").strpath with open(pcfg, 'w') as f: yaml.dump(prjdat, f) @@ -382,7 +380,7 @@ def _process_base_pliface(prj, **kwargs): protocol name to collection of keys for pipelines for that protocol """ return process_protocols( - prj, set(PLIFACE_DATA["protocol_mapping"].keys()), **kwargs) + prj, set(PLIFACE_DATA[PROTOMAP_KEY].keys()), **kwargs) def _mkflag(sample, prj, flag): From 24e7236b7e6be51240ad00b7c243b68ff9e38921 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 19 Apr 2019 15:34:05 -0400 Subject: [PATCH 07/61] amid project interfaces reconfiguration --- looper/const.py | 4 +- looper/html_reports.py | 2 +- looper/looper.py | 7 +- looper/pipeline_interface.py | 1 - looper/project.py | 49 +++++++---- looper/project_piface_group.py | 116 +++++++++++++++++++++++++ tests/models/test_PipelineInterface.py | 1 - tests/test_submission_scripts.py | 2 +- 8 files changed, 155 insertions(+), 27 deletions(-) create mode 100644 looper/project_piface_group.py diff --git a/looper/const.py b/looper/const.py index 56a7f96e2..69470e16e 100644 --- a/looper/const.py +++ b/looper/const.py @@ -5,7 +5,8 @@ __all__ = ["APPEARANCE_BY_FLAG", "NO_DATA_PLACEHOLDER", "OUTKEY", - "RESULTS_SUBDIR_KEY", "SUBMISSION_SUBDIR_KEY", "TEMPLATES_DIRNAME"] + "PIPELINE_INTERFACES_KEY", "RESULTS_SUBDIR_KEY", + "SUBMISSION_SUBDIR_KEY", "TEMPLATES_DIRNAME"] APPEARANCE_BY_FLAG = { "completed": { @@ -30,6 +31,7 @@ } } NO_DATA_PLACEHOLDER = "NA" +PIPELINE_INTERFACES_KEY = "pipeline_interfaces" OUTKEY = "outputs" RESULTS_SUBDIR_KEY = "results_subdir" SUBMISSION_SUBDIR_KEY = "submission_subdir" diff --git a/looper/html_reports.py b/looper/html_reports.py index d2a9a3816..199ca02b9 100644 --- a/looper/html_reports.py +++ b/looper/html_reports.py @@ -492,7 +492,7 @@ def create_project_objects(self): figures = [] links = [] warnings = [] - ifaces = self.prj.interfaces_by_protocol[protocol] + ifaces = self.prj.get_interfaces(protocol) # Check the interface files for summarizers for iface in ifaces: diff --git a/looper/looper.py b/looper/looper.py index 8a0c18abf..b98e88d7e 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -37,7 +37,6 @@ from peppy import ProjectContext, METADATA_KEY, SAMPLE_EXECUTION_TOGGLE -PIPELINE_INTERFACES_KEY = 
"pipeline_interfaces" SUBMISSION_FAILURE_MESSAGE = "Cluster resource failure" @@ -252,7 +251,7 @@ def process_protocols(prj, protocols, resource_setting_kwargs=None, **kwargs): comp_vars.update(resource_setting_kwargs or {}) _LOGGER.info("Known protocols: {}".format( - ", ".join(prj.interfaces_by_protocol.keys()))) + ", ".join(prj.interfaces.protocols))) for proto in set(protocols) | {GENERIC_PROTOCOL_KEY}: _LOGGER.debug("Determining sample type, script, and flags for " @@ -289,7 +288,7 @@ def __call__(self, args, remaining_args, rerun=False, **compute_kwargs): run for the first time """ - if not self.prj.interfaces_by_protocol: + if not self.prj.interfaces: pipe_locs = getattr(self.prj[METADATA_KEY], PIPELINE_INTERFACES_KEY, []) # TODO: should these cases be handled as equally exceptional? # That is, should they either both raise errors, or both log errors? @@ -496,7 +495,7 @@ def _run_custom_summarizers(project): for protocol in set(all_protocols): try: - ifaces = project.interfaces_by_protocol[protocol] + ifaces = project.get_interfaces(protocol) except KeyError: _LOGGER.warning("No interface for protocol '{}', skipping summary".format(protocol)) continue diff --git a/looper/pipeline_interface.py b/looper/pipeline_interface.py index 032fc345c..3c6ce1603 100644 --- a/looper/pipeline_interface.py +++ b/looper/pipeline_interface.py @@ -553,7 +553,6 @@ def expand_pl_paths(piface): return piface - def standardize_protocols(piface): """ Handle casing and punctuation of protocol keys in pipeline interface. diff --git a/looper/project.py b/looper/project.py index 3fce09735..ff67b6a29 100644 --- a/looper/project.py +++ b/looper/project.py @@ -1,16 +1,17 @@ """ Looper version of NGS project model. """ -from collections import defaultdict, namedtuple +from collections import namedtuple from functools import partial import itertools import os import peppy -from peppy import OUTDIR_KEY +from peppy import METADATA_KEY, OUTDIR_KEY from peppy.utils import is_command_callable from .const import * from .exceptions import DuplicatePipelineKeyException -from .pipeline_interface import PipelineInterface, PROTOMAP_KEY +from .pipeline_interface import PROTOMAP_KEY +from .project_piface_group import ProjectPifaceGroup from .utils import get_logger, partition @@ -36,8 +37,8 @@ def __init__(self, config_file, subproject=None, **kwargs): config_file, subproject=subproject, no_environment_exception=RuntimeError, no_compute_exception=RuntimeError, **kwargs) - self.interfaces_by_protocol = \ - process_pipeline_interfaces(self.metadata.pipeline_interfaces) + self.interfaces = process_pipeline_interfaces( + self[METADATA_KEY][PIPELINE_INTERFACES_KEY]) @property def project_folders(self): @@ -81,8 +82,7 @@ def build_submission_bundles(self, protocol, priority=True): # sort of pool of information about possible ways in which to submit # pipeline(s) for sample(s) of the indicated protocol. try: - pipeline_interfaces = \ - self.interfaces_by_protocol[protocol] + pipeline_interfaces = self.get_interfaces(protocol) except KeyError: # Messaging can be done by the caller. _LOGGER.debug("No interface for protocol: %s", protocol) @@ -205,6 +205,18 @@ def build_submission_bundles(self, protocol, priority=True): else: return list(itertools.chain(*job_submission_bundles)) + def get_interfaces(self, protocol): + """ + Get the pipeline interfaces associated with the given protocol. 
+ + :param str protocol: name of the protocol for which to get interfaces + :return Iterable[looper.PipelineInterface]: collection of pipeline + interfaces associated with the given protocol + :raise KeyError: if the given protocol is not (perhaps yet) mapped + to any pipeline interface + """ + return self.interfaces[protocol] + def get_outputs(self): """ Map pipeline identifier to collection of output specifications. @@ -227,8 +239,7 @@ def get_outputs(self): path template and the second component is a collection of sample names """ - prots_data_pairs = \ - _gather_ifaces(itertools.chain(*self.interfaces_by_protocol.values())) + prots_data_pairs = _gather_ifaces(self.interfaces) m = {} for name, (prots, data) in prots_data_pairs.items(): snames = [s.name for s in self.samples if s.protocol in prots] @@ -292,17 +303,19 @@ def process_pipeline_interfaces(pipeline_interface_locations): :return Mapping[str, Iterable[PipelineInterface]]: mapping from protocol name to interface(s) for which that protocol is mapped """ - interface_by_protocol = defaultdict(list) - for pipe_iface_location in pipeline_interface_locations: - if not os.path.exists(pipe_iface_location): + iface_group = ProjectPifaceGroup() + for loc in pipeline_interface_locations: + if not os.path.exists(loc): _LOGGER.warning("Ignoring nonexistent pipeline interface location: " - "{}".format(pipe_iface_location)) + "{}".format(loc)) continue - pipe_iface = PipelineInterface(pipe_iface_location) - for proto_name in pipe_iface.protocol_mapping: - _LOGGER.whisper("Adding protocol name: {}".format(proto_name)) - interface_by_protocol[proto_name].append(pipe_iface) - return interface_by_protocol + fs = [loc] if os.path.isfile(loc) else \ + [os.path.join(loc, f) for f in os.listdir(loc) + if os.path.splitext(f)[1] in [".yaml", ".yml"]] + for f in fs: + _LOGGER.debug("Processing interface definition: {}".format(f)) + iface_group.update(f) + return iface_group OutputGroup = namedtuple("OutputGroup", field_names=["path", "samples"]) diff --git a/looper/project_piface_group.py b/looper/project_piface_group.py new file mode 100644 index 000000000..e3bc90212 --- /dev/null +++ b/looper/project_piface_group.py @@ -0,0 +1,116 @@ +""" Group of Project's PipelineInterface instances """ + +import sys +if sys.version_info < (3, 3): + from collections import Mapping +else: + from collections.abc import Mapping +from .pipeline_interface import PipelineInterface, PROTOMAP_KEY +from .utils import get_logger + +__author__ = "Vince Reuter" +__email__ = "vreuter@virginia.edu" + + +_LOGGER = get_logger(__name__) + + +class ProjectPifaceGroup(object): + """ Collection of PipelineInterface instances and lookup-by-protocol. """ + + def __init__(self, piface=None): + """ + Create the group, either empty or with initial data. + + :param str | Mapping | looper.PipelineInterface piface: either pipeline + interface file, pipeline interface, or interface-defining mapping + """ + self._interfaces = [] + self._indices_by_protocol = {} + piface and self.update(piface) + + def __eq__(self, other): + """ + Instances are equivalent iff interfaces and protocol mappings are. 
+ + :param looper.project_piface_group.ProjectPifaceGroup other: the group + to compare to this one + :return bool: whether this group is equivalent to the compared one + """ + return isinstance(other, ProjectPifaceGroup) and \ + self._interfaces == other._interfaces and \ + self._indices_by_protocol == other._indices_by_protocol + + def __ne__(self, other): + """ Leverage the overridden equivalence operator. """ + return not self == other + + def __getitem__(self, item): + """ + Retrieve interfaces for given protocol name. + + :param str item: name of protocol for which to fetch interfaces. + :return Iterable[looper.PipelineInterface]: + """ + return [self._interfaces[i] for i in self._indices_by_protocol[item]] + + def __iter__(self): + """ + Iteration is over the interfaces. + + :return Iterable[looper.PipelineInterface]: iterator over this group's + PipelineInterface instances + """ + return iter(self._interfaces) + + def __len__(self): + """ + Group size is the number of interfaces. + + :return int: number of interfaces in this group + """ + return sum(1 for _ in iter(self)) + + @property + def protocols(self): + """ + Get the collection of names of protocols mapping into this group. + + :return list[str]: collection of protocol names that map to at least + one pipeline represented by an interface in this group + """ + return [p for p in self._indices_by_protocol] + + def update(self, piface): + """ + Add a pipeline interface to this group. + + :param str | Mapping | looper.PipelineInterface piface: either pipeline + interface file, pipeline interface, or interface-defining mapping + :return looper.project_piface_group.ProjectPifaceGroup: updated instance + :raise TypeError: if the argument to the piface parameter is neither + text (filepath) nor a PipelineInterface or Mapping; additional + exception cases may arise from ensuing attempt to create a + PipelineInterface from the argument if the argument itself is not + already a PipelineInterface. 
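+
+        A minimal usage sketch (the interface file path is hypothetical):
+
+            group = ProjectPifaceGroup()
+            group.update("/path/to/pipeline_interface.yaml")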
+ """ + if isinstance(piface, (str, Mapping)): + piface = PipelineInterface(piface) + elif not isinstance(piface, PipelineInterface): + raise TypeError( + "Update value must be {obj}-defining filepath or {obj} itself; " + "got {argtype}".format( + obj=PipelineInterface.__name__, argtype=type(piface))) + assert isinstance(piface, PipelineInterface) + for curr in self._interfaces: + if curr == piface: + _LOGGER.whisper( + "Found match existing {} match: {}".format( + PipelineInterface.__class__.__name__, piface)) + break + else: + self._interfaces.append(piface) + i = len(self._interfaces) - 1 + for p in piface[PROTOMAP_KEY]: + self._indices_by_protocol.setdefault(p, []).append(i) + return self diff --git a/tests/models/test_PipelineInterface.py b/tests/models/test_PipelineInterface.py index 1fae26740..b67dac6cb 100644 --- a/tests/models/test_PipelineInterface.py +++ b/tests/models/test_PipelineInterface.py @@ -14,7 +14,6 @@ from attmap import PathExAttMap from looper.const import * -from looper.looper import PIPELINE_INTERFACES_KEY from looper.pipeline_interface import PipelineInterface, PL_KEY, PROTOMAP_KEY, \ RESOURCES_KEY from looper.project import Project diff --git a/tests/test_submission_scripts.py b/tests/test_submission_scripts.py index 6dbe21204..65558a934 100644 --- a/tests/test_submission_scripts.py +++ b/tests/test_submission_scripts.py @@ -12,7 +12,7 @@ from peppy import FLAGS, METADATA_KEY, OUTDIR_KEY import looper from looper.const import * -from looper.looper import Project, PIPELINE_INTERFACES_KEY +from looper.looper import Project from looper.pipeline_interface import PROTOMAP_KEY, RESOURCES_KEY from looper.utils import fetch_sample_flags, sample_folder from peppy import ASSAY_KEY, SAMPLE_ANNOTATIONS_KEY, SAMPLE_NAME_COLNAME, \ From b7a0e964b99d34928a6f1bace4ba06009a1825c7 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 19 Apr 2019 15:47:34 -0400 Subject: [PATCH 08/61] autodoc config for API; fix links --- docs/autodoc_build/api.md | 284 ----------------------------------- docs/autodoc_build/looper.md | 49 +++++- docs/features.md | 20 +-- mkdocs.yml | 6 +- 4 files changed, 62 insertions(+), 297 deletions(-) delete mode 100644 docs/autodoc_build/api.md diff --git a/docs/autodoc_build/api.md b/docs/autodoc_build/api.md deleted file mode 100644 index 6a1fb503b..000000000 --- a/docs/autodoc_build/api.md +++ /dev/null @@ -1,284 +0,0 @@ -# Package looper Documentation - -## Class Project -Looper-specific NGS Project. - -**Parameters:** - -- `config_file` -- `str`: path to configuration file with data fromwhich Project is to be built -- `subproject` -- `str`: name indicating subproject to use, optional - - -### constants -Return key-value pairs of pan-Sample constants for this Project. -```python -def constants(self) -``` - -**Returns:** - -`Mapping`: collection of KV pairs, each representing a pairingof attribute name and attribute value - - - - -### derived\_columns -Collection of sample attributes for which value of each is derived from elsewhere -```python -def derived_columns(self) -``` - -**Returns:** - -`list[str]`: sample attribute names for which value is derived - - - - -### implied\_columns -Collection of sample attributes for which value of each is implied by other(s) -```python -def implied_columns(self) -``` - -**Returns:** - -`list[str]`: sample attribute names for which value is implied by other(s) - - - - -### num\_samples -Count the number of samples available in this Project. 
-```python -def num_samples(self) -``` - -**Returns:** - -`int`: number of samples available in this Project. - - - - -### output\_dir -Directory in which to place results and submissions folders. - -By default, assume that the project's configuration file specifies -an output directory, and that this is therefore available within -the project metadata. If that assumption does not hold, though, -consider the folder in which the project configuration file lives -to be the project's output directory. -```python -def output_dir(self) -``` - -**Returns:** - -`str`: path to the project's output directory, either asspecified in the configuration file or the folder that contains the project's configuration file. - - - - -### project\_folders -Keys for paths to folders to ensure exist. -```python -def project_folders(self) -``` - - - - -### protocols -Determine this Project's unique protocol names. -```python -def protocols(self) -``` - -**Returns:** - -`Set[str]`: collection of this Project's unique protocol names - - - - -### required\_metadata -Which metadata attributes are required. -```python -def required_metadata(self) -``` - - - - -### sample\_names -Names of samples of which this Project is aware. -```python -def sample_names(self) -``` - - - - -### samples -Generic/base Sample instance for each of this Project's samples. -```python -def samples(self) -``` - -**Returns:** - -`Iterable[Sample]`: Sample instance for eachof this Project's samples - - - - -### sheet -Annotations/metadata sheet describing this Project's samples. -```python -def sheet(self) -``` - -**Returns:** - -`pandas.core.frame.DataFrame`: table of samples in this Project - - - - -### subproject -Return currently active subproject or None if none was activated -```python -def subproject(self) -``` - -**Returns:** - -`str`: currently active subproject - - - - -### templates\_folder -Path to folder with default submission templates. -```python -def templates_folder(self) -``` - -**Returns:** - -`str`: path to folder with default submission templates - - - - -## Class MissingMetadataException -Project needs certain metadata. - - -## Class MissingSampleSheetError -Represent case in which sample sheet is specified but nonexistent. - - -## Class PipelineInterface -This class parses, holds, and returns information for a yaml file that specifies how to interact with each individual pipeline. This includes both resources to request for cluster job submission, as well as arguments to be passed from the sample annotation metadata to the pipeline - -**Parameters:** - -- `config` -- `str | Mapping`: path to file from which to parseconfiguration data, or pre-parsed configuration data. - - -### pipe\_iface -Old-way access to pipeline key-to-interface mapping -```python -def pipe_iface(self) -``` - -**Returns:** - -`Mapping`: Binding between pipeline key and interface data - - - - -### pipeline\_names -Names of pipelines about which this interface is aware. -```python -def pipeline_names(self) -``` - -**Returns:** - -`Iterable[str]`: names of pipelines about which thisinterface is aware - - - - -### pipelines\_path -Path to pipelines folder. -```python -def pipelines_path(self) -``` - -**Returns:** - -`str | None`: Path to pipelines folder, if configured withfile rather than with raw mapping. - - - - -### protomap -Access protocol mapping portion of this composite interface. -```python -def protomap(self) -``` - -**Returns:** - -`Mapping`: binding between protocol name and pipeline key. 
- - - - -## Class SubmissionConductor -Collects and then submits pipeline jobs. - -This class holds a 'pool' of commands to submit as a single cluster job. -Eager to submit a job, each instance's collection of commands expands until -it reaches the 'pool' has been filled, and it's therefore time to submit the -job. The pool fills as soon as a fill criteria has been reached, which can -be either total input file size or the number of individual commands. - - -### failed\_samples -```python -def failed_samples(self) -``` - - - -### num\_cmd\_submissions -Return the number of commands that this conductor has submitted. -```python -def num_cmd_submissions(self) -``` - -**Returns:** - -`int`: Number of commands submitted so far. - - - - -### num\_job\_submissions -Return the number of jobs that this conductor has submitted. -```python -def num_job_submissions(self) -``` - -**Returns:** - -`int`: Number of jobs submitted so far. - - - diff --git a/docs/autodoc_build/looper.md b/docs/autodoc_build/looper.md index c63c92a11..81bfb2a41 100644 --- a/docs/autodoc_build/looper.md +++ b/docs/autodoc_build/looper.md @@ -65,6 +65,53 @@ def derived_columns(self) +### get\_interfaces +Get the pipeline interfaces associated with the given protocol. +```python +def get_interfaces(self, protocol) +``` + +**Parameters:** + +- `protocol` -- `str`: name of the protocol for which to get interfaces + + +**Returns:** + +`Iterable[looper.PipelineInterface]`: collection of pipelineinterfaces associated with the given protocol + + +**Raises:** + +- `KeyError`: if the given protocol is not (perhaps yet) mappedto any pipeline interface + + + + +### get\_outputs +Map pipeline identifier to collection of output specifications. + +This method leverages knowledge of two collections of different kinds +of entities that meet in the manifestation of a Project. The first +is a collection of samples, which is known even in peppy.Project. The +second is a mapping from protocol/assay/library strategy to a collection +of pipeline interfaces, in which kinds of output may be declared. +Knowledge of these two items is here harnessed to map the identifier +for each pipeline about which this Project is aware to a collection of +pairs of identifier for a kind of output and the collection of +this Project's samples for which it's applicable (i.e., those samples +with protocol that maps to the corresponding pipeline). 
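+For example (hypothetical values), a project whose samples' protocol maps to
+a pipeline keyed "PEPATAC" declaring a "bam" output might return
+`{"PEPATAC": {"bam": ("{sample.name}/aligned.bam", ["s1", "s2"])}}`.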
+```python +def get_outputs(self) +``` + +**Returns:** + +`Mapping[str, Mapping[str, namedtuple]]`: collection of bindingsbetween identifier for pipeline and collection of bindings between name for a kind of output and pair in which first component is a path template and the second component is a collection of sample names + + + + ### implied\_columns Collection of sample attributes for which value of each is implied by other(s) ```python @@ -651,4 +698,4 @@ def write_skipped_sample_scripts(self) -**Version Information**: `looper` v0.11.0dev, generated by `lucidoc` v0.3 \ No newline at end of file +**Version Information**: `looper` v0.12dev, generated by `lucidoc` v0.3.1 \ No newline at end of file diff --git a/docs/features.md b/docs/features.md index 9cac4cf25..17034cc27 100644 --- a/docs/features.md +++ b/docs/features.md @@ -1,15 +1,15 @@ # Features and benefits -[cli]: ../img/cli.svg -[computing]: ../img/computing.svg -[flexible_pipelines]: ../img/flexible_pipelines.svg -[job_monitoring]: ../img/job_monitoring.svg -[resources]: ../img/resources.svg -[subprojects]: ../img/subprojects.svg -[collate]: ../img/collate.svg -[file_yaml]: ../img/file_yaml.svg -[html]: ../img/HTML.svg -[modular]: ../img/modular.svg +[cli]: img/cli.svg +[computing]: img/computing.svg +[flexible_pipelines]: img/flexible_pipelines.svg +[job_monitoring]: img/job_monitoring.svg +[resources]: img/resources.svg +[subprojects]: img/subprojects.svg +[collate]: img/collate.svg +[file_yaml]: img/file_yaml.svg +[html]: img/HTML.svg +[modular]: img/modular.svg ![modular][modular] **Modular approach to job handling** diff --git a/mkdocs.yml b/mkdocs.yml index 1dc2f8948..4f9de7595 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -20,6 +20,7 @@ nav: - Reference: - Usage: usage.md - Configuration files: config-files.md + - API: autodoc_build/looper.md - FAQ: faq.md - Support: support.md - Contributing: contributing.md @@ -29,8 +30,9 @@ theme: databio plugins: - databio: + autodoc_build: "docs/autodoc_build" jupyter_source: "docs_jupyter" jupyter_build: "docs_jupyter/build" + autodoc_package: "looper" + no_top_level: true - search - - From 3b384bf30bf196ae212a6c5b09d9902cc7b5726d Mon Sep 17 00:00:00 2001 From: Vince Date: Fri, 19 Apr 2019 19:03:04 -0400 Subject: [PATCH 09/61] handling Project interfaces; update reqs --- looper/project.py | 29 ++++++++++++++++++++++++++--- requirements/requirements-all.txt | 2 +- tests/test_basic_interface_group.py | 0 3 files changed, 27 insertions(+), 4 deletions(-) create mode 100644 tests/test_basic_interface_group.py diff --git a/looper/project.py b/looper/project.py index ff67b6a29..e9430bd85 100644 --- a/looper/project.py +++ b/looper/project.py @@ -1,6 +1,7 @@ """ Looper version of NGS project model. """ from collections import namedtuple +import copy from functools import partial import itertools import os @@ -37,9 +38,19 @@ def __init__(self, config_file, subproject=None, **kwargs): config_file, subproject=subproject, no_environment_exception=RuntimeError, no_compute_exception=RuntimeError, **kwargs) - self.interfaces = process_pipeline_interfaces( + self._interfaces = process_pipeline_interfaces( self[METADATA_KEY][PIPELINE_INTERFACES_KEY]) + @property + def interfaces(self): + """ + Get this Project's collection of pipeline interfaces + + :return Iterable[looper.PipelineInterface]: collection of pipeline + interfaces known by this Project + """ + return copy.deepcopy(self._interfaces) + @property def project_folders(self): """ Keys for paths to folders to ensure exist. 
""" @@ -257,17 +268,29 @@ def get_outputs(self): for path_key, path_val in outs.items()} return m + def _omit_from_repr(self, k, cls): + """ + Exclude the interfaces from representation. + + :param str k: key of item to consider for omission + :param type cls: placeholder to comply with superclass signature + """ + return super(Project, self)._omit_from_repr(k, cls) or k == "interfaces" + def _gather_ifaces(ifaces): """ For each pipeline map identifier to protocols and interface data. - :param Iterable[looper.PipelineInterface] ifaces: + :param Iterable[looper.PipelineInterface] ifaces: collection of pipeline + interface objects :return Mapping[str, (set[str], attmap.AttMap)]: collection of bindings between pipeline identifier and pair in which first component is collection of associated protocol names, and second component is a collection of interface data for pipeline identified by the key - :raise looper. + :raise looper.DuplicatePipelineKeyException: if the same identifier (key or + name) points to collections of pipeline interface data (for a + particular pipeline) that are not equivalent """ specs = {} for pi in ifaces: diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 3f5ff6a88..edde8b4b3 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,4 +1,4 @@ -attmap>=0.6 +attmap>=0.7dev colorama>=0.3.9 logmuse>=0.0.2 pandas>=0.20.2 diff --git a/tests/test_basic_interface_group.py b/tests/test_basic_interface_group.py new file mode 100644 index 000000000..e69de29bb From 191b5f5b6fd28e32d209f0b4f0734e50fe5628bd Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 22 Apr 2019 12:09:50 -0400 Subject: [PATCH 10/61] cleanup --- .../integration/def test_project_iface_sample_interaction.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/integration/def test_project_iface_sample_interaction.py b/tests/integration/def test_project_iface_sample_interaction.py index 022e6d9c4..c5ea0ba0b 100644 --- a/tests/integration/def test_project_iface_sample_interaction.py +++ b/tests/integration/def test_project_iface_sample_interaction.py @@ -14,12 +14,10 @@ __email__ = "vreuter@virginia.edu" - @pytest.mark.usefixtures("write_project_files", "pipe_iface_config_file") class SampleWrtProjectCtorTests: """ Tests for `Sample` related to `Project` construction """ - @named_param(argnames="sample_index", argvalues=(set(range(NUM_SAMPLES)) - NGS_SAMPLE_INDICES)) def test_required_inputs(self, proj, pipe_iface, sample_index): @@ -38,7 +36,6 @@ def test_required_inputs(self, proj, pipe_iface, sample_index): assert not error_general assert not error_specific - @named_param(argnames="sample_index", argvalues=NGS_SAMPLE_INDICES) def test_ngs_pipe_ngs_sample(self, proj, pipe_iface, sample_index): """ NGS pipeline with NGS input works just fine. """ @@ -59,7 +56,6 @@ def test_ngs_pipe_ngs_sample(self, proj, pipe_iface, sample_index): assert expected_required_input_basename == \ observed_required_input_basename - @named_param(argnames="sample_index", argvalues=set(range(NUM_SAMPLES)) - NGS_SAMPLE_INDICES) @pytest.mark.parametrize( @@ -110,4 +106,3 @@ def test_ngs_pipe_non_ngs_sample( # Remove the temporary handler and assert that we've reset state. 
del looper._LOGGER.handlers[-1] assert pre_test_handlers == looper._LOGGER.handlers - From 306f5d1f64316814c6159783f09623d14abf81a7 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 22 Apr 2019 13:12:30 -0400 Subject: [PATCH 11/61] update examples --- examples/microtest_merge_table.csv | 4 ---- examples/microtest_project_config.yaml | 6 +++--- examples/microtest_sample_annotation.csv | 16 ---------------- 3 files changed, 3 insertions(+), 23 deletions(-) delete mode 100644 examples/microtest_merge_table.csv delete mode 100644 examples/microtest_sample_annotation.csv diff --git a/examples/microtest_merge_table.csv b/examples/microtest_merge_table.csv deleted file mode 100644 index 2bcf237fb..000000000 --- a/examples/microtest_merge_table.csv +++ /dev/null @@ -1,4 +0,0 @@ -sample_name,data_source,file_number -rrbs,microtest_merge,1 -wgbs,microtest_merge,1 -wgbs,microtest_merge,2 diff --git a/examples/microtest_project_config.yaml b/examples/microtest_project_config.yaml index fbf66b586..bb4db47d4 100644 --- a/examples/microtest_project_config.yaml +++ b/examples/microtest_project_config.yaml @@ -3,8 +3,8 @@ metadata: results_subdir: results_pipeline submission_subdir: submission pipelines_dir: ${CODE}/pipelines - sample_annotation: microtest_sample_annotation.csv - merge_table: microtest_merge_table.csv + sample_annotation: microtest_annotation.csv + merge_table: microtest_subannotation.csv derived_columns: [data_source] @@ -18,7 +18,7 @@ subprojects: pipeline_config: wgbs.py: wgbs_ds.yaml -implied_columns: +implied_attributes: organism: human: genomes: hg19 diff --git a/examples/microtest_sample_annotation.csv b/examples/microtest_sample_annotation.csv deleted file mode 100644 index bc9b1b49e..000000000 --- a/examples/microtest_sample_annotation.csv +++ /dev/null @@ -1,16 +0,0 @@ -sample_name,library,organism,ip,data_source -atac-seq_PE,ATAC-seq,human,,microtest -atac-seq_SE,ATAC-seq,human,,microtest -chip-seq_PE,CHIP-seq,human,H3K27ac,microtest -chip-seq_SE,CHIP-seq,human,H3K27ac,microtest -chipmentation_PE,ChIPmentation,human,H3K27ac,microtest -chipmentation_SE,ChIPmentation,human,H3K27ac,microtest -cpgseq_example_data,CpG-seq,human,,microtest -quant-seq_SE,Quant-seq,human,,microtest -rrbs,RRBS,human,,microtest -rrbs_PE,RRBS,human,,microtest -wgbs,WGBS,human,,microtest -RNA_TRUseq_50SE,SMART,human,,microtest -RNA_SMART_50SE,SMART,human,,microtest -rrbs_PE_fq,RRBS,human,,microtest -rrbs_fq,RRBS,human,,microtest From 9243c90605032b06cb6898d16642cbb5b309ced0 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 22 Apr 2019 13:13:15 -0400 Subject: [PATCH 12/61] update examples --- examples/microtest_annotation.csv | 16 ++++++++++++++++ examples/microtest_subannotation.csv | 4 ++++ 2 files changed, 20 insertions(+) create mode 100644 examples/microtest_annotation.csv create mode 100644 examples/microtest_subannotation.csv diff --git a/examples/microtest_annotation.csv b/examples/microtest_annotation.csv new file mode 100644 index 000000000..379c51d53 --- /dev/null +++ b/examples/microtest_annotation.csv @@ -0,0 +1,16 @@ +sample_name,protocol,organism,ip,data_source +atac-seq_PE,ATAC-seq,human,,microtest +atac-seq_SE,ATAC-seq,human,,microtest +chip-seq_PE,CHIP-seq,human,H3K27ac,microtest +chip-seq_SE,CHIP-seq,human,H3K27ac,microtest +chipmentation_PE,ChIPmentation,human,H3K27ac,microtest +chipmentation_SE,ChIPmentation,human,H3K27ac,microtest +cpgseq_example_data,CpG-seq,human,,microtest +quant-seq_SE,Quant-seq,human,,microtest +rrbs,RRBS,human,,microtest +rrbs_PE,RRBS,human,,microtest 
+wgbs,WGBS,human,,microtest +RNA_TRUseq_50SE,SMART,human,,microtest +RNA_SMART_50SE,SMART,human,,microtest +rrbs_PE_fq,RRBS,human,,microtest +rrbs_fq,RRBS,human,,microtest diff --git a/examples/microtest_subannotation.csv b/examples/microtest_subannotation.csv new file mode 100644 index 000000000..2bcf237fb --- /dev/null +++ b/examples/microtest_subannotation.csv @@ -0,0 +1,4 @@ +sample_name,data_source,file_number +rrbs,microtest_merge,1 +wgbs,microtest_merge,1 +wgbs,microtest_merge,2 From aa0b98bf9478d6c7e7b44e7b73d5b81942c3db97 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 22 Apr 2019 14:20:10 -0400 Subject: [PATCH 13/61] pass some tests --- tests/helpers.py | 21 +++ .../test_project_get_interfaces.py | 0 tests/integration/test_project_get_outputs.py | 163 ++++++++++++++++-- tests/test_utils.py | 10 +- 4 files changed, 175 insertions(+), 19 deletions(-) create mode 100644 tests/integration/test_project_get_interfaces.py diff --git a/tests/helpers.py b/tests/helpers.py index 51a2bb87f..cb42d8a65 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -2,6 +2,8 @@ from functools import partial import itertools +import random +import string import numpy as np import pytest @@ -10,6 +12,9 @@ __email__ = "vreuter@virginia.edu" +LETTERS_AND_DIGITS = string.ascii_letters + string.digits + + def assert_entirely_equal(observed, expected): """ Accommodate equality assertion for varied data, including NaN. """ try: @@ -56,4 +61,20 @@ def powerset(items, min_items=0, include_full_pop=True): for k in range(min_items, max_items))) +def randstr(pool, size): + """ + Generate random string of given size/length. + + :param Iterable[str] pool: collection of characters from which to sample + (with replacement) + :param int size: nunber of characters + :return str: string built by concatenating randomly sampled characters + :raise ValueError: if size is not a positive integer + """ + if size < 1: + raise ValueError("Must build string of positive integral length; got " + "{}".format(size)) + return "".join(random.choice(pool) for _ in range(size)) + + nonempty_powerset = partial(powerset, min_items=1) diff --git a/tests/integration/test_project_get_interfaces.py b/tests/integration/test_project_get_interfaces.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py index 165e267c9..84c245aad 100644 --- a/tests/integration/test_project_get_outputs.py +++ b/tests/integration/test_project_get_outputs.py @@ -1,10 +1,16 @@ """ Tests for interaction between Project and PipelineInterface """ +from collections import Counter +from copy import deepcopy +import os import pytest import yaml from looper import Project as LP +from looper.const import * +from looper.pipeline_interface import PL_KEY, PROTOMAP_KEY +from attmap import AttMap from peppy.const import * - +from tests.helpers import randstr, LETTERS_AND_DIGITS __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" @@ -16,6 +22,120 @@ MAIN_META_KEY: METADATA_KEY, SUBS_META_KEY: SUBPROJECTS_SECTION} +BASE_META = {OUTDIR_KEY: "arbitrary"} +DECLARED_OUTPUTS = {"smooth_bw": "a_{sample.name}/b_{sample.protocol}.txt", + "unalign": "u_{sample.name}_{sample.protocol}.txt"} +WGBS_NAME = "WGBS" +RRBS_NAME = "RRBS" +WGBS_KEY = "wgbs" +RRBS_KEY = "rrbs" + +WGBS_IFACE_LINES = """ +name: {n} +path: src/wgbs.py +required_input_files: [data_source] +ngs_input_files: [data_source] +arguments: + "--sample-name": sample_name + "--genome": genome + "--input": 
data_source + "--single-or-paired": read_type +resources: + default: + file_size: "0" + cores: "4" + mem: "4000" + time: "0-02:00:00" +""".format(n=WGBS_NAME).splitlines(False) + +RRBS_IFACE_LINES = """ +name: {n} +path: src/rrbs.py +required_input_files: [data_source] +all_input_files: [data_source, read1, read2] +ngs_input_files: [data_source, read1, read2] +arguments: + "--sample-name": sample_name + "--genome": genome + "--input": data_source + "--single-or-paired": read_type +resources: + default: + file_size: "0" + cores: "4" + mem: "4000" + time: "0-02:00:00" +""".format(n=RRBS_NAME).splitlines(False) + + +PROTOMAP = {RRBS_NAME: RRBS_KEY, WGBS_NAME: WGBS_KEY, "EG": WGBS_KEY} +IFACE_LINES = {WGBS_KEY: WGBS_IFACE_LINES, RRBS_KEY: RRBS_IFACE_LINES} + + +def _write_iface_file( + path_iface_file, lines_group_by_pipe_key, + outputs_by_pipe_key=None, pm=None): + """ + Write a pipeline interface file. + + :param str path_iface_file: path to the file to write + :param Mapping[str, Iterable[str]] lines_group_by_pipe_key: binding between + pipeline key and collection of lines that encode its specific + configuration data + :param Mapping[str, Mapping[str, str]] outputs_by_pipe_key: binding between + pipeline key and mapping from output type/kind name to path template + :param Mapping[str, str] pm: protocol mapping + :return str: path to the file written + """ + + folder = os.path.dirname(path_iface_file) + temps = [os.path.join(folder, randconf()) for _ in lines_group_by_pipe_key] + + def read_iface_data(fp, lines): + with open(fp, 'w') as f: + for l in lines: + f.write(l) + with open(fp, 'r') as f: + return yaml.load(f, yaml.SafeLoader) + + outputs_by_pipe_key = outputs_by_pipe_key or dict() + + dat_by_key = { + k: read_iface_data(tf, lines_group) for tf, (k, lines_group) + in zip(temps, outputs_by_pipe_key.items())} + for k, outs in outputs_by_pipe_key.items(): + dat_by_key[k][OUTKEY] = outs + + data = {PROTOMAP_KEY: pm or PROTOMAP, PL_KEY: dat_by_key} + # DEBUG + print("DATA:\n{}".format(data)) + + with open(path_iface_file, 'w') as f: + yaml.dump(data, f) + + return path_iface_file + + +def randconf(ext=".yaml"): + """ + Randomly generate config filename. + + :param str ext: filename extension + :return str: randomly generated string to function as filename + """ + return randstr(LETTERS_AND_DIGITS, 15) + ext + + +def augmented_metadata(metadata, extra=None): + """ Augment base metadata with additional data. """ + assert METADATA_KEY not in metadata, \ + "Found {k} in metadata argument itself; pass just the data/values to " \ + "use as {k}, not the whole mapping".format(k=METADATA_KEY) + m = AttMap({METADATA_KEY: BASE_META}) + m[METADATA_KEY] = m[METADATA_KEY].add_entries(metadata) + return m.add_entries(extra or {}).to_map() + + def get_conf_data(req): """ Get Project config data for a test case. @@ -31,23 +151,38 @@ def get_conf_data(req): @pytest.fixture(scope="function") def prj(request, tmpdir): """ Provide a test case with a Project instance. 
""" - conf_data = get_conf_data(request) - conf_file = tmpdir.join("pc.yaml").strpath - with open(conf_file, 'w') as f: - yaml.dump(conf_data, f) - return LP(conf_file) + conf_file = tmpdir.join(randconf()).strpath + return _write_and_build_prj(conf_file, conf_data=get_conf_data(request)) -@pytest.mark.parametrize(MAIN_META_KEY, [{OUTDIR_KEY: "arbitrary"}]) +@pytest.mark.parametrize(MAIN_META_KEY, [BASE_META]) def test_no_pifaces(prj, main_meta): """ No pipeline interfaces --> the outputs data mapping is empty.""" assert {} == prj.get_outputs() -@pytest.mark.skip("not implemented") -def test_no_outputs(): - """ """ - pass +@pytest.mark.parametrize("name_cfg_file", [randconf()]) +@pytest.mark.parametrize("ifaces", [ + [{WGBS_KEY: WGBS_IFACE_LINES}], [{RRBS_KEY: RRBS_IFACE_LINES}], + [{WGBS_KEY: WGBS_IFACE_LINES}, {RRBS_KEY: RRBS_IFACE_LINES}]]) +def test_no_outputs(tmpdir, name_cfg_file, ifaces): + """ Pipeline interfaces without outputs --> no Project outputs """ + cfg = tmpdir.join(name_cfg_file).strpath + iface_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] + assert all(1 == n for n in Counter(iface_paths).values()) + for data, path in zip(ifaces, iface_paths): + with open(path, 'w') as f: + yaml.dump(data, f) + md = deepcopy(BASE_META) + md[PIPELINE_INTERFACES_KEY] = iface_paths + + # DEBUG + print("Metadata: {}".format(md)) + + for path, data in zip(iface_paths, ifaces): + _write_iface_file(path, data) + prj = _write_and_build_prj(cfg, {METADATA_KEY: md}) + assert {} == prj.get_outputs() @pytest.mark.skip("not implemented") @@ -93,3 +228,9 @@ def test_sample_collection_accuracy(): @pytest.mark.skip("not implemented") def test_protocol_collection_accuracy(): pass + + +def _write_and_build_prj(conf_file, conf_data): + with open(conf_file, 'w') as f: + yaml.dump(conf_data, f) + return LP(conf_file) diff --git a/tests/test_utils.py b/tests/test_utils.py index 97f826129..ffc26cb1f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -6,19 +6,13 @@ import pytest from looper.utils import determine_config_path, DEFAULT_CONFIG_SUFFIX, \ DEFAULT_METADATA_FOLDER +from tests.helpers import randstr, LETTERS_AND_DIGITS + __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" -LETTERS_AND_DIGITS = string.ascii_letters + string.digits - - -def randstr(pool, size): - """ Generate random string of given size/length. 
""" - return "".join(random.choice(pool) for _ in range(size)) - - class ConfigPathDeterminationTests: """ Tests for config path determination function """ From 8dd032aed49df340a8b1f7cbfb9442bc0b4cfdbd Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 22 Apr 2019 14:21:04 -0400 Subject: [PATCH 14/61] start ifaces tests file --- tests/integration/test_project_get_interfaces.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/integration/test_project_get_interfaces.py b/tests/integration/test_project_get_interfaces.py index e69de29bb..a13ee7b3f 100644 --- a/tests/integration/test_project_get_interfaces.py +++ b/tests/integration/test_project_get_interfaces.py @@ -0,0 +1,6 @@ +""" Tests for request to Project for interfaces(s) for particular protocol """ + +import pytest + +__author__ = "Vince Reuter" +__email__ = "vreuter@virginia.edu" From c9f91c9a9668e98533783e60026d38d03b8e4ed7 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 22 Apr 2019 18:01:29 -0400 Subject: [PATCH 15/61] testing subprojects with outputs --- looper/project.py | 12 +- tests/integration/test_project_get_outputs.py | 273 +++++++++++++----- 2 files changed, 217 insertions(+), 68 deletions(-) diff --git a/looper/project.py b/looper/project.py index ff67b6a29..4c9d43d5c 100644 --- a/looper/project.py +++ b/looper/project.py @@ -217,7 +217,7 @@ def get_interfaces(self, protocol): """ return self.interfaces[protocol] - def get_outputs(self): + def get_outputs(self, skip_sample_less=True): """ Map pipeline identifier to collection of output specifications. @@ -233,17 +233,25 @@ def get_outputs(self): this Project's samples for which it's applicable (i.e., those samples with protocol that maps to the corresponding pipeline). + :param bool skip_sample_less: whether to omit pipelines that are for + protocols of which the Project has no Sample instances :return Mapping[str, Mapping[str, namedtuple]]: collection of bindings between identifier for pipeline and collection of bindings between name for a kind of output and pair in which first component is a path template and the second component is a collection of sample names + :raise TypeError: if argument to sample-less pipeline skipping parameter + is not a Boolean """ + if not isinstance(skip_sample_less, bool): + raise TypeError( + "Non-Boolean argument to sample-less skip flag: {} ({})". + format(skip_sample_less, type(skip_sample_less))) prots_data_pairs = _gather_ifaces(self.interfaces) m = {} for name, (prots, data) in prots_data_pairs.items(): snames = [s.name for s in self.samples if s.protocol in prots] - if not snames: + if not snames and skip_sample_less: _LOGGER.debug("No samples matching protocol(s): {}". 
format(", ".join(prots))) continue diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py index 84c245aad..e7cd0cf29 100644 --- a/tests/integration/test_project_get_outputs.py +++ b/tests/integration/test_project_get_outputs.py @@ -3,6 +3,8 @@ from collections import Counter from copy import deepcopy import os +import random +import string import pytest import yaml from looper import Project as LP @@ -20,8 +22,6 @@ SUBS_META_KEY = "subs_meta" SECTION_BY_FIXTURE = { MAIN_META_KEY: METADATA_KEY, SUBS_META_KEY: SUBPROJECTS_SECTION} - - BASE_META = {OUTDIR_KEY: "arbitrary"} DECLARED_OUTPUTS = {"smooth_bw": "a_{sample.name}/b_{sample.protocol}.txt", "unalign": "u_{sample.name}_{sample.protocol}.txt"} @@ -30,6 +30,8 @@ WGBS_KEY = "wgbs" RRBS_KEY = "rrbs" +PROTO_NAMES = {WGBS_KEY: WGBS_NAME, RRBS_KEY: RRBS_NAME} + WGBS_IFACE_LINES = """ name: {n} path: src/wgbs.py @@ -46,7 +48,7 @@ cores: "4" mem: "4000" time: "0-02:00:00" -""".format(n=WGBS_NAME).splitlines(False) +""".format(n=WGBS_NAME).splitlines(True) RRBS_IFACE_LINES = """ name: {n} @@ -65,65 +67,18 @@ cores: "4" mem: "4000" time: "0-02:00:00" -""".format(n=RRBS_NAME).splitlines(False) +""".format(n=RRBS_NAME).splitlines(True) PROTOMAP = {RRBS_NAME: RRBS_KEY, WGBS_NAME: WGBS_KEY, "EG": WGBS_KEY} IFACE_LINES = {WGBS_KEY: WGBS_IFACE_LINES, RRBS_KEY: RRBS_IFACE_LINES} -def _write_iface_file( - path_iface_file, lines_group_by_pipe_key, - outputs_by_pipe_key=None, pm=None): - """ - Write a pipeline interface file. - - :param str path_iface_file: path to the file to write - :param Mapping[str, Iterable[str]] lines_group_by_pipe_key: binding between - pipeline key and collection of lines that encode its specific - configuration data - :param Mapping[str, Mapping[str, str]] outputs_by_pipe_key: binding between - pipeline key and mapping from output type/kind name to path template - :param Mapping[str, str] pm: protocol mapping - :return str: path to the file written - """ - - folder = os.path.dirname(path_iface_file) - temps = [os.path.join(folder, randconf()) for _ in lines_group_by_pipe_key] - - def read_iface_data(fp, lines): - with open(fp, 'w') as f: - for l in lines: - f.write(l) - with open(fp, 'r') as f: - return yaml.load(f, yaml.SafeLoader) - - outputs_by_pipe_key = outputs_by_pipe_key or dict() - - dat_by_key = { - k: read_iface_data(tf, lines_group) for tf, (k, lines_group) - in zip(temps, outputs_by_pipe_key.items())} - for k, outs in outputs_by_pipe_key.items(): - dat_by_key[k][OUTKEY] = outs - - data = {PROTOMAP_KEY: pm or PROTOMAP, PL_KEY: dat_by_key} - # DEBUG - print("DATA:\n{}".format(data)) - - with open(path_iface_file, 'w') as f: - yaml.dump(data, f) - - return path_iface_file - - -def randconf(ext=".yaml"): - """ - Randomly generate config filename. - - :param str ext: filename extension - :return str: randomly generated string to function as filename - """ - return randstr(LETTERS_AND_DIGITS, 15) + ext +def pytest_generate_tests(metafunc): + """ Test case generation and parameterization for this module. """ + skip_empty_flag = "skip_sample_less" + if skip_empty_flag in metafunc.fixturenames: + metafunc.parametrize(skip_empty_flag, [False, True]) def augmented_metadata(metadata, extra=None): @@ -148,6 +103,16 @@ def get_conf_data(req): return m +def randconf(ext=".yaml"): + """ + Randomly generate config filename. 
+ + :param str ext: filename extension + :return str: randomly generated string to function as filename + """ + return randstr(LETTERS_AND_DIGITS, 15) + ext + + @pytest.fixture(scope="function") def prj(request, tmpdir): """ Provide a test case with a Project instance. """ @@ -165,11 +130,12 @@ def test_no_pifaces(prj, main_meta): @pytest.mark.parametrize("ifaces", [ [{WGBS_KEY: WGBS_IFACE_LINES}], [{RRBS_KEY: RRBS_IFACE_LINES}], [{WGBS_KEY: WGBS_IFACE_LINES}, {RRBS_KEY: RRBS_IFACE_LINES}]]) -def test_no_outputs(tmpdir, name_cfg_file, ifaces): +def test_no_outputs(tmpdir, name_cfg_file, ifaces, skip_sample_less): """ Pipeline interfaces without outputs --> no Project outputs """ cfg = tmpdir.join(name_cfg_file).strpath iface_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] - assert all(1 == n for n in Counter(iface_paths).values()) + rep_paths = _find_reps(iface_paths) + assert [] == rep_paths, "Repeated temp filepath(s): {}".format(rep_paths) for data, path in zip(ifaces, iface_paths): with open(path, 'w') as f: yaml.dump(data, f) @@ -182,31 +148,133 @@ def test_no_outputs(tmpdir, name_cfg_file, ifaces): for path, data in zip(iface_paths, ifaces): _write_iface_file(path, data) prj = _write_and_build_prj(cfg, {METADATA_KEY: md}) - assert {} == prj.get_outputs() + assert {} == prj.get_outputs(skip_sample_less) -@pytest.mark.skip("not implemented") -def test_malformed_outputs(): - pass +@pytest.mark.parametrize("name_cfg_file", [randconf()]) +@pytest.mark.parametrize(["ifaces", "prot_pool"], [ + ([{WGBS_KEY: WGBS_IFACE_LINES}], [WGBS_NAME]), + ([{RRBS_KEY: RRBS_IFACE_LINES}], [RRBS_NAME]), + ([{WGBS_KEY: WGBS_IFACE_LINES}, {RRBS_KEY: RRBS_IFACE_LINES}], + [WGBS_NAME, RRBS_NAME])]) +@pytest.mark.parametrize("declared_outputs", [None, ["out1", "out2"]]) +def test_malformed_outputs( + tmpdir, name_cfg_file, ifaces, prot_pool, + declared_outputs, skip_sample_less): + """ Invalid outputs declaration format is exceptional. """ + + cfg = tmpdir.join(name_cfg_file).strpath + + iface_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] + rep_paths = _find_reps(iface_paths) + assert [] == rep_paths, "Repeated temp filepath(s): {}".format(rep_paths) + + for data, path in zip(ifaces, iface_paths): + with open(path, 'w') as f: + yaml.dump(data, f) + md = deepcopy(BASE_META) + md[PIPELINE_INTERFACES_KEY] = iface_paths + + anns_file = tmpdir.join("anns.csv").strpath + assert not os.path.exists(anns_file) + sample_protos = [random.choice(prot_pool) for _ in range(10)] + sample_names = [randstr(string.ascii_letters, 20) for _ in sample_protos] + repeated_sample_names = _find_reps(sample_names) + assert [] == repeated_sample_names, \ + "Repeated sample names: {}".format(repeated_sample_names) + anns_data = [(SAMPLE_NAME_COLNAME, ASSAY_KEY)] + \ + list(zip(sample_names, sample_protos)) + with open(anns_file, 'w') as f: + f.write("\n".join("{0},{1}".format(*pair) for pair in anns_data)) + md[SAMPLE_ANNOTATIONS_KEY] = anns_file + + # DEBUG + print("Metadata: {}".format(md)) + + keyed_outputs = {pk: declared_outputs for pk in + [k for pi in ifaces for k in pi.keys()]} + for path, data in zip(iface_paths, ifaces): + _write_iface_file(path, data, outputs_by_pipe_key=keyed_outputs) + prj = _write_and_build_prj(cfg, {METADATA_KEY: md}) + print("TABLE below:\n{}".format(prj.sample_table)) + with pytest.raises(AttributeError): + # Should fail on .items() call during outputs determination. 
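+        # Here declared_outputs is a list (["out1", "out2"]) or None, while
+        # get_outputs builds its result by iterating outs.items(); neither a
+        # list nor None has an .items() method, hence the AttributeError.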
+ print("Outputs: {}".format(prj.get_outputs(skip_sample_less))) @pytest.mark.skip("not implemented") -def test_only_subproject_has_pifaces(): - pass +@pytest.mark.parametrize("ifaces", [ + [{WGBS_KEY: WGBS_IFACE_LINES}], [{RRBS_KEY: RRBS_IFACE_LINES}], + [{WGBS_KEY: WGBS_IFACE_LINES}, {RRBS_KEY: RRBS_IFACE_LINES}]]) +@pytest.mark.parametrize("declared_outputs", + [{n: DECLARED_OUTPUTS for n in [RRBS_NAME, WGBS_NAME]}]) +@pytest.mark.parametrize("activate", [False, True]) +def test_only_subproject_has_outputs( + tmpdir, name_cfg_file, ifaces, declared_outputs, activate): + """ Activation state affects status of Project's outputs. """ + + cfg = tmpdir.join(name_cfg_file).strpath + + iface_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] + assert [] == _find_reps(iface_paths), \ + "Repeated temp filepath(s): {}".format(_find_reps(iface_paths)) + + for data, path in zip(ifaces, iface_paths): + with open(path, 'w') as f: + yaml.dump(data, f) + md = deepcopy(BASE_META) + md[PIPELINE_INTERFACES_KEY] = iface_paths + + sp_ifaces_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] + assert [] == _find_reps(sp_ifaces_paths), \ + "Repeated temp filepath(s): {}".format(_find_reps(sp_ifaces_paths)) + iface_path_intersect = set(sp_ifaces_paths) & set(iface_paths) + assert set() == iface_path_intersect, \ + "Nonempty main/subs iface path intersection: {}".\ + format(", ".join(iface_path_intersect)) + + sp_name = "testing_subproj" + md[SUBPROJECTS_SECTION] = {sp_name: { + METADATA_KEY: {PIPELINE_INTERFACES_KEY: sp_ifaces_paths}}} + + # DEBUG + print("Metadata: {}".format(md)) + + keyed_outputs = {pk: declared_outputs for pk in + [k for pi in ifaces for k in pi.keys()]} + for path, data in zip(iface_paths, ifaces): + _write_iface_file(path, data) + for path, data in zip(sp_ifaces_paths, ifaces): + _write_iface_file(path, data, outputs_by_pipe_key=keyed_outputs) + + prj = _write_and_build_prj(cfg, {METADATA_KEY: md}) + print("TABLE below:\n{}".format(prj.sample_table)) + + if activate: + prj.activate_subproject(sp_name) + obs_out = prj.get_outputs(False) + assert len(obs_out) > 0 + exp = {PROTO_NAMES[k]: outs for k, outs in declared_outputs.items()} + assert exp == prj.get_outputs(False) + else: + assert {} == prj.get_outputs(False) @pytest.mark.skip("not implemented") -def test_only_subproject_has_outputs(): +@pytest.mark.parametrize("activate", [False, True]) +def test_only_main_project_has_outputs(activate): + """ Activation state affects status of Project's outputs. """ pass @pytest.mark.skip("not implemented") def test_main_project_and_subproject_have_outputs(): + """ Activation state affects status of Project's outputs. """ pass @pytest.mark.skip("not implemented") -def test_no_samples_match_protocols_with_outputs(): +def test_no_samples_match_protocols_with_outputs(skip_sample_less): pass @@ -230,7 +298,80 @@ def test_protocol_collection_accuracy(): pass +def _find_reps(objs): + """ + Find (and count) repeated objects + + :param Iterable[object] objs: collection of objects in which to seek + repeated elements + :return list[(object, int)]: collection of pairs in which first component + of each is a repeated object, and the second is duplication count + """ + return [(o, n) for o, n in Counter(objs).items() if n > 1] + + def _write_and_build_prj(conf_file, conf_data): + """ + Write Project config data and create the instance. 
+ + :param str conf_file: path to file to write + :param Mapping conf_data: Project config data + :return looper.Project: new Project instance + """ with open(conf_file, 'w') as f: yaml.dump(conf_data, f) return LP(conf_file) + + +def _write_iface_file( + path_iface_file, lines_group_by_pipe_key, + outputs_by_pipe_key=None, pm=None): + """ + Write a pipeline interface file. + + :param str path_iface_file: path to the file to write + :param Mapping[str, Iterable[str]] lines_group_by_pipe_key: binding between + pipeline key and collection of lines that encode its specific + configuration data + :param Mapping[str, Mapping[str, str]] outputs_by_pipe_key: binding between + pipeline key and mapping from output type/kind name to path template + :param Mapping[str, str] pm: protocol mapping + :return str: path to the file written + """ + + folder = os.path.dirname(path_iface_file) + temps = [os.path.join(folder, randconf()) for _ in lines_group_by_pipe_key] + + def read_iface_data(fp, lines): + with open(fp, 'w') as f: + for l in lines: + f.write(l) + try: + with open(fp, 'r') as f: + return yaml.load(f, yaml.SafeLoader) + except yaml.scanner.ScannerError: + with open(fp, 'r') as f: + for l in f.readlines(): + print(l) + raise + + outputs_by_pipe_key = outputs_by_pipe_key or dict() + + dat_by_key = { + k: read_iface_data(tf, lines_group) for tf, (k, lines_group) + in zip(temps, lines_group_by_pipe_key.items())} + # DEBUG + print("DAT BY K: {}".format(dat_by_key)) + for k, outs in outputs_by_pipe_key.items(): + if k not in dat_by_key: + continue + dat_by_key[k][OUTKEY] = outs + + data = {PROTOMAP_KEY: pm or PROTOMAP, PL_KEY: dat_by_key} + # DEBUG + print("DATA: {}".format(data)) + + with open(path_iface_file, 'w') as f: + yaml.dump(data, f) + + return path_iface_file From 52a3cf3f6796eb67248772d2cea5700bb0460894 Mon Sep 17 00:00:00 2001 From: Vince Date: Tue, 23 Apr 2019 00:17:31 -0400 Subject: [PATCH 16/61] pass implemented tests --- looper/project.py | 20 ++-- tests/integration/test_project_get_outputs.py | 104 ++++++++++++++---- tests/test_basic_interface_group.py | 32 ++++++ 3 files changed, 125 insertions(+), 31 deletions(-) diff --git a/looper/project.py b/looper/project.py index 9c3c1c6aa..db9851774 100644 --- a/looper/project.py +++ b/looper/project.py @@ -259,21 +259,25 @@ def get_outputs(self, skip_sample_less=True): "Non-Boolean argument to sample-less skip flag: {} ({})". format(skip_sample_less, type(skip_sample_less))) prots_data_pairs = _gather_ifaces(self.interfaces) + # DEBUG + print("prots_data_pairs: {}".format(prots_data_pairs)) m = {} for name, (prots, data) in prots_data_pairs.items(): - snames = [s.name for s in self.samples if s.protocol in prots] - if not snames and skip_sample_less: - _LOGGER.debug("No samples matching protocol(s): {}". - format(", ".join(prots))) - continue try: outs = data[OUTKEY] + # DEBUG + print("OUTS: {}".format(outs)) except KeyError: _LOGGER.debug("No {} declared for pipeline: {}". format(OUTKEY, name)) - else: - m[name] = {path_key: (path_val, snames) - for path_key, path_val in outs.items()} + continue + snames = [s.name for s in self.samples if s.protocol in prots] + if not snames and skip_sample_less: + _LOGGER.debug("No samples matching protocol(s): {}". 
+ format(", ".join(prots))) + continue + m[name] = {path_key: (path_val, snames) + for path_key, path_val in outs.items()} return m def _omit_from_repr(self, k, cls): diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py index e7cd0cf29..db7685bb8 100644 --- a/tests/integration/test_project_get_outputs.py +++ b/tests/integration/test_project_get_outputs.py @@ -2,6 +2,7 @@ from collections import Counter from copy import deepcopy +import itertools import os import random import string @@ -202,18 +203,15 @@ def test_malformed_outputs( print("Outputs: {}".format(prj.get_outputs(skip_sample_less))) -@pytest.mark.skip("not implemented") @pytest.mark.parametrize("ifaces", [ [{WGBS_KEY: WGBS_IFACE_LINES}], [{RRBS_KEY: RRBS_IFACE_LINES}], [{WGBS_KEY: WGBS_IFACE_LINES}, {RRBS_KEY: RRBS_IFACE_LINES}]]) @pytest.mark.parametrize("declared_outputs", [{n: DECLARED_OUTPUTS for n in [RRBS_NAME, WGBS_NAME]}]) -@pytest.mark.parametrize("activate", [False, True]) -def test_only_subproject_has_outputs( - tmpdir, name_cfg_file, ifaces, declared_outputs, activate): +def test_only_subproject_has_outputs(tmpdir, ifaces, declared_outputs): """ Activation state affects status of Project's outputs. """ - cfg = tmpdir.join(name_cfg_file).strpath + cfg = tmpdir.join(randconf()).strpath iface_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] assert [] == _find_reps(iface_paths), \ @@ -233,38 +231,98 @@ def test_only_subproject_has_outputs( "Nonempty main/subs iface path intersection: {}".\ format(", ".join(iface_path_intersect)) - sp_name = "testing_subproj" - md[SUBPROJECTS_SECTION] = {sp_name: { - METADATA_KEY: {PIPELINE_INTERFACES_KEY: sp_ifaces_paths}}} - # DEBUG print("Metadata: {}".format(md)) - keyed_outputs = {pk: declared_outputs for pk in - [k for pi in ifaces for k in pi.keys()]} + used_iface_keys = set(itertools.chain(*[pi.keys() for pi in ifaces])) + keyed_outputs = {pk: declared_outputs[PROTO_NAMES[pk]] + for pk in used_iface_keys} for path, data in zip(iface_paths, ifaces): _write_iface_file(path, data) for path, data in zip(sp_ifaces_paths, ifaces): _write_iface_file(path, data, outputs_by_pipe_key=keyed_outputs) - prj = _write_and_build_prj(cfg, {METADATA_KEY: md}) + sp_name = "testing_subproj" + prj = _write_and_build_prj(cfg, { + METADATA_KEY: md, + SUBPROJECTS_SECTION: { + sp_name: { + METADATA_KEY: { + PIPELINE_INTERFACES_KEY: sp_ifaces_paths + } + } + } + }) + + # DEBUG print("TABLE below:\n{}".format(prj.sample_table)) - if activate: - prj.activate_subproject(sp_name) - obs_out = prj.get_outputs(False) - assert len(obs_out) > 0 - exp = {PROTO_NAMES[k]: outs for k, outs in declared_outputs.items()} - assert exp == prj.get_outputs(False) - else: - assert {} == prj.get_outputs(False) + assert len(prj.get_outputs(False)) == 0 + assert {} == prj.get_outputs(False) + prj.activate_subproject(sp_name) + assert len(prj.get_outputs(False)) > 0 + exp = {pipe_name: {k: (v, []) for k, v in outs.items()} + for pipe_name, outs in declared_outputs.items() + if pipe_name in {PROTO_NAMES[k] for k in used_iface_keys}} + print("EXP: {}".format(exp)) + assert exp == prj.get_outputs(False) @pytest.mark.skip("not implemented") -@pytest.mark.parametrize("activate", [False, True]) -def test_only_main_project_has_outputs(activate): +@pytest.mark.parametrize("ifaces", [ + [{WGBS_KEY: WGBS_IFACE_LINES}], [{RRBS_KEY: RRBS_IFACE_LINES}], + [{WGBS_KEY: WGBS_IFACE_LINES}, {RRBS_KEY: RRBS_IFACE_LINES}]]) +@pytest.mark.parametrize("declared_outputs", + [{n: 
DECLARED_OUTPUTS for n in [RRBS_NAME, WGBS_NAME]}]) +def test_only_main_project_has_outputs(tmpdir, ifaces, declared_outputs): """ Activation state affects status of Project's outputs. """ - pass + + cfg = tmpdir.join(randconf()).strpath + + iface_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] + assert [] == _find_reps(iface_paths), \ + "Repeated temp filepath(s): {}".format(_find_reps(iface_paths)) + + for data, path in zip(ifaces, iface_paths): + with open(path, 'w') as f: + yaml.dump(data, f) + md = deepcopy(BASE_META) + md[PIPELINE_INTERFACES_KEY] = iface_paths + + sp_ifaces_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] + assert [] == _find_reps(sp_ifaces_paths), \ + "Repeated temp filepath(s): {}".format(_find_reps(sp_ifaces_paths)) + iface_path_intersect = set(sp_ifaces_paths) & set(iface_paths) + assert set() == iface_path_intersect, \ + "Nonempty main/subs iface path intersection: {}". \ + format(", ".join(iface_path_intersect)) + + sp_name = "testing_subproj" + md[SUBPROJECTS_SECTION] = {sp_name: { + METADATA_KEY: {PIPELINE_INTERFACES_KEY: sp_ifaces_paths}}} + + # DEBUG + print("Metadata: {}".format(md)) + + keyed_outputs = { + pk: declared_outputs[pk] for pk in + set(itertools.chain(*[pi.keys() for pi in ifaces]))} + for path, data in zip(iface_paths, ifaces): + _write_iface_file(path, data, outputs_by_pipe_key=keyed_outputs) + for path, data in zip(sp_ifaces_paths, ifaces): + _write_iface_file(path, data) + + prj = _write_and_build_prj(cfg, {METADATA_KEY: md}) + + # DEBUG + print("TABLE below:\n{}".format(prj.sample_table)) + + assert len(prj.get_outputs(False)) > 0 + assert {PROTO_NAMES[k]: outs for k, outs in declared_outputs.items()} == \ + prj.get_outputs(False) + prj.activate_subproject(sp_name) + assert len(prj.get_outputs(False)) == 0 + assert {} == prj.get_outputs(False) @pytest.mark.skip("not implemented") diff --git a/tests/test_basic_interface_group.py b/tests/test_basic_interface_group.py index e69de29bb..c36b6e42e 100644 --- a/tests/test_basic_interface_group.py +++ b/tests/test_basic_interface_group.py @@ -0,0 +1,32 @@ +""" Tests for Project's pipeline interface group """ + +import pytest +from looper.project_piface_group import ProjectPifaceGroup + +__author__ = "Vince Reuter" +__email__ = "vreuter@virginia.edu" + + +@pytest.mark.skip("not implemented") +def test_iface_grp_cmp(): + pass + + +@pytest.mark.skip("not implemented") +def test_iface_grp_getitem(): + pass + + +@pytest.mark.skip("not implemented") +def test_iface_grp_iter(): + pass + + +@pytest.mark.skip("not implemented") +def test_iface_grp_len(): + pass + + +@pytest.mark.skip("not implemented") +def test_iface_grp_update(): + pass From c370143bdf2c2f9b458a01eea334a8fccfc6ea22 Mon Sep 17 00:00:00 2001 From: Vince Date: Tue, 23 Apr 2019 00:18:53 -0400 Subject: [PATCH 17/61] cleanup --- looper/project.py | 4 ---- tests/integration/test_project_get_outputs.py | 5 ----- 2 files changed, 9 deletions(-) diff --git a/looper/project.py b/looper/project.py index db9851774..0e10a72b2 100644 --- a/looper/project.py +++ b/looper/project.py @@ -259,14 +259,10 @@ def get_outputs(self, skip_sample_less=True): "Non-Boolean argument to sample-less skip flag: {} ({})". 
format(skip_sample_less, type(skip_sample_less))) prots_data_pairs = _gather_ifaces(self.interfaces) - # DEBUG - print("prots_data_pairs: {}".format(prots_data_pairs)) m = {} for name, (prots, data) in prots_data_pairs.items(): try: outs = data[OUTKEY] - # DEBUG - print("OUTS: {}".format(outs)) except KeyError: _LOGGER.debug("No {} declared for pipeline: {}". format(OUTKEY, name)) diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py index db7685bb8..32d7f4347 100644 --- a/tests/integration/test_project_get_outputs.py +++ b/tests/integration/test_project_get_outputs.py @@ -418,17 +418,12 @@ def read_iface_data(fp, lines): dat_by_key = { k: read_iface_data(tf, lines_group) for tf, (k, lines_group) in zip(temps, lines_group_by_pipe_key.items())} - # DEBUG - print("DAT BY K: {}".format(dat_by_key)) for k, outs in outputs_by_pipe_key.items(): if k not in dat_by_key: continue dat_by_key[k][OUTKEY] = outs data = {PROTOMAP_KEY: pm or PROTOMAP, PL_KEY: dat_by_key} - # DEBUG - print("DATA: {}".format(data)) - with open(path_iface_file, 'w') as f: yaml.dump(data, f) From 5da577070ccd63fd273d73ab7c77078502781023 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 23 Apr 2019 11:30:52 -0400 Subject: [PATCH 18/61] use more divvy --- looper/__init__.py | 13 +++++++++++-- looper/looper.py | 11 ++++++----- looper/pipeline_interface.py | 7 +++---- requirements/requirements-all.txt | 3 ++- tests/models/conftest.py | 4 ++-- tests/models/test_PipelineInterface.py | 1 + tests/test_submission_scripts.py | 2 +- 7 files changed, 26 insertions(+), 15 deletions(-) diff --git a/looper/__init__.py b/looper/__init__.py index 1b703da16..b6b2f70ac 100644 --- a/looper/__init__.py +++ b/looper/__init__.py @@ -15,6 +15,7 @@ from ._version import __version__ from .parser_types import * +from divvy import DEFAULT_COMPUTE_RESOURCES_NAME, NEW_COMPUTE_KEY as COMPUTE_KEY # Not used here, but make this the main import interface between peppy and # looper, so that other modules within this package need not worry about # the locations of some of the peppy declarations. Effectively, concentrate @@ -144,9 +145,17 @@ def add_subparser(cmd): "By default, pipelines will not be submitted if a sample name" " is duplicated, since samples names should be unique. " " Set this option to override this setting. 
Default=False") - subparser.add_argument( - "--compute", dest="compute", default="default", + + comp_spec = subparser.add_mutually_exclusive_group() + comp_spec.add_argument( + "--compute", dest=COMPUTE_KEY, + default=DEFAULT_COMPUTE_RESOURCES_NAME, help="YAML file with looper environment compute settings.") + comp_spec.add_argument( + "--compute-packages", dest=COMPUTE_KEY, + default=DEFAULT_COMPUTE_RESOURCES_NAME, + help="YAML file with looper environment compute settings.") + subparser.add_argument( "--resources", help="Specification of individual computing resource settings; " diff --git a/looper/looper.py b/looper/looper.py index b98e88d7e..6f5f0e01b 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -33,6 +33,7 @@ from .project import Project from .utils import determine_config_path, fetch_flag_files, sample_folder +from divvy import DEFAULT_COMPUTE_RESOURCES_NAME, NEW_COMPUTE_KEY as COMPUTE_KEY from logmuse import setup_logger from peppy import ProjectContext, METADATA_KEY, SAMPLE_EXECUTION_TOGGLE @@ -241,13 +242,13 @@ def process_protocols(prj, protocols, resource_setting_kwargs=None, **kwargs): resource_setting_kwargs = {} try: - comp_vars = prj.dcc.compute.to_map() + comp_vars = prj.dcc[COMPUTE_KEY].to_map() except AttributeError: - if not isinstance(prj.dcc.compute, Mapping): + if not isinstance(prj.dcc[COMPUTE_KEY], Mapping): raise TypeError("Project's computing config isn't a mapping: {} ({})". - format(prj.dcc.compute, type(prj.dcc.compute))) + format(prj.dcc[COMPUTE_KEY], type(prj.dcc[COMPUTE_KEY]))) from copy import deepcopy - comp_vars = deepcopy(prj.dcc.compute) + comp_vars = deepcopy(prj.dcc[COMPUTE_KEY]) comp_vars.update(resource_setting_kwargs or {}) _LOGGER.info("Known protocols: {}".format( @@ -810,7 +811,7 @@ def main(): if hasattr(args, "compute"): # Default is already loaded - if args.compute != "default": + if args.compute != DEFAULT_COMPUTE_RESOURCES_NAME: prj.dcc.activate_package(args.compute) _LOGGER.debug("Results subdir: " + prj.metadata[RESULTS_SUBDIR_KEY]) diff --git a/looper/pipeline_interface.py b/looper/pipeline_interface.py index 3c6ce1603..171761d86 100644 --- a/looper/pipeline_interface.py +++ b/looper/pipeline_interface.py @@ -17,8 +17,8 @@ MissingPipelineConfigurationException, PipelineInterfaceConfigError from .utils import get_logger from attmap import PathExAttMap +from divvy import DEFAULT_COMPUTE_RESOURCES_NAME, NEW_COMPUTE_KEY as COMPUTE_KEY from peppy import utils, Sample -from peppy.const import DEFAULT_COMPUTE_RESOURCES_NAME from peppy.utils import is_command_callable @@ -107,10 +107,9 @@ def notify(msg): pl = self.select_pipeline(pipeline_name) - compute_key = "compute" universal_compute = {} try: - universal_compute = pl[compute_key] + universal_compute = pl[COMPUTE_KEY] except KeyError: notify("No compute settings") @@ -127,7 +126,7 @@ def notify(msg): _LOGGER.warning( "{rk} section found in both {c} section and top-level " "pipelines section of pipeline interface; {c} section " - "version will be used".format(rk=RESOURCES_KEY, c=compute_key)) + "version will be used".format(rk=RESOURCES_KEY, c=COMPUTE_KEY)) # Require default resource package specification. 
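        # Interface authors are expected to supply a package named by
        # DEFAULT_COMPUTE_RESOURCES_NAME ("default"); e.g., in YAML:
        #
        #   resources:
        #     default:
        #       file_size: "0"
        #       cores: "4"
        #       mem: "4000"
        #       time: "0-02:00:00"
        #
        # (values here echo the test fixtures above, not required settings)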
try: diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index edde8b4b3..d9511ca8a 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -3,4 +3,5 @@ colorama>=0.3.9 logmuse>=0.0.2 pandas>=0.20.2 pyyaml>=3.12 -peppy>=0.20 +divvy>=0.4dev +peppy>=0.21dev diff --git a/tests/models/conftest.py b/tests/models/conftest.py index 4f6b382a8..e5ff9f229 100644 --- a/tests/models/conftest.py +++ b/tests/models/conftest.py @@ -11,9 +11,9 @@ import pandas as pd import pytest import yaml +from divvy import DEFAULT_COMPUTE_RESOURCES_NAME from looper.pipeline_interface import PROTOMAP_KEY, RESOURCES_KEY -from peppy import DEFAULT_COMPUTE_RESOURCES_NAME, METADATA_KEY, \ - NAME_TABLE_ATTR, SAMPLE_NAME_COLNAME +from peppy import METADATA_KEY, NAME_TABLE_ATTR, SAMPLE_NAME_COLNAME __author__ = "Vince Reuter" diff --git a/tests/models/test_PipelineInterface.py b/tests/models/test_PipelineInterface.py index b67dac6cb..38d74e63d 100644 --- a/tests/models/test_PipelineInterface.py +++ b/tests/models/test_PipelineInterface.py @@ -13,6 +13,7 @@ import yaml from attmap import PathExAttMap +from divvy import DEFAULT_COMPUTE_RESOURCES_NAME from looper.const import * from looper.pipeline_interface import PipelineInterface, PL_KEY, PROTOMAP_KEY, \ RESOURCES_KEY diff --git a/tests/test_submission_scripts.py b/tests/test_submission_scripts.py index 65558a934..3ff9d5810 100644 --- a/tests/test_submission_scripts.py +++ b/tests/test_submission_scripts.py @@ -9,6 +9,7 @@ import pytest import yaml +from divvy import DEFAULT_COMPUTE_RESOURCES_NAME as DEFAULT_RESOURCES_KEY from peppy import FLAGS, METADATA_KEY, OUTDIR_KEY import looper from looper.const import * @@ -31,7 +32,6 @@ SAMPLE_METADATA_RECORDS = [("sample" + str(i), p) for i, p in enumerate(ASSAYS)] DEFAULT_RESOURCES = { "file_size": "0", "cores": "1", "mem": "4000", "time": "00-01:00:00"} -DEFAULT_RESOURCES_KEY = "default" ATAC_SPEC = { PIPE_NAME_KEY: "PEPATAC", PIPE_PATH_KEY: ATAC_PIPE, RESOURCES_KEY: {DEFAULT_RESOURCES_KEY: DEFAULT_RESOURCES} From 203f17f50d5de225e49a5a877d4e9f06f07822fc Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 23 Apr 2019 11:34:33 -0400 Subject: [PATCH 19/61] use new CLI hook --- looper/looper.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 6f5f0e01b..40198347b 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -809,10 +809,9 @@ def main(): _LOGGER.error("Project config parse failed -- {}".format(e)) sys.exit(1) - if hasattr(args, "compute"): - # Default is already loaded - if args.compute != DEFAULT_COMPUTE_RESOURCES_NAME: - prj.dcc.activate_package(args.compute) + compute_cli_spec = getattr(args, COMPUTE_KEY, None) + if compute_cli_spec and compute_cli_spec != DEFAULT_COMPUTE_RESOURCES_NAME: + prj.dcc.activate_package(compute_cli_spec) _LOGGER.debug("Results subdir: " + prj.metadata[RESULTS_SUBDIR_KEY]) From 045b8180ca9fa2b03432345e9c99e27f22bcf89d Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 23 Apr 2019 12:31:04 -0400 Subject: [PATCH 20/61] get complementary test passing --- tests/integration/test_project_get_outputs.py | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py index 32d7f4347..600959291 100644 --- a/tests/integration/test_project_get_outputs.py +++ b/tests/integration/test_project_get_outputs.py @@ -264,11 +264,9 @@ def 
test_only_subproject_has_outputs(tmpdir, ifaces, declared_outputs): exp = {pipe_name: {k: (v, []) for k, v in outs.items()} for pipe_name, outs in declared_outputs.items() if pipe_name in {PROTO_NAMES[k] for k in used_iface_keys}} - print("EXP: {}".format(exp)) assert exp == prj.get_outputs(False) -@pytest.mark.skip("not implemented") @pytest.mark.parametrize("ifaces", [ [{WGBS_KEY: WGBS_IFACE_LINES}], [{RRBS_KEY: RRBS_IFACE_LINES}], [{WGBS_KEY: WGBS_IFACE_LINES}, {RRBS_KEY: RRBS_IFACE_LINES}]]) @@ -297,29 +295,37 @@ def test_only_main_project_has_outputs(tmpdir, ifaces, declared_outputs): "Nonempty main/subs iface path intersection: {}". \ format(", ".join(iface_path_intersect)) - sp_name = "testing_subproj" - md[SUBPROJECTS_SECTION] = {sp_name: { - METADATA_KEY: {PIPELINE_INTERFACES_KEY: sp_ifaces_paths}}} - # DEBUG print("Metadata: {}".format(md)) - keyed_outputs = { - pk: declared_outputs[pk] for pk in - set(itertools.chain(*[pi.keys() for pi in ifaces]))} + used_iface_keys = set(itertools.chain(*[pi.keys() for pi in ifaces])) + keyed_outputs = {pk: declared_outputs[PROTO_NAMES[pk]] + for pk in used_iface_keys} for path, data in zip(iface_paths, ifaces): _write_iface_file(path, data, outputs_by_pipe_key=keyed_outputs) for path, data in zip(sp_ifaces_paths, ifaces): _write_iface_file(path, data) - prj = _write_and_build_prj(cfg, {METADATA_KEY: md}) + sp_name = "testing_subproj" + prj = _write_and_build_prj(cfg, { + METADATA_KEY: md, + SUBPROJECTS_SECTION: { + sp_name: { + METADATA_KEY: { + PIPELINE_INTERFACES_KEY: sp_ifaces_paths + } + } + } + }) # DEBUG print("TABLE below:\n{}".format(prj.sample_table)) assert len(prj.get_outputs(False)) > 0 - assert {PROTO_NAMES[k]: outs for k, outs in declared_outputs.items()} == \ - prj.get_outputs(False) + exp = {pipe_name: {k: (v, []) for k, v in outs.items()} + for pipe_name, outs in declared_outputs.items() + if pipe_name in {PROTO_NAMES[k] for k in used_iface_keys}} + assert exp == prj.get_outputs(False) prj.activate_subproject(sp_name) assert len(prj.get_outputs(False)) == 0 assert {} == prj.get_outputs(False) From aa4076e56fc41d34c53f69eb767fb7cbcfe1a87a Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 23 Apr 2019 15:04:19 -0400 Subject: [PATCH 21/61] pass main/subs interaction tests for get_outputs --- tests/integration/test_project_get_outputs.py | 63 ++++++++++++++++++- 1 file changed, 60 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py index 600959291..3550795f3 100644 --- a/tests/integration/test_project_get_outputs.py +++ b/tests/integration/test_project_get_outputs.py @@ -331,34 +331,91 @@ def test_only_main_project_has_outputs(tmpdir, ifaces, declared_outputs): assert {} == prj.get_outputs(False) -@pytest.mark.skip("not implemented") -def test_main_project_and_subproject_have_outputs(): +def test_multiple_project_units_have_declare_interfaces_with_outputs(tmpdir): """ Activation state affects status of Project's outputs. """ - pass + + # Generate config filepaths. + iface_paths = set() + while len(iface_paths) < 3: + iface_paths.add(tmpdir.join(randconf()).strpath) + iface_paths = list(iface_paths) + + # Collect the Project config data. 
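+    # The shape being assembled (sketch; <...> are placeholders):
+    #   metadata:
+    #     output_dir: <tmpdir>
+    #     pipeline_interfaces: <main iface file>
+    #   subprojects:
+    #     sp1: {metadata: {pipeline_interfaces: <second iface file>}}
+    #     sp2: {metadata: {pipeline_interfaces: <third iface file>}}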
+ main_iface_file, sp_iface_files = iface_paths[0], iface_paths[1:] + sp_files = dict(zip(["sp1", "sp2"], sp_iface_files)) + prj_dat = { + METADATA_KEY: { + OUTDIR_KEY: tmpdir.strpath, + PIPELINE_INTERFACES_KEY: main_iface_file + }, + SUBPROJECTS_SECTION: {n: {METADATA_KEY: {PIPELINE_INTERFACES_KEY: f}} + for n, f in sp_files.items()} + } + + # Generate Project config filepath and create Project. + while True: + conf_file = tmpdir.join(randconf()).strpath + if conf_file not in iface_paths: + break + for f, (lines_spec, outs_spec) in zip( + iface_paths, + [({WGBS_KEY: WGBS_IFACE_LINES}, {WGBS_KEY: DECLARED_OUTPUTS}), + ({RRBS_KEY: RRBS_IFACE_LINES}, {RRBS_KEY: DECLARED_OUTPUTS}), + ({WGBS_KEY: WGBS_IFACE_LINES, RRBS_KEY: RRBS_IFACE_LINES}, + {WGBS_KEY: DECLARED_OUTPUTS, RRBS_KEY: DECLARED_OUTPUTS})]): + _write_iface_file(f, lines_group_by_pipe_key=lines_spec, + outputs_by_pipe_key=outs_spec) + + prj = _write_and_build_prj(conf_file, prj_dat) + + # DEBUG + print("TMPDIR contents:\n{}".format("\n".join( + os.path.join(tmpdir.strpath, f) for f in os.listdir(tmpdir.strpath)))) + + def observe(p): + return p.get_outputs(False) + + def extract_just_path_template(out_res): + # DEBUG + print("out_res: {}".format(out_res)) + return {pipe_name: {k: v for k, (v, _) in outs.items()} + for pipe_name, outs in out_res.items()} + + assert {WGBS_NAME: DECLARED_OUTPUTS} == extract_just_path_template(observe(prj)) + prj.activate_subproject("sp1") + assert {RRBS_NAME: DECLARED_OUTPUTS} == extract_just_path_template(observe(prj)) + prj.activate_subproject("sp2") + assert {pn: DECLARED_OUTPUTS for pn in [WGBS_NAME, RRBS_NAME]} == \ + extract_just_path_template(observe(prj)) @pytest.mark.skip("not implemented") def test_no_samples_match_protocols_with_outputs(skip_sample_less): + """ get_outputs behavior is sensitive to protocol match and skip flag. """ pass @pytest.mark.skip("not implemented") def test_pipeline_identifier_collision_same_data(): + """ Interface data that differs from another with same identifier is unexceptional. """ pass @pytest.mark.skip("not implemented") def test_pipeline_identifier_collision_different_data(): + """ Interface data that differs from another with same identifier is exceptional. """ pass @pytest.mark.skip("not implemented") def test_sample_collection_accuracy(): + """ Names of samples collected for each pipeline are as expected. """ pass @pytest.mark.skip("not implemented") def test_protocol_collection_accuracy(): + """ Names of protocols collected for each pipeline are as expected. """ pass From 86b0f43a52c2fa044546417dd03f21e80bfc9e34 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 23 Apr 2019 15:43:32 -0400 Subject: [PATCH 22/61] remove message duplication; #171 --- looper/conductor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/looper/conductor.py b/looper/conductor.py index 35613c15f..b48c7fec8 100644 --- a/looper/conductor.py +++ b/looper/conductor.py @@ -205,10 +205,9 @@ def add_sample(self, sample, sample_subtype=Sample, rerun=False): missing_reqs_msg = "{}: {}".format( missing_reqs_general, missing_reqs_specific) if self.prj.permissive: - _LOGGER.warning(missing_reqs_msg) + _LOGGER.warning("> Not submitted: %s", missing_reqs_msg) else: raise error_type(missing_reqs_msg) - _LOGGER.warning("> Not submitted: %s", missing_reqs_msg) use_this_sample and skip_reasons.append(missing_reqs_general) # Check if single_or_paired value is recognized. 
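The conductor hunk above (#171) stops the missing-requirements message from being logged twice: the warning now fires only on the permissive path, and strict mode raises instead of also logging. A minimal standalone sketch of that warn-or-raise shape; the names mirror the hunk (`permissive`, `error_type`, the message) but this is not the conductor's actual method:

```python
import logging

_LOGGER = logging.getLogger(__name__)

def handle_missing_requirements(msg, permissive, error_type=ValueError):
    """Warn exactly once in permissive mode; otherwise raise, without logging."""
    if permissive:
        _LOGGER.warning("> Not submitted: %s", msg)
    else:
        raise error_type(msg)
```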
From 659be8420d1f312ed7f51089c78cd7718212b70f Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 23 Apr 2019 16:31:25 -0400 Subject: [PATCH 23/61] pass same-data collision test --- tests/integration/test_project_get_outputs.py | 92 ++++++++++++++++--- 1 file changed, 81 insertions(+), 11 deletions(-) diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py index 3550795f3..fbdd76ffb 100644 --- a/tests/integration/test_project_get_outputs.py +++ b/tests/integration/test_project_get_outputs.py @@ -353,10 +353,7 @@ def test_multiple_project_units_have_declare_interfaces_with_outputs(tmpdir): } # Generate Project config filepath and create Project. - while True: - conf_file = tmpdir.join(randconf()).strpath - if conf_file not in iface_paths: - break + conf_file = make_temp_file_path(folder=tmpdir.strpath, known=iface_paths) for f, (lines_spec, outs_spec) in zip( iface_paths, [({WGBS_KEY: WGBS_IFACE_LINES}, {WGBS_KEY: DECLARED_OUTPUTS}), @@ -389,16 +386,72 @@ def extract_just_path_template(out_res): extract_just_path_template(observe(prj)) -@pytest.mark.skip("not implemented") -def test_no_samples_match_protocols_with_outputs(skip_sample_less): +@pytest.mark.parametrize("noskip", [False, True]) +@pytest.mark.parametrize("protocols", + [[], [random.choice(["INVALID", "NULL"]) for _ in range(10)]]) +@pytest.mark.parametrize("declared_outputs", + [{n: DECLARED_OUTPUTS for n in [RRBS_NAME, WGBS_NAME]}]) +def test_no_samples_match_protocols_with_outputs( + tmpdir, noskip, protocols, declared_outputs): """ get_outputs behavior is sensitive to protocol match and skip flag. """ - pass + temproot = tmpdir.strpath + path_iface_file = tmpdir.join(randconf()).strpath + prj_cfg = make_temp_file_path(folder=temproot, known=[path_iface_file]) + prj_dat = { + METADATA_KEY: { + OUTDIR_KEY: temproot, + PIPELINE_INTERFACES_KEY: path_iface_file + } + } + if protocols: + anns_file = make_temp_file_path( + folder=temproot, known=[path_iface_file, prj_cfg]) + anns_data = [("sample{}".format(i), p) for i, p in enumerate(protocols)] + with open(anns_file, 'w') as f: + for n, p in [(SAMPLE_NAME_COLNAME, ASSAY_KEY)] + anns_data: + f.write("{},{}\n".format(n, p)) + prj_dat[METADATA_KEY][SAMPLE_ANNOTATIONS_KEY] = anns_file + _write_iface_file( + path_iface_file, {WGBS_KEY: WGBS_IFACE_LINES, RRBS_KEY: RRBS_IFACE_LINES}, + outputs_by_pipe_key={PROTOMAP[n]: DECLARED_OUTPUTS for n in declared_outputs.keys()}) + prj = _write_and_build_prj(prj_cfg, prj_dat) + exp = { + pipe_name: { + path_key: (path_temp, []) + for path_key, path_temp in decl_outs.items()} + for pipe_name, decl_outs in declared_outputs.items() + } if noskip else {} + assert exp == prj.get_outputs(not noskip) + + +@pytest.mark.parametrize("protomap", [None, PROTOMAP]) +@pytest.mark.parametrize("include_outputs", [False, True]) +def test_pipeline_identifier_collision_same_data(tmpdir, protomap, include_outputs): + """ Interface data that differs from another with same identifier is unexceptional. """ + temproot = tmpdir.strpath + lines_groups = {WGBS_KEY: WGBS_IFACE_LINES, RRBS_KEY: RRBS_IFACE_LINES} + outputs = {k: DECLARED_OUTPUTS for k in lines_groups.keys()} \ + if include_outputs else None -@pytest.mark.skip("not implemented") -def test_pipeline_identifier_collision_same_data(): - """ Interface data that differs from another with same identifier is unexceptional. 
""" - pass + def write_iface(f, pm): + _write_iface_file(f, lines_groups, outputs, pm) + + iface_file_1 = os.path.join(temproot, "piface1.yaml") + write_iface(iface_file_1, protomap) + iface_file_2 = os.path.join(temproot, "piface2.yaml") + write_iface(iface_file_2, protomap) + + prj_dat = { + METADATA_KEY: { + OUTDIR_KEY: tmpdir.strpath, + PIPELINE_INTERFACES_KEY: [iface_file_1, iface_file_2] + } + } + prj = _write_and_build_prj(os.path.join(temproot, "pc.yaml"), prj_dat) + exp = {n: {k: (v, []) for k, v in DECLARED_OUTPUTS.items()} + for n in [WGBS_NAME, RRBS_NAME]} if include_outputs else {} + assert exp == prj.get_outputs(skip_sample_less=False) @pytest.mark.skip("not implemented") @@ -491,3 +544,20 @@ def read_iface_data(fp, lines): yaml.dump(data, f) return path_iface_file + + +def make_temp_file_path(folder, known, generate=randconf): + """ + Generate a new tempfile path. + + :param str folder: path to folder that represents parent of path to + generate, i.e. the path to the folder to which a randomized filename + is to be joined + :param Iterable[str] known: collection of current filePATHs + :param function() -> str generate: how to generate fileNAME + :return str: randomly generated filepath that doesn't match a known value + """ + while True: + fp = os.path.join(folder, generate()) + if fp not in known: + return fp From 873a522e2b44088a18d08bf3f4252a053cfefd1a Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 23 Apr 2019 17:15:01 -0400 Subject: [PATCH 24/61] temp commit --- tests/integration/test_project_get_outputs.py | 58 ++++++++++++++++--- 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py index fbdd76ffb..9bffe33f5 100644 --- a/tests/integration/test_project_get_outputs.py +++ b/tests/integration/test_project_get_outputs.py @@ -10,6 +10,7 @@ import yaml from looper import Project as LP from looper.const import * +from looper.exceptions import DuplicatePipelineKeyException from looper.pipeline_interface import PL_KEY, PROTOMAP_KEY from attmap import AttMap from peppy.const import * @@ -33,8 +34,7 @@ PROTO_NAMES = {WGBS_KEY: WGBS_NAME, RRBS_KEY: RRBS_NAME} -WGBS_IFACE_LINES = """ -name: {n} +WGBS_IFACE_LINES = """name: {n} path: src/wgbs.py required_input_files: [data_source] ngs_input_files: [data_source] @@ -51,8 +51,7 @@ time: "0-02:00:00" """.format(n=WGBS_NAME).splitlines(True) -RRBS_IFACE_LINES = """ -name: {n} +RRBS_IFACE_LINES = """name: {n} path: src/rrbs.py required_input_files: [data_source] all_input_files: [data_source, read1, read2] @@ -373,8 +372,6 @@ def observe(p): return p.get_outputs(False) def extract_just_path_template(out_res): - # DEBUG - print("out_res: {}".format(out_res)) return {pipe_name: {k: v for k, (v, _) in outs.items()} for pipe_name, outs in out_res.items()} @@ -454,10 +451,53 @@ def write_iface(f, pm): assert exp == prj.get_outputs(skip_sample_less=False) -@pytest.mark.skip("not implemented") -def test_pipeline_identifier_collision_different_data(): +@pytest.mark.parametrize("protomap", [None, PROTOMAP]) +@pytest.mark.parametrize("include_outputs", [False, True]) +@pytest.mark.parametrize("rep_key", [WGBS_KEY, RRBS_KEY]) +def test_pipeline_identifier_collision_different_data( + tmpdir, include_outputs, protomap, skip_sample_less, rep_key): """ Interface data that differs from another with same identifier is exceptional. 
""" - pass + temproot = tmpdir.strpath + + def write_iface(f, lines_group): + out_by_key = {k: DECLARED_OUTPUTS for k in lines_group} \ + if include_outputs else None + _write_iface_file(f, lines_group, out_by_key, pm=protomap) + + iface_file_1 = os.path.join(temproot, "piface1.yaml") + write_iface(iface_file_1, {rep_key: WGBS_IFACE_LINES}) + iface_file_2 = os.path.join(temproot, "piface2.yaml") + write_iface(iface_file_2, {rep_key: RRBS_IFACE_LINES}) + + def observe(): + prj_cfg = os.path.join(temproot, "pc.yaml") + prj_dat = { + METADATA_KEY: { + OUTDIR_KEY: tmpdir.strpath, + PIPELINE_INTERFACES_KEY: [iface_file_1, iface_file_2] + } + } + return _write_and_build_prj(prj_cfg, prj_dat).get_outputs(skip_sample_less) + + try: + observe() + except Exception as e: + pytest.fail("Unexpected exception: {}".format(e)) + + write_iface(iface_file_1, {rep_key: WGBS_IFACE_LINES[1:]}) + write_iface(iface_file_2, {rep_key: RRBS_IFACE_LINES[1:]}) + + # DEBUG + def print_iface(fp): + with open(fp, 'r') as f: + return yaml.load(f, yaml.SafeLoader) + + # DEBUG + print("First interface contents (below):\n{}\n".format(print_iface(iface_file_1))) + print("Second interface contents (below):\n{}".format(print_iface(iface_file_2))) + + with pytest.raises(DuplicatePipelineKeyException): + observe() @pytest.mark.skip("not implemented") From e7040690fc8384693b0cba767cd33c56b2eac4e8 Mon Sep 17 00:00:00 2001 From: Vince Date: Wed, 24 Apr 2019 10:21:45 -0400 Subject: [PATCH 25/61] setting up additional cases --- tests/integration/test_project_get_outputs.py | 133 +++++++++++++++--- 1 file changed, 115 insertions(+), 18 deletions(-) diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py index 9bffe33f5..ab593cc92 100644 --- a/tests/integration/test_project_get_outputs.py +++ b/tests/integration/test_project_get_outputs.py @@ -501,17 +501,40 @@ def print_iface(fp): @pytest.mark.skip("not implemented") -def test_sample_collection_accuracy(): +def test_sample_collection_accuracy(tmpdir, skip_sample_less): """ Names of samples collected for each pipeline are as expected. """ - pass + samples = [("sampleA", "WGBS"), ("sample2", "HiChIP"), ("sampleC", "scRNA"), ("sample4", "ATAC")] + exp_base = {WGBS_NAME: {: (, )}} + if skip_sample_less: + pass + else: + pass + assert exp == prj.get_outputs(skip_sample_less) @pytest.mark.skip("not implemented") -def test_protocol_collection_accuracy(): +def test_protocol_collection_accuracy(tmpdir): """ Names of protocols collected for each pipeline are as expected. """ pass +def make_temp_file_path(folder, known, generate=randconf): + """ + Generate a new tempfile path. + + :param str folder: path to folder that represents parent of path to + generate, i.e. 
the path to the folder to which a randomized filename + is to be joined + :param Iterable[str] known: collection of current filePATHs + :param function() -> str generate: how to generate fileNAME + :return str: randomly generated filepath that doesn't match a known value + """ + while True: + fp = os.path.join(folder, generate()) + if fp not in known: + return fp + + def _find_reps(objs): """ Find (and count) repeated objects @@ -585,19 +608,93 @@ def read_iface_data(fp, lines): return path_iface_file +RNA_LINES = """protocol_mapping: + RNA-seq: > + rnaBitSeq.py -f; + rnaKallisto.py; + rnaTopHat.py -f + SMART: > + rnaBitSeq.py -f; + rnaTopHat.py -f + +pipelines: + rnaBitSeq.py: + name: rnaBitSeq + path: src/rnaBitSeq.py + arguments: + "--sample-name": sample_name + "--genome": transcriptome + "--input": data_source + "--single-or-paired": read_type + required_input_files: [data_source] + ngs_input_files: [data_source] + resources: + default: + file_size: "0" + cores: "6" + mem: "36000" + time: "2-00:00:00" + large: + file_size: "4" + cores: "6" + mem: "44000" + time: "2-00:00:00" + + rnaTopHat.py: + name: rnaTopHat + path: src/rnaTopHat.py + required_input_files: [data_source] + ngs_input_files: [data_source] + arguments: + "--sample-name": sample_name + "--genome": genome + "--input": data_source + "--single-or-paired": read_type + resources: + default: + file_size: "0" + cores: "2" + mem: "60000" + time: "7-00:00:00" + + rnaKallisto.py: + name: rnaKallisto + path: src/rnaKallisto.py + required_input_files: [data_source] + ngs_input_files: [data_source] + arguments: + "--sample-yaml": yaml_file + "--sample-name": sample_name + "--input": data_source + "--single-or-paired": read_type + optional_arguments: + "--input2": read2 + "--fragment-length": fragment_length + "--fragment-length-sdev": fragment_length_sdev + resources: + default: + cores: "2" + mem: "4000" + time: "0-6:00:00" + normal: + min_file_size: "3" + cores: "2" + mem: "8000" + time: "0-12:00:00" +""".splitlines(True) + + +@pytest.fixture(scope="function") +def kallisto_lines(): + pass + + +@pytest.fixture(scope="function") +def bitseq_lines(): + pass + + +@pytest.fixture(scope="function") +def tophat_lines(): + pass -def make_temp_file_path(folder, known, generate=randconf): - """ - Generate a new tempfile path. - - :param str folder: path to folder that represents parent of path to - generate, i.e. 
the path to the folder to which a randomized filename
-        is to be joined
-    :param Iterable[str] known: collection of current filePATHs
-    :param function() -> str generate: how to generate fileNAME
-    :return str: randomly generated filepath that doesn't match a known value
-    """
-    while True:
-        fp = os.path.join(folder, generate())
-        if fp not in known:
-            return fp

From 5af3112e3d7e5413d433cac16be742672d862387 Mon Sep 17 00:00:00 2001
From: Vince Reuter
Date: Wed, 24 Apr 2019 12:50:13 -0400
Subject: [PATCH 26/61] new dev version

---
 looper/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/looper/_version.py b/looper/_version.py
index 008a1d204..b19938576 100644
--- a/looper/_version.py
+++ b/looper/_version.py
@@ -1,2 +1,2 @@
-__version__ = "0.11.1"
+__version__ = "0.12dev"

From 975d1efcd92907f8c43e6e4d1a2953d44c8d2520 Mon Sep 17 00:00:00 2001
From: Vince Reuter
Date: Wed, 24 Apr 2019 13:19:22 -0400
Subject: [PATCH 27/61] temp

---
 tests/integration/test_project_get_outputs.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py
index ab593cc92..9cad6fd41 100644
--- a/tests/integration/test_project_get_outputs.py
+++ b/tests/integration/test_project_get_outputs.py
@@ -1,6 +1,6 @@
 """ Tests for interaction between Project and PipelineInterface """

-from collections import Counter
+from collections import Counter, namedtuple
 from copy import deepcopy
 import itertools
 import os
@@ -503,8 +503,10 @@ def print_iface(fp):

 @pytest.mark.skip("not implemented")
 def test_sample_collection_accuracy(tmpdir, skip_sample_less):
     """ Names of samples collected for each pipeline are as expected. """
-    samples = [("sampleA", "WGBS"), ("sample2", "HiChIP"), ("sampleC", "scRNA"), ("sample4", "ATAC")]
-    exp_base = {WGBS_NAME: DECLARED_OUTPUTS}  # TODO: fill in remaining expected outputs
+    scrna_proto = "scRNA"
+    samples = [("sampleA", WGBS_NAME), ("sample2", "HiChIP"),
+               ("sampleC", scrna_proto), ("sample4", "ATAC")]
+    exp_base = {WGBS_NAME: DECLARED_OUTPUTS, scrna_proto: None}  # TODO: declare expected scRNA outputs
     if skip_sample_less:
         pass
     else:
         pass
     assert exp == prj.get_outputs(skip_sample_less)
@@ -684,6 +686,17 @@ def read_iface_data(fp, lines):
 """.splitlines(True)


+class PipeSpec(object):
+    def __init__(self, key, name=None):
+        assert "" != os.path.splitext(key)[1]
+        self.key = key
+        self.name = name or key.rstrip(".py")
+
+RNA_PIPES = {"kallisto": PipeSpec("rnaKallisto.py"),
+             "tophat": PipeSpec("rnaTopHat.py"),
+             "bitseq": PipeSpec("rnaBitSeq.py")}
+

From c3a05fa611f2a819634f4ee153a28cfbce3c7 Mon Sep 17 00:00:00 2001
From: Vince Reuter
Date: Wed, 24 Apr 2019 13:20:01 -0400
Subject: [PATCH 28/61] include jinja for tests

---
 requirements/requirements-dev.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt
index 2e300d8dd..767976247 100644
--- a/requirements/requirements-dev.txt
+++ b/requirements/requirements-dev.txt
@@ -1,2 +1,4 @@
+jinja2
 mock==2.0.0
 pytest==3.0.7
+

From 6ccda3f93ade0bf6e46b1574db00797a6b389b9b Mon Sep 17 00:00:00 2001
From: Vince Reuter
Date: Wed, 24 Apr 2019 18:29:20 -0400
Subject: [PATCH 29/61] additional outputs testing

---
 looper/pipeline_interface.py                  |  13 +-
 tests/integration/test_project_get_outputs.py | 152 ++++++++++++------
 2 files changed, 111 insertions(+), 54 deletions(-)

diff --git a/looper/pipeline_interface.py b/looper/pipeline_interface.py
index 8b386f2f7..dabf5f667 100644
--- a/looper/pipeline_interface.py
+++ 
b/looper/pipeline_interface.py @@ -55,8 +55,15 @@ def __init__(self, config): _LOGGER.debug("Parsing '%s' for %s config data", config, self.__class__.__name__) self.pipe_iface_file = config - with open(config, 'r') as f: - config = yaml.load(f, SafeLoader) + try: + with open(config, 'r') as f: + config = yaml.load(f, SafeLoader) + except yaml.parser.ParserError: + with open(config, 'r') as f: + _LOGGER.error( + "Failed to parse YAML from {}:\n{}". + format(config, "".join(f.readlines()))) + raise self.source = config # Check presence of 2 main sections (protocol mapping and pipelines). @@ -668,7 +675,6 @@ def class_names(cs): class_names(proper_subtypes))) - def _fetch_classes(mod): """ Return the classes defined in a module. """ try: @@ -679,7 +685,6 @@ def _fetch_classes(mod): return list(classes) - def _proper_subtypes(types, supertype): """ Determine the proper subtypes of a supertype. """ return list(filter( diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py index 9cad6fd41..6906097a1 100644 --- a/tests/integration/test_project_get_outputs.py +++ b/tests/integration/test_project_get_outputs.py @@ -74,6 +74,10 @@ IFACE_LINES = {WGBS_KEY: WGBS_IFACE_LINES, RRBS_KEY: RRBS_IFACE_LINES} +KALLISTO_ABUNDANCES_KEY = "abundances" +KALLISTO_ABUNDANCES_TEMPLATE = "\"{sample.name}_isoforms.txt\"" + + def pytest_generate_tests(metafunc): """ Test case generation and parameterization for this module. """ skip_empty_flag = "skip_sample_less" @@ -500,17 +504,48 @@ def print_iface(fp): observe() -@pytest.mark.skip("not implemented") -def test_sample_collection_accuracy(tmpdir, skip_sample_less): +def test_sample_collection_accuracy(tmpdir, skip_sample_less, rna_pi_lines): """ Names of samples collected for each pipeline are as expected. 
""" + temproot = tmpdir.strpath scrna_proto = "scRNA" samples = [("sampleA", WGBS_NAME), ("sample2", "HiChIP"), - ("sampleC", scrna_proto), ("sample4", "ATAC")] - exp_base = {WGBS_NAME: DECLARED_OUTPUTS, scrna_proto: } - if skip_sample_less: - pass - else: - pass + ("sampleC", scrna_proto), ("sample4", "ATAC"), + ("sampleE", WGBS_NAME), ("sample6", "HiChIP"), + ("sampleG", scrna_proto), ("sample8", "ATAC")] + iface_files = list(get_temp_paths(2, temproot)) + anns_file = make_temp_file_path( + temproot, iface_files, + generate=lambda: "".join(randstr(LETTERS_AND_DIGITS, 20)) + ".csv") + with open(anns_file, 'w') as f: + f.write("\n".join("{},{}".format(*pair) for pair in + [(SAMPLE_NAME_COLNAME, ASSAY_KEY)] + samples)) + _write_iface_file( + iface_files[0], + lines_group_by_pipe_key={WGBS_KEY: WGBS_IFACE_LINES}, + outputs_by_pipe_key={WGBS_KEY: DECLARED_OUTPUTS}, pm=PROTOMAP) + with open(iface_files[1], 'w') as f: + for l in rna_pi_lines: + f.write(l) + prj_dat = { + METADATA_KEY: { + OUTDIR_KEY: tmpdir.strpath, + PIPELINE_INTERFACES_KEY: iface_files, + SAMPLE_ANNOTATIONS_KEY: anns_file + } + } + prj_cfg = make_temp_file_path(temproot, iface_files + [anns_file]) + prj = _write_and_build_prj(prj_cfg, prj_dat) + kallisto_outputs = {KALLISTO_ABUNDANCES_KEY: KALLISTO_ABUNDANCES_TEMPLATE} + exp_base = { + WGBS_NAME: DECLARED_OUTPUTS, + scrna_proto: {RNA_PIPES["kallisto"].name: kallisto_outputs} + } + exp = { + pipe_name: { + out_key: (out_val, [sn for sn, pn in samples if pn == pipe_name]) + for out_key, out_val in outs.items()} + for pipe_name, outs in exp_base.items() + } assert exp == prj.get_outputs(skip_sample_less) @@ -520,6 +555,28 @@ def test_protocol_collection_accuracy(tmpdir): pass +def get_temp_paths(n, folder, known=None, generate=randconf): + """ + Generate unique tempfile paths pointing to within a particular folder. + + :param str folder: path to folder into which randomly generated filepaths + should point + :param Iterable[str] known: collection of filepaths to prohibit a + match to for a newly generated path + :param function() -> str generate: how to randomly generate a filename + :return Iterable[str]: collection of unique tempfile paths pointing to + within a particular folder. + """ + paths = set() + known = set(known or []) + gen = lambda pool: make_temp_file_path(folder, pool, generate) + while len(paths) < n: + p = gen(known) + known.add(p) + paths.add(p) + return paths + + def make_temp_file_path(folder, known, generate=randconf): """ Generate a new tempfile path. 
@@ -610,18 +667,33 @@ def read_iface_data(fp, lines): return path_iface_file -RNA_LINES = """protocol_mapping: - RNA-seq: > - rnaBitSeq.py -f; - rnaKallisto.py; - rnaTopHat.py -f - SMART: > - rnaBitSeq.py -f; - rnaTopHat.py -f + +class PipeSpec(object): + def __init__(self, key, name=None): + assert "" != os.path.splitext(key)[1] + self.key = key + self.name = name or key.rstrip(".py") + + +RNA_PIPES = {"kallisto": PipeSpec("rnaKallisto.py"), + "tophat": PipeSpec("rnaTopHat.py"), + "bitseq": PipeSpec("rnaBitSeq.py")} + + +@pytest.fixture(scope="function") +def rna_outputs_declaration(): + pass + + +@pytest.fixture(scope="function") +def rna_pi_lines(): + return """protocol_mapping: + RNA-seq: [{bs_name}, {kall_name}, {th_name}] + SMART: [{bs_name}, {th_name}] pipelines: - rnaBitSeq.py: - name: rnaBitSeq + {bs_key}: + name: {bs_name} path: src/rnaBitSeq.py arguments: "--sample-name": sample_name @@ -642,8 +714,8 @@ def read_iface_data(fp, lines): mem: "44000" time: "2-00:00:00" - rnaTopHat.py: - name: rnaTopHat + {th_key}: + name: {th_name} path: src/rnaTopHat.py required_input_files: [data_source] ngs_input_files: [data_source] @@ -659,8 +731,8 @@ def read_iface_data(fp, lines): mem: "60000" time: "7-00:00:00" - rnaKallisto.py: - name: rnaKallisto + {kall_key}: + name: {kall_name} path: src/rnaKallisto.py required_input_files: [data_source] ngs_input_files: [data_source] @@ -673,6 +745,8 @@ def read_iface_data(fp, lines): "--input2": read2 "--fragment-length": fragment_length "--fragment-length-sdev": fragment_length_sdev + outputs: + {abundances_key}: {abundances_val} resources: default: cores: "2" @@ -683,31 +757,9 @@ def read_iface_data(fp, lines): cores: "2" mem: "8000" time: "0-12:00:00" -""".splitlines(True) - - -class PipeSpec(object): - def __init__(self, key, name=None): - assert "" != os.path.splitext(key)[1] - self.key = key - self.name = name or key.rstrip(".py") - -RNA_PIPES = {"kallisto": PipeSpec("rnaKallisto.py"), - "tophat": PipeSpec("rnaTopHat.py"), - "bitseq": PipeSpec("rnaBitSeq.py")} - - -@pytest.fixture(scope="function") -def kallisto_lines(): - pass - - -@pytest.fixture(scope="function") -def bitseq_lines(): - pass - - -@pytest.fixture(scope="function") -def tophat_lines(): - pass - +""".format( + bs_key=RNA_PIPES["bitseq"].key, bs_name=RNA_PIPES["bitseq"].name, + th_key=RNA_PIPES["tophat"].key, th_name=RNA_PIPES["tophat"].name, + kall_key=RNA_PIPES["kallisto"].key, kall_name=RNA_PIPES["kallisto"].name, + abundances_key=KALLISTO_ABUNDANCES_KEY, + abundances_val=KALLISTO_ABUNDANCES_TEMPLATE).splitlines(True) From 2a64e2e579450ee254cb373cdf587187c32c6a78 Mon Sep 17 00:00:00 2001 From: Vince Date: Wed, 24 Apr 2019 20:06:20 -0400 Subject: [PATCH 30/61] pass the sample selection test --- tests/integration/test_project_get_outputs.py | 40 +++++++++---------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py index 6906097a1..725fe9a5f 100644 --- a/tests/integration/test_project_get_outputs.py +++ b/tests/integration/test_project_get_outputs.py @@ -73,9 +73,9 @@ PROTOMAP = {RRBS_NAME: RRBS_KEY, WGBS_NAME: WGBS_KEY, "EG": WGBS_KEY} IFACE_LINES = {WGBS_KEY: WGBS_IFACE_LINES, RRBS_KEY: RRBS_IFACE_LINES} - +RNASEQ = "RNA-seq" KALLISTO_ABUNDANCES_KEY = "abundances" -KALLISTO_ABUNDANCES_TEMPLATE = "\"{sample.name}_isoforms.txt\"" +KALLISTO_ABUNDANCES_TEMPLATE = "{sample.name}_isoforms.txt" def pytest_generate_tests(metafunc): @@ -507,11 +507,10 @@ def print_iface(fp): def 
test_sample_collection_accuracy(tmpdir, skip_sample_less, rna_pi_lines): """ Names of samples collected for each pipeline are as expected. """ temproot = tmpdir.strpath - scrna_proto = "scRNA" samples = [("sampleA", WGBS_NAME), ("sample2", "HiChIP"), - ("sampleC", scrna_proto), ("sample4", "ATAC"), + ("sampleC", RNASEQ), ("sample4", "ATAC"), ("sampleE", WGBS_NAME), ("sample6", "HiChIP"), - ("sampleG", scrna_proto), ("sample8", "ATAC")] + ("sampleG", RNASEQ), ("sample8", "ATAC")] iface_files = list(get_temp_paths(2, temproot)) anns_file = make_temp_file_path( temproot, iface_files, @@ -528,23 +527,22 @@ def test_sample_collection_accuracy(tmpdir, skip_sample_less, rna_pi_lines): f.write(l) prj_dat = { METADATA_KEY: { + SAMPLE_ANNOTATIONS_KEY: anns_file, OUTDIR_KEY: tmpdir.strpath, - PIPELINE_INTERFACES_KEY: iface_files, - SAMPLE_ANNOTATIONS_KEY: anns_file + PIPELINE_INTERFACES_KEY: iface_files } } prj_cfg = make_temp_file_path(temproot, iface_files + [anns_file]) prj = _write_and_build_prj(prj_cfg, prj_dat) kallisto_outputs = {KALLISTO_ABUNDANCES_KEY: KALLISTO_ABUNDANCES_TEMPLATE} - exp_base = { - WGBS_NAME: DECLARED_OUTPUTS, - scrna_proto: {RNA_PIPES["kallisto"].name: kallisto_outputs} - } exp = { - pipe_name: { - out_key: (out_val, [sn for sn, pn in samples if pn == pipe_name]) - for out_key, out_val in outs.items()} - for pipe_name, outs in exp_base.items() + WGBS_NAME: {k: (v, [sn for sn, pn in samples if pn == WGBS_NAME]) for k, v in DECLARED_OUTPUTS.items()}, + RNA_PIPES["kallisto"].name: { + KALLISTO_ABUNDANCES_KEY: ( + KALLISTO_ABUNDANCES_TEMPLATE, + [sn for sn, prot in samples if prot == RNASEQ] + ) for k, v in kallisto_outputs.items() + } } assert exp == prj.get_outputs(skip_sample_less) @@ -559,6 +557,7 @@ def get_temp_paths(n, folder, known=None, generate=randconf): """ Generate unique tempfile paths pointing to within a particular folder. 
+ :param int n: number of paths to generate :param str folder: path to folder into which randomly generated filepaths should point :param Iterable[str] known: collection of filepaths to prohibit a @@ -669,6 +668,7 @@ def read_iface_data(fp, lines): class PipeSpec(object): + """ Pipeline key and name """ def __init__(self, key, name=None): assert "" != os.path.splitext(key)[1] self.key = key @@ -680,15 +680,10 @@ def __init__(self, key, name=None): "bitseq": PipeSpec("rnaBitSeq.py")} -@pytest.fixture(scope="function") -def rna_outputs_declaration(): - pass - - @pytest.fixture(scope="function") def rna_pi_lines(): return """protocol_mapping: - RNA-seq: [{bs_name}, {kall_name}, {th_name}] + {rnaseq_proto_name}: [{bs_name}, {kall_name}, {th_name}] SMART: [{bs_name}, {th_name}] pipelines: @@ -746,7 +741,7 @@ def rna_pi_lines(): "--fragment-length": fragment_length "--fragment-length-sdev": fragment_length_sdev outputs: - {abundances_key}: {abundances_val} + {abundances_key}: \"{abundances_val}\" resources: default: cores: "2" @@ -758,6 +753,7 @@ def rna_pi_lines(): mem: "8000" time: "0-12:00:00" """.format( + rnaseq_proto_name=RNASEQ, bs_key=RNA_PIPES["bitseq"].key, bs_name=RNA_PIPES["bitseq"].name, th_key=RNA_PIPES["tophat"].key, th_name=RNA_PIPES["tophat"].name, kall_key=RNA_PIPES["kallisto"].key, kall_name=RNA_PIPES["kallisto"].name, From 442388375b39efff19b2d0f6d7ba09bf389d43cd Mon Sep 17 00:00:00 2001 From: Vince Date: Wed, 24 Apr 2019 20:50:57 -0400 Subject: [PATCH 31/61] clear variables --- tests/integration/test_project_get_outputs.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py index 725fe9a5f..d52a1b9ba 100644 --- a/tests/integration/test_project_get_outputs.py +++ b/tests/integration/test_project_get_outputs.py @@ -534,14 +534,16 @@ def test_sample_collection_accuracy(tmpdir, skip_sample_less, rna_pi_lines): } prj_cfg = make_temp_file_path(temproot, iface_files + [anns_file]) prj = _write_and_build_prj(prj_cfg, prj_dat) - kallisto_outputs = {KALLISTO_ABUNDANCES_KEY: KALLISTO_ABUNDANCES_TEMPLATE} exp = { - WGBS_NAME: {k: (v, [sn for sn, pn in samples if pn == WGBS_NAME]) for k, v in DECLARED_OUTPUTS.items()}, + WGBS_NAME: { + k: (v, [sn for sn, pn in samples if pn == WGBS_NAME]) + for k, v in DECLARED_OUTPUTS.items() + }, RNA_PIPES["kallisto"].name: { KALLISTO_ABUNDANCES_KEY: ( KALLISTO_ABUNDANCES_TEMPLATE, [sn for sn, prot in samples if prot == RNASEQ] - ) for k, v in kallisto_outputs.items() + ) } } assert exp == prj.get_outputs(skip_sample_less) From f1f1bf850533e0bc58187aa32df6d7d04fcf6e0a Mon Sep 17 00:00:00 2001 From: Vince Date: Wed, 24 Apr 2019 20:54:50 -0400 Subject: [PATCH 32/61] remove stub that applies to internal-use-only method --- tests/integration/test_project_get_outputs.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py index d52a1b9ba..1f24bc788 100644 --- a/tests/integration/test_project_get_outputs.py +++ b/tests/integration/test_project_get_outputs.py @@ -549,12 +549,6 @@ def test_sample_collection_accuracy(tmpdir, skip_sample_less, rna_pi_lines): assert exp == prj.get_outputs(skip_sample_less) -@pytest.mark.skip("not implemented") -def test_protocol_collection_accuracy(tmpdir): - """ Names of protocols collected for each pipeline are as expected. 
""" - pass - - def get_temp_paths(n, folder, known=None, generate=randconf): """ Generate unique tempfile paths pointing to within a particular folder. From dc7e0266a9be2eb0aa044d31bd4a1bce6db4e06b Mon Sep 17 00:00:00 2001 From: Vince Date: Wed, 24 Apr 2019 21:04:37 -0400 Subject: [PATCH 33/61] tweak reqs --- requirements/requirements-all.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 19e73c2de..2faa2d0b4 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,7 +1,8 @@ -attmap>=0.7dev +attmap>=0.7 colorama>=0.3.9 logmuse>=0.0.2 pandas>=0.20.2 pyyaml>=3.12 divvy>=0.3.1 -peppy>=0.21dev +peppy>=0.20 +#peppy>=0.21dev From 3a90670fb4d2cbde14af9a84a20ac4843900acd6 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 25 Apr 2019 14:26:46 -0400 Subject: [PATCH 34/61] prevent optarg duplication; close #168 --- docs/autodoc_build/looper.md | 262 ++++++++++-------- docs/changelog.md | 2 + docs_jupyter/build/hello-world.md | 2 +- looper/conductor.py | 55 ++-- requirements/requirements-all.txt | 3 +- tests/helpers.py | 23 ++ tests/integration/test_project_get_outputs.py | 76 ++--- .../test_cli_prj_pipe_args_collision.py | 140 ++++++++++ 8 files changed, 364 insertions(+), 199 deletions(-) create mode 100644 tests/specific_use_cases/test_cli_prj_pipe_args_collision.py diff --git a/docs/autodoc_build/looper.md b/docs/autodoc_build/looper.md index 81bfb2a41..68aae851d 100644 --- a/docs/autodoc_build/looper.md +++ b/docs/autodoc_build/looper.md @@ -102,14 +102,24 @@ pairs of identifier for a kind of output and the collection of this Project's samples for which it's applicable (i.e., those samples with protocol that maps to the corresponding pipeline). ```python -def get_outputs(self) +def get_outputs(self, skip_sample_less=True) ``` +**Parameters:** + +- `skip_sample_less` -- `bool`: whether to omit pipelines that are forprotocols of which the Project has no Sample instances + + **Returns:** `Mapping[str, Mapping[str, namedtuple]]`: collection of bindingsbetween identifier for pipeline and collection of bindings between name for a kind of output and pair in which first component is a path template and the second component is a collection of sample names +**Raises:** + +- `TypeError`: if argument to sample-less pipeline skipping parameteris not a Boolean + + ### implied\_columns @@ -125,6 +135,19 @@ def implied_columns(self) +### interfaces +Get this Project's collection of pipeline interfaces +```python +def interfaces(self) +``` + +**Returns:** + +`Iterable[looper.PipelineInterface]`: collection of pipelineinterfaces known by this Project + + + + ### num\_samples Count the number of samples available in this Project. ```python @@ -309,6 +332,124 @@ Project needs certain metadata. Represent case in which sample sheet is specified but nonexistent. +## Class SubmissionConductor +Collects and then submits pipeline jobs. + +This class holds a 'pool' of commands to submit as a single cluster job. +Eager to submit a job, each instance's collection of commands expands until +it reaches the 'pool' has been filled, and it's therefore time to submit the +job. The pool fills as soon as a fill criteria has been reached, which can +be either total input file size or the number of individual commands. + + +### add\_sample +Add a sample for submission to this conductor. 
+```python +def add_sample(self, sample, sample_subtype=, rerun=False) +``` + +**Parameters:** + +- `sample` -- `Sample`: sample to be included with this conductor'scurrently growing collection of command submissions +- `sample_subtype` -- `type`: specific subtype associatedwith this new sample; this is used to tailor-make the sample instance as required by its protocol/pipeline and supported by the pipeline interface. +- `rerun` -- `bool`: whether the given sample is being rerun rather thanrun for the first time + + +**Returns:** + +`bool`: Indication of whether the given sample was added tothe current 'pool.' + + +**Raises:** + +- `TypeError`: If sample subtype is provided but does not extendthe base Sample class, raise a TypeError. + + + + +### failed\_samples +```python +def failed_samples(self) +``` + + + +### num\_cmd\_submissions +Return the number of commands that this conductor has submitted. +```python +def num_cmd_submissions(self) +``` + +**Returns:** + +`int`: Number of commands submitted so far. + + + + +### num\_job\_submissions +Return the number of jobs that this conductor has submitted. +```python +def num_job_submissions(self) +``` + +**Returns:** + +`int`: Number of jobs submitted so far. + + + + +### submit +Submit one or more commands as a job. + +This call will submit the commands corresponding to the current pool +of samples if and only if the argument to 'force' evaluates to a +true value, or the pool of samples is full. +```python +def submit(self, force=False) +``` + +**Parameters:** + +- `force` -- `bool`: Whether submission should be done/simulated evenif this conductor's pool isn't full. + + +**Returns:** + +`bool`: Whether a job was submitted (or would've been ifnot for dry run) + + + + +### write\_script +Create the script for job submission. +```python +def write_script(self, pool, size) +``` + +**Parameters:** + +- `pool` -- ``: +- `size` -- `int | float`: + + +**Returns:** + +`str`: Path to the job submission script created. + + + + +### write\_skipped\_sample\_scripts +For any sample skipped during initial processing, write submission script. +```python +def write_skipped_sample_scripts(self) +``` + + + + ## Class PipelineInterface This class parses, holds, and returns information for a yaml file that specifies how to interact with each individual pipeline. This includes both resources to request for cluster job submission, as well as arguments to be passed from the sample annotation metadata to the pipeline @@ -578,124 +719,5 @@ def uses_looper_args(self, pipeline_name) -## Class SubmissionConductor -Collects and then submits pipeline jobs. - -This class holds a 'pool' of commands to submit as a single cluster job. -Eager to submit a job, each instance's collection of commands expands until -it reaches the 'pool' has been filled, and it's therefore time to submit the -job. The pool fills as soon as a fill criteria has been reached, which can -be either total input file size or the number of individual commands. - - -### add\_sample -Add a sample for submission to this conductor. -```python -def add_sample(self, sample, sample_subtype=, rerun=False) -``` - -**Parameters:** - -- `sample` -- `Sample`: sample to be included with this conductor'scurrently growing collection of command submissions -- `sample_subtype` -- `type`: specific subtype associatedwith this new sample; this is used to tailor-make the sample instance as required by its protocol/pipeline and supported by the pipeline interface. 
-- `rerun` -- `bool`: whether the given sample is being rerun rather thanrun for the first time - - -**Returns:** - -`bool`: Indication of whether the given sample was added tothe current 'pool.' - - -**Raises:** - -- `TypeError`: If sample subtype is provided but does not extendthe base Sample class, raise a TypeError. - - - - -### failed\_samples -```python -def failed_samples(self) -``` - - - -### num\_cmd\_submissions -Return the number of commands that this conductor has submitted. -```python -def num_cmd_submissions(self) -``` - -**Returns:** - -`int`: Number of commands submitted so far. - - - - -### num\_job\_submissions -Return the number of jobs that this conductor has submitted. -```python -def num_job_submissions(self) -``` - -**Returns:** - -`int`: Number of jobs submitted so far. - - - - -### submit -Submit command(s) as a job. - -This call will submit the commands corresponding to the current pool -of samples if and only if the argument to 'force' evaluates to a -true value, or the pool of samples is full. -```python -def submit(self, force=False) -``` - -**Parameters:** - -- `force` -- `bool`: Whether submission should be done/simulated evenif this conductor's pool isn't full. - - -**Returns:** - -`bool`: Whether a job was submitted (or would've been ifnot for dry run) - - - - -### write\_script -Create the script for job submission. -```python -def write_script(self, pool, template_values, prj_argtext, looper_argtext) -``` - -**Parameters:** - -- `template_values` -- `Mapping`: Collection of template placeholderkeys and the values with which to replace them. -- `prj_argtext` -- `str`: Command text related to Project data. -- `looper_argtext` -- `str`: Command text related to looper arguments. - - -**Returns:** - -`str`: Path to the job submission script created. - - - - -### write\_skipped\_sample\_scripts -For any sample skipped during initial processing, write submission script. -```python -def write_skipped_sample_scripts(self) -``` - - - - **Version Information**: `looper` v0.12dev, generated by `lucidoc` v0.3.1 \ No newline at end of file diff --git a/docs/changelog.md b/docs/changelog.md index a7d8c0e1b..eee012111 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -3,6 +3,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. ## [Unreleased] +### Fixed +- Prevent duplication of CLI flags: [Issue 168](https://github.com/pepkit/looper/issues/168) ## [0.11.1] - 2019-04-17 diff --git a/docs_jupyter/build/hello-world.md b/docs_jupyter/build/hello-world.md index 3c0e55396..078ac64fe 100644 --- a/docs_jupyter/build/hello-world.md +++ b/docs_jupyter/build/hello-world.md @@ -87,7 +87,7 @@ Traceback (most recent call last): File "/home/nsheff/.local/lib/python3.5/site-packages/looper/utils.py", line 104, in determine_config_path raise ValueError("Path doesn't exist: {}".format(root)) ValueError: Path doesn't exist: project/project_config.yaml - + ``` Voila! You've run your very first pipeline across multiple samples using `looper`! 
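The conductor change below is the heart of this commit's fix for the flag-duplication issue: flags already supplied on the command line are collected into a set and passed to the Project, so the project-level `pipeline_args` text omits them. A toy illustration of that idea, using made-up flag names (this sketch is not the looper implementation itself, just the deduplication logic in miniature):

```python
# Toy sketch of the optarg de-duplication (hypothetical flags).
cli_extra_args = ["--random", "--arbitrary"]  # simulated CLI pipeline args
project_pipe_args = {"--arbitrary": None, "--does-not-matter": None}

# Flags the CLI already supplies; the project-level text should skip these.
already_on_cli = {a for a in cli_extra_args if a.startswith("-")}
from_project = [f for f in project_pipe_args if f not in already_on_cli]

extra_text = " ".join(cli_extra_args + from_project)
assert extra_text == "--random --arbitrary --does-not-matter"  # each flag once
```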
diff --git a/looper/conductor.py b/looper/conductor.py index 48bee29b5..517a596f4 100644 --- a/looper/conductor.py +++ b/looper/conductor.py @@ -88,7 +88,8 @@ def __init__(self, pipeline_key, pipeline_interface, cmd_base, prj, self.sample_subtype = sample_subtype or Sample self.compute_variables = compute_variables - self.extra_args_text = (extra_args and " ".join(extra_args)) or "" + self.extra_pipe_args = extra_args or [] + #self.extra_args_text = (extra_args and " ".join(extra_args)) or "" self.uses_looper_args = \ pipeline_interface.uses_looper_args(pipeline_key) self.ignore_flags = ignore_flags @@ -228,7 +229,7 @@ def add_sample(self, sample, sample_subtype=Sample, rerun=False): argstring = self.pl_iface.get_arg_string( pipeline_name=self.pl_key, sample=sample, submission_folder_path=self.prj.metadata[SUBMISSION_SUBDIR_KEY]) - except AttributeError as e: + except AttributeError: argstring = None # TODO: inform about which missing attribute. fail_message = "Required attribute missing " \ @@ -257,14 +258,24 @@ def add_sample(self, sample, sample_subtype=Sample, rerun=False): return skip_reasons def _get_settings_looptext_prjtext(self, size): + """ + Determine settings, looper argstring, and project argstring. + + :param int | float size: size of submission, used to select the proper + resource package from the pipeline interface + :return dict, str, str: collection of settings, looper argstring, and + project argstring + """ settings = self.pl_iface.choose_resource_package(self.pl_key, size) settings.update(self.compute_variables or {}) if self.uses_looper_args: settings.setdefault("cores", 1) - looper_argtext = create_looper_args_text(self.pl_key, settings, self.prj) + looper_argtext = \ + create_looper_args_text(self.pl_key, settings, self.prj) else: looper_argtext = "" - prj_argtext = self.prj.get_arg_string(self.pl_key) + prj_argtext = self.prj.get_arg_string( + self.pl_key, {x for x in self.extra_pipe_args if x.startswith("-")}) return settings, looper_argtext, prj_argtext def submit(self, force=False): @@ -286,14 +297,6 @@ def submit(self, force=False): submitted = False elif force or self._is_full(self._pool, self._curr_size): - _LOGGER.debug("Determining submission settings for %d sample " - "(%.2f Gb)", len(self._pool), self._curr_size) - settings, looper_argtext, prj_argtext = \ - self._get_settings_looptext_prjtext(self._curr_size) - assert all(map(lambda cmd_part: isinstance(cmd_part, str), - [self.cmd_base, prj_argtext, looper_argtext])), \ - "Each command component must be a string." 
- # Ensure that each sample is individually represented on disk, # specific to subtype as applicable (should just be a single # subtype for each submission conductor, but some may just be @@ -312,8 +315,7 @@ def submit(self, force=False): subtype_name, s.name) s.to_yaml(subs_folder_path=self.prj.metadata[SUBMISSION_SUBDIR_KEY]) - script = self.write_script(self._pool, settings, - prj_argtext=prj_argtext, looper_argtext=looper_argtext) + script = self.write_script(self._pool, self._curr_size) self._num_total_job_submissions += 1 @@ -391,20 +393,23 @@ def _jobname(self, pool): name = "lump{}".format(self._num_total_job_submissions + 1) return "{}_{}".format(self.pl_key, name) - def write_script(self, pool, template_values, prj_argtext, looper_argtext): + def _cmd_text_extra(self, size): + _LOGGER.debug("Determining submission settings for pool of size %.2f Gb", size) + settings, ltext, ptext = self._get_settings_looptext_prjtext(size) + from_cli = " ".join(self.extra_pipe_args) if self.extra_pipe_args else "" + return settings, " ".join([t for t in [ptext, ltext, from_cli] if t]) + + def write_script(self, pool, size): """ Create the script for job submission. - :param Mapping template_values: Collection of template placeholder - keys and the values with which to replace them. - :param str prj_argtext: Command text related to Project data. - :param str looper_argtext: Command text related to looper arguments. + :param Iterable[(peppy.Sample, str)] pool: collection of pairs in which + first component is a sample instance and second is command/argstring + :param float size: cumulative size of the given pool :return str: Path to the job submission script created. """ - # Determine the command text for the project, looper, and extra args. - texts = [prj_argtext, looper_argtext, self.extra_args_text] - extra_parts_text = " ".join([t for t in texts if t]) + template_values, extra_parts_text = self._cmd_text_extra(size) def get_final_cmd(c): return "{} {}".format(c, extra_parts_text) if extra_parts_text else c @@ -430,11 +435,7 @@ def get_base_cmd(argstr): def write_skipped_sample_scripts(self): """ For any sample skipped during initial processing, write submission script. """ - scripts = [] - for pool, size in self._skipped_sample_pools: - settings, looptext, prjtext = self._get_settings_looptext_prjtext(size) - scripts.append(self.write_script(pool, settings, prjtext, looptext)) - return scripts + return [self.write_script(pool, size) for pool, size in self._skipped_sample_pools] def _reset_pool(self): """ Reset the state of the pool of samples """ diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 2faa2d0b4..ed522d19b 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -4,5 +4,4 @@ logmuse>=0.0.2 pandas>=0.20.2 pyyaml>=3.12 divvy>=0.3.1 -peppy>=0.20 -#peppy>=0.21dev +peppy>=0.21dev diff --git a/tests/helpers.py b/tests/helpers.py index cb42d8a65..9780c2d22 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -1,5 +1,6 @@ """ Test utilities. 
""" +from collections import Counter from functools import partial import itertools import random @@ -25,6 +26,18 @@ def assert_entirely_equal(observed, expected): assert (observed == expected).all() +def count_repeats(objs): + """ + Find (and count) repeated objects + + :param Iterable[object] objs: collection of objects in which to seek + repeated elements + :return list[(object, int)]: collection of pairs in which first component + of each is a repeated object, and the second is duplication count + """ + return [(o, n) for o, n in Counter(objs).items() if n > 1] + + def named_param(argnames, argvalues): """ Parameterize a test case and automatically name/label by value @@ -77,4 +90,14 @@ def randstr(pool, size): return "".join(random.choice(pool) for _ in range(size)) +def randconf(ext=".yaml"): + """ + Randomly generate config filename. + + :param str ext: filename extension + :return str: randomly generated string to function as filename + """ + return randstr(LETTERS_AND_DIGITS, 15) + ext + + nonempty_powerset = partial(powerset, min_items=1) diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py index 1f24bc788..0b059d1c3 100644 --- a/tests/integration/test_project_get_outputs.py +++ b/tests/integration/test_project_get_outputs.py @@ -1,6 +1,5 @@ """ Tests for interaction between Project and PipelineInterface """ -from collections import Counter, namedtuple from copy import deepcopy import itertools import os @@ -11,10 +10,11 @@ from looper import Project as LP from looper.const import * from looper.exceptions import DuplicatePipelineKeyException -from looper.pipeline_interface import PL_KEY, PROTOMAP_KEY +from looper.pipeline_interface import PL_KEY, PROTOMAP_KEY, RESOURCES_KEY from attmap import AttMap +from divvy import DEFAULT_COMPUTE_RESOURCES_NAME as DEF_RES from peppy.const import * -from tests.helpers import randstr, LETTERS_AND_DIGITS +from tests.helpers import count_repeats, LETTERS_AND_DIGITS, randstr, randconf __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" @@ -43,13 +43,13 @@ "--genome": genome "--input": data_source "--single-or-paired": read_type -resources: - default: +{r}: + {d}: file_size: "0" cores: "4" mem: "4000" time: "0-02:00:00" -""".format(n=WGBS_NAME).splitlines(True) +""".format(n=WGBS_NAME, r=RESOURCES_KEY, d=DEF_RES).splitlines(True) RRBS_IFACE_LINES = """name: {n} path: src/rrbs.py @@ -61,13 +61,13 @@ "--genome": genome "--input": data_source "--single-or-paired": read_type -resources: - default: +{r}: + {d}: file_size: "0" cores: "4" mem: "4000" time: "0-02:00:00" -""".format(n=RRBS_NAME).splitlines(True) +""".format(n=RRBS_NAME, r=RESOURCES_KEY, d=DEF_RES).splitlines(True) PROTOMAP = {RRBS_NAME: RRBS_KEY, WGBS_NAME: WGBS_KEY, "EG": WGBS_KEY} @@ -107,16 +107,6 @@ def get_conf_data(req): return m -def randconf(ext=".yaml"): - """ - Randomly generate config filename. - - :param str ext: filename extension - :return str: randomly generated string to function as filename - """ - return randstr(LETTERS_AND_DIGITS, 15) + ext - - @pytest.fixture(scope="function") def prj(request, tmpdir): """ Provide a test case with a Project instance. 
""" @@ -138,7 +128,7 @@ def test_no_outputs(tmpdir, name_cfg_file, ifaces, skip_sample_less): """ Pipeline interfaces without outputs --> no Project outputs """ cfg = tmpdir.join(name_cfg_file).strpath iface_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] - rep_paths = _find_reps(iface_paths) + rep_paths = count_repeats(iface_paths) assert [] == rep_paths, "Repeated temp filepath(s): {}".format(rep_paths) for data, path in zip(ifaces, iface_paths): with open(path, 'w') as f: @@ -170,7 +160,7 @@ def test_malformed_outputs( cfg = tmpdir.join(name_cfg_file).strpath iface_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] - rep_paths = _find_reps(iface_paths) + rep_paths = count_repeats(iface_paths) assert [] == rep_paths, "Repeated temp filepath(s): {}".format(rep_paths) for data, path in zip(ifaces, iface_paths): @@ -183,7 +173,7 @@ def test_malformed_outputs( assert not os.path.exists(anns_file) sample_protos = [random.choice(prot_pool) for _ in range(10)] sample_names = [randstr(string.ascii_letters, 20) for _ in sample_protos] - repeated_sample_names = _find_reps(sample_names) + repeated_sample_names = count_repeats(sample_names) assert [] == repeated_sample_names, \ "Repeated sample names: {}".format(repeated_sample_names) anns_data = [(SAMPLE_NAME_COLNAME, ASSAY_KEY)] + \ @@ -217,8 +207,8 @@ def test_only_subproject_has_outputs(tmpdir, ifaces, declared_outputs): cfg = tmpdir.join(randconf()).strpath iface_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] - assert [] == _find_reps(iface_paths), \ - "Repeated temp filepath(s): {}".format(_find_reps(iface_paths)) + assert [] == count_repeats(iface_paths), \ + "Repeated temp filepath(s): {}".format(count_repeats(iface_paths)) for data, path in zip(ifaces, iface_paths): with open(path, 'w') as f: @@ -227,8 +217,8 @@ def test_only_subproject_has_outputs(tmpdir, ifaces, declared_outputs): md[PIPELINE_INTERFACES_KEY] = iface_paths sp_ifaces_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] - assert [] == _find_reps(sp_ifaces_paths), \ - "Repeated temp filepath(s): {}".format(_find_reps(sp_ifaces_paths)) + assert [] == count_repeats(sp_ifaces_paths), \ + "Repeated temp filepath(s): {}".format(count_repeats(sp_ifaces_paths)) iface_path_intersect = set(sp_ifaces_paths) & set(iface_paths) assert set() == iface_path_intersect, \ "Nonempty main/subs iface path intersection: {}".\ @@ -281,8 +271,8 @@ def test_only_main_project_has_outputs(tmpdir, ifaces, declared_outputs): cfg = tmpdir.join(randconf()).strpath iface_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] - assert [] == _find_reps(iface_paths), \ - "Repeated temp filepath(s): {}".format(_find_reps(iface_paths)) + assert [] == count_repeats(iface_paths), \ + "Repeated temp filepath(s): {}".format(count_repeats(iface_paths)) for data, path in zip(ifaces, iface_paths): with open(path, 'w') as f: @@ -291,8 +281,8 @@ def test_only_main_project_has_outputs(tmpdir, ifaces, declared_outputs): md[PIPELINE_INTERFACES_KEY] = iface_paths sp_ifaces_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] - assert [] == _find_reps(sp_ifaces_paths), \ - "Repeated temp filepath(s): {}".format(_find_reps(sp_ifaces_paths)) + assert [] == count_repeats(sp_ifaces_paths), \ + "Repeated temp filepath(s): {}".format(count_repeats(sp_ifaces_paths)) iface_path_intersect = set(sp_ifaces_paths) & set(iface_paths) assert set() == iface_path_intersect, \ "Nonempty main/subs iface path intersection: {}". 
\ @@ -589,18 +579,6 @@ def make_temp_file_path(folder, known, generate=randconf): return fp -def _find_reps(objs): - """ - Find (and count) repeated objects - - :param Iterable[object] objs: collection of objects in which to seek - repeated elements - :return list[(object, int)]: collection of pairs in which first component - of each is a repeated object, and the second is duplication count - """ - return [(o, n) for o, n in Counter(objs).items() if n > 1] - - def _write_and_build_prj(conf_file, conf_data): """ Write Project config data and create the instance. @@ -693,8 +671,8 @@ def rna_pi_lines(): "--single-or-paired": read_type required_input_files: [data_source] ngs_input_files: [data_source] - resources: - default: + {res}: + {dr}: file_size: "0" cores: "6" mem: "36000" @@ -715,8 +693,8 @@ def rna_pi_lines(): "--genome": genome "--input": data_source "--single-or-paired": read_type - resources: - default: + {res}: + {dr}: file_size: "0" cores: "2" mem: "60000" @@ -738,8 +716,8 @@ def rna_pi_lines(): "--fragment-length-sdev": fragment_length_sdev outputs: {abundances_key}: \"{abundances_val}\" - resources: - default: + {res}: + {dr}: cores: "2" mem: "4000" time: "0-6:00:00" @@ -749,7 +727,7 @@ def rna_pi_lines(): mem: "8000" time: "0-12:00:00" """.format( - rnaseq_proto_name=RNASEQ, + res=RESOURCES_KEY, dr=DEF_RES, rnaseq_proto_name=RNASEQ, bs_key=RNA_PIPES["bitseq"].key, bs_name=RNA_PIPES["bitseq"].name, th_key=RNA_PIPES["tophat"].key, th_name=RNA_PIPES["tophat"].name, kall_key=RNA_PIPES["kallisto"].key, kall_name=RNA_PIPES["kallisto"].name, diff --git a/tests/specific_use_cases/test_cli_prj_pipe_args_collision.py b/tests/specific_use_cases/test_cli_prj_pipe_args_collision.py new file mode 100644 index 000000000..8276371a5 --- /dev/null +++ b/tests/specific_use_cases/test_cli_prj_pipe_args_collision.py @@ -0,0 +1,140 @@ +""" Tests for collision between CLI- and Project-specified pipe args """ + +import copy +import itertools +import os +import pytest +import yaml +from divvy import DEFAULT_COMPUTE_RESOURCES_NAME as DEF_RES +from looper import PipelineInterface, Project, SubmissionConductor +from looper.pipeline_interface import PL_KEY, PROTOMAP_KEY +from peppy.const import * +from tests.helpers import count_repeats, powerset, randconf + +__author__ = "Vince Reuter" +__email__ = "vreuter@virginia.edu" + + +ALL_PIPE_FLAGS = {"--random", "--arbitrary", "--does-not-matter"} + + +def generate_flags_partitions(flags): + """ + Generate all partitions of a CLI flag options. + + Each partition will be such that each flag is either designated for CLI + specification or for project config specification, but not both. + + :param Iterable[str] flags: collection of flag-like options to partition + :return Iterable[(str, dict[str, NoneType])]: collection of pairs in which + first component of each pair is collection of flags for CLI-like + specification simulation, and second component is specification of + remaining flags as pipeline args for project config + """ + return [(ps, {f: None for f in flags if f not in ps}) for ps in powerset(flags)] + + +def generate_overlaps(singles, mapped): + """ + Generate improper partitions, i.e. those with some overlap between subsets. 
+ + :param Iterable[str] singles: collection of flag-like option names + :param dict[str, NoneType] mapped: flag-like option name mapped to null + :return Iterable[(str, dict[str, NoneType])]: collection of pairs in which + first component of each pair is collection of flags for CLI-like + specification simulation, and second component is specification of + remaining flags as pipeline args for project config + """ + common = set(singles) & set(mapped.keys()) + assert set() == common, "Nonempty intersection: {}".format(", ".join(common)) + singles_update = [list(singles) + list(m) for m in + powerset(mapped.keys(), min_items=1)] + mapped_update = [{f: None for f in fs} for fs in powerset(singles, min_items=1)] + aug_maps = [] + for mx in mapped_update: + m = copy.copy(mapped) + m.update(mx) + aug_maps.append(m) + return [(s, mapped) for s in singles_update] + [(singles, m) for m in aug_maps] + + +def generate_full_flags_cover(flags): + """ + Generate all paritions of flags, both with and without overlaps. + + Each partition is binary, designating each flag-like option for either + CLI-like specification simulation or for pipeline args project config + specification (or both in the case of a partition with a nonempty + intersection of the parts). + + :param Iterable[str] flags: collection of flag-like options to partition + :return Iterable[(str, dict[str, NoneType])]: collection of pairs in which + first component of each pair is collection of flags for CLI-like + specification simulation, and second component is specification of + remaining flags as pipeline args for project config + """ + partition = generate_flags_partitions(flags) + overlappings = [generate_overlaps(s, m) for s, m in partition] + return partition + list(itertools.chain(*overlappings)) + + +@pytest.fixture +def prj_dat(request, tmpdir): + """ Project config data for a test case """ + prj_dat = {METADATA_KEY: {OUTDIR_KEY: tmpdir.strpath}} + if PIPE_ARGS_SECTION in request.fixturenames: + pipe_args = request.getfixturevalue(PIPE_ARGS_SECTION) + if type(pipe_args) is not dict: + raise TypeError("Pipeline arguments must be a dictionary; got {}". + format(type(pipe_args))) + prj_dat[PIPE_ARGS_SECTION] = pipe_args + return prj_dat + + +@pytest.mark.parametrize( + ["cli_flags", "pipe_args_data"], generate_full_flags_cover(ALL_PIPE_FLAGS)) +def test_flag_like_option(tmpdir, cli_flags, pipe_args_data, prj_dat): + """ Collision of flag-like options adds each only once. """ + + # Pretest + assert len(cli_flags) > 0 or len(pipe_args_data) > 0, \ + "Invalid test case parameterization -- empty flags and pipeline args" + reps = count_repeats(cli_flags) + assert [] == reps, "Unexpected duplicate flags: {}".format(reps) + + # Build and validate Project. + pipe_key = "arbitrary-testpipe" + prj_dat[PIPE_ARGS_SECTION] = {pipe_key: pipe_args_data} + temproot = tmpdir.strpath + prj_cfg = os.path.join(temproot, randconf()) + prj = _write_and_build_prj(prj_cfg, prj_dat) + assert prj_dat[PIPE_ARGS_SECTION] == prj[PIPE_ARGS_SECTION].to_map() + + # Build the submission conductor. 
+ pi_data = { + PROTOMAP_KEY: {GENERIC_PROTOCOL_KEY: pipe_key}, + PL_KEY: {pipe_key: {DEF_RES: { + "file_size": "0", + "cores": "1", + "mem": "1000", + "time": "0-01:00:00" + }}} + } + pi = PipelineInterface(pi_data) + conductor = SubmissionConductor( + pipe_key, pi, cmd_base="", prj=prj, extra_args=cli_flags) + _, addl_args_text = conductor._cmd_text_extra(0) + assert set(addl_args_text.split(" ")) == ALL_PIPE_FLAGS + + +def _write_and_build_prj(fp, d): + """ + Write project config file and build Project. + + :param str fp: path to config file + :param dict d: project config data + :return looper.Project: newly built Project instance + """ + with open(fp, 'w') as f: + yaml.dump(d, f) + return Project(fp) From 47df30a154d061692183e37951639e934361e0ea Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 26 Apr 2019 10:55:39 -0400 Subject: [PATCH 35/61] add note about pipeline outputs declaration; #32 --- docs/pipeline-interface.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/pipeline-interface.md b/docs/pipeline-interface.md index 0457b3cfd..d445fc150 100644 --- a/docs/pipeline-interface.md +++ b/docs/pipeline-interface.md @@ -144,10 +144,11 @@ These are considered optional, and so the pipeline will still be submitted if th - `-C`: config_file (the pipeline config file specified in the project config file; or the default config file, if it exists) - `-P`: cores (the number of processing cores specified by the chosen resource package) - `-M`: mem (memory limit) -- `resources` (recommended) A section outlining how much memory, CPU, and clock time to request, modulated by input file size +- `resources` (recommended): A section outlining how much memory, CPU, and clock time to request, modulated by input file size If the `resources` section is missing, looper will only be able to run the pipeline locally (not submit it to a cluster resource manager). If you provide a `resources` section, you must define at least 1 option named 'default' with `file_size: "0"`. Then, you define as many more resource "packages" or "bundles" as you want. +- `outputs`: key-value pairs in which each key is a name for a kind of output file (or group of them) that a pipeline may produce, and the value is a template template for a path that will be populated by sample variables **More on `resources`** @@ -214,4 +215,7 @@ pipelines: cores: "4" mem: "8000" time: "08:00:00" + outputs: + smoothed_bw: "aligned_{sample.genome}/{sample.name}_smoothed.bw" + pre_smoothed_bw: "aligned_{project.prealignments}/{sample.name}_smoothed.bw" ``` From ccc56b958fe332175b0168b17930695afa9fd8c2 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 26 Apr 2019 10:56:24 -0400 Subject: [PATCH 36/61] update docs --- docs/autodoc_build/looper.md | 236 +++++++++++++++++------------------ 1 file changed, 118 insertions(+), 118 deletions(-) diff --git a/docs/autodoc_build/looper.md b/docs/autodoc_build/looper.md index 68aae851d..8818df11d 100644 --- a/docs/autodoc_build/looper.md +++ b/docs/autodoc_build/looper.md @@ -332,124 +332,6 @@ Project needs certain metadata. Represent case in which sample sheet is specified but nonexistent. -## Class SubmissionConductor -Collects and then submits pipeline jobs. - -This class holds a 'pool' of commands to submit as a single cluster job. -Eager to submit a job, each instance's collection of commands expands until -it reaches the 'pool' has been filled, and it's therefore time to submit the -job. 
The pool fills as soon as a fill criteria has been reached, which can -be either total input file size or the number of individual commands. - - -### add\_sample -Add a sample for submission to this conductor. -```python -def add_sample(self, sample, sample_subtype=, rerun=False) -``` - -**Parameters:** - -- `sample` -- `Sample`: sample to be included with this conductor'scurrently growing collection of command submissions -- `sample_subtype` -- `type`: specific subtype associatedwith this new sample; this is used to tailor-make the sample instance as required by its protocol/pipeline and supported by the pipeline interface. -- `rerun` -- `bool`: whether the given sample is being rerun rather thanrun for the first time - - -**Returns:** - -`bool`: Indication of whether the given sample was added tothe current 'pool.' - - -**Raises:** - -- `TypeError`: If sample subtype is provided but does not extendthe base Sample class, raise a TypeError. - - - - -### failed\_samples -```python -def failed_samples(self) -``` - - - -### num\_cmd\_submissions -Return the number of commands that this conductor has submitted. -```python -def num_cmd_submissions(self) -``` - -**Returns:** - -`int`: Number of commands submitted so far. - - - - -### num\_job\_submissions -Return the number of jobs that this conductor has submitted. -```python -def num_job_submissions(self) -``` - -**Returns:** - -`int`: Number of jobs submitted so far. - - - - -### submit -Submit one or more commands as a job. - -This call will submit the commands corresponding to the current pool -of samples if and only if the argument to 'force' evaluates to a -true value, or the pool of samples is full. -```python -def submit(self, force=False) -``` - -**Parameters:** - -- `force` -- `bool`: Whether submission should be done/simulated evenif this conductor's pool isn't full. - - -**Returns:** - -`bool`: Whether a job was submitted (or would've been ifnot for dry run) - - - - -### write\_script -Create the script for job submission. -```python -def write_script(self, pool, size) -``` - -**Parameters:** - -- `pool` -- ``: -- `size` -- `int | float`: - - -**Returns:** - -`str`: Path to the job submission script created. - - - - -### write\_skipped\_sample\_scripts -For any sample skipped during initial processing, write submission script. -```python -def write_skipped_sample_scripts(self) -``` - - - - ## Class PipelineInterface This class parses, holds, and returns information for a yaml file that specifies how to interact with each individual pipeline. This includes both resources to request for cluster job submission, as well as arguments to be passed from the sample annotation metadata to the pipeline @@ -719,5 +601,123 @@ def uses_looper_args(self, pipeline_name) +## Class SubmissionConductor +Collects and then submits pipeline jobs. + +This class holds a 'pool' of commands to submit as a single cluster job. +Eager to submit a job, each instance's collection of commands expands until +it reaches the 'pool' has been filled, and it's therefore time to submit the +job. The pool fills as soon as a fill criteria has been reached, which can +be either total input file size or the number of individual commands. + + +### add\_sample +Add a sample for submission to this conductor. 
+```python +def add_sample(self, sample, sample_subtype=, rerun=False) +``` + +**Parameters:** + +- `sample` -- `Sample`: sample to be included with this conductor'scurrently growing collection of command submissions +- `sample_subtype` -- `type`: specific subtype associatedwith this new sample; this is used to tailor-make the sample instance as required by its protocol/pipeline and supported by the pipeline interface. +- `rerun` -- `bool`: whether the given sample is being rerun rather thanrun for the first time + + +**Returns:** + +`bool`: Indication of whether the given sample was added tothe current 'pool.' + + +**Raises:** + +- `TypeError`: If sample subtype is provided but does not extendthe base Sample class, raise a TypeError. + + + + +### failed\_samples +```python +def failed_samples(self) +``` + + + +### num\_cmd\_submissions +Return the number of commands that this conductor has submitted. +```python +def num_cmd_submissions(self) +``` + +**Returns:** + +`int`: Number of commands submitted so far. + + + + +### num\_job\_submissions +Return the number of jobs that this conductor has submitted. +```python +def num_job_submissions(self) +``` + +**Returns:** + +`int`: Number of jobs submitted so far. + + + + +### submit +Submit one or more commands as a job. + +This call will submit the commands corresponding to the current pool +of samples if and only if the argument to 'force' evaluates to a +true value, or the pool of samples is full. +```python +def submit(self, force=False) +``` + +**Parameters:** + +- `force` -- `bool`: Whether submission should be done/simulated evenif this conductor's pool isn't full. + + +**Returns:** + +`bool`: Whether a job was submitted (or would've been ifnot for dry run) + + + + +### write\_script +Create the script for job submission. +```python +def write_script(self, pool, size) +``` + +**Parameters:** + +- `pool` -- `Iterable[(peppy.Sample, str)]`: collection of pairs in whichfirst component is a sample instance and second is command/argstring +- `size` -- `float`: cumulative size of the given pool + + +**Returns:** + +`str`: Path to the job submission script created. + + + + +### write\_skipped\_sample\_scripts +For any sample skipped during initial processing, write submission script. +```python +def write_skipped_sample_scripts(self) +``` + + + + **Version Information**: `looper` v0.12dev, generated by `lucidoc` v0.3.1 \ No newline at end of file From 481d4833a6665788850cf087a40c1464212578c7 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 26 Apr 2019 13:07:12 -0400 Subject: [PATCH 37/61] use repeat counting from peppy --- tests/helpers.py | 13 ------------- tests/integration/test_project_get_outputs.py | 3 ++- .../test_cli_prj_pipe_args_collision.py | 3 ++- 3 files changed, 4 insertions(+), 15 deletions(-) diff --git a/tests/helpers.py b/tests/helpers.py index 9780c2d22..16f32bf2d 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -1,6 +1,5 @@ """ Test utilities. 
""" -from collections import Counter from functools import partial import itertools import random @@ -26,18 +25,6 @@ def assert_entirely_equal(observed, expected): assert (observed == expected).all() -def count_repeats(objs): - """ - Find (and count) repeated objects - - :param Iterable[object] objs: collection of objects in which to seek - repeated elements - :return list[(object, int)]: collection of pairs in which first component - of each is a repeated object, and the second is duplication count - """ - return [(o, n) for o, n in Counter(objs).items() if n > 1] - - def named_param(argnames, argvalues): """ Parameterize a test case and automatically name/label by value diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py index 0b059d1c3..439c70c00 100644 --- a/tests/integration/test_project_get_outputs.py +++ b/tests/integration/test_project_get_outputs.py @@ -14,7 +14,8 @@ from attmap import AttMap from divvy import DEFAULT_COMPUTE_RESOURCES_NAME as DEF_RES from peppy.const import * -from tests.helpers import count_repeats, LETTERS_AND_DIGITS, randstr, randconf +from peppy.utils import count_repeats +from tests.helpers import LETTERS_AND_DIGITS, randstr, randconf __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" diff --git a/tests/specific_use_cases/test_cli_prj_pipe_args_collision.py b/tests/specific_use_cases/test_cli_prj_pipe_args_collision.py index 8276371a5..d492cee54 100644 --- a/tests/specific_use_cases/test_cli_prj_pipe_args_collision.py +++ b/tests/specific_use_cases/test_cli_prj_pipe_args_collision.py @@ -9,7 +9,8 @@ from looper import PipelineInterface, Project, SubmissionConductor from looper.pipeline_interface import PL_KEY, PROTOMAP_KEY from peppy.const import * -from tests.helpers import count_repeats, powerset, randconf +from peppy.utils import count_repeats +from tests.helpers import powerset, randconf __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" From 0cc0fb51ec6367816047d370ef3db537965d77cf Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 29 Apr 2019 12:09:23 -0400 Subject: [PATCH 38/61] move utility to peppy; ef1367dad84eca29fe0a0df8924099ef22f9fb6a --- looper/utils.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/looper/utils.py b/looper/utils.py index cfaf8501e..437f85e39 100644 --- a/looper/utils.py +++ b/looper/utils.py @@ -3,11 +3,11 @@ from collections import defaultdict, Iterable import copy import glob -import logging import os from peppy import \ FLAGS, SAMPLE_INDEPENDENT_PROJECT_SECTIONS, SAMPLE_NAME_COLNAME +from peppy.utils import get_logger from .const import * @@ -15,18 +15,6 @@ DEFAULT_CONFIG_SUFFIX = "_config.yaml" -def get_logger(name): - """ - Returm a logger with given name, equipped with custom method. - - :param str name: name for the logger to get/create. - :return logging.Logger: named, custom logger instance. 
- """ - l = logging.getLogger(name) - l.whisper = lambda msg, *args, **kwargs: l.log(5, msg, *args, **kwargs) - return l - - _LOGGER = get_logger(__name__) From e6dbe0a2f939d3d0ceb954d102033437a6ab799d Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 29 Apr 2019 13:32:18 -0400 Subject: [PATCH 39/61] change docs to pypi: --- docs/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/README.md b/docs/README.md index 60678ad7c..cb2d868d5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -24,13 +24,13 @@ Releases are posted as [GitHub releases](https://github.com/pepkit/looper/releas ``` -pip install --user https://github.com/pepkit/looper/zipball/master +pip install --user loopercli ``` Update with: ``` -pip install --user --upgrade https://github.com/pepkit/looper/zipball/master +pip install --user --upgrade loopercli ``` If the `looper` executable in not automatically in your `$PATH`, add the following line to your `.bashrc` or `.profile`: From 0824d70668d0fd3154ce6e6b88257b2c5616f0b2 Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 29 Apr 2019 13:38:09 -0400 Subject: [PATCH 40/61] add pypi link --- docs/autodoc_build/.gitignore | 2 + docs/autodoc_build/looper.md | 309 ++++++++++++------------------ docs_jupyter/build/hello-world.md | 1 - mkdocs.yml | 1 + 4 files changed, 123 insertions(+), 190 deletions(-) create mode 100644 docs/autodoc_build/.gitignore diff --git a/docs/autodoc_build/.gitignore b/docs/autodoc_build/.gitignore new file mode 100644 index 000000000..d6b7ef32c --- /dev/null +++ b/docs/autodoc_build/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/docs/autodoc_build/looper.md b/docs/autodoc_build/looper.md index 8818df11d..f7fa62d28 100644 --- a/docs/autodoc_build/looper.md +++ b/docs/autodoc_build/looper.md @@ -65,63 +65,6 @@ def derived_columns(self) -### get\_interfaces -Get the pipeline interfaces associated with the given protocol. -```python -def get_interfaces(self, protocol) -``` - -**Parameters:** - -- `protocol` -- `str`: name of the protocol for which to get interfaces - - -**Returns:** - -`Iterable[looper.PipelineInterface]`: collection of pipelineinterfaces associated with the given protocol - - -**Raises:** - -- `KeyError`: if the given protocol is not (perhaps yet) mappedto any pipeline interface - - - - -### get\_outputs -Map pipeline identifier to collection of output specifications. - -This method leverages knowledge of two collections of different kinds -of entities that meet in the manifestation of a Project. The first -is a collection of samples, which is known even in peppy.Project. The -second is a mapping from protocol/assay/library strategy to a collection -of pipeline interfaces, in which kinds of output may be declared. -Knowledge of these two items is here harnessed to map the identifier -for each pipeline about which this Project is aware to a collection of -pairs of identifier for a kind of output and the collection of -this Project's samples for which it's applicable (i.e., those samples -with protocol that maps to the corresponding pipeline). 
-```python -def get_outputs(self, skip_sample_less=True) -``` - -**Parameters:** - -- `skip_sample_less` -- `bool`: whether to omit pipelines that are forprotocols of which the Project has no Sample instances - - -**Returns:** - -`Mapping[str, Mapping[str, namedtuple]]`: collection of bindingsbetween identifier for pipeline and collection of bindings between name for a kind of output and pair in which first component is a path template and the second component is a collection of sample names - - -**Raises:** - -- `TypeError`: if argument to sample-less pipeline skipping parameteris not a Boolean - - - - ### implied\_columns Collection of sample attributes for which value of each is implied by other(s) ```python @@ -135,19 +78,6 @@ def implied_columns(self) -### interfaces -Get this Project's collection of pipeline interfaces -```python -def interfaces(self) -``` - -**Returns:** - -`Iterable[looper.PipelineInterface]`: collection of pipelineinterfaces known by this Project - - - - ### num\_samples Count the number of samples available in this Project. ```python @@ -332,6 +262,125 @@ Project needs certain metadata. Represent case in which sample sheet is specified but nonexistent. +## Class SubmissionConductor +Collects and then submits pipeline jobs. + +This class holds a 'pool' of commands to submit as a single cluster job. +Eager to submit a job, each instance's collection of commands expands until +it reaches the 'pool' has been filled, and it's therefore time to submit the +job. The pool fills as soon as a fill criteria has been reached, which can +be either total input file size or the number of individual commands. + + +### add\_sample +Add a sample for submission to this conductor. +```python +def add_sample(self, sample, sample_subtype=, rerun=False) +``` + +**Parameters:** + +- `sample` -- `Sample`: sample to be included with this conductor'scurrently growing collection of command submissions +- `sample_subtype` -- `type`: specific subtype associatedwith this new sample; this is used to tailor-make the sample instance as required by its protocol/pipeline and supported by the pipeline interface. +- `rerun` -- `bool`: whether the given sample is being rerun rather thanrun for the first time + + +**Returns:** + +`bool`: Indication of whether the given sample was added tothe current 'pool.' + + +**Raises:** + +- `TypeError`: If sample subtype is provided but does not extendthe base Sample class, raise a TypeError. + + + + +### failed\_samples +```python +def failed_samples(self) +``` + + + +### num\_cmd\_submissions +Return the number of commands that this conductor has submitted. +```python +def num_cmd_submissions(self) +``` + +**Returns:** + +`int`: Number of commands submitted so far. + + + + +### num\_job\_submissions +Return the number of jobs that this conductor has submitted. +```python +def num_job_submissions(self) +``` + +**Returns:** + +`int`: Number of jobs submitted so far. + + + + +### submit +Submit command(s) as a job. + +This call will submit the commands corresponding to the current pool +of samples if and only if the argument to 'force' evaluates to a +true value, or the pool of samples is full. +```python +def submit(self, force=False) +``` + +**Parameters:** + +- `force` -- `bool`: Whether submission should be done/simulated evenif this conductor's pool isn't full. + + +**Returns:** + +`bool`: Whether a job was submitted (or would've been ifnot for dry run) + + + + +### write\_script +Create the script for job submission. 
+```python +def write_script(self, pool, template_values, prj_argtext, looper_argtext) +``` + +**Parameters:** + +- `template_values` -- `Mapping`: Collection of template placeholderkeys and the values with which to replace them. +- `prj_argtext` -- `str`: Command text related to Project data. +- `looper_argtext` -- `str`: Command text related to looper arguments. + + +**Returns:** + +`str`: Path to the job submission script created. + + + + +### write\_skipped\_sample\_scripts +For any sample skipped during initial processing, write submission script. +```python +def write_skipped_sample_scripts(self) +``` + + + + ## Class PipelineInterface This class parses, holds, and returns information for a yaml file that specifies how to interact with each individual pipeline. This includes both resources to request for cluster job submission, as well as arguments to be passed from the sample annotation metadata to the pipeline @@ -601,123 +650,5 @@ def uses_looper_args(self, pipeline_name) -## Class SubmissionConductor -Collects and then submits pipeline jobs. - -This class holds a 'pool' of commands to submit as a single cluster job. -Eager to submit a job, each instance's collection of commands expands until -it reaches the 'pool' has been filled, and it's therefore time to submit the -job. The pool fills as soon as a fill criteria has been reached, which can -be either total input file size or the number of individual commands. - - -### add\_sample -Add a sample for submission to this conductor. -```python -def add_sample(self, sample, sample_subtype=, rerun=False) -``` - -**Parameters:** - -- `sample` -- `Sample`: sample to be included with this conductor'scurrently growing collection of command submissions -- `sample_subtype` -- `type`: specific subtype associatedwith this new sample; this is used to tailor-make the sample instance as required by its protocol/pipeline and supported by the pipeline interface. -- `rerun` -- `bool`: whether the given sample is being rerun rather thanrun for the first time - - -**Returns:** - -`bool`: Indication of whether the given sample was added tothe current 'pool.' - - -**Raises:** - -- `TypeError`: If sample subtype is provided but does not extendthe base Sample class, raise a TypeError. - - - - -### failed\_samples -```python -def failed_samples(self) -``` - - - -### num\_cmd\_submissions -Return the number of commands that this conductor has submitted. -```python -def num_cmd_submissions(self) -``` - -**Returns:** - -`int`: Number of commands submitted so far. - - - - -### num\_job\_submissions -Return the number of jobs that this conductor has submitted. -```python -def num_job_submissions(self) -``` - -**Returns:** - -`int`: Number of jobs submitted so far. - - - - -### submit -Submit one or more commands as a job. - -This call will submit the commands corresponding to the current pool -of samples if and only if the argument to 'force' evaluates to a -true value, or the pool of samples is full. -```python -def submit(self, force=False) -``` - -**Parameters:** - -- `force` -- `bool`: Whether submission should be done/simulated evenif this conductor's pool isn't full. - - -**Returns:** - -`bool`: Whether a job was submitted (or would've been ifnot for dry run) - - - - -### write\_script -Create the script for job submission. 
-```python -def write_script(self, pool, size) -``` - -**Parameters:** - -- `pool` -- `Iterable[(peppy.Sample, str)]`: collection of pairs in whichfirst component is a sample instance and second is command/argstring -- `size` -- `float`: cumulative size of the given pool - - -**Returns:** - -`str`: Path to the job submission script created. - - - - -### write\_skipped\_sample\_scripts -For any sample skipped during initial processing, write submission script. -```python -def write_skipped_sample_scripts(self) -``` - - - - -**Version Information**: `looper` v0.12dev, generated by `lucidoc` v0.3.1 \ No newline at end of file +**Version Information**: `looper` v0.11.0, generated by `lucidoc` v0.3 \ No newline at end of file diff --git a/docs_jupyter/build/hello-world.md b/docs_jupyter/build/hello-world.md index 252af4e9e..078ac64fe 100644 --- a/docs_jupyter/build/hello-world.md +++ b/docs_jupyter/build/hello-world.md @@ -1,4 +1,3 @@ -jupyter:True # Hello World! example for looper diff --git a/mkdocs.yml b/mkdocs.yml index 6d3048177..314f760d6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -2,6 +2,7 @@ site_name: Looper site_logo: img/looper_logo_dark.svg site_url: http://code.databio.org/looper/ repo_url: http://github.com/pepkit/looper +pypi_name: loopercli nav: - Getting Started: From ea083e960523149b47bca9a170b5331e36e9efd5 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 1 May 2019 10:12:18 -0400 Subject: [PATCH 41/61] make func public, visual changes in HTML --- looper/jinja_templates/project_object.html | 8 ++++---- looper/looper.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/looper/jinja_templates/project_object.html b/looper/jinja_templates/project_object.html index 793641c1c..c618d29f7 100644 --- a/looper/jinja_templates/project_object.html +++ b/looper/jinja_templates/project_object.html @@ -1,7 +1,7 @@
 {% if links[0] is defined or figures[0] is defined %}
 	[heading markup lost in extraction]
-	Looper project objects
+	Looper project objects
 {% endif %}
 {% if figures[0] is defined %}
 	Figures
@@ -9,11 +9,11 @@
 	Figures
 	{% for figure in figures %}
 	[table-row markup lost in extraction; the caption source switches from figure[0] to figure[1]]
-	'{{ figure[0] }}'
+	
{% endfor %} diff --git a/looper/looper.py b/looper/looper.py index 40198347b..b69e62fd3 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -477,7 +477,7 @@ def __init__(self, prj): def __call__(self): """ Do the summarization. """ - _run_custom_summarizers(self.prj) + run_custom_summarizers(self.prj) # initialize the report builder report_builder = HTMLReportBuilder(self.prj) # run the report builder. a set of HTML pages is produced @@ -485,7 +485,7 @@ def __call__(self): _LOGGER.info("HTML Report (n=" + str(len(self.stats)) + "): " + report_path) -def _run_custom_summarizers(project): +def run_custom_summarizers(project): """ Run custom summarizers if any are defined From 82ccef5f4456fafefefdcc599d4e48b8e5c31bbf Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 1 May 2019 10:15:09 -0400 Subject: [PATCH 42/61] update defaults for HMTL CLI parsers, addresses https://github.com/pepkit/caravel/issues/108 --- looper/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/looper/__init__.py b/looper/__init__.py index b6b2f70ac..35ff939be 100644 --- a/looper/__init__.py +++ b/looper/__init__.py @@ -171,12 +171,12 @@ def add_subparser(cmd): # distinguish between explicit 0 and lack of specification. subparser.add_argument( "--lump", default=None, - type=html_range(min_val=0, max_val=100, step=0.1, value=100), + type=html_range(min_val=0, max_val=100, step=0.1, value=0), help="Maximum total input file size for a lump/batch of commands " "in a single job (in GB)") subparser.add_argument( "--lumpn", default=None, - type=html_range(min_val=1, max_val="num_samples", value="num_samples"), + type=html_range(min_val=1, max_val="num_samples", value=1), help="Number of individual scripts grouped into single submission") # Other commands From fb63602168b0c427e7dd7767a353c53b99013446 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 2 May 2019 09:58:38 -0400 Subject: [PATCH 43/61] parse and pass sample selection attr as str; close https://github.com/pepkit/peppy/issues/298 --- looper/__init__.py | 2 +- looper/looper.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/looper/__init__.py b/looper/__init__.py index 35ff939be..91046907b 100644 --- a/looper/__init__.py +++ b/looper/__init__.py @@ -222,7 +222,7 @@ def add_subparser(cmd): subparser.add_argument_group("select samples", "This group of arguments lets you specify samples to use by " "exclusion OR inclusion of the samples attribute values.") - fetch_samples_group.add_argument("--selector-attribute", nargs=1, dest="selector_attribute", + fetch_samples_group.add_argument("--selector-attribute", dest="selector_attribute", help="Specify the attribute for samples exclusion OR inclusion", default="protocol") protocols = fetch_samples_group.add_mutually_exclusive_group() diff --git a/looper/looper.py b/looper/looper.py index b69e62fd3..a72925e13 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -815,7 +815,9 @@ def main(): _LOGGER.debug("Results subdir: " + prj.metadata[RESULTS_SUBDIR_KEY]) - with ProjectContext(prj, selector_attribute=args.selector_attribute, selector_include=args.selector_include, + with ProjectContext(prj, + selector_attribute=args.selector_attribute, + selector_include=args.selector_include, selector_exclude=args.selector_exclude) as prj: if args.command in ["run", "rerun"]: From 486b717dd45f4ce167ca41844a5f04de5eacbe8e Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 2 May 2019 10:02:36 -0400 Subject: [PATCH 44/61] avoid lib/proto-based 
deprecations in tests --- tests/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 07c4caf9a..a8f8863f3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -136,7 +136,7 @@ "testngs.sh": FILE_BY_SAMPLE } -SAMPLE_ANNOTATION_LINES = """sample_name,library,file,file2,organism,nonmerged_col,data_source,dcol2 +SAMPLE_ANNOTATION_LINES = """sample_name,protocol,file,file2,organism,nonmerged_col,data_source,dcol2 a,testlib,src3,src3,,src3,src3, b,testlib,,,,src3,src3,src1 c,testlib,src3,src3,,src3,src3, @@ -182,7 +182,7 @@ } COMPARISON_FUNCTIONS = ["__eq__", "__ne__", "__len__", "keys", "values", "items"] -COLUMNS = [SAMPLE_NAME_COLNAME, "val1", "val2", "library"] +COLUMNS = [SAMPLE_NAME_COLNAME, "val1", "val2", "protocol"] PROJECT_CONFIG_DATA = {"metadata": {SAMPLE_ANNOTATIONS_KEY: "annotations.csv"}} From f9b6e9bccdeedb005629849fc55187384fb176e0 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 2 May 2019 10:02:44 -0400 Subject: [PATCH 45/61] update changelog --- docs/changelog.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/changelog.md b/docs/changelog.md index eee012111..7e5c74f36 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -5,6 +5,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Unreleased] ### Fixed - Prevent duplication of CLI flags: [Issue 168](https://github.com/pepkit/looper/issues/168) +- Safer usage of CLI specification of sample subset selection: [`peppy` issue 298](https://github.com/pepkit/peppy/issues/298) ## [0.11.1] - 2019-04-17 From 4054fc2fc39cc75f314aeef9cc964b09c389aa06 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 2 May 2019 10:04:50 -0400 Subject: [PATCH 46/61] spaces --- tests/conftest.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index a8f8863f3..b61aeb8b3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -186,7 +186,6 @@ PROJECT_CONFIG_DATA = {"metadata": {SAMPLE_ANNOTATIONS_KEY: "annotations.csv"}} - def update_project_conf_data(extension): """ Updated Project configuration data mapping based on file extension """ updated = copy.deepcopy(PROJECT_CONFIG_DATA) @@ -196,7 +195,6 @@ def update_project_conf_data(extension): return updated - def pytest_addoption(parser): """ Facilitate command-line test behavior adjustment. """ parser.addoption("--logging-level", @@ -204,7 +202,6 @@ def pytest_addoption(parser): help="Project root logger level to use for tests") - def pytest_generate_tests(metafunc): """ Centralize dynamic test case parameterization. """ if "empty_collection" in metafunc.fixturenames: From 0ead4bf470590724bb3bfe3046dd18e3e9f271e0 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 2 May 2019 11:14:30 -0400 Subject: [PATCH 47/61] coverage --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index bcdd386a0..566f8749c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,7 +10,7 @@ install: - pip install . 
 - pip install -r requirements/requirements-dev.txt
 - pip install -r requirements/requirements-test.txt
-script: pytest
+script: pytest --cov=looper
 branches:
   only:
   - dev

From 75eb623cdb925dc23262a0addaef130f177dbb41 Mon Sep 17 00:00:00 2001
From: Vince Reuter
Date: Thu, 2 May 2019 11:23:41 -0400
Subject: [PATCH 48/61] introduce microtest-based CLI smoketests

---
 requirements/requirements-test.txt        |  2 +
 tests/test_with_microtest_as_smoketest.py | 69 +++++++++++++++++++++++
 2 files changed, 71 insertions(+)
 create mode 100644 tests/test_with_microtest_as_smoketest.py

diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt
index 606b3da48..d6d1664da 100644
--- a/requirements/requirements-test.txt
+++ b/requirements/requirements-test.txt
@@ -1,2 +1,4 @@
 coveralls==1.1
 pytest-cov==2.4.0
+pytest-remotedata
+ubiquerg>=0.0.3dev
diff --git a/tests/test_with_microtest_as_smoketest.py b/tests/test_with_microtest_as_smoketest.py
new file mode 100644
index 000000000..3a55d167b
--- /dev/null
+++ b/tests/test_with_microtest_as_smoketest.py
@@ -0,0 +1,69 @@
+""" Use microtest for smoketesting the looper CLI. """
+
+import os
+import subprocess
+import pytest
+from ubiquerg import build_cli_extra
+
+__author__ = "Vince Reuter"
+__email__ = "vreuter@virginia.edu"
+
+
+REPO_NAME = "microtest"
+REPO_URL = "https://github.com/databio/{}".format(REPO_NAME)
+SAMPLE_SELECTOR_OPTION = "--selector-attribute"
+INCLUSION_OPTION = "--selector-include"
+
+
+@pytest.mark.remote_data
+@pytest.fixture
+def data_root(tmpdir):
+    """ Clone data repo and return path to it. """
+    tmp = tmpdir.strpath
+    cmd = "git clone {}".format(REPO_URL)
+    try:
+        subprocess.check_call(cmd, cwd=tmp, shell=True)
+    except subprocess.CalledProcessError:
+        # Include the failed command in the error message.
+        raise Exception("Failed to pull data ({})".format(cmd))
+    root = os.path.join(tmp, REPO_NAME)
+    assert os.path.isdir(root)
+    return root
+
+
+@pytest.fixture
+def data_conf_file(data_root):
+    """ Return path to the project config file in the cloned data repo.
""" + f = os.path.join(data_root, "config", "microtest_config.yaml") + assert os.path.isfile(f), "Contents: {}".format(os.listdir(data_root)) + return f + + +@pytest.fixture(scope="function") +def temp_chdir_home(tmpdir): + key = "HOME" + prev_home = os.environ[key] + prev_work = os.environ["PWD"] + curr_home = tmpdir.strpath + os.environ[key] = curr_home + os.chdir(curr_home) + yield + os.environ[key] = prev_home + os.chdir(prev_work) + assert os.getcwd() == prev_work + assert os.getenv(key) == prev_home + assert os.environ[key] == prev_home + + +@pytest.mark.remote_data +@pytest.mark.usefixtures("temp_chdir_home") +@pytest.mark.parametrize("cli_extra", + [build_cli_extra(**kvs) for kvs in + [{SAMPLE_SELECTOR_OPTION: "protocol", INCLUSION_OPTION: "ATAC-seq"}]]) +def test_cli_microtest_smoke(cli_extra, data_conf_file): + """ """ + cmd = "looper run -d {} {}".format(data_conf_file, cli_extra) + try: + subprocess.check_call(cmd, shell=True) + except Exception as e: + print("Exception: {}".format(e)) + pytest.fail("Failed command: {}".format(cmd)) From 53ebbc02f99e8db2a22028987d24aa798c578c06 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 2 May 2019 11:25:44 -0400 Subject: [PATCH 49/61] test comments --- tests/test_with_microtest_as_smoketest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_with_microtest_as_smoketest.py b/tests/test_with_microtest_as_smoketest.py index 3a55d167b..0bde9a189 100644 --- a/tests/test_with_microtest_as_smoketest.py +++ b/tests/test_with_microtest_as_smoketest.py @@ -40,6 +40,7 @@ def data_conf_file(data_root): @pytest.fixture(scope="function") def temp_chdir_home(tmpdir): + """ Temporarily (for a test case) change home and working directories. """ key = "HOME" prev_home = os.environ[key] prev_work = os.environ["PWD"] @@ -60,7 +61,7 @@ def temp_chdir_home(tmpdir): [build_cli_extra(**kvs) for kvs in [{SAMPLE_SELECTOR_OPTION: "protocol", INCLUSION_OPTION: "ATAC-seq"}]]) def test_cli_microtest_smoke(cli_extra, data_conf_file): - """ """ + """ Using microtest as project, test CLI for failure on specific cases. """ cmd = "looper run -d {} {}".format(data_conf_file, cli_extra) try: subprocess.check_call(cmd, shell=True) From c2727386b6a713376a23d79a1ab3c804e75d1fab Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 2 May 2019 13:01:49 -0400 Subject: [PATCH 50/61] powerset from ubiquerg --- requirements/requirements-dev.txt | 2 +- tests/helpers.py | 25 ------------------- tests/models/test_PipelineInterface.py | 2 +- .../test_cli_prj_pipe_args_collision.py | 3 ++- tests/test_with_microtest_as_smoketest.py | 2 +- 5 files changed, 5 insertions(+), 29 deletions(-) diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 767976247..963cabdb3 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,4 +1,4 @@ jinja2 mock==2.0.0 pytest==3.0.7 - +ubiquerg>=0.0.3 diff --git a/tests/helpers.py b/tests/helpers.py index 16f32bf2d..2bb49a110 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -1,7 +1,6 @@ """ Test utilities. """ from functools import partial -import itertools import random import string import numpy as np @@ -40,27 +39,6 @@ def named_param(argnames, argvalues): ids=lambda arg: "{}={}".format(argnames, arg))) -def powerset(items, min_items=0, include_full_pop=True): - """ - Build the powerset of a collection of items. - - :param Iterable[object] items: "Pool" of all items, the population for - which to build the power set. 
- :param int min_items: Minimum number of individuals from the population - to allow in any given subset. - :param bool include_full_pop: Whether to include the full population in - the powerset (default True to accord with genuine definition) - :return list[object]: Sequence of subsets of the population, in - nondecreasing size order - """ - items = list(items) # Account for iterable burn possibility. - max_items = len(items) + 1 if include_full_pop else len(items) - min_items = min_items or 0 - return list(itertools.chain.from_iterable( - itertools.combinations(items, k) - for k in range(min_items, max_items))) - - def randstr(pool, size): """ Generate random string of given size/length. @@ -85,6 +63,3 @@ def randconf(ext=".yaml"): :return str: randomly generated string to function as filename """ return randstr(LETTERS_AND_DIGITS, 15) + ext - - -nonempty_powerset = partial(powerset, min_items=1) diff --git a/tests/models/test_PipelineInterface.py b/tests/models/test_PipelineInterface.py index 38d74e63d..71a0eba71 100644 --- a/tests/models/test_PipelineInterface.py +++ b/tests/models/test_PipelineInterface.py @@ -23,7 +23,7 @@ from peppy import Project, Sample from peppy.const import * from .conftest import ATAC_PROTOCOL_NAME, write_config_data -from tests.helpers import powerset +from ubiquerg import powerset __author__ = "Vince Reuter" diff --git a/tests/specific_use_cases/test_cli_prj_pipe_args_collision.py b/tests/specific_use_cases/test_cli_prj_pipe_args_collision.py index d492cee54..b522bf93c 100644 --- a/tests/specific_use_cases/test_cli_prj_pipe_args_collision.py +++ b/tests/specific_use_cases/test_cli_prj_pipe_args_collision.py @@ -10,7 +10,8 @@ from looper.pipeline_interface import PL_KEY, PROTOMAP_KEY from peppy.const import * from peppy.utils import count_repeats -from tests.helpers import powerset, randconf +from tests.helpers import randconf +from ubiquerg import powerset __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" diff --git a/tests/test_with_microtest_as_smoketest.py b/tests/test_with_microtest_as_smoketest.py index 0bde9a189..742e13126 100644 --- a/tests/test_with_microtest_as_smoketest.py +++ b/tests/test_with_microtest_as_smoketest.py @@ -58,7 +58,7 @@ def temp_chdir_home(tmpdir): @pytest.mark.remote_data @pytest.mark.usefixtures("temp_chdir_home") @pytest.mark.parametrize("cli_extra", - [build_cli_extra(**kvs) for kvs in + [build_cli_extra(kvs) for kvs in [{SAMPLE_SELECTOR_OPTION: "protocol", INCLUSION_OPTION: "ATAC-seq"}]]) def test_cli_microtest_smoke(cli_extra, data_conf_file): """ Using microtest as project, test CLI for failure on specific cases. """ From f112caf280d7d04438184fb188ac908b476f3642 Mon Sep 17 00:00:00 2001 From: Nathan Sheffield Date: Thu, 2 May 2019 13:47:22 -0400 Subject: [PATCH 51/61] url --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b90ffeb6b..cb7bb71dd 100644 --- a/README.md +++ b/README.md @@ -3,4 +3,4 @@ [![Build Status](https://travis-ci.org/pepkit/looper.svg?branch=master)](https://travis-ci.org/pepkit/looper) [![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io) -`Looper` is a pipeline submission engine. The typical use case is to run a bioinformatics pipeline across many different input samples. Instructions are in the [documentation](http://code.databio.org/looper/). +`Looper` is a pipeline submission engine. The typical use case is to run a bioinformatics pipeline across many different input samples. 
Instructions are in the [documentation](http://looper.databio.org/). From 640518bcb70e352a79296264d383e01d533dbdd5 Mon Sep 17 00:00:00 2001 From: Nathan Sheffield Date: Thu, 2 May 2019 13:48:30 -0400 Subject: [PATCH 52/61] url --- mkdocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index 314f760d6..f3e89817c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,6 +1,6 @@ site_name: Looper site_logo: img/looper_logo_dark.svg -site_url: http://code.databio.org/looper/ +site_url: http://looper.databio.org repo_url: http://github.com/pepkit/looper pypi_name: loopercli From 2214e85037a5dda629010e3b0a37adcbd520a45f Mon Sep 17 00:00:00 2001 From: Nathan Sheffield Date: Thu, 2 May 2019 13:48:51 -0400 Subject: [PATCH 53/61] Update _version.py --- looper/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/looper/_version.py b/looper/_version.py index b19938576..9d1ce461c 100644 --- a/looper/_version.py +++ b/looper/_version.py @@ -1,2 +1,2 @@ -__version__ = "0.12dev" +__version__ = "0.12.0" From bf067174d4001fe51c67ca3f4656e263754b370d Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 2 May 2019 14:07:09 -0400 Subject: [PATCH 54/61] fix reqs --- requirements/requirements-test.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt index d6d1664da..7f4458baa 100644 --- a/requirements/requirements-test.txt +++ b/requirements/requirements-test.txt @@ -1,4 +1,3 @@ coveralls==1.1 pytest-cov==2.4.0 pytest-remotedata -ubiquerg>=0.0.3dev From 5c63f2109533dbd6ee863929e03dcff566cd605a Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 2 May 2019 16:57:46 -0400 Subject: [PATCH 55/61] add ubiquerg to req --- requirements/requirements-all.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index ed522d19b..cce9cac1b 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -5,3 +5,4 @@ pandas>=0.20.2 pyyaml>=3.12 divvy>=0.3.1 peppy>=0.21dev +ubiquerg>=0.0.3 From af53e7dd9cf72dc637b6a3673cb16a9fab9f55ba Mon Sep 17 00:00:00 2001 From: Nathan Sheffield Date: Thu, 2 May 2019 17:07:04 -0400 Subject: [PATCH 56/61] Update requirements-all.txt --- requirements/requirements-all.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index cce9cac1b..a893ddfdc 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -4,5 +4,5 @@ logmuse>=0.0.2 pandas>=0.20.2 pyyaml>=3.12 divvy>=0.3.1 -peppy>=0.21dev +peppy>=0.21.0 ubiquerg>=0.0.3 From 540b0abd53425a9adbfa344bd9dd2473630894ec Mon Sep 17 00:00:00 2001 From: Nathan Sheffield Date: Thu, 2 May 2019 17:08:45 -0400 Subject: [PATCH 57/61] Update changelog.md --- docs/changelog.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 7e5c74f36..cb8d75c24 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -3,9 +3,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. 
## [Unreleased] -### Fixed -- Prevent duplication of CLI flags: [Issue 168](https://github.com/pepkit/looper/issues/168) -- Safer usage of CLI specification of sample subset selection: [`peppy` issue 298](https://github.com/pepkit/peppy/issues/298) + + +## [0.12.0] -- 2019-05-0X + +### Added +- First implementation of pipeline interface 'outputs', so pipeline authors can specify items of interest produced by the pipeline. ## [0.11.1] - 2019-04-17 From 36c9569bafbbf5801b567dec336ef5d2f3ebeefd Mon Sep 17 00:00:00 2001 From: Vince Date: Thu, 2 May 2019 20:52:52 -0400 Subject: [PATCH 58/61] changelog for release --- docs/changelog.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index cb8d75c24..b37e0993a 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,13 +2,18 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. -## [Unreleased] - - -## [0.12.0] -- 2019-05-0X +## [0.12.0] -- 2019-05-02 ### Added - First implementation of pipeline interface 'outputs', so pipeline authors can specify items of interest produced by the pipeline. +- Functions and attributes on `Project` to support "outputs" (`interfaces`, `get_interfaces`, `get_outputs`) + +### Changed +- Start "compute" --> "compute_packges" transition +- `get_logger` moved to `peppy` + +### Fixed +- Prevent CLI option duplication in pipeline commands generated ## [0.11.1] - 2019-04-17 From cde9f8ef335ef6005bf452168f321dbd04ed22ce Mon Sep 17 00:00:00 2001 From: Vince Date: Thu, 2 May 2019 20:55:07 -0400 Subject: [PATCH 59/61] ignore build stuff --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index c834a3a4c..8491a1235 100644 --- a/.gitignore +++ b/.gitignore @@ -68,10 +68,12 @@ open_pipelines/ # Reserved files for comparison *RESERVE* +doc/ +build/ dist/ looper.egg-info/ loopercli.egg-info/ *ipynb_checkpoints* -hello_looper-master* \ No newline at end of file +hello_looper-master* From b84220edd9bc0c40af10c3b82e1cd4da8ba7ebe0 Mon Sep 17 00:00:00 2001 From: Nathan Sheffield Date: Fri, 3 May 2019 08:16:39 -0400 Subject: [PATCH 60/61] Update requirements-doc.txt --- requirements/requirements-doc.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements-doc.txt b/requirements/requirements-doc.txt index 9c5fe172d..492fbb486 100644 --- a/requirements/requirements-doc.txt +++ b/requirements/requirements-doc.txt @@ -2,4 +2,4 @@ mkdocs>=1.0 markdown-include pydoc-markdown https://github.com/databio/mkdocs-databio/archive/master.zip -https://github.com/pepkit/looper/archive/master.zip +loopercli From 7a01cb06e3e8f7a2784dc7f5d97d6ad2f6d9f76d Mon Sep 17 00:00:00 2001 From: Nathan Sheffield Date: Fri, 3 May 2019 08:22:37 -0400 Subject: [PATCH 61/61] relax test reqs --- requirements/requirements-dev.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 963cabdb3..72dcff1ac 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,4 +1,4 @@ jinja2 -mock==2.0.0 -pytest==3.0.7 +mock>=2.0.0 +pytest>=3.0.7 ubiquerg>=0.0.3
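
Taken together, the user-facing feature of this series is the pipeline interface `outputs` section plus the `Project.get_outputs()` accessor documented above. Below is a minimal consumption sketch, not part of any patch in the series; the config filename is an assumption, and the `smoothed_bw` key is borrowed from the pipeline-interface docs example.

```python
# Sketch only: one way a caller might consume the new 'outputs' declarations.
# Assumes looper 0.12 is installed and "project_config.yaml" is a valid PEP
# project config whose pipeline interfaces declare an 'outputs' section.
from looper import Project

prj = Project("project_config.yaml")

# get_outputs() maps each pipeline name to a mapping from output key to a
# (path template, sample names) pair, e.g.
# "smoothed_bw" -> ("aligned_{sample.genome}/{sample.name}_smoothed.bw", [...]).
for pipeline, outputs in prj.get_outputs().items():
    for out_key, (path_template, sample_names) in outputs.items():
        print("{} :: {} -> {} ({} samples)".format(
            pipeline, out_key, path_template, len(sample_names)))
```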