Refactor for multiple dataset support (#29)

* [TMP] script working * [TMP] update DA scripts * [TMP] Viz + LR working * Updates to workflow for multiple datasets * Fix #27 * Update GH Action * Move workflow structure * updated gh actions
biocore · Aug 16, 2022 · 4f6df23 · 4f6df23
1 parent c5ed705
commit 4f6df23
Show file tree

Hide file tree

Showing 62 changed files with 9,157 additions and 430 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -16,36 +16,27 @@ on:
       - "README.md"
 
 jobs:
-  Linting:
+  build:
     runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v2
-    - name: Lint workflow
-      uses: snakemake/[email protected]
-      with:
-        directory: .
-        snakefile: workflow/Snakefile
-        stagein: "mamba install -y -n snakemake --channel conda-forge --channel bioconda"
-        args: "--lint"
 
-  Testing:
-    runs-on: ubuntu-latest
-    needs:
-      - Linting
     steps:
-    - uses: actions/checkout@v2
+      - uses: actions/checkout@v2
+        with:
+          persist-credentials: false
+          fetch-depth: 0
+
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          activate-environment: qadabra
+          mamba-version: "*"
+          channels: conda-forge,defaults,bioconda
+          channel-priority: true
+          python-version: "3.8"
 
-    - name: Test workflow
-      uses: snakemake/snakemake-github-action@v1
-      with:
-        directory: .
-        snakefile: workflow/Snakefile
-        args: "--use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba"
-        stagein: "conda config --get channel_priority --json"
+      - name: Install conda packages
+        shell: bash -l {0}
+        run: mamba install snakemake click biom-format pandas
 
-    - name: Test report
-      uses: snakemake/snakemake-github-action@v1
-      with:
-        directory: .
-        snakefile: workflow/Snakefile
-        args: "--report report.zip"
+      - name: Run Snakemake
+        shell: bash -l {0}
+        run: make snaketest
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,5 @@
 *.swp
 *.snakemake
 *__pycache__
+*egg-info/
+config/datasets.tsv
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,2 @@
+graft workflow
+graft config
diff --git a/Makefile b/Makefile
@@ -1,2 +1,12 @@
+# Scratch directory and absolute test-data paths used by the `snaketest` target.
+# `:=` expands immediately, so `mktemp -d` and `realpath` run every time this
+# Makefile is parsed, whichever target is requested.
+TMPDIR := $(shell mktemp -d)
+TABLE_FILE := $(shell realpath qadabra/test_data/table.biom)
+MD_FILE := $(shell realpath qadabra/test_data/metadata.tsv)
+
 create_rulegraph:
 	snakemake -f --rulegraph | dot -Tpng > imgs/rule_graph.png
+
+# End-to-end smoke test: create a fresh workflow inside $(TMPDIR), register the
+# test dataset, then run the pipeline. The steps are chained with `&&` so the
+# recipe stops at the first failing command instead of carrying on (possibly in
+# the wrong directory) and reporting only the exit status of the last command.
+snaketest:
+	@cd $(TMPDIR) && \
+	qadabra create-workflow --workflow-dest . && \
+	qadabra add-dataset --table $(TABLE_FILE) --metadata $(MD_FILE) --name "ampharos" --factor-name anemia --target-level anemic --reference-level normal --verbose && \
+	snakemake --use-conda --cores 2
diff --git a/qadabra/__init__.py b/qadabra/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.3.0a1"
diff --git a/config/config.yaml → qadabra/config/config.yaml b/config/config.yaml → qadabra/config/config.yaml
@@ -1,14 +1,4 @@
-table: "data/table.biom"
-metadata: "data/metadata.tsv"
-tree: "data/tree.nwk"
 stylesheet: "config/qadabra.mplstyle"
-model:
-    covariate: anemia
-    target: anemic
-    reference: normal
-    confounders:
-        - sex
-        - collection_cutoff
 tools:
     - deseq2
     - ancombc
@@ -27,5 +17,5 @@ log_ratio_feat_pcts:
     - 15
     - 20
 ml_params:
-    n_splits: 10
+    n_splits: 5
     n_repeats: 5
diff --git a/config/qadabra.mplstyle → qadabra/config/qadabra.mplstyle b/config/qadabra.mplstyle → qadabra/config/qadabra.mplstyle
diff --git a/qadabra/qadabra.py b/qadabra/qadabra.py
@@ -0,0 +1,196 @@
+import logging
+import os
+import pathlib
+from pkg_resources import resource_filename
+import shutil
+from typing import List
+
+import biom
+import click
+import pandas as pd
+
+from qadabra import __version__
+from qadabra.utils import _validate_input
+
+# Text that `create_workflow` writes to <workflow-dest>/workflow/Snakefile.
+# The generated Snakefile looks up the Snakefile bundled with the installed
+# qadabra package, loads it as a Snakemake module configured from
+# config/config.yaml, and re-uses all of that module's rules.
+SNKFILE_TEXT = """from pkg_resources import resource_filename
+
+from snakemake.utils import min_version
+min_version("6.0")
+
+qadabra_snakefile = resource_filename("qadabra", "workflow/Snakefile")
+configfile: "config/config.yaml"
+
+module qadabra:
+    snakefile:
+        qadabra_snakefile
+    config:
+        config
+
+use rule * from qadabra
+"""
+
+
+# Root command group. Subcommands attach themselves to it through the
+# `@qadabra.command()` decorator. The docstring is what click prints as the
+# group's help text, so it is kept short.
+@click.group()
+@click.version_option(version=__version__)
+def qadabra():
+    """Differential abundance workflow"""
+
+
+@qadabra.command()
+@click.option(
+    "--table",
+    type=click.Path(exists=True),
+    required=True,
+    help="Feature table in BIOM format"
+)
+@click.option(
+    "--metadata",
+    type=click.Path(exists=True),
+    required=True,
+    help="Metadata in TSV format"
+)
+@click.option(
+    "--tree",
+    type=click.Path(exists=True),
+    required=False,
+    help="Phylogenetic tree in Newick format"
+)
+@click.option(
+    "--name",
+    type=str,
+    required=True,
+    help="Name of dataset"
+)
+@click.option(
+    "--factor-name",
+    type=str,
+    required=True,
+    help="Name of factor grouping in metadata"
+)
+@click.option(
+    "--target-level",
+    type=str,
+    required=True,
+    help="Grouping level on which to perform differential abundance"
+)
+@click.option(
+    "--reference-level",
+    type=str,
+    required=True,
+    help="Grouping level to use as reference"
+)
+@click.option(
+    "--confounder",
+    type=str,
+    required=False,
+    multiple=True,
+    help="Confounder variable to consider (can provide multiple)"
+)
+# Declared as an on/off pair: a lone flag whose default is already True could
+# never be switched off from the command line. `--validate-input` still works.
+@click.option(
+    "--validate-input/--no-validate-input",
+    show_default=True,
+    default=True,
+    help="Whether to validate the input files before adding the dataset"
+)
+@click.option(
+    "--verbose",
+    is_flag=True,
+    show_default=True,
+    default=False,
+    help="Whether to output progress to console"
+)
+def add_dataset(
+    table,
+    metadata,
+    tree,
+    name,
+    factor_name,
+    target_level,
+    reference_level,
+    confounder,
+    validate_input,
+    verbose
+):
+    """Add dataset on which to run Qadabra"""
+    # The docstring above is shown by click as the command help, so the longer
+    # description lives in comments.
+    #
+    # Adds one row, indexed by `name`, to config/datasets.tsv under the current
+    # directory, creating the sheet when it does not exist yet. Table, metadata
+    # and tree paths are stored resolved to absolute paths; confounders are
+    # stored as a single ";"-joined string.
+    #
+    # Raises ValueError when ./workflow is missing or when `name` is already
+    # present in the dataset sheet.
+
+    # `exists` has to be called: the bare bound method is always truthy, which
+    # made this guard unreachable.
+    if not pathlib.Path("./workflow").exists():
+        raise ValueError("Workflow has not been created!")
+
+    dataset_sheet = pathlib.Path("config/datasets.tsv")
+
+    # Progress messages are INFO level, so they only show up with --verbose.
+    logger = logging.getLogger(__name__)
+    log_level = logging.INFO if verbose else logging.WARNING
+    logger.setLevel(log_level)
+    sh = logging.StreamHandler()
+    sh.setLevel(log_level)
+    formatter = logging.Formatter(
+        "[%(asctime)s - %(levelname)s] :: %(message)s",
+        "%Y-%m-%d %H:%M:%S"
+    )
+    sh.setFormatter(formatter)
+    logger.addHandler(sh)
+
+    if validate_input:
+        logger.info("Validating input...")
+        _validate_input(logger, table, metadata, factor_name, target_level,
+                        reference_level, tree, confounder)
+
+    # Build a one-row frame whose index label is the dataset name.
+    new_ds = pd.Series({
+        "table": pathlib.Path(table).resolve(),
+        "metadata": pathlib.Path(metadata).resolve(),
+        "factor_name": factor_name,
+        "target_level": target_level,
+        "reference_level": reference_level,
+    }, name=name).to_frame().T
+
+    # Optional columns are always present so every row has the same layout.
+    new_ds["tree"] = pathlib.Path(tree).resolve() if tree is not None else None
+    new_ds["confounders"] = ";".join(confounder) if confounder else None
+
+    if dataset_sheet.exists():
+        logger.info("Loading datasheet...")
+        ds_sheet = pd.read_table(dataset_sheet, sep="\t", index_col=0)
+        if name in ds_sheet.index:
+            raise ValueError(f"{name} already exists in dataset sheet!")
+        ds_sheet = pd.concat([ds_sheet, new_ds], axis=0)
+    else:
+        logger.info("Dataset does not exist. Creating...")
+        ds_sheet = new_ds
+
+    ds_sheet.to_csv(dataset_sheet, sep="\t", index=True)
+    logger.info(f"Saved dataset sheet to {dataset_sheet}")
+
+
+@qadabra.command()
+@click.option(
+    "--workflow-dest",
+    type=click.Path(exists=False),
+    default="."
+)
+def create_workflow(workflow_dest):
+    """Create new Qadabra workflow structure"""
+    # Lays out <workflow_dest>/workflow and <workflow_dest>/config, copies the
+    # packaged config files into the latter and writes the Snakefile stub.
+    dest_root = pathlib.Path(workflow_dest)
+    workflow_dir = dest_root / "workflow"
+    config_dir = dest_root / "config"
+
+    # os.makedirs raises if a directory is already there, so an existing
+    # workflow is never written over.
+    for new_dir in (workflow_dir, config_dir):
+        os.makedirs(new_dir)
+
+    # (packaged resource, file name inside the new config directory)
+    packaged_configs = (
+        ("config/config.yaml", "config.yaml"),
+        ("config/qadabra.mplstyle", "qadabra.mplstyle"),
+    )
+    for resource, target_name in packaged_configs:
+        shutil.copy(
+            resource_filename("qadabra", resource),
+            config_dir / target_name
+        )
+
+    (workflow_dir / "Snakefile").write_text(SNKFILE_TEXT)
+
+
+# Invoke the click command group when this module is executed as a script.
+if __name__ == "__main__":
+    qadabra()
diff --git a/qadabra/templates/Snakefile.jinja2 b/qadabra/templates/Snakefile.jinja2
@@ -0,0 +1,15 @@
+# Snakefile stub for a user workflow directory.
+# NOTE(review): this text is the same as SNKFILE_TEXT in qadabra/qadabra.py and
+# contains no Jinja2 placeholders - confirm whether the template is still used.
+from pkg_resources import resource_filename
+
+from snakemake.utils import min_version
+# Refuse to run on Snakemake releases older than 6.0.
+min_version("6.0")
+
+# Location of the Snakefile shipped inside the installed qadabra package.
+qadabra_snakefile = resource_filename("qadabra", "workflow/Snakefile")
+configfile: "config/config.yaml"
+
+# Load the packaged workflow as a module, passing it this workflow's config.
+module qadabra:
+    snakefile:
+        qadabra_snakefile
+    config:
+        config
+
+# Re-use every rule defined by the module.
+use rule * from qadabra
diff --git a/qadabra/test_data/archive/127612_insertion_tree.relabelled.tre b/qadabra/test_data/archive/127612_insertion_tree.relabelled.tre