Refactor for multiple dataset support (#29)
* [TMP] script working

* [TMP] update DA scripts

* [TMP] Viz + LR working

* Updates to workflow for multiple datasets

* Fix #27

* Update GH Action

* Move workflow structure

* updated gh actions
gibsramen authored Aug 16, 2022
1 parent c5ed705 commit 4f6df23
Showing 62 changed files with 9,157 additions and 430 deletions.
47 changes: 19 additions & 28 deletions .github/workflows/main.yml
@@ -16,36 +16,27 @@ on:
       - "README.md"
 
 jobs:
-  Linting:
+  build:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
-    - name: Lint workflow
-      uses: snakemake/[email protected]
-      with:
-        directory: .
-        snakefile: workflow/Snakefile
-        stagein: "mamba install -y -n snakemake --channel conda-forge --channel bioconda"
-        args: "--lint"
 
-  Testing:
-    runs-on: ubuntu-latest
-    needs:
-      - Linting
-    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v2
+      with:
+        persist-credentials: false
+        fetch-depth: 0
+
+    - uses: conda-incubator/setup-miniconda@v2
+      with:
+        activate-environment: qadabra
+        mamba-version: "*"
+        channels: conda-forge,defaults,bioconda
+        channel-priority: true
+        python-version: "3.8"
 
-    - name: Test workflow
-      uses: snakemake/snakemake-github-action@v1
-      with:
-        directory: .
-        snakefile: workflow/Snakefile
-        args: "--use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba"
-        stagein: "conda config --get channel_priority --json"
+    - name: Install conda packages
+      shell: bash -l {0}
+      run: mamba install snakemake click biom-format pandas
 
-    - name: Test report
-      uses: snakemake/snakemake-github-action@v1
-      with:
-        directory: .
-        snakefile: workflow/Snakefile
-        args: "--report report.zip"
+    - name: Run Snakemake
+      shell: bash -l {0}
+      run: make snaketest
2 changes: 2 additions & 0 deletions .gitignore
@@ -2,3 +2,5 @@
 *.swp
 *.snakemake
 *__pycache__
+*egg-info/
+config/datasets.tsv
2 changes: 2 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,2 @@
graft workflow
graft config
10 changes: 10 additions & 0 deletions Makefile
@@ -1,2 +1,12 @@
+TMPDIR := $(shell mktemp -d)
+TABLE_FILE := $(shell realpath qadabra/test_data/table.biom)
+MD_FILE := $(shell realpath qadabra/test_data/metadata.tsv)
+
 create_rulegraph:
 	snakemake -f --rulegraph | dot -Tpng > imgs/rule_graph.png
+
+snaketest:
+	@cd $(TMPDIR); \
+	qadabra create-workflow --workflow-dest . ;\
+	qadabra add-dataset --table $(TABLE_FILE) --metadata $(MD_FILE) --name "ampharos" --factor-name anemia --target-level anemic --reference-level normal --verbose ; \
+	snakemake --use-conda --cores 2
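
The same create-workflow / add-dataset sequence that the snaketest target drives from the shell can also be exercised in-process with click's test runner. A minimal sketch, assuming the package is installed; the table and metadata paths are placeholders and must point at real files (snaketest points them at the repository's qadabra/test_data files):

from click.testing import CliRunner

from qadabra.qadabra import qadabra

runner = CliRunner()
with runner.isolated_filesystem():
    # Scaffold workflow/ and config/ in a throwaway directory,
    # mirroring the Makefile's snaketest target.
    result = runner.invoke(qadabra, ["create-workflow", "--workflow-dest", "."])
    assert result.exit_code == 0, result.output

    # Register a dataset; replace the placeholder paths with real files.
    result = runner.invoke(qadabra, [
        "add-dataset",
        "--table", "/path/to/table.biom",
        "--metadata", "/path/to/metadata.tsv",
        "--name", "ampharos",
        "--factor-name", "anemia",
        "--target-level", "anemic",
        "--reference-level", "normal",
        "--verbose",
    ])
    print(result.output)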
1 change: 1 addition & 0 deletions qadabra/__init__.py
@@ -0,0 +1 @@
__version__ = "0.3.0a1"
12 changes: 1 addition & 11 deletions config/config.yaml → qadabra/config/config.yaml
@@ -1,14 +1,4 @@
-table: "data/table.biom"
-metadata: "data/metadata.tsv"
-tree: "data/tree.nwk"
 stylesheet: "config/qadabra.mplstyle"
-model:
-  covariate: anemia
-  target: anemic
-  reference: normal
-  confounders:
-    - sex
-    - collection_cutoff
 tools:
   - deseq2
   - ancombc
@@ -27,5 +17,5 @@ log_ratio_feat_pcts:
   - 15
   - 20
 ml_params:
-  n_splits: 10
+  n_splits: 5
   n_repeats: 5
File renamed without changes.
196 changes: 196 additions & 0 deletions qadabra/qadabra.py
@@ -0,0 +1,196 @@
import logging
import os
import pathlib
from pkg_resources import resource_filename
import shutil
from typing import List

import biom
import click
import pandas as pd

from qadabra import __version__
from qadabra.utils import _validate_input

SNKFILE_TEXT = """from pkg_resources import resource_filename
from snakemake.utils import min_version
min_version("6.0")
qadabra_snakefile = resource_filename("qadabra", "workflow/Snakefile")
configfile: "config/config.yaml"
module qadabra:
snakefile:
qadabra_snakefile
config:
config
use rule * from qadabra
"""


@click.group()
@click.version_option(__version__)
def qadabra():
"""Differential abundance workflow"""
pass


@qadabra.command()
@click.option(
    "--table",
    type=click.Path(exists=True),
    required=True,
    help="Feature table in BIOM format"
)
@click.option(
    "--metadata",
    type=click.Path(exists=True),
    required=True,
    help="Metadata in TSV format"
)
@click.option(
    "--tree",
    type=click.Path(exists=True),
    required=False,
    help="Phylogenetic tree in Newick format"
)
@click.option(
    "--name",
    type=str,
    required=True,
    help="Name of dataset"
)
@click.option(
    "--factor-name",
    type=str,
    required=True,
    help="Name of factor grouping in metadata"
)
@click.option(
    "--target-level",
    type=str,
    required=True,
    help="Grouping level on which to perform differential abundance"
)
@click.option(
    "--reference-level",
    type=str,
    required=True,
    help="Grouping level to use as reference"
)
@click.option(
    "--confounder",
    type=str,
    required=False,
    multiple=True,
    help="Confounder variable to consider (can provide multiple)"
)
@click.option(
    "--validate-input",
    is_flag=True,
    show_default=True,
    default=True
)
@click.option(
    "--verbose",
    is_flag=True,
    show_default=True,
    default=False,
    help="Whether to output progress to console"
)
def add_dataset(
    table,
    metadata,
    tree,
    name,
    factor_name,
    target_level,
    reference_level,
    confounder,
    validate_input,
    verbose
):
    """Add dataset on which to run Qadabra"""
    if not pathlib.Path("./workflow").exists():
        raise ValueError("Workflow has not been created!")

    dataset_sheet = "config/datasets.tsv"
    logger = logging.getLogger(__name__)
    log_level = logging.INFO if verbose else logging.WARNING
    logger.setLevel(log_level)
    sh = logging.StreamHandler()
    sh.setLevel(log_level)
    formatter = logging.Formatter(
        "[%(asctime)s - %(levelname)s] :: %(message)s",
        "%Y-%m-%d %H:%M:%S"
    )
    sh.setFormatter(formatter)
    logger.addHandler(sh)

    if validate_input:
        logger.info("Validating input...")
        _validate_input(logger, table, metadata, factor_name, target_level,
                        reference_level, tree, confounder)

    # Build a one-row frame for the new dataset, indexed by its name.
    dataset_sheet = pathlib.Path(dataset_sheet)
    new_ds = pd.Series({
        "table": pathlib.Path(table).resolve(),
        "metadata": pathlib.Path(metadata).resolve(),
        "factor_name": factor_name,
        "target_level": target_level,
        "reference_level": reference_level,
    }, name=name).to_frame().T

    if tree is not None:
        new_ds["tree"] = pathlib.Path(tree).resolve()
    else:
        new_ds["tree"] = None

    if confounder:
        new_ds["confounders"] = ";".join(confounder)
    else:
        new_ds["confounders"] = None

    # Append to an existing dataset sheet or create a new one.
    if dataset_sheet.exists():
        logger.info("Loading datasheet...")
        ds_sheet = pd.read_table(dataset_sheet, sep="\t", index_col=0)
        if name in ds_sheet.index:
            raise ValueError(f"{name} already exists in dataset sheet!")
        ds_sheet = pd.concat([ds_sheet, new_ds], axis=0)
    else:
        logger.info("Dataset does not exist. Creating...")
        ds_sheet = new_ds

    ds_sheet.to_csv(dataset_sheet, sep="\t", index=True)
    logger.info(f"Saved dataset sheet to {dataset_sheet}")


@qadabra.command()
@click.option(
    "--workflow-dest",
    type=click.Path(exists=False),
    default="."
)
def create_workflow(workflow_dest):
    """Create new Qadabra workflow structure"""
    wflow_dest = pathlib.Path(workflow_dest)
    wflow_dir = wflow_dest / "workflow"
    cfg_dir = wflow_dest / "config"
    os.makedirs(wflow_dir)
    os.makedirs(cfg_dir)

    # Copy the packaged config and stylesheet into the new config directory.
    cfg_file = resource_filename("qadabra", "config/config.yaml")
    shutil.copy(cfg_file, cfg_dir / "config.yaml")

    style_file = resource_filename("qadabra", "config/qadabra.mplstyle")
    shutil.copy(style_file, cfg_dir / "qadabra.mplstyle")

    # Write a thin Snakefile that imports Qadabra's packaged workflow as a module.
    snkfile_path = wflow_dir / "Snakefile"
    with open(snkfile_path, "w") as f:
        f.write(SNKFILE_TEXT)


if __name__ == "__main__":
    qadabra()
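
For reference, the dataset sheet that add_dataset writes is a plain TSV indexed by the --name value, with one row per dataset (columns: table, metadata, factor_name, target_level, reference_level, tree, confounders). A minimal sketch of inspecting it, assuming the "ampharos" dataset from the Makefile example has been added:

import pandas as pd

# Read the sheet the same way add_dataset does: tab-separated, first column as index.
ds_sheet = pd.read_table("config/datasets.tsv", sep="\t", index_col=0)
print(ds_sheet.columns.tolist())
print(ds_sheet.loc["ampharos", ["factor_name", "target_level", "reference_level"]])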
15 changes: 15 additions & 0 deletions qadabra/templates/Snakefile.jinja2
@@ -0,0 +1,15 @@
from pkg_resources import resource_filename

from snakemake.utils import min_version
min_version("6.0")

qadabra_snakefile = resource_filename("qadabra", "workflow/Snakefile")
configfile: "config/config.yaml"

module qadabra:
    snakefile:
        qadabra_snakefile
    config:
        config

use rule * from qadabra
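
This packaged template mirrors the SNKFILE_TEXT string that create_workflow writes, and it currently contains no Jinja2 variables, so rendering it simply reproduces the text. A hypothetical sketch, assuming jinja2 is installed and the templates directory ships with the package:

from jinja2 import Template
from pkg_resources import resource_filename

# Load and render the packaged template; with no variables defined,
# the rendered output matches the file verbatim.
tmpl_path = resource_filename("qadabra", "templates/Snakefile.jinja2")
with open(tmpl_path) as f:
    snakefile_text = Template(f.read()).render()
print(snakefile_text)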
