From 9612414613ab8d96ee1d25eb0c47ab28bf7a6a7b Mon Sep 17 00:00:00 2001 From: Erik Serrano <31600622+axiomcura@users.noreply.github.com> Date: Mon, 27 Mar 2023 15:10:45 -0700 Subject: [PATCH] Decoupling rule modules into individual components (#33) * fixed minor pathing bugs * separated cp_process module * update logs documentation * added documentation * edit typos * update cp_process workflow * update pycytominer version * file typo fixed --- cytosnake/cli/args.py | 2 +- cytosnake/helpers/helper_funcs.py | 2 +- workflows/envs/cytominer_env.yaml | 2 +- workflows/rules/aggregate.smk | 51 +++++++++++++++++ workflows/rules/annotate.smk | 46 +++++++++++++++ workflows/rules/feature_select.smk | 38 ++++++++----- workflows/rules/generate_consensus.smk | 38 +++++++++++++ workflows/rules/merge_logs.smk | 20 +++++-- workflows/rules/normalize.smk | 40 +++++++++++++ workflows/rules/preprocessing.smk | 78 -------------------------- workflows/scripts/consensus.py | 1 - workflows/workflow/cp_process.smk | 8 ++- 12 files changed, 221 insertions(+), 105 deletions(-) create mode 100644 workflows/rules/aggregate.smk create mode 100644 workflows/rules/annotate.smk create mode 100644 workflows/rules/generate_consensus.smk create mode 100644 workflows/rules/normalize.smk delete mode 100644 workflows/rules/preprocessing.smk diff --git a/cytosnake/cli/args.py b/cytosnake/cli/args.py index ec97343f..f7b1315d 100644 --- a/cytosnake/cli/args.py +++ b/cytosnake/cli/args.py @@ -60,7 +60,7 @@ def __call__(self, parser, args, values, option_string=None): f"Unable to find '{values}'. 
Please specify a supported workflow: {supported_wf}" ) # grabbing and setting the new value with the extracted path - values = load_workflow_path(values) + values = str(load_workflow_path(values)) # return new attributes of the `workflow` parameter setattr(args, self.dest, values) diff --git a/cytosnake/helpers/helper_funcs.py b/cytosnake/helpers/helper_funcs.py index e18f4d46..8033dea8 100644 --- a/cytosnake/helpers/helper_funcs.py +++ b/cytosnake/helpers/helper_funcs.py @@ -68,7 +68,7 @@ def get_barcodes() -> str: """ # Barcodes are optional. If not added, set to "None" try: - barcode_path = PATHS["project_dir"]["data_dir_conts"]["barcode"] + barcode_path = PATHS["project_dir"]["data_directory_contents"]["barcode"] except KeyError: barcode_path = None diff --git a/workflows/envs/cytominer_env.yaml b/workflows/envs/cytominer_env.yaml index d45cd5e7..c9218b8b 100644 --- a/workflows/envs/cytominer_env.yaml +++ b/workflows/envs/cytominer_env.yaml @@ -8,4 +8,4 @@ dependencies: - pyyaml - pip - pip: - - git+https://github.com/cytomining/pycytominer.git@b2c6cc4580cf9e1c040a7370b99976916a22e756 + - git+https://github.com/cytomining/pycytominer.git@c90438fd7c11ad8b1689c21db16dab1a5280de6c diff --git a/workflows/rules/aggregate.smk b/workflows/rules/aggregate.smk new file mode 100644 index 00000000..bedfa818 --- /dev/null +++ b/workflows/rules/aggregate.smk @@ -0,0 +1,51 @@ +""" +rule module: aggregate.smk + + +Utilizes pycytominer's aggregate module: +https://github.com/cytomining/pycytominer/blob/c90438fd7c11ad8b1689c21db16dab1a5280de6c/pycytominer/aggregate.py + +Aggregates single-cell profiles into aggregated profiles based on a given strata + +For example, users can configure `Metadata_Well` as their strata in order to +aggregate single-cell data to the well level. 
+ +Parameters: +----------- +input: + sql_files: single-cell dataset + + barcodes: file containing unique barcodes that map to a specific plate + + metadata: directory containing metadata associated with the aggregate + profile +output: + aggregate_profile: aggregated profiles + cell_counts: CSV file that contains how many cells were counted per well + +Returns +------- + aggregated profiles and cell count data stored in the `results/` directory +# -------------------- +""" + + +configfile: "configs/configuration.yaml" + + +rule aggregate: + input: + sql_files=PLATE_DATA, + barcodes=BARCODES, + metadata=METADATA_DIR, + output: + aggregate_profile=AGGREGATE_DATA, + cell_counts=CELL_COUNTS, + log: + "logs/aggregate_{file_name}.log", + conda: + "../envs/cytominer_env.yaml" + params: + aggregate_config=config["config_paths"]["single_cell"], + script: + "../scripts/aggregate_cells.py" diff --git a/workflows/rules/annotate.smk b/workflows/rules/annotate.smk new file mode 100644 index 00000000..fb48e901 --- /dev/null +++ b/workflows/rules/annotate.smk @@ -0,0 +1,46 @@ +""" +rule module: annotate.smk + +Utilizes pycytominer's annotate module: +https://github.com/cytomining/pycytominer/blob/master/pycytominer/annotate.py + +Annotates profiles with given metadata. + +Parameters +---------- +input: + aggregate_profile: aggregated profile dataset + + barcodes: file containing unique barcodes that map to a specific plate + + metadata: directory containing metadata associated with the aggregate + profile + +output: + generates an annotated profile. 
+ + +Returns: +-------- + Generates an annotated profile stored in the `results/` directory +""" + + +configfile: "configs/configuration.yaml" + + +rule annotate: + input: + aggregate_profile=AGGREGATE_DATA, + barcodes=BARCODES, + metadata=METADATA_DIR, + output: + ANNOTATED_DATA, + conda: + "../envs/cytominer_env.yaml" + log: + "logs/annotate_{file_name}.log", + params: + annotate_config=config["config_paths"]["annotate"], + script: + "../scripts/annotate.py" diff --git a/workflows/rules/feature_select.smk b/workflows/rules/feature_select.smk index 90b3794e..ef36890f 100644 --- a/workflows/rules/feature_select.smk +++ b/workflows/rules/feature_select.smk @@ -1,3 +1,26 @@ +""" +rule module: feature_select.smk + +Utilizes pycytominer's feature select module: +https://github.com/cytomining/pycytominer/blob/master/pycytominer/feature_select.py + +Performs feature selection based on the given profiles. PyCytominer contains +different operations to conduct its feature selection: variance_threshold, +correlation_threshold, drop_na_columns, drop_outliers, and noise_removal. + +Parameters: +----------- +Input: + Cell morphology profiles +Output: + Selected features from profiles + +Returns +------- + CSV file containing selected features. Stored in the `results/` directory. 
+""" + + configfile: "configs/configuration.yaml" @@ -14,18 +37,3 @@ rule feature_select: "../envs/cytominer_env.yaml" script: "../scripts/feature_select.py" - - -rule create_consensus: - input: - SELECTED_FEATURE_DATA_EXPAND, - output: - CONSENSUS_DATA, - params: - consensus_configs=config["config_paths"]["consensus_config"], - log: - "logs/create_consensus.log", - conda: - "../envs/cytominer_env.yaml" - script: - "../scripts/consensus.py" diff --git a/workflows/rules/generate_consensus.smk b/workflows/rules/generate_consensus.smk new file mode 100644 index 00000000..8c292763 --- /dev/null +++ b/workflows/rules/generate_consensus.smk @@ -0,0 +1,38 @@ +""" +rule module: generate_consensus.smk + +Utilizes pycytominer's consensus module: +https://github.com/cytomining/pycytominer/blob/master/pycytominer/consensus.py + +Creates consensus profiles that reflect unique signatures associated with +external factors. + +Parameters: +----------- +input: + Selected features profile +output: + Consensus profile + +Returns +------- + Consensus profile stored in the `results/` directory +""" + + +configfile: "configs/configuration.yaml" + + +rule create_consensus: + input: + SELECTED_FEATURE_DATA_EXPAND, + output: + CONSENSUS_DATA, + params: + consensus_configs=config["config_paths"]["consensus_config"], + log: + "logs/create_consensus.log", + conda: + "../envs/cytominer_env.yaml" + script: + "../scripts/consensus.py" diff --git a/workflows/rules/merge_logs.smk b/workflows/rules/merge_logs.smk index c1dde8da..67cea78f 100644 --- a/workflows/rules/merge_logs.smk +++ b/workflows/rules/merge_logs.smk @@ -1,13 +1,21 @@ """ -Documentation: -Rule collects all generated logs from all porcessess and merges -them into a single log file. +rule module: merge_logs.smk -individual log files are stored into an archive file along with -the generated merged log. 
+Collects all log files generated within each rule module and merges them into +one log file. -The archive file is taged with (Month-day-year)-(hour-min-sec) +The log file is tagged with (Month-day-year)-(hour-min-sec) Example: 072922-083033_archived_logs + +Parameters: +Inputs: + No user-defined inputs; searches for individual logs in the `logs/` folder +Output: + Merged log file + + +Returns + Merged log file stored in the `logs/` directory """ diff --git a/workflows/rules/normalize.smk b/workflows/rules/normalize.smk new file mode 100644 index 00000000..a374970b --- /dev/null +++ b/workflows/rules/normalize.smk @@ -0,0 +1,40 @@ +""" +rule module: normalize.smk + +Utilizes pycytominer's normalization module: +https://github.com/cytomining/pycytominer/blob/c90438fd7c11ad8b1689c21db16dab1a5280de6c/pycytominer/normalize.py + +Normalizes single-cell or aggregate features. The current default normalization +method is `standardize`; other methods include `robustize`, `mad_robustize`, +and `spherize`. + +Parameters +---------- +input + single-cell or aggregated profiles + +output + normalized single-cell or aggregate dataset. + +Returns +------- + Generates a normalized profile stored in the `results/` directory +""" + + +configfile: "configs/configuration.yaml" + + +rule normalize: + input: + ANNOTATED_DATA, + output: + NORMALIZED_DATA, + conda: + "../envs/cytominer_env.yaml" + log: + "logs/normalized_{file_name}.log", + params: + normalize_config=config["config_paths"]["normalize"], + script: + "../scripts/normalize.py" diff --git a/workflows/rules/preprocessing.smk b/workflows/rules/preprocessing.smk deleted file mode 100644 index 6411c6b4..00000000 --- a/workflows/rules/preprocessing.smk +++ /dev/null @@ -1,78 +0,0 @@ -""" -Documentation: -Workflow that involves preprocessing raw single-cell plate data and -transforming it into normalized aggregate profiles. 
- -Parameters ----------- -sql_files : List[str] - List of SQL files containing plate data -barcodes : str - path pointing to the barcode file storing platemap IDs -metadata : str - path pointing to plate metadata - -Generates ---------- -cell_counts: .csv file - csv file containing n_cells per well -augmented: csv.gz file - Annotated aggregated profiles -normalized: csv.gz file - Normalized annotated aggregate profiles -# -------------------- -""" - - -# collecting all unique IDs from plate -configfile: "configs/configuration.yaml" - - -rule aggregate: - input: - sql_files=PLATE_DATA, - barcodes=BARCODES, - metadata=METADATA_DIR, - output: - aggregate_profile=AGGREGATE_DATA, - cell_counts=CELL_COUNTS, - log: - "logs/aggregate_{file_name}.log", - conda: - "../envs/cytominer_env.yaml" - params: - aggregate_config=config["config_paths"]["single_cell"], - script: - "../scripts/aggregate_cells.py" - - -rule annotate: - input: - aggregate_profile=AGGREGATE_DATA, - barcodes=BARCODES, - metadata=METADATA_DIR, - output: - ANNOTATED_DATA, - conda: - "../envs/cytominer_env.yaml" - log: - "logs/annotate_{file_name}.log", - params: - annotate_config=config["config_paths"]["annotate"], - script: - "../scripts/annotate.py" - - -rule normalize: - input: - ANNOTATED_DATA, - output: - NORMALIZED_DATA, - conda: - "../envs/cytominer_env.yaml" - log: - "logs/normalized_{file_name}.log", - params: - normalize_config=config["config_paths"]["normalize"], - script: - "../scripts/normalize.py" diff --git a/workflows/scripts/consensus.py b/workflows/scripts/consensus.py index a554418c..29619446 100644 --- a/workflows/scripts/consensus.py +++ b/workflows/scripts/consensus.py @@ -2,7 +2,6 @@ from pathlib import Path import pandas as pd -import snakemake import yaml from pycytominer import consensus from pycytominer.operations import get_na_columns diff --git a/workflows/workflow/cp_process.smk b/workflows/workflow/cp_process.smk index 6e8ae4a8..cce42f02 100644 --- 
a/workflows/workflow/cp_process.smk +++ b/workflows/workflow/cp_process.smk @@ -2,10 +2,13 @@ import glob from cytosnake.helpers import helper_funcs as hf -# importing Modules +# importing rule modules include: "../rules/common.smk" -include: "../rules/preprocessing.smk" +include: "../rules/aggregate.smk" +include: "../rules/annotate.smk" +include: "../rules/normalize.smk" include: "../rules/feature_select.smk" +include: "../rules/generate_consensus.smk" # expected outputs from workflow @@ -16,3 +19,4 @@ rule all: ANNOTATED_DATA_EXPAND, NORMALIZED_DATA_EXPAND, SELECTED_FEATURE_DATA_EXPAND, + CONSENSUS_DATA,