From 9612414613ab8d96ee1d25eb0c47ab28bf7a6a7b Mon Sep 17 00:00:00 2001
From: Erik Serrano <31600622+axiomcura@users.noreply.github.com>
Date: Mon, 27 Mar 2023 15:10:45 -0700
Subject: [PATCH] Decoupling rule modules into individual components  (#33)

* fixed minor pathing bugs

* separated cp_process module

* update logs documentation

* added documentation

* edit typos

* update cp_process workflow

* update pycytominer version

* file typo fixed
---
 cytosnake/cli/args.py                  |  2 +-
 cytosnake/helpers/helper_funcs.py      |  2 +-
 workflows/envs/cytominer_env.yaml      |  2 +-
 workflows/rules/aggregate.smk          | 51 +++++++++++++++++
 workflows/rules/annotate.smk           | 46 +++++++++++++++
 workflows/rules/feature_select.smk     | 38 ++++++++-----
 workflows/rules/generate_consensus.smk | 38 +++++++++++++
 workflows/rules/merge_logs.smk         | 20 +++++--
 workflows/rules/normalize.smk          | 40 +++++++++++++
 workflows/rules/preprocessing.smk      | 78 --------------------------
 workflows/scripts/consensus.py         |  1 -
 workflows/workflow/cp_process.smk      |  8 ++-
 12 files changed, 221 insertions(+), 105 deletions(-)
 create mode 100644 workflows/rules/aggregate.smk
 create mode 100644 workflows/rules/annotate.smk
 create mode 100644 workflows/rules/generate_consensus.smk
 create mode 100644 workflows/rules/normalize.smk
 delete mode 100644 workflows/rules/preprocessing.smk

diff --git a/cytosnake/cli/args.py b/cytosnake/cli/args.py
index ec97343f..f7b1315d 100644
--- a/cytosnake/cli/args.py
+++ b/cytosnake/cli/args.py
@@ -60,7 +60,7 @@ def __call__(self, parser, args, values, option_string=None):
                 f"Unable to find '{values}'. Please specify a supported workflow: {supported_wf}"
             )
         # grabbing and setting the new value with the extracted path
-        values = load_workflow_path(values)
+        values = str(load_workflow_path(values))
 
         # return new attributes of the `workflow` parameter
         setattr(args, self.dest, values)
diff --git a/cytosnake/helpers/helper_funcs.py b/cytosnake/helpers/helper_funcs.py
index e18f4d46..8033dea8 100644
--- a/cytosnake/helpers/helper_funcs.py
+++ b/cytosnake/helpers/helper_funcs.py
@@ -68,7 +68,7 @@ def get_barcodes() -> str:
     """
     # Barcodes are optional. If not added, set to "None"
     try:
-        barcode_path = PATHS["project_dir"]["data_dir_conts"]["barcode"]
+        barcode_path = PATHS["project_dir"]["data_directory_contents"]["barcode"]
     except KeyError:
         barcode_path = None
 
diff --git a/workflows/envs/cytominer_env.yaml b/workflows/envs/cytominer_env.yaml
index d45cd5e7..c9218b8b 100644
--- a/workflows/envs/cytominer_env.yaml
+++ b/workflows/envs/cytominer_env.yaml
@@ -8,4 +8,4 @@ dependencies:
   - pyyaml
   - pip
   - pip:
-      - git+https://github.com/cytomining/pycytominer.git@b2c6cc4580cf9e1c040a7370b99976916a22e756
+      - git+https://github.com/cytomining/pycytominer.git@c90438fd7c11ad8b1689c21db16dab1a5280de6c
diff --git a/workflows/rules/aggregate.smk b/workflows/rules/aggregate.smk
new file mode 100644
index 00000000..bedfa818
--- /dev/null
+++ b/workflows/rules/aggregate.smk
@@ -0,0 +1,51 @@
+"""
+rule module: aggregate.smk
+
+
+Utilizes pycytominer's aggregate module:
+https://github.com/cytomining/pycytominer/blob/c90438fd7c11ad8b1689c21db16dab1a5280de6c/pycytominer/aggregate.py
+
+Aggregates single-cell profiles into aggregated profiles based on a given strata
+
+For example, users can configure `Metadata_Well` as their strata in order to
+aggregate single-cell data into the Well level.
+
+Parameters:
+-----------
+input:
+  sql_files: single-cell dataset
+
+  barcodes: file containing unique barcodes that maps to a specific plate
+
+  metadata: directory containing metadata associated with the aggregate
+            profile
+output:
+  aggregate_profile: aggregated profiles
+  cell_counts: CSV file that contains how many cells were counted per well
+
+Returns
+-------
+  aggregated profiles and cell count data stored in the `results/` directory
+# --------------------
+"""
+
+
+configfile: "configs/configuration.yaml"
+
+
+rule aggregate:
+    input:
+        sql_files=PLATE_DATA,
+        barcodes=BARCODES,
+        metadata=METADATA_DIR,
+    output:
+        aggregate_profile=AGGREGATE_DATA,
+        cell_counts=CELL_COUNTS,
+    log:
+        "logs/aggregate_{file_name}.log",
+    conda:
+        "../envs/cytominer_env.yaml"
+    params:
+        aggregate_config=config["config_paths"]["single_cell"],
+    script:
+        "../scripts/aggregate_cells.py"
diff --git a/workflows/rules/annotate.smk b/workflows/rules/annotate.smk
new file mode 100644
index 00000000..fb48e901
--- /dev/null
+++ b/workflows/rules/annotate.smk
@@ -0,0 +1,46 @@
+"""
+rule module: annotate.smk
+
+Utilizes pycytominer's annotate module:
+https://github.com/cytomining/pycytominer/blob/master/pycytominer/annotate.py
+
+Annotates profiles with given metadata.
+
+Parameters
+----------
+input:
+    aggregate_profile: aggregated profile dataset
+
+    barcodes: file containing unique barcodes that maps to a specific plate
+
+    metadata: directory containing metadata associated with the aggregate
+              profile
+
+output:
+    generates an annotated profile.
+
+
+Returns:
+--------
+    Generates an annotated profile stored in the `results/` directory
+"""
+
+
+configfile: "configs/configuration.yaml"
+
+
+rule annotate:
+    input:
+        aggregate_profile=AGGREGATE_DATA,
+        barcodes=BARCODES,
+        metadata=METADATA_DIR,
+    output:
+        ANNOTATED_DATA,
+    conda:
+        "../envs/cytominer_env.yaml"
+    log:
+        "logs/annotate_{file_name}.log",
+    params:
+        annotate_config=config["config_paths"]["annotate"],
+    script:
+        "../scripts/annotate.py"
diff --git a/workflows/rules/feature_select.smk b/workflows/rules/feature_select.smk
index 90b3794e..ef36890f 100644
--- a/workflows/rules/feature_select.smk
+++ b/workflows/rules/feature_select.smk
@@ -1,3 +1,26 @@
+"""
+rule module: feature_select.smk
+
+Utilizes pycytominer's feature select module:
+https://github.com/cytomining/pycytominer/blob/master/pycytominer/feature_select.py
+
+Performs feature selection based on the given profiles. PyCytominer contains
+different operations to conduct its feature selection: variance_threshold,
+correlation_threshold, drop_na_columns, drop_outliers, and noise_removal.
+
+Parameters:
+-----------
+Input:
+    Cell morphology profiles
+Output:
+    Selected features from profiles
+
+Returns
+-------
+    CSV file containing selected features. Stored in the `results/` directory.
+"""
+
+
 configfile: "configs/configuration.yaml"
 
 
@@ -14,18 +37,3 @@ rule feature_select:
         "../envs/cytominer_env.yaml"
     script:
         "../scripts/feature_select.py"
-
-
-rule create_consensus:
-    input:
-        SELECTED_FEATURE_DATA_EXPAND,
-    output:
-        CONSENSUS_DATA,
-    params:
-        consensus_configs=config["config_paths"]["consensus_config"],
-    log:
-        "logs/create_consensus.log",
-    conda:
-        "../envs/cytominer_env.yaml"
-    script:
-        "../scripts/consensus.py"
diff --git a/workflows/rules/generate_consensus.smk b/workflows/rules/generate_consensus.smk
new file mode 100644
index 00000000..8c292763
--- /dev/null
+++ b/workflows/rules/generate_consensus.smk
@@ -0,0 +1,38 @@
+"""
+rule module: generate_consensus.smk
+
+Utilizes pycytominer's consensus module:
+https://github.com/cytomining/pycytominer/blob/master/pycytominer/consensus.py
+
+Creates consensus profiles that reflect unique signatures associated with
+external factors.
+
+Parameters:
+----------
+input:
+    Selected features profile
+output:
+    Consensus profile
+
+Return:
+-------
+    Consensus profile stored in the `results/` directory
+"""
+
+
+configfile: "configs/configuration.yaml"
+
+
+rule create_consensus:
+    input:
+        SELECTED_FEATURE_DATA_EXPAND,
+    output:
+        CONSENSUS_DATA,
+    params:
+        consensus_configs=config["config_paths"]["consensus_config"],
+    log:
+        "logs/create_consensus.log",
+    conda:
+        "../envs/cytominer_env.yaml"
+    script:
+        "../scripts/consensus.py"
diff --git a/workflows/rules/merge_logs.smk b/workflows/rules/merge_logs.smk
index c1dde8da..67cea78f 100644
--- a/workflows/rules/merge_logs.smk
+++ b/workflows/rules/merge_logs.smk
@@ -1,13 +1,21 @@
 """
-Documentation:
-Rule collects all generated logs from all porcessess and merges
-them into a single log file.
+rule module: merge_logs.smk
 
-individual log files are stored into an archive file along with
-the generated merged log.
+Collects all log files generated within each rule module and merges them into
+one log file.
 
-The archive file is taged with (Month-day-year)-(hour-min-sec)
+The log file is tagged with (Month-day-year)-(hour-min-sec)
 Example: 072922-083033_archived_logs
+
+Parameters:
+Inputs:
+    No user-defined inputs; searches for individual logs in the `logs/` folder
+Output:
+    Merged log file
+
+
+Returns
+    Merged log file stored in the `logs/` directory
 """
 
 
diff --git a/workflows/rules/normalize.smk b/workflows/rules/normalize.smk
new file mode 100644
index 00000000..a374970b
--- /dev/null
+++ b/workflows/rules/normalize.smk
@@ -0,0 +1,40 @@
+"""
+rule module: normalize.smk
+
+Utilizes pycytominer's normalization module:
+https://github.com/cytomining/pycytominer/blob/c90438fd7c11ad8b1689c21db16dab1a5280de6c/pycytominer/normalize.py
+
+Normalizes single-cell or aggregate features. The current default normalization
+method is `standardize`; other methods are documented in the module linked above.
+
+
+parameters
+----------
+input
+    single-cell or aggregated profiles
+
+output
+    normalized single-cell or aggregate dataset.
+
+Returns
+-------
+    Generates a normalized profile stored in the `results/` directory
+"""
+
+
+configfile: "configs/configuration.yaml"
+
+
+rule normalize:
+    input:
+        ANNOTATED_DATA,
+    output:
+        NORMALIZED_DATA,
+    conda:
+        "../envs/cytominer_env.yaml"
+    log:
+        "logs/normalized_{file_name}.log",
+    params:
+        normalize_config=config["config_paths"]["normalize"],
+    script:
+        "../scripts/normalize.py"
diff --git a/workflows/rules/preprocessing.smk b/workflows/rules/preprocessing.smk
deleted file mode 100644
index 6411c6b4..00000000
--- a/workflows/rules/preprocessing.smk
+++ /dev/null
@@ -1,78 +0,0 @@
-"""
-Documentation:
-Workflow that involves preprocessing raw single-cell plate data and
-transforming it into normalized aggregate profiles.
-
-Parameters
-----------
-sql_files : List[str]
-  List of SQL files containing plate data
-barcodes : str
-  path pointing to the barcode file storing platemap IDs
-metadata : str
-  path pointing to plate metadata
-
-Generates
----------
-cell_counts: .csv file
-  csv file containing n_cells per well
-augmented: csv.gz file
-  Annotated aggregated profiles
-normalized: csv.gz file
-  Normalized annotated aggregate profiles
-# --------------------
-"""
-
-
-# collecting all unique IDs from plate
-configfile: "configs/configuration.yaml"
-
-
-rule aggregate:
-    input:
-        sql_files=PLATE_DATA,
-        barcodes=BARCODES,
-        metadata=METADATA_DIR,
-    output:
-        aggregate_profile=AGGREGATE_DATA,
-        cell_counts=CELL_COUNTS,
-    log:
-        "logs/aggregate_{file_name}.log",
-    conda:
-        "../envs/cytominer_env.yaml"
-    params:
-        aggregate_config=config["config_paths"]["single_cell"],
-    script:
-        "../scripts/aggregate_cells.py"
-
-
-rule annotate:
-    input:
-        aggregate_profile=AGGREGATE_DATA,
-        barcodes=BARCODES,
-        metadata=METADATA_DIR,
-    output:
-        ANNOTATED_DATA,
-    conda:
-        "../envs/cytominer_env.yaml"
-    log:
-        "logs/annotate_{file_name}.log",
-    params:
-        annotate_config=config["config_paths"]["annotate"],
-    script:
-        "../scripts/annotate.py"
-
-
-rule normalize:
-    input:
-        ANNOTATED_DATA,
-    output:
-        NORMALIZED_DATA,
-    conda:
-        "../envs/cytominer_env.yaml"
-    log:
-        "logs/normalized_{file_name}.log",
-    params:
-        normalize_config=config["config_paths"]["normalize"],
-    script:
-        "../scripts/normalize.py"
diff --git a/workflows/scripts/consensus.py b/workflows/scripts/consensus.py
index a554418c..29619446 100644
--- a/workflows/scripts/consensus.py
+++ b/workflows/scripts/consensus.py
@@ -2,7 +2,6 @@
 from pathlib import Path
 
 import pandas as pd
-import snakemake
 import yaml
 from pycytominer import consensus
 from pycytominer.operations import get_na_columns
diff --git a/workflows/workflow/cp_process.smk b/workflows/workflow/cp_process.smk
index 6e8ae4a8..cce42f02 100644
--- a/workflows/workflow/cp_process.smk
+++ b/workflows/workflow/cp_process.smk
@@ -2,10 +2,13 @@ import glob
 from cytosnake.helpers import helper_funcs as hf
 
 
-# importing Modules
+# importing rule modules
 include: "../rules/common.smk"
-include: "../rules/preprocessing.smk"
+include: "../rules/aggregate.smk"
+include: "../rules/annotate.smk"
+include: "../rules/normalize.smk"
 include: "../rules/feature_select.smk"
+include: "../rules/generate_consensus.smk"
 
 
 # expected outputs from workflow
@@ -16,3 +19,4 @@ rule all:
         ANNOTATED_DATA_EXPAND,
         NORMALIZED_DATA_EXPAND,
         SELECTED_FEATURE_DATA_EXPAND,
+        CONSENSUS_DATA,