WayScience · axiomcura · Apr 12, 2023 · Mar 14, 2023 · Mar 28, 2023 · Mar 28, 2023
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,19 +1,34 @@
 ---
 repos:
+  # remove unused imports
+  - repo: https://github.com/hadialqattan/pycln.git
+    rev: v2.1.3
+    hooks:
+      - id: pycln
+
+  # import formatter with black configurations
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        name: isort (python)
+        args: ["--profile", "black", "--filter-files"]
 
   # Code formatter for both python files and jupyter notebooks
+  # enforces PEP 8 style conventions
   - repo: https://github.com/psf/black
     rev: 22.10.0
     hooks:
       - id: black-jupyter
       - id: black
         language_version: python3.10
 
-  # remove unused imports
-  - repo: https://github.com/hadialqattan/pycln.git
-    rev: v2.1.3
+  # AI based formatter to improve readability
+  - repo: https://github.com/sourcery-ai/sourcery
+    rev: v1.1.0
     hooks:
-      - id: pycln
+      - id: sourcery
+        args: [--diff=git diff HEAD, --no-summary]
 
   # snakemake formatting
   - repo: https://github.com/snakemake/snakefmt

diff --git a/configs/analysis_configs/cytotable_convert.yaml b/configs/analysis_configs/cytotable_convert.yaml
@@ -0,0 +1,11 @@
+cytotable_convert:
+  params:
+    dest_datatype: parquet
+    source_datatype: sqlite
+    concat: True
+    join: True
+    infer_common_schema: True
+    drop_null: True
+    preset: cellprofiler_sqlite
+    log_level: ERROR
+
diff --git a/configs/configuration.yaml b/configs/configuration.yaml
@@ -1,17 +1,27 @@
-config_name: cytopipe_defaults
+config_name:
+
+env_manager: conda
 
 # computation configs
 analysis_configs:
   preprocessing:
     threads: 4
 
+# data configurations
+data_configs:
+  plate_data_format: sqlite
+
 # Analysis configuration file paths
 config_paths:
+  # general configs
+  general_configs: "configs/configuration.yaml"
+
   # CellProfiler Specific analysis configurations
   single_cell: "configs/analysis_configs/single_cell_configs.yaml"
   normalize: "configs/analysis_configs/normalize_configs.yaml"
   feature_select: "configs/analysis_configs/feature_select_configs.yaml"
   aggregate: "configs/analysis_configs/aggregate_configs.yaml"
+  cytotable_config: "configs/analysis_configs/cytotable_convert.yaml"
 
   # DeepProfiler Specific analysis configurations
   dp_data: "configs/analysis_configs/dp_data_configs.yaml"

diff --git a/configs/wf_configs/cp_process_singlecells.yaml b/configs/wf_configs/cp_process_singlecells.yaml
@@ -0,0 +1,84 @@
+name: cp_process_singlecells_configs
+
+# Documentation
+docs: |
+  Description:
+  ------------
+  Converts sqlite plate data into parquet and returns selected features in csv
+  format
+
+  Workflow Steps:
+  ---------------
+  Below the workflow steps are separated in chunks.
+
+  cytotable_convert:
+    Takes in sqlite file and converts it into a parquet file.
+
+    Uses CytoTable's convert workflow which can be found in:
+    https://github.com/cytomining/CytoTable/blob/main/cytotable/convert.py
+
+  normalize_configs:
+    Normalizes single cell morphological features
+
+    Uses Pycytominer normalization module:
+    https://github.com/cytomining/pycytominer/blob/master/pycytominer/normalize.py
+
+
+  feature_select_configs:
+    Selects morphological features from normalized dataset
+
+    Uses Pycytominer feature selection module:
+    https://github.com/cytomining/pycytominer/blob/master/pycytominer/feature_select.py
+
+
+cytotable_convert:
+  params:
+    dest_datatype: parquet
+    source_datatype: null
+    concat: True
+    join: True
+    infer_common_schema: True
+    drop_null: True
+    preset: cellprofiler_sqlite
+    log_level: ERROR
+
+normalize_configs:
+  params:
+    features: infer
+    image_features: False
+    meta_features: infer
+    samples: all
+    method: mad_robustize
+    compression_options:
+      method: gzip
+      mtime: 1
+    float_format: null
+    mad_robustize_epsilon: 1.0e-18
+    spherize_center: True
+    spherize_method: ZCA-cor
+    spherize_epsilon: 1.0e-6
+
+feature_select_configs:
+  params:
+    features: infer
+    image_features: False
+    samples: all
+    operation:
+      - variance_threshold
+      - drop_na_columns
+      - correlation_threshold
+      - drop_outliers
+      - blocklist
+    na_cutoff: 0.05
+    corr_threshold: 0.9
+    corr_method: pearson
+    freq_cut: 0.05
+    unique_cut: 0.1
+    compression_options:
+      method: gzip
+      mtime: 1
+    float_format: null
+    blocklist_file: null
+    outlier_cutoff: 15
+    noise_removal_perturb_groups: null
+    noise_removal_stdev_cutoff: null
diff --git a/cytosnake/cli/cmd.py b/cytosnake/cli/cmd.py
@@ -5,8 +5,8 @@
 
 Generates CLI interface in order to interact with CytoSnake.
 """
-import sys
 import logging
+import sys
 from pathlib import Path
 
 # cytosnake imports

diff --git a/cytosnake/common/errors.py b/cytosnake/common/errors.py
@@ -87,6 +87,10 @@ class ProjectExistsError(BaseFileExistsError):
     that the current directory has already been set up for cytosnake analysis"""
 
 
+class ExtensionError(BaseValueError):
+    """Raised when an invalid extension is encountered"""
+
+
 # -----------------------
 # Error handling functions
 # -----------------------

diff --git a/cytosnake/guards/ext_guards.py b/cytosnake/guards/ext_guards.py
@@ -0,0 +1,50 @@
+"""
+module: ext_guards.py
+
+Checks if the correct extensions are provided
+"""
+
+import pathlib
+from typing import TypeGuard
+
+from cytosnake.guards.path_guards import is_valid_path
+
+
+def has_parquet_ext(file_name: str | pathlib.Path) -> TypeGuard[str]:
+    """Checks if the provided file path is valid and has a parquet extension.
+
+    Parameters
+    ----------
+    file_name : str | pathlib.Path
+        path to file
+
+    Returns
+    -------
+    TypeGuard[str]
+        True if the path is valid and has a parquet extension, else False
+    """
+    # `str` has no `.suffix`; coerce only after the guard accepted the value
+    if not is_valid_path(file_name):
+        return False
+    return pathlib.Path(file_name).suffix in [".parquet", ".parq", ".pq"]
+
+
+def has_sqlite_ext(file_name: str | pathlib.Path) -> TypeGuard[str]:
+    """Checks if the provided file path is valid and has a sqlite extension.
+
+    Parameters
+    ----------
+    file_name : str | pathlib.Path
+        path to file
+
+    Returns
+    -------
+    TypeGuard[str]
+        True if the path is valid and has a sqlite extension, else False
+    """
+
+    # `str` has no `.suffix`, so the value is coerced to `pathlib.Path`
+    # once the guard has accepted it
+    if not is_valid_path(file_name):
+        return False
+    return pathlib.Path(file_name).suffix in [".sqlite", ".sqlite3"]
diff --git a/cytosnake/guards/path_guards.py b/cytosnake/guards/path_guards.py
@@ -9,20 +9,19 @@
     - valid path strings
 """
 
-from pathlib import Path
+import pathlib
 from typing import TypeGuard
 
 
-def is_valid_path(val: object) -> TypeGuard[Path]:
+def is_valid_path(val: object) -> TypeGuard[pathlib.Path]:
     """checks if provided value is a valid path"""
 
-    # check if the val is valid type
-    # -- if string, convert to Path
-    accepted_types = (str, Path)
-    if not isinstance(val, accepted_types):
+    # type checking
+    if not isinstance(val, (str, pathlib.Path)):
         return False
+    # plain conversion: resolve(strict=True) would raise on a missing path
     if isinstance(val, str):
-        val = Path(val)
+        val = pathlib.Path(val)
 
     # check if the path exists
-    return bool(val.exists())
+    return val.exists()
diff --git a/cytosnake/helpers/helper_funcs.py b/cytosnake/helpers/helper_funcs.py
@@ -6,14 +6,17 @@
 """
 
 
+from pathlib import Path
 from typing import Optional
+
 from snakemake.io import expand
-from pathlib import Path
-from cytosnake.utils.config_utils import load_meta_path_configs
+
 from cytosnake.guards.path_guards import is_valid_path
+from cytosnake.utils.config_utils import load_general_configs, load_meta_path_configs
 
 # loading in config as global variables
 PATHS = load_meta_path_configs()
+CYTOSNAKE_CONFIGS = load_general_configs()
 
 
 # ------------------------------
@@ -151,7 +154,6 @@ def annotated_output() -> str:
 
 
 def normalized_output() -> str:
-
     """Generates output path for normalized dataset
 
     Returns
@@ -169,7 +171,6 @@ def normalized_output() -> str:
 
 
 def selected_features_output() -> str:
-
     """Generates output path for selected features dataset
 
     Returns
@@ -187,7 +188,6 @@ def selected_features_output() -> str:
 
 
 def consensus_output() -> str:
-
     """Generates output path for consensus  dataset
 
     Returns
@@ -204,6 +204,23 @@ def consensus_output() -> str:
     return str(results_path / f"{output_name}.{ext}")
 
 
+def parquet_output() -> str:
+    """Generates output path for parquet profiles
+
+    Returns
+    -------
+    str
+        path under the project's `data` directory for generated parquet
+        files; the `{file_name}` placeholder is left unformatted
+    """
+    data_path = Path(PATHS["project_dir_path"]) / "data"
+    output_name = "{file_name}"
+    ext = "parquet"
+
+    # constructing file output string
+    return str(data_path / f"{output_name}.{ext}")
+
+
 # ------------------------------
 # Formatting I/O functions
 # ------------------------------

diff --git a/cytosnake/utils/config_utils.py b/cytosnake/utils/config_utils.py
@@ -30,9 +30,7 @@ def load_configs(config_path: str | Path) -> dict:
     if not is_valid_path(config_path):
         raise FileNotFoundError("Invalid config path provided")
     if isinstance(config_path, str):
-        config_path = Path(config_path).absolute()
-    if not config_path.is_absolute():
-        config_path = config_path.absolute()
+        config_path = Path(config_path).resolve(strict=True)
 
     # loading in config_path
     with open(config_path, "r") as yaml_contents:
@@ -41,6 +39,18 @@ def load_configs(config_path: str | Path) -> dict:
     return loaded_configs
 
 
+def load_general_configs() -> dict:
+    """Loads cytosnake's general configurations
+
+    Returns
+    -------
+    dict
+        configs loaded from `configuration.yaml` in `cp.get_config_dir_path()`
+    """
+    general_config_file = cp.get_config_dir_path() / "configuration.yaml"
+    return load_configs(general_config_file)
+
+
 def load_meta_path_configs() -> dict:
     """Loads the metadata path from `.cytosnake/_paths.yaml` file