diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 07dfd209..60c1aea3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,21 @@ --- repos: + # remove unused imports + - repo: https://github.com/hadialqattan/pycln.git + rev: v2.1.3 + hooks: + - id: pycln + + # import formatter with black configurations + - repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort + name: isort (python) + args: ["--profile", "black", "--filter-files"] # Code formatter for both python files and jupyter notebooks + # support pep 8 standards - repo: https://github.com/psf/black rev: 22.10.0 hooks: @@ -9,11 +23,12 @@ repos: - id: black language_version: python3.10 - # remove unused imports - - repo: https://github.com/hadialqattan/pycln.git - rev: v2.1.3 + # AI based formatter to improve readability + - repo: https://github.com/sourcery-ai/sourcery + rev: v1.1.0 hooks: - - id: pycln + - id: sourcery + args: [--diff=git diff HEAD, --no-summary] # snakemake formatting - repo: https://github.com/snakemake/snakefmt diff --git a/configs/analysis_configs/cytotable_convert.yaml b/configs/analysis_configs/cytotable_convert.yaml new file mode 100644 index 00000000..5883627d --- /dev/null +++ b/configs/analysis_configs/cytotable_convert.yaml @@ -0,0 +1,11 @@ +cytotable_convert: + params: + dest_datatype: parquet + source_datatype: sqlite + concat: True + join: True + infer_common_schema: True + drop_null: True + preset: cellprofiler_sqlite + log_level: ERROR + diff --git a/configs/configuration.yaml b/configs/configuration.yaml index 143fe989..c55687d0 100644 --- a/configs/configuration.yaml +++ b/configs/configuration.yaml @@ -1,17 +1,27 @@ -config_name: cytopipe_defaults +config_name: + +env_manager: conda # computation configs analysis_configs: preprocessing: threads: 4 +# data configurations +data_configs: + plate_data_format: sqlite + # Analysis configuration file paths config_paths: + # general configs + general_configs: 
"configs/configuration.yaml" + # CellProfiler Specific analysis configurations single_cell: "configs/analysis_configs/single_cell_configs.yaml" normalize: "configs/analysis_configs/normalize_configs.yaml" feature_select: "configs/analysis_configs/feature_select_configs.yaml" aggregate: "configs/analysis_configs/aggregate_configs.yaml" + cytotable_config: "configs/analysis_configs/cytotable_convert.yaml" # DeepProfiler Specific analysis configurations dp_data: "configs/analysis_configs/dp_data_configs.yaml" diff --git a/configs/wf_configs/cp_process_singlecells.yaml b/configs/wf_configs/cp_process_singlecells.yaml new file mode 100644 index 00000000..290f2faa --- /dev/null +++ b/configs/wf_configs/cp_process_singlecells.yaml @@ -0,0 +1,84 @@ +name: cp_process_singlecells_configs + +# Documentation +docs: | + Description: + ------------ + Converts sqlite plate data into parquet and returns selected features in csv + format + + Workflow Steps: + --------------- + Below the workflow steps are separated in chunks. + + cytotable_convert: + Takes in sqlite file and converts it into a parquet file. 
+ + Uses CytoTable's convert workflow which can be found in: + https://cytomining.github.io/CytoTable/python-api.html#cytotable.convert.convert + + normalize_configs: + Normalizes single cell morphological features + + Uses Pycytominer normalization module: + https://github.com/cytomining/pycytominer/blob/master/pycytominer/normalize.py + + + feature_select_configs: + Selects morphological features from normalized dataset + + Uses Pycytominer feature extraction module + https://github.com/cytomining/pycytominer/blob/master/pycytominer/feature_select.py + + +cytotable_convert: + params: + dest_datatype: parquet + source_datatype: sqlite + concat: True + join: True + infer_common_schema: True + drop_null: True + preset: cellprofiler_sqlite + log_level: ERROR + +normalize_configs: + params: + features: infer + image_features: False + meta_features: infer + samples: all + method: mad_robustize + compression_options: + method: gzip + mtime: 1 + float_format: null + mad_robustize_epsilon: 1.0e-18 + spherize_center: True + spherize_method: ZCA-cor + spherize_epsilon: 1.0e-6 + +feature_select_configs: + params: + features: infer + image_features: False + samples: all + operation: + - variance_threshold + - drop_na_columns + - correlation_threshold + - drop_outliers + - blocklist + na_cutoff: 0.05 + corr_threshold: 0.9 + corr_method: pearson + freq_cut: 0.05 + unique_cut: 0.1 + compression_options: + method: gzip + mtime: 1 + float_format: null + blocklist_file: null + outlier_cutoff: 15 + noise_removal_perturb_groups: null + noise_removal_stdev_cutoff: null diff --git a/cytosnake/cli/cmd.py b/cytosnake/cli/cmd.py index c9fd8996..7609383b 100644 --- a/cytosnake/cli/cmd.py +++ b/cytosnake/cli/cmd.py @@ -5,8 +5,8 @@ Generates CLI interface in order to interact with CytoSnake. 
""" -import sys import logging +import sys from pathlib import Path # cytosnake imports diff --git a/cytosnake/common/errors.py b/cytosnake/common/errors.py index fa69e8d1..57c02667 100644 --- a/cytosnake/common/errors.py +++ b/cytosnake/common/errors.py @@ -87,6 +87,10 @@ class ProjectExistsError(BaseFileExistsError): that the current directory has already been set up for cytosnake analysis""" +class ExtensionError(BaseValueError): + """Raised when invalid extensions are captured""" + + # ----------------------- # Error handling functions # ----------------------- diff --git a/cytosnake/guards/ext_guards.py b/cytosnake/guards/ext_guards.py new file mode 100644 index 00000000..9c76e12a --- /dev/null +++ b/cytosnake/guards/ext_guards.py @@ -0,0 +1,50 @@ +""" +module: ext_guards.py + +Checks if the correct extensions are provided +""" + +import pathlib +from typing import TypeGuard + +from cytosnake.guards.path_guards import is_valid_path + + +def has_parquet_ext(file_name: str | pathlib.Path) -> TypeGuard[str]: + """Checks if the provided file path contains parquet file extension. + Parameters + ---------- + file_name : str | pathlib.Path + path to file + + Returns + ------- + TypeGuard[str] + return True if it is a parquet file, else False + """ + return ( + pathlib.Path(file_name).suffix in [".parquet", ".parq", ".pq"] + if is_valid_path(file_name) + else False + ) + + +def has_sqlite_ext(file_name: str | pathlib.Path) -> TypeGuard[str]: + """Checks if the provided file path contains sqlite file extension. 
+ + Parameters + ---------- + file_name : str | pathlib.Path + path to file + + Returns + ------- + TypeGuard[str] + return True if it is a sqlite file, else False + """ + + return ( + pathlib.Path(file_name).suffix in [".sqlite", ".sqlite3"] + if is_valid_path(file_name) + else False + ) diff --git a/cytosnake/guards/path_guards.py b/cytosnake/guards/path_guards.py index 13b9b46f..27c24ee4 100644 --- a/cytosnake/guards/path_guards.py +++ b/cytosnake/guards/path_guards.py @@ -9,20 +9,19 @@ - valid path strings """ -from pathlib import Path +import pathlib from typing import TypeGuard -def is_valid_path(val: object) -> TypeGuard[Path]: +def is_valid_path(val: object) -> TypeGuard[pathlib.Path]: """checks if provided value is a valid path""" - # check if the val is valid type - # -- if string, convert to Path - accepted_types = (str, Path) - if not isinstance(val, accepted_types): + # type checking + if not isinstance(val, (str, pathlib.Path)): return False + # convert to pathlib.Path if isinstance(val, str): - val = Path(val) + val = pathlib.Path(val).resolve() # check if the path exists - return bool(val.exists()) + return val.exists() diff --git a/cytosnake/helpers/helper_funcs.py b/cytosnake/helpers/helper_funcs.py index 8033dea8..02da7f20 100644 --- a/cytosnake/helpers/helper_funcs.py +++ b/cytosnake/helpers/helper_funcs.py @@ -6,14 +6,17 @@ """ +from pathlib import Path from typing import Optional + from snakemake.io import expand -from pathlib import Path -from cytosnake.utils.config_utils import load_meta_path_configs + from cytosnake.guards.path_guards import is_valid_path +from cytosnake.utils.config_utils import load_general_configs, load_meta_path_configs # loading in config as global variables PATHS = load_meta_path_configs() +CYTOSNAKE_CONFIGS = load_general_configs() # ------------------------------ @@ -151,7 +154,6 @@ def annotated_output() -> str: def normalized_output() -> str: - """Generates output path for normalized dataset Returns @@ -169,7 +171,6 
@@ def normalized_output() -> str: def selected_features_output() -> str: - """Generates output path for selected features dataset Returns @@ -187,7 +188,6 @@ def selected_features_output() -> str: def consensus_output() -> str: - """Generates output path for consensus dataset Returns @@ -204,6 +204,23 @@ def consensus_output() -> str: return str(results_path / f"{output_name}.{ext}") +def parquet_output(): + """Generates output path for parquet profiles + + Returns + ------- + str + path to generated parquet files + + """ + data_path = Path(PATHS["project_dir_path"]) / "data" + output_name = "{file_name}" + ext = "parquet" + + # constructing file output string + return str(data_path / f"{output_name}.{ext}") + + # ------------------------------ # Formatting I/O functions # ------------------------------ diff --git a/cytosnake/utils/config_utils.py b/cytosnake/utils/config_utils.py index ad31090a..1454291d 100644 --- a/cytosnake/utils/config_utils.py +++ b/cytosnake/utils/config_utils.py @@ -30,9 +30,7 @@ def load_configs(config_path: str | Path) -> dict: if not is_valid_path(config_path): raise FileNotFoundError("Invalid config path provided") if isinstance(config_path, str): - config_path = Path(config_path).absolute() - if not config_path.is_absolute(): - config_path = config_path.absolute() + config_path = Path(config_path).resolve(strict=True) # loading in config_path with open(config_path, "r") as yaml_contents: @@ -41,6 +39,18 @@ def load_configs(config_path: str | Path) -> dict: return loaded_configs +def load_general_configs() -> dict: + """Loads cytosnake's general configurations + + Returns: + ------- + dict + dictionary containing the cytosnake general configs + """ + config_dir_path = cp.get_config_dir_path() / "configuration.yaml" + return load_configs(config_dir_path) + + def load_meta_path_configs() -> dict: """Loads the metadata path from `.cytosnake/_paths.yaml` file diff --git a/cytosnake/utils/cyto_paths.py b/cytosnake/utils/cyto_paths.py index 
378b6cb7..464177ac 100644 --- a/cytosnake/utils/cyto_paths.py +++ b/cytosnake/utils/cyto_paths.py @@ -105,12 +105,9 @@ def get_project_root() -> Path: """ # get current working directory - project_dir = Path().absolute() - - # check if the `.cytosnake` folder exist - project_folder = project_dir / ".cytosnake" - if not project_folder.exists(): - raise FileNotFoundError("Current directory is not a project folder") + project_dir = find_project_dir() + if project_dir is None: + raise NotADirectoryError("Unable to find project directory") return project_dir @@ -139,20 +136,32 @@ def get_workflow_fpaths() -> dict: return file_search(workflow_path) -def get_config_fpaths() -> dict: - """Obtains all file paths located in the `configs` folder as a dictionary. +def get_config_dir_path() -> Path: + """Returns path to configuration folder Returns ------- - dict - structured dictionary directory name and file paths as key value pairs + Path + Path to config directory """ + proj_root_path = get_project_root() config_path = proj_root_path / "configs" if not is_valid_path(config_path): raise FileNotFoundError("Unable to find config directory") - return file_search(config_path) + return config_path + + +def get_config_fpaths() -> dict: + """Obtains all file paths located in the `configs` folder as a dictionary. 
+ + Returns + ------- + dict + structured dictionary with directory name and file paths as key value pairs + """ + return file_search(get_config_dir_path()) def get_project_dirpaths(args: Namespace) -> dict: diff --git a/cytosnake/utils/file_utils.py b/cytosnake/utils/file_utils.py index 40bb92df..f64eb03a 100644 --- a/cytosnake/utils/file_utils.py +++ b/cytosnake/utils/file_utils.py @@ -97,7 +97,7 @@ def find_project_dir(steps: Optional[int] = 10) -> Path | None: # check if the file is a directory and has the name `cytosnake` # -- if true, return the complete path if _file.is_dir() and _file.name == ".cytosnake": - return _file.parent.absolute() + return _file.parent.resolve(strict=True) start_point = start_point.parent diff --git a/workflows/envs/cytotable.yaml b/workflows/envs/cytotable.yaml new file mode 100644 index 00000000..5d52d2d2 --- /dev/null +++ b/workflows/envs/cytotable.yaml @@ -0,0 +1,10 @@ +name: cytotable +channels: + - conda-forge + - anaconda + - defaults +dependencies: + - python==3.8 + - pip + - pip: + - git+https://github.com/cytomining/CytoTable.git@f336d26baf96b8b3d9ed6ca07f617895030bf038 diff --git a/workflows/rules/common.smk b/workflows/rules/common.smk index 2561edaf..bdb13d5e 100644 --- a/workflows/rules/common.smk +++ b/workflows/rules/common.smk @@ -14,10 +14,16 @@ DATA_DIR = str(load_data_path_configs()) # ------ # INPUTS # ------ -# -- generating a wild card list (just the file base names) +# generating a wild card list (just the file base names) plate_name = hf.get_file_basenames(DATA_DIR, ext_target="sqlite") -# -- getting the rest of the input paths from helper functions +# getting the rest of the input paths from helper functions + +# level 2 data: Single cell data used as inputs along with associated +# metadata and barcodes +# +# METADATA_DIR: contains information of what has been added to the cells +# BARCODES: Unique id that points to a specific level 2 dataset (plate data) PLATE_DATA = hf.get_plate_data() BARCODES = 
hf.get_barcodes() METADATA_DIR = hf.get_metadata_dir() @@ -25,20 +31,38 @@ METADATA_DIR = hf.get_metadata_dir() # ------- # OUTPUTS # ------- -# -- extended = list of the file names with a given wildcard -AGGREGATE_DATA = hf.aggregate_output() -CELL_COUNTS = hf.cell_count_output() +# Things to know: +# extended = list of the file names with a given wildcard +# +# To understand the level of data, please refer to PyCytominer documentation +# https://github.com/cytomining/pycytominer -CELL_COUNTS_EXPANDED = expand(CELL_COUNTS, file_name=plate_name) -AGGREGATE_DATA_EXPAND = expand(AGGREGATE_DATA, file_name=plate_name) +# Level 2 data: converted into parquet format +CYTOTABLE_OUTPUT_DATA = hf.parquet_output() +CYTOTABLE_OUTPUT_DATA_EXTENDED = expand(CYTOTABLE_OUTPUT_DATA, file_name=plate_name) +# level 2.5 data: annotated level 2 data based on given metadata (e.g treatments) ANNOTATED_DATA = hf.annotated_output() ANNOTATED_DATA_EXPAND = expand(ANNOTATED_DATA, file_name=plate_name) +# level 3 data: aggregated profile based on given aggregation level +# (e.g aggregating single-cell data to the well level) +AGGREGATE_DATA = hf.aggregate_output() +AGGREGATE_DATA_EXPAND = expand(AGGREGATE_DATA, file_name=plate_name) + +# level 4a data: normalized profile NORMALIZED_DATA = hf.normalized_output() NORMALIZED_DATA_EXPAND = expand(NORMALIZED_DATA, file_name=plate_name) +# level 4b: selected features profile SELECTED_FEATURE_DATA = hf.selected_features_output() SELECTED_FEATURE_DATA_EXPAND = expand(SELECTED_FEATURE_DATA, file_name=plate_name) +# level 5: Consensus profile captures unique signatures that resulted from +# any external factor (e.g perturbations) CONSENSUS_DATA = hf.consensus_output() + +# other outputs +# Cell counts: cell counts per well in level 2 data +CELL_COUNTS = hf.cell_count_output() +CELL_COUNTS_EXPANDED = expand(CELL_COUNTS, file_name=plate_name) diff --git a/workflows/rules/cytotable_convert.smk b/workflows/rules/cytotable_convert.smk new file mode 100644 index 
00000000..aad3f596 --- /dev/null +++ b/workflows/rules/cytotable_convert.smk @@ -0,0 +1,32 @@ +""" +rule module: cytotable_convert.smk + +Utilizes CytoTable's convert workflow module: +https://github.com/cytomining/CytoTable/blob/main/cytotable/convert.py + +Parameters: +----------- + + +Returns: +-------- + parquet files stored within the data/ folder + +""" + + +configfile: "configs/configuration.yaml" + + +rule convert: + input: + PLATE_DATA, + output: + CYTOTABLE_OUTPUT_DATA, + conda: + "../envs/cytotable.yaml" + params: + data_configs=config["data_configs"]["plate_data_format"], + cytotable_config=config["cytotable_convert"], + script: + "../scripts/convert.py" diff --git a/workflows/rules/feature_select.smk b/workflows/rules/feature_select.smk index ef36890f..bb863073 100644 --- a/workflows/rules/feature_select.smk +++ b/workflows/rules/feature_select.smk @@ -30,7 +30,7 @@ rule feature_select: output: SELECTED_FEATURE_DATA_EXPAND, params: - feature_select_config=config["config_paths"]["feature_select"], + feature_select_config=config["feature_select_configs"], log: "logs/feature_select.log", conda: diff --git a/workflows/rules/normalize.smk b/workflows/rules/normalize.smk index a374970b..e4c1a58a 100644 --- a/workflows/rules/normalize.smk +++ b/workflows/rules/normalize.smk @@ -27,7 +27,7 @@ configfile: "configs/configuration.yaml" rule normalize: input: - ANNOTATED_DATA, + CYTOTABLE_OUTPUT_DATA, output: NORMALIZED_DATA, conda: @@ -35,6 +35,6 @@ rule normalize: log: "logs/normalized_{file_name}.log", params: - normalize_config=config["config_paths"]["normalize"], + normalize_config=config["normalize_configs"], script: "../scripts/normalize.py" diff --git a/workflows/scripts/aggregate_cells.py b/workflows/scripts/aggregate_cells.py index b5ef75d2..e42e3626 100644 --- a/workflows/scripts/aggregate_cells.py +++ b/workflows/scripts/aggregate_cells.py @@ -3,9 +3,9 @@ from pathlib import Path import pandas as pd -from snakemake.script import Snakemake import yaml 
from pycytominer.cyto_utils.cells import SingleCells +from snakemake.script import Snakemake def aggregate( @@ -136,9 +136,12 @@ def aggregate( logging.info(f"Aggregate profile saved in : {aggregate_file_out}") +# execute main code for aggregation if __name__ == "__main__": # snakemake inputs + # more information how snakemake transfers workflow variables to scripts: + # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#python plate_data = str(snakemake.input["sql_files"]) barcode_path = str(snakemake.input["barcodes"]) metadata_dir_path = str(snakemake.input["metadata"]) @@ -147,6 +150,7 @@ def aggregate( config_path = str(snakemake.params["aggregate_config"]) log_path = str(snakemake.log) + # execute aggregation function aggregate( sql_file=plate_data, metadata_dir=metadata_dir_path, diff --git a/workflows/scripts/annotate.py b/workflows/scripts/annotate.py index 4451c150..1e3ba50b 100644 --- a/workflows/scripts/annotate.py +++ b/workflows/scripts/annotate.py @@ -107,9 +107,12 @@ def annotate_cells( logging.info(f"Annotated files saved: {annotate_file_out}") +# annotates dataset with given metadata if __name__ == "__main__": # snakemake inputs + # more information how snakemake transfers workflow variables to scripts: + # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#python aggregate_data_path = str(snakemake.input["aggregate_profile"]) annotate_data_output = str(snakemake.output) barcode_path = str(snakemake.input["barcodes"]) diff --git a/workflows/scripts/build_dp_consensus.py b/workflows/scripts/build_dp_consensus.py index dc1bb5af..b25c9551 100644 --- a/workflows/scripts/build_dp_consensus.py +++ b/workflows/scripts/build_dp_consensus.py @@ -142,9 +142,12 @@ def build_dp_consensus( dp_consensus_profile.to_csv(outname, sep="\t", index=False) +# building consensus from DeepProfiler datasets if __name__ == "__main__": # snakemake inputs + # more information how snakemake transfers workflow variables to scripts: + # 
https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#python`` norm_agg_data = str(snakemake.input) out_name = str(snakemake.output) consensus_config_path = str(snakemake.params["consensus_config"]) diff --git a/workflows/scripts/consensus.py b/workflows/scripts/consensus.py index 29619446..ee381360 100644 --- a/workflows/scripts/consensus.py +++ b/workflows/scripts/consensus.py @@ -96,9 +96,12 @@ def build_consensus( x_consensus_df.to_csv(consensus_file_out, sep="\t", index=False) +# build consensus profiles from CellProfile data if __name__ in "__main__": # loading inputs + # more information how snakemake transfers workflow variables to scripts: + # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#python`` inputs = [str(f_in) for f_in in snakemake.input] output = str(snakemake.output) config_path = str(snakemake.params["consensus_configs"]) diff --git a/workflows/scripts/convert.py b/workflows/scripts/convert.py new file mode 100644 index 00000000..b4ea2125 --- /dev/null +++ b/workflows/scripts/convert.py @@ -0,0 +1,84 @@ +""" +script: convert.py + +converts sqlite (or other formats) into parquet files +""" +import pathlib +from typing import List, Union + +import cytotable + + +def convert_to_parquet( + input_file: Union[str, List[str]], + out_path: str, + target_ext: str, + convert_configs: str, +): + """Takes in a file or a list of files that will be converted into parquet. 
+ + Parameters + ---------- + input_file : Union[str, List[str]] + files or list of files converted into `.parquet` files + out_path : str + path where generated parquet files will be saved + target_ext : str + dictates which file format + convert_configs : dict + dictionary containing cytotable.convert() configs + + Raises + ------ + ValueError + raised if an unsupported file extension is provided + """ + # checking if user has parquet file + if target_ext == ".parquet" and pathlib.Path(input_file).suffix not in [ + ".sqlite", + ".csv", + ]: + raise ValueError( + "Converting to parquet files requires sqlite file. " + f"File provided: {input_file}" + ) + + # convert sqlite file into parquet + cytotable_config = convert_configs["params"] + cytotable.convert( + source_path=input_file, + dest_path=out_path, + dest_datatype=cytotable_config["dest_datatype"], + source_datatype=cytotable_config["source_datatype"], + concat=cytotable_config["concat"], + join=cytotable_config["join"], + infer_common_schema=cytotable_config["infer_common_schema"], + drop_null=cytotable_config["drop_null"], + preset=cytotable_config["preset"], + log_level=cytotable_config["log_level"], + ) + + +def main(): + """Execution of the main script""" + + # grabbing snakemake inputs from workflow + # more information how snakemake transfers workflow variables to scripts: + # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#python + plate_data = str(snakemake.input) + output_path = str(snakemake.output) + general_configs = snakemake.params["data_configs"] + cytotable_configs = snakemake.params["cytotable_config"] + + # executing conversion input file to parquet + convert_to_parquet( + input_file=plate_data, + out_path=output_path, + convert_configs=cytotable_configs, + target_ext=general_configs, + ) + + +# executes the main function for conversion +if __name__ == "__main__": + main() diff --git a/workflows/scripts/dp_aggregate.py b/workflows/scripts/dp_aggregate.py index 
db3338e9..83a153fb 100644 --- a/workflows/scripts/dp_aggregate.py +++ b/workflows/scripts/dp_aggregate.py @@ -87,9 +87,12 @@ def aggregate_dp_profiles( dp_agg_df.to_csv(outname) +# aggregating DeepProfiler level 2 dataset if __name__ == "__main__": # collecting snakemake inputs + # more information how snakemake transfers workflow variables to scripts: + # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#python`` dp_path = str(snakemake.input["dp_features_dir"]) index_file_path = str(snakemake.input["index_file"]) out_name = str(snakemake.output) diff --git a/workflows/scripts/dp_build_consensus.py b/workflows/scripts/dp_build_consensus.py index fbcc4f15..087bc6ab 100644 --- a/workflows/scripts/dp_build_consensus.py +++ b/workflows/scripts/dp_build_consensus.py @@ -48,13 +48,17 @@ def build_dp_consensus(dp_profile: str, outname: str, config: str): dp_consensus_profile.to_csv(outname, sep="\t", index=False) +# building consensus profile from deep profiler features if __name__ == "__main__": # snakemake inputs + # more information how snakemake transfers workflow variables to scripts: + # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#python`` norm_agg_dp_profile = str(snakemake.input) out_name = str(snakemake.output) config_path = str(snakemake.input) + # building consensus profiles build_dp_consensus( dp_profile=norm_agg_dp_profile, outname=out_name, config=config_path ) diff --git a/workflows/scripts/dp_normalize.py b/workflows/scripts/dp_normalize.py index 5b0c5418..558df556 100644 --- a/workflows/scripts/dp_normalize.py +++ b/workflows/scripts/dp_normalize.py @@ -118,9 +118,12 @@ def normalize_aggregate_dp_profiles( ) +# normalize deep profiler datasets if __name__ == "__main__": # snakemake inputs + # more information how snakemake transfers workflow variables to scripts: + # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#python`` agg_profile_path = str(snakemake.input) out_name = str(snakemake.output) 
config_path = str(snakemake.params["normalize_config"]) diff --git a/workflows/scripts/feature_select.py b/workflows/scripts/feature_select.py index 31c2874c..7e0873fd 100644 --- a/workflows/scripts/feature_select.py +++ b/workflows/scripts/feature_select.py @@ -1,7 +1,6 @@ import logging from pathlib import Path -import yaml from pycytominer.feature_select import feature_select @@ -24,7 +23,8 @@ def feature_selection( Returns ------- - Generates output + None + Generates a csv file containing the selected features. """ # initiating logger @@ -41,23 +41,11 @@ def feature_selection( # loading configs logging.info(f"Loading feature selection configuration from: {config}") - # -- checking if the config file exists - feature_select_obj = Path(config) - if not feature_select_obj.is_file(): - e_msg = "Unable to find Feature Selection configuration file" - logging.error(e_msg) - raise FileNotFoundError(e_msg) - - # -- reading config parameters - feature_select_config_path = feature_select_obj.absolute() - with open(feature_select_config_path, "r") as yaml_contents: - feature_select_config = yaml.safe_load(yaml_contents)["feature_select_configs"][ - "params" - ] - logging.info(f"Feature Selection configuration loaded") + feature_select_config = config["params"] + logging.info("Feature Selection configuration loaded") # Feature selection - logging.info(f"Conducting feature selection") + logging.info("Conducting feature selection") feature_select( normalized_profile, features=feature_select_config["features"], @@ -82,10 +70,15 @@ def feature_selection( logging.info(f"Selected features saved: {out_file}") +# conduct feature selection on datasets if __name__ == "__main__": + + # snakemake inputs + # more information how snakemake transfers workflow variables to scripts: + # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#python`` all_norm_profile = [str(f_in) for f_in in snakemake.input] out_files = [str(f_out) for f_out in snakemake.output] - config_path = 
str(snakemake.params["feature_select_config"]) + config_path = snakemake.params["feature_select_config"] io_files = zip(all_norm_profile, out_files) log_path = str(snakemake.log) diff --git a/workflows/scripts/merge_logs.py b/workflows/scripts/merge_logs.py index 637d3b5d..8b2b8ad3 100644 --- a/workflows/scripts/merge_logs.py +++ b/workflows/scripts/merge_logs.py @@ -131,9 +131,12 @@ def combine_logs(logs: list[str], outname: str) -> None: shutil.copy(outname, archive_dir_path) +# merging all scripts into one single script file if __name__ == "__main__": # loading snakemake inputs + # more information how snakemake transfers workflow variables to scripts: + # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#python log_files = list(snakemake.input) out_name = str(snakemake.output) diff --git a/workflows/scripts/normalize.py b/workflows/scripts/normalize.py index a90f035a..314ff3c1 100644 --- a/workflows/scripts/normalize.py +++ b/workflows/scripts/normalize.py @@ -40,28 +40,7 @@ def normalization( # loading parameters logging.info(f"Loading Annotation configuration from: {config}") - - normalize_obj = Path(config) - normalize_config_path = normalize_obj.absolute() - if not normalize_obj.is_file(): - e_msg = "Unable to find Normalization configuration file" - logging.error(e_msg) - raise FileNotFoundError(e_msg) - - with open(normalize_config_path, "r") as yaml_contents: - normalize_config = yaml.safe_load(yaml_contents)["normalize_configs"]["params"] - logging.info("Annotation configuration loaded") - - meta_features = [ - "Metadata_Plate", - "Metadata_Well", - "Metadata_WellRow", - "Metadata_WellCol", - "Metadata_gene_name", - "Metadata_pert_name", - "Metadata_broad_sample", - "Metadata_cell_line", - ] + normalize_config = config["params"] # normalizing annotated aggregated profiles logging.info(f"Normalizing annotated aggregated profiles: {anno_file}") @@ -69,7 +48,7 @@ def normalization( anno_file, features=normalize_config["features"], 
image_features=normalize_config["image_features"], - meta_features=meta_features, + meta_features=normalize_config["meta_features"], samples=normalize_config["samples"], method=normalize_config["method"], output_file=norm_outfile, @@ -83,11 +62,14 @@ def normalization( logging.info(f"Normalized aggregated profile saved: {norm_outfile}") +# executes normalization protocol if __name__ == "__main__": # snakemake inputs + # more information how snakemake transfers workflow variables to scripts: + # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#python annotated_data_path = str(snakemake.input) - config_path = str(snakemake.params["normalize_config"]) + normalize_configs = snakemake.params["normalize_config"] normalized_data_output = str(snakemake.output) log_path = str(snakemake.log) @@ -95,6 +77,6 @@ def normalization( normalization( anno_file=annotated_data_path, norm_outfile=normalized_data_output, - config=config_path, + config=normalize_configs, log_file=log_path, ) diff --git a/workflows/workflow/cp_process_singlecells.smk b/workflows/workflow/cp_process_singlecells.smk new file mode 100644 index 00000000..ec49c0f1 --- /dev/null +++ b/workflows/workflow/cp_process_singlecells.smk @@ -0,0 +1,37 @@ +""" +workflow: cp_process_singlecells.smk + +Description: +------------ +Converts sqlite plate data into parquet and returns selected features in csv +format + +Parameters: +---------- +input: + Plate data in sqlite format +output: + Selected features in csv format + +Returns +------- + Selected morphological features +""" + + +# importing workflow configs +configfile: "./configs/wf_configs/cp_process_singlecells.yaml" + + +# importing modules +include: "../rules/common.smk" +include: "../rules/cytotable_convert.smk" +include: "../rules/normalize.smk" +include: "../rules/feature_select.smk" + + +rule all: + input: + CYTOTABLE_OUTPUT_DATA_EXTENDED, + NORMALIZED_DATA_EXPAND, + SELECTED_FEATURE_DATA_EXPAND,