diff --git a/README.md b/README.md index 4008358..a20cf3d 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Kinex - Kinome Exploration Tool -**Kinex** is a Python package for infering causal kinases from phosphoproteomics data. +**Kinex** is a Python package for inferring causal kinases from phosphoproteomics data. -Paper: Kinex infers causal kinases from phosphoproteomics data. https://doi.org/10.1101/2023.11.23.568445 +Paper: Kinex infers causal kinases from phosphoproteomics data. [https://doi.org/10.1101/2023.11.23.568445](https://doi.org/10.1101/2023.11.23.568445) ## Main Features @@ -13,13 +13,13 @@ Paper: Kinex infers causal kinases from phosphoproteomics data. https://doi.org/ ## Requirements - [conda](https://docs.conda.io/en/latest/miniconda.html) -- python 3.11 +- Python 3.11 ## Installation ### From Conda -``` +```sh # Create and activate your conda environment conda create --name kinex conda activate kinex @@ -28,10 +28,10 @@ conda activate kinex conda install -c bioconda kinex ``` -### From source +### From Source -``` -# Create and activate a python 3.11 conda environment +```sh +# Create and activate a Python 3.11 conda environment conda create --name kinex conda activate kinex conda install python=3.11 @@ -44,28 +44,43 @@ cd kinex pip install . ``` -## Quick start +## Quick Start -1. Import package and create Kinex object -``` +#### 1. Import Package and Create Kinex Object + +```python from kinex import Kinex import pandas as pd +``` -# Read scoring matrices from zenodo -scoring_matrix_ser_thr = pd.read_csv("https://zenodo.org/records/13964893/files/scoring_matrix_ser_thr_82k_sorted.csv.gz?download=1", compression="gzip") -scoring_matrix_tyr = pd.read_csv("https://zenodo.org/records/13964893/files/scoring_matrix_tyr_7k_sorted.csv.gz?download=1", compression="gzip") +##### Create Kinex Object -# Create Kinex object -kinex = Kinex(scoring_matrix_ser_thr, scoring_matrix_tyr) -``` -2. Score a sequence -``` +1. 
With Predefined Matrices: + + ```python + kinex = Kinex() + ``` + +2. With Your Custom Matrices: + + ```python + kinex = Kinex(scoring_matrix_ser_thr=pd.read_csv('path_to_ser_thr_matrix.csv'), scoring_matrix_tyr=pd.read_csv('path_to_tyr_matrix.csv')) + ``` + +Predefined matrices can be found here: +- [Scoring Matrix for Serine/Threonine](https://zenodo.org/records/13964893/files/scoring_matrix_ser_thr_82k_sorted.csv.gz?download=1) +- [Scoring Matrix for Tyrosine](https://zenodo.org/records/13964893/files/scoring_matrix_tyr_7k_sorted.csv.gz?download=1) + +#### 2. Score a Sequence + +```python sequence = "FVKQKAY*QSPQKQ" res = kinex.get_score(sequence) ``` -3. Enrichment analysis -``` +#### 3. Enrichment Analysis + +```python enrich = kinex.get_enrichment(input_sites, fc_threshold=1.5, phospho_priming=False, favorability=True, method="max") enrich.ser_thr.plot() diff --git a/docs/chapters/features/usage.rst b/docs/chapters/features/usage.rst index 0d90257..712b998 100644 --- a/docs/chapters/features/usage.rst +++ b/docs/chapters/features/usage.rst @@ -1,46 +1,47 @@ -Import package and initialise Kinex +Import Package and Initialize Kinex =================================== -1. Import kinex +1. **Import Kinex** .. code:: python - from kinex import Kinex + from kinex import Kinex -2. Read the scoring matrix +2. **Create a Kinex Object** + +- With Predefined Matrices: + +Kinex first looks for the scoring matrices in its package resources. If they are not found there, it downloads them and saves them to the package resources for future use. .. code:: python - scoring_matrix_ser_thr = pd.read_csv("https://zenodo.org/records/13964893/files/scoring_matrix_ser_thr_82k_sorted.csv.gz?download=1", compression="gzip") - scoring_matrix_tyr = pd.read_csv("https://zenodo.org/records/13964893/files/scoring_matrix_tyr_7k_sorted.csv.gz?download=1", compression="gzip") - scoring_matrix_ser_thr + kinex = Kinex() + +- With Your Custom Matrices: .. code:: python - AAK1 ACVR2A ... YSK4 ZAK - 0 -11.147481 -6.325340 ... 
-6.723077 -7.402360 - 1 -10.421859 -6.178601 ... -6.343452 -7.373478 - ... ... ... ... - 82753 8.074270 7.289390 ... 4.525527 4.837377 - 82754 8.623180 7.871226 ... 4.869195 5.062391 + scoring_matrix_ser_thr = pd.read_csv("path/to/scoring_matrix_ser_thr.csv") + scoring_matrix_tyr = pd.read_csv("path/to/scoring_matrix_tyr.csv") - [82755 rows x 303 columns] + kinex = Kinex(scoring_matrix_ser_thr, scoring_matrix_tyr) .. note:: - You can optionally save the scoring matrix locally for faster use in the future. + The matrix looks like this: .. code:: python - scoring_matrix_ser_thr.to_csv("scoring_matrix_ser_thr.csv") - scoring_matrix_tyr.to_csv("scoring_matrix_tyr.csv") + AAK1 ACVR2A ... YSK4 ZAK + 0 -11.147481 -6.325340 ... -6.723077 -7.402360 + 1 -10.421859 -6.178601 ... -6.343452 -7.373478 + ... ... ... ... + 82753 8.074270 7.289390 ... 4.525527 4.837377 + 82754 8.623180 7.871226 ... 4.869195 5.062391 - Or just download using the links: - `https://zenodo.org/records/13964893/files/scoring_matrix_ser_thr_82k_sorted.csv.gz?download=1 `_ - `https://zenodo.org/records/13964893/files/scoring_matrix_tyr_7k_sorted.csv.gz?download=1 `_ - -3. Create a kinex object +.. note:: -.. 
code:: python + Predefined matrices can be found here: - kinex = Kinex(scoring_matrix_ser_thr, scoring_matrix_tyr) + - `Scoring Matrix for Serine/Threonine <https://zenodo.org/records/13964893/files/scoring_matrix_ser_thr_82k_sorted.csv.gz?download=1>`_ + - `Scoring Matrix for Tyrosine <https://zenodo.org/records/13964893/files/scoring_matrix_tyr_7k_sorted.csv.gz?download=1>`_ diff --git a/pyproject.toml b/pyproject.toml index 9437cbb..35296cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,8 @@ dependencies = [ "plotly", "scikit-learn", "umap-learn", - "importlib-resources" + "importlib-resources", + "requests", ] [project.optional-dependencies] @@ -28,6 +29,7 @@ dev = [ "furo" ] + [tool.setuptools] package-dir = { "" = "src" } @@ -35,4 +37,4 @@ package-dir = { "" = "src" } where = ["src"] [tool.setuptools.package-data] -kinex = ["resources/*.csv", "resources/*.json"] \ No newline at end of file +kinex = ["resources/*.csv", "resources/*.json"] diff --git a/src/kinex/functions.py b/src/kinex/functions.py index 80f248f..10e51fd 100644 --- a/src/kinex/functions.py +++ b/src/kinex/functions.py @@ -1,7 +1,10 @@ -import pandas as pd -import numpy as np -from math import sqrt, pow +import requests +from importlib import resources +from math import pow, sqrt +from pathlib import Path +import numpy as np +import pandas as pd def get_sequence_format(sequence: str) -> str: """ @@ -234,4 +237,51 @@ def get_distances(experiment1, experiment2): np.array(experiment2['dominant_enrichment_value_log2']) p_val = np.array(experiment1['dominant_p_value_log10_abs']) - \ np.array(experiment2['dominant_p_value_log10_abs']) - return np.power(np.power(enrich, 2) + np.power(p_val, 2), 0.5) \ No newline at end of file + return np.power(np.power(enrich, 2) + np.power(p_val, 2), 0.5) + + +def download_file_to_resource(url: str, resource_name: str) -> None: + """ + Downloads a file from a given URL and saves it to the specified resource path. + + Parameters: + ---------- + url : str + The URL of the file to be downloaded. + resource_name : str + The name of the resource file to save (e.g., "default_scoring_matrix_tyr.csv.gz"). 
+ + Raises: + ------- + ValueError: + If the URL or resource details are invalid. + requests.exceptions.RequestException: + If there are issues with the HTTP request (e.g., network error, 404). + IOError: + If there's an error saving the file. + """ + try: + # Determine the file save path using importlib.resources + with resources.path("kinex.resources", resource_name) as file_path: + save_path = Path(file_path) + + print(f"Starting download from: {url}") + response = requests.get(url, stream=True, timeout=10) + + # Raise an error for HTTP codes 4xx/5xx + response.raise_for_status() + + + with open(save_path, "wb") as file: + file.write(response.content) + + print(f"File successfully downloaded and saved to: {save_path}") + + except requests.exceptions.MissingSchema: + raise ValueError("The provided URL is not valid.") + except requests.exceptions.RequestException as e: + print(f"Error during file download: {e}") + raise + except IOError as e: + print(f"Error saving the file to {save_path}: {e}") + raise diff --git a/src/kinex/kinex.py b/src/kinex/kinex.py index a141535..6d66439 100644 --- a/src/kinex/kinex.py +++ b/src/kinex/kinex.py @@ -1,18 +1,22 @@ -import pandas as pd -import numpy as np import bisect +from collections import namedtuple from functools import reduce -from kinex.resources import get_pssm_ser_thr, get_pssm_tyr +import numpy as np +import pandas as pd +from kinex.functions import download_file_to_resource +from kinex.resources import get_pssm_ser_thr, get_pssm_tyr, get_scoring_matrix_ser_thr, get_scoring_matrix_tyr, get_configuration_file from kinex.score import Score from kinex.enrichment import Enrichment from kinex.sequence import get_sequence_object, SequenceType -from collections import namedtuple - EnrichmentResults = namedtuple("EnrichmentResults", ["ser_thr", "tyr", "failed_sites"]) + +# Load the packaged config.json (download URLs for the default scoring matrices) +config = get_configuration_file() + class Kinex: """ The class representing a PSSM table and a scoring matrix needed for 
scoring and enrichment analysis. @@ -60,8 +64,8 @@ class Kinex: """ def __init__(self, - scoring_matrix_ser_thr: pd.DataFrame, - scoring_matrix_tyr: pd.DataFrame, + scoring_matrix_ser_thr: pd.DataFrame = None, + scoring_matrix_tyr: pd.DataFrame = None, pssm_ser_thr: pd.DataFrame = get_pssm_ser_thr(), pssm_tyr: pd.DataFrame = get_pssm_tyr()) -> None: """ @@ -70,12 +74,30 @@ def __init__(self, Parameters ---------- pssm_ser_thr : pandas.DataFrame - Normalised and scaled densiometries from PSPA experiments. + Normalized and scaled densiometries from PSPA experiments. The table cotains on rows the kinases and on columns the positions for each aminoacid. scoring_matrix_ser_thr : pandas.DataFrame Table containing 82,755 experimentally identified Ser/Thr phosphosites that have been scored by 303 Ser or Thr kinase PSSM. The table allows the ranking of kinases, as well as the calculation of promiscuity index and median percentile for each input validation. """ + + # Matrix is not provided + if scoring_matrix_ser_thr is None: + # Trying to look for the matrix in the resources + scoring_matrix_ser_thr = get_scoring_matrix_ser_thr() + # Matrix is not provided and not found in the resources, download the default matrix + if scoring_matrix_ser_thr is None: + scoring_matrix_ser_thr_url = config["urls"]["scoring_matrix_ser_thr"] + download_file_to_resource(scoring_matrix_ser_thr_url, 'default_scoring_matrix_ser_thr.csv.gz') + scoring_matrix_ser_thr = get_scoring_matrix_ser_thr() + + + if scoring_matrix_tyr is None: + scoring_matrix_tyr = get_scoring_matrix_tyr() + if scoring_matrix_tyr is None: + scoring_matrix_tyr_url = config["urls"]["scoring_matrix_tyr"] + download_file_to_resource(scoring_matrix_tyr_url, 'default_scoring_matrix_tyr.csv.gz') + scoring_matrix_tyr = get_scoring_matrix_tyr() self.pssm_ser_thr = pssm_ser_thr self.pssm_tyr = pssm_tyr diff --git a/src/kinex/resources/__init__.py b/src/kinex/resources/__init__.py index 1be06f2..a2f0397 100644 --- 
a/src/kinex/resources/__init__.py +++ b/src/kinex/resources/__init__.py @@ -2,7 +2,6 @@ import json import pandas as pd - def get_pssm_ser_thr() -> pd.DataFrame: with resources.path("kinex.resources", "pssm_table_ser_thr.csv") as df: return pd.read_csv(df, index_col=0) @@ -40,4 +39,23 @@ def get_tyr_family_colors() -> dict: def get_experiments() -> dict: with resources.path("kinex.resources", "experiments.json") as file_path: with open(file_path) as json_file: - return json.load(json_file) \ No newline at end of file + return json.load(json_file) + + +def get_scoring_matrix_ser_thr() -> pd.DataFrame: + try: + with resources.path("kinex.resources", "default_scoring_matrix_ser_thr.csv.gz") as file_path: + return pd.read_csv(file_path, compression='gzip') + except FileNotFoundError: + return None + +def get_scoring_matrix_tyr() -> pd.DataFrame: + try: + with resources.path("kinex.resources", "default_scoring_matrix_tyr.csv.gz") as file_path: + return pd.read_csv(file_path, compression='gzip') + except FileNotFoundError: + return None + +def get_configuration_file() -> dict: + with resources.files("kinex.resources").joinpath("config.json").open() as json_file: + return json.load(json_file) \ No newline at end of file diff --git a/src/kinex/resources/config.json b/src/kinex/resources/config.json new file mode 100644 index 0000000..f73e3bf --- /dev/null +++ b/src/kinex/resources/config.json @@ -0,0 +1,6 @@ +{ + "urls": { + "scoring_matrix_ser_thr": "https://zenodo.org/records/13964893/files/scoring_matrix_ser_thr_82k_sorted.csv.gz?download=1", + "scoring_matrix_tyr": "https://zenodo.org/records/13964893/files/scoring_matrix_tyr_7k_sorted.csv.gz?download=1" + } +}