Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring data reading and writing #116

Merged
merged 24 commits into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ jobs:
strategy:
matrix:
os: ['ubuntu-latest', 'macos-latest', 'windows-latest']
python-version: ['3.8', '3.9', '3.10', '3.11']
python-version: ['3.10', '3.11']

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,6 @@ tests/test_pandas_data.py

# vscode
.vscode

# poetry
poetry.lock
2 changes: 1 addition & 1 deletion RIAssigner/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.3.3'
__version__ = '0.4.0'
6 changes: 6 additions & 0 deletions RIAssigner/compute/ComputationMethod.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ def _check_data_args(self, query: Data, reference: Data):
raise ValueError("Query data is not defined.")
if reference is None:
raise ValueError("Reference data is not defined.")
if not query.has_retention_times():
raise ValueError("Query data has no retention times.")
if not reference.has_retention_times():
raise ValueError("Reference data has no retention times.")
if not reference.has_retention_indices():
raise ValueError("Reference data has no retention indices.")

def __eq__(self, o: object) -> bool:
return type(o) == type(self)
18 changes: 7 additions & 11 deletions RIAssigner/data/Data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,14 @@
import pandas as pd

from pint import Quantity, UnitRegistry
from pint.unit import build_unit_class


class Data(ABC):
""" Base class for data managers. """
RetentionTimeType = Optional[float]
RetentionIndexType = Optional[float]
RetentionTimeType = float
RetentionIndexType = float
CommentFieldType = Optional[str]
URegistry = UnitRegistry()
Unit = build_unit_class(URegistry)

_rt_possible_keys = {'RT', 'rt', 'rts', 'retention_times', 'retention_time', 'retention', 'time', 'retentiontime'}
_ri_possible_keys = {'RI', 'ri', 'ris', 'retention_indices', 'retention_index', 'kovats', 'retentionindex'}
Expand All @@ -27,7 +25,7 @@ def is_valid(value: Union[RetentionTimeType, RetentionIndexType]) -> bool:
Returns:
bool: State of validity (True/False).
"""
result = value is not None and Data.can_be_float(value) and value >= 0.0
result = value is not None and Data.can_be_float(value) and value > 0
return result

@staticmethod
Expand Down Expand Up @@ -68,7 +66,7 @@ def __init__(self, filename: str, filetype: str, rt_unit: str):
self._filename = filename
self._filetype = filetype
self._rt_unit = rt_unit
self._unit = Data.Unit(self._rt_unit)
self._unit = Data.URegistry(self._rt_unit)

@abstractmethod
def write(self, filename):
Expand Down Expand Up @@ -129,7 +127,7 @@ def has_retention_indices(self) -> bool:
Returns:
bool: True if all retention indices exist, False otherwise.
"""
return all([Data.is_valid(rt) for rt in self.retention_indices])
return len(self.retention_indices) > 0 and all([Data.is_valid(rt) for rt in self.retention_indices])

def has_retention_times(self) -> bool:
"""
Expand All @@ -142,7 +140,7 @@ def has_retention_times(self) -> bool:
Returns:
bool: True if all retention times exist, False otherwise.
"""
return all([Data.is_valid(rt) for rt in self.retention_times])
return len(self.retention_times) > 0 and all([Data.is_valid(rt) for rt in self.retention_times])


@property
Expand All @@ -155,7 +153,7 @@ def comment(self) -> Iterable[CommentFieldType]:
"""
...

def extract_ri_from_comment(self, ri_source: str):
def init_ri_from_comment(self, ri_source: str):
""" Extract RI from comment field.
Extracts the RI from the comment field of the data file. The RI is expected to be
in the format 'ri_source=RI_value'. The function extracts the RI value and
Expand All @@ -168,8 +166,6 @@ def extract_ri_from_comment(self, ri_source: str):
ri_source:
String that is expected to be in the comment field before the RI value.
"""


mask = pd.Series(self.comment).str.contains(rf'\b{ri_source}\b', na=False)
extracted_values = pd.Series(self.comment).str.extract(rf'\b{ri_source}=(\d+)\b')[0].astype(float)
self.retention_indices = extracted_values.where(mask, None).tolist()
Expand Down
67 changes: 32 additions & 35 deletions RIAssigner/data/MatchMSData.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
from typing import Iterable, Optional
from typing import Iterable, List, Optional, Tuple
import numpy as np

from matchms import Spectrum
from matchms.exporting import save_as_msp
from matchms.importing import load_from_msp
from matchms.exporting import save_spectra
from matchms.exporting.metadata_export import get_metadata_as_array
from matchms.importing import load_spectra
from RIAssigner.utils import get_first_common_element

from .Data import Data


class MatchMSData(Data):
""" Class to handle data from filetypes which can be imported
using 'matchMS'.

Currently only supports 'msp'.
using 'matchms'.
"""

def __init__(self, filename: str, filetype: str, rt_unit: str):
Expand All @@ -22,7 +22,9 @@ def __init__(self, filename: str, filetype: str, rt_unit: str):
def _read(self):
"""Load data into object and initialize properties.
"""
self._read_spectra(self._filename, self._filetype)
self._spectra = list(load_spectra(self._filename, True, self._filetype))
_, self._keys = get_metadata_as_array(self._spectra)

self._init_rt_key()
self._init_ri_key()

Expand All @@ -32,35 +34,27 @@ def _read(self):
self._read_retention_indices()

def write(self, filename: str):
"""Write data to back to 'msp' file
"""Write data to back to the spectra file

Args:
filename (str): Path to filename under which to store the data.
"""
save_as_msp(self._spectra, filename)

def _read_spectra(self, filename: str, filetype: str):
"""Read spectra from 'msp' file into data.

Args:
filename (str): Path to filename from which to load the data.
self._write_RIs_to_spectra()
save_spectra(self._spectra, filename)

Raises:
NotImplementedError: For filetypes other tahn 'msp'.
def _write_RIs_to_spectra(self):
"""Write the RI values stored in the object to the spectra metadata.
"""
if filetype == 'msp':
self._spectra = list(load_from_msp(filename))
else:
raise NotImplementedError("Currently only supports 'msp'.")
list(map(_assign_ri_value, self._spectra, [self._ri_key] * len(self._spectra), self._retention_indices))

def _init_rt_key(self):
""" Identify retention-time key from spectrum metadata. """
rt_key = get_first_common_element(self._rt_possible_keys, self._spectra[0].metadata.keys())
rt_key = get_first_common_element(self._rt_possible_keys, self._keys)
self._rt_key = rt_key or 'retentiontime'

def _init_ri_key(self):
""" Identify retention-index key from spectrum metadata. """
ri_key = get_first_common_element(self._ri_possible_keys, self._spectra[0].metadata.keys())
ri_key = get_first_common_element(self._ri_possible_keys, self._keys)
self._ri_key = ri_key or 'retentionindex'

def _read_retention_times(self):
Expand Down Expand Up @@ -112,9 +106,6 @@ def retention_indices(self, values: Iterable[Data.RetentionIndexType]):
""" Set retention indices. """
if len(values) == len(self._spectra):
self._retention_indices = values
list(
map(_assign_ri_value, self._spectra, [self._ri_key] * len(self._spectra), values)
)
else:
raise ValueError('There is different numbers of computed indices and peaks.')

Expand All @@ -125,11 +116,15 @@ def comment(self) -> Iterable[Data.CommentFieldType]:
content = [spectrum.get(self.comment_keys, default=None) for spectrum in self._spectra]
return content

@property
def spectra_metadata(self) -> Tuple[np.array, List[str]]:
    """ Get the metadata of the loaded spectra.

    Returns:
        Tuple[np.array, List[str]]: Result of matchms' 'get_metadata_as_array'
            called on the stored spectra.
    """
    metadata = get_metadata_as_array(self._spectra)
    return metadata


def safe_read_key(spectrum: Spectrum, key: str) -> Optional[float]:
""" Read key from spectrum and convert to float or return 'None'.
def safe_read_key(spectrum: Spectrum, key: str) -> float:
""" Read key from spectrum and convert to float or return 0.0.
Tries to read the given key from the spectrum metadata and convert it to a float.
In case an exception is thrown or the key is not present, returns 'None'.
In case an exception is thrown or the key is not present, returns 0.0.

Parameters
----------
Expand All @@ -140,16 +135,18 @@ def safe_read_key(spectrum: Spectrum, key: str) -> Optional[float]:

Returns
-------
Either the key's value converted to float or 'None'.
Either the key's value converted to float or 0.0.
"""

value = spectrum.get(key, default=None)
if value is not None:
value = spectrum.get(key, default=0.0)
if isinstance(value, str):
try:
value = float(value)
except ValueError:
# RT is in format that can't be converted to float -> set rt to None
value = None
# RT is in format that can't be converted to float -> set rt to 0.0
value = 0.0
if not Data.can_be_float(value):
value = 0.0
return value

def _assign_ri_value(spectrum: Spectrum, key: str, value: Data.RetentionIndexType):
Expand All @@ -159,6 +156,6 @@ def _assign_ri_value(spectrum: Spectrum, key: str, value: Data.RetentionIndexTyp
spectrum (Spectrum): Spectrum to add RI to
value (Data.RetentionIndexType): RI to be added to Spectrum
"""
if value is not None:
if value > 0:
retention_index = ('%f' % float(value)).rstrip('0').rstrip('.')
spectrum.set(key=key, value=retention_index)
17 changes: 13 additions & 4 deletions RIAssigner/data/PandasData.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Iterable

from pandas import read_csv
from pandas import read_csv, read_parquet
from RIAssigner.utils import define_separator, get_first_common_element

from .Data import Data
Expand All @@ -23,12 +23,16 @@ def _read(self):
self._init_rt_column_info()
self._init_ri_column_info()
self._init_ri_indices()

self._sort_by_rt()
self._replace_nans_with_0s()

def _read_into_dataframe(self):
    """ Read the data from file into dataframe.

    'csv' and 'tsv' files are read with 'read_csv' using the python engine
    and 'sep=None', 'parquet' files are read with 'read_parquet'. The result
    is stored in 'self._data'.

    Raises:
        NotImplementedError: If the filetype is none of 'csv', 'tsv' or 'parquet'.
    """
    if self._filetype in ('csv', 'tsv'):
        self._data = read_csv(self._filename, sep=None, engine="python")
    elif self._filetype == 'parquet':
        self._data = read_parquet(self._filename)
    else:
        # Keep the message in sync with the branches above.
        raise NotImplementedError(
            "File formats different from ['csv', 'tsv', 'parquet'] are not implemented yet."
        )

Expand Down Expand Up @@ -72,6 +76,13 @@ def _sort_by_rt(self):
""" Sort peaks by their retention times. """
if self._rt_index is not None:
self._data.sort_values(by=self._rt_index, axis=0, inplace=True)

def _replace_nans_with_0s(self):
    """ Replace NaN values in the retention time and retention index columns with 0s.

    A column is skipped when its index attribute ('_rt_index' / '_ri_index') is None.
    """
    for column in (self._rt_index, self._ri_index):
        if column is None:
            continue
        # Assign the filled column back instead of calling
        # 'fillna(..., inplace=True)' on the selected column: that is chained
        # assignment with an inplace method, which newer pandas versions warn
        # about and which leaves the dataframe unchanged under Copy-on-Write.
        self._data[column] = self._data[column].fillna(0)

def __eq__(self, o: object) -> bool:
"""Comparison operator `==`.
Expand Down Expand Up @@ -105,9 +116,7 @@ def retention_indices(self) -> Iterable[Data.RetentionIndexType]:
""" Get retention indices from data or computed from carbon numbers. """
if self._carbon_number_index is not None:
return self._ri_from_carbon_numbers()
if not self._data[self._ri_index].isnull().all():
return self._data[self._ri_index]
raise KeyError("Dataset does not contain retention indices!")
return self._data[self._ri_index]

def _ri_from_carbon_numbers(self):
""" Returns the RI of compound based on carbon number. """
Expand Down
6 changes: 0 additions & 6 deletions RIAssigner/data/SimpleData.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,8 @@ def __init__(self, retention_times: Iterable[float], rt_unit: str, retention_ind
"""
super().__init__(None, None, rt_unit)

self._validate_input_type(retention_times)

self._read(retention_times, retention_indices)

def _validate_input_type(self, retention_times):
if not isinstance(retention_times, list) or None in retention_times:
raise TypeError("Retention times must be a list and cannot contain None.")

def _read(self, retention_times, retention_indices):
self._retention_times = Data.URegistry.Quantity(retention_times, self._unit)
self._retention_indices = copy(retention_indices)
Expand Down
2 changes: 2 additions & 0 deletions RIAssigner/data/ValidateSimpleData.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ def __init__(self, retention_times: Iterable[float], rt_unit: str, retention_ind
self._read(retention_times, retention_indices)

def _validate_input(self, retention_times, retention_indices):
if not isinstance(retention_times, list) or None in retention_times:
raise TypeError("Retention times must be a list and cannot contain None.")
if not all(map(Data.is_valid, retention_times)):
raise ValueError("Retention time data is invalid.")
if not is_sorted(retention_times):
Expand Down
14 changes: 7 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "RIAssigner"
version = '0.3.3'
version = "0.4.0"
description = "Python library for retention index calculation."
authors = ["Helge Hecht <[email protected]>", "Maksym Skoryk <[email protected]>"]

Expand All @@ -19,22 +19,22 @@ classifiers = [
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8"
"Programming Language :: Python :: 3.10",
]

packages = [
{ include = "RIAssigner" },
]

[tool.poetry.dependencies]
python = "^3.8"
matchms = "^0.14.0, <0.18.0"
python = "^3.10, <3.13"
matchms = "^0.24.1"
numpy = "*"
pandas = "*"
pint = "^0.17, <0.20"
pint = "^0.23"
scipy = "*"
urllib3 = "1.26.15"
fastparquet = "^2023.10.1"

[tool.poetry.group.dev.dependencies]
pytest = "*"
Expand Down
Loading
Loading