Skip to content

Commit

Permalink
Merge pull request #118 from RECETOX/wverastegui/issue98
Browse files Browse the repository at this point in the history
Sanitize column names on data import
  • Loading branch information
hechth authored Feb 12, 2024
2 parents 7684d26 + b830390 commit 4c049ff
Show file tree
Hide file tree
Showing 11 changed files with 216 additions and 86 deletions.
68 changes: 47 additions & 21 deletions RIAssigner/data/Data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pandas as pd

from pint import Quantity, UnitRegistry
from matchms.utils import load_known_key_conversions


class Data(ABC):
Expand All @@ -11,9 +12,9 @@ class Data(ABC):
RetentionIndexType = float
CommentFieldType = Optional[str]
URegistry = UnitRegistry()

_rt_possible_keys = {'RT', 'rt', 'rts', 'retention_times', 'retention_time', 'retention', 'time', 'retentiontime'}
_ri_possible_keys = {'RI', 'ri', 'ris', 'retention_indices', 'retention_index', 'kovats', 'retentionindex'}
_keys_conversions = load_known_key_conversions()
_rt_possible_keys = [ key for key , value in _keys_conversions.items() if "retention_time" == value] + ["retention_time"]
_ri_possible_keys = [ key for key , value in _keys_conversions.items() if "retention_index" == value] + ["retention_index"]

@staticmethod
def is_valid(value: Union[RetentionTimeType, RetentionIndexType]) -> bool:
Expand All @@ -29,38 +30,63 @@ def is_valid(value: Union[RetentionTimeType, RetentionIndexType]) -> bool:
return result

@staticmethod
def can_be_float(rt: Union[Quantity, float, int]) -> bool:
    """Determine whether a value can be converted to a float.

    This function checks if the provided input is an instance of either
    Quantity, float, or int.

    Args:
        rt (Union[Quantity, float, int]): Value to check for float conversion.

    Returns:
        bool: True if the input is an instance of Quantity, float, or int, False otherwise.
    """
    # isinstance already returns a bool; the explicit if/return True/False was redundant.
    return isinstance(rt, (Quantity, float, int))

@classmethod
def add_possible_rt_keys(cls, keys: List[str]) -> None:
    """Add new identifiers used to look up retention time information.

    Args:
        keys (List[str]): A list of new identifiers (keys) to be added to the `_rt_possible_keys`.

    Returns:
        None
    """
    # Bug fix: `append(keys)` would add the whole list as a single element;
    # `extend` adds each key individually (the earlier set-based version used `.update`).
    cls._rt_possible_keys.extend(keys)

@classmethod
def add_possible_ri_keys(cls, keys: List[str]) -> None:
    """Add new identifiers used to look up retention index information.

    Args:
        keys (List[str]): A list of new identifiers (keys) to be added to the `_ri_possible_keys`.

    Returns:
        None
    """
    # Bug fix: `append(keys)` would add the whole list as a single element;
    # `extend` adds each key individually (the earlier set-based version used `.update`).
    cls._ri_possible_keys.extend(keys)

@classmethod
def get_possible_rt_keys(cls) -> List[str]:
    """Return the possible keys used to look up retention times.

    Returns:
        List[str]: A list of possible keys to get retention times.
    """
    # Return a copy so callers cannot mutate the shared class-level list
    # (the earlier implementation returned a copy; dropping it leaks internal state).
    return cls._rt_possible_keys.copy()
@classmethod
def get_possible_ri_keys(cls) -> List[str]:
    """Return the possible keys used to look up retention indices.

    Returns:
        List[str]: A list of possible keys to get retention indices.
    """
    # Return a copy so callers cannot mutate the shared class-level list
    # (the earlier implementation returned a copy; dropping it leaks internal state).
    return cls._ri_possible_keys.copy()

def __init__(self, filename: str, filetype: str, rt_unit: str):
self._filename = filename
Expand All @@ -69,7 +95,7 @@ def __init__(self, filename: str, filetype: str, rt_unit: str):
self._unit = Data.URegistry(self._rt_unit)

@abstractmethod
def write(self, filename):
def write(self, filename: str) -> None:
"""Store current content to disk.
Args:
Expand Down Expand Up @@ -108,7 +134,7 @@ def retention_indices(self) -> Iterable[RetentionIndexType]:

@retention_indices.setter
@abstractmethod
def retention_indices(self, value: Iterable[RetentionIndexType]):
def retention_indices(self, value: Iterable[RetentionIndexType]) -> None:
"""Setter for `retention_indices` variable.
Args:
Expand Down Expand Up @@ -153,7 +179,7 @@ def comment(self) -> Iterable[CommentFieldType]:
"""
...

def init_ri_from_comment(self, ri_source: str):
def init_ri_from_comment(self, ri_source: str) -> None:
""" Extract RI from comment field.
Extracts the RI from the comment field of the data file. The RI is expected to be
in the format 'ri_source=RI_value'. The function extracts the RI value and
Expand Down
32 changes: 10 additions & 22 deletions RIAssigner/data/MatchMSData.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from typing import Iterable, List, Optional, Tuple
import numpy as np

from matchms import Spectrum
from matchms import Spectrum, Metadata
from matchms.exporting import save_spectra
from matchms.exporting.metadata_export import get_metadata_as_array
from matchms.importing import load_spectra
from RIAssigner.utils import get_first_common_element

from .Data import Data

Expand All @@ -25,15 +24,15 @@ def _read(self):
self._spectra = list(load_spectra(self._filename, True, self._filetype))
_, self._keys = get_metadata_as_array(self._spectra)

self._init_rt_key()
self._init_ri_key()
self._rt_key = "retention_time"
self._ri_key = "retention_index"

self._sort_spectra_by_rt()

self._read_retention_times()
self._read_retention_indices()

def write(self, filename: str):
def write(self, filename: str) -> None:
"""Write data to back to the spectra file
Args:
Expand All @@ -42,31 +41,21 @@ def write(self, filename: str):
self._write_RIs_to_spectra()
save_spectra(self._spectra, filename)

def _write_RIs_to_spectra(self) -> None:
    """Write the RI values stored in the object to the spectra metadata."""
    # A plain loop is clearer than list(map(...)) evaluated only for side effects.
    # zip stops at the shortest iterable, matching map's multi-iterable behavior.
    for spectrum, ri_value in zip(self._spectra, self._retention_indices):
        _assign_ri_value(spectrum, self._ri_key, ri_value)

def _init_rt_key(self):
""" Identify retention-time key from spectrum metadata. """
rt_key = get_first_common_element(self._rt_possible_keys, self._keys)
self._rt_key = rt_key or 'retentiontime'

def _init_ri_key(self):
""" Identify retention-index key from spectrum metadata. """
ri_key = get_first_common_element(self._ri_possible_keys, self._keys)
self._ri_key = ri_key or 'retentionindex'

def _read_retention_times(self) -> None:
    """Read retention times from spectrum metadata and attach the RT unit."""
    rt_values = [safe_read_key(spectrum, self._rt_key) for spectrum in self._spectra]
    self._retention_times = Data.URegistry.Quantity(rt_values, self._unit)

def _read_retention_indices(self) -> None:
    """Read retention indices from spectrum metadata."""
    self.retention_indices = [
        safe_read_key(spec, self._ri_key)
        for spec in self._spectra
    ]

def _sort_spectra_by_rt(self) -> None:
    """Sort objects (peaks) in the spectra list in place by their retention times.

    Spectra with a missing/zero retention time sort first (key falls back to 0).
    """
    def _rt_or_zero(spectrum):
        # Falsy RT (None or 0) is treated as 0 so sorting never fails.
        return safe_read_key(spectrum, self._rt_key) or 0

    self._spectra.sort(key=_rt_or_zero)

Expand Down Expand Up @@ -119,8 +108,7 @@ def comment(self) -> Iterable[Data.CommentFieldType]:
@property
def spectra_metadata(self) -> Tuple[np.array, List[str]]:
    """Metadata of all spectra as a structured array together with the list of metadata keys."""
    return get_metadata_as_array(self._spectra)



def safe_read_key(spectrum: Spectrum, key: str) -> float:
""" Read key from spectrum and convert to float or return 0.0.
Tries to read the given key from the spectrum metadata and convert it to a float.
Expand Down Expand Up @@ -149,7 +137,7 @@ def safe_read_key(spectrum: Spectrum, key: str) -> float:
value = 0.0
return value

def _assign_ri_value(spectrum: Spectrum, key: str, value: Data.RetentionIndexType):
def _assign_ri_value(spectrum: Spectrum, key: str, value: Data.RetentionIndexType) -> None:
"""Assign RI value to Spectrum object
Args:
Expand Down
32 changes: 19 additions & 13 deletions RIAssigner/data/PandasData.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@

from pandas import read_csv, read_parquet
from RIAssigner.utils import define_separator, get_first_common_element
from RIAssigner.utils import clean_column_names

from .Data import Data


class PandasData(Data):
""" Class to handle data from filetypes which can be imported into a pandas dataframe. """
_carbon_number_column_names = set(['Carbon_Number'])
_carbon_number_column_names = set(['carbon_number'])

def __init__(self, filename: str, filetype: str, rt_unit: str):
super().__init__(filename, filetype, rt_unit)
Expand All @@ -27,57 +28,58 @@ def _read(self):
self._sort_by_rt()
self._replace_nans_with_0s()

def _read_into_dataframe(self) -> None:
    """Read the data from file into a dataframe and sanitize its column names.

    Raises:
        NotImplementedError: If the filetype is not 'csv', 'tsv' or 'parquet'.
    """
    if self._filetype in ['csv', 'tsv']:
        # sep=None with the python engine lets pandas sniff the delimiter.
        self._data = read_csv(self._filename, sep=None, engine="python")
    elif self._filetype == 'parquet':
        self._data = read_parquet(self._filename)
    else:
        # Bug fix: the previous message omitted 'parquet', which IS supported above.
        raise NotImplementedError("File formats different from ['csv', 'tsv', 'parquet'] are not implemented yet.")
    # Normalize headers (strip whitespace, lowercase) so key lookups are reliable.
    self._data.columns = clean_column_names(self._data.columns)

def write(self, filename: str) -> None:
    """Write data on disk. Currently supports 'csv' and 'tsv' formats."""
    if filename.endswith((".csv", ".tsv")):
        self._data.to_csv(filename, index=False, sep=define_separator(filename))
    else:
        raise ValueError("File extension must be 'csv' or 'tsv'.")

def _init_carbon_number_index(self) -> None:
    """Find the key of the carbon number column and store it (None when absent)."""
    columns = self._data.columns
    self._carbon_number_index = get_first_common_element(columns, self._carbon_number_column_names)

def _init_rt_column_info(self) -> None:
    """Find the key of the retention time column and remember its name and position."""
    self._rt_index = get_first_common_element(self._data.columns, Data.get_possible_rt_keys())
    if self._rt_index is None:
        # No recognized RT column in this file.
        self._rt_position = None
    else:
        self._rt_position = self._data.columns.tolist().index(self._rt_index)

def _init_ri_column_info(self) -> None:
    """Initialize the retention index column name and remember its position.

    When no RI column exists yet, fall back to the default name and defer
    the position (it is assigned next to the RT column later).
    """
    self._ri_index = get_first_common_element(self._data.columns, Data.get_possible_ri_keys())
    if self._ri_index not in self._data.columns:
        self._ri_index = 'retention_index'
        self._ri_position = None
    else:
        self._ri_position = self._data.columns.get_loc(self._ri_index)

def _init_ri_indices(self) -> None:
    """Initialize retention indices to 100x the carbon numbers, or insert an empty RI column."""
    if self._carbon_number_index is not None:
        # Kovats convention: RI of an n-alkane is 100 times its carbon number.
        self._data[self._ri_index] = 100 * self._data[self._carbon_number_index]
    elif self._ri_position is None:
        # Place the new RI column directly after the retention time column.
        # NOTE(review): assumes self._rt_position is set here — confirm for RT-less files.
        self._ri_position = self._rt_position + 1
        self._data.insert(loc=self._ri_position, column=self._ri_index, value=None)

def _sort_by_rt(self) -> None:
    """Sort peaks in place by their retention times (no-op when no RT column exists)."""
    if self._rt_index is None:
        return
    self._data.sort_values(by=self._rt_index, axis=0, inplace=True)

def _replace_nans_with_0s(self):
def _replace_nans_with_0s(self) -> None:
""" Replace NaN values with 0s. """
if self._rt_index is not None:
self._data[self._rt_index].fillna(0, inplace=True)
Expand Down Expand Up @@ -118,7 +120,7 @@ def retention_indices(self) -> Iterable[Data.RetentionIndexType]:
return self._ri_from_carbon_numbers()
return self._data[self._ri_index]

def _ri_from_carbon_numbers(self) -> Iterable[int]:
    """Return the RI of each compound derived from its carbon number (Kovats: 100x)."""
    carbon_numbers = self._data[self._carbon_number_index]
    return carbon_numbers * 100

Expand All @@ -133,7 +135,11 @@ def retention_indices(self, values: Iterable[int]):

@property
def comment(self) -> Iterable[Data.CommentFieldType]:
    """Get comments.

    Returns:
        Iterable[Data.CommentFieldType]: Comments.
    """
    self._comment_keys = "comment"
    return self._data[self._comment_keys].tolist()
32 changes: 27 additions & 5 deletions RIAssigner/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from os.path import splitext
from typing import Iterable, List, Optional, TypeVar

import numpy

Expand All @@ -8,14 +8,25 @@

def get_first_common_element(first: Iterable[T], second: Iterable[T]) -> Optional[T]:
    """Get the first common element from two iterables.

    Returns None if there are no common elements.

    Args:
        first (Iterable[T]): First iterable; its iteration order decides priority.
        second (Iterable[T]): Second iterable, used for membership testing.

    Returns:
        Optional[T]: First common element, or None if no common element is found.
    """
    # Annotation fix: the function can return None, so the return type is Optional[T].
    return next((item for item in first if item in second), None)


def define_separator(filename):
def define_separator(filename: str) -> str:
""" Select separator for data values based on filename extension.
Returns separator.
Args:
filename (str): Filename for which to get the separator.
Returns:
str: Separator for data values.
"""
if filename.endswith(".tsv"):
separator = "\t"
Expand All @@ -24,7 +35,7 @@ def define_separator(filename):
return separator


def get_extension(filename: str):
def get_extension(filename: str) -> str:
"""Get extension of filename.
Args:
Expand All @@ -46,3 +57,14 @@ def is_sorted(values) -> bool:
bool: True if sorted.
"""
return numpy.all(values[:-1] <= values[1:])

def clean_column_names(column_names: List[str]) -> List[str]:
    """Clean column names by stripping surrounding whitespace and lowercasing.

    Args:
        column_names (List[str]): List of column names to clean.

    Returns:
        List[str]: List of cleaned column names, in the original order.
    """
    cleaned = []
    for raw_name in column_names:
        cleaned.append(raw_name.strip().lower())
    return cleaned
2 changes: 2 additions & 0 deletions tests/data/csv/minimal_unclean_colnames.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
RT
1
2 changes: 1 addition & 1 deletion tests/data/integration/peaks_with_rt.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Unnamed: 0,mz,rt,retention_index,mz_min,mz_max,intensity.dataset_23283.dat,intensity.dataset_23284.dat,intensity.dataset_23285.dat,intensity.dataset_23286.dat,intensity.dataset_23287.dat,intensity.dataset_23288.dat,intensity.dataset_23289.dat,intensity.dataset_23290.dat,intensity.dataset_23291.dat,intensity.dataset_23292.dat,intensity.dataset_23293.dat,intensity.dataset_23294.dat,intensity.dataset_23295.dat,intensity.dataset_23296.dat,intensity.dataset_23297.dat,intensity.dataset_23298.dat,intensity.dataset_23299.dat,intensity.dataset_23300.dat,intensity.dataset_23301.dat,intensity.dataset_23302.dat,intensity.dataset_23303.dat,intensity.dataset_23304.dat,intensity.dataset_23305.dat,intensity.dataset_23306.dat,intensity.dataset_23307.dat,intensity.dataset_23308.dat,time.dataset_23283.dat,time.dataset_23284.dat,time.dataset_23285.dat,time.dataset_23286.dat,time.dataset_23287.dat,time.dataset_23288.dat,time.dataset_23289.dat,time.dataset_23290.dat,time.dataset_23291.dat,time.dataset_23292.dat,time.dataset_23293.dat,time.dataset_23294.dat,time.dataset_23295.dat,time.dataset_23296.dat,time.dataset_23297.dat,time.dataset_23298.dat,time.dataset_23299.dat,time.dataset_23300.dat,time.dataset_23301.dat,time.dataset_23302.dat,time.dataset_23303.dat,time.dataset_23304.dat,time.dataset_23305.dat,time.dataset_23306.dat,time.dataset_23307.dat,time.dataset_23308.dat
unnamed: 0,mz,rt,retention_index,mz_min,mz_max,intensity.dataset_23283.dat,intensity.dataset_23284.dat,intensity.dataset_23285.dat,intensity.dataset_23286.dat,intensity.dataset_23287.dat,intensity.dataset_23288.dat,intensity.dataset_23289.dat,intensity.dataset_23290.dat,intensity.dataset_23291.dat,intensity.dataset_23292.dat,intensity.dataset_23293.dat,intensity.dataset_23294.dat,intensity.dataset_23295.dat,intensity.dataset_23296.dat,intensity.dataset_23297.dat,intensity.dataset_23298.dat,intensity.dataset_23299.dat,intensity.dataset_23300.dat,intensity.dataset_23301.dat,intensity.dataset_23302.dat,intensity.dataset_23303.dat,intensity.dataset_23304.dat,intensity.dataset_23305.dat,intensity.dataset_23306.dat,intensity.dataset_23307.dat,intensity.dataset_23308.dat,time.dataset_23283.dat,time.dataset_23284.dat,time.dataset_23285.dat,time.dataset_23286.dat,time.dataset_23287.dat,time.dataset_23288.dat,time.dataset_23289.dat,time.dataset_23290.dat,time.dataset_23291.dat,time.dataset_23292.dat,time.dataset_23293.dat,time.dataset_23294.dat,time.dataset_23295.dat,time.dataset_23296.dat,time.dataset_23297.dat,time.dataset_23298.dat,time.dataset_23299.dat,time.dataset_23300.dat,time.dataset_23301.dat,time.dataset_23302.dat,time.dataset_23303.dat,time.dataset_23304.dat,time.dataset_23305.dat,time.dataset_23306.dat,time.dataset_23307.dat,time.dataset_23308.dat
3835,99.03162030799956,142.67379366183633,1185.1133031516015,99.03161686937968,99.03162445630808,0.0,0.0,0.0,0.0,1124023.5414198886,0.0,0.0,0.0,302811.79369835946,0.0,0.0,0.0,0.0,0.0,0.0,689466.2692573667,0.0,0.0,0.0,0.0,0.0,1185293.3404002658,0.0,0.0,0.0,0.0,,,,,140.7677080635659,,,,148.14892015326652,,,,,,,160.31743543051294,,,,,,121.461111,,,,
798,169.0314087180851,143.0311614429152,1186.8150544900723,169.03127206136165,169.0316819389772,0.0,0.0,0.0,0.0,305244.92965288245,0.0,0.0,12538844.289546728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,668804.8639133602,0.0,0.0,0.0,0.0,,,,,148.17622517287128,,,151.30544042838676,,,,,,,,,,,,,,129.6118187274875,,,,
78,109.00492907059838,143.68682488560978,1189.9372613600465,109.00492721060658,109.0049302044596,0.0,0.0,0.0,0.0,387414.4182122475,463239.19886551664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,462236.3507863071,0.0,0.0,0.0,492413.733649669,399820.6922610059,0.0,0.0,0.0,0.0,,,,,143.77939636976086,149.54889043214672,,,,,,,,,,,148.41804574766948,,,,148.6747370599136,128.0130548185582,,,,
Expand Down
Loading

0 comments on commit 4c049ff

Please sign in to comment.