diff --git a/spoc/cli.py b/spoc/cli.py
index 46f3afc..88a0d1c 100644
--- a/spoc/cli.py
+++ b/spoc/cli.py
@@ -22,7 +22,15 @@ def main():
     help="Number of fragments per read to expand",
 )
 def expand(fragments_path, expanded_contacts_path, n_fragments):
-    """Script for expanding labelled fragments to contacts"""
+    """
+    Script for expanding labelled fragments to contacts
+
+    Args:
+        fragments_path (str): Path to the labelled fragments file.
+        expanded_contacts_path (str): Path to the output contacts file.
+        n_fragments (int, optional): Number of fragments per read to expand. Defaults to 3.
+
+    """
     expander = FragmentExpander(number_fragments=n_fragments)
     file_manager = FileManager()
     input_fragments = file_manager.load_fragments(fragments_path)
@@ -35,7 +43,14 @@ def expand(fragments_path, expanded_contacts_path, n_fragments):
 @click.argument("label_library_path")
 @click.argument("labelled_fragments_path")
 def annotate(fragments_path, label_library_path, labelled_fragments_path):
-    """Script for annotating porec fragments"""
+    """Script for annotating porec fragments
+
+    Args:
+        fragments_path (str): Path to the input fragments file.
+        label_library_path (str): Path to the label library file.
+        labelled_fragments_path (str): Path to the output labelled fragments file.
+
+    """
     file_manager = FileManager()
     label_library = file_manager.load_label_library(label_library_path)
     annotator = FragmentAnnotator(label_library)
@@ -55,7 +70,15 @@ def bin_contacts(
     bin_size,
     same_chromosome,
 ):
-    """Script for binning contacts"""
+    """Script for binning contacts
+
+    Args:
+        contact_path (str): Path to the input contact file.
+        pixel_path (str): Path to the output pixel file.
+        bin_size (int, optional): Size of the bins. Defaults to 10000.
+        same_chromosome (bool, optional): Only bin contacts on the same chromosome. Defaults to False.
+
+    """
     # load data from disk
     file_manager = FileManager(use_dask=True)
     contacts = file_manager.load_contacts(contact_path)
@@ -75,7 +98,12 @@ def merge():
 @click.argument("contact_paths", nargs=-1)
 @click.option("-o", "--output", help="output path")
 def merge_contacts(contact_paths, output):
-    """Functionality to merge annotated fragments"""
+    """Functionality to merge annotated contacts
+
+    Args:
+        contact_paths (tuple): Paths to the input contact files.
+        output (str, optional): Path to the output merged contact file.
+    """
     file_manager = FileManager(use_dask=True)
     manipulator = ContactManipulator()
     contact_files = [file_manager.load_contacts(path) for path in contact_paths]
diff --git a/spoc/contacts.py b/spoc/contacts.py
index 4d68f83..d7b056e 100644
--- a/spoc/contacts.py
+++ b/spoc/contacts.py
@@ -11,7 +11,25 @@
 
 
 class Contacts:
-    """N-way genomic contacts"""
+    """N-way genomic contacts
+
+    Args:
+        contact_frame (DataFrame): DataFrame containing the contact data.
+        number_fragments (int, optional): Number of fragments. Defaults to None.
+        metadata_combi (List[str], optional): List of metadata combinations. Defaults to None.
+        label_sorted (bool, optional): Whether the labels are sorted. Defaults to False.
+        binary_labels_equal (bool, optional): Whether the binary labels are equal. Defaults to False.
+        symmetry_flipped (bool, optional): Whether the symmetry is flipped. Defaults to False.
+
+    Attributes:
+        contains_metadata (bool): Whether the contact data contains metadata.
+        number_fragments (int): Number of fragments.
+        is_dask (bool): Whether the contact data is a Dask DataFrame.
+        metadata_combi (List[str]): List of metadata combinations.
+        label_sorted (bool): Whether the labels are sorted.
+        binary_labels_equal (bool): Whether the binary labels are equal.
+        symmetry_flipped (bool): Whether the symmetry is flipped.
+    """
 
     def __init__(
         self,
@@ -88,7 +106,11 @@ def data(self):
 
     @data.setter
     def data(self, contact_frame):
-        """Sets the contact data"""
+        """Sets the contact data
+
+        Args:
+            contact_frame (DataFrame): DataFrame containing the contact data.
+        """
         self._data = self._schema.validate(contact_frame)
 
     def __repr__(self) -> str:
@@ -100,7 +122,14 @@ class ContactManipulator:
     contact data such as merging, splitting and subsetting."""
 
     def merge_contacts(self, merge_list: List[Contacts]) -> Contacts:
-        """Merge contacts"""
+        """Merge contacts
+
+        Args:
+            merge_list (List[Contacts]): List of Contacts objects to merge.
+
+        Returns:
+            Contacts: Merged Contacts object.
+        """
         # validate that merge is possible
         if len({i.number_fragments for i in merge_list}) != 1:
             raise ValueError("All contacts need to have the same order!")
@@ -244,7 +273,14 @@ def _flip_labelled_contacts(
         return result
 
     def sort_labels(self, contacts: Contacts) -> Contacts:
-        """Sorts labels in ascending, alphabetical order"""
+        """Sorts labels in ascending, alphabetical order
+
+        Args:
+            contacts (Contacts): Contacts object to sort.
+
+        Returns:
+            Contacts: Sorted Contacts object.
+        """
         if not contacts.contains_metadata:
             raise ValueError(
                 "Sorting labels for unlabelled contacts is not implemented."
@@ -329,12 +365,22 @@ def _generate_binary_label_mapping(
         return mapping
 
     def equate_binary_labels(self, contacts: Contacts) -> Contacts:
-        """Binary labels often only carry information about whether
+        """
+        Equate binary labels.
+
+        Binary labels often only carry information about whether
         they happen between the same or different fragments.
         This method equates these labels be replacing all
         equivalent binary labels with the alphabetically first
         label. For example, if we have a contact between two
         fragments that are labelled B and B, the label will
         be replaced with AA.
+
+        Args:
+            contacts (Contacts): Contacts object to equate binary labels.
+
+        Returns:
+            Contacts: Contacts object with equated binary labels.
+        """
         assert contacts.contains_metadata, "Contacts do not contain metadata!"
         if not contacts.label_sorted:
@@ -376,7 +422,16 @@ def equate_binary_labels(self, contacts: Contacts) -> Contacts:
     def subset_on_metadata(
         self, contacts: Contacts, metadata_combi: List[str]
     ) -> Contacts:
-        """Subset contacts based on metadata"""
+        """Subset contacts based on metadata
+
+        Args:
+            contacts (Contacts): Contacts object to subset.
+            metadata_combi (List[str]): List of metadata combinations to subset on.
+
+        Returns:
+            Contacts: Subsetted Contacts object.
+
+        """
         # check if metadata is present
         assert contacts.contains_metadata, "Contacts do not contain metadata!"
         # check if metadata_combi has the correct length
@@ -406,7 +461,16 @@ def subset_on_metadata(
     def flip_symmetric_contacts(
         self, contacts: Contacts, sort_chromosomes: bool = False
     ) -> Contacts:
-        """Flips contacts based on inherent symmetry"""
+        """Flips contacts based on inherent symmetry
+
+        Args:
+            contacts (Contacts): Contacts object to flip symmetric contacts.
+            sort_chromosomes (bool, optional): Whether to sort chromosomes. Defaults to False.
+
+        Returns:
+            Contacts: Contacts object with flipped symmetric contacts.
+
+        """
         if contacts.contains_metadata:
             if not contacts.label_sorted:
                 contacts = self.sort_labels(contacts)
diff --git a/spoc/dataframe_models.py b/spoc/dataframe_models.py
index 004fce2..6d851ae 100644
--- a/spoc/dataframe_models.py
+++ b/spoc/dataframe_models.py
@@ -34,7 +34,12 @@
 
 
 class ContactSchema:
-    """Dynamic schema for N-way contacts"""
+    """Dynamic schema for N-way contacts
+
+    Args:
+        number_fragments (int, optional): Number of fragments. Defaults to 3.
+        contains_metadata (bool, optional): Whether the contact data contains metadata. Defaults to True.
+    """
 
     # field groups
 
@@ -69,7 +74,14 @@ def __init__(
 
     @classmethod
    def get_contact_fields(cls, contains_metadata: bool) -> Dict:
-        """returns contact fields"""
+        """returns contact fields
+
+        Args:
+            contains_metadata (bool): Whether the contact data contains metadata.
+
+        Returns:
+            Dict: Dictionary containing the contact fields.
+        """
         if contains_metadata:
             return copy.deepcopy(cls.contact_fields)
         return {
@@ -90,7 +102,11 @@ def _expand_contact_fields(
 
     def validate_header(self, data_frame: DataFrame) -> None:
         """Validates only header, needed to validate that dask taskgraph can be built before
-        evaluation"""
+        evaluation.
+
+        Args:
+            data_frame (DataFrame): The DataFrame to validate.
+        """
         for column in data_frame.columns:
             if column not in self._schema.columns:
                 raise pa.errors.SchemaError(
@@ -98,13 +114,22 @@
             )
 
     def validate(self, data_frame: DataFrame) -> DataFrame:
-        """Validate multiway contact dataframe"""
+        """Validate multiway contact dataframe
+
+        Args:
+            data_frame (DataFrame): The DataFrame to validate.
+        """
         self.validate_header(data_frame)
         return self._schema.validate(data_frame)
 
 
 class PixelSchema:
-    """Dynamic schema for N-way pixels"""
+    """Dynamic schema for N-way pixels
+
+    Args:
+        number_fragments (int, optional): Number of fragments. Defaults to 3.
+        same_chromosome (bool, optional): Whether the fragments are on the same chromosome. Defaults to True.
+    """
 
     def __init__(self, number_fragments: int = 3, same_chromosome: bool = True) -> None:
         self._number_fragments = number_fragments
@@ -146,7 +171,11 @@ def _expand_contact_fields(self, expansions: Iterable = (1, 2, 3)) -> dict:
 
     def validate_header(self, data_frame: DataFrame) -> None:
         """Validates only header, needed to validate that dask taskgraph can be built before
-        evaluation"""
+        evaluation
+
+        Args:
+            data_frame (DataFrame): The DataFrame to validate.
+        """
         for column in data_frame.columns:
             if column not in self._schema.columns:
                 raise pa.errors.SchemaError(
@@ -154,5 +183,10 @@
             )
 
     def validate(self, data_frame: DataFrame) -> DataFrame:
-        """Validate multiway contact dataframe"""
+        """Validate multiway contact dataframe
+
+        Args:
+            data_frame (DataFrame): The DataFrame to validate.
+
+        """
         return self._schema.validate(data_frame)
diff --git a/spoc/fragments.py b/spoc/fragments.py
index 0d0581e..dba2eaf 100644
--- a/spoc/fragments.py
+++ b/spoc/fragments.py
@@ -12,31 +12,51 @@
 
 
 class Fragments:
-    """Genomic fragments that can be labelled or not"""
+    """Genomic fragments that can be labelled or not.
+
+    Args:
+        fragment_frame (DataFrame): DataFrame containing the fragment data.
+    """
 
     def __init__(self, fragment_frame: DataFrame) -> None:
         self._data = FragmentSchema.validate(fragment_frame)
         self._contains_metadata = "metadata" in fragment_frame.columns
 
     @property
-    def data(self):
-        """Returns the underlying dataframe"""
+    def data(self) -> DataFrame:
+        """Returns the underlying dataframe.
+
+        Returns:
+            DataFrame: Fragment data.
+        """
         return self._data
 
     @property
-    def contains_metadata(self):
-        """Returns whether the dataframe contains metadata"""
+    def contains_metadata(self) -> bool:
+        """Returns whether the dataframe contains metadata.
+
+        Returns:
+            bool: Whether the fragment data contains metadata.
+        """
         return self._contains_metadata
 
     @property
-    def is_dask(self):
-        """Returns whether the underlying dataframe is dask"""
+    def is_dask(self) -> bool:
+        """Returns whether the underlying dataframe is dask.
+
+        Returns:
+            bool: Whether the underlying dataframe is a dask dataframe.
+
+        """
         return isinstance(self._data, dd.DataFrame)
 
 
 # TODO: make generic such that label library can hold arbitrary information
 class FragmentAnnotator:
-    """Responsible for annotating labels and sister identity of mapped read fragments"""
+    """Responsible for annotating labels and sister identity of mapped read fragments.
+
+    Args:
+        label_library (Dict[str, bool]): Dictionary containing the label library.
+    """
 
     def __init__(self, label_library: Dict[str, bool]) -> None:
         self._label_library = label_library
@@ -72,7 +92,15 @@ def _assign_label_state(self, data_frame: pd.DataFrame) -> pd.Series:
     def annotate_fragments(self, fragments: Fragments) -> Fragments:
         """Takes fragment dataframe and returns a copy of it with its labelling state in a separate
         column with name `is_labelled`. If drop_uninformative is true, drops fragments that
-        are not in label library."""
+        are not in label library.
+
+        Args:
+            fragments (Fragments): Fragments object containing the fragment data.
+
+        Returns:
+            Fragments: Fragments object with annotated fragment data.
+
+        """
         return Fragments(
             fragments.data.assign(is_labelled=self._assign_label_state)
             .dropna(subset=["is_labelled"])
@@ -83,7 +111,13 @@
 
 class FragmentExpander:
     """Expands n-way fragments over sequencing reads
-    to yield contacts."""
+    to yield contacts.
+
+    Args:
+        number_fragments (int): Number of fragments.
+        contains_metadata (bool, optional): Whether the fragment data contains metadata. Defaults to True.
+
+    """
 
     def __init__(self, number_fragments: int, contains_metadata: bool = True) -> None:
         self._number_fragments = number_fragments
@@ -127,7 +161,14 @@ def _expand_single_read(
         return pd.DataFrame(result)
 
     def expand(self, fragments: Fragments) -> Contacts:
-        """expand contacts n-ways"""
+        """expand contacts n-ways
+
+        Args:
+            fragments (Fragments): Fragments object containing the fragment data.
+
+        Returns:
+            Contacts: Contacts object containing the expanded contact data.
+        """
         # construct dataframe type specific kwargs
         if fragments.is_dask:
             kwargs = dict(meta=self._get_expansion_output_structure())
diff --git a/spoc/io.py b/spoc/io.py
index c9e9eb2..3b07c8f 100644
--- a/spoc/io.py
+++ b/spoc/io.py
@@ -14,11 +14,16 @@
 from spoc.contacts import Contacts
 from spoc.pixels import Pixels
 from spoc.file_parameter_models import ContactsParameters, PixelParameters
+from spoc.dataframe_models import DataFrame
 from spoc.fragments import Fragments
 
 
 class FileManager:
-    """Is responsible for loading and writing files"""
+    """Is responsible for loading and writing files
+
+    Args:
+        use_dask (bool, optional): Whether to use Dask for reading Parquet files. Defaults to False.
+    """
 
     def __init__(self, use_dask: bool = False) -> None:
         if use_dask:
@@ -68,30 +73,71 @@ def _load_parquet_global_parameters(path: str) -> BaseModel:
 
     @staticmethod
     def write_label_library(path: str, data: Dict[str, bool]) -> None:
-        """Writes label library to file"""
+        """Writes label library to file
+
+        Args:
+            path (str): Path to write the file to.
+            data (Dict[str, bool]): Label library data.
+
+        Returns:
+            None
+        """
         with open(path, "wb") as handle:
             pickle.dump(data, handle)
 
     @staticmethod
-    def load_label_library(path: str):
-        """Load label library"""
+    def load_label_library(path: str) -> Dict:
+        """Load label library
+
+        Args:
+            path (str): Path to the label library file.
+
+        Returns:
+            Dict: Label library data.
+        """
         with open(path, "rb") as handle:
             label_library = pickle.load(handle)
         return label_library
 
-    def load_fragments(self, path: str):
-        """Load annotated fragments"""
+    def load_fragments(self, path: str) -> Fragments:
+        """Load annotated fragments
+
+        Args:
+            path (str): Path to the fragments file.
+
+        Returns:
+            Fragments: Fragments object containing the fragment data.
+
+        """
         data = self._parquet_reader_func(path)
         return Fragments(data)
 
     @staticmethod
     def write_fragments(path: str, fragments: Fragments) -> None:
-        """Write annotated fragments"""
+        """Write annotated fragments
+
+        Args:
+            path (str): Path to write the file to.
+            fragments (Fragments): Fragments object containing the fragment data.
+
+        Returns:
+            None
+
+        """
         # Write fragments
         fragments.data.to_parquet(path, row_group_size=1024 * 1024)
 
     def write_multiway_contacts(self, path: str, contacts: Contacts) -> None:
-        """Write multiway contacts"""
+        """Write multiway contacts
+
+        Args:
+            path (str): Path to write the file to.
+            contacts (Contacts): Contacts object containing the contact data.
+
+        Returns:
+            None
+
+        """
         if contacts.is_dask:
             self._write_parquet_dask(
                 path, contacts.data, contacts.get_global_parameters()
@@ -104,7 +150,16 @@ def write_multiway_contacts(self, path: str, contacts: Contacts) -> None:
     def load_contacts(
         self, path: str, global_parameters: Optional[ContactsParameters] = None
     ) -> Contacts:
-        """Load multiway contacts"""
+        """Load multiway contacts
+
+        Args:
+            path (str): Path to the contacts file.
+            global_parameters (Optional[ContactsParameters], optional): Global parameters. Defaults to None.
+
+        Returns:
+            Contacts: Contacts object containing the contact data.
+
+        """
         if global_parameters is None:
             global_parameters = self._load_parquet_global_parameters(path)
         else:
@@ -113,7 +168,14 @@
 
     @staticmethod
     def load_chromosome_sizes(path: str):
-        """Load chromosome sizes"""
+        """Load chromosome sizes
+
+        Args:
+            path (str): Path to the chromosome sizes file.
+
+        Returns:
+            pd.DataFrame: DataFrame containing the chromosome sizes.
+        """
         # TODO: validate schema for this
         return pd.read_csv(
             path,
@@ -137,7 +199,14 @@ def _load_pixel_metadata(path: str):
 
     @staticmethod
     def list_pixels(path: str):
-        """List available pixels"""
+        """List available pixels
+
+        Args:
+            path (str): Path to the pixel data.
+
+        Returns:
+            List[PixelParameters]: List of PixelParameters objects.
+        """
         # read metadata.json
         metadata = FileManager._load_pixel_metadata(path)
         # instantiate pixel parameters
@@ -149,7 +218,17 @@ def load_pixels(
     ) -> Pixels:
         """Loads specific pixels instance based on global parameters.
         load_dataframe specifies whether the dataframe should be loaded, or whether pixels
-        should be instantiated based on the path alone."""
+        should be instantiated based on the path alone.
+
+        Args:
+            path (str): Path to the pixel data.
+            global_parameters (PixelParameters): Global parameters.
+            load_dataframe (bool, optional): Whether to load the dataframe. Defaults to True.
+
+        Returns:
+            Pixels: Pixels object containing the pixel data.
+
+        """
         metadata = self._load_pixel_metadata(path)
         # find matching pixels
         for pixel_path, value in metadata.items():
@@ -173,7 +252,16 @@ def _get_pixel_hash_path(path: str, pixels: Pixels) -> str:
         return md5(hash_string.encode()).hexdigest() + ".parquet"
 
     def write_pixels(self, path: str, pixels: Pixels) -> None:
-        """Write pixels"""
+        """Write pixels
+
+        Args:
+            path (str): Path to write the pixel data to.
+            pixels (Pixels): Pixels object containing the pixel data.
+
+        Returns:
+            None
+
+        """
         # check whether path exists
         metadata_path = Path(path) / "metadata.json"
         if not Path(path).exists():
diff --git a/spoc/pixels.py b/spoc/pixels.py
index bd85373..89cc687 100644
--- a/spoc/pixels.py
+++ b/spoc/pixels.py
@@ -4,7 +4,7 @@
 from typing import Union, Optional, List
 import pandas as pd
 import dask.dataframe as dd
-from spoc.dataframe_models import PixelSchema
+from spoc.dataframe_models import PixelSchema, DataFrame
 from spoc.file_parameter_models import PixelParameters
 from spoc.contacts import Contacts
 
@@ -22,6 +22,17 @@ class Pixels:
     - pandas dataframe
     - dask dataframe
     - path to a parquet file
+
+
+    Args:
+        pixel_source (Union[pd.DataFrame, dd.DataFrame, str]): The source of the pixel data.
+        number_fragments (Optional[int], optional): The number of fragments. Defaults to None.
+        binsize (Optional[int], optional): The bin size. Defaults to None.
+        metadata_combi (Optional[List[str]], optional): The metadata combination. Defaults to None.
+        label_sorted (bool, optional): Whether the labels are sorted. Defaults to False.
+        binary_labels_equal (bool, optional): Whether binary labels are equal. Defaults to False.
+        symmetry_flipped (bool, optional): Whether the pixels are symmetry flipped. Defaults to False.
+        same_chromosome (bool, optional): Whether the pixels are on the same chromosome. Defaults to True.
     """
 
     def __init__(
@@ -69,6 +80,15 @@ def from_uri(uri, mode="path"):
         and will be tried to match to the available pixels. If no match is found, or there is no
         uniue match, an error is raised. Mode can be one of pandas|dask|path, which corresponds
         to the type of the pixel source.
+
+
+        Args:
+            uri (str): The URI to construct the pixels from.
+            mode (str, optional): The mode to use. Defaults to "path".
+
+        Returns:
+            Pixels: The constructed pixels.
+
         """
         # import here to avoid circular imports
         # pylint: disable=import-outside-toplevel
@@ -123,7 +143,11 @@ def from_uri(uri, mode="path"):
         )
 
     def get_global_parameters(self):
-        """Returns global parameters of pixels"""
+        """Returns global parameters of pixels
+
+        Returns:
+            PixelParameters: The global parameters of the pixels.
+        """
         return PixelParameters(
             number_fragments=self._number_fragments,
             binsize=self._binsize,
@@ -135,50 +159,90 @@
 
     @property
-    def path(self):
-        """Returns path of pixels"""
+    def path(self) -> str:
+        """Returns path of pixels
+
+        Returns:
+            str: The path of the pixels.
+        """
         return self._path
 
     @property
-    def data(self):
-        """Returns pixels as dataframe"""
+    def data(self) -> DataFrame:
+        """Returns pixels as dataframe
+
+        Returns:
+            DataFrame: The pixels as a dataframe.
+
+        """
         return self._data
 
     @property
-    def number_fragments(self):
-        """Returns number of fragments in pixels"""
+    def number_fragments(self) -> int:
+        """Returns number of fragments in pixels
+
+        Returns:
+            int: The number of fragments in the pixels.
+        """
         return self._number_fragments
 
     @property
-    def binsize(self):
-        """Returns binsize of pixels"""
+    def binsize(self) -> int:
+        """Returns binsize of pixels
+
+        Returns:
+            int: The binsize of the pixels.
+        """
         return self._binsize
 
     @property
-    def binary_labels_equal(self):
-        """Returns whether binary labels are equal"""
+    def binary_labels_equal(self) -> bool:
+        """Returns whether binary labels are equal
+
+        Returns:
+            bool: Whether binary labels are equal.
+        """
         return self._binary_labels_equal
 
     @property
-    def symmetry_flipped(self):
-        """Returns whether pixels are symmetry flipped"""
+    def symmetry_flipped(self) -> bool:
+        """Returns whether pixels are symmetry flipped
+
+        Returns:
+            bool: Whether pixels are symmetry flipped.
+        """
         return self._symmetry_flipped
 
     @property
-    def metadata_combi(self):
-        """Returns metadata combination of pixels"""
+    def metadata_combi(self) -> Optional[List[str]]:
+        """Returns metadata combination of pixels
+
+        Returns:
+            Optional[List[str]]: The metadata combination of the pixels.
+        """
         return self._metadata_combi
 
     @property
-    def same_chromosome(self):
-        """Returns whether pixels are on same chromosome"""
+    def same_chromosome(self) -> bool:
+        """Returns whether pixels are on same chromosome
+
+
+        Returns:
+            bool: Whether pixels are on same chromosome.
+
+        """
         return self._same_chromosome
 
 
 class GenomicBinner:
     """Bins higher order contacts into genomic bins of fixed size.
     Is capable of sorting genomic bins along columns based on sister chromatid
-    identity"""
+    identity
+
+    Args:
+        bin_size (int): The size of the genomic bins.
+
+    """
 
     def __init__(self, bin_size: int) -> None:
         self._bin_size = bin_size
@@ -221,8 +285,17 @@ def _assign_midpoints(self, contacts: dd.DataFrame) -> dd.DataFrame:
 
     def bin_contacts(
         self, contacts: Contacts, same_chromosome: bool = True
-    ) -> dd.DataFrame:
-        """Bins genomic contacts"""
+    ) -> Pixels:
+        """Bins genomic contacts
+
+        Args:
+            contacts (Contacts): The genomic contacts to bin.
+            same_chromosome (bool, optional): Whether to only retain pixels on the same chromosome. Defaults to True.
+
+        Returns:
+            Pixels: The binned genomic pixels.
+
+        """
         self._contact_order = contacts.number_fragments
         contacts_w_midpoints = self._assign_midpoints(contacts.data)
        if contacts.is_dask: