Skip to content

Commit

Permalink
added google style docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
Mittmich committed Oct 7, 2023
1 parent 1f8eaab commit 26d1f2a
Show file tree
Hide file tree
Showing 6 changed files with 391 additions and 63 deletions.
36 changes: 32 additions & 4 deletions spoc/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,15 @@ def main():
help="Number of fragments per read to expand",
)
def expand(fragments_path, expanded_contacts_path, n_fragments):
"""Script for expanding labelled fragments to contacts"""
"""
Script for expanding labelled fragments to contacts
Args:
fragments_path (str): Path to the labelled fragments file.
expanded_contacts_path (str): Path to the output contacts file.
n_fragments (int, optional): Number of fragments per read to expand. Defaults to 3.
"""
expander = FragmentExpander(number_fragments=n_fragments)
file_manager = FileManager()
input_fragments = file_manager.load_fragments(fragments_path)
Expand All @@ -35,7 +43,14 @@ def expand(fragments_path, expanded_contacts_path, n_fragments):
@click.argument("label_library_path")
@click.argument("labelled_fragments_path")
def annotate(fragments_path, label_library_path, labelled_fragments_path):
"""Script for annotating porec fragments"""
"""Script for annotating porec fragments
Args:
fragments_path (str): Path to the input fragments file.
label_library_path (str): Path to the label library file.
labelled_fragments_path (str): Path to the output labelled fragments file.
"""
file_manager = FileManager()
label_library = file_manager.load_label_library(label_library_path)
annotator = FragmentAnnotator(label_library)
Expand All @@ -55,7 +70,15 @@ def bin_contacts(
bin_size,
same_chromosome,
):
"""Script for binning contacts"""
"""Script for binning contacts
Args:
contact_path (str): Path to the input contact file.
pixel_path (str): Path to the output pixel file.
bin_size (int, optional): Size of the bins. Defaults to 10000.
same_chromosome (bool, optional): Only bin contacts on the same chromosome. Defaults to False.
"""
# load data from disk
file_manager = FileManager(use_dask=True)
contacts = file_manager.load_contacts(contact_path)
Expand All @@ -75,7 +98,12 @@ def merge():
@click.argument("contact_paths", nargs=-1)
@click.option("-o", "--output", help="output path")
def merge_contacts(contact_paths, output):
"""Functionality to merge annotated fragments"""
"""Functionality to merge annotated fragments
Args:
contact_paths (tuple): Paths to the input contact files.
output (str, optional): Path to the output merged contact file.
"""
file_manager = FileManager(use_dask=True)
manipulator = ContactManipulator()
contact_files = [file_manager.load_contacts(path) for path in contact_paths]
Expand Down
78 changes: 71 additions & 7 deletions spoc/contacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,25 @@


class Contacts:
"""N-way genomic contacts"""
"""N-way genomic contacts
Args:
contact_frame (DataFrame): DataFrame containing the contact data.
number_fragments (int, optional): Number of fragments. Defaults to None.
metadata_combi (List[str], optional): List of metadata combinations. Defaults to None.
label_sorted (bool, optional): Whether the labels are sorted. Defaults to False.
binary_labels_equal (bool, optional): Whether the binary labels are equal. Defaults to False.
symmetry_flipped (bool, optional): Whether the symmetry is flipped. Defaults to False.
Attributes:
contains_metadata (bool): Whether the contact data contains metadata.
number_fragments (int): Number of fragments.
is_dask (bool): Whether the contact data is a Dask DataFrame.
metadata_combi (List[str]): List of metadata combinations.
label_sorted (bool): Whether the labels are sorted.
binary_labels_equal (bool): Whether the binary labels are equal.
symmetry_flipped (bool): Whether the symmetry is flipped.
"""

def __init__(
self,
Expand Down Expand Up @@ -88,7 +106,11 @@ def data(self):

@data.setter
def data(self, contact_frame):
"""Sets the contact data"""
"""Sets the contact data
Args:
contact_frame (DataFrame): DataFrame containing the contact data.
"""
self._data = self._schema.validate(contact_frame)

def __repr__(self) -> str:
Expand All @@ -100,7 +122,14 @@ class ContactManipulator:
contact data such as merging, splitting and subsetting."""

def merge_contacts(self, merge_list: List[Contacts]) -> Contacts:
"""Merge contacts"""
"""Merge contacts
Args:
merge_list (List[Contacts]): List of Contacts objects to merge.
Returns:
Contacts: Merged Contacts object.
"""
# validate that merge is possible
if len({i.number_fragments for i in merge_list}) != 1:
raise ValueError("All contacts need to have the same order!")
Expand Down Expand Up @@ -244,7 +273,14 @@ def _flip_labelled_contacts(
return result

def sort_labels(self, contacts: Contacts) -> Contacts:
"""Sorts labels in ascending, alphabetical order"""
"""Sorts labels in ascending, alphabetical order
Args:
contacts (Contacts): Contacts object to sort.
Returns:
Contacts: Sorted Contacts object.
"""
if not contacts.contains_metadata:
raise ValueError(
"Sorting labels for unlabelled contacts is not implemented."
Expand Down Expand Up @@ -329,12 +365,22 @@ def _generate_binary_label_mapping(
return mapping

def equate_binary_labels(self, contacts: Contacts) -> Contacts:
"""Binary labels often only carry information about whether
"""
Equate binary labels.
Binary labels often only carry information about whether
they happen between the same or different fragments. This
method equates these labels by replacing all equivalent binary labels with
the alphabetically first label.
For example, if we have a contact between two fragments
that are labelled B and B, the label will be replaced with AA.
Args:
contacts (Contacts): Contacts object to equate binary labels.
Returns:
Contacts: Contacts object with equated binary labels.
"""
assert contacts.contains_metadata, "Contacts do not contain metadata!"
if not contacts.label_sorted:
Expand Down Expand Up @@ -376,7 +422,16 @@ def equate_binary_labels(self, contacts: Contacts) -> Contacts:
def subset_on_metadata(
self, contacts: Contacts, metadata_combi: List[str]
) -> Contacts:
"""Subset contacts based on metadata"""
"""Subset contacts based on metadata
Args:
contacts (Contacts): Contacts object to subset.
metadata_combi (List[str]): List of metadata combinations to subset on.
Returns:
Contacts: Subsetted Contacts object.
"""
# check if metadata is present
assert contacts.contains_metadata, "Contacts do not contain metadata!"
# check if metadata_combi has the correct length
Expand Down Expand Up @@ -406,7 +461,16 @@ def subset_on_metadata(
def flip_symmetric_contacts(
self, contacts: Contacts, sort_chromosomes: bool = False
) -> Contacts:
"""Flips contacts based on inherent symmetry"""
"""Flips contacts based on inherent symmetry
Args:
contacts (Contacts): Contacts object to flip symmetric contacts.
sort_chromosomes (bool, optional): Whether to sort chromosomes. Defaults to False.
Returns:
Contacts: Contacts object with flipped symmetric contacts.
"""
if contacts.contains_metadata:
if not contacts.label_sorted:
contacts = self.sort_labels(contacts)
Expand Down
48 changes: 41 additions & 7 deletions spoc/dataframe_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,12 @@


class ContactSchema:
"""Dynamic schema for N-way contacts"""
"""Dynamic schema for N-way contacts
Args:
number_fragments (int, optional): Number of fragments. Defaults to 3.
contains_metadata (bool, optional): Whether the contact data contains metadata. Defaults to True.
"""

# field groups

Expand Down Expand Up @@ -69,7 +74,14 @@ def __init__(

@classmethod
def get_contact_fields(cls, contains_metadata: bool) -> Dict:
"""returns contact fields"""
"""returns contact fields
Args:
contains_metadata (bool): Whether the contact data contains metadata.
Returns:
Dict: Dictionary containing the contact fields.
"""
if contains_metadata:
return copy.deepcopy(cls.contact_fields)
return {
Expand All @@ -90,21 +102,34 @@ def _expand_contact_fields(

def validate_header(self, data_frame: DataFrame) -> None:
"""Validates only header, needed to validate that dask taskgraph can be built before
evaluation"""
evaluation.
Args:
data_frame (DataFrame): The DataFrame to validate.
"""
for column in data_frame.columns:
if column not in self._schema.columns:
raise pa.errors.SchemaError(
self._schema, data_frame, "Header is invalid!"
)

def validate(self, data_frame: DataFrame) -> DataFrame:
"""Validate multiway contact dataframe"""
"""Validate multiway contact dataframe
Args:
data_frame (DataFrame): The DataFrame to validate.
"""
self.validate_header(data_frame)
return self._schema.validate(data_frame)


class PixelSchema:
"""Dynamic schema for N-way pixels"""
"""Dynamic schema for N-way pixels
Args:
number_fragments (int, optional): Number of fragments. Defaults to 3.
same_chromosome (bool, optional): Whether the fragments are on the same chromosome. Defaults to True.
"""

def __init__(self, number_fragments: int = 3, same_chromosome: bool = True) -> None:
self._number_fragments = number_fragments
Expand Down Expand Up @@ -146,13 +171,22 @@ def _expand_contact_fields(self, expansions: Iterable = (1, 2, 3)) -> dict:

def validate_header(self, data_frame: DataFrame) -> None:
"""Validates only header, needed to validate that dask taskgraph can be built before
evaluation"""
evaluation.
Args:
data_frame (DataFrame): The DataFrame to validate.
"""
for column in data_frame.columns:
if column not in self._schema.columns:
raise pa.errors.SchemaError(
self._schema, data_frame, "Header is invalid!"
)

def validate(self, data_frame: DataFrame) -> DataFrame:
"""Validate multiway contact dataframe"""
"""Validate multiway pixel dataframe
Args:
data_frame (DataFrame): The DataFrame to validate.
"""
return self._schema.validate(data_frame)
63 changes: 52 additions & 11 deletions spoc/fragments.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,31 +12,51 @@


class Fragments:
"""Genomic fragments that can be labelled or not"""
"""Genomic fragments that can be labelled or not.
Args:
fragment_frame (DataFrame): DataFrame containing the fragment data.
"""

def __init__(self, fragment_frame: DataFrame) -> None:
self._data = FragmentSchema.validate(fragment_frame)
self._contains_metadata = "metadata" in fragment_frame.columns

@property
def data(self):
"""Returns the underlying dataframe"""
def data(self) -> DataFrame:
"""Returns the underlying dataframe.
Returns:
DataFrame: Fragment data.
"""
return self._data

@property
def contains_metadata(self):
"""Returns whether the dataframe contains metadata"""
def contains_metadata(self) -> bool:
"""Returns whether the dataframe contains metadata.
Returns:
bool: Whether the fragment data contains metadata.
"""
return self._contains_metadata

@property
def is_dask(self):
"""Returns whether the underlying dataframe is dask"""
def is_dask(self) -> bool:
"""Returns whether the underlying dataframe is dask.
Returns:
bool: Whether the underlying dataframe is a dask dataframe.
"""
return isinstance(self._data, dd.DataFrame)


# TODO: make generic such that label library can hold arbitrary information
class FragmentAnnotator:
"""Responsible for annotating labels and sister identity of mapped read fragments"""
"""Responsible for annotating labels and sister identity of mapped read fragments.
Args:
label_library (Dict[str, bool]): Dictionary containing the label library.
"""

def __init__(self, label_library: Dict[str, bool]) -> None:
self._label_library = label_library
Expand Down Expand Up @@ -72,7 +92,15 @@ def _assign_label_state(self, data_frame: pd.DataFrame) -> pd.Series:
def annotate_fragments(self, fragments: Fragments) -> Fragments:
"""Takes fragment dataframe and returns a copy of it with its labelling state in a separate
column with name `is_labelled`. If drop_uninformative is true, drops fragments that
are not in label library."""
are not in label library.
Args:
fragments (Fragments): Fragments object containing the fragment data.
Returns:
Fragments: Fragments object with annotated fragment data.
"""
return Fragments(
fragments.data.assign(is_labelled=self._assign_label_state)
.dropna(subset=["is_labelled"])
Expand All @@ -83,7 +111,13 @@ def annotate_fragments(self, fragments: Fragments) -> Fragments:

class FragmentExpander:
"""Expands n-way fragments over sequencing reads
to yield contacts."""
to yield contacts.
Args:
number_fragments (int): Number of fragments.
contains_metadata (bool, optional): Whether the fragment data contains metadata. Defaults to True.
"""

def __init__(self, number_fragments: int, contains_metadata: bool = True) -> None:
self._number_fragments = number_fragments
Expand Down Expand Up @@ -127,7 +161,14 @@ def _expand_single_read(
return pd.DataFrame(result)

def expand(self, fragments: Fragments) -> Contacts:
"""expand contacts n-ways"""
"""expand contacts n-ways
Args:
fragments (Fragments): Fragments object containing the fragment data.
Returns:
Contacts: Contacts object containing the expanded contact data.
"""
# construct dataframe type specific kwargs
if fragments.is_dask:
kwargs = dict(meta=self._get_expansion_output_structure())
Expand Down
Loading

0 comments on commit 26d1f2a

Please sign in to comment.