From a1cad4887bedd0dd474c73416a66b69477337f11 Mon Sep 17 00:00:00 2001 From: Robert McArthur Date: Wed, 24 Apr 2024 14:05:38 +1000 Subject: [PATCH 1/5] ENH: support block size param in seqcoll.to_fasta --- src/cogent3/core/alignment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py index e72883d40..084430428 100644 --- a/src/cogent3/core/alignment.py +++ b/src/cogent3/core/alignment.py @@ -984,9 +984,9 @@ def to_json(self): """returns json formatted string""" return json.dumps(self.to_rich_dict()) - def to_fasta(self): + def to_fasta(self, block_size: int = 60): """Return alignment in Fasta format""" - return alignment_to_fasta(self.to_dict()) + return alignment_to_fasta(self.to_dict(), block_size=block_size) def to_nexus(self, seq_type, wrap=50): """ From 17f057d04d50098f36204ab912d5694fbf6654a6 Mon Sep 17 00:00:00 2001 From: Robert McArthur Date: Wed, 24 Apr 2024 14:31:58 +1000 Subject: [PATCH 2/5] DOC: improve docstrings and add type hinting --- src/cogent3/core/alignment.py | 28 ++++++++++++++---- src/cogent3/format/fasta.py | 53 ++++++++++++++++++++++++++++------- 2 files changed, 66 insertions(+), 15 deletions(-) diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py index 084430428..dcfe5d81e 100644 --- a/src/cogent3/core/alignment.py +++ b/src/cogent3/core/alignment.py @@ -984,8 +984,20 @@ def to_json(self): """returns json formatted string""" return json.dumps(self.to_rich_dict()) - def to_fasta(self, block_size: int = 60): - """Return alignment in Fasta format""" + def to_fasta(self, block_size: int = 60) -> str: + """Return alignment in Fasta format. + + Parameters + ---------- + block_size : int, optional + the sequence length to write to each line, + by default 60 + + Returns + ------- + str + The Fasta formatted alignment. 
+ """ return alignment_to_fasta(self.to_dict(), block_size=block_size) def to_nexus(self, seq_type, wrap=50): @@ -1192,10 +1204,16 @@ def get_seq(self, seqname): """Return a sequence object for the specified seqname.""" return self.named_seqs[seqname] - def to_dict(self): - """Returns the alignment as dict of names -> strings. + def to_dict(self) -> dict[str, str]: + """Returns the alignment as a dict of sequence names -> strings. + + Note the mapping goes to strings, not Sequence objects. - Note: returns strings, NOT Sequence objects. + Returns + ------- + dict[str, str] + a dict mapping sequence names to a string representation of + their sequences. """ align_dict = {} diff --git a/src/cogent3/format/fasta.py b/src/cogent3/format/fasta.py index 0a4ebf216..ab30d4b65 100644 --- a/src/cogent3/format/fasta.py +++ b/src/cogent3/format/fasta.py @@ -2,31 +2,64 @@ """Writer for FASTA sequence format """ +from typing import Optional + from cogent3.format.util import _AlignmentFormatter -def alignment_to_fasta(alignment_dict, block_size=60, order=None): - """Returns a Fasta string given an alignment.""" +def alignment_to_fasta( + alignment_dict: dict[str, str], + block_size: int = 60, + order: Optional[list[str]] = None, +) -> str: + """Returns a Fasta string given an alignment. + + Parameters + ---------- + alignment_dict : dict[str, str] + dict of seq_name + block_size : int, optional + the sequence length to write to each line, + by default 60 + order : Optional[list[str]], optional + optional list of sequence names, which order to print in. + Assumes complete and correct list of names, + by default None + + Returns + ------- + str + The alignment in the Fasta format. 
+ """ order = order or [] return FastaFormatter().format(alignment_dict, block_size, order) class FastaFormatter(_AlignmentFormatter): - def format(self, alignment_dict, block_size, order): + def format( + self, + alignment_dict: dict[str, str], + block_size: int = 60, + order: Optional[list[str]] = None, + ) -> str: """Format the alignment to Fasta. Parameters ---------- - alignment_dict + alignment_dict : dict[str, str] dict of seq_name - block_size + block_size : int, optional the sequence length to write to each line, - default is 60 - order - optional list of sequence names, which order to - print in. + by default 60 + order : Optional[list[str]], optional + optional list of sequence names, which order to print in. + Assumes complete and correct list of names, + by default None - (Assumes complete and correct list of names) + Returns + ------- + str + The alignment in the Fasta format. """ # setup if not order: From 9e4e37cdf10900ba7379198f9bbe5eca81196e45 Mon Sep 17 00:00:00 2001 From: Robert McArthur Date: Wed, 24 Apr 2024 15:31:00 +1000 Subject: [PATCH 3/5] DOC: update typing --- src/cogent3/core/alignment.py | 6 +++--- src/cogent3/format/fasta.py | 18 ++++++++---------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py index dcfe5d81e..c30ba463f 100644 --- a/src/cogent3/core/alignment.py +++ b/src/cogent3/core/alignment.py @@ -69,6 +69,7 @@ from cogent3.core.info import Info as InfoClass from cogent3.core.profile import PSSM, MotifCountsArray from cogent3.core.sequence import ArraySequence, Sequence, frac_same + # which is a circular import otherwise. from cogent3.format.alignment import save_to_filename from cogent3.format.fasta import alignment_to_fasta @@ -1211,9 +1212,8 @@ def to_dict(self) -> dict[str, str]: Returns ------- - dict[str, str] - a dict mapping sequence names to a string representation of - their sequences. 
+ a dict mapping sequence names to a string representation of + their sequences. """ align_dict = {} diff --git a/src/cogent3/format/fasta.py b/src/cogent3/format/fasta.py index ab30d4b65..28a3bf375 100644 --- a/src/cogent3/format/fasta.py +++ b/src/cogent3/format/fasta.py @@ -16,20 +16,19 @@ def alignment_to_fasta( Parameters ---------- - alignment_dict : dict[str, str] + alignment_dict dict of seq_name - block_size : int, optional + block_size the sequence length to write to each line, by default 60 - order : Optional[list[str]], optional + order optional list of sequence names, which order to print in. Assumes complete and correct list of names, by default None Returns ------- - str - The alignment in the Fasta format. + The alignment in the Fasta format. """ order = order or [] return FastaFormatter().format(alignment_dict, block_size, order) @@ -46,20 +45,19 @@ def format( Parameters ---------- - alignment_dict : dict[str, str] + alignment_dict dict of seq_name - block_size : int, optional + block_size the sequence length to write to each line, by default 60 - order : Optional[list[str]], optional + order optional list of sequence names, which order to print in. Assumes complete and correct list of names, by default None Returns ------- - str - The alignment in the Fasta format. + The alignment in the Fasta format. 
""" # setup if not order: From 1bb75a2bf988924b700ae5c751bcc9a46242cb73 Mon Sep 17 00:00:00 2001 From: Robert McArthur Date: Wed, 24 Apr 2024 16:19:49 +1000 Subject: [PATCH 4/5] DOC: update to_fasta docstring formatting --- src/cogent3/core/alignment.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py index c30ba463f..1bba1b448 100644 --- a/src/cogent3/core/alignment.py +++ b/src/cogent3/core/alignment.py @@ -69,7 +69,6 @@ from cogent3.core.info import Info as InfoClass from cogent3.core.profile import PSSM, MotifCountsArray from cogent3.core.sequence import ArraySequence, Sequence, frac_same - # which is a circular import otherwise. from cogent3.format.alignment import save_to_filename from cogent3.format.fasta import alignment_to_fasta @@ -990,14 +989,13 @@ def to_fasta(self, block_size: int = 60) -> str: Parameters ---------- - block_size : int, optional + block_size the sequence length to write to each line, by default 60 Returns ------- - str - The Fasta formatted alignment. 
""" return alignment_to_fasta(self.to_dict(), block_size=block_size) From 952379abb229a43b769f18e60ef8e446e66e3cdf Mon Sep 17 00:00:00 2001 From: Robert McArthur Date: Wed, 24 Apr 2024 16:27:11 +1000 Subject: [PATCH 5/5] TST: extend to_fasta cases to include block_size param --- tests/test_core/test_alignment.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/test_core/test_alignment.py b/tests/test_core/test_alignment.py index 78070ee2b..dcb2ec9cd 100644 --- a/tests/test_core/test_alignment.py +++ b/tests/test_core/test_alignment.py @@ -413,8 +413,18 @@ def test_to_phylip(self): def test_to_fasta(self): """SequenceCollection should return correct FASTA string""" - aln = self.Class(["AAA", "CCC"]) - self.assertEqual(aln.to_fasta(), ">seq_0\nAAA\n>seq_1\nCCC\n") + aln1 = self.Class(["AAA", "CCC"]) + self.assertEqual(aln1.to_fasta(), ">seq_0\nAAA\n>seq_1\nCCC\n") + self.assertEqual(aln1.to_fasta(block_size=2), ">seq_0\nAA\nA\n>seq_1\nCC\nC\n") + + aln2 = self.Class(["GCATGCAT", "TCAGACGT"]) + self.assertEqual(aln2.to_fasta(), ">seq_0\nGCATGCAT\n>seq_1\nTCAGACGT\n") + self.assertEqual( + aln2.to_fasta(block_size=4), ">seq_0\nGCAT\nGCAT\n>seq_1\nTCAG\nACGT\n" + ) + self.assertEqual( + aln2.to_fasta(block_size=3), ">seq_0\nGCA\nTGC\nAT\n>seq_1\nTCA\nGAC\nGT\n" + ) def test_to_nexus(self): """SequenceCollection should return correct Nexus string format"""