Merge pull request #53 from GavinHuttley/develop

ENH: fleshing out selecting alignments
cogent3 · Jan 1, 2024 · be70222 · be70222
2 parents 7efadfa + fe128e6
commit be70222
Show file tree

Hide file tree

Showing 4 changed files with 305 additions and 63 deletions.
diff --git a/.gitignore b/.gitignore
@@ -22,7 +22,7 @@ doc/draw
 # Installer logs
 pip-log.txt
 
-# Unit test / coverage reports
+# Unit test / coverage / dev tool reports
 .coverage*
 .tox
 .nox
@@ -31,6 +31,7 @@ junit-*.xml
 nosetests.xml
 tests/draw_results
 lcov*.info
+.ruff_cache/*
 
 # Translations
 *.mo

diff --git a/.hgignore b/.hgignore
@@ -34,4 +34,5 @@ junit-*.xml
 doc/draw*
 dist/*
 working/*
-lcov*.info
+lcov*.info
+.ruff_cache/*
diff --git a/src/ensembl_lite/_aligndb.py b/src/ensembl_lite/_aligndb.py
@@ -81,15 +81,18 @@ def _get_block_id(
     ) -> list[int]:
         sql = f"SELECT block_id from {self.table_name} WHERE species = ? AND coord_name = ?"
         values = species, coord_name
-        if start and end:
-            sql = f"{sql} AND start > ? AND end < ?"
-            values += (start, end)
-        elif start:
-            sql = f"{sql} AND start > ?"
-            values += (start,)
-        elif end:
-            sql = f"{sql} AND end < ?"
-            values += (end,)
+        if start is not None and end is not None:
+            # as long as start or end are within the record start/end, it's a match
+            sql = f"{sql} AND ((start <= ? AND ? < end) OR (start <= ? AND ? < end))"
+            values += (start, start, end, end)
+        elif start is not None:
+            # the aligned segment overlaps start
+            sql = f"{sql} AND start <= ? AND ? < end"
+            values += (start, start)
+        elif end is not None:
+            # the aligned segment overlaps end
+            sql = f"{sql} AND start <= ? AND ? < end"
+            values += (end, end)
 
         return self.db.execute(sql, values).fetchall()
 
@@ -115,7 +118,7 @@ def get_records_matching(
             )
         ]
 
-        values = " ".join("?" * len(block_ids))
+        values = ", ".join("?" * len(block_ids))
         sql = f"SELECT * from {self.table_name} WHERE block_id IN ({values})"
         results = defaultdict(list)
         for record in self.db.execute(sql, block_ids).fetchall():
@@ -129,42 +132,109 @@ def get_alignment(
     genomes: dict,
     species: str,
     coord_name: str,
-    start=None,
-    end=None,
-) -> Alignment:
-    """return a cogent3 Alignment"""
+    start: int | None = None,
+    end: int | None = None,
+) -> typing.Generator[Alignment]:
+    """yields cogent3 Alignments"""
     from ensembl_lite.convert import gap_coords_to_seq
 
-    # todo connect to annotation db has been filtered for all records for
-    #  each species that fall within
-    # the coordinates of the records
+    if species not in genomes:
+        raise ValueError(f"unknown species {species!r}")
+
+    if start is not None:  # deal with case where numpy scalars are input
+        start = int(start)
+    if end is not None:
+        end = int(end)
+
+    # todo connect to annotation db that has been filtered for all records for
+    #  each species that fall within the coordinates of the records
     align_records = align_db.get_records_matching(
         species=species, coord_name=coord_name, start=start, end=end
     )
+
     # sample the sequences
-    seqs = []
     for block in align_records:
-        # todo we get the gaps corresponding to the reference sequence
-        #  and convert them to a GapPosition instance. We then convert
-        #  the start, end into align_start, align_end. Those values are
-        #  used for all other species -- they are converted into sequence
-        #  coordinates for a species -- selecting their sequence and,
-        #  building the aligned instance, selecting the annotation subset.
-        for record in block:
-            species = record["species"]
+        seqs = []
+        # we get the gaps corresponding to the reference sequence
+        # and convert them to a GapPosition instance. We then convert
+        # the start, end into align_start, align_end. Those values are
+        # used for all other species -- they are converted into sequence
+        # coordinates for each species -- selecting their sequence and,
+        # building the aligned instance, selecting the annotation subset.
+        for align_record in block:
+            if (
+                align_record["species"] == species
+                and align_record["coord_name"] == coord_name
+            ):
+                # start, end are genomic positions and the align_record
+                # start / end are also genomic positions
+                genome_start = align_record["start"]
+                genome_end = align_record["end"]
+                gaps = GapPositions(
+                    align_record["gap_spans"], seq_length=genome_end - genome_start
+                )
+
+                # We use the GapPosition object to identify the alignment
+                # positions the start / end correspond to. The alignment
+                # positions are used below for slicing each sequence in the
+                # alignment.
+
+                # make sure the sequence start and end are within this
+                # aligned block
+                seq_start = max(start or genome_start, genome_start)
+                seq_end = min(end or genome_end, genome_end)
+                # make these coordinates relative to the aligned segment
+                if align_record["strand"] == "-":
+                    # if record is on minus strand, then genome end is
+                    # the alignment start
+                    seq_start, seq_end = genome_end - seq_end, genome_end - seq_start
+                else:
+                    seq_start = seq_start - genome_start
+                    seq_end = seq_end - genome_start
+
+                align_start = gaps.from_seq_to_align_index(seq_start)
+                align_end = gaps.from_seq_to_align_index(seq_end)
+                break
+        else:
+            raise ValueError(f"no matching alignment record for {species!r}")
+
+        for align_record in block:
+            species = align_record["species"]
             genome = genomes[species]
+            coord_name = align_record["coord_name"]
+            # We need to convert the alignment coordinates into sequence
+            # coordinates for this species.
+            genome_start = align_record["start"]
+            genome_end = align_record["end"]
+            gaps = GapPositions(
+                align_record["gap_spans"], seq_length=genome_end - genome_start
+            )
+
+            # We use the alignment indices derived for the reference sequence
+            # above
+            seq_start = gaps.from_align_to_seq_index(align_start)
+            seq_end = gaps.from_align_to_seq_index(align_end)
+            seq_length = seq_end - seq_start
+            if align_record["strand"] == "-":
+                # if it's neg strand, the alignment start is the genome end
+                seq_start = gaps.seq_length - seq_end
             s = genome.get_seq(
-                coord_name=record["coord_name"],
-                start=record["start"],
-                end=record["end"],
+                coord_name=coord_name,
+                start=genome_start + seq_start,
+                end=genome_start + seq_start + seq_length,
             )
-            s = make_seq(s, name=record["coord_name"], moltype="dna")
-            if record["strand"] == "-":
+            # we now trim the gaps for this sequence to the sub-alignment
+            gaps = gaps[align_start:align_end]
+
+            s = make_seq(s, name=coord_name, moltype="dna")
+            if align_record["strand"] == "-":
                 s = s.rc()
-            aligned = gap_coords_to_seq(record["gap_spans"], s)
+
+            aligned = gap_coords_to_seq(gaps.gaps, s)
             seqs.append(aligned)
 
-    return Alignment(seqs)
+        # todo need to add annotation_db
+        yield Alignment(seqs)
 
 
 def _adjust_gap_starts(gaps: numpy.ndarray, new_start: int) -> numpy.ndarray:
@@ -265,7 +335,7 @@ def _within_a_gap(gaps: numpy.ndarray, start: int, stop: int) -> bool:
 class GapPositions:
     # 2D numpy int array,
     # each row is a gap
-    # column 0 is sequence index of gap
+    # column 0 is sequence index of gap **relative to the alignment**
     # column 1 is gap length
     gaps: numpy.ndarray
     seq_length: int