Skip to content

Commit

Permalink
Merge pull request #53 from GavinHuttley/develop
Browse files Browse the repository at this point in the history
ENH: fleshing out selecting alignments
  • Loading branch information
GavinHuttley authored Jan 1, 2024
2 parents 7efadfa + fe128e6 commit be70222
Show file tree
Hide file tree
Showing 4 changed files with 305 additions and 63 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ doc/draw
# Installer logs
pip-log.txt

# Unit test / coverage reports
# Unit test / coverage / dev tool reports
.coverage*
.tox
.nox
Expand All @@ -31,6 +31,7 @@ junit-*.xml
nosetests.xml
tests/draw_results
lcov*.info
.ruff_cache/*

# Translations
*.mo
Expand Down
3 changes: 2 additions & 1 deletion .hgignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,5 @@ junit-*.xml
doc/draw*
dist/*
working/*
lcov*.info
lcov*.info
.ruff_cache/*
138 changes: 104 additions & 34 deletions src/ensembl_lite/_aligndb.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,18 @@ def _get_block_id(
) -> list[int]:
sql = f"SELECT block_id from {self.table_name} WHERE species = ? AND coord_name = ?"
values = species, coord_name
if start and end:
sql = f"{sql} AND start > ? AND end < ?"
values += (start, end)
elif start:
sql = f"{sql} AND start > ?"
values += (start,)
elif end:
sql = f"{sql} AND end < ?"
values += (end,)
if start is not None and end is not None:
# as long as start or end are within the record start/end, it's a match
sql = f"{sql} AND ((start <= ? AND ? < end) OR (start <= ? AND ? < end))"
values += (start, start, end, end)
elif start is not None:
# the aligned segment overlaps start
sql = f"{sql} AND start <= ? AND ? < end"
values += (start, start)
elif end is not None:
# the aligned segment overlaps end
sql = f"{sql} AND start <= ? AND ? < end"
values += (end, end)

return self.db.execute(sql, values).fetchall()

Expand All @@ -115,7 +118,7 @@ def get_records_matching(
)
]

values = " ".join("?" * len(block_ids))
values = ", ".join("?" * len(block_ids))
sql = f"SELECT * from {self.table_name} WHERE block_id IN ({values})"
results = defaultdict(list)
for record in self.db.execute(sql, block_ids).fetchall():
Expand All @@ -129,42 +132,109 @@ def get_alignment(
genomes: dict,
species: str,
coord_name: str,
start=None,
end=None,
) -> Alignment:
"""return a cogent3 Alignment"""
start: int | None = None,
end: int | None = None,
) -> typing.Generator[Alignment]:
"""yields cogent3 Alignments"""
from ensembl_lite.convert import gap_coords_to_seq

# todo connect to annotation db has been filtered for all records for
# each species that fall within
# the coordinates of the records
if species not in genomes:
raise ValueError(f"unknown species {species!r}")

if start is not None: # deal with case where numpy scalars are input
start = int(start)
if end is not None:
end = int(end)

# todo connect to annotation db that has been filtered for all records for
# each species that fall within the coordinates of the records
align_records = align_db.get_records_matching(
species=species, coord_name=coord_name, start=start, end=end
)

# sample the sequences
seqs = []
for block in align_records:
# todo we get the gaps corresponding to the reference sequence
# and convert them to a GapPosition instance. We then convert
# the start, end into align_start, align_end. Those values are
# used for all other species -- they are converted into sequence
# coordinates for a species -- selecting their sequence and,
# building the aligned instance, selecting the annotation subset.
for record in block:
species = record["species"]
seqs = []
# we get the gaps corresponding to the reference sequence
# and convert them to a GapPosition instance. We then convert
# the start, end into align_start, align_end. Those values are
# used for all other species -- they are converted into sequence
# coordinates for each species -- selecting their sequence and,
# building the aligned instance, selecting the annotation subset.
for align_record in block:
if (
align_record["species"] == species
and align_record["coord_name"] == coord_name
):
# start, end are genomic positions and the align_record
# start / end are also genomic positions
genome_start = align_record["start"]
genome_end = align_record["end"]
gaps = GapPositions(
align_record["gap_spans"], seq_length=genome_end - genome_start
)

# We use the GapPosition object to identify the alignment
# positions the start / end correspond to. The alignment
# positions are used below for slicing each sequence in the
# alignment.

# make sure the sequence start and end are within this
# aligned block
seq_start = max(start or genome_start, genome_start)
seq_end = min(end or genome_end, genome_end)
# make these coordinates relative to the aligned segment
if align_record["strand"] == "-":
# if record is on minus strand, then genome end is
# the alignment start
seq_start, seq_end = genome_end - seq_end, genome_end - seq_start
else:
seq_start = seq_start - genome_start
seq_end = seq_end - genome_start

align_start = gaps.from_seq_to_align_index(seq_start)
align_end = gaps.from_seq_to_align_index(seq_end)
break
else:
raise ValueError(f"no matching alignment record for {species!r}")

for align_record in block:
species = align_record["species"]
genome = genomes[species]
coord_name = align_record["coord_name"]
# We need to convert the alignment coordinates into sequence
# coordinates for this species.
genome_start = align_record["start"]
genome_end = align_record["end"]
gaps = GapPositions(
align_record["gap_spans"], seq_length=genome_end - genome_start
)

# We use the alignment indices derived for the reference sequence
# above
seq_start = gaps.from_align_to_seq_index(align_start)
seq_end = gaps.from_align_to_seq_index(align_end)
seq_length = seq_end - seq_start
if align_record["strand"] == "-":
# if it's neg strand, the alignment start is the genome end
seq_start = gaps.seq_length - seq_end
s = genome.get_seq(
coord_name=record["coord_name"],
start=record["start"],
end=record["end"],
coord_name=coord_name,
start=genome_start + seq_start,
end=genome_start + seq_start + seq_length,
)
s = make_seq(s, name=record["coord_name"], moltype="dna")
if record["strand"] == "-":
# we now trim the gaps for this sequence to the sub-alignment
gaps = gaps[align_start:align_end]

s = make_seq(s, name=coord_name, moltype="dna")
if align_record["strand"] == "-":
s = s.rc()
aligned = gap_coords_to_seq(record["gap_spans"], s)

aligned = gap_coords_to_seq(gaps.gaps, s)
seqs.append(aligned)

return Alignment(seqs)
# todo need to add annotation_db
yield Alignment(seqs)


def _adjust_gap_starts(gaps: numpy.ndarray, new_start: int) -> numpy.ndarray:
Expand Down Expand Up @@ -265,7 +335,7 @@ def _within_a_gap(gaps: numpy.ndarray, start: int, stop: int) -> bool:
class GapPositions:
# 2D numpy int array,
# each row is a gap
# column 0 is sequence index of gap
# column 0 is sequence index of gap **relative to the alignment**
# column 1 is gap length
gaps: numpy.ndarray
seq_length: int
Expand Down
Loading

0 comments on commit be70222

Please sign in to comment.