Skip to content

Commit

Permalink
make .has_index() into attribute, clarify umi length names, remove redundant attributes
Browse files Browse the repository at this point in the history
  • Loading branch information
kedhammar committed May 27, 2024
1 parent dbcbe34 commit 91c17e4
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 44 deletions.
72 changes: 33 additions & 39 deletions anglerfish/demux/adaptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,31 +23,13 @@ def __init__(
i5_index: str | None = None,
):
self.name: str = name
self.i5_token = (adaptors[name]["i5"],)
self.i7_token = (adaptors[name]["i7"],)
self.index_token: str = INDEX_TOKEN

# i5 attributes
self.i5 = AdaptorPart(
sequence_token=self.i5_token,
name=name,
index=i5_index,
sequence_token=adaptors[name]["i5"], name=name, index=i5_index
)
self.i5_index: str | None = i5_index
self.i5_umi: str | None = self.i5.umi_token
self.i5_umi_before: int = self.i5.len_umi_before_index
self.i5_umi_after: int = self.i5.len_umi_after_index

# i7 attributes
self.i7 = AdaptorPart(
sequence_token=self.i7_token,
name=name,
index=i7_index,
sequence_token=adaptors[name]["i7"], name=name, index=i7_index
)
self.i7_index: str | None = i7_index
self.i7_umi: str | None = self.i7.umi_token
self.i7_umi_before: int = self.i7.len_umi_before_index
self.i7_umi_after: int = self.i7.len_umi_after_index

def get_i5_mask(self, insert_Ns: bool = True) -> str:
"""Get the i5 mask of the adaptor.
Expand All @@ -56,19 +38,23 @@ def get_i5_mask(self, insert_Ns: bool = True) -> str:
insert_Ns = False -> Returns the i5 sequence without index and UMI tokens
"""
index_length = (
len(self.i5_index) if self.i5_index is not None and insert_Ns else 0
len(self.i5.index) if self.i5.index is not None and insert_Ns else 0
)
umi_length = (
max(self.i5.len_umi_after_index, self.i5.len_umi_before_index)
if insert_Ns
else 0
)
umi_length = max(self.i5_umi_after, self.i5_umi_before) if insert_Ns else 0

# Test if the index is specified in the adaptor sequence when it shouldn't be
if (
has_match(INDEX_TOKEN, self.i5.sequence_token)
and self.i5_index is None
and self.i5.index is None
and insert_Ns
):
raise UserWarning("Adaptor has i5 but no sequence was specified")

if self.i5_index is not None or not insert_Ns:
if self.i5.index is not None or not insert_Ns:
new_i5 = re.sub(INDEX_TOKEN, "N" * index_length, self.i5.sequence_token)
new_i5 = re.sub(UMI_TOKEN, "N" * umi_length, new_i5)
return new_i5
Expand All @@ -82,19 +68,23 @@ def get_i7_mask(self, insert_Ns: bool = True) -> str:
insert_Ns = False -> Returns the i7 sequence without index and UMI tokens
"""
index_length = (
len(self.i7_index) if self.i7_index is not None and insert_Ns else 0
len(self.i7.index) if self.i7.index is not None and insert_Ns else 0
)
umi_length = (
max(self.i7.len_umi_after_index, self.i7.len_umi_before_index)
if insert_Ns
else 0
)
umi_length = max(self.i7_umi_after, self.i7_umi_before) if insert_Ns else 0

# Test if the index is specified in the adaptor sequence when it shouldn't be
if (
has_match(INDEX_TOKEN, self.i7.sequence_token)
and self.i7_index is None
and self.i7.index is None
and insert_Ns
):
raise UserWarning("Adaptor has i7 but no sequence was specified")

if self.i7_index is not None or not insert_Ns:
if self.i7.index is not None or not insert_Ns:
new_i7 = re.sub(INDEX_TOKEN, "N" * index_length, self.i7.sequence_token)
new_i7 = re.sub(UMI_TOKEN, "N" * umi_length, new_i7)
return new_i7
Expand Down Expand Up @@ -144,28 +134,32 @@ def __init__(self, sequence_token: str, name: str, index: str | None):

# Parse UMI, if any
umi_token_matches = re.findall(UMI_TOKEN, self.sequence_token)
if umi_token_matches > 0:
if len(umi_token_matches) > 0:
assert (
umi_token_matches == 1
), f"Multiple UMIs found in {self.name}, not supported."
self.umi_token = umi_token_matches[0]

import ipdb

ipdb.set_trace()

self.len_umi = int(re.search(UMI_LENGTH_TOKEN, self.umi_token).group(1))

# Check if UMI is before or after index
if INDEX_TOKEN + UMI_TOKEN in self.sequence_token:
# The index region is INDEX+UMI
self.len_umi_after_index = int(
re.search(UMI_LENGTH_TOKEN, self.umi_token).group(1)
self.len_umi_before_index = len(
INDEX_TOKEN.split(self.sequence_token)[0]
)
self.len_before_index = len(INDEX_TOKEN.split(self.sequence_token)[0])
self.len_after_index = len(UMI_TOKEN.split(self.sequence_token)[-1])
self.len_umi_after_index = len(UMI_TOKEN.split(self.sequence_token)[-1])

elif UMI_TOKEN + INDEX_TOKEN in self.sequence_token:
# The index region is UMI+INDEX
self.len_umi_before_index = int(
re.search(UMI_LENGTH_TOKEN, self.umi_token[0]).group(1)
self.len_umi_before_index = len(UMI_TOKEN.split(self.sequence_token)[0])
self.len_umi_after_index = len(
INDEX_TOKEN.split(self.sequence_token)[-1]
)
self.len_before_index = len(UMI_TOKEN.split(self.sequence_token)[0])
self.len_after_index = len(INDEX_TOKEN.split(self.sequence_token)[-1])

else:
raise UserWarning(
Expand All @@ -174,8 +168,8 @@ def __init__(self, sequence_token: str, name: str, index: str | None):

else:
self.umi_token = None
self.len_before_index = len(INDEX_TOKEN.split(self.sequence_token)[0])
self.len_after_index = len(INDEX_TOKEN.split(self.sequence_token)[-1])
self.len_umi_before_index = len(INDEX_TOKEN.split(self.sequence_token)[0])
self.len_umi_after_index = len(INDEX_TOKEN.split(self.sequence_token)[-1])


def has_match(pattern: re.Pattern, query: str) -> bool:
Expand Down
4 changes: 2 additions & 2 deletions anglerfish/demux/demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,8 @@ def cluster_matches(
fi7, d2 = parse_cs(
i7["cs"],
i7_seq,
adaptor.i7_umi_before,
adaptor.i7_umi_after,
adaptor.i7.len_umi_before_index,
adaptor.i7.len_umi_after_index,
)
dists.append(d1 + d2)

Expand Down
2 changes: 1 addition & 1 deletion anglerfish/demux/samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def minimum_bc_distance(self) -> int:
for ont_barcode, adaptors in ont_bc_to_adaptors.items():
testset[ont_barcode] = []
for adaptor in adaptors:
if adaptor.i5.has_index():
if adaptor.i5.has_index:
testset[ont_barcode].append(adaptor.i5.index + adaptor.i7.index)
else:
testset[ont_barcode].append(adaptor.i7.index)
Expand Down
4 changes: 2 additions & 2 deletions anglerfish/explore/explore.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def run_explore(
for adaptor_end_name, adaptor_end in zip(
["i5", "i7"], [adaptor.i5, adaptor.i7]
):
if adaptor_end.has_index():
if adaptor_end.has_index:
# Alignment thresholds
before_thres = round(
adaptor_end.len_before_index_region() * good_hit_threshold
Expand Down Expand Up @@ -176,7 +176,7 @@ def run_explore(
["i5", "i7"], [adaptor.i5, adaptor.i7]
):
df_good_hits = entries[adaptor.name][adaptor_end_name]
if adaptor_end.has_index():
if adaptor_end.has_index:
median_insert_length = df_good_hits["insert_len"].median()
if median_insert_length > umi_threshold:
# Calculate entropies here
Expand Down

0 comments on commit 91c17e4

Please sign in to comment.