Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ignore inconsistent soft-clips #90

Merged
merged 1 commit into from
Mar 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/strpkg/call.nim
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,8 @@ proc call_main*() =

var b: Bounds
var good_cluster: bool
(b, good_cluster) = bounds(c, min_clip, min_clip_total)
let max_clip_dist: uint16 = uint16(0.5 * frag_dist.median(0.5).float)
(b, good_cluster) = bounds(c, min_clip, min_clip_total, max_clip_dist)
if good_cluster == false:
continue

Expand Down
4 changes: 2 additions & 2 deletions src/strpkg/callclusters.nim
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,11 @@ proc assign_reads_locus*(locus: var Bounds, treads_by_tid_rep: TableRef[tid_rep,
elif r.split == Soft.left:
locus.n_left.inc

proc bounds*(c: Cluster, min_clip: uint16, min_clip_total: uint16): (Bounds, bool) =
proc bounds*(c: Cluster, min_clip: uint16, min_clip_total: uint16, max_clip_dist:uint16): (Bounds, bool) =
if c.reads.len >= uint16.high.int:
stderr.write_line "More than " & &"{uint16.high.int}" & " reads in cluster with first read:" & $c.reads[0] & " skipping"
return
var b = c.bounds
var b = c.bounds(max_clip_dist)
if b.right - b.left > 1000'u32:
stderr.write_line "large bounds:" & $b & " skipping"
return
Expand Down
27 changes: 17 additions & 10 deletions src/strpkg/cluster.nim
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,9 @@ proc parse_bounds*(f:string, targets: seq[Target]): seq[Bounds] =


# Find the bounds of the STR in the reference genome
proc bounds*(cl:Cluster): Bounds =
# max_clip_dist is set to 0.5 * median insert size elsewhere
# (200 is an okay default for testing)
proc bounds*(cl:Cluster, max_clip_dist:uint16=200): Bounds =

var lefts = initCountTable[uint32](8)
var rights = initCountTable[uint32](8)
Expand All @@ -184,15 +186,21 @@ proc bounds*(cl:Cluster): Bounds =
doAssert cl.reads.len <= uint16.high.int, ("got too many reads for cluster with first read:" & $cl.reads[0])

for r in cl.reads:
result.n_total.inc
if r.split == Soft.right:
# soft-clip of left indicates right end of variant
rights.inc(r.position)
result.n_right.inc
elif r.split == Soft.left:
posns.add(r.position)
result.center_mass = posns[int(posns.len / 2)]

# Check of soft-clip is too far away before adding it
for r in cl.reads:
if r.split == Soft.left and int32(r.position) < int32(result.center_mass) + int32(max_clip_dist):
lefts.inc(r.position)
result.n_left.inc
posns.add(r.position)
result.n_total.inc
elif r.split == Soft.right and int32(r.position) > int32(result.center_mass) - int32(max_clip_dist):
rights.inc(r.position)
result.n_right.inc
result.n_total.inc
else:
result.n_total.inc

if lefts.len > 0:
var ll = lefts.largest
Expand All @@ -203,8 +211,7 @@ proc bounds*(cl:Cluster): Bounds =
if rr.val > 1:
result.right = rr.key

if posns.len > 0:
result.center_mass = posns[int(posns.len / 2)]
if posns.len > 0: #necessary?
if result.left == 0:
result.left = result.center_mass
if result.right == 0:
Expand Down
3 changes: 2 additions & 1 deletion src/strpkg/merge.nim
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,8 @@ proc merge_main*() =

var b: Bounds
var good_cluster: bool
(b, good_cluster) = bounds(c, min_clip, min_clip_total)
let max_clip_dist: uint16 = uint16(0.5 * frag_dist.median(0.5).float)
(b, good_cluster) = bounds(c, min_clip, min_clip_total, max_clip_dist)
if good_cluster == false:
continue

Expand Down
20 changes: 20 additions & 0 deletions tests/test_cluster.nim
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,26 @@ suite "cluster suite":
check b.left == 3
check b.right == 4

test "test bounds: filter out soft-clips that are inconsistent with the center mass":
var reads = @[
tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 100, split: Soft.right),
tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 123, split: Soft.none),
tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 223, split: Soft.none),
tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 223, split: Soft.none),
tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 223, split: Soft.none),
tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 253, split: Soft.none),
tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 283, split: Soft.none),
]

let max_clip_dist:uint16 = 50
var cl = Cluster(reads:reads)
var b = cl.bounds(max_clip_dist)
check b.left == 223
check b.right == 224
check b.left_most == 100
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should it drop (or flag) the read(s) that don't fit with the event?

Copy link
Collaborator Author

@hdashnow hdashnow Mar 12, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that would be better. Because then we could update the center mass and the start/end of the event accordingly. Ideas on how to code that?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@brentp what are you thinking about this? Do you think it's worth changing the implementation to completely drop those reads rather than just excluding them from the left/right bounds calculation?

check b.right_most == 283
check b.center_mass == 223

# This test is failing. Need to think about how to fix
# test "test bounds: no left soft-clipped reads so use median":
# var reads = @[
Expand Down