Skip to content

Commit

Permalink
for bounds calc, ignore soft-clips that are inconsistent with the cen…
Browse files Browse the repository at this point in the history
…ter mass
  • Loading branch information
hdashnow committed Mar 24, 2021
1 parent 8e9a7e8 commit bcdaf4a
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 14 deletions.
3 changes: 2 additions & 1 deletion src/strpkg/call.nim
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,8 @@ proc call_main*() =

var b: Bounds
var good_cluster: bool
(b, good_cluster) = bounds(c, min_clip, min_clip_total)
let max_clip_dist: uint16 = uint16(0.5 * frag_dist.median(0.5).float)
(b, good_cluster) = bounds(c, min_clip, min_clip_total, max_clip_dist)
if good_cluster == false:
continue

Expand Down
4 changes: 2 additions & 2 deletions src/strpkg/callclusters.nim
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,11 @@ proc assign_reads_locus*(locus: var Bounds, treads_by_tid_rep: TableRef[tid_rep,
elif r.split == Soft.left:
locus.n_left.inc

proc bounds*(c: Cluster, min_clip: uint16, min_clip_total: uint16): (Bounds, bool) =
proc bounds*(c: Cluster, min_clip: uint16, min_clip_total: uint16, max_clip_dist:uint16): (Bounds, bool) =
if c.reads.len >= uint16.high.int:
stderr.write_line "More than " & &"{uint16.high.int}" & " reads in cluster with first read:" & $c.reads[0] & " skipping"
return
var b = c.bounds
var b = c.bounds(max_clip_dist)
if b.right - b.left > 1000'u32:
stderr.write_line "large bounds:" & $b & " skipping"
return
Expand Down
27 changes: 17 additions & 10 deletions src/strpkg/cluster.nim
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,9 @@ proc parse_bounds*(f:string, targets: seq[Target]): seq[Bounds] =


# Find the bounds of the STR in the reference genome
proc bounds*(cl:Cluster): Bounds =
# max_clip_dist is set to 0.5 * median insert size elsewhere
# (200 is an okay default for testing)
proc bounds*(cl:Cluster, max_clip_dist:uint16=200): Bounds =

var lefts = initCountTable[uint32](8)
var rights = initCountTable[uint32](8)
Expand All @@ -184,15 +186,21 @@ proc bounds*(cl:Cluster): Bounds =
doAssert cl.reads.len <= uint16.high.int, ("got too many reads for cluster with first read:" & $cl.reads[0])

for r in cl.reads:
result.n_total.inc
if r.split == Soft.right:
# soft-clip of left indicates right end of variant
rights.inc(r.position)
result.n_right.inc
elif r.split == Soft.left:
posns.add(r.position)
result.center_mass = posns[int(posns.len / 2)]

# Check of soft-clip is too far away before adding it
for r in cl.reads:
if r.split == Soft.left and int32(r.position) < int32(result.center_mass) + int32(max_clip_dist):
lefts.inc(r.position)
result.n_left.inc
posns.add(r.position)
result.n_total.inc
elif r.split == Soft.right and int32(r.position) > int32(result.center_mass) - int32(max_clip_dist):
rights.inc(r.position)
result.n_right.inc
result.n_total.inc
else:
result.n_total.inc

if lefts.len > 0:
var ll = lefts.largest
Expand All @@ -203,8 +211,7 @@ proc bounds*(cl:Cluster): Bounds =
if rr.val > 1:
result.right = rr.key

if posns.len > 0:
result.center_mass = posns[int(posns.len / 2)]
if posns.len > 0: #necessary?
if result.left == 0:
result.left = result.center_mass
if result.right == 0:
Expand Down
3 changes: 2 additions & 1 deletion src/strpkg/merge.nim
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,8 @@ proc merge_main*() =

var b: Bounds
var good_cluster: bool
(b, good_cluster) = bounds(c, min_clip, min_clip_total)
let max_clip_dist: uint16 = uint16(0.5 * frag_dist.median(0.5).float)
(b, good_cluster) = bounds(c, min_clip, min_clip_total, max_clip_dist)
if good_cluster == false:
continue

Expand Down
20 changes: 20 additions & 0 deletions tests/test_cluster.nim
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,26 @@ suite "cluster suite":
check b.left == 3
check b.right == 4

test "test bounds: filter out soft-clips that are inconsistent with the center mass":
var reads = @[
tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 100, split: Soft.right),
tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 123, split: Soft.none),
tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 223, split: Soft.none),
tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 223, split: Soft.none),
tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 223, split: Soft.none),
tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 253, split: Soft.none),
tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 283, split: Soft.none),
]

let max_clip_dist:uint16 = 50
var cl = Cluster(reads:reads)
var b = cl.bounds(max_clip_dist)
check b.left == 223
check b.right == 224
check b.left_most == 100
check b.right_most == 283
check b.center_mass == 223

# This test is failing. Need to think about how to fix
# test "test bounds: no left soft-clipped reads so use median":
# var reads = @[
Expand Down

0 comments on commit bcdaf4a

Please sign in to comment.