diff --git a/src/strpkg/call.nim b/src/strpkg/call.nim index 4371e32..19080a6 100644 --- a/src/strpkg/call.nim +++ b/src/strpkg/call.nim @@ -229,7 +229,8 @@ proc call_main*() = var b: Bounds var good_cluster: bool - (b, good_cluster) = bounds(c, min_clip, min_clip_total) + let max_clip_dist: uint16 = uint16(0.5 * frag_dist.median(0.5).float) + (b, good_cluster) = bounds(c, min_clip, min_clip_total, max_clip_dist) if good_cluster == false: continue diff --git a/src/strpkg/callclusters.nim b/src/strpkg/callclusters.nim index 9093685..bb834b0 100644 --- a/src/strpkg/callclusters.nim +++ b/src/strpkg/callclusters.nim @@ -49,11 +49,11 @@ proc assign_reads_locus*(locus: var Bounds, treads_by_tid_rep: TableRef[tid_rep, elif r.split == Soft.left: locus.n_left.inc -proc bounds*(c: Cluster, min_clip: uint16, min_clip_total: uint16): (Bounds, bool) = +proc bounds*(c: Cluster, min_clip: uint16, min_clip_total: uint16, max_clip_dist:uint16): (Bounds, bool) = if c.reads.len >= uint16.high.int: stderr.write_line "More than " & &"{uint16.high.int}" & " reads in cluster with first read:" & $c.reads[0] & " skipping" return - var b = c.bounds + var b = c.bounds(max_clip_dist) if b.right - b.left > 1000'u32: stderr.write_line "large bounds:" & $b & " skipping" return diff --git a/src/strpkg/cluster.nim b/src/strpkg/cluster.nim index 1e0b9a2..c8930f7 100644 --- a/src/strpkg/cluster.nim +++ b/src/strpkg/cluster.nim @@ -171,7 +171,9 @@ proc parse_bounds*(f:string, targets: seq[Target]): seq[Bounds] = # Find the bounds of the STR in the reference genome -proc bounds*(cl:Cluster): Bounds = +# max_clip_dist is set to 0.5 * median insert size elsewhere +# (200 is an okay default for testing) +proc bounds*(cl:Cluster, max_clip_dist:uint16=200): Bounds = var lefts = initCountTable[uint32](8) var rights = initCountTable[uint32](8) @@ -184,15 +186,21 @@ proc bounds*(cl:Cluster): Bounds = doAssert cl.reads.len <= uint16.high.int, ("got too many reads for cluster with first read:" & $cl.reads[0]) for r in cl.reads: - result.n_total.inc - if r.split == Soft.right: - # soft-clip of left indicates right end of variant - rights.inc(r.position) - result.n_right.inc - elif r.split == Soft.left: + posns.add(r.position) + result.center_mass = posns[int(posns.len / 2)] + + # Check of soft-clip is too far away before adding it + for r in cl.reads: + if r.split == Soft.left and int32(r.position) < int32(result.center_mass) + int32(max_clip_dist): lefts.inc(r.position) result.n_left.inc - posns.add(r.position) + result.n_total.inc + elif r.split == Soft.right and int32(r.position) > int32(result.center_mass) - int32(max_clip_dist): + rights.inc(r.position) + result.n_right.inc + result.n_total.inc + else: + result.n_total.inc if lefts.len > 0: var ll = lefts.largest @@ -203,8 +211,7 @@ proc bounds*(cl:Cluster): Bounds = if rr.val > 1: result.right = rr.key - if posns.len > 0: - result.center_mass = posns[int(posns.len / 2)] + if posns.len > 0: #necessary? if result.left == 0: result.left = result.center_mass if result.right == 0: diff --git a/src/strpkg/merge.nim b/src/strpkg/merge.nim index d0b4854..9a3189d 100644 --- a/src/strpkg/merge.nim +++ b/src/strpkg/merge.nim @@ -165,7 +165,8 @@ proc merge_main*() = var b: Bounds var good_cluster: bool - (b, good_cluster) = bounds(c, min_clip, min_clip_total) + let max_clip_dist: uint16 = uint16(0.5 * frag_dist.median(0.5).float) + (b, good_cluster) = bounds(c, min_clip, min_clip_total, max_clip_dist) if good_cluster == false: continue diff --git a/tests/test_cluster.nim b/tests/test_cluster.nim index f13176a..9bf1c9f 100644 --- a/tests/test_cluster.nim +++ b/tests/test_cluster.nim @@ -117,6 +117,26 @@ suite "cluster suite": check b.left == 3 check b.right == 4 + test "test bounds: filter out soft-clips that are inconsistent with the center mass": + var reads = @[ + tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 100, split: Soft.right), + tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 123, split: Soft.none), + tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 223, split: Soft.none), + tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 223, split: Soft.none), + tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 223, split: Soft.none), + tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 253, split: Soft.none), + tread(tid:1'i32, repeat: ['A', 'T', 'G', '\x00', '\x00', '\x00'], position: 283, split: Soft.none), + ] + + let max_clip_dist:uint16 = 50 + var cl = Cluster(reads:reads) + var b = cl.bounds(max_clip_dist) + check b.left == 223 + check b.right == 224 + check b.left_most == 100 + check b.right_most == 283 + check b.center_mass == 223 + # This test is failing. Need to think about how to fix # test "test bounds: no left soft-clipped reads so use median": # var reads = @[