Close #486 by fixing a double counting bug that it introduced.

cfe-lab · Dec 2, 2019 · b8ec9e1 · b8ec9e1
1 parent 62af69d
commit b8ec9e1
Show file tree

Hide file tree

Showing 4 changed files with 119 additions and 9 deletions.
diff --git a/micall/core/aln2counts.py b/micall/core/aln2counts.py
@@ -504,11 +504,10 @@ def combine_reports(self):
         for region, report in self.reports.items():
             old_report = old_reports[region]
             for i, report_amino in enumerate(report):
-                if i < len(old_report):
-                    old_report[i].seed_amino.add(report_amino.seed_amino)
-                else:
-                    report_amino.seed_amino.consensus_nuc_index = None
-                    old_report.append(report_amino)
+                while len(old_report) <= i:
+                    old_report.append(ReportAmino(SeedAmino(None),
+                                                  len(old_report)+1))
+                old_report[i].seed_amino.add(report_amino.seed_amino)
         self.reports.clear()
 
     def write_amino_report(self, amino_writer, reports, seed, coverage_summary=None):

diff --git a/micall/g2p/fastq_g2p.py b/micall/g2p/fastq_g2p.py
@@ -53,6 +53,9 @@ def parse_args():
     parser.add_argument('aligned_csv',
                         type=argparse.FileType('w'),
                         help='<output> CSV containing mapped reads aligned to HIV seed')
+    parser.add_argument('merged_contigs_csv',
+                        type=argparse.FileType('w'),
+                        help='<output> CSV containing amplicon contigs based on length')
     return parser.parse_args()
 
 
@@ -546,7 +549,8 @@ def main():
               aligned_csv=args.aligned_csv,
               min_count=DEFAULT_MIN_COUNT,
               min_valid=MIN_VALID,
-              min_valid_percent=MIN_VALID_PERCENT)
+              min_valid_percent=MIN_VALID_PERCENT,
+              merged_contigs_csv=args.merged_contigs_csv)
 
 
 if __name__ == '__main__':

diff --git a/micall/tests/test_aln2counts.py b/micall/tests/test_aln2counts.py
@@ -676,6 +676,104 @@ def testMultiplePrefixNucleotideReport(self):
                                   self.detail_report_file.getvalue())
         self.assertMultiLineEqual(expected_text, self.report_file.getvalue())
 
+    def testMultiplePrefixNucleotideReportOverlappingRegions(self):
+        """ Assemble counts from two contigs when coordinate regions overlap.
+
+        Contig 1-R1 AAATTT -> KF
+        Contig 2-R1 TTTAGG -> FR
+
+        Contig 1 and 2 should combine into R1 with KFR, but
+        """
+        self.report.projects.load(StringIO("""\
+{
+  "projects": {
+    "R1": {
+      "max_variants": 0,
+      "regions": [
+        {
+          "coordinate_region": "R1",
+          "seed_region_names": ["R1-seed"]
+        },
+        {
+          "coordinate_region": "R1-expanded",
+          "seed_region_names": ["R1-seed"]
+        }
+      ]
+    }
+  },
+  "regions": {
+    "R1-seed": {
+      "is_nucleotide": true,
+      "reference": [
+        "AAATTTAGG"
+      ]
+    },
+    "R1": {
+      "is_nucleotide": false,
+      "reference": [
+        "KFR"
+      ]
+    },
+    "R1-expanded": {
+      "is_nucleotide": false,
+      "reference": [
+        "GPKFREH"
+      ]
+    }
+  }
+}
+"""))
+        # refname,qcut,rank,count,offset,seq
+        aligned_reads1 = self.prepareReads("1-R1-seed,15,0,5,0,AAATTT")
+        aligned_reads2 = self.prepareReads("2-R1-seed,15,0,2,0,TTTAGG")
+
+        expected_text = """\
+seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,\
+A,C,G,T,N,del,ins,clip,v3_overlap,coverage
+R1-seed,R1,15,,1,5,0,0,0,0,0,0,0,0,5
+R1-seed,R1,15,,2,5,0,0,0,0,0,0,0,0,5
+R1-seed,R1,15,,3,5,0,0,0,0,0,0,0,0,5
+R1-seed,R1,15,,4,0,0,0,7,0,0,0,0,0,7
+R1-seed,R1,15,,5,0,0,0,7,0,0,0,0,0,7
+R1-seed,R1,15,,6,0,0,0,7,0,0,0,0,0,7
+R1-seed,R1,15,,7,2,0,0,0,0,0,0,0,0,2
+R1-seed,R1,15,,8,0,0,2,0,0,0,0,0,0,2
+R1-seed,R1,15,,9,0,0,2,0,0,0,0,0,0,2
+R1-seed,R1-expanded,15,,1,0,0,0,0,0,0,0,0,0,0
+R1-seed,R1-expanded,15,,2,0,0,0,0,0,0,0,0,0,0
+R1-seed,R1-expanded,15,,3,0,0,0,0,0,0,0,0,0,0
+R1-seed,R1-expanded,15,,4,0,0,0,0,0,0,0,0,0,0
+R1-seed,R1-expanded,15,,5,0,0,0,0,0,0,0,0,0,0
+R1-seed,R1-expanded,15,,6,0,0,0,0,0,0,0,0,0,0
+R1-seed,R1-expanded,15,,7,5,0,0,0,0,0,0,0,0,5
+R1-seed,R1-expanded,15,,8,5,0,0,0,0,0,0,0,0,5
+R1-seed,R1-expanded,15,,9,5,0,0,0,0,0,0,0,0,5
+R1-seed,R1-expanded,15,,10,0,0,0,7,0,0,0,0,0,7
+R1-seed,R1-expanded,15,,11,0,0,0,7,0,0,0,0,0,7
+R1-seed,R1-expanded,15,,12,0,0,0,7,0,0,0,0,0,7
+R1-seed,R1-expanded,15,,13,2,0,0,0,0,0,0,0,0,2
+R1-seed,R1-expanded,15,,14,0,0,2,0,0,0,0,0,0,2
+R1-seed,R1-expanded,15,,15,0,0,2,0,0,0,0,0,0,2
+R1-seed,R1-expanded,15,,16,0,0,0,0,0,0,0,0,0,0
+R1-seed,R1-expanded,15,,17,0,0,0,0,0,0,0,0,0,0
+R1-seed,R1-expanded,15,,18,0,0,0,0,0,0,0,0,0,0
+R1-seed,R1-expanded,15,,19,0,0,0,0,0,0,0,0,0,0
+R1-seed,R1-expanded,15,,20,0,0,0,0,0,0,0,0,0,0
+R1-seed,R1-expanded,15,,21,0,0,0,0,0,0,0,0,0,0
+"""
+
+        self.report.write_nuc_header(self.report_file)
+        self.report.write_nuc_detail_header(self.detail_report_file)
+        self.report.read(aligned_reads1)
+        self.report.write_nuc_detail_counts()
+        self.report.combine_reports()
+        self.report.read(aligned_reads2)
+        self.report.write_nuc_detail_counts()
+        self.report.combine_reports()
+        self.report.write_nuc_counts()
+
+        self.assertMultiLineEqual(expected_text, self.report_file.getvalue())
+
     def testContigCoverageReport(self):
         """ Assemble counts from three contigs to two references.
 

diff --git a/release_test_compare.py b/release_test_compare.py
@@ -94,7 +94,10 @@ def report_source_versions(runs):
         version = os.path.basename(run.source_path)
         version_runs[version].append(run.source_path)
         yield run
-    max_count = max(len(paths) for paths in version_runs.values())
+    if version_runs:
+        max_count = max(len(paths) for paths in version_runs.values())
+    else:
+        max_count = 0
     for version, paths in sorted(version_runs.items()):
         if len(paths) == max_count:
             print(version, max_count)
@@ -534,7 +537,7 @@ def main():
                         samples,
                         chunksize=50)
     scenario_summaries = defaultdict(list)
-    i = None
+    i = 0
     all_consensus_distances = []
     report_count = 0
     for i, (report, scenarios, consensus_distances) in enumerate(results, 1):
@@ -556,6 +559,13 @@ def main():
             print(*summary, end='.\n')
             print(body, end='')
 
+    report_distances(all_consensus_distances)
+    print('Finished {} samples.'.format(i))
+
+
+def report_distances(all_consensus_distances):
+    if not all_consensus_distances:
+        return
     distance_data = pd.DataFrame(all_consensus_distances)
     non_zero_distances = distance_data[distance_data['distance'] != 0]
     region_names = sorted(non_zero_distances['region'].unique())
@@ -569,7 +579,6 @@ def main():
                        'consensus_diffs_{}.svg'.format(page_num),
                        'Consensus Differences Between Previous and v' + MICALL_VERSION,
                        'pct_diff')
-    print('Finished {} samples.'.format(i))
 
 
 if __name__ == '__main__':