Merge pull request #32 from tcezard/EVA_3522_fix_assembly_check_report

EVA-3522 - Add assembly check error to HTML report.
EBIvariation · May 2, 2024 · ab60a7e · ab60a7e
2 parents 0e73aa8 + 43e5747
commit ab60a7e
Show file tree

Hide file tree

Showing 8 changed files with 140 additions and 95 deletions.
diff --git a/eva_sub_cli/jinja_templates/file_validation.html b/eva_sub_cli/jinja_templates/file_validation.html
@@ -3,71 +3,87 @@
     {% for check_type, check_per_file in validation_results.items() %}
         {% set result = check_per_file.get(file_name, {}) %}
         {% if check_type == "assembly_check" %}
-            {% set nb_match = result.get("match", 0) %}
-            {% set nb_total = result.get("total", 0) %}
-            {% set match_percentage = nb_match / nb_total * 100 if nb_total else 0 %}
-            {% if result.get("nb_mismatch", 0) > 0 %}
-                {% set icon = "&#10060;" %}
-                {% set row_class = "report-section fail collapsible" %}
-            {% else %}
-                {% set icon = "&#10004;" %}
-                {% set row_class = "report-section pass" %}
-            {% endif %}
-            <div class='{{ row_class }}'>{{ icon }} Assembly check: {{ nb_match }}/{{ nb_total }} ({{ match_percentage|round(2) }}%)</div>
-            {% set mismatch_list = result.get("mismatch_list") %}
-            {% if mismatch_list %}
-                <div class="error-list">
-                <div class="error-description">First 10 errors per category are below. <strong>Full report:</strong> {{ result.get('report_path', '') }}</div>
-                <table>
-                    <tr>
-                        <th>Category</th><th>Error</th>
-                    </tr>
-                    {% for error in mismatch_list[:10] %}
-                        <tr>
-                            <td><strong>mismatch error</strong></td><td> {{ error }}</td>
-                        </tr>
-                    {% endfor %}
-                </table>
-                </div>
-            {% endif %}
+            {{ assembly_check(result) }}
         {% elif check_type == "vcf_check" %}
-            {% set critical_count = result.get("critical_count", 0) %}
-            {% set error_count = result.get("error_count", 0) %}
-            {% set warning_count = result.get("warning_count", 0) %}
-            {% if critical_count > 0 %}
-                {% set icon = "&#10060;" %}
-                {% set row_class = "report-section fail collapsible" %}
-            {% elif error_count > 0 %}
-                {% set icon = "&#10060;" %}
-                {% set row_class = "report-section warn collapsible" %}
-            {% else %}
-                {% set icon = "&#10004;" %}
-                {% set row_class = "report-section pass" %}
-            {% endif %}
-            <div class='{{ row_class }}'>{{ icon }} VCF check: {{ critical_count }} critical errors, {{ error_count }} non-critical errors, {{ warning_count }} warnings </div>
-            {% set critical_list = result.get("critical_list") %}
-            {% set error_list = result.get("error_list") %}
-
-            {% if critical_list or error_list%}
-                <div class="error-list">
-                    <div class="error-description">First 10 errors per category are below. <strong>Full report:</strong> {{ result.get('report_path', '') }}</div>
-                    <table>
-                        <tr>
-                            <th>Category</th><th>Error</th>
-                        </tr>
-                        {% for error in critical_list[:10] %}
-                            <tr>
-                                <td><strong>critical error</strong></td><td> {{ error }}</td>
-                            </tr>
-                        {% endfor %}
-                        {% for error in error_list[:10] %}
-                            <tr>
-                                <td><strong>non-critical error</strong></td><td> {{ error }}</td>
-                            </tr>
-                        {% endfor %}
-                    </table>
-                </div>
-            {% endif %}
+            {{ vcf_check(result) }}
         {% endif %}
     {% endfor %}
-{%- endmacro %}
+{%- endmacro %}
+
+{% macro vcf_check(vcf_check_result) %}{# Renders one file's VCF-check result: a summary row with the three counters, then (if any errors were listed) a table of the first 10 critical and first 10 non-critical errors. #}
+    {% set critical_count = vcf_check_result.get("critical_count", 0) %}{# every counter defaults to 0 when the key is missing from the result dict #}
+    {% set error_count = vcf_check_result.get("error_count", 0) %}
+    {% set warning_count = vcf_check_result.get("warning_count", 0) %}
+    {% if critical_count > 0 %}{# at least one critical error: cross icon with "fail" styling #}
+        {% set icon = "&#10060;" %}
+        {% set row_class = "report-section fail collapsible" %}
+    {% elif error_count > 0 %}{# non-critical errors only: same cross icon but "warn" styling #}
+        {% set icon = "&#10060;" %}
+        {% set row_class = "report-section warn collapsible" %}
+    {% else %}{# no critical or non-critical errors: check-mark icon, and the row is not marked collapsible; warnings alone do not change the styling #}
+        {% set icon = "&#10004;" %}
+        {% set row_class = "report-section pass" %}
+    {% endif %}
+    <div class='{{ row_class }}'>{{ icon }} VCF check: {{ critical_count }} critical errors, {{ error_count }} non-critical errors, {{ warning_count }} warnings </div>
+    {% set critical_list = vcf_check_result.get("critical_list") %}{# no default supplied: a missing key yields None, which the truthiness test below treats like an empty list #}
+    {% set error_list = vcf_check_result.get("error_list") %}
+
+    {% if critical_list or error_list%}{# the detail table is rendered only when at least one of the two lists is non-empty #}
+        <div class="error-list">
+            <div class="error-description">First 10 errors per category are below. <strong>Full report:</strong> {{ vcf_check_result.get('report_path', '') }}</div>
+            <table>
+                <tr>
+                    <th>Category</th><th>Error</th>
+                </tr>
+                {% for error in critical_list[:10] %}{# the slice caps each category at 10 rows, matching the description text above #}
+                    <tr>
+                        <td><strong>critical error</strong></td><td> {{ error }}</td>
+                    </tr>
+                {% endfor %}
+                {% for error in error_list[:10] %}
+                    <tr>
+                        <td><strong>non-critical error</strong></td><td> {{ error }}</td>
+                    </tr>
+                {% endfor %}
+            </table>
+        </div>
+    {% endif %}
+{%- endmacro %}
+
+{% macro assembly_check(assembly_check_result) %}{# Renders one file's assembly-check result: a summary row showing match/total and the match percentage, then (if anything was listed) a table of the first 10 parsing errors followed by the first 10 mismatches. #}
+    {% set nb_match = assembly_check_result.get("match", 0) %}
+    {% set nb_total = assembly_check_result.get("total", 0) %}
+    {% set match_percentage = nb_match / nb_total * 100 if nb_total else 0 %}{# falls back to 0 when total is 0 or missing, so no division by zero #}
+    {% if assembly_check_result.get("nb_mismatch", 0) > 0 %}{# NOTE(review): the fail/pass styling depends on nb_mismatch only; a result carrying error_list entries but zero mismatches still gets the check-mark "pass" row without the collapsible class, even though the error table below is rendered. Confirm this is intended. #}
+        {% set icon = "&#10060;" %}
+        {% set row_class = "report-section fail collapsible" %}
+    {% else %}
+        {% set icon = "&#10004;" %}
+        {% set row_class = "report-section pass" %}
+    {% endif %}
+    <div class='{{ row_class }}'>{{ icon }} Assembly check: {{ nb_match }}/{{ nb_total }} ({{ match_percentage|round(2) }}%)</div>
+    {% set mismatch_list = assembly_check_result.get("mismatch_list") %}{# no default supplied: a missing key yields None, which the truthiness test below treats like an empty list #}
+    {% set error_list = assembly_check_result.get("error_list") %}
+    {% if mismatch_list or error_list %}{# the detail table is rendered when either list is non-empty #}
+        <div class="error-list">
+        <div class="error-description">First 10 errors per category are below. <strong>Full report:</strong> {{ assembly_check_result.get('report_path', '') }}</div>
+        <table>
+            <tr>
+                <th>Category</th><th>Error</th>
+            </tr>
+            {% for error in error_list[:10] %}{# parsing errors are listed before mismatches; each category is capped at 10 rows #}
+                <tr>
+                    <td><strong>Parsing Error</strong></td><td> {{ error }}</td>
+                </tr>
+            {% endfor %}
+            {% for error in mismatch_list[:10] %}
+                <tr>
+                    <td><strong>mismatch error</strong></td><td> {{ error }}</td>
+                </tr>
+            {% endfor %}
+        </table>
+        </div>
+    {% endif %}
+{%- endmacro %}
+
+
diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf
@@ -130,7 +130,6 @@ process check_vcf_reference {
     tuple path(vcf), path(fasta), path(report)
 
     output:
-    path "assembly_check/*valid_assembly_report*", emit: vcf_assembly_valid
     path "assembly_check/*text_assembly_report*", emit: assembly_check_report
     path "assembly_check/*.assembly_check.log", emit: assembly_check_log
 
@@ -141,7 +140,7 @@ process check_vcf_reference {
     trap 'if [[ \$? == 1 || \$? == 139 ]]; then exit 0; fi' EXIT
 
     mkdir -p assembly_check
-    $params.executable.vcf_assembly_checker -i $vcf -f $fasta $report_opt -r summary,text,valid  -o assembly_check --require-genbank > assembly_check/${vcf}.assembly_check.log 2>&1
+    $params.executable.vcf_assembly_checker -i $vcf -f $fasta $report_opt -r summary,text  -o assembly_check --require-genbank > assembly_check/${vcf}.assembly_check.log 2>&1
     """
 }
 

diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py
@@ -136,7 +136,7 @@ def parse_assembly_check_log(self, assembly_check_log):
                 if line.startswith('[error]'):
                     nb_error += 1
                     if nb_error < 11:
-                        error_list.append(line.strip()[len('[error]'):])
+                        error_list.append(line.strip()[len('[error] '):])
                 elif line.startswith('[info] Number of matches:'):
                     match, total = line.strip()[len('[info] Number of matches: '):].split('/')
                     match = int(match)
@@ -270,12 +270,6 @@ def _assembly_check_log(self, vcf_name):
             os.path.join(self.output_dir, 'assembly_check', vcf_name + '.assembly_check.log')
         )
 
-    @lru_cache
-    def _assembly_check_valid_vcf(self, vcf_name):
-        return resolve_single_file_path(
-            os.path.join(self.output_dir, 'assembly_check', vcf_name + '.valid_assembly_report*')
-        )
-
     @lru_cache
     def _assembly_check_text_report(self, vcf_name):
         return resolve_single_file_path(
@@ -289,10 +283,9 @@ def _collect_assembly_check_results(self):
             vcf_name = os.path.basename(vcf_file)
 
             assembly_check_log = self._assembly_check_log(vcf_name)
-            assembly_check_valid_vcf = self._assembly_check_valid_vcf(vcf_name)
             assembly_check_text_report = self._assembly_check_text_report(vcf_name)
 
-            if assembly_check_log and assembly_check_valid_vcf and assembly_check_text_report:
+            if assembly_check_log and assembly_check_text_report:
                 error_list_from_log, nb_error_from_log, match, total = \
                     self.parse_assembly_check_log(assembly_check_log)
                 mismatch_list, nb_mismatch, error_list_from_report, nb_error_from_report = \

diff --git a/tests/resources/assembly_check/invalid.vcf.assembly_check.log b/tests/resources/assembly_check/invalid.vcf.assembly_check.log
@@ -0,0 +1,9 @@
+[info] Reading from input VCF file...
+[info] Reading from input FASTA file...
+[info] Reading from input FASTA index file...
+[info] Creating index from input FASTA file...
+[info] Number of matches: 85755/100394
+[info] Percentage of matches: 85.4185%
+[info] Text report written to : assembly_check/target.Filter.AfterQC.AddChr.vcf.text_assembly_report.1702044383544.txt
+[info] Valid report written to : assembly_check/target.Filter.AfterQC.AddChr.vcf.valid_assembly_report.1702044383546.txt
+[error] The assembly checking could not be completed: Contig 'chr23' not found in assembly report
diff --git a/tests/resources/assembly_check/invalid.vcf.text_assembly_report.txt b/tests/resources/assembly_check/invalid.vcf.text_assembly_report.txt
@@ -0,0 +1,12 @@
+Line 43: Chromosome chr1, position 955679, reference allele 'T' does not match the reference sequence, expected 'C'
+Line 44: Chromosome chr1, position 957900, reference allele 'A' does not match the reference sequence, expected 'G'
+Line 45: Chromosome chr1, position 959193, reference allele 'A' does not match the reference sequence, expected 'G'
+Line 47: Chromosome chr1, position 983193, reference allele 'G' does not match the reference sequence, expected 'A'
+Line 49: Chromosome chr1, position 993456, reference allele 'T' does not match the reference sequence, expected 'C'
+Line 54: Chromosome chr1, position 1013312, reference allele 'A' does not match the reference sequence, expected 'G'
+Line 55: Chromosome chr1, position 1014545, reference allele 'T' does not match the reference sequence, expected 'C'
+Line 58: Chromosome chr1, position 1051063, reference allele 'T' does not match the reference sequence, expected 'C'
+Line 59: Chromosome chr1, position 1068249, reference allele 'T' does not match the reference sequence, expected 'C'
+Line 72: Chromosome chr1, position 1181173, reference allele 'G' does not match the reference sequence, expected 'C'
+Line 80: Chromosome chr1, position 1298561, reference allele 'C' does not match the reference sequence, expected 'T'
+Line 88: Chromosome chr1, position 1366828, reference allele 'G' does not match the reference sequence, expected 'A'
diff --git a/tests/resources/validation_reports/expected_report.html b/tests/resources/validation_reports/expected_report.html
diff --git a/tests/test_report.py b/tests/test_report.py
@@ -17,7 +17,7 @@
         },
         "input_fail.vcf": {
             "report_path": "/path/to/assembly_failed/report",
-            "error_list": [],
+            "error_list": ["The assembly checking could not be completed: Contig 'chr23' not found in assembly report"],
             "match": 26,
             "mismatch_list": [
                 "Chromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'",
@@ -30,7 +30,7 @@
                 "Chromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'",
                 "Chromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'",
             ],
-            "nb_error": 0,
+            "nb_error": 1,
             "nb_mismatch": 10,
             "total": 36,
         },
@@ -153,5 +153,8 @@ class TestReport(TestCase):
 
     def test_generate_html_report(self):
         report = generate_html_report(validation_results, datetime.datetime(2023, 8, 31, 12, 34, 56), "My cool project")
+        with open('report.html', 'w') as open_file:  # NOTE(review): writes report.html into the current working directory and nothing visible in this hunk removes it; this looks like a debugging aid for regenerating the expected file, so confirm it is meant to be committed
+            open_file.write(report)
+
         with open(self.expected_report) as open_html:
             assert report == open_html.read()  # exact string comparison: any whitespace change in the templates fails this test
diff --git a/tests/test_validator.py b/tests/test_validator.py
@@ -21,7 +21,7 @@ def setUp(self) -> None:
                             [os.path.join(self.vcf_files, 'input_passed.vcf')],
                             [os.path.join(self.fasta_files, 'input_passed.fa')],
                             [os.path.join(self.assembly_reports, 'input_passed.txt')])
-        self.reporter = Validator(self.mapping_file, self.output_dir)
+        self.validator = Validator(self.mapping_file, self.output_dir)
 
     def tearDown(self) -> None:
         for f in ['expected_report.html', self.mapping_file]:
@@ -77,21 +77,21 @@ def test__collect_validation_workflow_results(self):
             }
         }
 
-        self.reporter._collect_validation_workflow_results()
+        self.validator._collect_validation_workflow_results()
         # Drop report paths from comparison (test will fail if missing)
-        del self.reporter.results['metadata_check']['json_report_path']
-        del self.reporter.results['metadata_check']['spreadsheet_report_path']
-        del self.reporter.results['sample_check']['report_path']
-        for file in self.reporter.results['vcf_check'].values():
+        del self.validator.results['metadata_check']['json_report_path']
+        del self.validator.results['metadata_check']['spreadsheet_report_path']
+        del self.validator.results['sample_check']['report_path']
+        for file in self.validator.results['vcf_check'].values():
             del file['report_path']
-        for file in self.reporter.results['assembly_check'].values():
+        for file in self.validator.results['assembly_check'].values():
             del file['report_path']
 
-        assert self.reporter.results == expected_results
+        assert self.validator.results == expected_results
 
     def test_create_report(self):
-        self.reporter._collect_validation_workflow_results()
-        report_path = self.reporter.create_reports()
+        self.validator._collect_validation_workflow_results()
+        report_path = self.validator.create_reports()  # the returned value is used as a filesystem path; the test only checks that it exists, not what the report contains
         assert os.path.exists(report_path)
 
     def test_vcf_check_errors_is_critical(self):
@@ -102,11 +102,11 @@ def test_vcf_check_errors_is_critical(self):
         ]
         expected_return = [False, True, True]
         for i, error in enumerate(errors):
-            assert self.reporter.vcf_check_errors_is_critical(error) == expected_return[i]
+            assert self.validator.vcf_check_errors_is_critical(error) == expected_return[i]
 
     def test_parse_biovalidator_validation_results(self):
-        self.reporter._parse_biovalidator_validation_results()
-        assert self.reporter.results['metadata_check']['json_errors'] == [
+        self.validator._parse_biovalidator_validation_results()
+        assert self.validator.results['metadata_check']['json_errors'] == [
             {'property': '.files', 'description': "should have required property 'files'"},
             {'property': '/project.title', 'description': "should have required property 'title'"},
             {'property': '/analysis/0.description', 'description': "should have required property 'description'"},
@@ -117,7 +117,7 @@ def test_parse_biovalidator_validation_results(self):
         ]
 
     def test_convert_biovalidator_validation_to_spreadsheet(self):
-        self.reporter.results['metadata_check'] = {
+        self.validator.results['metadata_check'] = {
             'json_errors': [
                 {'property': '.files', 'description': "should have required property 'files'"},
                 {'property': '/project.title', 'description': "should have required property 'title'"},
@@ -132,12 +132,25 @@ def test_convert_biovalidator_validation_to_spreadsheet(self):
                 {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
             ]
         }
-        self.reporter._convert_biovalidator_validation_to_spreadsheet()
+        self.validator._convert_biovalidator_validation_to_spreadsheet()
 
-        assert self.reporter.results['metadata_check']['spreadsheet_errors'] == [
+        assert self.validator.results['metadata_check']['spreadsheet_errors'] == [
             {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
             {'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'},
             {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
             {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
             {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}
         ]
+
+    def test_parse_assembly_check_log(self):
+        assembly_check_log = os.path.join(self.resource_dir, 'assembly_check', 'invalid.vcf.assembly_check.log')
+        error_list, nb_error, match, total = self.validator.parse_assembly_check_log(assembly_check_log)  # NOTE(review): nb_error, match and total are unpacked but never asserted, although the fixture log contains one "[error]" line and a "Number of matches: 85755/100394" line
+        assert error_list == ["The assembly checking could not be completed: Contig 'chr23' not found in assembly report"]  # the message must come back without the "[error] " prefix, including the space after the bracket
+
+    def test_parse_assembly_check_report(self):
+        assembly_check_report = os.path.join(self.resource_dir, 'assembly_check', 'invalid.vcf.text_assembly_report.txt')
+        mismatch_list, nb_mismatch, error_list, nb_error = self.validator.parse_assembly_check_report(assembly_check_report)  # returns a 4-tuple: mismatches first, then errors, each as (list, count)
+        assert mismatch_list[0] == "Line 43: Chromosome chr1, position 955679, reference allele 'T' does not match the reference sequence, expected 'C'"  # entries keep the leading "Line N: " text from the report
+        assert nb_mismatch == 12  # the fixture report holds 12 "does not match the reference sequence" lines
+        assert error_list == []  # the fixture has no non-mismatch lines, so no errors are expected
+        assert nb_error == 0