Skip to content

Commit

Permalink
Merge pull request #32 from tcezard/EVA_3522_fix_assembly_check_report
Browse files Browse the repository at this point in the history
EVA-3522 - Add assembly check error to HTML report.
  • Loading branch information
tcezard authored May 2, 2024
2 parents 0e73aa8 + 43e5747 commit ab60a7e
Show file tree
Hide file tree
Showing 8 changed files with 140 additions and 95 deletions.
146 changes: 81 additions & 65 deletions eva_sub_cli/jinja_templates/file_validation.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,71 +3,87 @@
{% for check_type, check_per_file in validation_results.items() %}
{% set result = check_per_file.get(file_name, {}) %}
{% if check_type == "assembly_check" %}
{% set nb_match = result.get("match", 0) %}
{% set nb_total = result.get("total", 0) %}
{% set match_percentage = nb_match / nb_total * 100 if nb_total else 0 %}
{% if result.get("nb_mismatch", 0) > 0 %}
{% set icon = "❌" %}
{% set row_class = "report-section fail collapsible" %}
{% else %}
{% set icon = "✔" %}
{% set row_class = "report-section pass" %}
{% endif %}
<div class='{{ row_class }}'>{{ icon }} Assembly check: {{ nb_match }}/{{ nb_total }} ({{ match_percentage|round(2) }}%)</div>
{% set mismatch_list = result.get("mismatch_list") %}
{% if mismatch_list %}
<div class="error-list">
<div class="error-description">First 10 errors per category are below. <strong>Full report:</strong> {{ result.get('report_path', '') }}</div>
<table>
<tr>
<th>Category</th><th>Error</th>
</tr>
{% for error in mismatch_list[:10] %}
<tr>
<td><strong>mismatch error</strong></td><td> {{ error }}</td>
</tr>
{% endfor %}
</table>
</div>
{% endif %}
{{ assembly_check(result) }}
{% elif check_type == "vcf_check" %}
{% set critical_count = result.get("critical_count", 0) %}
{% set error_count = result.get("error_count", 0) %}
{% set warning_count = result.get("warning_count", 0) %}
{% if critical_count > 0 %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section fail collapsible" %}
{% elif error_count > 0 %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section warn collapsible" %}
{% else %}
{% set icon = "&#10004;" %}
{% set row_class = "report-section pass" %}
{% endif %}
<div class='{{ row_class }}'>{{ icon }} VCF check: {{ critical_count }} critical errors, {{ error_count }} non-critical errors, {{ warning_count }} warnings </div>
{% set critical_list = result.get("critical_list") %}
{% set error_list = result.get("error_list") %}

{% if critical_list or error_list%}
<div class="error-list">
<div class="error-description">First 10 errors per category are below. <strong>Full report:</strong> {{ result.get('report_path', '') }}</div>
<table>
<tr>
<th>Category</th><th>Error</th>
</tr>
{% for error in critical_list[:10] %}
<tr>
<td><strong>critical error</strong></td><td> {{ error }}</td>
</tr>
{% endfor %}
{% for error in error_list[:10] %}
<tr>
<td><strong>non-critical error</strong></td><td> {{ error }}</td>
</tr>
{% endfor %}
</table>
</div>
{% endif %}
{{ vcf_check(result) }}
{% endif %}
{% endfor %}
{%- endmacro %}
{%- endmacro %}

{% macro vcf_check(vcf_check_result) %}
{#- Render the VCF check section for a single file.

    vcf_check_result is read only through .get(); the keys used are
    "critical_count", "error_count", "warning_count", "critical_list",
    "error_list" and "report_path". Missing counts default to 0.

    Output: one summary row, followed (only when at least one of the two
    error lists is non-empty) by a table showing at most the first 10
    critical and the first 10 non-critical errors. #}
{% set critical_count = vcf_check_result.get("critical_count", 0) %}
{% set error_count = vcf_check_result.get("error_count", 0) %}
{% set warning_count = vcf_check_result.get("warning_count", 0) %}
{#- Status precedence: any critical error -> "fail"; otherwise any
    non-critical error -> "warn"; otherwise "pass". Warnings are displayed
    in the summary but never change the status. #}
{% if critical_count > 0 %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section fail collapsible" %}
{% elif error_count > 0 %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section warn collapsible" %}
{% else %}
{% set icon = "&#10004;" %}
{% set row_class = "report-section pass" %}
{% endif %}
<div class='{{ row_class }}'>{{ icon }} VCF check: {{ critical_count }} critical errors, {{ error_count }} non-critical errors, {{ warning_count }} warnings </div>
{% set critical_list = vcf_check_result.get("critical_list") %}
{% set error_list = vcf_check_result.get("error_list") %}

{#- The detail table is emitted only when there is something to list. #}
{% if critical_list or error_list%}
<div class="error-list">
<div class="error-description">First 10 errors per category are below. <strong>Full report:</strong> {{ vcf_check_result.get('report_path', '') }}</div>
<table>
<tr>
<th>Category</th><th>Error</th>
</tr>
{#- Critical errors are listed before non-critical ones; each category is capped at 10 rows. #}
{% for error in critical_list[:10] %}
<tr>
<td><strong>critical error</strong></td><td> {{ error }}</td>
</tr>
{% endfor %}
{% for error in error_list[:10] %}
<tr>
<td><strong>non-critical error</strong></td><td> {{ error }}</td>
</tr>
{% endfor %}
</table>
</div>
{% endif %}
{%- endmacro %}

{% macro assembly_check(assembly_check_result) %}
{#- Render the assembly check section for a single file.

    assembly_check_result is read only through .get(); the keys used are
    "match", "total", "nb_mismatch", "nb_error", "mismatch_list",
    "error_list" and "report_path". Missing counts default to 0.

    Output: one summary row with the match ratio and percentage, followed
    (only when there are mismatches or parsing errors to list) by a table
    showing at most the first 10 entries of each category. #}
{% set nb_match = assembly_check_result.get("match", 0) %}
{% set nb_total = assembly_check_result.get("total", 0) %}
{#- Guard against division by zero when no variant was checked. #}
{% set match_percentage = nb_match / nb_total * 100 if nb_total else 0 %}
{#- A run that reported parsing errors is a failure even when no mismatch
    was counted; otherwise the row would be labelled as passing while
    "Parsing Error" rows are listed right below it. #}
{% if assembly_check_result.get("nb_mismatch", 0) > 0 or assembly_check_result.get("nb_error", 0) > 0 %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section fail collapsible" %}
{% else %}
{% set icon = "&#10004;" %}
{% set row_class = "report-section pass" %}
{% endif %}
<div class='{{ row_class }}'>{{ icon }} Assembly check: {{ nb_match }}/{{ nb_total }} ({{ match_percentage|round(2) }}%)</div>
{% set mismatch_list = assembly_check_result.get("mismatch_list") %}
{% set error_list = assembly_check_result.get("error_list") %}
{#- The detail table is emitted only when there is something to list. #}
{% if mismatch_list or error_list %}
<div class="error-list">
<div class="error-description">First 10 errors per category are below. <strong>Full report:</strong> {{ assembly_check_result.get('report_path', '') }}</div>
<table>
<tr>
<th>Category</th><th>Error</th>
</tr>
{#- Parsing errors are listed before reference mismatches; each category is capped at 10 rows. #}
{% for error in error_list[:10] %}
<tr>
<td><strong>Parsing Error</strong></td><td> {{ error }}</td>
</tr>
{% endfor %}
{% for error in mismatch_list[:10] %}
<tr>
<td><strong>mismatch error</strong></td><td> {{ error }}</td>
</tr>
{% endfor %}
</table>
</div>
{% endif %}
{%- endmacro %}


3 changes: 1 addition & 2 deletions eva_sub_cli/nextflow/validation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ process check_vcf_reference {
tuple path(vcf), path(fasta), path(report)

output:
path "assembly_check/*valid_assembly_report*", emit: vcf_assembly_valid
path "assembly_check/*text_assembly_report*", emit: assembly_check_report
path "assembly_check/*.assembly_check.log", emit: assembly_check_log

Expand All @@ -141,7 +140,7 @@ process check_vcf_reference {
trap 'if [[ \$? == 1 || \$? == 139 ]]; then exit 0; fi' EXIT
mkdir -p assembly_check
$params.executable.vcf_assembly_checker -i $vcf -f $fasta $report_opt -r summary,text,valid -o assembly_check --require-genbank > assembly_check/${vcf}.assembly_check.log 2>&1
$params.executable.vcf_assembly_checker -i $vcf -f $fasta $report_opt -r summary,text -o assembly_check --require-genbank > assembly_check/${vcf}.assembly_check.log 2>&1
"""
}

Expand Down
11 changes: 2 additions & 9 deletions eva_sub_cli/validators/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def parse_assembly_check_log(self, assembly_check_log):
if line.startswith('[error]'):
nb_error += 1
if nb_error < 11:
error_list.append(line.strip()[len('[error]'):])
error_list.append(line.strip()[len('[error] '):])
elif line.startswith('[info] Number of matches:'):
match, total = line.strip()[len('[info] Number of matches: '):].split('/')
match = int(match)
Expand Down Expand Up @@ -270,12 +270,6 @@ def _assembly_check_log(self, vcf_name):
os.path.join(self.output_dir, 'assembly_check', vcf_name + '.assembly_check.log')
)

@lru_cache
def _assembly_check_valid_vcf(self, vcf_name):
return resolve_single_file_path(
os.path.join(self.output_dir, 'assembly_check', vcf_name + '.valid_assembly_report*')
)

@lru_cache
def _assembly_check_text_report(self, vcf_name):
return resolve_single_file_path(
Expand All @@ -289,10 +283,9 @@ def _collect_assembly_check_results(self):
vcf_name = os.path.basename(vcf_file)

assembly_check_log = self._assembly_check_log(vcf_name)
assembly_check_valid_vcf = self._assembly_check_valid_vcf(vcf_name)
assembly_check_text_report = self._assembly_check_text_report(vcf_name)

if assembly_check_log and assembly_check_valid_vcf and assembly_check_text_report:
if assembly_check_log and assembly_check_text_report:
error_list_from_log, nb_error_from_log, match, total = \
self.parse_assembly_check_log(assembly_check_log)
mismatch_list, nb_mismatch, error_list_from_report, nb_error_from_report = \
Expand Down
9 changes: 9 additions & 0 deletions tests/resources/assembly_check/invalid.vcf.assembly_check.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[info] Reading from input VCF file...
[info] Reading from input FASTA file...
[info] Reading from input FASTA index file...
[info] Creating index from input FASTA file...
[info] Number of matches: 85755/100394
[info] Percentage of matches: 85.4185%
[info] Text report written to : assembly_check/target.Filter.AfterQC.AddChr.vcf.text_assembly_report.1702044383544.txt
[info] Valid report written to : assembly_check/target.Filter.AfterQC.AddChr.vcf.valid_assembly_report.1702044383546.txt
[error] The assembly checking could not be completed: Contig 'chr23' not found in assembly report
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Line 43: Chromosome chr1, position 955679, reference allele 'T' does not match the reference sequence, expected 'C'
Line 44: Chromosome chr1, position 957900, reference allele 'A' does not match the reference sequence, expected 'G'
Line 45: Chromosome chr1, position 959193, reference allele 'A' does not match the reference sequence, expected 'G'
Line 47: Chromosome chr1, position 983193, reference allele 'G' does not match the reference sequence, expected 'A'
Line 49: Chromosome chr1, position 993456, reference allele 'T' does not match the reference sequence, expected 'C'
Line 54: Chromosome chr1, position 1013312, reference allele 'A' does not match the reference sequence, expected 'G'
Line 55: Chromosome chr1, position 1014545, reference allele 'T' does not match the reference sequence, expected 'C'
Line 58: Chromosome chr1, position 1051063, reference allele 'T' does not match the reference sequence, expected 'C'
Line 59: Chromosome chr1, position 1068249, reference allele 'T' does not match the reference sequence, expected 'C'
Line 72: Chromosome chr1, position 1181173, reference allele 'G' does not match the reference sequence, expected 'C'
Line 80: Chromosome chr1, position 1298561, reference allele 'C' does not match the reference sequence, expected 'T'
Line 88: Chromosome chr1, position 1366828, reference allele 'G' does not match the reference sequence, expected 'A'
2 changes: 1 addition & 1 deletion tests/resources/validation_reports/expected_report.html

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions tests/test_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
},
"input_fail.vcf": {
"report_path": "/path/to/assembly_failed/report",
"error_list": [],
"error_list": ["The assembly checking could not be completed: Contig 'chr23' not found in assembly report"],
"match": 26,
"mismatch_list": [
"Chromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'",
Expand All @@ -30,7 +30,7 @@
"Chromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'",
"Chromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'",
],
"nb_error": 0,
"nb_error": 1,
"nb_mismatch": 10,
"total": 36,
},
Expand Down Expand Up @@ -153,5 +153,8 @@ class TestReport(TestCase):

def test_generate_html_report(self):
    """Render the HTML report from the fixture results and compare it with the stored expected report.

    The rendered text must be exactly equal to the content of
    ``self.expected_report``.
    """
    report = generate_html_report(validation_results, datetime.datetime(2023, 8, 31, 12, 34, 56), "My cool project")
    # The rendered report is deliberately not written to disk: dumping
    # 'report.html' into the current working directory left an untracked
    # artefact behind on every test run and was not used by the assertion.
    with open(self.expected_report) as open_html:
        assert report == open_html.read()
45 changes: 29 additions & 16 deletions tests/test_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def setUp(self) -> None:
[os.path.join(self.vcf_files, 'input_passed.vcf')],
[os.path.join(self.fasta_files, 'input_passed.fa')],
[os.path.join(self.assembly_reports, 'input_passed.txt')])
self.reporter = Validator(self.mapping_file, self.output_dir)
self.validator = Validator(self.mapping_file, self.output_dir)

def tearDown(self) -> None:
for f in ['expected_report.html', self.mapping_file]:
Expand Down Expand Up @@ -77,21 +77,21 @@ def test__collect_validation_workflow_results(self):
}
}

self.reporter._collect_validation_workflow_results()
self.validator._collect_validation_workflow_results()
# Drop report paths from comparison (test will fail if missing)
del self.reporter.results['metadata_check']['json_report_path']
del self.reporter.results['metadata_check']['spreadsheet_report_path']
del self.reporter.results['sample_check']['report_path']
for file in self.reporter.results['vcf_check'].values():
del self.validator.results['metadata_check']['json_report_path']
del self.validator.results['metadata_check']['spreadsheet_report_path']
del self.validator.results['sample_check']['report_path']
for file in self.validator.results['vcf_check'].values():
del file['report_path']
for file in self.reporter.results['assembly_check'].values():
for file in self.validator.results['assembly_check'].values():
del file['report_path']

assert self.reporter.results == expected_results
assert self.validator.results == expected_results

def test_create_report(self):
self.reporter._collect_validation_workflow_results()
report_path = self.reporter.create_reports()
self.validator._collect_validation_workflow_results()
report_path = self.validator.create_reports()
assert os.path.exists(report_path)

def test_vcf_check_errors_is_critical(self):
Expand All @@ -102,11 +102,11 @@ def test_vcf_check_errors_is_critical(self):
]
expected_return = [False, True, True]
for i, error in enumerate(errors):
assert self.reporter.vcf_check_errors_is_critical(error) == expected_return[i]
assert self.validator.vcf_check_errors_is_critical(error) == expected_return[i]

def test_parse_biovalidator_validation_results(self):
self.reporter._parse_biovalidator_validation_results()
assert self.reporter.results['metadata_check']['json_errors'] == [
self.validator._parse_biovalidator_validation_results()
assert self.validator.results['metadata_check']['json_errors'] == [
{'property': '.files', 'description': "should have required property 'files'"},
{'property': '/project.title', 'description': "should have required property 'title'"},
{'property': '/analysis/0.description', 'description': "should have required property 'description'"},
Expand All @@ -117,7 +117,7 @@ def test_parse_biovalidator_validation_results(self):
]

def test_convert_biovalidator_validation_to_spreadsheet(self):
self.reporter.results['metadata_check'] = {
self.validator.results['metadata_check'] = {
'json_errors': [
{'property': '.files', 'description': "should have required property 'files'"},
{'property': '/project.title', 'description': "should have required property 'title'"},
Expand All @@ -132,12 +132,25 @@ def test_convert_biovalidator_validation_to_spreadsheet(self):
{'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
]
}
self.reporter._convert_biovalidator_validation_to_spreadsheet()
self.validator._convert_biovalidator_validation_to_spreadsheet()

assert self.reporter.results['metadata_check']['spreadsheet_errors'] == [
assert self.validator.results['metadata_check']['spreadsheet_errors'] == [
{'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
{'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
{'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}
]

def test_parse_assembly_check_log(self):
    """Check every value parsed from the fixture assembly check log.

    The fixture log contains a single ``[error]`` line and the line
    ``[info] Number of matches: 85755/100394``.
    """
    assembly_check_log = os.path.join(self.resource_dir, 'assembly_check', 'invalid.vcf.assembly_check.log')
    error_list, nb_error, match, total = self.validator.parse_assembly_check_log(assembly_check_log)
    # The '[error] ' prefix (including the trailing space) must be stripped from the message.
    assert error_list == ["The assembly checking could not be completed: Contig 'chr23' not found in assembly report"]
    # Also pin the counters, which were previously unpacked but never verified.
    assert nb_error == 1
    assert match == 85755
    # int() keeps the check valid whether total is returned as a number or as the raw text.
    assert int(total) == 100394

def test_parse_assembly_check_report(self):
    """Parse the fixture text assembly report and verify the mismatches and error counters."""
    report_path = os.path.join(
        self.resource_dir, 'assembly_check', 'invalid.vcf.text_assembly_report.txt'
    )
    parsed = self.validator.parse_assembly_check_report(report_path)
    mismatches, mismatch_count, errors, error_count = parsed

    # The first mismatch is reported verbatim, including its "Line N:" prefix.
    first_expected = (
        "Line 43: Chromosome chr1, position 955679, reference allele 'T' "
        "does not match the reference sequence, expected 'C'"
    )
    assert mismatches[0] == first_expected
    # Twelve mismatches in total and no parsing error in this report.
    assert mismatch_count == 12
    assert errors == []
    assert error_count == 0

0 comments on commit ab60a7e

Please sign in to comment.