Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA-3522 - Add assembly check error to HTML report. #32

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 81 additions & 65 deletions eva_sub_cli/jinja_templates/file_validation.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,71 +3,87 @@
{% for check_type, check_per_file in validation_results.items() %}
{% set result = check_per_file.get(file_name, {}) %}
{% if check_type == "assembly_check" %}
{% set nb_match = result.get("match", 0) %}
{% set nb_total = result.get("total", 0) %}
{% set match_percentage = nb_match / nb_total * 100 if nb_total else 0 %}
{% if result.get("nb_mismatch", 0) > 0 %}
{% set icon = "❌" %}
{% set row_class = "report-section fail collapsible" %}
{% else %}
{% set icon = "✔" %}
{% set row_class = "report-section pass" %}
{% endif %}
<div class='{{ row_class }}'>{{ icon }} Assembly check: {{ nb_match }}/{{ nb_total }} ({{ match_percentage|round(2) }}%)</div>
{% set mismatch_list = result.get("mismatch_list") %}
{% if mismatch_list %}
<div class="error-list">
<div class="error-description">First 10 errors per category are below. <strong>Full report:</strong> {{ result.get('report_path', '') }}</div>
<table>
<tr>
<th>Category</th><th>Error</th>
</tr>
{% for error in mismatch_list[:10] %}
<tr>
<td><strong>mismatch error</strong></td><td> {{ error }}</td>
</tr>
{% endfor %}
</table>
</div>
{% endif %}
{{ assembly_check(result) }}
{% elif check_type == "vcf_check" %}
{% set critical_count = result.get("critical_count", 0) %}
{% set error_count = result.get("error_count", 0) %}
{% set warning_count = result.get("warning_count", 0) %}
{% if critical_count > 0 %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section fail collapsible" %}
{% elif error_count > 0 %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section warn collapsible" %}
{% else %}
{% set icon = "&#10004;" %}
{% set row_class = "report-section pass" %}
{% endif %}
<div class='{{ row_class }}'>{{ icon }} VCF check: {{ critical_count }} critical errors, {{ error_count }} non-critical errors, {{ warning_count }} warnings </div>
{% set critical_list = result.get("critical_list") %}
{% set error_list = result.get("error_list") %}

{% if critical_list or error_list%}
<div class="error-list">
<div class="error-description">First 10 errors per category are below. <strong>Full report:</strong> {{ result.get('report_path', '') }}</div>
<table>
<tr>
<th>Category</th><th>Error</th>
</tr>
{% for error in critical_list[:10] %}
<tr>
<td><strong>critical error</strong></td><td> {{ error }}</td>
</tr>
{% endfor %}
{% for error in error_list[:10] %}
<tr>
<td><strong>non-critical error</strong></td><td> {{ error }}</td>
</tr>
{% endfor %}
</table>
</div>
{% endif %}
{{ vcf_check(result) }}
{% endif %}
{% endfor %}
{%- endmacro %}
{%- endmacro %}

{% macro vcf_check(vcf_check_result) %}
{#- Render the VCF-check section for one file: a summary row with the
    critical / non-critical / warning counts, followed (only when there is
    something to list) by a table of the first 10 messages per category.
    `vcf_check_result` is read exclusively through `.get(...)`, so missing
    keys fall back to 0 / empty values.
    NOTE: comments in this macro open with a minus sign so that they add no
    whitespace to the rendered HTML. #}
{% set critical_count = vcf_check_result.get("critical_count", 0) %}
{% set error_count = vcf_check_result.get("error_count", 0) %}
{% set warning_count = vcf_check_result.get("warning_count", 0) %}
{#- Status: any critical error => "fail"; otherwise any non-critical error
    => "warn"; otherwise "pass". Warnings alone never change the status. #}
{% if critical_count > 0 %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section fail collapsible" %}
{% elif error_count > 0 %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section warn collapsible" %}
{% else %}
{% set icon = "&#10004;" %}
{% set row_class = "report-section pass" %}
{% endif %}
<div class='{{ row_class }}'>{{ icon }} VCF check: {{ critical_count }} critical errors, {{ error_count }} non-critical errors, {{ warning_count }} warnings </div>
{% set critical_list = vcf_check_result.get("critical_list") %}
{% set error_list = vcf_check_result.get("error_list") %}
{#- The detail table is emitted only when at least one of the two lists is
    non-empty; each list is truncated to its first 10 entries. #}

{% if critical_list or error_list%}
<div class="error-list">
<div class="error-description">First 10 errors per category are below. <strong>Full report:</strong> {{ vcf_check_result.get('report_path', '') }}</div>
<table>
<tr>
<th>Category</th><th>Error</th>
</tr>
{% for error in critical_list[:10] %}
<tr>
<td><strong>critical error</strong></td><td> {{ error }}</td>
</tr>
{% endfor %}
{% for error in error_list[:10] %}
<tr>
<td><strong>non-critical error</strong></td><td> {{ error }}</td>
</tr>
{% endfor %}
</table>
</div>
{% endif %}
{%- endmacro %}

{% macro assembly_check(assembly_check_result) %}
{#- Render the assembly-check section for one file: a summary row with the
    number and percentage of matching reference alleles, followed (only when
    there is something to list) by a table with the first 10 parsing errors
    and the first 10 mismatches.
    `assembly_check_result` is read exclusively through `.get(...)`, so
    missing keys fall back to 0 / empty values.
    NOTE: comments in this macro open with a minus sign so that they add no
    whitespace to the rendered HTML. #}
{% set nb_match = assembly_check_result.get("match", 0) %}
{% set nb_total = assembly_check_result.get("total", 0) %}
{% set match_percentage = nb_match / nb_total * 100 if nb_total else 0 %}
{#- The section fails when there is at least one mismatch OR at least one
    parsing error. Checking only nb_mismatch would show a green "pass" row
    for a run that could not be completed and reported errors but no
    mismatches, even though the error table is rendered right below it. #}
{% if assembly_check_result.get("nb_mismatch", 0) > 0 or assembly_check_result.get("error_list") %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section fail collapsible" %}
{% else %}
{% set icon = "&#10004;" %}
{% set row_class = "report-section pass" %}
{% endif %}
<div class='{{ row_class }}'>{{ icon }} Assembly check: {{ nb_match }}/{{ nb_total }} ({{ match_percentage|round(2) }}%)</div>
{% set mismatch_list = assembly_check_result.get("mismatch_list") %}
{% set error_list = assembly_check_result.get("error_list") %}
{#- Parsing errors are listed before mismatches; each list is truncated to
    its first 10 entries. #}
{% if mismatch_list or error_list %}
<div class="error-list">
<div class="error-description">First 10 errors per category are below. <strong>Full report:</strong> {{ assembly_check_result.get('report_path', '') }}</div>
<table>
<tr>
<th>Category</th><th>Error</th>
</tr>
{% for error in error_list[:10] %}
<tr>
<td><strong>Parsing Error</strong></td><td> {{ error }}</td>
</tr>
{% endfor %}
{% for error in mismatch_list[:10] %}
<tr>
<td><strong>mismatch error</strong></td><td> {{ error }}</td>
</tr>
{% endfor %}
</table>
</div>
{% endif %}
{%- endmacro %}


3 changes: 1 addition & 2 deletions eva_sub_cli/nextflow/validation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ process check_vcf_reference {
tuple path(vcf), path(fasta), path(report)

output:
path "assembly_check/*valid_assembly_report*", emit: vcf_assembly_valid
path "assembly_check/*text_assembly_report*", emit: assembly_check_report
path "assembly_check/*.assembly_check.log", emit: assembly_check_log

Expand All @@ -141,7 +140,7 @@ process check_vcf_reference {
trap 'if [[ \$? == 1 || \$? == 139 ]]; then exit 0; fi' EXIT

mkdir -p assembly_check
$params.executable.vcf_assembly_checker -i $vcf -f $fasta $report_opt -r summary,text,valid -o assembly_check --require-genbank > assembly_check/${vcf}.assembly_check.log 2>&1
$params.executable.vcf_assembly_checker -i $vcf -f $fasta $report_opt -r summary,text -o assembly_check --require-genbank > assembly_check/${vcf}.assembly_check.log 2>&1
"""
}

Expand Down
11 changes: 2 additions & 9 deletions eva_sub_cli/validators/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def parse_assembly_check_log(self, assembly_check_log):
if line.startswith('[error]'):
nb_error += 1
if nb_error < 11:
error_list.append(line.strip()[len('[error]'):])
error_list.append(line.strip()[len('[error] '):])
elif line.startswith('[info] Number of matches:'):
match, total = line.strip()[len('[info] Number of matches: '):].split('/')
match = int(match)
Expand Down Expand Up @@ -270,12 +270,6 @@ def _assembly_check_log(self, vcf_name):
os.path.join(self.output_dir, 'assembly_check', vcf_name + '.assembly_check.log')
)

@lru_cache
def _assembly_check_valid_vcf(self, vcf_name):
    """Look up the 'valid' assembly report produced for ``vcf_name``.

    Builds the glob pattern ``<output_dir>/assembly_check/<vcf_name>.valid_assembly_report*``
    and returns whatever ``resolve_single_file_path`` yields for it.
    """
    report_pattern = vcf_name + '.valid_assembly_report*'
    full_pattern = os.path.join(self.output_dir, 'assembly_check', report_pattern)
    return resolve_single_file_path(full_pattern)

@lru_cache
def _assembly_check_text_report(self, vcf_name):
return resolve_single_file_path(
Expand All @@ -289,10 +283,9 @@ def _collect_assembly_check_results(self):
vcf_name = os.path.basename(vcf_file)

assembly_check_log = self._assembly_check_log(vcf_name)
assembly_check_valid_vcf = self._assembly_check_valid_vcf(vcf_name)
assembly_check_text_report = self._assembly_check_text_report(vcf_name)

if assembly_check_log and assembly_check_valid_vcf and assembly_check_text_report:
if assembly_check_log and assembly_check_text_report:
error_list_from_log, nb_error_from_log, match, total = \
self.parse_assembly_check_log(assembly_check_log)
mismatch_list, nb_mismatch, error_list_from_report, nb_error_from_report = \
Expand Down
9 changes: 9 additions & 0 deletions tests/resources/assembly_check/invalid.vcf.assembly_check.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[info] Reading from input VCF file...
[info] Reading from input FASTA file...
[info] Reading from input FASTA index file...
[info] Creating index from input FASTA file...
[info] Number of matches: 85755/100394
[info] Percentage of matches: 85.4185%
[info] Text report written to : assembly_check/target.Filter.AfterQC.AddChr.vcf.text_assembly_report.1702044383544.txt
[info] Valid report written to : assembly_check/target.Filter.AfterQC.AddChr.vcf.valid_assembly_report.1702044383546.txt
[error] The assembly checking could not be completed: Contig 'chr23' not found in assembly report
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Line 43: Chromosome chr1, position 955679, reference allele 'T' does not match the reference sequence, expected 'C'
Line 44: Chromosome chr1, position 957900, reference allele 'A' does not match the reference sequence, expected 'G'
Line 45: Chromosome chr1, position 959193, reference allele 'A' does not match the reference sequence, expected 'G'
Line 47: Chromosome chr1, position 983193, reference allele 'G' does not match the reference sequence, expected 'A'
Line 49: Chromosome chr1, position 993456, reference allele 'T' does not match the reference sequence, expected 'C'
Line 54: Chromosome chr1, position 1013312, reference allele 'A' does not match the reference sequence, expected 'G'
Line 55: Chromosome chr1, position 1014545, reference allele 'T' does not match the reference sequence, expected 'C'
Line 58: Chromosome chr1, position 1051063, reference allele 'T' does not match the reference sequence, expected 'C'
Line 59: Chromosome chr1, position 1068249, reference allele 'T' does not match the reference sequence, expected 'C'
Line 72: Chromosome chr1, position 1181173, reference allele 'G' does not match the reference sequence, expected 'C'
Line 80: Chromosome chr1, position 1298561, reference allele 'C' does not match the reference sequence, expected 'T'
Line 88: Chromosome chr1, position 1366828, reference allele 'G' does not match the reference sequence, expected 'A'
2 changes: 1 addition & 1 deletion tests/resources/validation_reports/expected_report.html

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions tests/test_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
},
"input_fail.vcf": {
"report_path": "/path/to/assembly_failed/report",
"error_list": [],
"error_list": ["The assembly checking could not be completed: Contig 'chr23' not found in assembly report"],
"match": 26,
"mismatch_list": [
"Chromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'",
Expand All @@ -30,7 +30,7 @@
"Chromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'",
"Chromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'",
],
"nb_error": 0,
"nb_error": 1,
"nb_mismatch": 10,
"total": 36,
},
Expand Down Expand Up @@ -153,5 +153,8 @@ class TestReport(TestCase):

def test_generate_html_report(self):
    """Check that the rendered HTML report equals the stored expected report.

    The report is generated from the module-level ``validation_results``
    fixture with a fixed date and project title, then compared as a whole
    string against the content of ``self.expected_report``.
    """
    report = generate_html_report(validation_results, datetime.datetime(2023, 8, 31, 12, 34, 56), "My cool project")

    # The comparison is done in memory only: the test must not leave a
    # 'report.html' artefact behind in the current working directory.
    with open(self.expected_report) as open_html:
        assert report == open_html.read()
45 changes: 29 additions & 16 deletions tests/test_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def setUp(self) -> None:
[os.path.join(self.vcf_files, 'input_passed.vcf')],
[os.path.join(self.fasta_files, 'input_passed.fa')],
[os.path.join(self.assembly_reports, 'input_passed.txt')])
self.reporter = Validator(self.mapping_file, self.output_dir)
self.validator = Validator(self.mapping_file, self.output_dir)

def tearDown(self) -> None:
for f in ['expected_report.html', self.mapping_file]:
Expand Down Expand Up @@ -77,21 +77,21 @@ def test__collect_validation_workflow_results(self):
}
}

self.reporter._collect_validation_workflow_results()
self.validator._collect_validation_workflow_results()
# Drop report paths from comparison (test will fail if missing)
del self.reporter.results['metadata_check']['json_report_path']
del self.reporter.results['metadata_check']['spreadsheet_report_path']
del self.reporter.results['sample_check']['report_path']
for file in self.reporter.results['vcf_check'].values():
del self.validator.results['metadata_check']['json_report_path']
del self.validator.results['metadata_check']['spreadsheet_report_path']
del self.validator.results['sample_check']['report_path']
for file in self.validator.results['vcf_check'].values():
del file['report_path']
for file in self.reporter.results['assembly_check'].values():
for file in self.validator.results['assembly_check'].values():
del file['report_path']

assert self.reporter.results == expected_results
assert self.validator.results == expected_results

def test_create_report(self):
self.reporter._collect_validation_workflow_results()
report_path = self.reporter.create_reports()
self.validator._collect_validation_workflow_results()
report_path = self.validator.create_reports()
assert os.path.exists(report_path)

def test_vcf_check_errors_is_critical(self):
Expand All @@ -102,11 +102,11 @@ def test_vcf_check_errors_is_critical(self):
]
expected_return = [False, True, True]
for i, error in enumerate(errors):
assert self.reporter.vcf_check_errors_is_critical(error) == expected_return[i]
assert self.validator.vcf_check_errors_is_critical(error) == expected_return[i]

def test_parse_biovalidator_validation_results(self):
self.reporter._parse_biovalidator_validation_results()
assert self.reporter.results['metadata_check']['json_errors'] == [
self.validator._parse_biovalidator_validation_results()
assert self.validator.results['metadata_check']['json_errors'] == [
{'property': '.files', 'description': "should have required property 'files'"},
{'property': '/project.title', 'description': "should have required property 'title'"},
{'property': '/analysis/0.description', 'description': "should have required property 'description'"},
Expand All @@ -117,7 +117,7 @@ def test_parse_biovalidator_validation_results(self):
]

def test_convert_biovalidator_validation_to_spreadsheet(self):
self.reporter.results['metadata_check'] = {
self.validator.results['metadata_check'] = {
'json_errors': [
{'property': '.files', 'description': "should have required property 'files'"},
{'property': '/project.title', 'description': "should have required property 'title'"},
Expand All @@ -132,12 +132,25 @@ def test_convert_biovalidator_validation_to_spreadsheet(self):
{'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
]
}
self.reporter._convert_biovalidator_validation_to_spreadsheet()
self.validator._convert_biovalidator_validation_to_spreadsheet()

assert self.reporter.results['metadata_check']['spreadsheet_errors'] == [
assert self.validator.results['metadata_check']['spreadsheet_errors'] == [
{'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
{'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
{'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}
]

def test_parse_assembly_check_log(self):
    """Parse the sample assembly-check log and verify the extracted error messages."""
    log_path = os.path.join(self.resource_dir, 'assembly_check', 'invalid.vcf.assembly_check.log')
    parsed = self.validator.parse_assembly_check_log(log_path)
    # The parser returns four values; only the error messages are checked here.
    error_list, _nb_error, _match, _total = parsed
    expected_errors = [
        "The assembly checking could not be completed: Contig 'chr23' not found in assembly report",
    ]
    assert error_list == expected_errors

def test_parse_assembly_check_report(self):
    """Parse the sample text assembly report and verify mismatches and errors."""
    report_path = os.path.join(self.resource_dir, 'assembly_check', 'invalid.vcf.text_assembly_report.txt')
    parsed = self.validator.parse_assembly_check_report(report_path)
    mismatches, mismatch_count, errors, error_count = parsed

    first_expected = (
        "Line 43: Chromosome chr1, position 955679, reference allele 'T' "
        "does not match the reference sequence, expected 'C'"
    )
    assert mismatches[0] == first_expected
    assert mismatch_count == 12
    # The sample report contains mismatches only, no parsing errors.
    assert errors == []
    assert error_count == 0
Loading