EVA-3459 - New script and tests to check if sequences in fasta are from INSDC #19

Merged: 6 commits, Feb 9, 2024
94 changes: 94 additions & 0 deletions bin/check_fasta_insdc.py
@@ -0,0 +1,94 @@
#!/usr/bin/env python

import argparse
import gzip
import hashlib

from itertools import groupby

import requests
from ebi_eva_common_pyutils.logger import logging_config

import yaml
from requests import HTTPError
from retry import retry

REFGET_SERVER = 'https://www.ebi.ac.uk/ena/cram'

logger = logging_config.get_logger(__name__)


def open_gzip_if_required(input_file):
if input_file.endswith('.gz'):
return gzip.open(input_file, 'rt')
else:
return open(input_file, 'r')


def write_result_yaml(output_yaml, results):
with open(output_yaml, 'w') as open_yaml:
yaml.safe_dump(data=results, stream=open_yaml)


def refget_md5_digest(sequence):
return hashlib.md5(sequence.upper().encode('utf-8')).hexdigest()


def fasta_iter(input_fasta):
"""
    Given a fasta file, yield tuples of (header, sequence).
"""
# first open the file outside
with open(input_fasta, 'r') as open_file:

# ditch the boolean (x[0]) and just keep the header or sequence since
# we know they alternate.
faiter = (x[1] for x in groupby(open_file, lambda line: line[0] == ">"))

for header in faiter:
# drop the ">"
        headerStr = next(header)[1:].strip()

# join all sequence lines to one.
seq = "".join(s.strip() for s in faiter.__next__())
yield (headerStr, seq)


@retry(exceptions=(HTTPError,), tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def get_refget_metadata(md5_digest):
response = requests.get(f'{REFGET_SERVER}/sequence/{md5_digest}/metadata')
if 500 <= response.status_code < 600:
raise HTTPError(f"{response.status_code} Server Error: {response.reason} for url: {response.url}", response=response)
if 200 <= response.status_code < 300:
return response.json()
return None


def assess_fasta(input_fasta):
results = {'sequences': []}
all_insdc = True
for header, sequence in fasta_iter(input_fasta):
name = header.split()[0]
md5_digest = refget_md5_digest(sequence)
sequence_metadata = get_refget_metadata(md5_digest)
results['sequences'].append({'sequence_name': name, 'sequence_md5': md5_digest, 'insdc': bool(sequence_metadata)})
all_insdc = all_insdc and bool(sequence_metadata)
results['all_insdc'] = all_insdc
return results


def main():
arg_parser = argparse.ArgumentParser(
description="Calculate each sequence's Refget MD5 digest and compare these against INSDC Refget server.")
arg_parser.add_argument('--input_fasta', required=True, dest='input_fasta',
help='Fasta file that contains the sequence to be checked')
arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml',
help='Path to the location of the results')
args = arg_parser.parse_args()
logging_config.add_stdout_handler()
results = assess_fasta(args.input_fasta)
write_result_yaml(args.output_yaml, results)


if __name__ == "__main__":
main()
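
As a reviewer note, here is a rough sketch of how the new script could be exercised, either from the command line or by importing `assess_fasta` directly, assuming `bin/` is on the import path; the fasta path and the output shape shown in comments are illustrative, derived from `assess_fasta` above rather than copied from the PR.

```python
# Hypothetical command-line usage (paths are placeholders):
#   check_fasta_insdc.py --input_fasta genome.fa --output_yaml genome.fa_check.yml

# The same check driven from Python, assuming bin/ is importable:
from check_fasta_insdc import assess_fasta, write_result_yaml

results = assess_fasta('genome.fa')                 # placeholder fasta file
write_result_yaml('genome.fa_check.yml', results)

# Based on assess_fasta above, `results` has this shape:
# {
#     'all_insdc': False,
#     'sequences': [
#         {'sequence_name': 'chr1', 'sequence_md5': '<hex digest>', 'insdc': True},
#         {'sequence_name': 'scaffold_7', 'sequence_md5': '<hex digest>', 'insdc': False},
#     ]
# }
```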
21 changes: 7 additions & 14 deletions eva_sub_cli/docker_validator.py
@@ -15,7 +15,7 @@

docker_path = 'docker'
container_image = 'ebivariation/eva-sub-cli'
container_tag = 'v0.0.1.dev2'
container_tag = 'v0.0.1.dev3'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = '/opt/vcf_validation/vcf_validation_output'
container_etc_dir = '/opt/eva_sub_cli/etc'
@@ -27,30 +27,22 @@ class DockerValidator(Reporter):

def __init__(self, mapping_file, output_dir, metadata_json=None,
metadata_xlsx=None, container_name=None, docker_path='docker', submission_config=None):
        # validator writes to the validation output directory
# If the submission_config is not set it will also be written to the VALIDATION_OUTPUT_DIR
super().__init__(mapping_file, os.path.join(output_dir, VALIDATION_OUTPUT_DIR),
submission_config=submission_config)

self.docker_path = docker_path
self.mapping_file = mapping_file
self.metadata_json = metadata_json
self.metadata_xlsx = metadata_xlsx
self.container_name = container_name
if self.container_name is None:
self.container_name = container_image.split('/')[1] + '.' + container_tag
self.spreadsheet2json_conf = os.path.join(ETC_DIR, "spreadsheet2json_conf.yaml")
        # validator writes to the validation output directory
# If the submission_config is not set it will also be written to the VALIDATION_OUTPUT_DIR
super().__init__(self._find_vcf_file(), os.path.join(output_dir, VALIDATION_OUTPUT_DIR),
submission_config=submission_config)

def _validate(self):
self.run_docker_validator()

def _find_vcf_file(self):
vcf_files = []
with open(self.mapping_file) as open_file:
reader = csv.DictReader(open_file, delimiter=',')
for row in reader:
vcf_files.append(row['vcf'])
return vcf_files

def get_docker_validation_cmd(self):
if self.metadata_xlsx and not self.metadata_json:
docker_cmd = (
@@ -97,6 +89,7 @@ def run_docker_validator(self):
self.copy_files_to_container()

docker_cmd = self.get_docker_validation_cmd()
print(docker_cmd)
# start validation
        # FIXME: If nextflow fails inside the docker exec, this still exits with error code 0
run_command_with_output("Run Validation using Nextflow", docker_cmd)
30 changes: 30 additions & 0 deletions eva_sub_cli/jinja_templates/fasta_check.html
@@ -0,0 +1,30 @@

{% macro fasta_check_report(validation_results) -%}
{% for fasta_file, results_for_fasta in validation_results.get('fasta_check', {}).items() %}
{% if results_for_fasta.get('all_insdc') %}
{% set icon = "&#10004;" %}
{% set row_class = "report-section pass" %}
{% set text = "all sequences are INSDC accessioned" %}
{% else %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section fail collapsible" %}
{% set text = "some sequences are not INSDC accessioned" %}
{% endif %}
<div class='{{ row_class }}'>{{ icon }} {{ fasta_file }}: {{ text }} </div>
{% if not results_for_fasta.get('all_insdc') %}
<div class="error-list">
<div class="error-description">First 10 sequences not in INSDC. <strong>Full report:</strong> {{ results_for_fasta.get('report_path', '') }}</div>
<table>
<tr>
<th>Sequence name</th><th>Refget md5</th>
</tr>
{% set sequence_info_list = results_for_fasta.get('sequences', [])|rejectattr("insdc")|list %}
{% for sequence_info in sequence_info_list[:10] %}
<tr>
<td><strong>{{sequence_info.get('sequence_name') }}</strong></td><td> {{ sequence_info.get('sequence_md5') }}</td>
</tr>
            {% endfor %}
        </table>
        </div>
    {% endif %}
{% endfor %}
{%- endmacro %}
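
For context, a minimal sketch (file names and digests invented) of the `validation_results` structure this macro reads, matching the lookups above (`fasta_check` → per-file dict → `all_insdc` / `report_path` / `sequences`):

```python
# Illustrative only; the keys mirror what the template accesses.
validation_results = {
    'fasta_check': {
        'genome.fa': {                                      # hypothetical fasta file name
            'all_insdc': False,
            'report_path': '/path/to/genome.fa_check.yml',  # placeholder
            'sequences': [
                {'sequence_name': 'chr1', 'sequence_md5': '0123abcd...', 'insdc': True},
                {'sequence_name': 'scaffold_7', 'sequence_md5': '89efcdab...', 'insdc': False},
            ],
        }
    }
}
```

Note that the macro falls back to an empty string for `report_path`, which the reporter changes in this PR do not appear to populate, so that cell may render empty.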
10 changes: 10 additions & 0 deletions eva_sub_cli/jinja_templates/html_report.html
@@ -1,7 +1,9 @@
<!DOCTYPE html>
{% from 'file_validation.html' import file_validation_report %}
{% from 'sample_name_check.html' import sample_name_check_report %}
{% from 'fasta_check.html' import fasta_check_report %}
{% from 'metadata_validation.html' import metadata_validation_report %}

<html lang="EN">
<head>
<meta charset="UTF-8">
@@ -74,6 +76,14 @@ <h2>Sample name concordance check</h2>
{{ sample_name_check_report(validation_results)}}
</section>

<section>
<h2>Reference Genome INSDC check</h2>
<div class="description">
Checks that the reference sequences in the fasta file used to call the variants are accessioned in INSDC.
</div>
{{ fasta_check_report(validation_results)}}
</section>

<script>
let collapsibles = document.querySelectorAll('.collapsible');
for (let collapsible of collapsibles) {
29 changes: 28 additions & 1 deletion eva_sub_cli/nextflow/validation.nf
@@ -25,11 +25,12 @@ params.executable = [
"vcf_validator": "vcf_validator",
"vcf_assembly_checker": "vcf_assembly_checker",
"samples_checker": "samples_checker.py",
"fasta_checker": "check_fasta_insdc.py",
"xlsx2json": "xlsx2json.py",
"biovalidator": "biovalidator"
]
// validation tasks
params.validation_tasks = [ "vcf_check", "assembly_check", "samples_check", "metadata_check"]
params.validation_tasks = [ "vcf_check", "assembly_check", "samples_check", "metadata_check", "insdc_check"]
// container validation dir (prefix for vcf files)
params.container_validation_dir = "/opt/vcf_validation"
// help
@@ -79,6 +80,13 @@ workflow {
if ("samples_check" in params.validation_tasks) {
sample_name_concordance(metadata_json, vcf_files.collect())
}
if ("insdc_check" in params.validation_tasks){
fasta_files = Channel.fromPath(params.vcf_files_mapping)
.splitCsv(header:true)
.map{row -> file(params.container_validation_dir+row.fasta)}
.unique()
insdc_checker(fasta_files)
}
}

/*
@@ -187,3 +195,22 @@ process sample_name_concordance {
$params.executable.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml
"""
}



process insdc_checker {
publishDir "$params.output_dir",
overwrite: true,
mode: "copy"

input:
path(fasta_file)

output:
path "${fasta_file}_check.yml", emit: fasta_checker_yml

script:
"""
$params.executable.fasta_checker --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml
"""
}
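
For orientation, the `insdc_check` branch above feeds the checker with the unique fasta paths listed in the same `vcf_files_mapping` CSV used by the rest of the pipeline. Below is a Python sketch of the equivalent deduplication logic, with the column name taken from `Reporter._find_vcf_and_fasta_files` and the prefix from `params.container_validation_dir`; the CSV path in the example is a placeholder.

```python
import csv

def unique_fasta_files(mapping_file, container_validation_dir='/opt/vcf_validation'):
    """Roughly what Channel.fromPath(...).splitCsv(header:true).map{...}.unique() does above."""
    seen = []
    with open(mapping_file) as open_file:
        for row in csv.DictReader(open_file, delimiter=','):
            fasta = container_validation_dir + row['fasta']
            if fasta not in seen:
                seen.append(fasta)
    return seen

# Example (placeholder path):
# unique_fasta_files('vcf_files_mapping.csv')
```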
25 changes: 24 additions & 1 deletion eva_sub_cli/reporter.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
import csv
import datetime
import glob
import os
@@ -27,9 +28,12 @@ def resolve_single_file_path(file_path):

class Reporter:

def __init__(self, vcf_files, output_dir, submission_config: WritableConfig = None):
def __init__(self, mapping_file, output_dir, submission_config: WritableConfig = None):
self.output_dir = output_dir
self.mapping_file = mapping_file
vcf_files, fasta_files = self._find_vcf_and_fasta_files()
self.vcf_files = vcf_files
self.fasta_files = fasta_files
self.results = {}
self.project_title = None # TODO fill this from metadata?
self.validation_date = datetime.datetime.now()
@@ -46,6 +50,16 @@ def __exit__(self, exc_type, exc_val, exc_tb):
self.sub_config.backup()
self.sub_config.write()

def _find_vcf_and_fasta_files(self):
vcf_files = []
fasta_files = []
with open(self.mapping_file) as open_file:
reader = csv.DictReader(open_file, delimiter=',')
for row in reader:
vcf_files.append(row['vcf'])
fasta_files.append(row['fasta'])
return vcf_files, fasta_files

def validate(self):
self._validate()
self._collect_validation_workflow_results()
@@ -151,6 +165,7 @@ def _collect_validation_workflow_results(self, ):
self._collect_vcf_check_results()
self._collect_assembly_check_results()
self._load_sample_check_results()
self._load_fasta_check_results()
self._parse_biovalidator_validation_results()
self._convert_biovalidator_validation_to_spreadsheet()
self._write_spreadsheet_validation_results()
@@ -247,6 +262,14 @@ def _collect_assembly_check_results(self):
def _sample_check_yaml(self):
return resolve_single_file_path(os.path.join(self.output_dir, 'sample_checker.yml'))

    def _load_fasta_check_results(self):
        self.results['fasta_check'] = {}
        for fasta_file in self.fasta_files:
            fasta_file_name = os.path.basename(fasta_file)
            fasta_check = resolve_single_file_path(os.path.join(self.output_dir, f'{fasta_file_name}_check.yml'))
            with open(fasta_check) as open_yaml:
                self.results['fasta_check'][fasta_file_name] = yaml.safe_load(open_yaml)

def _load_sample_check_results(self):
with open(self._sample_check_yaml) as open_yaml:
self.results['sample_check'] = yaml.safe_load(open_yaml)