
Commit

New script and tests to check if sequences in fasta are from INSDC. Also added report to HTML
tcezard committed Feb 6, 2024
1 parent ac2d01f commit 77be2c4
Showing 14 changed files with 4,104 additions and 32 deletions.
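The headline addition is bin/check_fasta_insdc.py, which computes a Refget MD5 digest for every sequence in a FASTA file, queries the ENA Refget server to decide whether each sequence is INSDC-accessioned, and writes the outcome to a YAML file that the HTML report now renders. A minimal invocation, using hypothetical file names, would be:

    python bin/check_fasta_insdc.py --input_fasta genome.fa --output_yaml genome.fa_check.yml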
95 changes: 95 additions & 0 deletions bin/check_fasta_insdc.py
@@ -0,0 +1,95 @@
#!/usr/bin/env python

import argparse
import gzip
import hashlib

from itertools import groupby

import requests
from ebi_eva_common_pyutils.logger import logging_config

import yaml
from requests import HTTPError
from retry import retry

REFGET_SERVER = 'https://www.ebi.ac.uk/ena/cram'

logger = logging_config.get_logger(__name__)


def open_gzip_if_required(input_file):
if input_file.endswith('.gz'):
return gzip.open(input_file, 'rt')
else:
return open(input_file, 'r')


def write_result_yaml(output_yaml, results):
with open(output_yaml, 'w') as open_yaml:
yaml.safe_dump(data=results, stream=open_yaml)


def refget_md5_digest(sequence):
return hashlib.md5(sequence.upper().encode('utf-8')).hexdigest()


def fasta_iter(input_fasta):
"""
Given a fasta file. yield tuples of header, sequence
"""
"first open the file outside "
fin = open(input_fasta, 'r')

    # ditch the boolean (x[0]) and just keep the header or sequence group,
    # since we know they alternate
    faiter = (x[1] for x in groupby(fin, lambda line: line[0] == ">"))

    for header in faiter:
        # drop the ">" from the header line
        header_str = next(header)[1:].strip()

        # join all sequence lines into one string
        seq = "".join(s.strip() for s in next(faiter))
        yield header_str, seq


@retry(exceptions=(HTTPError,), tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def get_refget_metadata(md5_digest):
response = requests.get(f'{REFGET_SERVER}/sequence/{md5_digest}/metadata')
if 500 <= response.status_code < 600:
raise HTTPError(f"{response.status_code} Server Error: {response.reason} for url: {response.url}", response=response)
if 200 <= response.status_code < 300:
return response.json()
return None


def assess_fasta(input_fasta):
results = {'sequences': []}
all_insdc = True
for header, sequence in fasta_iter(input_fasta):
name = header.split()[0]
md5_digest = refget_md5_digest(sequence)
sequence_metadata = get_refget_metadata(md5_digest)
results['sequences'].append({'sequence_name': name, 'sequence_md5': md5_digest, 'insdc': bool(sequence_metadata)})
all_insdc = all_insdc and bool(sequence_metadata)
    results['all_insdc'] = all_insdc
return results


def main():
arg_parser = argparse.ArgumentParser(
        description='Calculate the Refget MD5 digest of each sequence and compare it against the INSDC Refget server.')
arg_parser.add_argument('--input_fasta', required=True, dest='input_fasta',
help='Fasta file that contains the sequence to be checked')
arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml',
help='Path to the location of the results')
args = arg_parser.parse_args()
logging_config.add_stdout_handler()
results = assess_fasta(args.input_fasta)
write_result_yaml(args.output_yaml, results)


if __name__ == "__main__":
main()
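For reference, assess_fasta returns a plain dictionary that write_result_yaml dumps verbatim; a sketch of its shape, with made-up sequence names and MD5 values (the Refget digest is taken on the upper-cased sequence with line breaks removed):

    {
        'sequences': [
            {'sequence_name': 'chr1', 'sequence_md5': '3050107579885e1608e6fe50fae3f8d0', 'insdc': True},
            {'sequence_name': 'scaffold_7', 'sequence_md5': 'd41d8cd98f00b204e9800998ecf8427e', 'insdc': False},
        ],
        'all_insdc': False,
    }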
19 changes: 6 additions & 13 deletions eva_sub_cli/docker_validator.py
@@ -27,30 +27,22 @@ class DockerValidator(Reporter):

def __init__(self, mapping_file, output_dir, metadata_json=None,
metadata_xlsx=None, container_name=None, docker_path='docker', submission_config=None):
# validator write to the validation output directory
# If the submission_config is not set it will also be written to the VALIDATION_OUTPUT_DIR
super().__init__(mapping_file, os.path.join(output_dir, VALIDATION_OUTPUT_DIR),
submission_config=submission_config)

self.docker_path = docker_path
self.mapping_file = mapping_file
self.metadata_json = metadata_json
self.metadata_xlsx = metadata_xlsx
self.container_name = container_name
if self.container_name is None:
self.container_name = container_image.split('/')[1] + '.' + container_tag
self.spreadsheet2json_conf = os.path.join(ETC_DIR, "spreadsheet2json_conf.yaml")
# validator write to the validation output directory
# If the submission_config is not set it will also be written to the VALIDATION_OUTPUT_DIR
super().__init__(self._find_vcf_file(), os.path.join(output_dir, VALIDATION_OUTPUT_DIR),
submission_config=submission_config)

def _validate(self):
self.run_docker_validator()

def _find_vcf_file(self):
vcf_files = []
with open(self.mapping_file) as open_file:
reader = csv.DictReader(open_file, delimiter=',')
for row in reader:
vcf_files.append(row['vcf'])
return vcf_files

def get_docker_validation_cmd(self):
if self.metadata_xlsx and not self.metadata_json:
docker_cmd = (
@@ -97,6 +89,7 @@ def run_docker_validator(self):
self.copy_files_to_container()

docker_cmd = self.get_docker_validation_cmd()
print(docker_cmd)
# start validation
# FIXME: If nextflow fails in the docker exec still exit with error code 0
run_command_with_output("Run Validation using Nextflow", docker_cmd)
30 changes: 30 additions & 0 deletions eva_sub_cli/jinja_templates/fasta_check.html
@@ -0,0 +1,30 @@

{% macro fasta_check_report(validation_results) -%}
{% for fasta_file, results_for_fasta in validation_results.get('fasta_check', {}).items() %}
{% if results_for_fasta.get('all_insdc') %}
{% set icon = "&#10004;" %}
{% set row_class = "report-section pass" %}
{% set text = "sequences are all INSDC accessioned" %}
{% else %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section fail collapsible" %}
{% set text = "sequences are not INSDC accessioned" %}
{% endif %}
<div class='{{ row_class }}'>{{ icon }} {{ fasta_file }} {{ text }} </div>
{% if not results_for_fasta.get('all_insdc') %}
<div class="error-list">
<div class="error-description">First 10 sequences not in INSDC. <strong>Full report:</strong> {{ results_for_fasta.get('report_path', '') }}</div>
<table>
<tr>
<th>Sequence name</th><th>Refget md5</th><th>In INSDC</th>
</tr>
{% set sequence_info_list = results_for_fasta.get('sequences', [])|rejectattr("insdc")|list %}
{% for sequence_info in sequence_info_list[:10] %}
<tr>
<td><strong>{{sequence_info.get('sequence_name') }}</strong></td><td> {{ sequence_info.get('sequence_md5') }}</td><td> &#10060; </td>
</tr>
            {% endfor %}
        </table>
    </div>
    {% endif %}
{% endfor %}
{%- endmacro %}
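The macro expects validation_results['fasta_check'] to be a mapping keyed by FASTA file name, where each value carries at least all_insdc and sequences (report_path is optional and falls back to an empty string). A minimal sketch of that context, with hypothetical names:

    {'fasta_check': {
        'genome.fa': {
            'all_insdc': False,
            'sequences': [{'sequence_name': 'scaffold_7', 'sequence_md5': 'd41d8cd98f00b204e9800998ecf8427e', 'insdc': False}]
        }
    }}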
10 changes: 10 additions & 0 deletions eva_sub_cli/jinja_templates/html_report.html
@@ -1,7 +1,9 @@
<!DOCTYPE html>
{% from 'file_validation.html' import file_validation_report %}
{% from 'sample_name_check.html' import sample_name_check_report %}
{% from 'fasta_check.html' import fasta_check_report %}
{% from 'metadata_validation.html' import metadata_validation_report %}

<html lang="EN">
<head>
<meta charset="UTF-8">
Expand Down Expand Up @@ -74,6 +76,14 @@ <h2>Sample name concordance check</h2>
{{ sample_name_check_report(validation_results)}}
</section>

<section>
<h2>Reference Genome INSDC check</h2>
<div class="description">
        Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC.
</div>
{{ fasta_check_report(validation_results)}}
</section>

<script>
let collapsibles = document.querySelectorAll('.collapsible');
for (let collapsible of collapsibles) {
29 changes: 28 additions & 1 deletion eva_sub_cli/nextflow/validation.nf
@@ -25,11 +25,12 @@ params.executable = [
"vcf_validator": "vcf_validator",
"vcf_assembly_checker": "vcf_assembly_checker",
"samples_checker": "samples_checker.py",
"fasta_checker": "check_fasta_insdc.py",
"xlsx2json": "xlsx2json.py",
"biovalidator": "biovalidator"
]
// validation tasks
params.validation_tasks = [ "vcf_check", "assembly_check", "samples_check", "metadata_check"]
params.validation_tasks = [ "vcf_check", "assembly_check", "samples_check", "metadata_check", "genome_check"]
// container validation dir (prefix for vcf files)
params.container_validation_dir = "/opt/vcf_validation"
// help
@@ -79,6 +80,13 @@ workflow {
if ("samples_check" in params.validation_tasks) {
sample_name_concordance(metadata_json, vcf_files.collect())
}
if ("genome_check" in params.validation_tasks){
fasta_files = Channel.fromPath(params.vcf_files_mapping)
.splitCsv(header:true)
.map{row -> file(params.container_validation_dir+row.fasta)}
.unique()
genome_checker(fasta_files)
}
}

/*
@@ -187,3 +195,22 @@ process sample_name_concordance {
$params.executable.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml
"""
}



process genome_checker {
publishDir "$params.output_dir",
overwrite: true,
mode: "copy"

input:
path(fasta_file)

output:
path "${fasta_file}_check.yml", emit: fasta_checker

script:
"""
$params.executable.fasta_checker --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml
"""
}
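The fasta channel above is built from params.vcf_files_mapping, a headered CSV that must provide at least vcf and fasta columns (the reporter reads the same two), with params.container_validation_dir prepended to each fasta path inside the container. A hypothetical mapping showing why .unique() matters — two VCFs called against the same assembly produce a single genome_checker task:

    vcf,fasta
    /data/submission/cohort1.vcf.gz,/data/submission/GCA_000001.fa
    /data/submission/cohort2.vcf.gz,/data/submission/GCA_000001.fa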
25 changes: 24 additions & 1 deletion eva_sub_cli/reporter.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
import csv
import datetime
import glob
import os
@@ -27,9 +28,12 @@ def resolve_single_file_path(file_path):

class Reporter:

def __init__(self, vcf_files, output_dir, submission_config: WritableConfig = None):
def __init__(self, mapping_file, output_dir, submission_config: WritableConfig = None):
self.output_dir = output_dir
self.mapping_file = mapping_file
vcf_files, fasta_files = self._find_vcf_and_fasta_files()
self.vcf_files = vcf_files
self.fasta_files = fasta_files
self.results = {}
self.project_title = None # TODO fill this from metadata?
self.validation_date = datetime.datetime.now()
@@ -46,6 +50,16 @@ def __exit__(self, exc_type, exc_val, exc_tb):
self.sub_config.backup()
self.sub_config.write()

def _find_vcf_and_fasta_files(self):
vcf_files = []
fasta_files = []
with open(self.mapping_file) as open_file:
reader = csv.DictReader(open_file, delimiter=',')
for row in reader:
vcf_files.append(row['vcf'])
fasta_files.append(row['fasta'])
return vcf_files, fasta_files

def validate(self):
self._validate()
self._collect_validation_workflow_results()
@@ -151,6 +165,7 @@ def _collect_validation_workflow_results(self, ):
self._collect_vcf_check_results()
self._collect_assembly_check_results()
self._load_sample_check_results()
self._load_fasta_check_results()
self._parse_biovalidator_validation_results()
self._convert_biovalidator_validation_to_spreadsheet()
self._write_spreadsheet_validation_results()
@@ -247,6 +262,14 @@ def _collect_assembly_check_results(self):
def _sample_check_yaml(self):
return resolve_single_file_path(os.path.join(self.output_dir, 'sample_checker.yml'))

    def _load_fasta_check_results(self):
        # initialise once, outside the loop, so results for earlier fasta files are not overwritten
        self.results['fasta_check'] = {}
        for fasta_file in self.fasta_files:
            fasta_file_name = os.path.basename(fasta_file)
            fasta_check = resolve_single_file_path(os.path.join(self.output_dir, f'{fasta_file_name}_check.yml'))
            with open(fasta_check) as open_yaml:
                self.results['fasta_check'][fasta_file_name] = yaml.safe_load(open_yaml)

def _load_sample_check_results(self):
with open(self._sample_check_yaml) as open_yaml:
self.results['sample_check'] = yaml.safe_load(open_yaml)
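Worth noting the naming contract with the Nextflow side: the genome_checker process publishes ${fasta_file}_check.yml into the output directory, and _load_fasta_check_results rebuilds the same name from the basename of each mapping entry before resolving it. A small sketch with a hypothetical mapping value:

    fasta_path = '/data/submission/GCA_000001.fa'            # hypothetical entry from the mapping CSV
    expected = os.path.basename(fasta_path) + '_check.yml'   # -> 'GCA_000001.fa_check.yml'
    # resolve_single_file_path() then locates this file under self.output_dir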