Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA-3435 - Retrieve INSDC accession based on FASTA file and compare against metadata #29

Merged
merged 6 commits into from
Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 97 additions & 4 deletions bin/check_fasta_insdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import argparse
import gzip
import hashlib
import json

from itertools import groupby

Expand All @@ -13,7 +14,11 @@
from requests import HTTPError
from retry import retry

from eva_sub_cli.metadata_utils import get_files_per_analysis, get_analysis_for_vcf_file, \
get_reference_assembly_for_analysis

REFGET_SERVER = 'https://www.ebi.ac.uk/ena/cram'
CONTIG_ALIAS_SERVER = 'https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum'

logger = logging_config.get_logger(__name__)

Expand Down Expand Up @@ -64,29 +69,117 @@ def get_refget_metadata(md5_digest):
return None


def assess_fasta(input_fasta):
@retry(exceptions=(HTTPError,), tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def _get_containing_assemblies_paged(url):
response = requests.get(url)
if 500 <= response.status_code < 600:
raise HTTPError(f"{response.status_code} Server Error: {response.reason} for url: {response.url}",
response=response)
if 200 <= response.status_code < 300:
results = set()
response_data = response.json()
if '_embedded' in response_data and 'chromosomeEntities' in response_data['_embedded']:
for contigEntity in response_data['_embedded']['chromosomeEntities']:
results.add(contigEntity['assembly']['insdcAccession'])
if '_links' in response_data and 'next' in response_data['_links']:
# Add results from next page if needed
results |= _get_containing_assemblies_paged(response_data['_links']['next'])
return results
return set()


def get_containing_assemblies(md5_digest):
# Wrapper method to handle pagination
url = f'{CONTIG_ALIAS_SERVER}/{md5_digest}'
return _get_containing_assemblies_paged(url)


def assess_fasta(input_fasta, analyses, assembly_in_metadata):
"""
Check whether all sequences in fasta file are INSDC, and if so whether the INSDC accession provided in the metadata
is compatible.
:param input_fasta: path to fasta file
:param analyses: aliases of all analyses associated with this fasta (used only for reporting)
:param assembly_in_metadata: INSDC accession from metadata (if None will only do the first check but will report
compatible assemblies)
:returns: dict of results
"""
results = {'sequences': []}
all_insdc = True
possible_assemblies = set()
for header, sequence in fasta_iter(input_fasta):
# Check sequence is INSDC
name = header.split()[0]
md5_digest = refget_md5_digest(sequence)
sequence_metadata = get_refget_metadata(md5_digest)
results['sequences'].append({'sequence_name': name, 'sequence_md5': md5_digest, 'insdc': bool(sequence_metadata)})
all_insdc = all_insdc and bool(sequence_metadata)
is_insdc = bool(sequence_metadata)
results['sequences'].append({'sequence_name': name, 'sequence_md5': md5_digest, 'insdc': is_insdc})
all_insdc = all_insdc and is_insdc
# Get possible assemblies for sequence
containing_assemblies = get_containing_assemblies(md5_digest)
if len(containing_assemblies) == 0:
if is_insdc:
logger.warning(f'Sequence with this MD5 is INSDC but not found in contig alias: {md5_digest}')
elif len(possible_assemblies) == 0:
possible_assemblies = containing_assemblies
else:
possible_assemblies &= containing_assemblies

# Always report whether everything is INSDC
results['all_insdc'] = all_insdc
# Only report compatible assembly accessions if any were found in contig alias
if possible_assemblies:
results['possible_assemblies'] = possible_assemblies
# Only report on metadata concordance if we found compatible assemblies and found a single assembly accession in
# the metadata to compare against this FASTA file
if possible_assemblies and assembly_in_metadata:
results['metadata_assembly_compatible'] = (assembly_in_metadata in possible_assemblies)
results['associated_analyses'] = analyses
results['assembly_in_metadata'] = assembly_in_metadata
return results


def get_analyses_and_reference_genome_from_metadata(vcf_files_for_fasta, json_file):
"""
Get analysis aliases and associated reference genome from the metadata.
:param vcf_files_for_fasta: list of VCF file paths, assumed to be associated with a single FASTA file and thus a
single assembly accession
:param json_file: JSON file of the metadata
"""
with open(json_file) as open_json:
metadata = json.load(open_json)
files_per_analysis = get_files_per_analysis(metadata)
# Get all analyses associated with all vcf files that are linked with a single fasta file
all_analyses = set()
for vcf_file in vcf_files_for_fasta:
analysis_aliases = get_analysis_for_vcf_file(vcf_file, files_per_analysis)
if len(analysis_aliases) != 1:
logger.error(f'Could not determine analysis associated with VCF file: {vcf_file}')
else:
all_analyses.add(analysis_aliases[0])
# Get (single) assembly associated with these analyses
assemblies = [get_reference_assembly_for_analysis(metadata, analysis) for analysis in all_analyses]
if len(assemblies) != 1:
logger.error(f'Could not determine assembly accession to check against fasta file, out of: {assemblies}')
return all_analyses, None
apriltuesday marked this conversation as resolved.
Show resolved Hide resolved
return all_analyses, assemblies[0]


def main():
arg_parser = argparse.ArgumentParser(
description="Calculate each sequence's Refget MD5 digest and compare these against INSDC Refget server.")
arg_parser.add_argument('--input_fasta', required=True, dest='input_fasta',
help='Fasta file that contains the sequence to be checked')
arg_parser.add_argument('--vcf_files', dest='vcf_files', nargs='+',
help='VCF files associated with this fasta file (used to connect fasta file to '
'assembly accession in metadata via analysis)')
arg_parser.add_argument('--metadata_json', required=True, dest='metadata_json', help='EVA metadata json file')
arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml',
help='Path to the location of the results')
args = arg_parser.parse_args()
logging_config.add_stdout_handler()
results = assess_fasta(args.input_fasta)
analyses, metadata_insdc = get_analyses_and_reference_genome_from_metadata(args.vcf_files, args.metadata_json)
results = assess_fasta(args.input_fasta, analyses, metadata_insdc)
write_result_yaml(args.output_yaml, results)


Expand Down
21 changes: 4 additions & 17 deletions bin/samples_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
import json
import os

from collections import defaultdict
from ebi_eva_common_pyutils.logger import logging_config

import yaml

from eva_sub_cli.metadata_utils import get_samples_per_analysis, get_files_per_analysis, get_analysis_for_vcf_file

logger = logging_config.get_logger(__name__)

Expand Down Expand Up @@ -88,17 +88,7 @@ def compare_all_analysis(samples_per_analysis, files_per_analysis):
def read_metadata_json(json_file):
with open(json_file) as open_json:
metadata = json.load(open_json)
samples_per_analysis = defaultdict(list)
files_per_analysis = defaultdict(list)
for sample_info in metadata['sample']:
samples_per_analysis[sample_info.get('analysisAlias')].append(sample_info.get('sampleInVCF'))
for file_info in metadata['files']:
if file_info.get('fileType') == 'vcf':
files_per_analysis[file_info.get('analysisAlias')].append(file_info.get('fileName'))
return (
{analysis_alias: set(samples) for analysis_alias, samples in samples_per_analysis.items()},
{filepath: set(samples) for filepath, samples in files_per_analysis.items()}
)
return get_samples_per_analysis(metadata), get_files_per_analysis(metadata)


def associate_vcf_path_with_analysis(vcf_files, files_per_analysis):
Expand All @@ -113,10 +103,7 @@ def associate_vcf_path_with_analysis(vcf_files, files_per_analysis):
for analysis in files_per_analysis:
result_files_per_analysis[analysis] = []
for vcf_file in vcf_files:
if not os.path.exists(vcf_file):
raise FileNotFoundError(f'{vcf_file} cannot be resolved')
analysis_aliases = [analysis_alias for analysis_alias in files_per_analysis
if os.path.basename(vcf_file) in files_per_analysis[analysis_alias]]
analysis_aliases = get_analysis_for_vcf_file(vcf_file, files_per_analysis)
if len(analysis_aliases) == 1:
result_files_per_analysis[analysis_aliases[0]].append(vcf_file)
elif len(analysis_aliases) == 0:
Expand All @@ -140,7 +127,7 @@ def write_result_yaml(output_yaml, overall_differences, results_per_analysis_ali

def check_sample_name_concordance(metadata_json, vcf_files, output_yaml):
"""
Take the metadata following EVA standard and formatted in JSON then compare the sample names in it to the ones
Take the metadata following EVA standard and formatted in JSON then compare the sample names in it to the ones
found in the VCF files
"""
samples_per_analysis, files_per_analysis = read_metadata_json(metadata_json)
Expand Down
81 changes: 62 additions & 19 deletions eva_sub_cli/jinja_templates/fasta_check.html
Original file line number Diff line number Diff line change
@@ -1,30 +1,73 @@

{% macro fasta_check_report(validation_results) -%}
{% for fasta_file, results_for_fasta in validation_results.get('fasta_check', {}).items() %}
{% if results_for_fasta.get('all_insdc') %}
{% set icon = "&#10004;" %}
{% set row_class = "report-section pass" %}
{% set text = "all sequences are INSDC accessioned" %}
{% macro fasta_check_report(validation_results, file_name) -%}
{% set results_for_fasta = validation_results['fasta_check'][file_name] %}

<!-- All INSDC check results -->
{% if results_for_fasta.get('all_insdc') %}
{% set icon = "&#10004;" %}
{% set row_class = "report-section pass" %}
{% set text = "All sequences are INSDC accessioned" %}
{% else %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section fail collapsible" %}
{% set text = "Some sequences are not INSDC accessioned" %}
{% endif %}
<div class='{{ row_class }}'>{{ icon }} {{ text }} </div>
{% if not results_for_fasta.get('all_insdc') %}
<div class="error-list">
<div class="error-description">First 10 sequences not in INSDC. <strong>Full report:</strong> {{ results_for_fasta.get('report_path', '') }}</div>
<table>
<tr>
<th>Sequence name</th><th>Refget md5</th>
</tr>
{% set sequence_info_list = results_for_fasta.get('sequences', [])|rejectattr("insdc")|list %}
{% for sequence_info in sequence_info_list[:10] %}
<tr>
<td><strong>{{sequence_info.get('sequence_name') }}</strong></td><td> {{ sequence_info.get('sequence_md5') }}</td>
</tr>
{% endfor %}
</table>
</div>
{% endif %}

<!-- INSDC concordance check results (optional) -->
{% if 'possible_assemblies' in results_for_fasta %}
{% if 'metadata_assembly_compatible' in results_for_fasta %}
<!-- found assembly in metadata, so definitely know the analysis -->
{% set analysis_text = results_for_fasta.get('associated_analyses')|join(", ") %}
{% if results_for_fasta.get('metadata_assembly_compatible') %}
{% set icon = "&#10004;" %}
{% set row_class = "report-section pass" %}
{% set text = analysis_text + ": Assembly accession in metadata is compatible" %}
{% else %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section fail collapsible" %}
{% set text = analysis_text + ": Assembly accession in metadata is not compatible" %}
{% endif %}
{% else %}
<!-- has possible assemblies but no metadata assembly -->
{% set icon = "&#10060;" %}
{% set row_class = "report-section fail collapsible" %}
{% set text = "some sequences are not INSDC accessioned" %}
{% set text = "No assembly accession found in metadata" %}
{% endif %}
<div class='{{ row_class }}'>{{ icon }} {{ fasta_file }}: {{ text }} </div>
{% if not results_for_fasta.get('all_insdc') %}
<div class='{{ row_class }}'>{{ icon }} {{ text }} </div>
{% if 'metadata_assembly_compatible' not in results_for_fasta or not results_for_fasta['metadata_assembly_compatible'] %}
<div class="error-list">
<div class="error-description">First 10 sequences not in INSDC. <strong>Full report:</strong> {{ results_for_fasta.get('report_path', '') }}</div>
<div class="error-description"><strong>Full report:</strong> {{ results_for_fasta.get('report_path', '') }}</div>
<table>
<tr>
<th>Sequence name</th><th>Refget md5</th>
<th>Category</th><th>Accessions</th>
</tr>
{% set sequence_info_list = results_for_fasta.get('sequences', [])|rejectattr("insdc")|list %}
{% for sequence_info in sequence_info_list[:10] %}
<tr>
<td><strong>{{sequence_info.get('sequence_name') }}</strong></td><td> {{ sequence_info.get('sequence_md5') }}</td>
</tr>
{% endfor %}

<tr>
<td><strong>Assembly accession found in metadata</strong></td>
<td>{{ results_for_fasta.get('assembly_in_metadata', 'Not found') }}</td>
</tr>
<tr>
<td><strong>Assembly accession(s) compatible with FASTA</strong></td>
<td>{{ results_for_fasta.get('possible_assemblies')|join(", ") }}</td>
</tr>
</table>
</div>
{% endif %}
{% endfor %}
{% endif %}
{%- endmacro %}
12 changes: 8 additions & 4 deletions eva_sub_cli/jinja_templates/html_report.html
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ <h2>VCF validation results</h2>
Checks whether each file is compliant with the <a href="http://samtools.github.io/hts-specs/VCFv4.3.pdf">VCF specification</a>.
Also checks whether the variants' reference alleles match against the reference assembly.
</div>
{% for file_name in file_names %}
{% for file_name in vcf_files %}
<h3>{{ file_name }}</h3>
{{ file_validation_report(validation_results, file_name) }}
{% endfor %}
Expand All @@ -77,11 +77,15 @@ <h2>Sample name concordance check</h2>
</section>

<section>
<h2>Reference Genome INSDC check</h2>
<h2>Reference genome INSDC check</h2>
<div class="description">
Checks that the reference sequences in the fasta file used to call the variants are accessioned in INSDC.
Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC.
Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.
</div>
{{ fasta_check_report(validation_results)}}
{% for file_name in fasta_files %}
<h3>{{ file_name }}</h3>
{{ fasta_check_report(validation_results, file_name) }}
{% endfor %}
</section>

<script>
Expand Down
46 changes: 46 additions & 0 deletions eva_sub_cli/metadata_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os
from collections import defaultdict

from ebi_eva_common_pyutils.logger import logging_config

logger = logging_config.get_logger(__name__)


def get_samples_per_analysis(metadata):
"""Returns mapping of analysis alias to sample names, based on metadata."""
samples_per_analysis = defaultdict(list)
for sample_info in metadata.get('sample', []):
samples_per_analysis[sample_info.get('analysisAlias')].append(sample_info.get('sampleInVCF'))
return {
analysis_alias: set(samples)
for analysis_alias, samples in samples_per_analysis.items()
}


def get_files_per_analysis(metadata):
"""Returns mapping of analysis alias to filenames, based on metadata."""
files_per_analysis = defaultdict(list)
for file_info in metadata.get('files', []):
if file_info.get('fileType') == 'vcf':
files_per_analysis[file_info.get('analysisAlias')].append(file_info.get('fileName'))
return {
analysis_alias: set(filepaths)
for analysis_alias, filepaths in files_per_analysis.items()
}


def get_reference_assembly_for_analysis(metadata, analysis_alias):
"""Returns the reference assembly for this analysis (does not validate format)."""
for analysis in metadata.get('analysis', []):
if analysis.get('analysisAlias') == analysis_alias:
return analysis.get('referenceGenome')
return None


def get_analysis_for_vcf_file(vcf_file, files_per_analysis):
"""Returns list of analysis aliases associated with the vcf file path."""
if not os.path.exists(vcf_file):
raise FileNotFoundError(f'{vcf_file} cannot be resolved')
analysis_aliases = [analysis_alias for analysis_alias in files_per_analysis
if os.path.basename(vcf_file) in files_per_analysis[analysis_alias]]
return analysis_aliases
Loading
Loading