Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA-3435 - Retrieve INSDC accession based on FASTA file and compare against metadata #29

Merged
merged 6 commits into from
Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 89 additions & 4 deletions bin/check_fasta_insdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import argparse
import gzip
import hashlib
import json

from itertools import groupby

Expand All @@ -13,7 +14,11 @@
from requests import HTTPError
from retry import retry

from eva_sub_cli.metadata_utils import get_files_per_analysis, get_analysis_for_vcf_file, \
get_reference_assembly_for_analysis

REFGET_SERVER = 'https://www.ebi.ac.uk/ena/cram'
CONTIG_ALIAS_SERVER = 'https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum'

logger = logging_config.get_logger(__name__)

Expand Down Expand Up @@ -64,29 +69,109 @@ def get_refget_metadata(md5_digest):
return None


def assess_fasta(input_fasta):
@retry(exceptions=(HTTPError,), tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def _get_containing_assemblies_paged(url):
response = requests.get(url)
if 500 <= response.status_code < 600:
raise HTTPError(f"{response.status_code} Server Error: {response.reason} for url: {response.url}",
response=response)
if 200 <= response.status_code < 300:
results = set()
response_data = response.json()
if '_embedded' in response_data and 'chromosomeEntities' in response_data['_embedded']:
for contigEntity in response_data['_embedded']['chromosomeEntities']:
results.add(contigEntity['assembly']['insdcAccession'])
if '_links' in response_data and 'next' in response_data['_links']:
# Add results from next page if needed
results |= _get_containing_assemblies_paged(response_data['_links']['next'])
return results
return set()


def get_containing_assemblies(md5_digest):
# Wrapper method to handle pagination
url = f'{CONTIG_ALIAS_SERVER}/{md5_digest}'
return _get_containing_assemblies_paged(url)


def assess_fasta(input_fasta, analyses, metadata_insdc):
"""
Check whether all sequences in fasta file are INSDC, and if so whether the INSDC accession provided in the metadata
is compatible.
:param input_fasta: path to fasta file
:param analyses: aliases of all analyses associated with this fasta (used only for reporting)
:param metadata_insdc: INSDC accession from metadata (if None will only do the first check)
:returns: dict of results
"""
results = {'sequences': []}
all_insdc = True
possible_assemblies = set()
for header, sequence in fasta_iter(input_fasta):
name = header.split()[0]
md5_digest = refget_md5_digest(sequence)
sequence_metadata = get_refget_metadata(md5_digest)
results['sequences'].append({'sequence_name': name, 'sequence_md5': md5_digest, 'insdc': bool(sequence_metadata)})
all_insdc = all_insdc and bool(sequence_metadata)
is_insdc = bool(sequence_metadata)
if is_insdc:
containing_assemblies = get_containing_assemblies(md5_digest)
if len(containing_assemblies) == 0:
logger.warning(f'Sequence with this MD5 is INSDC but not found in contig alias: {md5_digest}')
continue
if len(possible_assemblies) == 0:
possible_assemblies = containing_assemblies
else:
possible_assemblies &= containing_assemblies
results['sequences'].append({'sequence_name': name, 'sequence_md5': md5_digest, 'insdc': is_insdc})
all_insdc = all_insdc and is_insdc
results['all_insdc'] = all_insdc

# Only report on metadata concordance if all of the following hold:
# 1) All sequences in FASTA file are INSDC
# 2) At least one compatible assembly accession was found in contig alias
# 3) Found a single assembly accession in the metadata to compare against this FASTA file
# If (3) is missing but (1) and (2) hold, we will still report possible INSDC assemblies.
if all_insdc and possible_assemblies:
results['possible_assemblies'] = possible_assemblies
if all_insdc and possible_assemblies and metadata_insdc:
results['metadata_assembly_compatible'] = (metadata_insdc in possible_assemblies)
results['associated_analyses'] = analyses
results['assembly_in_metadata'] = metadata_insdc
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could debate this but in my view we would want check_fasta_insdc to report as much as possible and filter what needs to be reported in the final report.

  • I would report all possible assemblies if any was found.
  • metadata_assembly_compatible per analysis if bool(metadata_insdc) and bool(possible_assemblies)
    I think I'm mostly worried about the case where the assembly fasta contains one non-insdc sequence and all the check fails.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that makes sense, right now it's kind of assuming that the is_insdc check and contig alias checks are always 100% accurate and consistent with each other, which is unlikely... I'll update this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new implementation would allow a report like this:
image
For a case where e.g. a sequence is not supported by the refget endpoint, but according to contig alias it's consistent with the metadata. It will also show two errors if not everything is INSDC and the metadata is not consistent.

The only case where the second check doesn't show up at all in the final report is if we don't find any possible assemblies - I think I was thinking that we're expecting contig alias might be missing things and so we wouldn't want to output an error, but perhaps we should show it as an error anyway so both checks always show up? What do you think?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that their will be a number of cases where the contig alias will not have the assembly. We should not show the error in that case.
We probably will come back to this decision if we fell like the contig alias has enough coverage of assemblies we're using. But for the moment this is sensible.

return results


def get_analyses_and_reference_genome_from_metadata(vcf_files, json_file):
with open(json_file) as open_json:
metadata = json.load(open_json)
files_per_analysis = get_files_per_analysis(metadata)
# Get all analyses associated with all vcf files
all_analyses = set()
for vcf_file in vcf_files:
analysis_aliases = get_analysis_for_vcf_file(vcf_file, files_per_analysis)
if len(analysis_aliases) != 1:
logger.error(f'Could not determine analysis associated with VCF file: {vcf_file}')
else:
all_analyses.add(analysis_aliases[0])
# Get (single) assembly associated with all analyses
assemblies = [get_reference_assembly_for_analysis(metadata, analysis) for analysis in all_analyses]
if len(assemblies) != 1:
logger.error(f'Could not determine assembly accession to check against fasta file, out of: {assemblies}')
return all_analyses, None
apriltuesday marked this conversation as resolved.
Show resolved Hide resolved
return all_analyses, assemblies[0]


def main():
arg_parser = argparse.ArgumentParser(
description="Calculate each sequence's Refget MD5 digest and compare these against INSDC Refget server.")
arg_parser.add_argument('--input_fasta', required=True, dest='input_fasta',
help='Fasta file that contains the sequence to be checked')
arg_parser.add_argument('--vcf_files', dest='vcf_files', nargs='+',
help='VCF files, used to connect fasta file to assembly accession in metadata via analysis')
arg_parser.add_argument('--metadata_json', required=True, dest='metadata_json', help='EVA metadata json file')
arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml',
help='Path to the location of the results')
args = arg_parser.parse_args()
logging_config.add_stdout_handler()
results = assess_fasta(args.input_fasta)
analyses, metadata_insdc = get_analyses_and_reference_genome_from_metadata(args.vcf_files, args.metadata_json)
results = assess_fasta(args.input_fasta, analyses, metadata_insdc)
write_result_yaml(args.output_yaml, results)


Expand Down
21 changes: 4 additions & 17 deletions bin/samples_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
import json
import os

from collections import defaultdict
from ebi_eva_common_pyutils.logger import logging_config

import yaml

from eva_sub_cli.metadata_utils import get_samples_per_analysis, get_files_per_analysis, get_analysis_for_vcf_file

logger = logging_config.get_logger(__name__)

Expand Down Expand Up @@ -88,17 +88,7 @@ def compare_all_analysis(samples_per_analysis, files_per_analysis):
def read_metadata_json(json_file):
with open(json_file) as open_json:
metadata = json.load(open_json)
samples_per_analysis = defaultdict(list)
files_per_analysis = defaultdict(list)
for sample_info in metadata['sample']:
samples_per_analysis[sample_info.get('analysisAlias')].append(sample_info.get('sampleInVCF'))
for file_info in metadata['files']:
if file_info.get('fileType') == 'vcf':
files_per_analysis[file_info.get('analysisAlias')].append(file_info.get('fileName'))
return (
{analysis_alias: set(samples) for analysis_alias, samples in samples_per_analysis.items()},
{filepath: set(samples) for filepath, samples in files_per_analysis.items()}
)
return get_samples_per_analysis(metadata), get_files_per_analysis(metadata)


def associate_vcf_path_with_analysis(vcf_files, files_per_analysis):
Expand All @@ -113,10 +103,7 @@ def associate_vcf_path_with_analysis(vcf_files, files_per_analysis):
for analysis in files_per_analysis:
result_files_per_analysis[analysis] = []
for vcf_file in vcf_files:
if not os.path.exists(vcf_file):
raise FileNotFoundError(f'{vcf_file} cannot be resolved')
analysis_aliases = [analysis_alias for analysis_alias in files_per_analysis
if os.path.basename(vcf_file) in files_per_analysis[analysis_alias]]
analysis_aliases = get_analysis_for_vcf_file(vcf_file, files_per_analysis)
if len(analysis_aliases) == 1:
result_files_per_analysis[analysis_aliases[0]].append(vcf_file)
elif len(analysis_aliases) == 0:
Expand All @@ -140,7 +127,7 @@ def write_result_yaml(output_yaml, overall_differences, results_per_analysis_ali

def check_sample_name_concordance(metadata_json, vcf_files, output_yaml):
"""
Take the metadata following EVA standard and formatted in JSON then compare the sample names in it to the ones
Take the metadata following EVA standard and formatted in JSON then compare the sample names in it to the ones
found in the VCF files
"""
samples_per_analysis, files_per_analysis = read_metadata_json(metadata_json)
Expand Down
81 changes: 62 additions & 19 deletions eva_sub_cli/jinja_templates/fasta_check.html
Original file line number Diff line number Diff line change
@@ -1,30 +1,73 @@

{% macro fasta_check_report(validation_results) -%}
{% for fasta_file, results_for_fasta in validation_results.get('fasta_check', {}).items() %}
{% if results_for_fasta.get('all_insdc') %}
{% set icon = "&#10004;" %}
{% set row_class = "report-section pass" %}
{% set text = "all sequences are INSDC accessioned" %}
{% macro fasta_check_report(validation_results, file_name) -%}
{% set results_for_fasta = validation_results['fasta_check'][file_name] %}

<!-- All INSDC check results -->
{% if results_for_fasta.get('all_insdc') %}
{% set icon = "&#10004;" %}
{% set row_class = "report-section pass" %}
{% set text = "All sequences are INSDC accessioned" %}
{% else %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section fail collapsible" %}
{% set text = "Some sequences are not INSDC accessioned" %}
{% endif %}
<div class='{{ row_class }}'>{{ icon }} {{ text }} </div>
{% if not results_for_fasta.get('all_insdc') %}
<div class="error-list">
<div class="error-description">First 10 sequences not in INSDC. <strong>Full report:</strong> {{ results_for_fasta.get('report_path', '') }}</div>
<table>
<tr>
<th>Sequence name</th><th>Refget md5</th>
</tr>
{% set sequence_info_list = results_for_fasta.get('sequences', [])|rejectattr("insdc")|list %}
{% for sequence_info in sequence_info_list[:10] %}
<tr>
<td><strong>{{sequence_info.get('sequence_name') }}</strong></td><td> {{ sequence_info.get('sequence_md5') }}</td>
</tr>
{% endfor %}
</table>
</div>
{% endif %}

<!-- INSDC concordance check results (optional) -->
{% if 'possible_assemblies' in results_for_fasta %}
{% if 'metadata_assembly_compatible' in results_for_fasta %}
<!-- found assembly in metadata, so definitely know the analysis -->
{% set analysis_text = results_for_fasta.get('associated_analyses')|join(", ") %}
{% if results_for_fasta.get('metadata_assembly_compatible') %}
{% set icon = "&#10004;" %}
{% set row_class = "report-section pass" %}
{% set text = analysis_text + ": Assembly accession in metadata is compatible" %}
{% else %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section fail collapsible" %}
{% set text = analysis_text + ": Assembly accession in metadata is not compatible" %}
{% endif %}
{% else %}
<!-- has possible assemblies but no metadata assembly -->
{% set icon = "&#10060;" %}
{% set row_class = "report-section fail collapsible" %}
{% set text = "some sequences are not INSDC accessioned" %}
{% set text = "No assembly accession found in metadata" %}
{% endif %}
<div class='{{ row_class }}'>{{ icon }} {{ fasta_file }}: {{ text }} </div>
{% if not results_for_fasta.get('all_insdc') %}
<div class='{{ row_class }}'>{{ icon }} {{ text }} </div>
{% if 'metadata_assembly_compatible' not in results_for_fasta or not results_for_fasta['metadata_assembly_compatible'] %}
<div class="error-list">
<div class="error-description">First 10 sequences not in INSDC. <strong>Full report:</strong> {{ results_for_fasta.get('report_path', '') }}</div>
<div class="error-description"><strong>Full report:</strong> {{ results_for_fasta.get('report_path', '') }}</div>
<table>
<tr>
<th>Sequence name</th><th>Refget md5</th>
<th>Category</th><th>Accessions</th>
</tr>
{% set sequence_info_list = results_for_fasta.get('sequences', [])|rejectattr("insdc")|list %}
{% for sequence_info in sequence_info_list[:10] %}
<tr>
<td><strong>{{sequence_info.get('sequence_name') }}</strong></td><td> {{ sequence_info.get('sequence_md5') }}</td>
</tr>
{% endfor %}

<tr>
<td><strong>Assembly accession found in metadata</strong></td>
<td>{{ results_for_fasta.get('assembly_in_metadata', 'Not found') }}</td>
</tr>
<tr>
<td><strong>Assembly accession(s) compatible with FASTA</strong></td>
<td>{{ results_for_fasta.get('possible_assemblies')|join(", ") }}</td>
</tr>
</table>
</div>
{% endif %}
{% endfor %}
{% endif %}
{%- endmacro %}
12 changes: 8 additions & 4 deletions eva_sub_cli/jinja_templates/html_report.html
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ <h2>VCF validation results</h2>
Checks whether each file is compliant with the <a href="http://samtools.github.io/hts-specs/VCFv4.3.pdf">VCF specification</a>.
Also checks whether the variants' reference alleles match against the reference assembly.
</div>
{% for file_name in file_names %}
{% for file_name in vcf_files %}
<h3>{{ file_name }}</h3>
{{ file_validation_report(validation_results, file_name) }}
{% endfor %}
Expand All @@ -77,11 +77,15 @@ <h2>Sample name concordance check</h2>
</section>

<section>
<h2>Reference Genome INSDC check</h2>
<h2>Reference genome INSDC check</h2>
<div class="description">
Checks that the reference sequences in the fasta file used to call the variants are accessioned in INSDC.
Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC.
Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.
</div>
{{ fasta_check_report(validation_results)}}
{% for file_name in fasta_files %}
<h3>{{ file_name }}</h3>
{{ fasta_check_report(validation_results, file_name) }}
{% endfor %}
</section>

<script>
Expand Down
46 changes: 46 additions & 0 deletions eva_sub_cli/metadata_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os
from collections import defaultdict

from ebi_eva_common_pyutils.logger import logging_config

logger = logging_config.get_logger(__name__)


def get_samples_per_analysis(metadata):
"""Returns mapping of analysis alias to sample names, based on metadata."""
samples_per_analysis = defaultdict(list)
for sample_info in metadata.get('sample', []):
samples_per_analysis[sample_info.get('analysisAlias')].append(sample_info.get('sampleInVCF'))
return {
analysis_alias: set(samples)
for analysis_alias, samples in samples_per_analysis.items()
}


def get_files_per_analysis(metadata):
"""Returns mapping of analysis alias to filenames, based on metadata."""
files_per_analysis = defaultdict(list)
for file_info in metadata.get('files', []):
if file_info.get('fileType') == 'vcf':
files_per_analysis[file_info.get('analysisAlias')].append(file_info.get('fileName'))
return {
analysis_alias: set(filepaths)
for analysis_alias, filepaths in files_per_analysis.items()
}


def get_reference_assembly_for_analysis(metadata, analysis_alias):
"""Returns the reference assembly for this analysis (does not validate format)."""
for analysis in metadata.get('analysis', []):
if analysis.get('analysisAlias') == analysis_alias:
return analysis.get('referenceGenome')
return None


def get_analysis_for_vcf_file(vcf_file, files_per_analysis):
"""Returns list of analysis aliases associated with the vcf file path."""
if not os.path.exists(vcf_file):
raise FileNotFoundError(f'{vcf_file} cannot be resolved')
analysis_aliases = [analysis_alias for analysis_alias in files_per_analysis
if os.path.basename(vcf_file) in files_per_analysis[analysis_alias]]
return analysis_aliases
Loading
Loading