Merge pull request #4636 from broadinstitute/validation-record-clean-up
Pedigree info validation clean up
hanars authored Feb 11, 2025
2 parents 614b7fb + e242924 commit 345ac42
Showing 7 changed files with 189 additions and 170 deletions.
12 changes: 5 additions & 7 deletions seqr/utils/search/add_data_utils.py
@@ -12,7 +12,7 @@
from seqr.views.utils.airtable_utils import AirtableSession, ANVIL_REQUEST_TRACKING_TABLE
from seqr.views.utils.dataset_utils import match_and_update_search_samples, load_mapping_file
from seqr.views.utils.export_utils import write_multiple_files
from seqr.views.utils.pedigree_info_utils import get_no_affected_families
from seqr.views.utils.pedigree_info_utils import validate_affected_families
from settings import SEQR_SLACK_DATA_ALERTS_NOTIFICATION_CHANNEL, BASE_URL, ANVIL_UI_URL, \
SEQR_SLACK_ANVIL_DATA_LOADING_CHANNEL

@@ -157,12 +157,10 @@ def _upload_data_loading_files(projects: list[Project], user: User, file_path: s
data_by_project[row.pop('project')].append(row)
affected_by_family[row['Family_GUID']].append(row.pop('affected_status'))

no_affected_families =get_no_affected_families(affected_by_family)
if no_affected_families:
families = ', '.join(sorted(no_affected_families))
raise ErrorsWarningsException(errors=[
f'The following families have no affected individuals and can not be loaded to seqr: {families}',
])
errors = []
validate_affected_families(affected_by_family, errors)
if errors:
raise ErrorsWarningsException(errors=errors)

header = list(file_annotations.keys())
files = [(f'{project_guid}_pedigree', header, rows) for project_guid, rows in data_by_project.items()]
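The validate_affected_families helper that replaces get_no_affected_families is defined in seqr/views/utils/pedigree_info_utils.py, which is not part of this excerpt. Judging from the call site above and the error string asserted in the updated tests below, a plausible sketch of the helper (the names, the 'A' affected code, and the details are assumptions, not the actual implementation):

def validate_affected_families(affected_by_family, errors):
    # affected_by_family: maps a family ID to the affected statuses of its individuals.
    # errors: shared list that the caller raises as ErrorsWarningsException if non-empty.
    no_affected_families = sorted(
        family_id for family_id, statuses in affected_by_family.items()
        if not any(status == 'A' for status in statuses)  # 'A' = affected is an assumption
    )
    if no_affected_families:
        errors.append('The following families do not have any affected individuals: {}'.format(
            ', '.join(no_affected_families)))

Taking a shared errors list instead of returning the offending families lets each caller fold this check into its other validation errors and raise a single ErrorsWarningsException, which is the pattern both call sites in this commit follow.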
92 changes: 51 additions & 41 deletions seqr/views/apis/anvil_workspace_api.py
@@ -11,17 +11,16 @@
from django.shortcuts import redirect

from reference_data.models import GENOME_VERSION_LOOKUP
from seqr.models import Project, CAN_EDIT, Sample, Individual, IgvSample
from seqr.models import Project, CAN_EDIT, Sample, IgvSample
from seqr.views.react_app import render_app_html
from seqr.views.utils.airtable_utils import AirtableSession, ANVIL_REQUEST_TRACKING_TABLE
from seqr.utils.search.utils import get_search_samples
from seqr.views.utils.airflow_utils import trigger_airflow_data_loading
from seqr.views.utils.json_to_orm_utils import create_model_from_json
from seqr.views.utils.json_utils import create_json_response
from seqr.views.utils.file_utils import load_uploaded_file
from seqr.views.utils.terra_api_utils import add_service_account, has_service_account_access, TerraAPIException, \
TerraRefreshTokenFailedException
from seqr.views.utils.pedigree_info_utils import parse_basic_pedigree_table, JsonConstants
from seqr.views.utils.pedigree_info_utils import parse_basic_pedigree_table, validate_affected_families, JsonConstants
from seqr.views.utils.individual_utils import add_or_update_individuals_and_families
from seqr.utils.communication_utils import send_html_email
from seqr.utils.file_utils import list_files
@@ -184,7 +183,7 @@ def create_project_from_workspace(request, namespace, name):
error = 'Field(s) "{}" are required'.format(', '.join(missing_fields))
return create_json_response({'error': error}, status=400, reason=error)

pedigree_records = _parse_uploaded_pedigree(request_json)
pedigree_records = _parse_uploaded_pedigree(request_json)[0]

# Create a new Project in seqr
project_args = {
@@ -225,61 +224,72 @@ def add_workspace_data(request, project_guid):
error = 'Field(s) "{}" are required'.format(', '.join(missing_fields))
return create_json_response({'error': error}, status=400, reason=error)

pedigree_records = _parse_uploaded_pedigree(request_json, project=project)

previous_samples = get_search_samples([project]).filter(dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS)
sample = previous_samples.first()
if not sample:
return create_json_response({
'error': 'New data cannot be added to this project until the previously requested data is loaded',
}, status=400)
sample_type = sample.sample_type

families = {record[JsonConstants.FAMILY_ID_COLUMN] for record in pedigree_records}
previous_loaded_individuals = previous_samples.filter(
individual__family__family_id__in=families,
).values_list('individual_id', 'individual__individual_id', 'individual__family__family_id')
missing_samples_by_family = defaultdict(list)
for _, individual_id, family_id in previous_loaded_individuals:
if individual_id not in request_json['vcfSamples']:
missing_samples_by_family[family_id].append(individual_id)
if missing_samples_by_family:
missing_family_sample_messages = [
f'Family {family_id}: {", ".join(sorted(individual_ids))}'
for family_id, individual_ids in missing_samples_by_family.items()
]
return create_json_response({
'error': 'In order to load data for families with previously loaded data, new family samples must be joint called in a single VCF with all previously loaded samples.'
' The following samples were previously loaded in this project but are missing from the VCF:\n{}'.format(
'\n'.join(sorted(missing_family_sample_messages)))}, status=400)
pedigree_records, loaded_individual_ids, sample_type = _parse_uploaded_pedigree(request_json, project=project, search_dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS)

pedigree_json = _trigger_add_workspace_data(
project, pedigree_records, request.user, request_json['fullDataPath'], sample_type,
previous_loaded_ids=[i[0] for i in previous_loaded_individuals], get_pedigree_json=True)
previous_loaded_ids=loaded_individual_ids, get_pedigree_json=True)

return create_json_response(pedigree_json)


def _parse_uploaded_pedigree(request_json, project=None):
# Parse families/individuals in the uploaded pedigree file
def _parse_uploaded_pedigree(request_json, project=None, search_dataset_type=None):
loaded_sample_type = None
loaded_individual_ids = []
def validate_expected_samples(record_family_ids, affected_status_by_family, previous_loaded_individuals, sample_type):
errors, loaded_ids = _validate_expected_samples(
request_json['vcfSamples'], search_dataset_type,
record_family_ids, affected_status_by_family, previous_loaded_individuals, sample_type,
)
nonlocal loaded_individual_ids
loaded_individual_ids += loaded_ids
nonlocal loaded_sample_type
loaded_sample_type = sample_type
return errors

json_records = load_uploaded_file(request_json['uploadedFileId'])
pedigree_records, _ = parse_basic_pedigree_table(
pedigree_records = parse_basic_pedigree_table(
project, json_records, 'uploaded pedigree file', update_features=True, required_columns=[
JsonConstants.SEX_COLUMN, JsonConstants.AFFECTED_COLUMN,
])
], search_dataset_type=search_dataset_type, validate_expected_samples=validate_expected_samples)

missing_samples = [record['individualId'] for record in pedigree_records
if record['individualId'] not in request_json['vcfSamples']]
return pedigree_records, loaded_individual_ids, loaded_sample_type


def _validate_expected_samples(vcf_samples, search_dataset_type, record_family_ids, affected_status_by_family, previous_loaded_individuals, sample_type):
errors = []
if search_dataset_type and not sample_type:
errors.append('New data cannot be added to this project until the previously requested data is loaded')

missing_samples = sorted(set(record_family_ids.keys()) - set(vcf_samples))
if missing_samples:
errors.append('The following samples are included in the pedigree file but are missing from the VCF: {}'.format(
', '.join(missing_samples)))

if errors:
raise ErrorsWarningsException(errors, [])
families = set(record_family_ids.values())
missing_samples_by_family = defaultdict(list)
for loaded_individual in previous_loaded_individuals:
individual_id = loaded_individual[JsonConstants.INDIVIDUAL_ID_COLUMN]
family_id = loaded_individual[JsonConstants.FAMILY_ID_COLUMN]
if family_id in families and individual_id not in vcf_samples:
missing_samples_by_family[family_id].append(individual_id)
if missing_samples_by_family:
missing_family_sample_messages = [
f'Family {family_id}: {", ".join(sorted(individual_ids))}'
for family_id, individual_ids in missing_samples_by_family.items()
]
errors.append(
'In order to load data for families with previously loaded data, new family samples must be joint called in a single VCF with all previously loaded samples.'
' The following samples were previously loaded in this project but are missing from the VCF:\n' +
'\n'.join(sorted(missing_family_sample_messages))
)

return pedigree_records
validate_affected_families(affected_status_by_family, errors)

loaded_individual_ids = [
i['individual_id'] for i in previous_loaded_individuals if i[JsonConstants.FAMILY_ID_COLUMN] in families
]
return errors, loaded_individual_ids


def _trigger_add_workspace_data(project, pedigree_records, user, data_path, sample_type, previous_loaded_ids=None, get_pedigree_json=False):
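The refactored _parse_uploaded_pedigree above passes a validate_expected_samples closure into parse_basic_pedigree_table and uses nonlocal to hand the previously loaded individual IDs and sample type back out of the callback. A stripped-down illustration of that closure pattern, with generic names rather than seqr's actual API:

def parse_records(records, validate=None):
    # Stand-in for parse_basic_pedigree_table: parse, then let a caller-supplied
    # callback report validation errors before anything is returned.
    errors = validate(records) if validate else []
    if errors:
        raise ValueError(errors)
    return records

def load_pedigree(records):
    loaded_ids = []

    def validate(parsed):
        nonlocal loaded_ids  # reassigned inside the closure, so nonlocal is required
        loaded_ids = [r['id'] for r in parsed if r.get('loaded')]
        return ['missing sex for {}'.format(r['id']) for r in parsed if not r.get('sex')]

    parsed = parse_records(records, validate=validate)
    # The caller gets both the parsed records and the side results collected by
    # the closure, mirroring the tuple now returned by _parse_uploaded_pedigree.
    return parsed, loaded_ids

Running the sample/VCF checks inside the shared parsing path means their messages land in the same errors list as the pedigree validation errors, instead of short-circuiting with separate single-error responses as in the removed code.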
55 changes: 27 additions & 28 deletions seqr/views/apis/anvil_workspace_api_tests.py
@@ -26,8 +26,6 @@

MISSING_REQUIRED_SAMPLE_DATA = [["21", "HG00736", "", "", "", "", "", "", "", ""]]

LOAD_SAMPLE_DATA_EXTRA_SAMPLE = LOAD_SAMPLE_DATA + [["1", "NA19678", "", "", "", "Male", "Affected", "HP:0011675", "", ""]]

LOAD_SAMPLE_DATA_NO_AFFECTED = LOAD_SAMPLE_DATA + [["22", "HG00736", "", "", "", "Unknown", "Unknown", "", "", ""]]

FILE_DATA = [
@@ -635,33 +633,28 @@ def test_add_workspace_data(self, mock_compute_indiv_guid):
response = self.client.post(url, content_type='application/json', data=json.dumps(REQUEST_BODY))
self.assertEqual(response.status_code, 400)
self.assertEqual(
response.json()['error'],
'New data cannot be added to this project until the previously requested data is loaded',
response.json()['errors'],
['New data cannot be added to this project until the previously requested data is loaded'],
)

url = reverse(add_workspace_data, args=[PROJECT1_GUID])
self._test_errors(url, ['uploadedFileId', 'fullDataPath', 'vcfSamples'], TEST_WORKSPACE_NAME)
self._test_errors(url, ['uploadedFileId', 'fullDataPath', 'vcfSamples'], TEST_WORKSPACE_NAME, has_existing_data=True)

# Test Individual ID exists in an omitted family
# Test Individual ID exists in an omitted family and missing loaded samples
self.mock_load_file.return_value = LOAD_SAMPLE_DATA + INVALID_ADDED_SAMPLE_DATA
response = self.client.post(url, content_type='application/json', data=json.dumps(REQUEST_BODY))
self.assertEqual(response.status_code, 400)
response_json = response.json()
self.assertListEqual(response_json['errors'], [
'HG00731 already has loaded data and cannot be moved to a different family',
])

# Test missing loaded samples
self.mock_load_file.return_value = LOAD_SAMPLE_DATA
response = self.client.post(url, content_type='application/json', data=json.dumps(REQUEST_BODY))
self.assertEqual(response.status_code, 400)
self.assertEqual(
response.json()['error'],
'The following samples are included in the pedigree file but are missing from the VCF: HG00731',
'In order to load data for families with previously loaded data, new family samples must be joint called in a single VCF with all previously'
' loaded samples. The following samples were previously loaded in this project but are missing from the VCF:'
'\nFamily 1: NA19678')
'\nFamily 1: NA19678',
'HG00731 already has loaded data and cannot be moved to a different family',
])

# Test a valid operation
self.mock_load_file.return_value = LOAD_SAMPLE_DATA
mock_compute_indiv_guid.return_value = 'I0000020_hg00735'
response = self.client.post(url, content_type='application/json', data=json.dumps(REQUEST_BODY_ADD_DATA))
self.assertEqual(response.status_code, 200)
@@ -678,7 +671,7 @@ def test_add_workspace_data(self, mock_compute_indiv_guid):
self._test_mv_file_and_triggering_dag_exception(
url, {'guid': PROJECT2_GUID}, PROJECT2_SAMPLE_DATA, 'GRCh37', REQUEST_BODY_ADD_DATA2)

def _test_errors(self, url, fields, workspace_name):
def _test_errors(self, url, fields, workspace_name, has_existing_data=False):
# Test missing required fields in the request body
response = self.client.post(url, content_type='application/json', data=json.dumps({}))
self.assertEqual(response.status_code, 400)
Expand All @@ -699,30 +692,36 @@ def _test_errors(self, url, fields, workspace_name):
response_json = response.json()
self.assertListEqual(response_json['errors'], ['Missing Sex in row #4', 'Missing Affected in row #4'])

# test sample data error
# test sample data error and missing samples
self.mock_load_file.return_value = LOAD_SAMPLE_DATA + BAD_SAMPLE_DATA
response = self.client.post(url, content_type='application/json', data=json.dumps(REQUEST_BODY))
self.assertEqual(response.status_code, 400)
response_json = response.json()
missing_vcf_sample_error = (
'In order to load data for families with previously loaded data, new family samples must be joint called in '
'a single VCF with all previously loaded samples. The following samples were previously loaded in this '
'project but are missing from the VCF:\nFamily 1: NA19678'
)
missing_row_error = missing_vcf_sample_error if has_existing_data else \
'NA19678 is the father of NA19674 but is not included. Make sure to create an additional record with NA19678 as the Individual ID'
self.assertListEqual(response_json['errors'], [
'The following samples are included in the pedigree file but are missing from the VCF: NA19674, NA19681',
missing_row_error,
'NA19674 is affected but has no HPO terms',
'NA19681 has invalid HPO terms: HP:0100258',
'NA19678 is the father of NA19674 but is not included. Make sure to create an additional record with NA19678 as the Individual ID',
])

# test missing samples
self.mock_load_file.return_value = LOAD_SAMPLE_DATA_EXTRA_SAMPLE
response = self.client.post(url, content_type='application/json', data=json.dumps(REQUEST_BODY))
self.assertEqual(response.status_code, 400)
response_json = response.json()
self.assertEqual(response_json['errors'],
['The following samples are included in the pedigree file but are missing from the VCF: NA19678'])

self.mock_load_file.return_value = LOAD_SAMPLE_DATA_NO_AFFECTED
response = self.client.post(url, content_type='application/json', data=json.dumps(REQUEST_BODY))
self.assertEqual(response.status_code, 400)
response_json = response.json()
self.assertEqual(response_json['errors'],['The following families do not have any affected individuals: 22'])
errors = [
'The following samples are included in the pedigree file but are missing from the VCF: HG00736',
'The following families do not have any affected individuals: 22',
]
if has_existing_data:
errors.insert(1, missing_vcf_sample_error)
self.assertEqual(response_json['errors'],errors)

def _assert_valid_operation(self, project, test_add_data=True):
genome_version = 'GRCh37' if test_add_data else 'GRCh38'
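As the updated assertions above show, validation failures that previously came back as separate responses with a single 'error' string are now aggregated into one 400 response with an 'errors' list. For a bad upload against a project with existing loaded data, the body looks roughly like the following (error strings taken from the assertions above; any other fields in the response are not shown in this excerpt):

{
    'errors': [
        'The following samples are included in the pedigree file but are missing from the VCF: NA19674, NA19681',
        'In order to load data for families with previously loaded data, new family samples must be joint called in '
        'a single VCF with all previously loaded samples. The following samples were previously loaded in this '
        'project but are missing from the VCF:\nFamily 1: NA19678',
        'NA19674 is affected but has no HPO terms',
        'NA19681 has invalid HPO terms: HP:0100258',
    ],
}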
2 changes: 1 addition & 1 deletion seqr/views/apis/data_manager_api_tests.py
@@ -1627,7 +1627,7 @@ def _test_no_affected_family(self, url, body):
response = self.client.post(url, content_type='application/json', data=json.dumps(body))
self.assertEqual(response.status_code, 400)
self.assertDictEqual(response.json(), {
'errors': ['The following families have no affected individuals and can not be loaded to seqr: F000005_5'],
'errors': ['The following families do not have any affected individuals: F000005_5'],
'warnings': None,
})
Individual.objects.filter(guid='I000009_na20874').update(affected='A')
(Diffs for the remaining 3 changed files are not shown in this excerpt.)
