Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Benb/use metadata as source of family table load #936

Merged
merged 2 commits into from
Oct 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 22 additions & 22 deletions v03_pipeline/lib/tasks/write_project_family_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@
import luigi
import luigi.util

from v03_pipeline.lib.misc.io import import_pedigree
from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families
from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path
from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
from v03_pipeline.lib.tasks.files import RawFileTask
from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask
from v03_pipeline.lib.tasks.write_family_table import WriteFamilyTableTask
from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import (
WriteRemappedAndSubsettedCallsetTask,
)


@luigi.util.inherits(BaseLoadingRunParams)
Expand All @@ -26,27 +27,26 @@ def complete(self) -> bool:
for write_family_table_task in self.dynamic_write_family_table_tasks
)

def run(self):
# https://luigi.readthedocs.io/en/stable/tasks.html#dynamic-dependencies
# Fetch family guids from project table
update_project_table_task: luigi.Target = yield self.clone(
UpdateProjectTableTask,
)
project_ht = hl.read_table(update_project_table_task.path)
family_guids_in_project_table = set(hl.eval(project_ht.globals.family_guids))
def requires(self) -> list[luigi.Task]:
return [
self.clone(
WriteRemappedAndSubsettedCallsetTask,
),
self.clone(
UpdateProjectTableTask,
),
]

# Fetch family guids from pedigree
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The specific failure: a family existed in the pedigree but failed a check, so it was excluded from the rest of the pipeline, yet it was still present in the project table from a different callset.

pedigree_ht_task: luigi.Target = yield RawFileTask(self.project_pedigree_path)
pedigree_ht = import_pedigree(pedigree_ht_task.path)
families_guids_in_pedigree = {
f.family_guid for f in parse_pedigree_ht_to_families(pedigree_ht)
}

# Intersect them
family_guids_to_load = (
family_guids_in_project_table & families_guids_in_pedigree
def run(self):
ht = hl.read_matrix_table(
remapped_and_subsetted_callset_path(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just sanity-checking that this works for AnVIL loading, where we don't have remaps (or, usually, subsets).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, the same file gets generated; we just run a "subset" that has all families in it.

self.reference_genome,
self.dataset_type,
self.callset_path,
self.project_guid,
),
)
for family_guid in family_guids_to_load:
for family_guid in set(hl.eval(ht.globals.family_samples).keys()):
self.dynamic_write_family_table_tasks.add(
self.clone(WriteFamilyTableTask, family_guid=family_guid),
)
Expand Down
68 changes: 62 additions & 6 deletions v03_pipeline/lib/tasks/write_project_family_tables_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
import luigi.worker

from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
from v03_pipeline.lib.paths import project_table_path
from v03_pipeline.lib.paths import (
project_table_path,
remapped_and_subsetted_callset_path,
)
from v03_pipeline.lib.tasks.write_project_family_tables import (
WriteProjectFamilyTablesTask,
)
Expand Down Expand Up @@ -38,6 +41,33 @@ def test_snv_write_project_family_tables_task(self) -> None:
hl.read_table(write_family_table_task.output().path)
for write_family_table_task in write_project_family_tables.dynamic_write_family_table_tasks
]
# Validate remapped and subsetted callset families
remapped_and_subsetted_callset = hl.read_matrix_table(
remapped_and_subsetted_callset_path(
ReferenceGenome.GRCh38,
DatasetType.SNV_INDEL,
TEST_SNV_INDEL_VCF,
'R0113_test_project',
),
)
self.assertCountEqual(
hl.eval(remapped_and_subsetted_callset.globals.family_samples.keys()),
{
'123_1',
'234_1',
'345_1',
'456_1',
'567_1',
'678_1',
'789_1',
'890_1',
'901_1',
'bcd_1',
'cde_1',
'def_1',
'efg_1',
},
)
self.assertCountEqual(
[ht.globals.sample_ids.collect() for ht in hts],
[
Expand Down Expand Up @@ -73,13 +103,39 @@ def test_snv_write_project_family_tables_task(self) -> None:
worker.run()
self.assertTrue(write_project_family_tables_subset.complete())
hts = [
hl.read_table(write_family_table_task.output().path)
write_family_table_task.output().path
for write_family_table_task in write_project_family_tables_subset.dynamic_write_family_table_tasks
]
# Only one family table written
self.assertEqual(
len(hts),
1,
self.assertTrue(len(hts))
self.assertTrue(
'123_1' in hts[0],
)
# Validate remapped and subsetted callset families
# (and that it was re-written)
remapped_and_subsetted_callset = hl.read_matrix_table(
remapped_and_subsetted_callset_path(
ReferenceGenome.GRCh38,
DatasetType.SNV_INDEL,
TEST_SNV_INDEL_VCF,
'R0113_test_project',
),
)
self.assertCountEqual(
hl.eval(remapped_and_subsetted_callset.globals.family_samples.keys()),
{'123_1'},
)
self.assertCountEqual(
hl.eval(remapped_and_subsetted_callset.globals.failed_family_samples),
hl.Struct(
missing_samples={
'234_1': {
'reasons': ["Missing samples: {'NA19678_999'}"],
'samples': ['NA19678_1', 'NA19678_999'],
},
},
relatedness_check={},
sex_check={},
),
)
# Project table still contains all family guids
self.assertCountEqual(
Expand Down
2 changes: 2 additions & 0 deletions v03_pipeline/var/test/pedigrees/test_pedigree_4_subset.tsv
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
Project_GUID Family_GUID Family_ID Individual_ID Paternal_ID Maternal_ID Sex
R0114_project4 123_1 123 NA19675_1 F
R0114_project4 234_1 234 NA19678_1 M
R0114_project4 234_1 234 NA19678_999 F
Loading