Skip to content

Commit

Permalink
VCF import - only keep standard contigs
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Dec 9, 2024
1 parent 5128e26 commit 9ec34d6
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 11 deletions.
8 changes: 5 additions & 3 deletions library/genomics/vcf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def get_contigs_header_lines(genome_build, standard_only=True, use_accession=Tru


def write_cleaned_vcf_header(genome_build, source_vcf_filename: str, output_filename: str,
new_info_lines: list[str] = None, standard_contigs_only=False):
new_info_lines: list[str] = None, standard_contigs_only=True):
contig_regex = re.compile(r"^##contig=<ID=(.+),length=(\d+)")

header_lines = []
Expand All @@ -204,7 +204,7 @@ def write_cleaned_vcf_header(genome_build, source_vcf_filename: str, output_file
if new_info_lines:
for new_info_line in new_info_lines:
f.write(new_info_line + "\n")
for contig_line in get_contigs_header_lines(genome_build):
for contig_line in get_contigs_header_lines(genome_build, standard_only=standard_contigs_only):
f.write(contig_line + "\n")

elif m := contig_regex.match(line):
Expand All @@ -213,7 +213,7 @@ def write_cleaned_vcf_header(genome_build, source_vcf_filename: str, output_file
if contig := genome_build.chrom_contig_mappings.get(contig_name):
if standard_contigs_only:
if contig.role != SequenceRole.ASSEMBLED_MOLECULE:
continue
continue # No validation

if fasta_chrom := contig_to_fasta_names.get(contig.pk):
provided_contig_length = int(provided_contig_length)
Expand All @@ -222,6 +222,8 @@ def write_cleaned_vcf_header(genome_build, source_vcf_filename: str, output_file
msg = f"VCF header contig '{contig_name}' (length={provided_contig_length}) has " + \
f"different length than ref contig {fasta_chrom} (length={ref_contig_length})"
raise ValueError(msg)
# Don't write any old contigs
continue

f.write(line + "\n")

Expand Down
21 changes: 16 additions & 5 deletions snpdb/models/models_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,10 +173,9 @@ def get_build_annotation_descriptions():
build_annotation_descriptions = ", ".join([f"{k}: {v}" for k, v in build_consortia.items()])
return build_annotation_descriptions

@cached_property
def chrom_contig_mappings(self) -> dict[str, 'Contig']:
def _get_chrom_contig_mappings(self, contigs: Iterable['Contig']) -> dict[str, 'Contig']:
chrom_contig_mappings = {}
for contig in self.contigs:
for contig in contigs:
chrom_contig_mappings[contig.name] = contig
chrom_contig_mappings[contig.ucsc_name] = contig
chrom_contig_mappings[contig.genbank_accession] = contig
Expand All @@ -187,8 +186,20 @@ def chrom_contig_mappings(self) -> dict[str, 'Contig']:
chrom_contig_mappings["mt"] = mt
return chrom_contig_mappings

def get_chrom_contig_id_mappings(self) -> dict[str, int]:
return {k: v.pk for k, v in self.chrom_contig_mappings.items()}
@cached_property
def chrom_contig_mappings(self) -> dict[str, 'Contig']:
return self._get_chrom_contig_mappings(self.contigs)

@cached_property
def chrom_standard_contig_mappings(self) -> dict[str, 'Contig']:
return self._get_chrom_contig_mappings(self.standard_contigs)

def get_chrom_contig_id_mappings(self, standard_contigs_only=False) -> dict[str, int]:
if standard_contigs_only:
chrom_contig_mappings = self.chrom_standard_contig_mappings
else:
chrom_contig_mappings = self.chrom_contig_mappings
return {k: v.pk for k, v in chrom_contig_mappings.items()}

def convert_chrom_to_contig_accession(self, chrom: str) -> str:
""" chrom = ucsc_name/genbank_accession/refseq accession """
Expand Down
6 changes: 3 additions & 3 deletions upload/management/commands/vcf_clean_and_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ def handle(self, *args, **options):

genome_build = GenomeBuild.get_name_or_alias(build_name)
genome_fasta = GenomeFasta.get_for_genome_build(genome_build)
chrom_to_contig_id = genome_build.get_chrom_contig_id_mappings()
contig_lengths = dict(genome_build.contigs.values_list("pk", "length"))
chrom_to_contig_id = genome_build.get_chrom_contig_id_mappings(standard_contigs_only=True)
contig_lengths = {contig_id: int(length) for contig_id, length in genome_build.contigs.values_list("pk", "length")}
contig_to_fasta_names = genome_fasta.get_contig_id_to_name_mappings()

if vcf_filename == '-':
Expand All @@ -57,7 +57,7 @@ def handle(self, *args, **options):
skipped_filters = Counter()

ref_standard_bases_pattern = re.compile(r"[GATCN]") # Reference can be N (and FreeBayes often writes these)
alt_standard_bases_pattern = re.compile(r"[GATC,\.]") # Can be multi-alts, or "." for reference
alt_standard_bases_pattern = re.compile(r"[GATC,.]") # Can be multi-alts, or "." for reference

skip_patterns = {}
if skip_regex := getattr(settings, "VCF_IMPORT_SKIP_RECORD_REGEX", {}):
Expand Down
1 change: 1 addition & 0 deletions upload/vcf/bulk_genotype_vcf_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class BulkGenotypeVCFProcessor(AbstractBulkVCFProcessor):
# v20. Split into common/uncommon genotype collections
# v21. Support CNV, store unknown format/info fields
# v22. Normalize SVLEN based on settings, change gnomAD preprocessing INFO to avoid collisions
# v23. Fixes to clean_and_filter - Standard chromosomes only
VCF_IMPORTER_VERSION = 22 # Change this if you make a major change to the code.
# Need to distinguish between no entry and 0, can't use None w/postgres command line inserts
DEFAULT_AD_FIELD = 'AD' # What CyVCF2 uses
Expand Down

0 comments on commit 9ec34d6

Please sign in to comment.