Skip to content

Commit

Permalink
feat(ingest): Add option to group segments with grouping_override in …
Browse files Browse the repository at this point in the history
…JSON file (#3509)

* Add an option to group segments using a list of overrides from a JSON file (`grouping_override` in the config) before the remaining segments are grouped with the existing heuristic. The JSON file must contain a map from each group name to the list of `insdcAccessionFull` values of the segments that should be grouped together.
  • Loading branch information
anna-parker authored Jan 30, 2025
1 parent 626182c commit a58edd1
Show file tree
Hide file tree
Showing 6 changed files with 401 additions and 19 deletions.
73 changes: 61 additions & 12 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@ FILTER_FASTA_HEADERS = config.get("filter_fasta_headers", None)
APPROVE_TIMEOUT_MIN = config.get("approve_timeout_min") # time in minutes
CHECK_ENA_DEPOSITION = config.get("check_ena_deposition", False)
ALIGN = True
GROUPS_OVERRIDE_JSON = None

dataset_server_map = {}
dataset_name_map = {}

if SEGMENTED:
GROUPS_OVERRIDE_JSON = config.get("grouping_override", None) # JSON map from group to segments' insdcAccessionFull
if config.get("minimizer_index") and config.get("minimizer_parser"):
ALIGN = False
if ALIGN:
Expand Down Expand Up @@ -91,7 +93,7 @@ rule fetch_ncbi_dataset_package:
datasets download virus genome taxon {params.taxon_id} \
--no-progressbar \
--filename {output.dataset_package} \
--api-key {params.api_key} \
--api-key {params.api_key}\
"""


Expand Down Expand Up @@ -269,10 +271,9 @@ if ALIGN:


if not ALIGN:

rule download:
rule download_minimizer:
output:
results="results/minimzer.json",
results="results/minimizer.json",
params:
minimizer=config.get("minimizer_index"),
shell:
Expand All @@ -282,15 +283,15 @@ if not ALIGN:

rule nextclade_sort:
input:
sequences="results/sequences.fasta",
minimizer="results/minimzer.json",
sequences= "results/sequences.fasta",
minimizer="results/minimizer.json",
output:
results="results/sort_results.tsv",
shell:
"""
nextclade sort -m {input.minimizer} \
-r {output.results} {input.sequences} \
--max-score-gap 0.3 --min-score 0.1 --min-hits 2 --all-matches
--max-score-gap 0.3 --min-score 0.05 --min-hits 2 --all-matches
"""

rule parse_sort:
Expand Down Expand Up @@ -339,18 +340,62 @@ rule prepare_metadata:
--log-level {params.log_level} \
"""

if GROUPS_OVERRIDE_JSON:
    # Both rules below are only defined when a `grouping_override` location is configured.

    rule download_groups:
        """Download the grouping override JSON from the configured location"""
        output:
            results="results/groups.json",
        params:
            grouping=config.get("grouping_override"),
        # --fail makes curl exit non-zero on an HTTP error instead of saving the
        # server's error page as groups.json; `:q` shell-quotes the configured
        # location so characters such as `?` or `&` reach curl unchanged.
        shell:
            """
            curl --fail -L -o {output.results} {params.grouping:q}
            """

    rule override_group_segments:
        """Group segments based on JSON map"""
        input:
            script="scripts/override_group_segments.py",
            metadata="results/metadata_post_prepare.ndjson",
            sequences="results/sequences.ndjson",
            config="results/config.yaml",
            groups="results/groups.json",
        # Grouped and ungrouped records are written to separate files; the
        # *_ungrouped files are the inputs that heuristic grouping reads when an
        # override is configured.
        output:
            metadata="results/metadata_grouped.ndjson",
            sequences="results/sequences_grouped.ndjson",
            ungrouped_metadata="results/metadata_ungrouped.ndjson",
            ungrouped_sequences="results/sequences_ungrouped.ndjson",
        params:
            log_level=LOG_LEVEL,
        shell:
            """
            python {input.script} \
            --config-file {input.config} \
            --groups {input.groups} \
            --input-metadata {input.metadata} \
            --input-seq {input.sequences} \
            --output-metadata {output.metadata} \
            --output-ungrouped-metadata {output.ungrouped_metadata} \
            --output-ungrouped-seq {output.ungrouped_sequences} \
            --output-seq {output.sequences} \
            --log-level {params.log_level} \
            """


rule group_segments:
rule heuristic_group_segments:
"""Group segments based on heuristic, join with previous groups if available"""
input:
script="scripts/group_segments.py",
metadata="results/metadata_post_prepare.ndjson",
sequences="results/sequences.ndjson",
script="scripts/heuristic_group_segments.py",
metadata="results/metadata_ungrouped.ndjson" if GROUPS_OVERRIDE_JSON else "results/metadata_post_prepare.ndjson",
sequences="results/sequences_ungrouped.ndjson" if GROUPS_OVERRIDE_JSON else "results/sequences.ndjson",
metadata_grouped="results/metadata_grouped.ndjson" if GROUPS_OVERRIDE_JSON else "results/metadata_post_prepare.ndjson",
sequences_grouped="results/sequences_grouped.ndjson" if GROUPS_OVERRIDE_JSON else "results/sequences.ndjson",
config="results/config.yaml",
output:
metadata="results/metadata_post_group.ndjson",
sequences="results/sequences_post_group.ndjson",
params:
log_level=LOG_LEVEL,
GROUPS_OVERRIDE_JSON="true" if GROUPS_OVERRIDE_JSON else "false"
shell:
"""
python {input.script} \
Expand All @@ -359,7 +404,11 @@ rule group_segments:
--input-seq {input.sequences} \
--output-metadata {output.metadata} \
--output-seq {output.sequences} \
--log-level {params.log_level} \
--log-level {params.log_level}
if [ "{params.GROUPS_OVERRIDE_JSON}" = "true" ]; then
cat {input.metadata_grouped} >> {output.metadata}
cat {input.sequences_grouped} >> {output.sequences}
fi
"""


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import pathlib
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Final

import click
Expand Down Expand Up @@ -57,7 +56,7 @@ def values_with_sorted_authors(values: dict[str, str]) -> dict[str, str]:
class Config:
compound_country_field: str
fasta_id_field: str
insdc_segment_specific_fields: list[str] # What does this field mean?
insdc_segment_specific_fields: list[str] # Fields that can vary between segments in a group
nucleotide_sequences: list[str]
segmented: bool

Expand Down
Loading

0 comments on commit a58edd1

Please sign in to comment.