Skip to content

Commit

Permalink
Feature download sorting (#43)
Browse files Browse the repository at this point in the history
* sequence check function and test cases

* re-organise files and formatting
  • Loading branch information
vestalisvirginis authored Jan 3, 2024
1 parent 7617542 commit 723373b
Show file tree
Hide file tree
Showing 14 changed files with 507 additions and 110 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ clean: # Remove workspace files
@rm -rf .scannerwork
@rm -rf ./dist
@rm -rf test/fixtures/assets_testing_folder/ncbi_download/fetch
@rm -rf test/fixtures/assets_testing_folder/sequence_quality/genbank
@rm -rf test/fixtures/assets_testing_folder/blasting/gene_identity
@rm -rf test/fixtures/assets_testing_folder/blasting_with_history/gene_identity
@rm -rf test/fixtures/assets_testing_folder/synteny/synteny
Expand Down
5 changes: 5 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ def mock_env_ncbi_fetch(monkeypatch):
monkeypatch.setenv("PHAGY_DIRECTORY", "test/fixtures/ncbi_download/fetch")


@pytest.fixture
def mock_env_sequence_check(monkeypatch):
    """Set PHAGY_DIRECTORY to the sequence_quality test-assets folder."""
    _fixture_dir = "test/fixtures/assets_testing_folder/sequence_quality"
    monkeypatch.setenv("PHAGY_DIRECTORY", _fixture_dir)


@pytest.fixture
def mock_env_phagy_dir_blasting(monkeypatch):
monkeypatch.setenv("PHAGY_DIRECTORY", "test/fixtures/assets_testing_folder/blasting")
Expand Down
60 changes: 0 additions & 60 deletions synphage/assets/blaster/blaster.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
)

import os
import re
import shutil
import pickle

from Bio import SeqIO
Expand Down Expand Up @@ -44,64 +42,6 @@
}


# @asset(
# config_schema={**sqc_folder_config},
# description="Select for phages with complete genome sequence",
# compute_kind="Biopython",
# metadata={"owner": "Virginie Grosboillot"},
# )
# def sequence_sorting(context, fetch_genome) -> List[str]:
# context.log.info(f"Number of genomes in download folder: {len(fetch_genome)}")

# _complete_sequences = []
# for _file in fetch_genome:
# for _p in SeqIO.parse(_file, "gb"):
# if re.search("complete genome", _p.description):
# _complete_sequences.append(_file)

# context.log.info(f"Number of complete sequences: {len(_complete_sequences)}")

# _bacillus_sub_sequences = []
# for _file in _complete_sequences:
# for _p in SeqIO.parse(_file, "gb"):
# for _feature in _p.features:
# if _feature.type == "source":
# for _v in _feature.qualifiers.values():
# if re.search("Bacillus subtilis", _v[0]):
# _bacillus_sub_sequences.append(_file)

# context.log.info(
# f"Number of Bacillus subtilis sequences: {len(_bacillus_sub_sequences)}"
# )

# _genes_in_sequences = []
# for _file in _bacillus_sub_sequences:
# for _p in SeqIO.parse(_file, "gb"):
# if set(["gene"]).issubset(set([type_f.type for type_f in _p.features])):
# _genes_in_sequences.append(_file)

# context.log.info(
# f"Number of sequences with gene features: {len(_genes_in_sequences)}"
# )

# _gb_path = "/".join(
# [os.getenv(EnvVar("PHAGY_DIRECTORY")), context.op_config["genebank_dir"]]
# )

# for _file in _genes_in_sequences:
# shutil.copy2(
# _file,
# f"{_gb_path}/{Path(_file).stem}.gb",
# )

# return list(
# map(
# lambda x: Path(x).stem,
# os.listdir(_gb_path),
# )
# )


def _assess_file_content(genome) -> bool:
"""Assess wether the genbank file contains gene or only CDS"""

Expand Down
14 changes: 6 additions & 8 deletions synphage/assets/ncbi_connect/accession.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ def _get_ncbi_count_result(result, dbname) -> NucleotideRecord:

ncbi_query_config = {
"database": Field(str, description="Database identifier", default_value="nuccore"),
#"keyword": Field(
#str,
#description="Search criteria for the ncbi query",
#default_value=os.getenv(EnvVar("KEYWORD"), "Listeria ivanovii"),
#),
# "keyword": Field(
# str,
# description="Search criteria for the ncbi query",
# default_value=os.getenv(EnvVar("KEYWORD"), "Listeria ivanovii"),
# ),
}


Expand All @@ -38,9 +38,7 @@ def _get_ncbi_count_result(result, dbname) -> NucleotideRecord:
def accession_count(context) -> int:
keyword = os.getenv(EnvVar("KEYWORD"), "Listeria ivanovii")
context.log.info(keyword)
_query = context.resources.ncbi_connection.conn.egquery(
term=keyword
)
_query = context.resources.ncbi_connection.conn.egquery(term=keyword)
_result = context.resources.ncbi_connection.conn.read(_query)
_query.close()
_nucleotide = _get_ncbi_count_result(_result, context.op_config["database"])
Expand Down
73 changes: 73 additions & 0 deletions synphage/assets/ncbi_connect/sequence_quality_assessment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from dagster import (
asset,
EnvVar,
Field,
)

import os
import shutil

from pathlib import Path
from datetime import datetime
from typing import List


# Folder settings exposed as asset config; each entry is a string Field
# with a default value.
sqc_folder_config = dict(
    sqc_download_dir=Field(
        str,
        description="Path to folder containing the downloaded genbank sequences",
        default_value="download",
    ),
    genbank_dir=Field(
        str,
        description="Path to folder containing the genbank files",
        default_value="genbank",
    ),
    fasta_dir=Field(
        str,
        description="Path to folder containing the fasta sequence files",
        default_value="gene_identity/fasta",
    ),
)


@asset(
    config_schema={**sqc_folder_config},
    description="Checks for sequence quality and accuracy",
    compute_kind="Biopython",
    metadata={"owner": "Virginie Grosboillot"},
)
def sequence_check(context, fetch_genome) -> List[str]:
    """Copy the fetched genbank files into the configured genbank folder.

    The destination is ``$PHAGY_DIRECTORY/<genbank_dir>`` (``genbank_dir``
    comes from the op config) and is created when missing. Each input file
    is copied as ``<stem>.gb``, with every ``.`` in the stem replaced by
    ``_``.

    Args:
        context: Execution context; used for logging, op config and output
            metadata.
        fetch_genome: Paths of the files to copy.

    Returns:
        The stems of every entry found in the destination folder after the
        copy (not only the files copied by this call).
    """
    # Local import: only `datetime` is imported at module level.
    from datetime import timezone

    context.log.info(f"Number of genomes in download folder: {len(fetch_genome)}")

    _gb_path = "/".join(
        [os.getenv(EnvVar("PHAGY_DIRECTORY")), context.op_config["genbank_dir"]]
    )
    os.makedirs(_gb_path, exist_ok=True)

    # TODO: add check to assess the quality of the query

    for _file in fetch_genome:
        shutil.copy2(
            _file,
            f"{_gb_path}/{Path(_file).stem.replace('.', '_')}.gb",
        )

    _downloaded_files = [Path(_name).stem for _name in os.listdir(_gb_path)]

    # The metadata text advertises UTC, so take a timezone-aware UTC timestamp
    # rather than the naive local time returned by a bare datetime.now().
    _time = datetime.now(timezone.utc)
    context.add_output_metadata(
        metadata={
            "text_metadata": f"List of downloaded sequences {_time.isoformat()} (UTC).",
            "path": _gb_path,
            "num_files": len(_downloaded_files),
            "preview": _downloaded_files,
        }
    )

    return _downloaded_files
2 changes: 1 addition & 1 deletion synphage/assets/viewer/viewer.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,7 @@ def create_graph(context, create_genome, config: Diagram):
ypos = int(math.trunc(height * 0.9))
context.log.info(f"Coord of SVG: {str(xpos)} : {str(ypos)}")

legend_path = 'synphage/assets/viewer/legend.svg'
legend_path = "synphage/assets/viewer/legend.svg"
# (f"{_output_folder}/legend.svg")
C.Figure(
f"{width}px",
Expand Down
Loading

0 comments on commit 723373b

Please sign in to comment.