Skip to content

Commit

Permalink
Merge pull request #78 from kbessonov1984/master
Browse files Browse the repository at this point in the history
Version 0.9.1 addressing minor issues on species identification and fasta files handling
  • Loading branch information
kbessonov1984 authored Dec 7, 2019
2 parents bcc0e6a + 5fe0483 commit a7e67b6
Show file tree
Hide file tree
Showing 12 changed files with 90,109 additions and 82 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,10 @@ Did not found any noticeable negative effect on specificity and accuracy based o
* wrote additional unit tests


**v0.9.1**
* Implemented better species identification error handling for cases when accession number
is not found in the assembly stats. This is especially important for custom
MASH sketches that might have accession numbers not found in `assembly_summary.txt` stats file
* Corrected issues #76 and 77. Improved behaviour in case of no `--verify` switch is
specified and MASH distance to RefSeq genomes fails
* Added additional test cases for non-E.coli genomes
3 changes: 2 additions & 1 deletion ectyper/ectyper.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def run_program():
temp_dir)



LOG.info("Standardizing the E.coli genome headers based on file names") #e.g. lcl|Escherichia_O26H11|17
#final_fasta_files \
predictions_dict={}
Expand All @@ -108,7 +109,7 @@ def run_program():

# Add empty rows for genomes without a blast result or non-E.coli samples that did not undergo typing
final_predictions = predictionFunctions.add_non_predicted(
raw_genome_files, predictions_dict, other_genomes_dict)
raw_genome_files, predictions_dict, other_genomes_dict, ecoli_genomes_dict)

# Store most recent result in working directory
LOG.info("Reporting results:\n")
Expand Down
11 changes: 5 additions & 6 deletions ectyper/genomeFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,26 +98,25 @@ def get_genome_names_from_files(files_dict, temp_dir, args):
:param files: All the fasta files for analyses
:param temp_dir: The ectyper temp directory
:param args: Commandline arguments
:return: List of files with fasta headers modified for filename
:return: Dictionary of files with the fasta headers modified for each filename {sampleid: {species:"","filepath":"","modheaderfile":"","error":""}}
"""
files=[]
for sample in files_dict.keys(): #{'Escherichia_O26H11': {'species': 'Escherichia coli', 'filepath': '/Data/Escherichia_O26H11.fasta'}}
for sample in files_dict.keys(): # files_dict = {'Escherichia_O26H11': {'species': 'Escherichia coli', 'filepath': '/Data/Escherichia_O26H11.fasta'}}
files.append(files_dict[sample]["filepath"])
#modified_genomes = []

partial_ghw = partial(genome_header_wrapper, temp_dir=temp_dir)


with Pool(processes=args.cores) as pool:
(results)= pool.map(partial_ghw, files)
#print(results)

for r in results:
sample=r["samplename"]
files_dict[sample]["modheaderfile"] = r["newfile"]
#modified_genomes.append(r)

modified_genomes = [files_dict[samples]["modheaderfile"] for samples in files_dict.keys()]
LOG.debug(("Modified genomes: {}".format(modified_genomes)))
#return modified_genomes

return files_dict

def genome_header_wrapper(file, temp_dir):
Expand Down
7 changes: 4 additions & 3 deletions ectyper/predictionFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,7 @@ def report_result(final_dict, output_dir, output_file):
ofh.write(line)


def add_non_predicted(all_genomes_list, predictions_dict, other_dict):
def add_non_predicted(all_genomes_list, predictions_dict, other_dict, ecoli_dict):
"""
Add genomes that do not show up in the blast results to the final predictions
Expand All @@ -378,6 +378,7 @@ def add_non_predicted(all_genomes_list, predictions_dict, other_dict):
for g in all_genomes_list:
gname = os.path.splitext(os.path.split(g)[1])[0]


if gname not in predictions_dict:
if gname in other_dict:
predictions_dict[gname] = {
Expand All @@ -386,8 +387,8 @@ def add_non_predicted(all_genomes_list, predictions_dict, other_dict):
}
else:
predictions_dict[gname] = {
'error': "No serotyping-specific genes found",
'species': other_dict[gname]["species"]
'error': "No serotyping-specific E.coli genes found",
'species': ecoli_dict[gname]["species"]
}

return predictions_dict
140 changes: 93 additions & 47 deletions ectyper/speciesIdentification.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def get_refseq_mash():
if response.status_code == 200:
with portalocker.Lock(filename=targetpath, mode="wb", flags=portalocker.LOCK_EX) as fp:
fp.write(response.content); fp.flush()
download_RefSeq_assembly_summary()
download_assembly_summary()
except Exception as e:
LOG.error("Failed to download refseq.genomes.k21s1000.msh from {}.\nError msg {}".format(url,e))
pass
Expand All @@ -70,32 +70,37 @@ def get_refseq_mash():
else:
assemblysummarypath = os.path.join(os.path.dirname(__file__), "Data/assembly_summary_refseq.txt")
if os.path.exists(assemblysummarypath) == False:
download_RefSeq_assembly_summary()
LOG.info("RefSeq sketch is in good health and does not need to be downloaded")
download_assembly_summary()
LOG.info("RefSeq sketch (refseq.genomes.k21s1000.msh) and assembly meta data (assembly_summary_refseq.txt) is in good health and does not need to be downloaded")
return True

def download_RefSeq_assembly_summary():
url = "http://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt"
targetpath = os.path.join(os.path.dirname(__file__), "Data/assembly_summary_refseq.txt")
def download_assembly_summary():
sourceurls = {"assembly_summary_refseq.txt":"http://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt"}

try:
response = requests.get(url, timeout=10, verify=False)
response.raise_for_status()
for targetfile, url in sourceurls.items():
targetpath = os.path.join(os.path.dirname(__file__), "Data/"+targetfile)
response = requests.get(url, timeout=10, verify=False)
response.raise_for_status()

if response.status_code == 200:
with portalocker.Lock(filename=targetpath, mode="w", flags=portalocker.LOCK_EX) as fp:
fp.write(response.text); fp.flush()
LOG.info("Successfully downloaded {} with response code {}".format(targetfile, response.status_code))
else:
LOG.critical("Server response error {}. Failed to download {}".format(response.status_code,targetfile))


if response.status_code == 200:
with portalocker.Lock(filename=targetpath, mode="w", flags=portalocker.LOCK_EX) as fp:
fp.write(response.text); fp.flush()
LOG.info("Successfully downloaded assembly_summary_refseq.txt".format(response.status_code))
else:
LOG.critical("Server response error {}. Failed to download assembly_summary_refseq.txt".format(response.status_code))
if not os.path.exists(targetpath):
exit(1)
LOG.critical("The {} file does not exist or is corrupted at {} path".format(targetfile, targetpath))

except Exception as e:
LOG.critical("Failed to download or write to a disk assembly_summary_refseq.txt from {}.\nError msg {}".format(url, str(e)))
print(e)
LOG.critical("Failed to download or write to a disk assembly_summary_genbank.txt or assembly_summary_refseq.txt.\nError msg {}".format(str(e)))
pass

if (os.path.exists(targetpath)) == False or os.path.getsize(targetpath) < 54000000:
exit(1)
LOG.critical("The assembly_summary_refseq.txt file does not exist or is corrupted at {} path".format(targetpath))



def is_ecoli(genome_file, temp_dir):
"""
Expand All @@ -105,6 +110,7 @@ def is_ecoli(genome_file, temp_dir):
:param temp_dir: ectyper run temp_dir
:return: True or False
"""
LOG.info("Verifying if sample is a valid E.coli genome based on the 10 markers")
num_hit = get_num_hits(genome_file, temp_dir)

if num_hit < 3:
Expand All @@ -117,6 +123,7 @@ def is_ecoli(genome_file, temp_dir):
"present".format(os.path.basename(genome_file)))
return False
else:
LOG.info("Samples is a valid E.coli genome based on the {} markers".format(num_hit))
return True

def is_escherichia_genus(speciesname):
Expand Down Expand Up @@ -204,34 +211,63 @@ def get_species(file, args):

head_cmd = [
'head',
'-n', '1'
'-n', '5'
]

head_output = subprocess_util.run_subprocess(head_cmd,
input_data=sort_output.stdout)
top_hit = head_output.stdout.decode("utf-8").split()
top_match = top_hit[0]; top_match_dist = top_hit[2]; top_match_hashratio = top_hit[4]
top_hit_lines = head_output.stdout.decode("utf-8").split('\n')

LOG.info("MASH species RefSeq top hit {} with distance {} and shared hashes ratio {}".format(top_match,top_match_dist,top_match_hashratio))
LOG.info("Following top hits returned by MASH {}".format([top_hit_line.split("\t")[0] for top_hit_line in top_hit_lines]))

m = re.match(r"(GCF_\d+)", top_match)
#refseq_key = None
if m:
refseq_key = m.group(1)
else:
LOG.critical("Unknown key from MASH species search")
return("Undetermined species. Could not map genome accession #{} to species".format(top_match))
#exit("MASH error")

LOG.info(refseq_key)
grep_cmd = [
'grep',
refseq_key,
definitions.REFSEQ_SUMMARY
]
grep_output = subprocess_util.run_subprocess(grep_cmd)
for top_hit_line in top_hit_lines:
top_hit_line_elements = top_hit_line.split()


top_match = top_hit_line_elements[0]; top_match_dist = top_hit_line_elements[2]; top_match_hashratio = top_hit_line_elements[4]

species = grep_output.stdout.decode("utf-8").split('\t')[7]
m = re.match(r"(GCF_\d+)", top_match)

top_match_hashratio_tuple = re.findall('(\d+)\/(\d+)', top_match_hashratio)[0]
top_match_sharedhashes = int(top_match_hashratio_tuple[0])

if m:
refseq_key = m.group(1)
else:
LOG.warning("Could not detemine species based on MASH Distance"
"Could not extract GCF_# accession number from the MASH dist results.".format(top_match))
continue #try other top match

if m is None or top_match_sharedhashes < 100:
LOG.warning("\nCould not detemine species based on MASH Distance.\n"
"Either:\n"
"1. MASH sketch meta data accessions do not start with GCF_ prefix or\n"
"2. Number of shared hashes to reference is below 100.\n"
"3. Meta data can not match none of the top 5 RefSeq accession numbers\n"
"Found {} shared hashes to {}".format(top_match_hashratio,top_match))
species = "-" # if after 10 IDs still no accession number match, give up
return species

LOG.info(refseq_key)
grep_cmd = [
'grep',
refseq_key,
definitions.REFSEQ_SUMMARY
]
grep_output = subprocess_util.run_subprocess(grep_cmd, ignorereturncode=True)
grep_output_decoded = grep_output.stdout.decode("utf-8").split('\t')

if grep_output_decoded and len(grep_output_decoded) >= 8:
species = grep_output.stdout.decode("utf-8").split('\t')[7]
else:
species = "-"

if not species == "-":
break #no need to continue looping if top hit species is found

LOG.info(
"MASH species RefSeq top hit {} with distance {} and shared hashes ratio {}".format(top_match, top_match_dist,
top_match_hashratio))
LOG.info("MASH dist predicted species name: {}".format(species))

return species
Expand All @@ -257,7 +293,7 @@ def verify_ecoli(fasta_fastq_files_dict, ofiles, args, temp_dir):
ecoli_files_dict = {}
other_files_dict = {}

fasta_files= fasta_fastq_files_dict.keys()
fasta_files = fasta_fastq_files_dict.keys()
for fasta in fasta_files:
sampleName = getSampleName(fasta)

Expand All @@ -277,20 +313,30 @@ def verify_ecoli(fasta_fastq_files_dict, ofiles, args, temp_dir):

if args.verify:
if is_ecoli(fasta, temp_dir):
#ecoli_files.append(f)
ecoli_files_dict[sampleName] = {"species":"Escherichia coli","filepath":fasta,"error":"-"}
#elif is_shigella(f, speciesname, args):
# other_files_dict[sampleName] = {"species": speciesname, "filepath": f}
elif is_escherichia_genus(speciesname):
ecoli_files_dict[sampleName] = {"species":speciesname,"filepath":fasta,"error":"This sample belongs to Escherichia genus so serotyping results might not be accurate"}
else:
if args.refseq:
other_files_dict[sampleName] = {"species":speciesname,"filepath":fasta,"error":"-"}
else:
other_files_dict[sampleName] = {"species":"Non-Ecoli", "error":"Failed E. coli species confirmation based on 10 E.coli specific markers"}
#if args.refseq:
# other_files_dict[sampleName] = {"species":speciesname,"filepath":fasta,"error":"-"}
#else:
other_files_dict[sampleName] = {"species":speciesname, "error":"Failed E.coli species confirmation based on 10 E.coli specific markers"}
else:
#ecoli_files.append(f)
ecoli_files_dict[fasta] = {"species":speciesname,"filepath":fasta}
#Withouth --verify assume all input genomes are E.coli and try to type them regardless of MASH result
if re.match("Escherichia", speciesname):
ecoli_files_dict[sampleName] = {"species":speciesname,"filepath":fasta, "error": "-"}
elif speciesname == "-":
other_files_dict[sampleName] = {"species": speciesname,
"error": "MASH dist species determination was not successful"}

ecoli_files_dict[sampleName]["error"] = "MASH dist species determination was not successful.\n" \
"Try running ECTyper with the --verify switch instead"
else:
other_files_dict[sampleName] = {"species": speciesname, "filepath": fasta, "error": "Non-Ecoli"}


for bf in ofiles:
sampleName = getSampleName(bf)
Expand Down
7 changes: 4 additions & 3 deletions ectyper/subprocess_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
LOG = logging.getLogger(__name__)


def run_subprocess(cmd, input_data=None, un=False):
def run_subprocess(cmd, input_data=None, un=False, ignorereturncode=False):
"""
Run cmd command on subprocess
Expand All @@ -29,12 +29,13 @@ def run_subprocess(cmd, input_data=None, un=False):
stderr=subprocess.PIPE
)

if comp_proc.returncode == 0:

if comp_proc.returncode == 0 or ignorereturncode == True:
elapsed_time = timeit.default_timer() - start_time
LOG.debug("Subprocess {} finished successfully in {:0.3f} sec.".format(cmd, elapsed_time))
return comp_proc
else:
LOG.error("Error in subprocess. The following command failed: {}".format(cmd))
LOG.error("Subprocess failed with error: {}".format(comp_proc.stderr))
LOG.error("Subprocess failed with error: \"{}\"".format(comp_proc.stderr.decode("utf-8")))
LOG.critical("ectyper has stopped")
exit("subprocess failure")
Loading

0 comments on commit a7e67b6

Please sign in to comment.