diff --git a/ectyper/definitions.py b/ectyper/definitions.py index 747dd1e..234c1d5 100644 --- a/ectyper/definitions.py +++ b/ectyper/definitions.py @@ -46,4 +46,6 @@ 'GeneLengths','DatabaseVer','Warnings','Pathotype', 'PathotypeCounts', 'PathotypeGenes', 'PathotypeGeneNames', 'PathotypeAccessions', 'PathotypeAlleleIDs', 'PathotypeIdentities(%)','PathotypeCoverages(%)','PathotypeGeneLengthRatios','PathotypeRuleIDs', 'PathotypeGeneCounts', 'PathoDBVer', 'StxSubtypes','StxAccessions','StxAlleleIDs', 'StxIdentities(%)','StxCoverages(%)','StxLengths', - 'StxContigNames', 'StxContigNum','StxCoordinates'] \ No newline at end of file + 'StxContigNames', 'StxContigNum','StxCoordinates'] +OUTPUT_FILES_LIST = ['blastn_output_alleles.txt', 'blastn_pathotype_alleles_overall.txt', 'mash_output.txt', + 'stx1_allhits_annotated_df.txt', 'stx2_allhits_annotated_df.txt'] \ No newline at end of file diff --git a/ectyper/ectyper.py b/ectyper/ectyper.py index b4b3b66..b4bee16 100644 --- a/ectyper/ectyper.py +++ b/ectyper/ectyper.py @@ -62,7 +62,7 @@ def run_program(): Creates all required files and controls function execution. :return: success or failure """ - + LOG.setLevel(logging.INFO) args = commandLineOptions.parse_command_line() @@ -76,7 +76,7 @@ def run_program(): LOG.setLevel(logging.DEBUG) else: fh.setLevel(logging.INFO) - LOG.setLevel(logging.INFO) + LOG.addHandler(fh) #try to load database @@ -235,7 +235,7 @@ def run_program(): if args.debug == False: shutil.rmtree(temp_dir, ignore_errors=True) - LOG.info("ECTyper has finished successfully.") + LOG.info(f"ECTyper has finished successfully. Results available at {os.path.abspath(args.output)}") def getOantigenHighSimilarGroup(final_predictions, sample): pred_Otypes = final_predictions[sample]['O']["serogroup"].split("/") #if call is a mixed call @@ -263,8 +263,8 @@ def create_output_directory(output_dir): :param output_dir: The user-specified output directory, if any :return: The output directory """ - # If no output directory is specified for the run, create a one based on - # time + # If no output directory is specified for the run, create a one based on time + if output_dir is None: @@ -283,6 +283,13 @@ def create_output_directory(output_dir): if not os.path.exists(out_dir): os.makedirs(out_dir) + + # clean previous ECTyper output files if the directory was used in previous runs + for file in definitions.OUTPUT_FILES_LIST: + path2file = os.path.join(output_dir,file) + if os.path.exists(path2file): + LOG.info(f"Cleaning ECTyper previous files. Removing previously generated {path2file} ...") + os.remove(path2file) return out_dir @@ -398,9 +405,14 @@ def genome_group_prediction(g_group, alleles_fasta, args, temp_dir, ectyperdb_di blast_output_file, ectyperdb_dict, args); - - blast_output_file_path = os.path.join(args.output,"blast_output_alleles.txt") - blast_output_df[sorted(blast_output_df.columns)].to_csv(blast_output_file_path , sep="\t", index=False) - LOG.info("BLAST output file against reference alleles is written at {}".format(blast_output_file_path)) + + blast_output_file_path = os.path.join(args.output,f"blastn_output_alleles.txt") + if os.path.exists(blast_output_file_path) == False: + blast_output_df[sorted(blast_output_df.columns)].to_csv(blast_output_file_path , sep="\t", index=False) + LOG.info("BLAST output file against reference alleles is written at {}".format(blast_output_file_path)) + else: + blast_output_df[sorted(blast_output_df.columns)].to_csv(blast_output_file_path , mode="a", header=False, sep="\t", index=False) + LOG.info("Appending BLAST output file against reference alleles at {}".format(blast_output_file_path)) + return db_prediction_dict diff --git a/ectyper/predictionFunctions.py b/ectyper/predictionFunctions.py index 272bc5b..216a4eb 100644 --- a/ectyper/predictionFunctions.py +++ b/ectyper/predictionFunctions.py @@ -134,7 +134,11 @@ def shiga_toxing_subtyping(pathotype_genes_tmp_df, output_dir, debug): if debug: stx_df_out_filename = f'{gene}_allhits_annotated_df.txt' LOG.debug(f"Wrote {gene} annotated potential hits dataframe to {output_dir}/{stx_df_out_filename}") - stx_toxin_df.to_csv(os.path.join(output_dir,stx_df_out_filename), sep="\t", index=False) + path2stx_df = os.path.join(output_dir,stx_df_out_filename) + if os.path.exists(path2stx_df) == False: + stx_toxin_df.to_csv(path2stx_df, sep="\t", index=False) + else: + stx_toxin_df.to_csv(path2stx_df , mode="a", header=False, sep="\t", index=False) # get top hit for each common gene range. Provide mixed call if >1 hits share the same 'bitscore' stx_subtypes_dict={} for range_id in stx_toxin_df['rangeid'].unique(): @@ -312,7 +316,9 @@ def predict_pathotype_and_shiga_toxin_subtype(ecoli_genome_files_dict, other_gen #write pathotype blastn results if debug == True: LOG.debug(f"Writting overall pathotype BLASTn results to {output_dir}/blastn_pathotype_alleles_overall.txt") - pathotype_genes_overall_df.to_csv(f'{output_dir}/blastn_pathotype_alleles_overall.txt',sep="\t", index=False) + path2pathotype_df = f'{output_dir}/blastn_pathotype_alleles_overall.txt' + pathotype_genes_overall_df.to_csv(path2pathotype_df,sep="\t", index=False) + return predictions_pathotype_dict @@ -361,7 +367,7 @@ def predict_serotype(blast_output_file, ectyper_dict, args): # Make prediction for each genome based on blast output for genome_name, per_genome_df in output_df.groupby('genome_name'): predictions_dict[genome_name] = get_prediction(per_genome_df) - LOG.info("Serotype prediction successfully completed") + LOG.info(f"Serotype prediction successfully completed for {genome_name}") LOG.debug("Predictions dict:\n{}".format(predictions_dict)) return predictions_dict, output_df diff --git a/ectyper/speciesIdentification.py b/ectyper/speciesIdentification.py index 0b74294..4b528e2 100644 --- a/ectyper/speciesIdentification.py +++ b/ectyper/speciesIdentification.py @@ -185,7 +185,7 @@ def get_species(file, args, cores=1): if args.debug: LOG.debug("Wrote MASH against reference sketch results to {}".format(args.output)) - with open(file=args.output+"/mash_output.txt", mode="w") as fp: + with open(file=args.output+"/mash_output.txt", mode="a") as fp: fp.write(sort_output.stdout.decode("utf-8")) fp.close()