Skip to content

Commit

Permalink
Blackify
Browse files Browse the repository at this point in the history
  • Loading branch information
jfy133 committed Oct 6, 2022
1 parent 446769f commit 8fbded0
Show file tree
Hide file tree
Showing 9 changed files with 486 additions and 227 deletions.
46 changes: 31 additions & 15 deletions bin/combine_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,22 @@
import os.path
import pandas as pd


def parse_args(args=None):
    """Parse command line arguments for combining per-bin summary tables.

    Args:
        args: Optional list of argument strings (defaults to sys.argv).

    Returns:
        argparse.Namespace with depths_summary, busco_summary, quast_summary,
        gtdbtk_summary (paths or None) and out (an opened, writable file handle).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--depths_summary", required=True, metavar="FILE", help="Bin depths summary file.")
    parser.add_argument("-b", "--busco_summary", metavar="FILE", help="BUSCO summary file.")
    parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.")
    parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.")

    parser.add_argument(
        "-o",
        "--out",
        required=True,
        metavar="FILE",
        type=argparse.FileType("w"),
        help="Output file containing final summary.",
    )
    return parser.parse_args(args)


def main(args=None):
    """Merge bin depths with optional BUSCO, QUAST and GTDB-Tk summaries.

    Reads the bin depths TSV, verifies each optional summary covers the same
    set of bins, outer-merges everything on the bin name, and writes the
    combined table to the output handle as TSV. Exits with an error message
    if any summary's bins disagree with the depths summary.
    """
    args = parse_args(args)

    # handle bin depths
    results = pd.read_csv(args.depths_summary, sep="\t")
    results.columns = ["Depth " + str(col) if col != "bin" else col for col in results.columns]
    bins = results["bin"].sort_values().reset_index(drop=True)

    if args.busco_summary:
        busco_results = pd.read_csv(args.busco_summary, sep="\t")
        if not bins.equals(busco_results["GenomeBin"].sort_values().reset_index(drop=True)):
            sys.exit("Bins in BUSCO summary do not match bins in bin depths summary!")
        results = pd.merge(
            results, busco_results, left_on="bin", right_on="GenomeBin", how="outer"
        )  # assuming depths for all bins are given

    if args.quast_summary:
        quast_results = pd.read_csv(args.quast_summary, sep="\t")
        if not bins.equals(quast_results["Assembly"].sort_values().reset_index(drop=True)):
            sys.exit("Bins in QUAST summary do not match bins in bin depths summary!")
        results = pd.merge(
            results, quast_results, left_on="bin", right_on="Assembly", how="outer"
        )  # assuming depths for all bins are given

    if args.gtdbtk_summary:
        gtdbtk_results = pd.read_csv(args.gtdbtk_summary, sep="\t")
        if not bins.equals(gtdbtk_results["user_genome"].sort_values().reset_index(drop=True)):
            sys.exit(
                "Bins in GTDB-Tk summary do not match bins in BUSCO summary!"
            )  # GTDB-Tk can currently anyway only run in combination with BUSCO
        results = pd.merge(
            results, gtdbtk_results, left_on="GenomeBin", right_on="user_genome", how="outer"
        )  # assuming BUSCO summary must be given

    results.to_csv(args.out, sep="\t")


if __name__ == "__main__":
Expand Down
23 changes: 4 additions & 19 deletions bin/filter_ssu.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,25 +28,10 @@ def filter(args):


def main():
parser = argparse.ArgumentParser(
prog="filter_ssu.py",
usage="filter ssu hits from refinem"
)
parser.add_argument(
"--evalue",
help="evalue threshold"
)
parser.add_argument(
"ssu",
metavar="ssu.tsv",
help="ssu tsv file generated by refinem"
)
parser.add_argument(
"output",
metavar="output.tsv",
default="output.tsv",
help="output file name"
)
parser = argparse.ArgumentParser(prog="filter_ssu.py", usage="filter ssu hits from refinem")
parser.add_argument("--evalue", help="evalue threshold")
parser.add_argument("ssu", metavar="ssu.tsv", help="ssu tsv file generated by refinem")
parser.add_argument("output", metavar="output.tsv", default="output.tsv", help="output file name")
parser.set_defaults(func=filter)
args = parser.parse_args()

Expand Down
54 changes: 35 additions & 19 deletions bin/get_mag_depths.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,61 +13,77 @@

def parse_args(args=None):
    """Parse command line arguments for per-bin depth computation.

    Args:
        args: Optional list of argument strings (defaults to sys.argv).

    Returns:
        argparse.Namespace with bins (list of FASTA paths), depths (TSV path),
        assembler, id and binner strings.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-b", "--bins", required=True, nargs="+", metavar="FILE", help="Bins: FASTA containing all contigs."
    )
    parser.add_argument(
        "-d",
        "--depths",
        required=True,
        metavar="FILE",
        help="(Compressed) TSV file containing contig depths for each sample: contigName, contigLen, totalAvgDepth, sample1_avgDepth, sample1_var [, sample2_avgDepth, sample2_var, ...].",
    )
    parser.add_argument("-a", "--assembler", required=True, type=str, help="Assembler name.")
    parser.add_argument("-i", "--id", required=True, type=str, help="Sample or group id.")
    parser.add_argument("-m", "--binner", required=True, type=str, help="Binning method.")
    return parser.parse_args(args)


# Processing contig depths for each binner again, i.e. not the most efficient way, but ok


def main(args=None):
    """Compute per-sample median contig depth for each bin and write a TSV summary.

    Reads the (gzipped) contig-depths table once into a dict keyed by contig
    name, then for every bin FASTA collects the depths of its contigs and
    writes one row per bin (median depth per sample) to
    "<assembler>-<binner>-<id>-binDepths.tsv".
    """
    args = parse_args(args)

    # load contig depths for all samples into dict (could use pandas as well)
    sample_names = []
    dict_contig_depths = {}
    with gzip.open(args.depths, "rt") as infile:
        reader = csv.reader(infile, delimiter="\t")
        # process header: columns are contigName, contigLen, totalAvgDepth,
        # then (avgDepth, var) pairs per sample
        header = next(reader)
        for sample in range(int((len(header) - 3) / 2)):
            col_name = header[3 + 2 * sample]
            # retrieve sample name: "<assembler>-<id>-<other sample_name>.bam"
            sample_name = col_name[len(args.assembler) + 1 + len(args.id) + 1 : -4]
            sample_names.append(sample_name)
        # process contig depths
        for row in reader:
            contig_depths = []
            for sample in range(int((len(row) - 3) / 2)):
                contig_depths.append(float(row[3 + 2 * sample]))
            dict_contig_depths[str(row[0])] = contig_depths

    # Initialize output file with header row
    n_samples = len(sample_names)
    out_name = args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv"
    with open(out_name, "w") as outfile:
        print("bin", "\t".join(sample_names), sep="\t", file=outfile)

    # for each bin, access contig depths and compute median bin depth (for all samples)
    for bin_file in args.bins:
        all_depths = [[] for _ in range(n_samples)]

        # handle gzip-compressed and plain FASTA with one code path
        opener = gzip.open if bin_file.endswith(".gz") else open
        with opener(bin_file, "rt") as infile:
            for rec in SeqIO.parse(infile, "fasta"):
                contig_depths = dict_contig_depths[rec.id]
                for sample in range(n_samples):
                    all_depths[sample].append(contig_depths[sample])

        binname = os.path.basename(bin_file)
        with open(out_name, "a") as outfile:
            print(
                binname,
                "\t".join(str(statistics.median(sample_depths)) for sample_depths in all_depths),
                sep="\t",
                file=outfile,
            )


if __name__ == "__main__":
Expand Down
24 changes: 20 additions & 4 deletions bin/get_mag_depths_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,35 @@

def parse_args(args=None):
    """Parse command line arguments for the bin-depths summary script.

    Args:
        args: Optional list of argument strings (defaults to sys.argv).

    Returns:
        argparse.Namespace with depths (list of TSV paths) and out (an opened,
        writable file handle).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-d",
        "--depths",
        required=True,
        nargs="+",
        metavar="FILE",
        help="TSV file for each assembly and binning method containing bin depths for samples: bin, sample1, ....",
    )
    parser.add_argument(
        "-o",
        "--out",
        required=True,
        metavar="FILE",
        type=argparse.FileType("w"),
        help="Output file containing depths for all assemblies, binning methods and all samples.",
    )
    return parser.parse_args(args)


def main(args=None):
    """Concatenate per-assembly/binner bin-depth tables into one summary TSV.

    Each input TSV is indexed by bin name; verify_integrity ensures no bin
    name appears in more than one input table.
    """
    args = parse_args(args)

    # Collect all tables and concatenate once. DataFrame.append was
    # deprecated in pandas 1.4 and removed in 2.0; pd.concat with
    # sort=True / verify_integrity=True preserves the old behaviour.
    frames = [pd.read_csv(path, index_col="bin", sep="\t") for path in args.depths]
    # -d is required with nargs="+", so frames is never empty here
    results = pd.concat(frames, sort=True, verify_integrity=True)

    results.to_csv(args.out, sep="\t")

if __name__ == "__main__":
sys.exit(main())
Loading

0 comments on commit 8fbded0

Please sign in to comment.