Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CAT summary into the global bin_summary #562

Merged
merged 12 commits into from
Feb 1, 2024
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### `Added`

- [#562](https://github.com/nf-core/mag/pull/562) - Add CAT summary into the global bin_summary (by @maxibor)

### `Changed`

### `Fixed`
Expand Down
45 changes: 44 additions & 1 deletion bin/combine_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def parse_args(args=None):
parser.add_argument("-c", "--checkm_summary", metavar="FILE", help="CheckM summary file.")
parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.")
parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.")

parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.")
parser.add_argument(
"-o",
"--out",
Expand All @@ -25,6 +25,43 @@ def parse_args(args=None):
return parser.parse_args(args)


def parse_cat_table(cat_table):
    """Parse a CAT table into a DataFrame with a single merged rank column.

    The CAT table has a variable number of tab-separated columns per row,
    depending on how many ranks are reported for each taxonomic assignment.
    The file is therefore scanned once to find the widest data row, then read
    with enough synthetic ``rank_<i>`` column names to hold every field.
    Finally the ``rank_<i>`` columns are joined with ``;`` into one
    ``CAT_rank`` column and removed.

    Args:
        cat_table (str): Path to the CAT table. The first line is treated as
            a header and skipped.

    Returns:
        pd.DataFrame: Parsed CAT table with the six fixed columns (``bin``,
        ``classification``, ``reason``, ``lineage``, ``lineage scores``,
        ``full lineage names``) followed by ``CAT_rank``.
    """
    header = ["bin", "classification", "reason", "lineage", "lineage scores", "full lineage names"]

    # First pass: find the widest data row so every field gets a column name.
    with open(cat_table, "r") as f:
        next(f, None)  # skip the header line
        maxcol = max((len(line.split("\t")) for line in f), default=0)

    # range() of a non-positive number is empty, so narrow tables get no rank columns.
    rank_columns = [f"rank_{i}" for i in range(maxcol - len(header))]

    df = pd.read_table(
        cat_table,
        names=header + rank_columns,
        # Keep rank fields as text: a column holding only numbers would otherwise
        # be parsed as int/float and break the string join below.
        dtype={column: str for column in rank_columns},
        on_bad_lines="warn",
        header=None,
        skiprows=1,
    )

    if rank_columns:
        # Merge all rank columns into a single column, ignoring the padding NaNs
        # of rows that have fewer fields than the widest row.
        df["CAT_rank"] = df[rank_columns].apply(lambda row: ";".join(row.dropna()), axis=1).str.lstrip()
    else:
        # No extra rank fields anywhere in the table.
        df["CAT_rank"] = ""

    # Remove the now-redundant rank_* columns.
    return df.drop(columns=rank_columns)


def main(args=None):
args = parse_args(args)

Expand Down Expand Up @@ -93,6 +130,12 @@ def main(args=None):
results, gtdbtk_results, left_on="bin", right_on="user_genome", how="outer"
) # assuming depths for all bins are given

if args.cat_summary:
cat_results = parse_cat_table(args.cat_summary)
if len(set(cat_results["bin"].to_list()).difference(set(bins))) > 0:
sys.exit("Bins in CAT summary do not match bins in bin depths summary!")
results = pd.merge(results, cat_results[["bin", "CAT_rank"]], left_on="bin", right_on="bin", how="outer")

results.to_csv(args.out, sep="\t")


Expand Down
2 changes: 1 addition & 1 deletion modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@
},
"gtdbtk/classifywf": {
"branch": "master",
"git_sha": "898259a38563f29c3c5d2490876019ec2d6f49c5",
"git_sha": "9bbc6a88ce3004ae4bc9f84cef762484dc2c95e5",
"installed_by": ["modules"]
},
"gunc/downloaddb": {
Expand Down
3 changes: 3 additions & 0 deletions modules/local/bin_summary.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ process BIN_SUMMARY {
path(checkm_sum)
path(quast_sum)
path(gtdbtk_sum)
path(cat_sum)

output:
path("bin_summary.tsv"), emit: summary
Expand All @@ -21,12 +22,14 @@ process BIN_SUMMARY {
def checkm_summary = checkm_sum.sort().size() > 0 ? "--checkm_summary ${checkm_sum}" : ""
def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : ""
def gtdbtk_summary = gtdbtk_sum.sort().size() > 0 ? "--gtdbtk_summary ${gtdbtk_sum}" : ""
def cat_summary = cat_sum.sort().size() > 0 ? "--cat_summary ${cat_sum}" : ""
"""
combine_tables.py --depths_summary ${bin_depths} \
$busco_summary \
$checkm_summary \
$quast_summary \
$gtdbtk_summary \
$cat_summary \
--out bin_summary.tsv

cat <<-END_VERSIONS > versions.yml
Expand Down
7 changes: 7 additions & 0 deletions modules/nf-core/gtdbtk/classifywf/environment.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions modules/nf-core/gtdbtk/classifywf/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions modules/nf-core/gtdbtk/classifywf/meta.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 9 additions & 1 deletion workflows/mag.nf
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,13 @@ workflow MAG {
ch_versions = ch_versions.mix(CAT.out.versions.first())
ch_versions = ch_versions.mix(CAT_SUMMARY.out.versions)

// If CAT is not run, then the CAT global summary should be an empty channel
if ( params.cat_db_generate || params.cat_db) {
ch_cat_global_summary = CAT_SUMMARY.out.summary
} else {
ch_cat_global_summary = Channel.empty()
}

/*
* GTDB-tk: taxonomic classifications using GTDB reference
*/
Expand Down Expand Up @@ -992,7 +999,8 @@ workflow MAG {
ch_busco_summary.ifEmpty([]),
ch_checkm_summary.ifEmpty([]),
ch_quast_bins_summary.ifEmpty([]),
ch_gtdbtk_summary.ifEmpty([])
ch_gtdbtk_summary.ifEmpty([]),
ch_cat_global_summary.ifEmpty([])
)
}

Expand Down
Loading