Merge pull request #562 from maxibor/add_cat_2_summary

Add CAT summary into the global `bin_summary`
nf-core · Feb 1, 2024 · 15650d8 · 15650d8
2 parents eb97cbd + 212d079
commit 15650d8
Show file tree

Hide file tree

Showing 8 changed files with 71 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Added`
 
+- [#562](https://github.com/nf-core/mag/pull/562) - Add CAT summary into the global bin_summary (by @maxibor)
+
 ### `Changed`
 
 ### `Fixed`

diff --git a/bin/combine_tables.py b/bin/combine_tables.py
@@ -13,7 +13,7 @@ def parse_args(args=None):
     parser.add_argument("-c", "--checkm_summary", metavar="FILE", help="CheckM summary file.")
     parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.")
     parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.")
-
+    parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.")
     parser.add_argument(
         "-o",
         "--out",
@@ -25,6 +25,43 @@ def parse_args(args=None):
     return parser.parse_args(args)
 
 
+def parse_cat_table(cat_table):
+    """Parse a CAT table into a DataFrame with one merged rank column.
+
+    The CAT table has a variable number of tab-separated columns, because each row carries one
+    extra field per rank reported for its taxonomic assignment. The file is therefore scanned once
+    to find the widest data row, read with placeholder `rank_<i>` names for every field beyond the
+    six fixed columns, and those placeholder columns are then joined with ";" into `CAT_rank`.
+
+    Args:
+        cat_table (str): Path to CAT table
+
+    Returns:
+        pd.DataFrame: parsed CAT table, holding the six fixed columns plus `CAT_rank`
+    """
+    with open(cat_table, "r") as f:
+        next(f)  # skip header
+        # the widest data row decides how many placeholder rank columns are needed
+        maxcol = max((len(line.split("\t")) for line in f), default=0)
+
+    header = ["bin", "classification", "reason", "lineage", "lineage scores", "full lineage names"]
+    rank_cols = [f"rank_{i}" for i in range(maxcol - len(header))]
+
+    df = pd.read_table(
+        cat_table,
+        names=header + rank_cols,
+        on_bad_lines="warn",
+        header=None,
+        skiprows=1,
+    )
+    # merge all rank columns into a single column; they are selected by the names generated above,
+    # so no regex over the column labels is needed
+    df["CAT_rank"] = df[rank_cols].apply(lambda x: ";".join(x.dropna()), axis=1).str.lstrip()
+
+    # drop the placeholder rank_* columns now that they are merged
+    return df.drop(columns=rank_cols)
+
+
 def main(args=None):
     args = parse_args(args)
 
@@ -93,6 +130,12 @@ def main(args=None):
             results, gtdbtk_results, left_on="bin", right_on="user_genome", how="outer"
         )  # assuming depths for all bins are given
 
+    if args.cat_summary:
+        cat_results = parse_cat_table(args.cat_summary)
+        if len(set(cat_results["bin"].to_list()).difference(set(bins))) > 0:
+            sys.exit("Bins in CAT summary do not match bins in bin depths summary!")
+        results = pd.merge(results, cat_results[["bin", "CAT_rank"]], left_on="bin", right_on="bin", how="outer")
+
     results.to_csv(args.out, sep="\t")
 
 

diff --git a/modules.json b/modules.json
@@ -118,7 +118,7 @@
                     },
                     "gtdbtk/classifywf": {
                         "branch": "master",
-                        "git_sha": "898259a38563f29c3c5d2490876019ec2d6f49c5",
+                        "git_sha": "9bbc6a88ce3004ae4bc9f84cef762484dc2c95e5",
                         "installed_by": ["modules"]
                     },
                     "gunc/downloaddb": {

diff --git a/modules/local/bin_summary.nf b/modules/local/bin_summary.nf
@@ -11,6 +11,7 @@ process BIN_SUMMARY {
     path(checkm_sum)
     path(quast_sum)
     path(gtdbtk_sum)
+    path(cat_sum)
 
     output:
     path("bin_summary.tsv"), emit: summary
@@ -21,12 +22,14 @@ process BIN_SUMMARY {
     def checkm_summary = checkm_sum.sort().size() > 0 ?  "--checkm_summary ${checkm_sum}" : ""
     def quast_summary  = quast_sum.sort().size() > 0 ?  "--quast_summary ${quast_sum}" : ""
     def gtdbtk_summary = gtdbtk_sum.sort().size() > 0 ? "--gtdbtk_summary ${gtdbtk_sum}" : ""
+    def cat_summary    = cat_sum.sort().size() > 0 ?    "--cat_summary ${cat_sum}" : ""
     """
     combine_tables.py --depths_summary ${bin_depths} \
                     $busco_summary \
                     $checkm_summary \
                     $quast_summary \
                     $gtdbtk_summary \
+                    $cat_summary \
                     --out bin_summary.tsv
 
     cat <<-END_VERSIONS > versions.yml

diff --git a/modules/nf-core/gtdbtk/classifywf/environment.yml b/modules/nf-core/gtdbtk/classifywf/environment.yml
diff --git a/modules/nf-core/gtdbtk/classifywf/main.nf b/modules/nf-core/gtdbtk/classifywf/main.nf
diff --git a/modules/nf-core/gtdbtk/classifywf/meta.yml b/modules/nf-core/gtdbtk/classifywf/meta.yml
diff --git a/workflows/mag.nf b/workflows/mag.nf
@@ -958,6 +958,13 @@ workflow MAG {
         ch_versions = ch_versions.mix(CAT.out.versions.first())
         ch_versions = ch_versions.mix(CAT_SUMMARY.out.versions)
 
+        // If CAT is not run, then the CAT global summary should be an empty channel
+        if ( params.cat_db_generate || params.cat_db) {
+            ch_cat_global_summary = CAT_SUMMARY.out.summary
+        } else {
+            ch_cat_global_summary = Channel.empty()
+        }
+
         /*
          * GTDB-tk: taxonomic classifications using GTDB reference
          */
@@ -992,7 +999,8 @@ workflow MAG {
                 ch_busco_summary.ifEmpty([]),
                 ch_checkm_summary.ifEmpty([]),
                 ch_quast_bins_summary.ifEmpty([]),
-                ch_gtdbtk_summary.ifEmpty([])
+                ch_gtdbtk_summary.ifEmpty([]),
+                ch_cat_global_summary.ifEmpty([])
             )
         }