MultiQC-Additional overview stat table #134

Merged (14 commits) on Sep 30, 2024

Changes from 2 commits
11 changes: 11 additions & 0 deletions assets/multiqc_config.yml
@@ -14,6 +14,17 @@ report_section_order:
"nf-core-metapep-summary":
order: -1002

custom_data:
summary_stats:
file_format: "tsv"
section_name: "Overall Summary"
description: "Summary of the number of proteins, peptides, unique peptides, and binders per condition, as well as binders per allele per condition. The last row corresponds to the total across all conditions."
plot_type: "table"

sp:
summary_stats:
fn: "stats.tsv"

export_plots: true

disable_version_detection: true
90 changes: 67 additions & 23 deletions bin/collect_stats.py
@@ -32,10 +32,34 @@ def parse_args(args=None):
type=str,
required=True,
)
parser.add_argument("-c", "--conditions", help="Path to the conditions input file", type=str, required=True)
parser.add_argument(
"-c",
"--conditions",
help="Path to the conditions input file",
type=str,
required=True)
parser.add_argument(
"-a",
"--alleles",
help="Path to the allele input file",
type=str,
required=True)
parser.add_argument(
"-pr",
"--predictions",
help="Path to the predictions input file",
type=str,
required=True)
parser.add_argument(
"-bt",
"--binder_threshold",
help="Score threshold to call binder",
type=float,
default=0.5,
required=True)

# OUTPUT FILES
parser.add_argument("-o", "--outfile", help="Path to the output file", type=argparse.FileType("w"), required=True)
parser.add_argument("-o", "--outfile", help="Path to the output file", type=str, required=True)

return parser.parse_args()

@@ -63,26 +87,35 @@ def main(args=None):
microbiomes_entities_occs["entity_id"] = pd.to_numeric(microbiomes_entities_occs["entity_id"], downcast="unsigned")

conditions = pd.read_csv(args.conditions, usecols=["condition_name", "microbiome_id"], sep="\t")
# TODO "condition_name" as "category" dtype?
# (first try: memory went up, check how to use properly)
conditions["microbiome_id"] = pd.to_numeric(conditions["microbiome_id"], downcast="unsigned")

predictions = pd.read_csv(args.predictions, usecols=["peptide_id", "prediction_score", "allele_id"], sep="\t")
predictions["peptide_id"] = pd.to_numeric(predictions["peptide_id"], downcast="unsigned")
predictions["allele_id"] = pd.to_numeric(predictions["allele_id"], downcast="unsigned")
predictions["prediction_score"] = pd.to_numeric(predictions["prediction_score"], downcast="float")

alleles = pd.read_csv(args.alleles, usecols=["allele_id", "allele_name"], sep="\t").set_index("allele_id")
allele_names = alleles["allele_name"].to_dict()

# Pivot the predictions into wide format: one prediction_score_allele_<id> column per allele, indexed by peptide_id
predictions = pd.concat(
    [
        df.set_index("peptide_id")
        .rename(columns={"prediction_score": f"prediction_score_allele_{allele_id}"})
        .drop("allele_id", axis=1)
        for allele_id, df in predictions.groupby("allele_id")
    ],
    join="outer",
    axis=1,
)
best_scored_peptides = predictions.agg("max", axis=1).rename("best_prediction_score")
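# For illustration (hypothetical values): the groupby/concat above turns the long table
#   peptide_id  allele_id  prediction_score
#   1           0          0.7
#   1           1          0.2
# into one row per peptide with one score column per allele:
#   peptide_id  prediction_score_allele_0  prediction_score_allele_1
#   1           0.7                        0.2
# and best_scored_peptides then holds the row-wise maximum (here 0.7).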

stat_table = []

# Process data condition-wise to reduce memory usage
for condition_name in conditions["condition_name"]:
print("Process condition:", condition_name, flush=True)
print("Condition name:", condition_name, file=args.outfile, sep="\t", flush=True)

conditions_proteins = (
conditions[conditions.condition_name == condition_name]
.merge(microbiomes_entities_occs)
.drop(columns="microbiome_id")
.merge(entities_proteins_occs)
.drop(columns="entity_id")
.drop_duplicates()
)

# condition_name, unique_proteins
unique_protein_count = len(conditions_proteins[["condition_name", "protein_id"]].drop_duplicates())
print("Unique proteins:", unique_protein_count, file=args.outfile, sep="\t", flush=True)
unique_protein_count = len(conditions_proteins[["condition_name", "protein_id"]])

# condition_name, peptide_id, condition_peptide_count
conditions_peptides = (
@@ -95,26 +128,37 @@ def main(args=None):

# condition_name, total_peptide_count
total_peptide_count = sum(conditions_peptides["condition_peptide_count"])
print("Total peptides:", total_peptide_count, file=args.outfile, sep="\t", flush=True)

# condition_name, unique_peptide_count
unique_peptide_count = len(conditions_peptides)
print("Unique peptides:", unique_peptide_count, file=args.outfile, sep="\t", flush=True)
print(file=args.outfile)

# unique peptides across all conditions
all_conditions_unique_peptide_counts = len(protein_peptide_occs["peptide_id"].drop_duplicates())
print(file=args.outfile)
print(
"Unique peptides across all conditions:",
all_conditions_unique_peptide_counts,
file=args.outfile,
sep="\t",
flush=True,
)

print("Done!", flush=True)
# peptide_id, condition_name, condition_peptide_count, best_prediction_score, prediction_score_allele_0, prediction_score_allele_1, ..., prediction_score_allele_n
conditions_peptides = conditions_peptides.set_index("peptide_id").join(best_scored_peptides).join(predictions)
Member:

I am not sure I understood all the details of the new code, but since it causes quite some memory usage, I have a few questions/suggestions, as I am wondering if you really need that many joins:

  • conditions_peptides: if the resulting df becomes too large, you could drop the condition_name (it is the same within this context anyway) and count before joining to the predictions
  • filter predictions directly after reading for entries that are above the threshold, and drop the prediction_score column to reduce the size
  • merge/join for each allele to get only the peptides that are "binders" and count them (see the sketch after this comment)

This brings me to the next question: you just provide the count of unique binders, right? Maybe that should be stated more clearly somewhere.
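
A minimal sketch of the suggested early filtering (hypothetical integration; it assumes the column names used elsewhere in this script and a condition_peptides df holding the peptide_ids of the current condition):

    import pandas as pd

    # Filter right after reading: keep only entries at or above the binder
    # threshold, then drop the score column so only (peptide_id, allele_id)
    # pairs remain in memory.
    predictions = pd.read_csv(
        args.predictions, usecols=["peptide_id", "prediction_score", "allele_id"], sep="\t"
    )
    binders = predictions[predictions["prediction_score"] >= args.binder_threshold]
    binders = binders.drop(columns="prediction_score")

    # Per-allele binder counts for one condition then reduce to a merge plus a count.
    for allele_id, allele_binders in binders.groupby("allele_id"):
        n_binders = condition_peptides.merge(allele_binders, on="peptide_id")["peptide_id"].nunique()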

Collaborator Author:

I managed to reduce the memory usage to around 50 GB, which is only 10 GB more than the previous version that did not include the peptide predictions!


# number of peptides whose best prediction score over all alleles reaches the binder threshold
num_binder = len(conditions_peptides[conditions_peptides["best_prediction_score"] >= args.binder_threshold])

# collect all info into one table row per condition
row_temp = {
    "Condition name": condition_name,
    "Unique proteins": unique_protein_count,
    "Total peptides": total_peptide_count,
    "Unique peptides": unique_peptide_count,
    "# Binders (any allele)": num_binder,
}
for col in predictions.columns:
allele = allele_names[int(col.split('_')[-1])]
row_temp[f"# Binders for allele {allele}"] = sum(conditions_peptides[col]>=args.binder_threshold)

stat_table.append(row_temp)

# collect info across all conditions
all_conditions_unique_protein_count = len(protein_peptide_occs["protein_id"].drop_duplicates())
all_conditions_total_peptide_counts = sum(protein_peptide_occs.groupby("peptide_id")["count"].sum())
all_conditions_unique_peptide_counts = len(protein_peptide_occs["peptide_id"].drop_duplicates())
all_conditions_unique_binder_counts = sum(best_scored_peptides >= args.binder_threshold)

row_temp = {"Condition name":"total", "Unique proteins":all_conditions_unique_protein_count, "Total peptides":all_conditions_total_peptide_counts, "Unique peptides":all_conditions_unique_peptide_counts, "# Binder (any allele)": all_conditions_unique_binder_counts}
for col in predictions.columns:
allele = allele_names[int(col.split('_')[-1])]
row_temp[f"# Binders for allele {allele}"] = sum(predictions[col]>=args.binder_threshold)

stat_table.append(row_temp)
pd.DataFrame(stat_table).to_csv(args.outfile, sep="\t", index=False)
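# The resulting stats.tsv contains one row per condition plus a final "total" row,
# with columns: Condition name, Unique proteins, Total peptides, Unique peptides,
# # Binders (any allele), and one "# Binders for allele <name>" column per allele.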

if __name__ == "__main__":
sys.exit(main())
20 changes: 13 additions & 7 deletions modules/local/collect_stats.nf
@@ -12,24 +12,30 @@ process COLLECT_STATS {
path(entities_proteins )
path(microbiomes_entities)
path(conditions )
path(alleles )
path(predictions )

output:
path "stats.txt", emit: ch_stats
path "stats.tsv", emit: ch_stats
path "versions.yml", emit: versions


script:
// Select the score threshold matching the chosen prediction method
def score_threshold = params.pred_method != "SYFPEITHI" ? params.mhcflurry_mhcnuggets_score_threshold : params.syfpeithi_score_threshold
"""
collect_stats.py --protein-peptide-occ "$proteins_peptides" \\
--entities-proteins-occ "$entities_proteins" \\
--microbiomes-entities-occ "$microbiomes_entities" \\
--conditions "$conditions" \\
--outfile stats.txt
collect_stats.py --protein-peptide-occ "$proteins_peptides" \\
--entities-proteins-occ "$entities_proteins" \\
--microbiomes-entities-occ "$microbiomes_entities" \\
--conditions "$conditions" \\
--alleles "$alleles" \\
--predictions "$predictions" \\
--binder_threshold "$score_threshold" \\
--outfile "stats.tsv"

cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version | sed 's/Python //g')
pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)")
END_VERSIONS
"""

}
30 changes: 17 additions & 13 deletions workflows/metapep.nf
@@ -136,19 +136,6 @@ workflow METAPEP {
)
ch_versions = ch_versions.mix(GENERATE_PEPTIDES.out.versions)

//
// MODULE: Collect stats
//

// Collects proteins, peptides, unique peptides per condition
COLLECT_STATS (
GENERATE_PEPTIDES.out.ch_proteins_peptides,
GENERATE_PROTEIN_AND_ENTITY_IDS.out.ch_entities_proteins,
FINALIZE_MICROBIOME_ENTITIES.out.ch_microbiomes_entities,
PROCESS_INPUT.out.ch_conditions
)
ch_versions = ch_versions.mix(COLLECT_STATS.out.versions)

//
// MODULE: Split prediction tasks into chunks
//
@@ -221,6 +208,21 @@ workflow METAPEP {
)
ch_versions = ch_versions.mix(MERGE_PREDICTIONS.out.versions)

//
// MODULE: Collect stats
//

// Collects proteins, peptides, unique peptides, and binders per condition; needs the merged predictions, hence runs after MERGE_PREDICTIONS
COLLECT_STATS (
GENERATE_PEPTIDES.out.ch_proteins_peptides,
GENERATE_PROTEIN_AND_ENTITY_IDS.out.ch_entities_proteins,
FINALIZE_MICROBIOME_ENTITIES.out.ch_microbiomes_entities,
PROCESS_INPUT.out.ch_conditions,
PROCESS_INPUT.out.ch_alleles,
MERGE_PREDICTIONS.out.ch_predictions
)
ch_versions = ch_versions.mix(COLLECT_STATS.out.versions)

//
// MODULE: Plot score distributions
//
@@ -262,6 +264,7 @@ workflow METAPEP {
)
ch_versions = ch_versions.mix(PLOT_ENTITY_BINDING_RATIOS.out.versions)
}

//
// Collate and save software versions
//
@@ -306,6 +309,7 @@ workflow METAPEP {
sort: true
)
)
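// Add the collected stats table to the MultiQC input, where the custom_data
// section in assets/multiqc_config.yml renders it as the "Overall Summary" table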
ch_multiqc_files = ch_multiqc_files.mix(COLLECT_STATS.out.ch_stats)

MULTIQC (
ch_multiqc_files.collect(),