MultiQC-Additional overview stat table #134

Merged (14 commits) on Sep 30, 2024

Changes from 2 commits
11 changes: 11 additions & 0 deletions assets/multiqc_config.yml
@@ -14,6 +14,17 @@ report_section_order:
"nf-core-metapep-summary":
order: -1002

custom_data:
summary_stats:
file_format: "tsv"
section_name: "Overall Summary"
description: "Summary of the number of proteins, peptides, unique peptides, and binders per condition, as well as binders per allele per condition. The last row corresponds to the total across all conditions."
plot_type: "table"

sp:
summary_stats:
fn: "stats.tsv"

export_plots: true

disable_version_detection: true
90 changes: 67 additions & 23 deletions bin/collect_stats.py
@@ -32,10 +32,34 @@ def parse_args(args=None):
type=str,
required=True,
)
parser.add_argument("-c", "--conditions", help="Path to the conditions input file", type=str, required=True)
parser.add_argument(
"-c",
"--conditions",
help="Path to the conditions input file",
type=str,
required=True)
parser.add_argument(
"-a",
"--alleles",
help="Path to the allele input file",
type=str,
required=True)
parser.add_argument(
"-pr",
"--predictions",
help="Path to the predictions input file",
type=str,
required=True)
parser.add_argument(
"-bt",
"--binder_threshold",
help="Score threshold to call binder",
type=float,
default=0.5,
required=True)

# OUTPUT FILES
parser.add_argument("-o", "--outfile", help="Path to the output file", type=argparse.FileType("w"), required=True)
parser.add_argument("-o", "--outfile", help="Path to the output file", type=str, required=True)

return parser.parse_args()

@@ -63,26 +87,35 @@ def main(args=None):
microbiomes_entities_occs["entity_id"] = pd.to_numeric(microbiomes_entities_occs["entity_id"], downcast="unsigned")

conditions = pd.read_csv(args.conditions, usecols=["condition_name", "microbiome_id"], sep="\t")
# TODO "condition_name" as "category" dtype?
# (first try: memory went up, check how to use properly)
conditions["microbiome_id"] = pd.to_numeric(conditions["microbiome_id"], downcast="unsigned")

predictions = pd.read_csv(args.predictions, usecols=["peptide_id", "prediction_score", "allele_id"], sep="\t")
predictions["peptide_id"] = pd.to_numeric(predictions["peptide_id"], downcast="unsigned")
predictions["allele_id"] = pd.to_numeric(predictions["allele_id"], downcast="unsigned")
predictions["prediction_score"] = pd.to_numeric(predictions["prediction_score"], downcast="float")

alleles = pd.read_csv(args.alleles, usecols=["allele_id", "allele_name"], sep="\t").set_index("allele_id")
allele_names = alleles["allele_name"].to_dict()

# Pivot the predictions into wide format: one prediction_score_allele_<id> column per allele, indexed by peptide_id
predictions = pd.concat(
    [
        df.set_index("peptide_id")
        .rename(columns={"prediction_score": f"prediction_score_allele_{allele_id}"})
        .drop("allele_id", axis=1)
        for allele_id, df in predictions.groupby("allele_id")
    ],
    join="outer",
    axis=1,
)
best_scored_peptides = predictions.agg("max", axis=1).rename("best_prediction_score")
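# For illustration (hypothetical values): the groupby/concat above turns the long table
#   peptide_id  allele_id  prediction_score
#   1           0          0.7
#   1           1          0.2
# into one row per peptide with one score column per allele:
#   peptide_id  prediction_score_allele_0  prediction_score_allele_1
#   1           0.7                        0.2
# and best_scored_peptides then holds the row-wise maximum (here 0.7).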

stat_table = []

# Process data condition-wise to reduce memory usage
for condition_name in conditions["condition_name"]:
print("Process condition:", condition_name, flush=True)
print("Condition name:", condition_name, file=args.outfile, sep="\t", flush=True)

conditions_proteins = (
conditions[conditions.condition_name == condition_name]
.merge(microbiomes_entities_occs)
.drop(columns="microbiome_id")
.merge(entities_proteins_occs)
.drop(columns="entity_id")
.drop_duplicates()
)

# condition_name, unique_proteins
unique_protein_count = len(conditions_proteins[["condition_name", "protein_id"]].drop_duplicates())
print("Unique proteins:", unique_protein_count, file=args.outfile, sep="\t", flush=True)
unique_protein_count = len(conditions_proteins[["condition_name", "protein_id"]])

# condition_name, peptide_id, condition_peptide_count
conditions_peptides = (
@@ -95,26 +128,37 @@ def main(args=None):

# condition_name, total_peptide_count
total_peptide_count = sum(conditions_peptides["condition_peptide_count"])
print("Total peptides:", total_peptide_count, file=args.outfile, sep="\t", flush=True)

# condition_name, unique_peptide_count
unique_peptide_count = len(conditions_peptides)
print("Unique peptides:", unique_peptide_count, file=args.outfile, sep="\t", flush=True)
print(file=args.outfile)

# unique peptides across all conditions
all_conditions_unique_peptide_counts = len(protein_peptide_occs["peptide_id"].drop_duplicates())
print(file=args.outfile)
print(
"Unique peptides across all conditions:",
all_conditions_unique_peptide_counts,
file=args.outfile,
sep="\t",
flush=True,
)

print("Done!", flush=True)
# peptide_id, condition_name, condition_peptide_count, best_prediction_score, prediction_score_allele_0, prediction_score_allele_1, ..., prediction_score_allele_n
conditions_peptides = conditions_peptides.set_index("peptide_id").join(best_scored_peptides).join(predictions)
Member:

I am not sure I understood all the details of the new code, but since it causes quite some memory usage, I have a few questions/suggestions, as I am wondering if you really need that many joins:

  • conditions_peptides: if the resulting df becomes too large, you could drop the condition_name (it is the same within this context anyway) and count before joining to the predictions
  • filter predictions directly after reading for entries that are above the threshold, and drop the prediction_score column to reduce the size
  • merge/join for each allele to get only the peptides that are "binders" and count them (see the sketch after this comment)

This brings me to the next question: you just provide the count of unique binders, right? Maybe that should be stated more clearly somewhere.
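
A minimal sketch of the suggested early filtering (hypothetical integration; it assumes the column names used elsewhere in this script and a condition_peptides df holding the peptide_ids of the current condition):

    import pandas as pd

    # Filter right after reading: keep only entries at or above the binder
    # threshold, then drop the score column so only (peptide_id, allele_id)
    # pairs remain in memory.
    predictions = pd.read_csv(
        args.predictions, usecols=["peptide_id", "prediction_score", "allele_id"], sep="\t"
    )
    binders = predictions[predictions["prediction_score"] >= args.binder_threshold]
    binders = binders.drop(columns="prediction_score")

    # Per-allele binder counts for one condition then reduce to a merge plus a count.
    for allele_id, allele_binders in binders.groupby("allele_id"):
        n_binders = condition_peptides.merge(allele_binders, on="peptide_id")["peptide_id"].nunique()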

Collaborator Author:

I managed to reduce the memory usage to around 50 GB, which is only 10 GB more than the previous version that did not include the peptide predictions!


# number of peptides whose best prediction score over all alleles reaches the binder threshold
num_binder = len(conditions_peptides[conditions_peptides["best_prediction_score"] >= args.binder_threshold])

# collect all info into one table row per condition
row_temp = {
    "Condition name": condition_name,
    "Unique proteins": unique_protein_count,
    "Total peptides": total_peptide_count,
    "Unique peptides": unique_peptide_count,
    "# Binders (any allele)": num_binder,
}
for col in predictions.columns:
allele = allele_names[int(col.split('_')[-1])]
row_temp[f"# Binders for allele {allele}"] = sum(conditions_peptides[col]>=args.binder_threshold)

stat_table.append(row_temp)

# collect info across all conditions
all_conditions_unique_protein_count = len(protein_peptide_occs["protein_id"].drop_duplicates())
all_conditions_total_peptide_counts = sum(protein_peptide_occs.groupby("peptide_id")["count"].sum())
all_conditions_unique_peptide_counts = len(protein_peptide_occs["peptide_id"].drop_duplicates())
all_conditions_unique_binder_counts = sum(best_scored_peptides >= args.binder_threshold)

row_temp = {"Condition name":"total", "Unique proteins":all_conditions_unique_protein_count, "Total peptides":all_conditions_total_peptide_counts, "Unique peptides":all_conditions_unique_peptide_counts, "# Binder (any allele)": all_conditions_unique_binder_counts}
for col in predictions.columns:
allele = allele_names[int(col.split('_')[-1])]
row_temp[f"# Binders for allele {allele}"] = sum(predictions[col]>=args.binder_threshold)

stat_table.append(row_temp)
pd.DataFrame(stat_table).to_csv(args.outfile, sep="\t", index=False)
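# The resulting stats.tsv contains one row per condition plus a final "total" row,
# with columns: Condition name, Unique proteins, Total peptides, Unique peptides,
# # Binders (any allele), and one "# Binders for allele <name>" column per allele.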

if __name__ == "__main__":
sys.exit(main())
20 changes: 13 additions & 7 deletions modules/local/collect_stats.nf
@@ -12,24 +12,30 @@ process COLLECT_STATS {
path(entities_proteins )
path(microbiomes_entities)
path(conditions )
path(alleles )
path(predictions )

output:
path "stats.txt", emit: ch_stats
path "stats.tsv", emit: ch_stats
path "versions.yml", emit: versions


script:
// Select the score threshold matching the chosen prediction method
def score_threshold = params.pred_method != "SYFPEITHI" ? params.mhcflurry_mhcnuggets_score_threshold : params.syfpeithi_score_threshold
"""
collect_stats.py --protein-peptide-occ "$proteins_peptides" \\
--entities-proteins-occ "$entities_proteins" \\
--microbiomes-entities-occ "$microbiomes_entities" \\
--conditions "$conditions" \\
--outfile stats.txt
collect_stats.py --protein-peptide-occ "$proteins_peptides" \\
--entities-proteins-occ "$entities_proteins" \\
--microbiomes-entities-occ "$microbiomes_entities" \\
--conditions "$conditions" \\
--alleles "$alleles" \\
--predictions "$predictions" \\
--binder_threshold "$score_threshold" \\
--outfile "stats.tsv"

cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version | sed 's/Python //g')
pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)")
END_VERSIONS
"""

}
30 changes: 17 additions & 13 deletions workflows/metapep.nf
@@ -136,19 +136,6 @@ workflow METAPEP {
)
ch_versions = ch_versions.mix(GENERATE_PEPTIDES.out.versions)

//
// MODULE: Collect stats
//

// Collects proteins, peptides, unique peptides per condition
COLLECT_STATS (
GENERATE_PEPTIDES.out.ch_proteins_peptides,
GENERATE_PROTEIN_AND_ENTITY_IDS.out.ch_entities_proteins,
FINALIZE_MICROBIOME_ENTITIES.out.ch_microbiomes_entities,
PROCESS_INPUT.out.ch_conditions
)
ch_versions = ch_versions.mix(COLLECT_STATS.out.versions)

//
// MODULE: Split prediction tasks into chunks
//
@@ -221,6 +208,21 @@ workflow METAPEP {
)
ch_versions = ch_versions.mix(MERGE_PREDICTIONS.out.versions)

//
// MODULE: Collect stats
//

// Collects proteins, peptides, unique peptides, and binders per condition; needs the merged predictions, hence runs after MERGE_PREDICTIONS
COLLECT_STATS (
GENERATE_PEPTIDES.out.ch_proteins_peptides,
GENERATE_PROTEIN_AND_ENTITY_IDS.out.ch_entities_proteins,
FINALIZE_MICROBIOME_ENTITIES.out.ch_microbiomes_entities,
PROCESS_INPUT.out.ch_conditions,
PROCESS_INPUT.out.ch_alleles,
MERGE_PREDICTIONS.out.ch_predictions
)
ch_versions = ch_versions.mix(COLLECT_STATS.out.versions)

//
// MODULE: Plot score distributions
//
@@ -262,6 +264,7 @@ workflow METAPEP {
)
ch_versions = ch_versions.mix(PLOT_ENTITY_BINDING_RATIOS.out.versions)
}

//
// Collate and save software versions
//
@@ -306,6 +309,7 @@ workflow METAPEP {
sort: true
)
)
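// Add the collected stats table to the MultiQC input, where the custom_data
// section in assets/multiqc_config.yml renders it as the "Overall Summary" table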
ch_multiqc_files = ch_multiqc_files.mix(COLLECT_STATS.out.ch_stats)

MULTIQC (
ch_multiqc_files.collect(),