From 63accc5adf97f530ee8f08eb98b2ef12170e8435 Mon Sep 17 00:00:00 2001 From: anmol thapar Date: Wed, 26 Jun 2024 11:29:00 +0100 Subject: [PATCH 1/2] bug: update cluster assign to write qcreport on qc failures --- PopPUNK/assign.py | 9 +++++---- PopPUNK/qc.py | 43 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index da8adc87..7cc66b2c 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -372,7 +372,7 @@ def assign_query_hdf5(dbFuncs, from .plot import writeClusterCsv from .qc import qcDistMat, qcQueryAssignments, prune_distance_matrix, \ - prune_query_distance_matrix + prune_query_distance_matrix, write_qc_failure_report from .sketchlib import addRandom @@ -489,12 +489,13 @@ def assign_query_hdf5(dbFuncs, # QC distance matrix if qc_dict['run_qc']: sys.stderr.write("Running QC on distance matrix\n") - seq_names_passing = \ - frozenset(qcDistMat(qrDistMat, rNames, qNames, ref_db, qc_dict)[0]) - failed_samples = frozenset(qNames) - seq_names_passing + seq_names_passing, failed_samples_dict = qcDistMat(qrDistMat, rNames, qNames, ref_db, qc_dict) + failed_samples = frozenset(qNames) - frozenset(seq_names_passing) if len(failed_samples) > 0: sys.stderr.write(f"{len(failed_samples)} samples failed:\n" f"{','.join(failed_samples)}\n") + write_qc_failure_report(failed_samples, [failed_samples_dict], output) + if len(failed_samples) == len(qNames): sys.exit(1) else: diff --git a/PopPUNK/qc.py b/PopPUNK/qc.py index 9ae8359f..e45d5d45 100755 --- a/PopPUNK/qc.py +++ b/PopPUNK/qc.py @@ -472,14 +472,43 @@ def remove_qc_fail(qc_dict, names, passed, fail_dicts, ref_db, distMat, prefix, overwrite=True, threads=threads) # write failing & reasons - with open(f"{prefix}/{os.path.basename(prefix)}_qcreport.txt", 'w') as qc_file: - for sample in failed: - reasons = [] - for fail_test in fail_dicts: - if sample in fail_test: - reasons += (fail_test[sample]) - 
qc_file.write(f"{sample}\t{','.join(reasons)}\n") + write_qc_failure_report(failed, fail_dicts, prefix) + +def write_qc_failure_report(failed_samples, fail_dicts, output_prefix): + """ + Writes a report of failed samples and their reasons to a file. + Parameters: + - failed_samples: A list of samples that have failed. + - fail_dicts: A list of dictionaries, each mapping samples to their failure reasons. + - output_prefix: The prefix for the output file path. + """ + # Accumulate output lines for each failed sample + failed_output_lines = [ + f"{sample}\t{','.join(get_failure_reasons(sample, fail_dicts))}\n" + for sample in failed_samples + ] + with open(f"{output_prefix}/{os.path.basename(output_prefix)}_qcreport.txt", 'w') as qc_file: + qc_file.writelines(failed_output_lines) + +def get_failure_reasons(sample, fail_dicts): + """ + Retrieves all failure reasons for a given sample across multiple dictionaries. + + Parameters: + - sample: The sample to retrieve failure reasons for. + - fail_dicts: A list of dictionaries, each mapping samples to their failure reasons. + + Returns: + A list of failure reasons for the given sample. + """ + return [ + reason + for fail_dict in fail_dicts + if sample in fail_dict + for reason in fail_dict[sample] + ] + def pickTypeIsolate(prefix, refList): """Selects a type isolate as that with a minimal proportion of missing data. From db1bce4af630acd38cce413d0bd865d1335a5df0 Mon Sep 17 00:00:00 2001 From: anmol thapar Date: Wed, 3 Jul 2024 08:22:00 +0100 Subject: [PATCH 2/2] chore: update version --- PopPUNK/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py index a3f1eb11..9a9c3eeb 100644 --- a/PopPUNK/__init__.py +++ b/PopPUNK/__init__.py @@ -3,7 +3,7 @@ '''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)''' -__version__ = '2.6.5' +__version__ = '2.6.6' # Minimum sketchlib version SKETCHLIB_MAJOR = 2