From c79d03c11d7427fef3b2eff730f0b168671fb6fd Mon Sep 17 00:00:00 2001 From: EddieLF Date: Mon, 14 Oct 2024 09:57:51 +1100 Subject: [PATCH 01/10] Allow local auditing runs temporarily --- metamist/audit/audit_upload_bucket.py | 37 ++++++++++++++++--- metamist/audit/audithelper.py | 9 ++++- .../audit/delete_assay_files_from_audit.py | 31 ++++++++++++---- 3 files changed, 64 insertions(+), 13 deletions(-) diff --git a/metamist/audit/audit_upload_bucket.py b/metamist/audit/audit_upload_bucket.py index c58e006ee..8d236bae7 100644 --- a/metamist/audit/audit_upload_bucket.py +++ b/metamist/audit/audit_upload_bucket.py @@ -5,11 +5,14 @@ """ import asyncio +import csv import logging import os import sys from datetime import datetime from functools import cache +from typing import Any +from cloudpathlib import AnyPath import click @@ -123,7 +126,13 @@ async def write_upload_bucket_audit_reports( """ today = datetime.today().strftime('%Y-%m-%d') - report_path = f'gs://{bucket_name}/audit_results/{today}/' + # report_path = f'gs://{bucket_name}/audit_results/{today}/' + report_path = f'/Users/edwfor/Code/metamist/audit_results/{today}/' + + # Create the report file and directory if it doesn't exist + report_path = AnyPath(report_path) + report_path.parent.mkdir(parents=True, exist_ok=True) + logging.info(f'Writing reports to {report_path}') if set(sequencing_types) == set(get_sequencing_types()): sequencing_types_str = 'all' @@ -143,9 +152,14 @@ async def write_upload_bucket_audit_reports( logging.info('No assay read files to delete found. Skipping report...') else: assays_to_delete_file = f'{report_prefix}_assay_files_to_delete_{today}.csv' - self.write_csv_report_to_cloud( + file_to_write = AnyPath(os.path.join(report_path, assays_to_delete_file)) + file_to_write.parent.mkdir(parents=True, exist_ok=True) + file_to_write.touch(exist_ok=True) + # self.write_csv_report_to_cloud( + write_csv_report_to_local( data_to_write=assay_files_to_delete, - report_path=os.path.join(report_path, assays_to_delete_file), + # report_path=os.path.join(report_path, assays_to_delete_file), + report_path=file_to_write, header_row=[ 'SG_ID', 'Assay_ID', @@ -160,7 +174,8 @@ async def write_upload_bucket_audit_reports( logging.info('No assay reads to ingest found. Skipping report...') else: assays_to_ingest_file = f'{report_prefix}_assay_files_to_ingest_{today}.csv' - self.write_csv_report_to_cloud( + # self.write_csv_report_to_cloud( + write_csv_report_to_local( data_to_write=assay_files_to_ingest, report_path=os.path.join(report_path, assays_to_ingest_file), header_row=[ @@ -178,12 +193,24 @@ async def write_upload_bucket_audit_reports( logging.info('No sequencing groups without crams found. 
Skipping report...') else: unaligned_sgs_file = f'{report_prefix}_unaligned_sgs_{today}.csv' - self.write_csv_report_to_cloud( + # self.write_csv_report_to_cloud( + write_csv_report_to_local( data_to_write=unaligned_sgs, report_path=os.path.join(report_path, unaligned_sgs_file), header_row=['SG_ID', 'Sample_ID', 'Sample_External_ID'], ) +def write_csv_report_to_local( + data_to_write: list[Any], report_path: AnyPath, header_row: list[str] | None +): + """Write a csv report to the local filesystem.""" + with open(report_path, 'w', newline='') as report_file: + writer = csv.writer(report_file) + if header_row: + writer.writerow(header_row) + for row in data_to_write: + writer.writerow(row) + logging.info(f'Wrote report to {report_path}') async def audit_upload_bucket_async( dataset: str, diff --git a/metamist/audit/audithelper.py b/metamist/audit/audithelper.py index cf568698c..75042fbae 100644 --- a/metamist/audit/audithelper.py +++ b/metamist/audit/audithelper.py @@ -145,7 +145,14 @@ def write_csv_report_to_cloud( Writes a csv report to the cloud bucket containing the data to write at the report path, with an optional header row """ - with AnyPath(report_path).open('w+') as f: # pylint: disable=E1101 + # Create the report file and directory if it doesn't exist + report_path = AnyPath(report_path) + report_path.parent.mkdir(parents=True, exist_ok=True) + report_path.touch(exist_ok=True) + logging.info(f'Writing report to {report_path}') + + # with report_path.open('w+') as f: # pylint: disable=E1101 + with open(report_path, 'w+') as f: writer = csv.writer(f) if header_row: writer.writerow(header_row) diff --git a/metamist/audit/delete_assay_files_from_audit.py b/metamist/audit/delete_assay_files_from_audit.py index 5aedd2e55..d7454300b 100644 --- a/metamist/audit/delete_assay_files_from_audit.py +++ b/metamist/audit/delete_assay_files_from_audit.py @@ -17,6 +17,7 @@ import os import sys from datetime import datetime +from typing import Any import click from cloudpathlib import AnyPath, CloudPath @@ -24,7 +25,7 @@ from cpg_utils.config import get_config -from metamist.audit.audithelper import AuditHelper +# from metamist.audit.audithelper import AuditHelper CLIENT = storage.Client() @@ -35,6 +36,8 @@ def clean_up_cloud_storage(locations: list[CloudPath]): """Given a list of locations of files to be deleted""" deleted_files = [] for location in locations: + # logging.info(f'Deleting {location}...') + # continue try: location.unlink() logging.info(f'{location.name} was deleted from cloud storage.') @@ -42,7 +45,7 @@ def clean_up_cloud_storage(locations: list[CloudPath]): # Many possible http exceptions could occur so use a broad exception except Exception: # pylint: disable=W0718 logging.warning(f'{location.name} threw an exception - file not deleted.') - + # exit() return deleted_files @@ -53,8 +56,9 @@ def clean_up_cloud_storage(locations: list[CloudPath]): help='The field in the input csv corresponding to the files to delete', required=True, ) +@click.option('--dataset', '-ds', help='The dataset to delete files from') @click.argument('delete-file-path') -def main(delete_field_name, delete_file_path): +def main(delete_field_name, dataset, delete_file_path): """ Inputs: - The name of the field containing the paths to delete in the input csv @@ -78,15 +82,28 @@ def main(delete_field_name, delete_file_path): logging.info(f'{len(deleted_files)} sequence files deleted.') # Write a log of the deleted files to the same location - log_path = 
f'{delete_file_path.removesuffix(os.path.basename(delete_file_path))}deleted_{TODAY}.csv' - AuditHelper.write_csv_report_to_cloud( + log_path = f'{delete_file_path.removesuffix(os.path.basename(delete_file_path))}{dataset}_deleted_{TODAY}.csv' + # AuditHelper.write_csv_report_to_cloud( + write_csv_report_to_local( deleted_files, log_path, header_row=['Deleted_file_path'] ) - + + +def write_csv_report_to_local( + data_to_write: list[Any], report_path: AnyPath, header_row: list[str] | None +): + """Write a csv report to the local filesystem.""" + with open(report_path, 'w', newline='') as report_file: + writer = csv.writer(report_file) + if header_row: + writer.writerow(header_row) + for row in data_to_write: + writer.writerow(row) + logging.info(f'Wrote report to {report_path}') if __name__ == '__main__': logging.basicConfig( - level=logging.WARNING, + level=logging.INFO, format='%(asctime)s %(levelname)s %(module)s:%(lineno)d - %(message)s', datefmt='%Y-%M-%d %H:%M:%S', stream=sys.stderr, From ceea4df8b15333545c0a0d42c117055347f3f85b Mon Sep 17 00:00:00 2001 From: EddieLF Date: Mon, 21 Oct 2024 09:07:34 +1100 Subject: [PATCH 02/10] Improve GQL queries for performance on large datasets --- metamist/audit/generic_auditor.py | 52 ++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/metamist/audit/generic_auditor.py b/metamist/audit/generic_auditor.py index c154db431..aa36305ce 100644 --- a/metamist/audit/generic_auditor.py +++ b/metamist/audit/generic_auditor.py @@ -20,9 +20,9 @@ """ ) -QUERY_PARTICIPANTS_SAMPLES_SGS_ASSAYS = gql( +QUERY_PARTICIPANTS_SAMPLES_SGS = gql( """ - query DatasetData($datasetName: String!) { + query DatasetData($datasetName: String!, $seqTypes: [String!]) { project(name: $datasetName) { participants { id @@ -30,13 +30,9 @@ samples { id externalId - sequencingGroups { + sequencingGroups(type: {in_: $seqTypes}) { id type - assays { - id - meta - } } } } @@ -45,6 +41,23 @@ """ ) +QUERY_SG_ASSAYS = gql( + """ + query DatasetData($datasetName: String!, $seqTypes: [String!]) { + project(name: $datasetName) { + sequencingGroups(type: {in_: $seqTypes}) { + id + type + assays { + id + meta + } + } + } + } + """ +) + QUERY_SG_ANALYSES = gql( """ query sgAnalyses($dataset: String!, $sgIds: [String!], $analysisTypes: [String!]) { @@ -118,10 +131,18 @@ async def get_participant_data_for_dataset(self) -> list[dict]: logging.getLogger().setLevel(logging.WARN) participant_query_results = await query_async( - QUERY_PARTICIPANTS_SAMPLES_SGS_ASSAYS, {'datasetName': self.dataset} + QUERY_PARTICIPANTS_SAMPLES_SGS, {'datasetName': self.dataset, 'seqTypes': self.sequencing_types} + ) + sg_assays_query_results = await query_async( + QUERY_SG_ASSAYS, {'datasetName': self.dataset, 'seqTypes': self.sequencing_types} ) logging.getLogger().setLevel(logging.INFO) + sg_assays = {} + for sg in sg_assays_query_results['project']['sequencingGroups']: + sg_id = sg['id'] + sg_assays[sg_id] = sg['assays'] + participant_data = participant_query_results['project']['participants'] filtered_participants = [] @@ -131,6 +152,20 @@ async def get_participant_data_for_dataset(self) -> list[dict]: f'{self.dataset} :: Filtering participant {participant["id"]} ({participant["externalId"]}) as it has no samples.' ) continue + for sample in participant['samples']: + if not sample['sequencingGroups']: + logging.info( + f'{self.dataset} :: Filtering sample {sample["id"]} ({sample["externalId"]}) as it has no sequencing groups.' 
+ ) + continue + for sg in sample['sequencingGroups']: + if sg['id'] not in sg_assays: + logging.info( + f'{self.dataset} :: Filtering SG {sg["id"]} as it has no assays.' + ) + continue + sg['assays'] = sg_assays[sg['id']] + filtered_participants.append(participant) return filtered_participants @@ -252,6 +287,7 @@ async def get_analysis_cram_paths_for_dataset_sgs( Returns a dict mapping {sg_id : (analysis_id, cram_path) } """ sg_ids = list(assay_sg_id_map.values()) + logging.info(f'{self.dataset} :: Fetching CRAM analyses for {len(sg_ids)} SGs') logging.getLogger().setLevel(logging.WARN) analyses_query_result = await query_async( From e8bb55655edfad8674ed164655e75f87b7272675 Mon Sep 17 00:00:00 2001 From: EddieLF Date: Thu, 7 Nov 2024 13:05:52 +1100 Subject: [PATCH 03/10] Fix auditor --- metamist/audit/generic_auditor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/metamist/audit/generic_auditor.py b/metamist/audit/generic_auditor.py index aa36305ce..3d790742e 100644 --- a/metamist/audit/generic_auditor.py +++ b/metamist/audit/generic_auditor.py @@ -30,7 +30,7 @@ samples { id externalId - sequencingGroups(type: {in_: $seqTypes}) { + sequencingGroups(type: {in_: $seqTypes}, technology: {eq: "short-read"}) { id type } @@ -45,7 +45,7 @@ """ query DatasetData($datasetName: String!, $seqTypes: [String!]) { project(name: $datasetName) { - sequencingGroups(type: {in_: $seqTypes}) { + sequencingGroups(type: {in_: $seqTypes}, technology: {eq: "short-read"}) { id type assays { @@ -61,7 +61,7 @@ QUERY_SG_ANALYSES = gql( """ query sgAnalyses($dataset: String!, $sgIds: [String!], $analysisTypes: [String!]) { - sequencingGroups(id: {in_: $sgIds}, project: {eq: $dataset}) { + sequencingGroups(id: {in_: $sgIds}, project: {eq: $dataset}, technology: {eq: "short-read"}) { id analyses(status: {eq: COMPLETED}, type: {in_: $analysisTypes}, project: {eq: $dataset}) { id @@ -287,7 +287,7 @@ async def get_analysis_cram_paths_for_dataset_sgs( Returns a dict mapping {sg_id : (analysis_id, cram_path) } """ sg_ids = list(assay_sg_id_map.values()) - logging.info(f'{self.dataset} :: Fetching CRAM analyses for {len(sg_ids)} SGs') + logging.info(f'{self.dataset} :: Fetching CRAM analyses for {len(set(sg_ids))} SGs') logging.getLogger().setLevel(logging.WARN) analyses_query_result = await query_async( From 3885b31bff4521162519d78ddafaac762133dab2 Mon Sep 17 00:00:00 2001 From: EddieLF Date: Tue, 17 Dec 2024 15:29:45 +1100 Subject: [PATCH 04/10] Reinstate cloud publishing --- metamist/audit/audit_upload_bucket.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metamist/audit/audit_upload_bucket.py b/metamist/audit/audit_upload_bucket.py index 8d236bae7..87661a64a 100644 --- a/metamist/audit/audit_upload_bucket.py +++ b/metamist/audit/audit_upload_bucket.py @@ -126,8 +126,7 @@ async def write_upload_bucket_audit_reports( """ today = datetime.today().strftime('%Y-%m-%d') - # report_path = f'gs://{bucket_name}/audit_results/{today}/' - report_path = f'/Users/edwfor/Code/metamist/audit_results/{today}/' + report_path = f'gs://{bucket_name}/audit_results/{today}/' # Create the report file and directory if it doesn't exist report_path = AnyPath(report_path) From ad02d318383f91118108e3deea2d1c8b8ed10415 Mon Sep 17 00:00:00 2001 From: EddieLF Date: Thu, 19 Dec 2024 10:23:55 +1100 Subject: [PATCH 05/10] Big refactor, should have been many separate commits --- metamist/audit/audit_upload_bucket.py | 245 ++----- metamist/audit/audithelper.py | 299 ++++++--- 
metamist/audit/generic_auditor.py | 903 +++++++++++++++----------- 3 files changed, 790 insertions(+), 657 deletions(-) diff --git a/metamist/audit/audit_upload_bucket.py b/metamist/audit/audit_upload_bucket.py index 87661a64a..7c1035881 100644 --- a/metamist/audit/audit_upload_bucket.py +++ b/metamist/audit/audit_upload_bucket.py @@ -5,66 +5,16 @@ """ import asyncio -import csv import logging -import os import sys from datetime import datetime -from functools import cache -from typing import Any -from cloudpathlib import AnyPath import click -from cpg_utils.config import get_config +from cpg_utils.config import config_retrieve, dataset_path -from metamist.audit.generic_auditor import GenericAuditor -from metamist.graphql import gql, query - -FASTQ_EXTENSIONS = ('.fq.gz', '.fastq.gz', '.fq', '.fastq') -BAM_EXTENSIONS = ('.bam',) -CRAM_EXTENSIONS = ('.cram',) -READ_EXTENSIONS = FASTQ_EXTENSIONS + BAM_EXTENSIONS + CRAM_EXTENSIONS -GVCF_EXTENSIONS = ('.g.vcf.gz',) -VCF_EXTENSIONS = ('.vcf', '.vcf.gz') -ALL_EXTENSIONS = ( - FASTQ_EXTENSIONS - + BAM_EXTENSIONS - + CRAM_EXTENSIONS - + GVCF_EXTENSIONS - + VCF_EXTENSIONS -) - -FILE_TYPES_MAP = { - 'fastq': FASTQ_EXTENSIONS, - 'bam': BAM_EXTENSIONS, - 'cram': CRAM_EXTENSIONS, - 'all_reads': READ_EXTENSIONS, - 'gvcf': GVCF_EXTENSIONS, - 'vcf': VCF_EXTENSIONS, - 'all': ALL_EXTENSIONS, -} - -SEQUENCING_TYPES_QUERY = gql( - """ - query seqTypes { - enum { - sequencingType - } - } - """ -) - - -@cache -def get_sequencing_types(): - """Return the list of sequencing types from the enum table.""" - logging.getLogger().setLevel(logging.WARN) - sequencing_types = query(SEQUENCING_TYPES_QUERY) - logging.getLogger().setLevel(logging.INFO) - return sequencing_types['enum'][ # pylint: disable=unsubscriptable-object - 'sequencingType' - ] +from metamist.audit.audithelper import FILE_TYPES_MAP, get_sequencing_types +from metamist.audit.generic_auditor import GenericAuditor, SequencingGroupData def audit_upload_bucket( @@ -111,11 +61,11 @@ def __init__( async def write_upload_bucket_audit_reports( self, bucket_name: str, - sequencing_types: list[str], - file_types: list[str], - assay_files_to_delete: list[tuple[str, int, str, list[int]]], - assay_files_to_ingest: list[tuple[str, str, str, int, str]], - unaligned_sgs: list[tuple[str, str]], + # audit_report_assay_files_to_delete, + # audit_report_assay_files_to_ingest, + # audit_report_unaligned_sgs, + audit_reports: dict[str, list[dict[str, str]]], + report_extension: str = 'tsv', ): """ Writes the 'assay files to delete/ingest' csv reports and upload them to the bucket. @@ -125,91 +75,16 @@ async def write_upload_bucket_audit_reports( The report names include the file types, sequencing types, and date of the audit. 
""" today = datetime.today().strftime('%Y-%m-%d') - - report_path = f'gs://{bucket_name}/audit_results/{today}/' - - # Create the report file and directory if it doesn't exist - report_path = AnyPath(report_path) - report_path.parent.mkdir(parents=True, exist_ok=True) - logging.info(f'Writing reports to {report_path}') - - if set(sequencing_types) == set(get_sequencing_types()): - sequencing_types_str = 'all' - else: - sequencing_types_str = ('_').join(sequencing_types) - - if set(file_types) == set(ALL_EXTENSIONS): - file_types_str = 'all' - elif set(file_types) == set(READ_EXTENSIONS): - file_types_str = 'all_reads' - else: - file_types_str = ('_').join(file_types) - - report_prefix = f'{self.dataset}_{file_types_str}_{sequencing_types_str}' - - if not assay_files_to_delete: - logging.info('No assay read files to delete found. Skipping report...') - else: - assays_to_delete_file = f'{report_prefix}_assay_files_to_delete_{today}.csv' - file_to_write = AnyPath(os.path.join(report_path, assays_to_delete_file)) - file_to_write.parent.mkdir(parents=True, exist_ok=True) - file_to_write.touch(exist_ok=True) - # self.write_csv_report_to_cloud( - write_csv_report_to_local( - data_to_write=assay_files_to_delete, - # report_path=os.path.join(report_path, assays_to_delete_file), - report_path=file_to_write, - header_row=[ - 'SG_ID', - 'Assay_ID', - 'Assay_Read_File_Path', - 'CRAM_Analysis_ID', - 'Filesize', - ], - ) - - # 'Sequences to ingest' report contains paths to the (possibly) uningested files - and any samples/SGs that might be related - if not assay_files_to_ingest: - logging.info('No assay reads to ingest found. Skipping report...') - else: - assays_to_ingest_file = f'{report_prefix}_assay_files_to_ingest_{today}.csv' - # self.write_csv_report_to_cloud( - write_csv_report_to_local( - data_to_write=assay_files_to_ingest, - report_path=os.path.join(report_path, assays_to_ingest_file), - header_row=[ - 'Assay_File_Path', - 'SG_ID', - 'Sample_ID', - 'Sample_External_ID', - 'CRAM_Analysis_ID', - 'CRAM_Path', - ], + report_prefix = self.get_audit_report_prefix( + seq_types=self.sequencing_types, file_types=self.file_types + ) + for audit_report_type, audit_report in audit_reports.items(): + self.write_report_to_cloud( + data_to_write=audit_report, + bucket_name=bucket_name, + blob_path=f'audit_results/{today}/{report_prefix}_{audit_report_type}.{report_extension}', ) - # Write the sequencing groups without any completed cram to a csv - if not unaligned_sgs: - logging.info('No sequencing groups without crams found. 
Skipping report...') - else: - unaligned_sgs_file = f'{report_prefix}_unaligned_sgs_{today}.csv' - # self.write_csv_report_to_cloud( - write_csv_report_to_local( - data_to_write=unaligned_sgs, - report_path=os.path.join(report_path, unaligned_sgs_file), - header_row=['SG_ID', 'Sample_ID', 'Sample_External_ID'], - ) - -def write_csv_report_to_local( - data_to_write: list[Any], report_path: AnyPath, header_row: list[str] | None -): - """Write a csv report to the local filesystem.""" - with open(report_path, 'w', newline='') as report_file: - writer = csv.writer(report_file) - if header_row: - writer.writerow(header_row) - for row in data_to_write: - writer.writerow(row) - logging.info(f'Wrote report to {report_path}') async def audit_upload_bucket_async( dataset: str, @@ -230,6 +105,15 @@ async def audit_upload_bucket_async( default_analysis_type: The default analysis type to audit default_analysis_status: The default analysis status to audit """ + + # Initialise the auditor + auditor = UploadBucketAuditor( + dataset=dataset, + sequencing_types=sequencing_types, + file_types=file_types, + default_analysis_type=default_analysis_type, + default_analysis_status=default_analysis_status, + ) # Validate user inputs allowed_sequencing_types = get_sequencing_types() @@ -250,59 +134,60 @@ async def audit_upload_bucket_async( else: file_types = FILE_TYPES_MAP[file_types[0]] - config = get_config() if not dataset: - dataset = config['workflow']['dataset'] - bucket = config['storage'][dataset]['upload'] + dataset = config_retrieve(['workflow', 'dataset']) + bucket_name = dataset_path(dataset=dataset, category='upload') - # Initialise the auditor - auditor = UploadBucketAuditor( - dataset=dataset, - sequencing_types=sequencing_types, - file_types=file_types, - default_analysis_type=default_analysis_type, - default_analysis_status=default_analysis_status, - ) - participant_data = await auditor.get_participant_data_for_dataset() - sample_internal_external_id_map = auditor.map_internal_to_external_sample_ids( - participant_data - ) - ( - sg_sample_id_map, - assay_sg_id_map, - assay_filepaths_filesizes, - ) = auditor.get_assay_map_from_participants(participant_data) + + # participant_data = await auditor.get_participant_data_for_dataset() + # sample_internal_external_id_map = auditor.map_internal_to_external_sample_ids( + # participant_data + # ) + # ( + # sg_sample_id_map, + # assay_sg_id_map, + # assay_filepaths_filesizes, + # ) = auditor.get_assay_map_from_participants(participant_data) + + sequencing_groups: list[SequencingGroupData] = await auditor.get_sg_assays_for_dataset() # Get all completed cram output paths for the samples in the dataset and validate them sg_cram_paths = await auditor.get_analysis_cram_paths_for_dataset_sgs( - assay_sg_id_map + sequencing_groups ) # Identify sgs with and without completed crams sg_completion = await auditor.get_complete_and_incomplete_sgs( - assay_sg_id_map, sg_cram_paths + sequencing_groups, sg_cram_paths ) - - unaligned_sgs = [ - ( - sg_id, - sg_sample_id_map[sg_id], - sample_internal_external_id_map.get(sg_sample_id_map[sg_id]), - ) - for sg_id in sg_completion.get('incomplete') - ] - + + # Get the unaligned sequencing groups + unaligned_sgs = [] + for sg in sequencing_groups: + if sg.id in sg_completion.get('incomplete'): + unaligned_sgs.append( + { + 'sg_id': sg.id, + 'sample_id': sg.sample.id, + 'sample_external_id': sg.sample.external_id, + 'participant_id': sg.sample.participant.id, + 'participant_external_id': sg.sample.participant.external_id, + } + ) + 
+ # Extract the assay file paths and sizes for the dataset's SGs + assay_id_to_paths_sizes = {assay.id: assay.read_files_paths_sizes for sg in sequencing_groups for assay in sg.assays} + + # Get the assay files to delete and ingest ( reads_to_delete, reads_to_ingest, ) = await auditor.get_reads_to_delete_or_ingest( - bucket, - sg_completion.get('complete'), - assay_filepaths_filesizes, - sg_sample_id_map, - assay_sg_id_map, - sample_internal_external_id_map, + bucket_name=bucket_name, + sequencing_groups=sequencing_groups, + completed_sgs=sg_completion.get('complete'), + assay_id_to_paths_sizes=assay_id_to_paths_sizes, ) possible_assay_ingests = auditor.find_crams_for_reads_to_ingest( @@ -311,7 +196,7 @@ async def audit_upload_bucket_async( # Write the reads to delete, reads to ingest, and unaligned SGs reports await auditor.write_upload_bucket_audit_reports( - bucket, + bucket_name=bucket_name, sequencing_types=sequencing_types, file_types=file_types, assay_files_to_delete=reads_to_delete, diff --git a/metamist/audit/audithelper.py b/metamist/audit/audithelper.py index 75042fbae..3f0f0f1f8 100644 --- a/metamist/audit/audithelper.py +++ b/metamist/audit/audithelper.py @@ -1,31 +1,105 @@ # pylint: disable=no-member import csv import logging -import os from collections import defaultdict +from io import StringIO from typing import Any -from cloudpathlib import AnyPath - from cpg_utils.cloud import get_path_components_from_gcp_path +from cpg_utils.config import config_retrieve, get_gcp_project +from metamist.graphql import gql, query from metamist.parser.cloudhelper import CloudHelper +from google.cloud import storage + +FASTQ_EXTENSIONS = ('.fq.gz', '.fastq.gz', '.fq', '.fastq') +BAM_EXTENSIONS = ('.bam',) +CRAM_EXTENSIONS = ('.cram',) +READ_EXTENSIONS = FASTQ_EXTENSIONS + BAM_EXTENSIONS + CRAM_EXTENSIONS +GVCF_EXTENSIONS = ('.g.vcf.gz',) +VCF_EXTENSIONS = ('.vcf', '.vcf.gz') +ALL_EXTENSIONS = ( + FASTQ_EXTENSIONS + + BAM_EXTENSIONS + + CRAM_EXTENSIONS + + GVCF_EXTENSIONS + + VCF_EXTENSIONS +) + +FILE_TYPES_MAP = { + 'fastq': FASTQ_EXTENSIONS, + 'bam': BAM_EXTENSIONS, + 'cram': CRAM_EXTENSIONS, + 'all_reads': READ_EXTENSIONS, + 'gvcf': GVCF_EXTENSIONS, + 'vcf': VCF_EXTENSIONS, + 'all': ALL_EXTENSIONS, +} + +HAIL_EXTENSIONS = ['.ht', '.mt', '.vds'] + +ANALYSIS_TYPES_QUERY = gql( + """ + query analysisTypes { + enum { + analysisType + } + } + """ +) + +SEQUENCING_TYPES_QUERY = gql( + """ + query seqTypes { + enum { + sequencingType + } + } + """ +) + +def get_analysis_types(): + """Return the list of analysis types from the enum table.""" + analysis_types_query_result = query(ANALYSIS_TYPES_QUERY) + return analysis_types_query_result['enum']['analysisType'] + +def get_sequencing_types(): + """Return the list of sequencing types from the enum table.""" + sequencing_types_query_result: dict[str, dict[str, list[str]]] = query(SEQUENCING_TYPES_QUERY) + return sequencing_types_query_result['enum']['sequencingType'] + class AuditHelper(CloudHelper): """General helper class for bucket auditing""" + def __init__( + self, + gcp_project: str, + all_analysis_types: list[str] = None, + all_sequencing_types: list[str] = None, + excluded_sequencing_groups: list[str] = None, + ): + # Initialize GCP project + self.gcp_project = gcp_project or get_gcp_project() or config_retrieve(['workflow', 'gcp_project']) + if not self.gcp_project: + raise ValueError('GCP project is required') + + self.all_analysis_types = all_analysis_types or get_analysis_types() + self.all_sequencing_types = all_sequencing_types or 
get_sequencing_types() + + self.excluded_sequencing_groups = excluded_sequencing_groups or config_retrieve(['workflow', 'audit', 'excluded_sequencing_groups']) - EXCLUDED_SGS: set[str] = set( - sg for sg in os.getenv('SM_AUDIT_EXCLUDED_SGS', '').split(',') if sg - ) + super().__init__( + gcp_project=self.gcp_project, + ) @staticmethod - def get_gcs_bucket_subdirs_to_search(paths: list[str]) -> defaultdict[str, list]: + def get_gcs_buckets_and_prefixes_from_paths(paths: list[str]) -> defaultdict[str, list]: """ - Takes a list of paths and extracts the bucket name and subdirectory, returning all unique pairs - of buckets/subdirectories + Takes a list of paths and extracts the bucket names and prefix, returning all unique pairs + of buckets and prefixes. Does not make any calls to GCS. """ - buckets_subdirs_to_search: defaultdict[str, list] = defaultdict(list) + buckets_prefixes: defaultdict[str, list] = defaultdict(list) for path in paths: try: pc = get_path_components_from_gcp_path(path) @@ -33,133 +107,146 @@ def get_gcs_bucket_subdirs_to_search(paths: list[str]) -> defaultdict[str, list] logging.warning(f'{path} invalid') continue bucket = pc['bucket'] - subdir = pc['suffix'] - if subdir and subdir not in buckets_subdirs_to_search[bucket]: - buckets_subdirs_to_search[bucket].append(subdir) + prefix = pc['suffix'] # This is the prefix (i.e. the "subdirectory" in the bucket) + if prefix and prefix not in buckets_prefixes[bucket]: + buckets_prefixes[bucket].append(prefix) - return buckets_subdirs_to_search + return buckets_prefixes - def get_gcs_paths_for_subdir( - self, bucket_name: str, subdirectory: str, file_extension: tuple[str] + def get_all_files_in_gcs_bucket_with_prefix_and_extensions( + self, bucket_name: str, prefix: str, file_extension: tuple[str] ): - """Iterate through a gcp bucket/subdir and get all the blobs with the specified file extension(s)""" - files_in_bucket_subdir = [] + """Iterate through a gcp bucket/prefix and get all the blobs with the specified file extension(s)""" + bucket = self.gcs_client.bucket(bucket_name, user_project=self.user_project) + + files_in_bucket_prefix = [] for blob in self.gcs_client.list_blobs( - bucket_name, prefix=subdirectory, delimiter='/' + bucket, prefix=prefix, delimiter='/' ): # Check if file ends with specified analysis type if not blob.name.endswith(file_extension): continue - files_in_bucket_subdir.append(f'gs://{bucket_name}/{blob.name}') + files_in_bucket_prefix.append(f'gs://{bucket_name}/{blob.name}') - return files_in_bucket_subdir + return files_in_bucket_prefix - def find_files_in_gcs_buckets_subdirs( - self, buckets_subdirs: defaultdict[str, list], file_types: tuple[str] + def find_files_in_gcs_buckets_prefixes( + self, buckets_prefixes: defaultdict[str, list[str]], file_types: tuple[str] ): """ - Takes a list of (bucket,subdirectory) tuples and finds all the files contained in that directory - with filetypes defined with an input list + Takes a dict of {bucket: [prefix1, prefix2, ...]} tuples and finds all the files contained in that bucket/prefix + that end with the specified file type extensions. Skips hailtable, matrixtable, and vds paths. """ files_in_bucket = [] - for bucket, subdirs in buckets_subdirs.items(): - for subdir in subdirs: - # matrixtable / hailtable subdirectories should not appear in main-upload buckets, + for bucket_name, prefixes in buckets_prefixes.items(): + for prefix in prefixes: + # matrixtable / hailtable / vds prefix should not appear in main-upload buckets, # but handle them just in case. 
These directories are too large to search. - if '.mt' in subdir or '.ht' in subdir: + if any(hl_extension in prefix for hl_extension in HAIL_EXTENSIONS): continue files_in_bucket.extend( - self.get_gcs_paths_for_subdir(bucket, subdir, file_types) + self.get_all_files_in_gcs_bucket_with_prefix_and_extensions(bucket_name, prefix, file_types) ) return files_in_bucket def find_assay_files_in_gcs_bucket( self, bucket_name: str, file_extensions: tuple[str] - ) -> list[str]: - """Gets all the gs paths to fastq files in the datasets upload bucket""" - if bucket_name.startswith('gs://'): - bucket_name = bucket_name.removeprefix('gs://') - assay_paths = [] + ) -> dict[str, int]: + """ + Gets all the paths and sizes to assay files in the dataset's upload bucket. + Calls list_blobs on the bucket with the specified file extensions, returning a dict of paths and sizes. + """ + bucket_name = bucket_name.removeprefix('gs://').removesuffix('/') if 'upload' not in bucket_name: # No prefix means it will get all blobs in the bucket (regardless of path) # This can be a dangerous call outside of the upload buckets raise NameError( 'Call to list_blobs without prefix only valid for upload buckets' ) - - for blob in self.gcs_client.list_blobs(bucket_name, prefix=''): - if blob.name.endswith(file_extensions): - assay_paths.append(f'gs://{bucket_name}/{blob.name}') - continue - - return assay_paths - - @staticmethod - def get_sequencing_group_ids_from_analysis(analysis) -> list[str]: - """Tries a number of different field names to retrieve the sg ids from an analysis""" - while True: - try: - sg_ids = analysis['meta']['sample'] - break - except KeyError: - pass - - try: - sg_ids = analysis['meta']['samples'] - break - except KeyError: - pass - - try: - sg_ids = analysis['meta']['sample_ids'] - break - except KeyError: - pass - - try: - sg_ids = analysis['meta']['sequencing_group'] - break - except KeyError: - pass - - try: - sg_ids = analysis['meta']['sequencing_groups'] - break - except KeyError as exc: - raise ValueError( - f'Analysis {analysis["id"]} missing sample or sequencing group field.' 
- ) from exc - - if isinstance(sg_ids, str): - return [ - sg_ids, - ] - return sg_ids - - @staticmethod - def write_csv_report_to_cloud( - data_to_write: list[Any], report_path: AnyPath, header_row: list[str] | None + bucket = self.gcs_client.bucket(bucket_name, user_project=self.user_project) + + assay_paths_sizes = {} + for blob in self.gcs_client.list_blobs(bucket, prefix=''): + if not blob.name.endswith(file_extensions): + continue + blob.reload() + assay_paths_sizes[blob.name] = blob.size + + return assay_paths_sizes + + def get_audit_report_prefix( + self, + seq_types: str, + file_types: str, ): + """Get the prefix for the report file based on the sequencing and file types audited""" + if set(seq_types) == set(self.all_sequencing_types): + sequencing_types_str = 'all_seq_types' + else: + sequencing_types_str = ('_').join(self.sequencing_types) + '_seq_types' + + if set(file_types) == set(ALL_EXTENSIONS): + file_types_str = 'all_file_types' + elif set(file_types) == set(READ_EXTENSIONS): + file_types_str = 'all_reads_file_types' + else: + file_types_str = ('_').join(self.file_types) + '_file_types' + + return f'{file_types_str}_{sequencing_types_str}' + + + def write_report_to_cloud( + self, + data_to_write: list[dict[str, Any]] | None, + bucket_name: str, + blob_path: str, + ) -> None: """ - Writes a csv report to the cloud bucket containing the data to write - at the report path, with an optional header row + Writes a CSV/TSV report directly to Google Cloud Storage. + + Args: + data_to_write: List of data rows to write to the CSV/TSV + bucket_name: Name of the GCS bucket + blob_path: Path where the blob should be stored in the bucket (with either .tsv or .csv extension) + + Raises: + ValueError: If the blob path doesn't end with .csv or .tsv + google.cloud.exceptions.NotFound: If the bucket doesn't exist + google.cloud.exceptions.Forbidden: If permissions are insufficient """ - # Create the report file and directory if it doesn't exist - report_path = AnyPath(report_path) - report_path.parent.mkdir(parents=True, exist_ok=True) - report_path.touch(exist_ok=True) - logging.info(f'Writing report to {report_path}') + if not data_to_write: + logging.info('No data to write to report') + return - # with report_path.open('w+') as f: # pylint: disable=E1101 - with open(report_path, 'w+') as f: - writer = csv.writer(f) - if header_row: - writer.writerow(header_row) - for row in data_to_write: - if isinstance(row, str): - writer.writerow([row]) - continue - writer.writerow(row) + logging.info(f'Writing report to gs://{bucket_name}/{blob_path}') + + # Create a string buffer to hold the data + if blob_path.endswith('.csv'): + delimiter = ',' + content_type = 'text/csv' + elif blob_path.endswith('.tsv'): + delimiter = '\t' + content_type = 'text/tab-separated-values' + else: + raise ValueError('Blob path must end with either .csv or .tsv') + + buffer = StringIO() + writer = csv.DictWriter(buffer, fieldnames=data_to_write[0].keys(), delimiter=delimiter) - logging.info(f'Wrote {len(data_to_write)} lines to report: {report_path}') + writer.writeheader() + writer.writerows(data_to_write) + + storage_client = storage.Client() + bucket = storage_client.bucket(bucket_name, user_project=self.user_project) + blob = bucket.blob(blob_path) + + # Upload the TSV content + blob.upload_from_string( + buffer.getvalue(), + content_type=content_type, + ) + + buffer.close() + logging.info(f'Wrote {len(data_to_write)} lines to gs://{bucket_name}/{blob_path}') + return diff --git a/metamist/audit/generic_auditor.py 
b/metamist/audit/generic_auditor.py index 3d790742e..cd2e848bc 100644 --- a/metamist/audit/generic_auditor.py +++ b/metamist/audit/generic_auditor.py @@ -1,28 +1,31 @@ import logging import os -from collections import defaultdict, namedtuple +from collections import defaultdict from datetime import datetime -from functools import cache from typing import Any +from cpg_utils.config import config_retrieve, dataset_path from gql.transport.requests import log as requests_logger -from metamist.audit.audithelper import AuditHelper +from metamist.audit.audithelper import AuditHelper, FILE_TYPES_MAP from metamist.graphql import gql, query_async -ANALYSIS_TYPES_QUERY = gql( - """ - query analysisTypes { - enum { - analysisType - } - } - """ +handler = logging.StreamHandler() +formatter = logging.Formatter( + fmt='%(asctime)s %(levelname)s %(module)s:%(lineno)d - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', ) +handler.setFormatter(formatter) +logger = logging.getLogger(__name__) +logger.addHandler(handler) +logger.setLevel(logging.INFO) +logger.propagate = False + + QUERY_PARTICIPANTS_SAMPLES_SGS = gql( """ - query DatasetData($datasetName: String!, $seqTypes: [String!]) { + query DatasetData($datasetName: String!, $seqTypes: [String!], $seqTechs: [String!]) { project(name: $datasetName) { participants { id @@ -30,7 +33,7 @@ samples { id externalId - sequencingGroups(type: {in_: $seqTypes}, technology: {eq: "short-read"}) { + sequencingGroups(type: {in_: $seqTypes}, technology: {in_: $seqTechs}) { id type } @@ -41,16 +44,33 @@ """ ) -QUERY_SG_ASSAYS = gql( +QUERY_DATASET_SGS = gql( """ - query DatasetData($datasetName: String!, $seqTypes: [String!]) { + query DatasetData($datasetName: String!, $seqTypes: [String!], $seqTechs: [String!]) { project(name: $datasetName) { - sequencingGroups(type: {in_: $seqTypes}, technology: {eq: "short-read"}) { + sequencingGroups(type: {in_: $seqTypes}, technology: {in_: $seqTechs}) { id type + technology + sample { + id + externalId + participant { + id + externalId + } + } assays { id meta + sample { + id + externalId + participant { + id + externalId + } + } } } } @@ -60,13 +80,13 @@ QUERY_SG_ANALYSES = gql( """ - query sgAnalyses($dataset: String!, $sgIds: [String!], $analysisTypes: [String!]) { - sequencingGroups(id: {in_: $sgIds}, project: {eq: $dataset}, technology: {eq: "short-read"}) { + query sgAnalyses($dataset: String!, $sgIds: [String!], $analysisTypes: [String!], $seqTechs: [String!]) { + sequencingGroups(id: {in_: $sgIds}, project: {eq: $dataset}) { id analyses(status: {eq: COMPLETED}, type: {in_: $analysisTypes}, project: {eq: $dataset}) { id meta - output + outputs type timestampCompleted } @@ -76,26 +96,97 @@ ) # Variable type definitions +AnalysisId = int AssayId = int +ParticipantId = int +ParticipantExternalId = str SampleId = str SampleExternalId = str -ParticipantExternalId = str +SequencingGroupId = str -AssayReportEntry = namedtuple( - 'AssayReportEntry', - 'sg_id assay_id assay_file_path analysis_id filesize', -) +class AuditReportEntry: + """Class to hold the data for an audit report entry""" -@cache -async def get_analysis_types(): - """Return the list of analysis types from the enum table.""" - logging.getLogger().setLevel(logging.WARN) - analysis_types = await query_async(ANALYSIS_TYPES_QUERY) - logging.getLogger().setLevel(logging.INFO) - return analysis_types['enum'][ # pylint: disable=unsubscriptable-object - 'analysisType' - ] + def __init__( + self, + file_path: str, + filesize: int, + sg_id: str | None = None, + assay_id: int | 
None = None, + cram_analysis_id: int | None = None, + cram_file_path: str | None = None, + sample_id: str | None = None, + sample_external_id: str | None = None, + participant_id: int | None = None, + participant_external_id: str | None = None, + ): + self.file_path = file_path + self.filesize = filesize + self.sg_id = sg_id + self.assay_id = assay_id + self.cram_analysis_id = cram_analysis_id + self.cram_file_path = cram_file_path + self.sample_id = sample_id + self.sample_external_id = sample_external_id + self.participant_id = participant_id + self.participant_external_id = participant_external_id + + +class ParticipantData: + """Class to hold the data for a participant""" + + def __init__( + self, + id: ParticipantId, + external_id: ParticipantExternalId, + ): + self.id = id + self.external_id = external_id + +class SampleData: + """Class to hold the data for a sample""" + + def __init__( + self, + id: SampleId, + external_id: SampleExternalId, + participant: ParticipantData, + ): + self.id = id + self.external_id = external_id + self.participant = participant + + +class AssayData: + """Class to hold the data for an assay""" + + def __init__( + self, + id: AssayId, + read_files_paths_sizes: list[tuple[str, int]], + sample: SampleData, + ): + self.id = id + self.read_files_paths_sizes = read_files_paths_sizes + self.sample = sample + +class SequencingGroupData: + """Class to hold the data for a sequencing group""" + + def __init__( + self, + id: str, + sequencing_type: str, + sequencing_technology: str, + sample: SampleData, + assays: list[AssayData], + ): + self.id = id + self.sequencing_type = sequencing_type + self.sequencing_technology = sequencing_technology + self.sample = sample + self.assays = assays class GenericAuditor(AuditHelper): @@ -106,87 +197,139 @@ def __init__( self, dataset: str, sequencing_types: list[str], + sequencing_technologies: list[str], file_types: tuple[str], default_analysis_type='cram', default_analysis_status='completed', ): - if not dataset: + # Initialize dataset + self.dataset = dataset or config_retrieve(['workflow', 'dataset']) + if not self.dataset: raise ValueError('Metamist dataset is required') - super().__init__(search_paths=None) + # Validate sequencing types + if sequencing_types == ('all',): + self.sequencing_types = self.all_sequencing_types + else: + invalid_types = [st for st in sequencing_types if st not in self.all_sequencing_types] + if invalid_types: + raise ValueError( + f'Input sequencing types "{invalid_types}" must be in the allowed types: {self.all_sequencing_types}' + ) + self.sequencing_types = sequencing_types + + # Validate file types + if file_types in (('all',), ('all_reads',)): + self.file_types = FILE_TYPES_MAP[file_types[0]] + else: + invalid_files = [ft for ft in file_types if ft not in FILE_TYPES_MAP] + if invalid_files: + raise ValueError( + f'Input file types "{invalid_files}" must be in the allowed types: {", ".join(FILE_TYPES_MAP.keys())}' + ) + self.file_types = file_types - self.dataset = dataset - self.sequencing_types = sequencing_types - self.file_types = file_types + # Set remaining attributes + self.sequencing_technologies = sequencing_technologies self.default_analysis_type: str = default_analysis_type self.default_analysis_status: str = default_analysis_status - requests_logger.setLevel(logging.WARNING) + # Calculate bucket name + self.bucket_name = dataset_path(dataset=self.dataset, category='upload') - async def get_participant_data_for_dataset(self) -> list[dict]: + super().__init__(search_paths=None) + 
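        # Hedged usage sketch (an assumption, not taken from this patch) of constructing the
        # refactored auditor; the literal values below are illustrative only. Sequencing types
        # are checked against the enum-backed sequencing types and file types against the
        # FILE_TYPES_MAP keys, so callers pass named groups such as 'fastq' or 'all_reads':
        #
        #   auditor = GenericAuditor(
        #       dataset='my-dataset',
        #       sequencing_types=('genome',),
        #       sequencing_technologies=('short-read',),
        #       file_types=('fastq',),
        #   )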
requests_logger.setLevel(logging.WARNING) + + async def get_sgs_for_dataset(self) -> list[SequencingGroupData]: """ - Uses a graphQL query to return all participants in a Metamist dataset. - Returned list includes all samples and assays associated with the participants. + Fetches all sequencing groups for the given dataset, including the assays for each sequencing group. + + Returns a list of SequencingGroupData objects. """ - - logging.getLogger().setLevel(logging.WARN) - participant_query_results = await query_async( - QUERY_PARTICIPANTS_SAMPLES_SGS, {'datasetName': self.dataset, 'seqTypes': self.sequencing_types} - ) - sg_assays_query_results = await query_async( - QUERY_SG_ASSAYS, {'datasetName': self.dataset, 'seqTypes': self.sequencing_types} + logger.info(f'{self.dataset} :: Fetching SG assays for {self.sequencing_types} sequencing types') + dataset_sgs_query_result = await query_async( + QUERY_DATASET_SGS, + {'datasetName': self.dataset, 'seqTypes': self.sequencing_types, 'seqTechs': self.sequencing_technologies}, ) - logging.getLogger().setLevel(logging.INFO) - - sg_assays = {} - for sg in sg_assays_query_results['project']['sequencingGroups']: - sg_id = sg['id'] - sg_assays[sg_id] = sg['assays'] + dataset_sgs = dataset_sgs_query_result['project']['sequencingGroups'] + + return [self.get_sg_data(sg) for sg in dataset_sgs] - participant_data = participant_query_results['project']['participants'] - - filtered_participants = [] - for participant in participant_data: - if not participant['samples']: - logging.info( - f'{self.dataset} :: Filtering participant {participant["id"]} ({participant["externalId"]}) as it has no samples.' + + def get_sg_data(self, sg: dict[str, Any]) -> SequencingGroupData: + """Parse a sequencing group dictionary into a SequencingGroupData object""" + return SequencingGroupData( + id=sg['id'], + sequencing_type=sg['type'], + sequencing_technology=sg['technology'], + sample=SampleData( + id=sg['sample']['id'], + external_id=sg['sample']['externalId'], + participant=ParticipantData( + id=sg['sample']['participant']['id'], + external_id=sg['sample']['participant']['externalId'], + ), + ), + assays=[ + self.parse_assay_data(assay) for assay in sg['assays'] + ], + ) + + + def parse_assay_data(self, assay: dict[str, Any]) -> AssayData: + """Parse an assay dictionary into an AssayData object""" + reads = assay['meta']['reads'] + if isinstance(assay['meta']['reads'], dict): + reads = [reads] + + reads_files_paths_sizes = [] + for read in reads: + reads_files_paths_sizes.append( + ( + read['location'], + read['size'], ) - continue - for sample in participant['samples']: - if not sample['sequencingGroups']: - logging.info( - f'{self.dataset} :: Filtering sample {sample["id"]} ({sample["externalId"]}) as it has no sequencing groups.' - ) - continue - for sg in sample['sequencingGroups']: - if sg['id'] not in sg_assays: - logging.info( - f'{self.dataset} :: Filtering SG {sg["id"]} as it has no assays.' 
+ ) + if 'secondaryFiles' in read: + for secondary_file in read['secondaryFiles']: + reads_files_paths_sizes.append( + ( + secondary_file['location'], + secondary_file['size'], ) - continue - sg['assays'] = sg_assays[sg['id']] - - filtered_participants.append(participant) - - return filtered_participants - - @staticmethod - def get_most_recent_analyses_by_sg( - analyses_list: list[dict[str, Any]], - ) -> dict[str, dict[str, Any]]: + ) + + return AssayData( + id=assay['id'], + read_files_paths_sizes=reads_files_paths_sizes, + sample=SampleData( + id=assay['sample']['id'], + external_id=assay['sample']['externalId'], + participant=ParticipantData( + id=assay['sample']['participant']['id'], + external_id=assay['sample']['participant']['externalId'], + ), + ), + ) + + + def get_latest_analyses_by_sg( + self, + all_sg_analyses: list[dict[str, Any]], + ) -> dict[SequencingGroupId, dict[str, Any]]: """ Takes a list of completed analyses for a number of sequencing groups and returns the latest completed analysis for each sequencing group, creating a 1:1 mapping of SG to analysis. """ - most_recent_analysis_by_sg = {} + latest_analysis_by_sg = {} - for sg_analyses in analyses_list: + for sg_analyses in all_sg_analyses: sg_id = sg_analyses['id'] analyses = sg_analyses['analyses'] if not analyses: continue if len(analyses) == 1: - most_recent_analysis_by_sg[sg_id] = analyses[0] + latest_analysis_by_sg[sg_id] = analyses[0] continue sorted_analyses = sorted( @@ -195,176 +338,93 @@ def get_most_recent_analyses_by_sg( x['timestampCompleted'], '%Y-%m-%dT%H:%M:%S' ), ) - most_recent_analysis_by_sg[sg_id] = sorted_analyses[-1] + latest_analysis_by_sg[sg_id] = sorted_analyses[-1] - return most_recent_analysis_by_sg + # Check the analysis meta data for the sequencing type + self.check_analyses_seq_type(list(latest_analysis_by_sg.values())) + + return latest_analysis_by_sg - @staticmethod - def map_internal_to_external_sample_ids( - participants: list[dict], - ) -> dict[SampleId, SampleExternalId]: - """ - Returns the {internal sample id : external sample id} mapping for all participants in a Metamist dataset - """ - return { - sample['id']: sample['externalId'] - for participant in participants - for sample in participant['samples'] - } - - def get_assay_map_from_participants( - self, participants: list[dict] - ) -> tuple[dict[str, str], dict[int, str], dict[Any, list[tuple[Any, Any]]]]: - """ - Input the list of Metamist participant dictionaries from the 'QUERY_PARTICIPANTS_SAMPLES_SGS_ASSAYS' query - - Returns the mappings: - 1. { sg_id : sample_id } - 2. { assay_id : sg_id } - 3. 
{ assay_id : (read_filepath, read_filesize,) } - """ - - sg_sample_id_map = {} - assay_sg_id_map = {} - assay_filepaths_filesizes = defaultdict(list) - sample_sgs = { - sample['id']: sample['sequencingGroups'] - for participant in participants - for sample in participant['samples'] - } - for sample_id, sgs in sample_sgs.items(): - for sg in sgs: - if sg['type'].lower() not in self.sequencing_types: - continue - - sg_sample_id_map[sg['id']] = sample_id - for assay in sg['assays']: - reads = assay['meta'].get('reads') - if not reads: - logging.warning( - f'{self.dataset} :: SG {sg["id"]} assay {assay["id"]} has no reads field' - ) - continue - - if assay_sg_id_map.get(assay['id']): - raise ValueError( - f'{self.dataset} :: Assay {assay["id"]} has multiple SGs: {assay_sg_id_map[assay["id"]]} and {sg["id"]}' - ) - assay_sg_id_map[assay['id']] = sg['id'] - - if isinstance(reads, dict): - assay_filepaths_filesizes[assay['id']].append( - ( - reads.get('location'), - reads.get('size'), - ) - ) - continue - - for read in reads: - if not isinstance(read, dict): - logging.error( - f'{self.dataset} :: Got {type(read)} read for SG {sg["id"]}, expected dict: {read}' - ) - continue - - assay_filepaths_filesizes[assay['id']].append( - ( - read.get('location'), - read.get('size'), - ) - ) + def check_analyses_seq_type( + self, + analyses: list[dict[str, Any]], + ): + """Check the analysis meta data for the sequencing type""" + analyses_with_missing_seq_type = [ + (analysis['id'], analysis['type']) + for analysis in analyses + if 'sequencing_type' not in analysis['meta'] + ] + if analyses_with_missing_seq_type: + raise ValueError( + f'{self.dataset} :: Analyses are missing sequencing_type field: {analyses_with_missing_seq_type}' + ) - return sg_sample_id_map, assay_sg_id_map, assay_filepaths_filesizes async def get_analysis_cram_paths_for_dataset_sgs( self, - assay_sg_id_map: dict[int, str], - ) -> dict[str, dict[int, str]]: + sequencing_groups: list[SequencingGroupData], + ) -> dict[SequencingGroupId, dict[AnalysisId, str]]: """ Fetches all CRAMs for the list of sgs in the given dataset. 
- Returns a dict mapping {sg_id : (analysis_id, cram_path) } + Returns a dict mapping {sg_id : (cram_analysis_id, cram_path) } """ - sg_ids = list(assay_sg_id_map.values()) + sg_ids = [sg.id for sg in sequencing_groups] logging.info(f'{self.dataset} :: Fetching CRAM analyses for {len(set(sg_ids))} SGs') - logging.getLogger().setLevel(logging.WARN) - analyses_query_result = await query_async( + sg_analyses_query_result = await query_async( QUERY_SG_ANALYSES, {'dataset': self.dataset, 'sgId': sg_ids, 'analysisTypes': ['CRAM']}, ) - logging.getLogger().setLevel(logging.INFO) - analyses = analyses_query_result['sequencingGroups'] - analyses = self.get_most_recent_analyses_by_sg(analyses_list=analyses) - - # Report any crams missing the sequencing type - crams_with_missing_seq_type = [ - analysis['id'] - for analysis in analyses.values() - if 'sequencing_type' not in analysis['meta'] - ] - if crams_with_missing_seq_type: - raise ValueError( - f'{self.dataset} :: CRAM analyses are missing sequencing_type field: {crams_with_missing_seq_type}' - ) + sg_analyses = sg_analyses_query_result['sequencingGroups'] + latest_analysis_by_sg = self.get_latest_analyses_by_sg(all_sg_analyses=sg_analyses) # For each sg id, collect the analysis id and cram paths sg_cram_paths: dict[str, dict[int, str]] = defaultdict(dict) - for sg_id, analysis in analyses.items(): - cram_path = analysis['output'] + for sg_id, analysis in latest_analysis_by_sg.items(): + cram_path = analysis['outputs']['path'] if not cram_path.startswith('gs://') or not cram_path.endswith('.cram'): logging.warning( f'Analysis {analysis["id"]} invalid output path: {analysis["output"]}' ) continue - sg_cram_paths[sg_id][analysis['id']] = analysis['output'] + sg_cram_paths[sg_id] = {analysis['id']: cram_path} return sg_cram_paths - async def analyses_for_sgs_without_crams(self, sgs_without_crams: list[str]): - """Checks if other completed analyses exist for samples without completed crams""" - - all_sg_analyses: dict[str, list[dict[str, int | str]]] = defaultdict(list) - logging.getLogger().setLevel(logging.WARN) + async def check_for_non_cram_analyses(self, sgs_without_crams: list[str]) -> None: + """Checks if other completed analyses exist for sequencing groups without a completed cram analysis""" sg_analyse_query_result = await query_async( QUERY_SG_ANALYSES, { 'dataset': self.dataset, 'sgIds': sgs_without_crams, - 'analysisTypes': [t for t in await get_analysis_types() if t != 'cram'], + 'analysisTypes': [t for t in self.all_analysis_types if t != 'cram'], }, ) - logging.getLogger().setLevel(logging.INFO) - sg_analyses = sg_analyse_query_result['sequencingGroups'] for sg_analysis in sg_analyses: sg_id = sg_analysis['id'] - + if not sg_analysis['analyses']: + continue + logging.warning( + f'{self.dataset} :: SG {sg_id} missing CRAM but has analyses:' + ) for analysis in sg_analysis['analyses']: - analysis_entry = { - 'analysis_id': analysis['id'], - 'analysis_type': analysis['type'], - 'analysis_output': analysis['output'], - 'timestamp_completed': analysis['timestampCompleted'], - } - all_sg_analyses[sg_id].append(analysis_entry) + logging.warning( + f'{analysis["id"]} - {analysis["type"]} - {analysis["outputs"].get("path")}' + ) - if all_sg_analyses: - for sg_without_cram, completed_analyses in all_sg_analyses.items(): - for completed_analysis in completed_analyses: - logging.warning( - f'{self.dataset} :: SG {sg_without_cram} missing CRAM but has analysis {completed_analysis}' - ) async def get_complete_and_incomplete_sgs( self, - 
assay_sg_id_map: dict[int, str], - sg_cram_paths: dict[str, dict[int, str]], + sequencing_groups: list[SequencingGroupData], + sg_cram_paths: dict[SequencingGroupId, dict[AnalysisId, str]], ) -> dict[str, Any]: """ Returns a dictionary containing two categories of sequencing groups: @@ -378,256 +438,357 @@ async def get_complete_and_incomplete_sgs( cram_paths.update(list(analyses.values())) # Check the analysis CRAM paths actually exist in the bucket - buckets_subdirs = self.get_gcs_bucket_subdirs_to_search(list(cram_paths)) - crams_in_bucket = self.find_files_in_gcs_buckets_subdirs( - buckets_subdirs, + buckets_prefixes = self.get_gcs_buckets_and_prefixes_from_paths(list(cram_paths)) + crams_in_bucket = self.find_files_in_gcs_buckets_prefixes( + buckets_prefixes, ('cram',), ) # Incomplete SGs initialised as the SGs without a completed CRAM - incomplete_sgs = set(assay_sg_id_map.values()).difference( + incomplete_sgs = set([sg.id for sg in sequencing_groups]).difference( set(sg_cram_paths.keys()) ) - # Completed SGs have a CRAM file in the bucket that matches the path in Metamist + # Completed SGs have a CRAM file in the bucket that matches the path in Metamist analysis record + # Incomplete SGs have a CRAM analysis record in Metamist but are not found at that path in the bucket completed_sgs = {} for sg_id, analysis in sg_cram_paths.items(): - for analysis_id, cram_path in analysis.items(): + for cram_analysis_id, cram_path in analysis.items(): if cram_path in crams_in_bucket: - completed_sgs[sg_id] = analysis_id - continue - incomplete_sgs.update(sg_id) + completed_sgs[sg_id] = cram_analysis_id + else: + logging.warning( + f'{self.dataset} :: SG {sg_id} has CRAM analysis: {cram_analysis_id} - but file not found at path: {cram_path}' + ) + incomplete_sgs.update(sg_id) if incomplete_sgs: logging.warning( f'{self.dataset} :: {len(incomplete_sgs)} SGs without CRAMs found: {list(incomplete_sgs)}' ) - await self.analyses_for_sgs_without_crams(list(incomplete_sgs)) + logging.warning('Checking if any other analyses exist for these SGs, which would be unexpected...') + await self.check_for_non_cram_analyses(list(incomplete_sgs)) return {'complete': completed_sgs, 'incomplete': list(incomplete_sgs)} + async def check_for_uningested_or_moved_assays( # pylint: disable=R0914 self, bucket_name: str, - assay_filepaths_filesizes: dict[int, list[tuple[str, int]]], - completed_sgs: dict[str, list[int]], - sg_sample_id_map: dict[str, str], - assay_sg_id_map: dict[int, str], - sample_internal_external_id_map: dict[str, str], - ): + sequencing_groups: list[SequencingGroupData], + completed_sgs: dict[SequencingGroupId, list[AnalysisId]], + assay_id_to_paths_and_sizes: dict[AssayId, list[tuple[str, int]]], + ) -> tuple[list[AuditReportEntry], list[AuditReportEntry], set[str]]: """ Compares the assays read files in a Metamist dataset to the read files found in the - main-upload bucket. - - Input: The upload bucket name, the {assay_id : (read_path, read_size)} mapping, - the completed SGs, the { SG_ID : sample_ID } mapping, the { assay_id : SG_ID } mapping, - and the { sample_id : sample_external_id } mapping. - - Returns: 1. Paths to assays that have not yet been ingested - checks if any completed - sample external IDs are in the filename and includes this in output. - 2. A dict mapping assay IDs to GS filepaths that have been ingested, - but where the path in the bucket is different to the path for this - assay in Metamist. 
If the filenames and file sizes match, - these are identified as assay files that have been moved from - their original location to a new location. - 3. The assay read files that have been deleted/moved. + upload bucket. + + Input: + - bucket_name: The name of the GCS bucket to check + - sequencing_groups: A list of SequencingGroupData objects + - completed_sgs: A dict mapping sg_ids to analysis_ids for completed CRAM analyses + - assay_id_to_paths_and_sizes: A dict mapping assay IDs to lists of tuples of read file paths and sizes + + Returns: 1. A list of audit report records for reads that have not been ingested, + but where a known sample ID exists in the read file path. + 2. A list of audit report records for reads that have been ingested, + but have been moved to a different location in the bucket. + 3. A set of string paths to the assay read files that have been + deleted/moved from their original location in Metamist. """ - # Get all the paths to assay data anywhere in the main-upload bucket - assay_paths_in_bucket = self.find_assay_files_in_gcs_bucket( - bucket_name, self.file_types + # Get a list of all the paths and sizes of assay files recorded in Metamist + metamist_assay_paths_sizes: list[tuple[str, int]] = [ + path_size for assay in assay_id_to_paths_and_sizes.values() for path_size in assay + ] + metamist_assay_paths = set( + [path for path, _ in metamist_assay_paths_sizes] ) - # Flatten all the Metamist assay file paths and sizes into a single list - assay_paths_sizes_in_metamist: list[tuple[str, int]] = [] - for assay in assay_filepaths_filesizes.values(): - assay_paths_sizes_in_metamist.extend(assay) - - # Find the paths that exist in the bucket and not in metamist - assay_paths_in_metamist = [ - path_size[0] for path_size in assay_paths_sizes_in_metamist - ] - uningested_assay_paths = set(assay_paths_in_bucket).difference( - set(assay_paths_in_metamist) + # Get a list of all the paths and sizes of assay files anywhere in the upload bucket + bucket_assay_paths_sizes = self.find_assay_files_in_gcs_bucket( + bucket_name, self.file_types + ) + bucket_assay_paths = set(bucket_assay_paths_sizes.keys()) + + # Find the paths that exist in the bucket and not in Metamist + uningested_assay_paths = set(bucket_assay_paths).difference( + set(metamist_assay_paths) ) - # Find the paths that exist in metamist and not in the bucket - metamist_paths_to_nowhere = set(assay_paths_in_metamist).difference( - set(assay_paths_in_bucket) + # Find the paths that exist in Metamist and not in the bucket + metamist_paths_to_nowhere = set(metamist_assay_paths).difference( + set(bucket_assay_paths) ) # Strip the metamist paths into just filenames # Map each file name to its file size and path + # This is used to identify if any files have been moved metamist_assay_file_size_map = { - os.path.basename(path_size[0]): path_size[1] - for path_size in assay_paths_sizes_in_metamist - } - metamist_assay_file_path_map = { - os.path.basename(path_size[0]): path_size[0] - for path_size in assay_paths_sizes_in_metamist + os.path.basename(path): {'size': size, 'path': path} + for path, size in metamist_assay_paths_sizes } - # Identify if any paths are to files that have actually just been moved - # by checking if they are in the bucket but not metamist + # Check if any of the uningested paths are actually files that have been moved + ingested_reads_that_were_moved = self.check_if_assay_files_were_moved( + uningested_assay_paths, + metamist_assay_file_size_map, + bucket_assay_paths_sizes, + ) + + # Check if any of 
the uningested paths contain sample IDs for completed SGs + uningested_reads = self.check_uningested_assays_for_sample_ids( + sequencing_groups, + uningested_assay_paths, + bucket_assay_paths_sizes, + completed_sgs, + ) + + return uningested_reads, ingested_reads_that_were_moved, metamist_paths_to_nowhere + + + def check_if_assay_files_were_moved( + self, + sequencing_groups: list[SequencingGroupData], + completed_sgs: dict[str, list[int]], + uningested_assay_paths: set[str], + assay_id_to_paths_and_sizes: dict[int, list[tuple[str, int]]], + metamist_assay_paths_sizes: dict[str, dict[str, Any]], + bucket_assay_paths_sizes: dict[str, int], + ) -> list[AuditReportEntry]: + """ + Identify if any paths are to files that have actually just been moved + by checking if they are in the bucket but not Metamist. If they are, + check if the file size is the same as the file in Metamist. If so, + assume the file has been moved and add it to the list of ingested and moved + files. + + Returns a tuple of two lists, the first containing the paths of ingested and moved files, + the second containing the assay report data for these files + """ ingested_and_moved_filepaths = [] new_assay_path_sizes = {} - for path in uningested_assay_paths: - filename = os.path.basename(path) + for bucket_path in uningested_assay_paths: + filename = os.path.basename(bucket_path) # If the file in the bucket has the exact same name and size as one in metamist, assume its the same - if filename in metamist_assay_file_size_map.keys(): - filesize = await self.file_size(path) - if filesize == metamist_assay_file_size_map.get(filename): + if filename in metamist_assay_paths_sizes: + metamist_file_path = metamist_assay_paths_sizes[filename]['path'] + metamist_file_size = metamist_assay_paths_sizes[filename]['size'] + bucket_file_size = bucket_assay_paths_sizes[bucket_path] + if bucket_file_size == metamist_file_size: ingested_and_moved_filepaths.append( - (path, metamist_assay_file_path_map.get(filename)) + { + 'bucket_path': bucket_path, + 'metamist_path': metamist_file_path, + 'size': bucket_file_size, + } + ) + new_assay_path_sizes[bucket_path] = bucket_file_size + else: + logging.warning( + f'Uningested file at {bucket_path} ({bucket_file_size}) is similar to file in Metamist: {metamist_file_path} ({metamist_file_size}) but has different size' ) - new_assay_path_sizes[path] = filesize logging.info( f'Found {len(ingested_and_moved_filepaths)} ingested files that have been moved' ) - + # If the file has just been moved, we consider it ingested - uningested_assay_paths -= { - bucket_path for bucket_path, _ in ingested_and_moved_filepaths - } - - # Check the list of uningested paths to see if any of them contain sample ids for ingested samples - # This could happen when we ingest a fastq read pair for a sample, and additional read files were provided - # but not ingested, such as bams and vcfs. 
- uningested_reads: dict[str, list[tuple[str, str, str]]] = defaultdict( - list, {k: [] for k in uningested_assay_paths} + uningested_assay_paths.remove( + {bucket_path for bucket_path, _ in ingested_and_moved_filepaths} ) + + # flip the assay id : reads mapping to identify assays by their reads + read_file_path_to_assay_id = {} + for assay_id, reads_sizes in assay_id_to_paths_and_sizes.items(): + for read, _ in reads_sizes: + read_file_path_to_assay_id[read] = assay_id + + assay_sg_id_map = {assay.id: sg.id for sg in sequencing_groups for assay in sg.assays} + + assays_moved_paths = [] + for ingested_and_moved_path in ingested_and_moved_filepaths: + + assay_id = read_file_path_to_assay_id.get(ingested_and_moved_path['metamist_path']) + + sg_id = assay_sg_id_map.get(assay_id) + cram_analysis_id = completed_sgs.get(sg_id)[0] if sg_id in completed_sgs else None + + if sg_id in self.excluded_sequencing_groups or not cram_analysis_id: + continue + + sg = self.get_sequencing_group_data_by_id(sg_id, sequencing_groups) + if not sg: + continue + + assays_moved_paths.append( + AuditReportEntry( + file_path=ingested_and_moved_path['bucket_path'], + filesize=ingested_and_moved_path['filesize'], + sg_id=sg_id, + assay_id=assay_id, + cram_analysis_id=cram_analysis_id, + sample_id=sg.sample.id, + sample_external_id=sg.sample.external_id, + participant_id=sg.sample.participant.id, + participant_external_id=sg.sample.participant.external_id, + ) + ) + + return assays_moved_paths + + def get_sequencing_group_data_by_id( + self, + sg_id: str, + sequencing_groups: list[SequencingGroupData], + ): + """Get the sequencing group data for a given sg_id""" + for sg in sequencing_groups: + if sg.id == sg_id: + return sg + return None + + + def check_uningested_assays_for_sample_ids( + self, + sequencing_groups: list[SequencingGroupData], + uningested_assay_paths: set[str], + bucket_assay_paths_sizes: dict[str, int], + completed_sgs: dict[SequencingGroupId, list[AnalysisId]], + ) -> list[AuditReportEntry]: + """ + Combs through the list of uningested assay paths to see if any of them contain sample ids for completed SGs. + Can happen when we ingest a fastq read pair for a sample, and additional read files were provided (e.g. bams, vcfs). + If there are extra files for a completed SG, we should either ingest them or delete them. + """ + sg_sample_map = {sg.id: sg.sample for sg in sequencing_groups} + uningested_reads = [] for sg_id, analysis_ids in completed_sgs.items(): try: - sample_id = sg_sample_id_map[sg_id] - sample_ext_id = sample_internal_external_id_map[sample_id] - for uningested_assay in uningested_assay_paths: - if sample_ext_id not in uningested_assay: + sample = sg_sample_map[sg_id] + for uningested_read_file in uningested_assay_paths: + if sample.external_id not in uningested_read_file or sample.participant.external_id not in uningested_read_file: continue - uningested_reads[uningested_assay].append( - (sg_id, sample_id, sample_ext_id) + uningested_reads.append( + AuditReportEntry( + file_path=uningested_read_file, + filesize=bucket_assay_paths_sizes[uningested_read_file], + sg_id=sg_id, + cram_analysis_id=analysis_ids[0], + sample_id=sample.id, + sample_external_id=sample.external_id, + participant_id=sample.participant.id, + participant_external_id=sample.participant.external_id, + ) ) except KeyError: logging.warning( f'{sg_id} from analyses: {analysis_ids} not found in SG-sample map.' 
                )

-        # flip the assay id : reads mapping to identify assays by their reads
-        reads_assays = {}
-        for assay_id, reads_sizes in assay_filepaths_filesizes.items():
-            for read_size in reads_sizes:
-                reads_assays[read_size[0]] = assay_id
-
-        # Collect the assays for files that have been ingested and moved to a different bucket location
-        assays_moved_paths = []
-        for bucket_path, metamist_path in ingested_and_moved_filepaths:
-            assay_id = reads_assays.get(metamist_path)
-            sg_id = assay_sg_id_map.get(assay_id)
-            analysis_id = completed_sgs.get(sg_id)
-            if sg_id not in AuditHelper.EXCLUDED_SGS and analysis_id:
-                filesize = new_assay_path_sizes[bucket_path]
-                assays_moved_paths.append(
-                    AssayReportEntry(
-                        sg_id=sg_id,
-                        assay_id=assay_id,
-                        assay_file_path=bucket_path,
-                        analysis_id=analysis_id,
-                        filesize=filesize,
-                    )
-                )
-
-        return uningested_reads, assays_moved_paths, metamist_paths_to_nowhere
-
+
+        return uningested_reads
+
+
     async def get_reads_to_delete_or_ingest(
         self,
         bucket_name: str,
-        completed_sgs: dict[str, list[int]],
-        assay_filepaths_filesizes: dict[int, list[tuple[str, int]]],
-        sg_sample_id_map: dict[str, str],
-        assay_sg_id_map: dict[int, str],
-        sample_internal_external_id_map: dict[str, str],
-    ) -> tuple[list, list]:
+        sequencing_groups: list[SequencingGroupData],
+        completed_sgs: dict[SequencingGroupId, list[AnalysisId]],
+        assay_id_to_paths_and_sizes: dict[AssayId, list[tuple[str, int]]],
+    ) -> tuple[list[AuditReportEntry], list[AuditReportEntry]]:
         """
-        Inputs: 1. List of samples which have completed CRAMs
-                2. Dictionary mapping of assay IDs to assay file paths and file sizes
-                3. Dictionary mapping sample IDs to assay IDs
-                4. Dictionary mapping assay IDs to sample IDs
-        Returns a tuple of two lists, each containing assays IDs and assay file paths.
+        Inputs:
+            - bucket_name: The name of the GCS bucket to check
+            - sequencing_groups: A list of SequencingGroupData objects
+            - completed_sgs: A dict mapping sg_ids to analysis_ids for completed CRAM analyses
+            - assay_id_to_paths_and_sizes: A dict mapping assay IDs to lists of tuples of read file paths and sizes
+
+        Returns two lists, each containing AuditReportEntry objects. The first contains reads which can be
+        deleted, the second contains reads to ingest. The sample id, assay id, and analysis id (of the
+        completed cram) are included in the delete list.
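
        A minimal usage sketch for orientation (the auditor instance, SG and analysis IDs, sizes and paths
        below are hypothetical, not taken from this patch):

            completed_sgs = {'CPGXXXX': [1234]}  # SG id -> completed CRAM analysis ids
            assay_id_to_paths_and_sizes = {
                5678: [('gs://cpg-dataset-upload/sampleA_R1.fastq.gz', 1024)],
            }
            reads_to_delete, reads_to_ingest = await auditor.get_reads_to_delete_or_ingest(
                bucket_name='cpg-dataset-upload',
                sequencing_groups=sequencing_groups,
                completed_sgs=completed_sgs,
                assay_id_to_paths_and_sizes=assay_id_to_paths_and_sizes,
            )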
""" # Check for uningested assay data that may be hiding or assay data that has been moved ( reads_to_ingest, - moved_assays_to_delete, + moved_assay_report_entries, metamist_paths_to_nowhere, ) = await self.check_for_uningested_or_moved_assays( bucket_name, - assay_filepaths_filesizes, + sequencing_groups, completed_sgs, - sg_sample_id_map, - assay_sg_id_map, - sample_internal_external_id_map, + assay_id_to_paths_and_sizes, ) - # Create a mapping of sg id: assay id - use defaultdict in case a sg has several assays - sg_assays_id_map: dict[str, list[int]] = defaultdict(list) - for assay_id, sg_id in assay_sg_id_map.items(): - sg_assays_id_map[sg_id].append(assay_id) - - assay_reads_to_delete = [] - for sg_id, analysis_id in completed_sgs.items(): - if sg_id in AuditHelper.EXCLUDED_SGS: + # Create a mapping of sg id: assay ids + sg_assays_id_map = {sg.id: [assay.id for assay in sg.assays] for sg in sequencing_groups} + + # Create a list of assay report entries for the moved assays + assay_reads_to_delete: list[AuditReportEntry] = [] + for sg_id, cram_analysis_id in completed_sgs.items(): + if sg_id in self.excluded_sequencing_groups: continue + sg = self.get_sequencing_group_data_by_id(sg_id, sequencing_groups) assay_ids = sg_assays_id_map[sg_id] for assay_id in assay_ids: - assay_read_paths = assay_filepaths_filesizes[assay_id] + assay_read_paths = assay_id_to_paths_and_sizes[assay_id] for path, size in assay_read_paths: - if path in metamist_paths_to_nowhere: + if path in metamist_paths_to_nowhere: # Already deleted continue - filesize = size + assay_reads_to_delete.append( - AssayReportEntry( + AuditReportEntry( + file_path=path, + filesize=size, sg_id=sg_id, assay_id=assay_id, - assay_file_path=path, - analysis_id=analysis_id, - filesize=filesize, + cram_analysis_id=cram_analysis_id, + sample_id=sg.sample.id, + sample_external_id=sg.sample.external_id, + participant_id=sg.sample.participant.id, + participant_external_id=sg.sample.participant.external_id, ) ) - reads_to_delete = assay_reads_to_delete + moved_assays_to_delete + reads_to_delete = assay_reads_to_delete + moved_assay_report_entries return reads_to_delete, reads_to_ingest + @staticmethod def find_crams_for_reads_to_ingest( - reads_to_ingest: dict[str, list], + reads_to_ingest: list[AuditReportEntry], sg_cram_paths: dict[str, dict[int, str]], - ) -> list[tuple[str, str, str, str, int, str]]: + ) -> list[AuditReportEntry]: """ - Compares the external sample IDs for samples with completed CRAMs against the + Compares the external sample IDs for SGs with completed CRAMs against the uningested read files. This may turn up results for cases where multiple read types have been provided for a sample, but only one type was ingested and used for alignment. 
""" - possible_assay_ingests = [] - for assay_path, sample_tuples in reads_to_ingest.items(): - if not sample_tuples: - # If no samples detected in filename, add the path in an empty tuple - possible_assay_ingests.append((assay_path, '', '', '', 0, '')) - continue - for sample_tuple in sample_tuples: - # Else get the completed CRAM analysis id and path for the sample - sg_id, sample_id, sample_external_id = sample_tuple - sg_cram = sg_cram_paths[sg_id] - analysis_id = int(list(sg_cram.keys())[0]) - cram_path = sg_cram[analysis_id] + possible_assay_ingests: list[AuditReportEntry] = [] + for read_to_ingest in reads_to_ingest: + if not read_to_ingest.sample_id: + # If no sample id was detected in the filename, add the path with no further checks possible_assay_ingests.append( - ( - assay_path, - sg_id, - sample_id, - sample_external_id, - analysis_id, - cram_path, + AuditReportEntry( + file_path=read_to_ingest.file_path, + filesize=read_to_ingest.filesize, ) ) + continue + + # Else get the completed CRAM analysis id + sg_cram = sg_cram_paths[read_to_ingest.sg_id] + cram_path = sg_cram[read_to_ingest.cram_analysis_id] + possible_assay_ingests.append( + AuditReportEntry( + file_path=read_to_ingest.file_path, + filesize=read_to_ingest.filesize, + sg_id=read_to_ingest.sg_id, + assay_id=read_to_ingest.assay_id, + cram_analysis_id=read_to_ingest.cram_analysis_id, + cram_file_path=cram_path, + sample_id=read_to_ingest.sample_id, + sample_external_id=read_to_ingest.sample_external_id, + participant_id=read_to_ingest.participant_id, + participant_external_id=read_to_ingest.participant_external_id, + ) + ) return possible_assay_ingests From 9ee102903bca9faa5ea78c50a5b190bbf8584614 Mon Sep 17 00:00:00 2001 From: EddieLF Date: Tue, 7 Jan 2025 17:09:22 +1100 Subject: [PATCH 06/10] Improve enum GQL queries, fix AuditHelper super init --- metamist/audit/audithelper.py | 95 ++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 45 deletions(-) diff --git a/metamist/audit/audithelper.py b/metamist/audit/audithelper.py index 3f0f0f1f8..e5e1566d0 100644 --- a/metamist/audit/audithelper.py +++ b/metamist/audit/audithelper.py @@ -19,12 +19,14 @@ READ_EXTENSIONS = FASTQ_EXTENSIONS + BAM_EXTENSIONS + CRAM_EXTENSIONS GVCF_EXTENSIONS = ('.g.vcf.gz',) VCF_EXTENSIONS = ('.vcf', '.vcf.gz') +ARCHIVE_EXTENSIONS = ('.tar', '.tar.gz', '.zip') ALL_EXTENSIONS = ( FASTQ_EXTENSIONS + BAM_EXTENSIONS + CRAM_EXTENSIONS + GVCF_EXTENSIONS + VCF_EXTENSIONS + + ARCHIVE_EXTENSIONS ) FILE_TYPES_MAP = { @@ -34,44 +36,36 @@ 'all_reads': READ_EXTENSIONS, 'gvcf': GVCF_EXTENSIONS, 'vcf': VCF_EXTENSIONS, + 'archive': ARCHIVE_EXTENSIONS, 'all': ALL_EXTENSIONS, } HAIL_EXTENSIONS = ['.ht', '.mt', '.vds'] -ANALYSIS_TYPES_QUERY = gql( +ENUMS_QUERY = gql( """ query analysisTypes { enum { analysisType - } - } - """ -) - -SEQUENCING_TYPES_QUERY = gql( - """ - query seqTypes { - enum { sequencingType } } """ ) -def get_analysis_types(): - """Return the list of analysis types from the enum table.""" - analysis_types_query_result = query(ANALYSIS_TYPES_QUERY) - return analysis_types_query_result['enum']['analysisType'] -def get_sequencing_types(): - """Return the list of sequencing types from the enum table.""" - sequencing_types_query_result: dict[str, dict[str, list[str]]] = query(SEQUENCING_TYPES_QUERY) - return sequencing_types_query_result['enum']['sequencingType'] +def get_enums(enum_name) -> list[str]: + """ + Return the list of allowed values of a particular enum type from the enum table. 
+ Used to get the list of allowed analysis types and sequencing types. + """ + enums_query_result = query(ENUMS_QUERY) + return enums_query_result['enum'][enum_name] class AuditHelper(CloudHelper): """General helper class for bucket auditing""" + def __init__( self, gcp_project: str, @@ -79,22 +73,28 @@ def __init__( all_sequencing_types: list[str] = None, excluded_sequencing_groups: list[str] = None, ): - # Initialize GCP project - self.gcp_project = gcp_project or get_gcp_project() or config_retrieve(['workflow', 'gcp_project']) + super().__init__(search_paths=None) + + # GCP project is used for GCP calls, which the auditor does a lot of + self.gcp_project = ( + gcp_project + or get_gcp_project() + or config_retrieve(['workflow', 'gcp_project']) + ) if not self.gcp_project: raise ValueError('GCP project is required') - - self.all_analysis_types = all_analysis_types or get_analysis_types() - self.all_sequencing_types = all_sequencing_types or get_sequencing_types() - - self.excluded_sequencing_groups = excluded_sequencing_groups or config_retrieve(['workflow', 'audit', 'excluded_sequencing_groups']) - - super().__init__( - gcp_project=self.gcp_project, + + self.all_analysis_types = all_analysis_types or get_enums('analysisType') + self.all_sequencing_types = all_sequencing_types or get_enums('sequencingType') + + self.excluded_sequencing_groups = excluded_sequencing_groups or config_retrieve( + ['workflow', 'audit', 'excluded_sequencing_groups'] ) @staticmethod - def get_gcs_buckets_and_prefixes_from_paths(paths: list[str]) -> defaultdict[str, list]: + def get_gcs_buckets_and_prefixes_from_paths( + paths: list[str], + ) -> defaultdict[str, list]: """ Takes a list of paths and extracts the bucket names and prefix, returning all unique pairs of buckets and prefixes. Does not make any calls to GCS. @@ -107,7 +107,9 @@ def get_gcs_buckets_and_prefixes_from_paths(paths: list[str]) -> defaultdict[str logging.warning(f'{path} invalid') continue bucket = pc['bucket'] - prefix = pc['suffix'] # This is the prefix (i.e. the "subdirectory" in the bucket) + prefix = pc[ + 'suffix' + ] # This is the prefix (i.e. 
the "subdirectory" in the bucket) if prefix and prefix not in buckets_prefixes[bucket]: buckets_prefixes[bucket].append(prefix) @@ -118,11 +120,9 @@ def get_all_files_in_gcs_bucket_with_prefix_and_extensions( ): """Iterate through a gcp bucket/prefix and get all the blobs with the specified file extension(s)""" bucket = self.gcs_client.bucket(bucket_name, user_project=self.user_project) - + files_in_bucket_prefix = [] - for blob in self.gcs_client.list_blobs( - bucket, prefix=prefix, delimiter='/' - ): + for blob in self.gcs_client.list_blobs(bucket, prefix=prefix, delimiter='/'): # Check if file ends with specified analysis type if not blob.name.endswith(file_extension): continue @@ -145,7 +145,9 @@ def find_files_in_gcs_buckets_prefixes( if any(hl_extension in prefix for hl_extension in HAIL_EXTENSIONS): continue files_in_bucket.extend( - self.get_all_files_in_gcs_bucket_with_prefix_and_extensions(bucket_name, prefix, file_types) + self.get_all_files_in_gcs_bucket_with_prefix_and_extensions( + bucket_name, prefix, file_types + ) ) return files_in_bucket @@ -165,7 +167,7 @@ def find_assay_files_in_gcs_bucket( 'Call to list_blobs without prefix only valid for upload buckets' ) bucket = self.gcs_client.bucket(bucket_name, user_project=self.user_project) - + assay_paths_sizes = {} for blob in self.gcs_client.list_blobs(bucket, prefix=''): if not blob.name.endswith(file_extensions): @@ -174,7 +176,7 @@ def find_assay_files_in_gcs_bucket( assay_paths_sizes[blob.name] = blob.size return assay_paths_sizes - + def get_audit_report_prefix( self, seq_types: str, @@ -185,16 +187,15 @@ def get_audit_report_prefix( sequencing_types_str = 'all_seq_types' else: sequencing_types_str = ('_').join(self.sequencing_types) + '_seq_types' - + if set(file_types) == set(ALL_EXTENSIONS): file_types_str = 'all_file_types' elif set(file_types) == set(READ_EXTENSIONS): file_types_str = 'all_reads_file_types' else: file_types_str = ('_').join(self.file_types) + '_file_types' - - return f'{file_types_str}_{sequencing_types_str}' + return f'{file_types_str}_{sequencing_types_str}' def write_report_to_cloud( self, @@ -218,7 +219,7 @@ def write_report_to_cloud( if not data_to_write: logging.info('No data to write to report') return - + logging.info(f'Writing report to gs://{bucket_name}/{blob_path}') # Create a string buffer to hold the data @@ -230,13 +231,15 @@ def write_report_to_cloud( content_type = 'text/tab-separated-values' else: raise ValueError('Blob path must end with either .csv or .tsv') - + buffer = StringIO() - writer = csv.DictWriter(buffer, fieldnames=data_to_write[0].keys(), delimiter=delimiter) + writer = csv.DictWriter( + buffer, fieldnames=data_to_write[0].keys(), delimiter=delimiter + ) writer.writeheader() writer.writerows(data_to_write) - + storage_client = storage.Client() bucket = storage_client.bucket(bucket_name, user_project=self.user_project) blob = bucket.blob(blob_path) @@ -248,5 +251,7 @@ def write_report_to_cloud( ) buffer.close() - logging.info(f'Wrote {len(data_to_write)} lines to gs://{bucket_name}/{blob_path}') + logging.info( + f'Wrote {len(data_to_write)} lines to gs://{bucket_name}/{blob_path}' + ) return From 9d076f573308b77aa74eea625ac3ce251ef0648a Mon Sep 17 00:00:00 2001 From: EddieLF Date: Wed, 15 Jan 2025 14:04:56 +1100 Subject: [PATCH 07/10] Refactor generic auditor and audit helper --- metamist/audit/audit_upload_bucket.py | 160 ++--- metamist/audit/audithelper.py | 186 ++++- metamist/audit/generic_auditor.py | 935 +++++++++++++------------- metamist/parser/cloudhelper.py 
| 23 + 4 files changed, 716 insertions(+), 588 deletions(-) diff --git a/metamist/audit/audit_upload_bucket.py b/metamist/audit/audit_upload_bucket.py index 7c1035881..4ff7cae25 100644 --- a/metamist/audit/audit_upload_bucket.py +++ b/metamist/audit/audit_upload_bucket.py @@ -11,18 +11,18 @@ import click -from cpg_utils.config import config_retrieve, dataset_path - -from metamist.audit.audithelper import FILE_TYPES_MAP, get_sequencing_types -from metamist.audit.generic_auditor import GenericAuditor, SequencingGroupData +from metamist.audit.generic_auditor import ( + GenericAuditor, + SequencingGroupData, + AuditReportEntry, +) def audit_upload_bucket( dataset: str, sequencing_types: list[str], + sequencing_technologies: list[str], file_types: list[str], - default_analysis_type: str, - default_analysis_status: str, ): """Entrypoint for running upload bucket auditor asynchronously.""" asyncio.get_event_loop().run_until_complete( @@ -30,9 +30,8 @@ def audit_upload_bucket( audit_upload_bucket_async( dataset, sequencing_types, + sequencing_technologies, file_types, - default_analysis_type, - default_analysis_status, ), timeout=60, ) @@ -46,6 +45,7 @@ def __init__( self, dataset: str, sequencing_types: list[str], + sequencing_technologies: list[str], file_types: tuple[str], default_analysis_type='cram', default_analysis_status='completed', @@ -53,24 +53,36 @@ def __init__( super().__init__( dataset=dataset, sequencing_types=sequencing_types, + sequencing_technologies=sequencing_technologies, file_types=file_types, default_analysis_type=default_analysis_type, default_analysis_status=default_analysis_status, ) + async def write_unaligned_sgs_report( + self, + unaligned_sgs: list[SequencingGroupData], + report_extension: str = 'tsv', + ): + """Writes a report of the unaligned sequencing groups to the bucket""" + today = datetime.today().strftime('%Y-%m-%d') + report_prefix = self.get_audit_report_prefix( + seq_types=self.sequencing_types, file_types=self.file_types + ) + self.write_report_to_cloud( + data_to_write=self.get_audit_report_records_from_sgs(unaligned_sgs), + bucket_name=self.bucket_name, + blob_path=f'audit_results/{today}/{report_prefix}_unaligned_sgs.{report_extension}', + ) + async def write_upload_bucket_audit_reports( self, bucket_name: str, - # audit_report_assay_files_to_delete, - # audit_report_assay_files_to_ingest, - # audit_report_unaligned_sgs, - audit_reports: dict[str, list[dict[str, str]]], + audit_reports: dict[str, list[AuditReportEntry]], report_extension: str = 'tsv', ): """ - Writes the 'assay files to delete/ingest' csv reports and upload them to the bucket. - Also writes a report for any assay files found that match existing samples and may - require ingestion. + Writes the 'assay files to delete/ingest' reports and upload them to the bucket. The report names include the file types, sequencing types, and date of the audit. """ @@ -89,9 +101,8 @@ async def write_upload_bucket_audit_reports( async def audit_upload_bucket_async( dataset: str, sequencing_types: list[str], + sequencing_technologies: list[str], file_types: list[str], - default_analysis_type: str, - default_analysis_status: str, ): """ Finds sequence files for samples with completed CRAMs and adds these to a csv for deletion. 
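
For orientation, a hedged sketch of calling the refactored entrypoint programmatically; the dataset name and
the sequencing type, technology and file type values below are illustrative only:

    audit_upload_bucket(
        dataset='my-dataset',
        sequencing_types=['genome'],
        sequencing_technologies=['short-read'],
        file_types=['fastq'],
    )
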
@@ -102,106 +113,44 @@ async def audit_upload_bucket_async( dataset: The name of the dataset to audit sequencing_types: The list of sequencing types to audit file_types: The list of file types to audit - default_analysis_type: The default analysis type to audit - default_analysis_status: The default analysis status to audit """ - # Initialise the auditor auditor = UploadBucketAuditor( dataset=dataset, sequencing_types=sequencing_types, + sequencing_technologies=sequencing_technologies, file_types=file_types, - default_analysis_type=default_analysis_type, - default_analysis_status=default_analysis_status, ) + sequencing_groups: list[SequencingGroupData] = await auditor.get_sgs_for_dataset() - # Validate user inputs - allowed_sequencing_types = get_sequencing_types() - if sequencing_types != ('all',) and any( - st not in allowed_sequencing_types for st in sequencing_types - ): - raise ValueError( - f'Input sequencing types "{sequencing_types}" must be in the allowed types: {allowed_sequencing_types}' - ) - if sequencing_types == ('all',): - sequencing_types = allowed_sequencing_types - - if file_types not in (('all',), ('all_reads',)): - if any(ft not in FILE_TYPES_MAP for ft in file_types): - raise ValueError( - f'Input file types "{file_types}" must be in the allowed types {(", ").join(list(FILE_TYPES_MAP.keys()))}' - ) - else: - file_types = FILE_TYPES_MAP[file_types[0]] - - if not dataset: - dataset = config_retrieve(['workflow', 'dataset']) - bucket_name = dataset_path(dataset=dataset, category='upload') + # Update the sequencing groups with their completed analysis crams + await auditor.update_sequencing_groups_with_crams(sequencing_groups) - - - # participant_data = await auditor.get_participant_data_for_dataset() - # sample_internal_external_id_map = auditor.map_internal_to_external_sample_ids( - # participant_data - # ) - # ( - # sg_sample_id_map, - # assay_sg_id_map, - # assay_filepaths_filesizes, - # ) = auditor.get_assay_map_from_participants(participant_data) - - sequencing_groups: list[SequencingGroupData] = await auditor.get_sg_assays_for_dataset() - - # Get all completed cram output paths for the samples in the dataset and validate them - sg_cram_paths = await auditor.get_analysis_cram_paths_for_dataset_sgs( + # Identify sgs with and without completed crams + sg_completion: dict[str, list[SequencingGroupData]] = await auditor.check_sg_crams( sequencing_groups ) - # Identify sgs with and without completed crams - sg_completion = await auditor.get_complete_and_incomplete_sgs( - sequencing_groups, sg_cram_paths + # Write a report of the unaligned sequencing groups if any + await auditor.write_unaligned_sgs_report( + unaligned_sgs=sg_completion.get('incomplete'), ) - - # Get the unaligned sequencing groups - unaligned_sgs = [] - for sg in sequencing_groups: - if sg.id in sg_completion.get('incomplete'): - unaligned_sgs.append( - { - 'sg_id': sg.id, - 'sample_id': sg.sample.id, - 'sample_external_id': sg.sample.external_id, - 'participant_id': sg.sample.participant.id, - 'participant_external_id': sg.sample.participant.external_id, - } - ) - - # Extract the assay file paths and sizes for the dataset's SGs - assay_id_to_paths_sizes = {assay.id: assay.read_files_paths_sizes for sg in sequencing_groups for assay in sg.assays} - - # Get the assay files to delete and ingest + + # Get the reads to delete and ingest ( reads_to_delete, reads_to_ingest, - ) = await auditor.get_reads_to_delete_or_ingest( - bucket_name=bucket_name, - sequencing_groups=sequencing_groups, - 
completed_sgs=sg_completion.get('complete'), - assay_id_to_paths_sizes=assay_id_to_paths_sizes, + ) = await auditor.get_audit_report_records_for_reads_to_delete_and_reads_to_ingest( + sequencing_groups=sequencing_groups ) - possible_assay_ingests = auditor.find_crams_for_reads_to_ingest( - reads_to_ingest, sg_cram_paths - ) - - # Write the reads to delete, reads to ingest, and unaligned SGs reports + # Write the reads to delete and ingest reports await auditor.write_upload_bucket_audit_reports( - bucket_name=bucket_name, - sequencing_types=sequencing_types, - file_types=file_types, - assay_files_to_delete=reads_to_delete, - assay_files_to_ingest=possible_assay_ingests, - unaligned_sgs=unaligned_sgs, + bucket_name=auditor.bucket_name, + audit_reports={ + 'reads_to_delete': reads_to_delete, + 'reads_to_ingest': reads_to_ingest, + }, ) @@ -216,7 +165,14 @@ async def audit_upload_bucket_async( '-s', multiple=True, required=True, - help=f'"all", or any of {", ".join(get_sequencing_types())}', + help='"all", or any of the enum sequencing types', +) +@click.option( + '--sequencing-technologies', + '-t', + multiple=True, + required=True, + help='"all", or any of the enum sequencing technologies', ) @click.option( '--file-types', @@ -228,17 +184,15 @@ async def audit_upload_bucket_async( def main( dataset: str, sequencing_types: tuple[str], + sequencing_technologies: tuple[str], file_types: tuple[str], - default_analysis_type='cram', - default_analysis_status='completed', ): """Runs the auditor on the dataset""" audit_upload_bucket( dataset, sequencing_types, + sequencing_technologies, file_types, - default_analysis_type, - default_analysis_status, ) diff --git a/metamist/audit/audithelper.py b/metamist/audit/audithelper.py index e5e1566d0..b10921817 100644 --- a/metamist/audit/audithelper.py +++ b/metamist/audit/audithelper.py @@ -5,7 +5,6 @@ from io import StringIO from typing import Any -from cpg_utils.cloud import get_path_components_from_gcp_path from cpg_utils.config import config_retrieve, get_gcp_project from metamist.graphql import gql, query @@ -44,7 +43,7 @@ ENUMS_QUERY = gql( """ - query analysisTypes { + query enumsQuery { enum { analysisType sequencingType @@ -63,12 +62,126 @@ def get_enums(enum_name) -> list[str]: return enums_query_result['enum'][enum_name] +# Variable type definitions +AnalysisId = int +AssayId = int +ParticipantId = int +ParticipantExternalId = str +SampleId = str +SampleExternalId = str +SequencingGroupId = str + + +class AuditReportEntry: # pylint: disable=too-many-instance-attributes + """Class to hold the data for an audit report entry""" + + def __init__( # pylint: disable=too-many-arguments + self, + filepath: str | None = None, + filesize: int | None = None, + sg_id: str | None = None, + assay_id: int | None = None, + cram_analysis_id: int | None = None, + cram_file_path: str | None = None, + sample_id: str | None = None, + sample_external_id: str | None = None, + participant_id: int | None = None, + participant_external_id: str | None = None, + ): + self.filepath = filepath + self.filesize = filesize + self.sg_id = sg_id + self.assay_id = assay_id + self.cram_analysis_id = cram_analysis_id + self.cram_file_path = cram_file_path + self.sample_id = sample_id + self.sample_external_id = sample_external_id + self.participant_id = participant_id + self.participant_external_id = participant_external_id + + +class ParticipantData: + """Class to hold the data for a participant""" + + def __init__( + self, + id_: ParticipantId, + external_id: ParticipantExternalId, 
+ ): + self.id = id_ + self.external_id = external_id + + +class SampleData: + """Class to hold the data for a sample""" + + def __init__( + self, + id_: SampleId, + external_id: SampleExternalId, + participant: ParticipantData, + ): + self.id = id_ + self.external_id = external_id + self.participant = participant + + +class ReadFileData: + """Class to hold the data for a read file""" + + def __init__( + self, + filepath: str, + filesize: int, + checksum: str | None = None, + ): + self.filepath = filepath + self.filesize = filesize + self.checksum = checksum + + +class AssayData: + """Class to hold the data for an assay""" + + def __init__( + self, + id_: AssayId, + read_files: list[ReadFileData], + sample: SampleData, + ): + self.id = id_ + self.read_files = read_files + self.sample = sample + + +class SequencingGroupData: + """Class to hold the data for a sequencing group""" + + def __init__( + self, + id_: str, + sequencing_type: str, + sequencing_technology: str, + sample: SampleData, + assays: list[AssayData], + cram_analysis_id: int | None = None, + cram_file_path: str | None = None, + ): + self.id = id_ + self.sequencing_type = sequencing_type + self.sequencing_technology = sequencing_technology + self.sample = sample + self.assays = assays + self.cram_analysis_id = cram_analysis_id + self.cram_file_path = cram_file_path + + class AuditHelper(CloudHelper): """General helper class for bucket auditing""" def __init__( self, - gcp_project: str, + gcp_project: str = None, all_analysis_types: list[str] = None, all_sequencing_types: list[str] = None, excluded_sequencing_groups: list[str] = None, @@ -88,12 +201,12 @@ def __init__( self.all_sequencing_types = all_sequencing_types or get_enums('sequencingType') self.excluded_sequencing_groups = excluded_sequencing_groups or config_retrieve( - ['workflow', 'audit', 'excluded_sequencing_groups'] + ['metamist', 'audit', 'excluded_sequencing_groups'] ) - @staticmethod def get_gcs_buckets_and_prefixes_from_paths( - paths: list[str], + self, + paths: list[str] | set[str], ) -> defaultdict[str, list]: """ Takes a list of paths and extracts the bucket names and prefix, returning all unique pairs @@ -102,7 +215,7 @@ def get_gcs_buckets_and_prefixes_from_paths( buckets_prefixes: defaultdict[str, list] = defaultdict(list) for path in paths: try: - pc = get_path_components_from_gcp_path(path) + pc = self.get_path_components_from_gcp_path(path) except ValueError: logging.warning(f'{path} invalid') continue @@ -116,22 +229,24 @@ def get_gcs_buckets_and_prefixes_from_paths( return buckets_prefixes def get_all_files_in_gcs_bucket_with_prefix_and_extensions( - self, bucket_name: str, prefix: str, file_extension: tuple[str] + self, bucket_name: str, prefix: str, file_extensions: tuple[str] ): """Iterate through a gcp bucket/prefix and get all the blobs with the specified file extension(s)""" bucket = self.gcs_client.bucket(bucket_name, user_project=self.user_project) files_in_bucket_prefix = [] for blob in self.gcs_client.list_blobs(bucket, prefix=prefix, delimiter='/'): - # Check if file ends with specified analysis type - if not blob.name.endswith(file_extension): + # If specified, check if the file ends with a valid extension + if file_extensions and not blob.name.endswith(file_extensions): continue files_in_bucket_prefix.append(f'gs://{bucket_name}/{blob.name}') return files_in_bucket_prefix def find_files_in_gcs_buckets_prefixes( - self, buckets_prefixes: defaultdict[str, list[str]], file_types: tuple[str] + self, + buckets_prefixes: defaultdict[str, 
list[str]], + file_types: tuple[str] | None, ): """ Takes a dict of {bucket: [prefix1, prefix2, ...]} tuples and finds all the files contained in that bucket/prefix @@ -152,9 +267,9 @@ def find_files_in_gcs_buckets_prefixes( return files_in_bucket - def find_assay_files_in_gcs_bucket( + def get_read_file_blobs_in_gcs_bucket( self, bucket_name: str, file_extensions: tuple[str] - ) -> dict[str, int]: + ) -> list[ReadFileData]: """ Gets all the paths and sizes to assay files in the dataset's upload bucket. Calls list_blobs on the bucket with the specified file extensions, returning a dict of paths and sizes. @@ -166,16 +281,22 @@ def find_assay_files_in_gcs_bucket( raise NameError( 'Call to list_blobs without prefix only valid for upload buckets' ) - bucket = self.gcs_client.bucket(bucket_name, user_project=self.user_project) + bucket = self.gcs_client.bucket(bucket_name, user_project=self.gcp_project) - assay_paths_sizes = {} + read_files = [] for blob in self.gcs_client.list_blobs(bucket, prefix=''): if not blob.name.endswith(file_extensions): continue blob.reload() - assay_paths_sizes[blob.name] = blob.size + read_files.append( + ReadFileData( + filepath=blob.name, + filesize=blob.size, + checksum=blob.crc32c, + ) + ) - return assay_paths_sizes + return read_files def get_audit_report_prefix( self, @@ -186,17 +307,42 @@ def get_audit_report_prefix( if set(seq_types) == set(self.all_sequencing_types): sequencing_types_str = 'all_seq_types' else: - sequencing_types_str = ('_').join(self.sequencing_types) + '_seq_types' + sequencing_types_str = ('_').join(seq_types) + '_seq_types' if set(file_types) == set(ALL_EXTENSIONS): file_types_str = 'all_file_types' elif set(file_types) == set(READ_EXTENSIONS): file_types_str = 'all_reads_file_types' else: - file_types_str = ('_').join(self.file_types) + '_file_types' + file_types_str = ('_').join(file_types) + '_file_types' return f'{file_types_str}_{sequencing_types_str}' + def get_audit_report_records_from_sgs( + self, + sequencing_groups: list[SequencingGroupData], + ) -> list[AuditReportEntry]: + """ + Get the audit report records from the sequencing group data. 
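
        An illustrative sketch of the intended usage, assuming an auditor instance and a list of unaligned
        sequencing groups are already in hand:

            records = auditor.get_audit_report_records_from_sgs(unaligned_sgs)  # one AuditReportEntry per assay
            for record in records:
                print(record.sg_id, record.assay_id, record.sample_external_id)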
+ """ + audit_report_records = [] + for sg in sequencing_groups: + for assay in sg.assays: + audit_report_records.append( + AuditReportEntry( + sg_id=sg.id, + assay_id=assay.id, + cram_analysis_id=sg.cram_analysis_id, + cram_file_path=sg.cram_file_path, + sample_id=sg.sample.id, + sample_external_id=sg.sample.external_id, + participant_id=sg.sample.participant.id, + participant_external_id=sg.sample.participant.external_id, + ) + ) + + return audit_report_records + def write_report_to_cloud( self, data_to_write: list[dict[str, Any]] | None, @@ -241,7 +387,7 @@ def write_report_to_cloud( writer.writerows(data_to_write) storage_client = storage.Client() - bucket = storage_client.bucket(bucket_name, user_project=self.user_project) + bucket = storage_client.bucket(bucket_name, user_project=self.gcp_project) blob = bucket.blob(blob_path) # Upload the TSV content diff --git a/metamist/audit/generic_auditor.py b/metamist/audit/generic_auditor.py index cd2e848bc..7896fbe69 100644 --- a/metamist/audit/generic_auditor.py +++ b/metamist/audit/generic_auditor.py @@ -1,13 +1,22 @@ import logging import os -from collections import defaultdict from datetime import datetime from typing import Any from cpg_utils.config import config_retrieve, dataset_path from gql.transport.requests import log as requests_logger -from metamist.audit.audithelper import AuditHelper, FILE_TYPES_MAP +from metamist.audit.audithelper import ( + AuditHelper, + FILE_TYPES_MAP, + ReadFileData, + AssayData, + ParticipantData, + SampleData, + SequencingGroupData, + AuditReportEntry, + SequencingGroupId, +) from metamist.graphql import gql, query_async handler = logging.StreamHandler() @@ -22,7 +31,6 @@ logger.propagate = False - QUERY_PARTICIPANTS_SAMPLES_SGS = gql( """ query DatasetData($datasetName: String!, $seqTypes: [String!], $seqTechs: [String!]) { @@ -95,99 +103,6 @@ """ ) -# Variable type definitions -AnalysisId = int -AssayId = int -ParticipantId = int -ParticipantExternalId = str -SampleId = str -SampleExternalId = str -SequencingGroupId = str - - -class AuditReportEntry: - """Class to hold the data for an audit report entry""" - - def __init__( - self, - file_path: str, - filesize: int, - sg_id: str | None = None, - assay_id: int | None = None, - cram_analysis_id: int | None = None, - cram_file_path: str | None = None, - sample_id: str | None = None, - sample_external_id: str | None = None, - participant_id: int | None = None, - participant_external_id: str | None = None, - ): - self.file_path = file_path - self.filesize = filesize - self.sg_id = sg_id - self.assay_id = assay_id - self.cram_analysis_id = cram_analysis_id - self.cram_file_path = cram_file_path - self.sample_id = sample_id - self.sample_external_id = sample_external_id - self.participant_id = participant_id - self.participant_external_id = participant_external_id - - -class ParticipantData: - """Class to hold the data for a participant""" - - def __init__( - self, - id: ParticipantId, - external_id: ParticipantExternalId, - ): - self.id = id - self.external_id = external_id - -class SampleData: - """Class to hold the data for a sample""" - - def __init__( - self, - id: SampleId, - external_id: SampleExternalId, - participant: ParticipantData, - ): - self.id = id - self.external_id = external_id - self.participant = participant - - -class AssayData: - """Class to hold the data for an assay""" - - def __init__( - self, - id: AssayId, - read_files_paths_sizes: list[tuple[str, int]], - sample: SampleData, - ): - self.id = id - self.read_files_paths_sizes = 
read_files_paths_sizes - self.sample = sample - -class SequencingGroupData: - """Class to hold the data for a sequencing group""" - - def __init__( - self, - id: str, - sequencing_type: str, - sequencing_technology: str, - sample: SampleData, - assays: list[AssayData], - ): - self.id = id - self.sequencing_type = sequencing_type - self.sequencing_technology = sequencing_technology - self.sample = sample - self.assays = assays - class GenericAuditor(AuditHelper): """Auditor for cloud storage buckets""" @@ -202,32 +117,10 @@ def __init__( default_analysis_type='cram', default_analysis_status='completed', ): - # Initialize dataset - self.dataset = dataset or config_retrieve(['workflow', 'dataset']) - if not self.dataset: - raise ValueError('Metamist dataset is required') - - # Validate sequencing types - if sequencing_types == ('all',): - self.sequencing_types = self.all_sequencing_types - else: - invalid_types = [st for st in sequencing_types if st not in self.all_sequencing_types] - if invalid_types: - raise ValueError( - f'Input sequencing types "{invalid_types}" must be in the allowed types: {self.all_sequencing_types}' - ) - self.sequencing_types = sequencing_types - - # Validate file types - if file_types in (('all',), ('all_reads',)): - self.file_types = FILE_TYPES_MAP[file_types[0]] - else: - invalid_files = [ft for ft in file_types if ft not in FILE_TYPES_MAP] - if invalid_files: - raise ValueError( - f'Input file types "{invalid_files}" must be in the allowed types: {", ".join(FILE_TYPES_MAP.keys())}' - ) - self.file_types = file_types + # Initialize the auditor + self.dataset = self.validate_dataset(dataset) + self.sequencing_types = self.validate_sequencing_types(sequencing_types) + self.file_types = self.validate_file_types(file_types) # Set remaining attributes self.sequencing_technologies = sequencing_technologies @@ -235,84 +128,131 @@ def __init__( self.default_analysis_status: str = default_analysis_status # Calculate bucket name - self.bucket_name = dataset_path(dataset=self.dataset, category='upload') + self.bucket_name = self.get_bucket_name(self.dataset, 'upload') - super().__init__(search_paths=None) + super().__init__() requests_logger.setLevel(logging.WARNING) - + + def validate_dataset(self, dataset: str) -> str: + """Validate the input dataset""" + if not dataset: + dataset = config_retrieve(['workflow', 'dataset']) + if not dataset: + raise ValueError('Metamist dataset is required') + return dataset + + def validate_sequencing_types(self, sequencing_types: list[str]) -> list[str]: + """Validate the input sequencing types""" + if sequencing_types == ('all',): + return self.all_sequencing_types + invalid_types = [ + st for st in sequencing_types if st not in self.all_sequencing_types + ] + if invalid_types: + raise ValueError( + f'Input sequencing types "{invalid_types}" must be in the allowed types: {self.all_sequencing_types}' + ) + return sequencing_types + + def validate_file_types(self, file_types: tuple[str]) -> tuple[str]: + """Validate the input file types""" + if file_types in (('all',), ('all_reads',)): + return FILE_TYPES_MAP[file_types[0]] + invalid_file_types = [ft for ft in file_types if ft not in FILE_TYPES_MAP] + if invalid_file_types: + raise ValueError( + f'Input file types "{invalid_file_types}" must be in the allowed types: {", ".join(FILE_TYPES_MAP.keys())}' + ) + return file_types + + def get_bucket_name(self, dataset: str, category: str) -> str: + """Get the bucket name for the given dataset and category""" + test = config_retrieve(['workflow', 
'access_level']) == 'test' + bucket: str = dataset_path( + suffix='', dataset=dataset, category=category, test=test + ) + if not bucket: + raise ValueError( + f'No bucket found for dataset {dataset} and category {category}' + ) + return bucket.removeprefix('gs://').removesuffix('/') + async def get_sgs_for_dataset(self) -> list[SequencingGroupData]: """ Fetches all sequencing groups for the given dataset, including the assays for each sequencing group. - + Returns a list of SequencingGroupData objects. """ - logger.info(f'{self.dataset} :: Fetching SG assays for {self.sequencing_types} sequencing types') + logger.info( + f'{self.dataset} :: Fetching SG assays for {self.sequencing_types} sequencing types' + ) dataset_sgs_query_result = await query_async( - QUERY_DATASET_SGS, - {'datasetName': self.dataset, 'seqTypes': self.sequencing_types, 'seqTechs': self.sequencing_technologies}, + QUERY_DATASET_SGS, + { + 'datasetName': self.dataset, + 'seqTypes': self.sequencing_types, + 'seqTechs': self.sequencing_technologies, + }, ) dataset_sgs = dataset_sgs_query_result['project']['sequencingGroups'] - + return [self.get_sg_data(sg) for sg in dataset_sgs] - - + def get_sg_data(self, sg: dict[str, Any]) -> SequencingGroupData: """Parse a sequencing group dictionary into a SequencingGroupData object""" return SequencingGroupData( - id=sg['id'], + id_=sg['id'], sequencing_type=sg['type'], sequencing_technology=sg['technology'], sample=SampleData( - id=sg['sample']['id'], + id_=sg['sample']['id'], external_id=sg['sample']['externalId'], participant=ParticipantData( - id=sg['sample']['participant']['id'], + id_=sg['sample']['participant']['id'], external_id=sg['sample']['participant']['externalId'], ), ), - assays=[ - self.parse_assay_data(assay) for assay in sg['assays'] - ], + assays=[self.parse_assay_data(assay) for assay in sg['assays']], ) - - + def parse_assay_data(self, assay: dict[str, Any]) -> AssayData: """Parse an assay dictionary into an AssayData object""" - reads = assay['meta']['reads'] + reads: list[dict] = assay['meta']['reads'] if isinstance(assay['meta']['reads'], dict): reads = [reads] - - reads_files_paths_sizes = [] + + read_files = [] for read in reads: - reads_files_paths_sizes.append( - ( - read['location'], - read['size'], + read_files.append( + ReadFileData( + filepath=read['path'], + filesize=read['size'], + checksum=read['checksum'], ) ) - if 'secondaryFiles' in read: + if read.get('secondaryFiles'): for secondary_file in read['secondaryFiles']: - reads_files_paths_sizes.append( - ( - secondary_file['location'], - secondary_file['size'], + read_files.append( + ReadFileData( + filepath=secondary_file['path'], + filesize=secondary_file['size'], + checksum=secondary_file['checksum'], ) ) - + return AssayData( - id=assay['id'], - read_files_paths_sizes=reads_files_paths_sizes, + id_=assay['id'], + read_files=read_files, sample=SampleData( - id=assay['sample']['id'], + id_=assay['sample']['id'], external_id=assay['sample']['externalId'], participant=ParticipantData( - id=assay['sample']['participant']['id'], + id_=assay['sample']['participant']['id'], external_id=assay['sample']['participant']['externalId'], ), ), ) - - + def get_latest_analyses_by_sg( self, all_sg_analyses: list[dict[str, Any]], @@ -342,9 +282,8 @@ def get_latest_analyses_by_sg( # Check the analysis meta data for the sequencing type self.check_analyses_seq_type(list(latest_analysis_by_sg.values())) - - return latest_analysis_by_sg + return latest_analysis_by_sg def check_analyses_seq_type( self, @@ -361,40 
+300,33 @@ def check_analyses_seq_type( f'{self.dataset} :: Analyses are missing sequencing_type field: {analyses_with_missing_seq_type}' ) - - async def get_analysis_cram_paths_for_dataset_sgs( + async def update_sequencing_groups_with_crams( self, sequencing_groups: list[SequencingGroupData], - ) -> dict[SequencingGroupId, dict[AnalysisId, str]]: + ): """ - Fetches all CRAMs for the list of sgs in the given dataset. - Returns a dict mapping {sg_id : (cram_analysis_id, cram_path) } + Updates the sequencing group data in-place with the CRAM analysis id and path for each SG. """ sg_ids = [sg.id for sg in sequencing_groups] - logging.info(f'{self.dataset} :: Fetching CRAM analyses for {len(set(sg_ids))} SGs') + logging.info( + f'{self.dataset} :: Fetching CRAM analyses for {len(set(sg_ids))} SGs' + ) sg_analyses_query_result = await query_async( QUERY_SG_ANALYSES, {'dataset': self.dataset, 'sgId': sg_ids, 'analysisTypes': ['CRAM']}, ) + crams_by_sg = self.get_latest_analyses_by_sg( + all_sg_analyses=sg_analyses_query_result['sequencingGroups'] + ) - sg_analyses = sg_analyses_query_result['sequencingGroups'] - latest_analysis_by_sg = self.get_latest_analyses_by_sg(all_sg_analyses=sg_analyses) - - # For each sg id, collect the analysis id and cram paths - sg_cram_paths: dict[str, dict[int, str]] = defaultdict(dict) - for sg_id, analysis in latest_analysis_by_sg.items(): - cram_path = analysis['outputs']['path'] - if not cram_path.startswith('gs://') or not cram_path.endswith('.cram'): - logging.warning( - f'Analysis {analysis["id"]} invalid output path: {analysis["output"]}' - ) + # Update the sequencing group data with the CRAM analysis id and path + for seq_group in sequencing_groups: + sg_id = seq_group.id + if sg_id not in crams_by_sg: continue - - sg_cram_paths[sg_id] = {analysis['id']: cram_path} - - return sg_cram_paths - + seq_group.cram_analysis_id = crams_by_sg[sg_id]['id'] + seq_group.cram_file_path = crams_by_sg[sg_id]['outputs']['path'] async def check_for_non_cram_analyses(self, sgs_without_crams: list[str]) -> None: """Checks if other completed analyses exist for sequencing groups without a completed cram analysis""" @@ -420,12 +352,10 @@ async def check_for_non_cram_analyses(self, sgs_without_crams: list[str]) -> Non f'{analysis["id"]} - {analysis["type"]} - {analysis["outputs"].get("path")}' ) - - async def get_complete_and_incomplete_sgs( + async def check_sg_crams( self, sequencing_groups: list[SequencingGroupData], - sg_cram_paths: dict[SequencingGroupId, dict[AnalysisId, str]], - ) -> dict[str, Any]: + ) -> dict[str, SequencingGroupData]: """ Returns a dictionary containing two categories of sequencing groups: - the completed sgs which have finished aligning and have a cram, as a dict mapping @@ -433,196 +363,250 @@ async def get_complete_and_incomplete_sgs( - the incomplete sgs where the alignment hasn't completed and no cram exists, as a list """ # Get all the unique cram paths to check - cram_paths = set() - for analyses in sg_cram_paths.values(): - cram_paths.update(list(analyses.values())) + cram_paths = set( + sg.cram_file_path for sg in sequencing_groups if sg.cram_file_path + ) # Check the analysis CRAM paths actually exist in the bucket - buckets_prefixes = self.get_gcs_buckets_and_prefixes_from_paths(list(cram_paths)) + buckets_and_prefixes_to_search = self.get_gcs_buckets_and_prefixes_from_paths( + cram_paths + ) crams_in_bucket = self.find_files_in_gcs_buckets_prefixes( - buckets_prefixes, - ('cram',), + buckets_prefixes=buckets_and_prefixes_to_search, + 
file_types=('.cram',), ) # Incomplete SGs initialised as the SGs without a completed CRAM - incomplete_sgs = set([sg.id for sg in sequencing_groups]).difference( - set(sg_cram_paths.keys()) - ) + incomplete_sgs = [sg for sg in sequencing_groups if not sg.cram_file_path] # Completed SGs have a CRAM file in the bucket that matches the path in Metamist analysis record # Incomplete SGs have a CRAM analysis record in Metamist but are not found at that path in the bucket - completed_sgs = {} - for sg_id, analysis in sg_cram_paths.items(): - for cram_analysis_id, cram_path in analysis.items(): - if cram_path in crams_in_bucket: - completed_sgs[sg_id] = cram_analysis_id - else: - logging.warning( - f'{self.dataset} :: SG {sg_id} has CRAM analysis: {cram_analysis_id} - but file not found at path: {cram_path}' - ) - incomplete_sgs.update(sg_id) + completed_sgs = [] + for sg in sequencing_groups: + if sg.cram_file_path in crams_in_bucket: + completed_sgs.append(sg) + else: + logging.warning( + f'{self.dataset} :: {sg.id} has CRAM analysis: {sg.cram_analysis_id} - but file not found at path: {sg.cram_file_path}' + ) + incomplete_sgs.append(sg) if incomplete_sgs: logging.warning( - f'{self.dataset} :: {len(incomplete_sgs)} SGs without CRAMs found: {list(incomplete_sgs)}' + f'{self.dataset} :: {len(incomplete_sgs)} SGs without CRAMs found: {sorted([sg.id for sg in incomplete_sgs])}' ) - logging.warning('Checking if any other analyses exist for these SGs, which would be unexpected...') - await self.check_for_non_cram_analyses(list(incomplete_sgs)) - - return {'complete': completed_sgs, 'incomplete': list(incomplete_sgs)} + logging.warning( + 'Checking if any other analyses exist for these SGs, which would be unexpected...' + ) + await self.check_for_non_cram_analyses([sg.id for sg in incomplete_sgs]) + return {'complete': completed_sgs, 'incomplete': incomplete_sgs} - async def check_for_uningested_or_moved_assays( # pylint: disable=R0914 + async def get_audit_report_records_for_reads_to_delete_and_reads_to_ingest( # pylint: disable=R0914 self, - bucket_name: str, sequencing_groups: list[SequencingGroupData], - completed_sgs: dict[SequencingGroupId, list[AnalysisId]], - assay_id_to_paths_and_sizes: dict[AssayId, list[tuple[str, int]]], - ) -> tuple[list[AuditReportEntry], list[AuditReportEntry], set[str]]: + ) -> tuple[list[AuditReportEntry], list[AuditReportEntry]]: """ - Compares the assays read files in a Metamist dataset to the read files found in the - upload bucket. + Compares the read files in a Metamist dataset to the read files found in the + upload bucket, and decides which files should be deleted and which should be ingested. - Input: - - bucket_name: The name of the GCS bucket to check + Input: - sequencing_groups: A list of SequencingGroupData objects - - completed_sgs: A dict mapping sg_ids to analysis_ids for completed CRAM analyses - - assay_id_to_paths_and_sizes: A dict mapping assay IDs to lists of tuples of read file paths and sizes - - Returns: 1. A list of audit report records for reads that have not been ingested, - but where a known sample ID exists in the read file path. - 2. A list of audit report records for reads that have been ingested, - but have been moved to a different location in the bucket. - 3. A set of string paths to the assay read files that have been - deleted/moved from their original location in Metamist. + + Returns: 1. A list of audit report records for read files that can be deleted + 2. 
A list of audit report records for read files that should be ingested """ - # Get a list of all the paths and sizes of assay files recorded in Metamist - metamist_assay_paths_sizes: list[tuple[str, int]] = [ - path_size for assay in assay_id_to_paths_and_sizes.values() for path_size in assay + # Get a list of all the paths and sizes of read files recorded in Metamist assay records + read_files_in_metamist = [ + read_file + for sg in sequencing_groups + for assay in sg.assays + for read_file in assay.read_files ] - metamist_assay_paths = set( - [path for path, _ in metamist_assay_paths_sizes] + # Find all the read files in the bucket + read_files_in_bucket = self.get_read_file_blobs_in_gcs_bucket( + self.bucket_name, self.file_types ) - # Get a list of all the paths and sizes of assay files anywhere in the upload bucket - bucket_assay_paths_sizes = self.find_assay_files_in_gcs_bucket( - bucket_name, self.file_types - ) - bucket_assay_paths = set(bucket_assay_paths_sizes.keys()) - - # Find the paths that exist in the bucket and not in Metamist - uningested_assay_paths = set(bucket_assay_paths).difference( - set(metamist_assay_paths) + # The files in Metamist which are not in the bucket are assumed to have been deleted + already_deleted_read_files = { + read.filepath for read in read_files_in_metamist + }.difference({read.filepath for read in read_files_in_bucket}) + + ingested_reads_that_were_moved = self.find_moved_reads( + read_files_in_bucket, read_files_in_metamist ) - # Find the paths that exist in Metamist and not in the bucket - metamist_paths_to_nowhere = set(metamist_assay_paths).difference( - set(bucket_assay_paths) + uningested_reads = self.get_uningested_reads( + read_files_in_metamist, read_files_in_bucket, ingested_reads_that_were_moved ) - # Strip the metamist paths into just filenames - # Map each file name to its file size and path - # This is used to identify if any files have been moved - metamist_assay_file_size_map = { - os.path.basename(path): {'size': size, 'path': path} - for path, size in metamist_assay_paths_sizes - } - - # Check if any of the uningested paths are actually files that have been moved - ingested_reads_that_were_moved = self.check_if_assay_files_were_moved( - uningested_assay_paths, - metamist_assay_file_size_map, - bucket_assay_paths_sizes, + # Report the reads that can be deleted + uningested_reads_to_delete_report_entries, reads_to_ingest_report_entries = ( + self.report_uningested_reads( + sequencing_groups, + uningested_reads, + ) + ) + moved_reads_to_delete_report_entries = ( + self.report_ingested_files_that_have_been_moved( + sequencing_groups, + ingested_reads_that_were_moved, + ) ) - - # Check if any of the uningested paths contain sample IDs for completed SGs - uningested_reads = self.check_uningested_assays_for_sample_ids( + ingested_reads_to_delete_report_entries = self.report_ingested_reads_to_delete( sequencing_groups, - uningested_assay_paths, - bucket_assay_paths_sizes, - completed_sgs, + already_deleted_read_files, ) - return uningested_reads, ingested_reads_that_were_moved, metamist_paths_to_nowhere - + # Concatenate the lists of report entries for the reads to delete + reads_to_delete_report_entries = ( + moved_reads_to_delete_report_entries + + uningested_reads_to_delete_report_entries + + ingested_reads_to_delete_report_entries + ) + + return reads_to_delete_report_entries, reads_to_ingest_report_entries - def check_if_assay_files_were_moved( + def report_ingested_reads_to_delete( self, sequencing_groups: 
list[SequencingGroupData], - completed_sgs: dict[str, list[int]], - uningested_assay_paths: set[str], - assay_id_to_paths_and_sizes: dict[int, list[tuple[str, int]]], - metamist_assay_paths_sizes: dict[str, dict[str, Any]], - bucket_assay_paths_sizes: dict[str, int], - ) -> list[AuditReportEntry]: + already_deleted_read_paths: set[str], + ) -> list[AuditReportEntry]: """ - Identify if any paths are to files that have actually just been moved - by checking if they are in the bucket but not Metamist. If they are, - check if the file size is the same as the file in Metamist. If so, - assume the file has been moved and add it to the list of ingested and moved - files. - - Returns a tuple of two lists, the first containing the paths of ingested and moved files, - the second containing the assay report data for these files + Generates a list of audit report entries for the read files that have been ingested + and can safely be deleted. """ - ingested_and_moved_filepaths = [] - new_assay_path_sizes = {} - for bucket_path in uningested_assay_paths: - filename = os.path.basename(bucket_path) - # If the file in the bucket has the exact same name and size as one in metamist, assume its the same - if filename in metamist_assay_paths_sizes: - metamist_file_path = metamist_assay_paths_sizes[filename]['path'] - metamist_file_size = metamist_assay_paths_sizes[filename]['size'] - bucket_file_size = bucket_assay_paths_sizes[bucket_path] - if bucket_file_size == metamist_file_size: - ingested_and_moved_filepaths.append( - { - 'bucket_path': bucket_path, - 'metamist_path': metamist_file_path, - 'size': bucket_file_size, - } + ingested_reads_to_delete_report_entries = [] + for sg in sequencing_groups: + if sg.id in self.excluded_sequencing_groups or not ( + sg.cram_analysis_id and sg.cram_file_path + ): + continue + for assay in sg.assays: + for read in assay.read_files: + if read.filepath in already_deleted_read_paths: + continue + + ingested_reads_to_delete_report_entries.append( + AuditReportEntry( + filepath=read.filepath, + filesize=read.filesize, + sg_id=sg.id, + assay_id=assay.id, + cram_analysis_id=sg.cram_analysis_id, + sample_id=sg.sample.id, + sample_external_id=sg.sample.external_id, + participant_id=sg.sample.participant.id, + participant_external_id=sg.sample.participant.external_id, + ) ) - new_assay_path_sizes[bucket_path] = bucket_file_size - else: + + return ingested_reads_to_delete_report_entries + + def find_moved_reads( + self, + read_files_in_bucket: list[ReadFileData], + read_files_in_metamist: list[ReadFileData], + ) -> dict[str, ReadFileData]: + """ + Check the files in the bucket and the files in Metamist to validate if any have been moved. + Uses the checksums or the file size and file name to identify if a file has been moved. + Moved files are those that have the same checksum or size and name, but different paths. + + Returns a dictionary of the moved files, with the Metamist path as the key and the ReadFileData object as the value. 
+ """ + metamist_read_files_by_filename = { + os.path.basename(read.filepath): read for read in read_files_in_metamist + } + metamist_filenames_by_checksum: dict[str, str] = { + read.checksum: os.path.basename(read.filepath) + for read in read_files_in_metamist + } + + moved_files = {} + for read_file in read_files_in_bucket: + if read_file.checksum in metamist_filenames_by_checksum: + metamist_filename = metamist_filenames_by_checksum[read_file.checksum] + metamist_filepath = metamist_read_files_by_filename[ + metamist_filename + ].filepath + + if read_file.filepath != metamist_filepath: logging.warning( - f'Uningested file at {bucket_path} ({bucket_file_size}) is similar to file in Metamist: {metamist_file_path} ({metamist_file_size}) but has different size' + f'File {read_file.filepath} has the same checksum as {metamist_filepath} but different path' + ) + moved_files[metamist_filepath] = read_file + continue + + if os.path.basename(read_file.filepath) in metamist_read_files_by_filename: + metamist_filepath = metamist_read_files_by_filename[ + os.path.basename(read_file.filepath) + ].filepath + metamist_filesize = metamist_read_files_by_filename[ + os.path.basename(read_file.filepath) + ].filesize + if ( + read_file.filepath != metamist_filepath + and read_file.filesize == metamist_filesize + ): + logging.warning( + f'File {read_file.filepath} has the same name as {metamist_filepath} but different path' + ) + moved_files[metamist_filepath] = read_file + elif ( + read_file.filepath != metamist_filepath + and read_file.filesize != metamist_filesize + ): + logging.warning( + f'File {read_file.filepath} has the same name as {metamist_filepath} but different path and size' ) - logging.info( - f'Found {len(ingested_and_moved_filepaths)} ingested files that have been moved' - ) - - # If the file has just been moved, we consider it ingested - uningested_assay_paths.remove( - {bucket_path for bucket_path, _ in ingested_and_moved_filepaths} - ) - - # flip the assay id : reads mapping to identify assays by their reads - read_file_path_to_assay_id = {} - for assay_id, reads_sizes in assay_id_to_paths_and_sizes.items(): - for read, _ in reads_sizes: - read_file_path_to_assay_id[read] = assay_id - - assay_sg_id_map = {assay.id: sg.id for sg in sequencing_groups for assay in sg.assays} - - assays_moved_paths = [] - for ingested_and_moved_path in ingested_and_moved_filepaths: - - assay_id = read_file_path_to_assay_id.get(ingested_and_moved_path['metamist_path']) - - sg_id = assay_sg_id_map.get(assay_id) - cram_analysis_id = completed_sgs.get(sg_id)[0] if sg_id in completed_sgs else None - - if sg_id in self.excluded_sequencing_groups or not cram_analysis_id: continue - - sg = self.get_sequencing_group_data_by_id(sg_id, sequencing_groups) - if not sg: + + logging.warning(f'File {read_file.filepath} not found in Metamist') + + logging.info(f'Found {len(moved_files)} ingested files that have been moved') + + return moved_files + + def report_ingested_files_that_have_been_moved( + self, + sequencing_groups: list[SequencingGroupData], + ingested_reads_that_were_moved: dict[str, ReadFileData], + ) -> list[AuditReportEntry]: + """ + Identify if any paths are to files that have actually just been moved + by checking if they are in the bucket but not Metamist. + + If they are, validate it's the same file by comparing the checksums or sizes. + If they match, assume the file has been moved and add it to the list of + ingested and moved files. + + Returns a list of AuditReportEntry objects for the moved files. 
+ """ + sgs_by_assay_id = { + assay.id: sg for sg in sequencing_groups for assay in sg.assays + } + + assays = [assay for sg in sequencing_groups for assay in sg.assays] + read_file_path_to_assay_id = { + read.filepath: assay.id for assay in assays for read in assay.read_files + } + + moved_reads = [] + for metamist_path, read_file in ingested_reads_that_were_moved.items(): + assay_id = read_file_path_to_assay_id.get(metamist_path) + + sg = sgs_by_assay_id.get(assay_id) + cram_analysis_id = sg.cram_analysis_id if sg else None + + if sg in self.excluded_sequencing_groups or not cram_analysis_id: continue - assays_moved_paths.append( + moved_reads.append( AuditReportEntry( - file_path=ingested_and_moved_path['bucket_path'], - filesize=ingested_and_moved_path['filesize'], - sg_id=sg_id, + filepath=read_file.filepath, + filesize=read_file.filesize, + sg_id=sg.id, assay_id=assay_id, cram_analysis_id=cram_analysis_id, sample_id=sg.sample.id, @@ -632,8 +616,8 @@ def check_if_assay_files_were_moved( ) ) - return assays_moved_paths - + return moved_reads + def get_sequencing_group_data_by_id( self, sg_id: str, @@ -645,150 +629,171 @@ def get_sequencing_group_data_by_id( return sg return None + def get_uningested_reads( + self, + read_files_in_metamist: list[ReadFileData], + read_files_in_bucket: list[ReadFileData], + read_files_to_exclude: list[ReadFileData] = None, + ) -> list[ReadFileData]: + """ + Get a list of read files in the bucket that are not in Metamist + Optionally exclude a list of read files from the comparison. + """ + read_paths_in_metamist = {read.filepath for read in read_files_in_metamist} + if read_files_to_exclude: + read_paths_to_exclude = {read.filepath for read in read_files_to_exclude} + read_paths_in_metamist = read_paths_in_metamist.difference( + read_paths_to_exclude + ) - def check_uningested_assays_for_sample_ids( + return [ + read + for read in read_files_in_bucket + if read.filepath not in read_paths_in_metamist + ] + + def report_uningested_reads( self, sequencing_groups: list[SequencingGroupData], - uningested_assay_paths: set[str], - bucket_assay_paths_sizes: dict[str, int], - completed_sgs: dict[SequencingGroupId, list[AnalysisId]], - ) -> list[AuditReportEntry]: + uningested_reads: list[ReadFileData], + ) -> tuple[list[AuditReportEntry], list[AuditReportEntry]]: """ - Combs through the list of uningested assay paths to see if any of them contain sample ids for completed SGs. - Can happen when we ingest a fastq read pair for a sample, and additional read files were provided (e.g. bams, vcfs). - If there are extra files for a completed SG, we should either ingest them or delete them. + Generates two lists of audit report entries for the uningested reads + 1. Reads which can be deleted. + 2. Reads which should be ingested. + + If the read file path contains a sample or participant external ID associated with a completed SG, + it should be deleted. + + Otherwise, it should be ingested. 
""" - sg_sample_map = {sg.id: sg.sample for sg in sequencing_groups} - uningested_reads = [] - for sg_id, analysis_ids in completed_sgs.items(): - try: - sample = sg_sample_map[sg_id] - for uningested_read_file in uningested_assay_paths: - if sample.external_id not in uningested_read_file or sample.participant.external_id not in uningested_read_file: - continue - uningested_reads.append( - AuditReportEntry( - file_path=uningested_read_file, - filesize=bucket_assay_paths_sizes[uningested_read_file], - sg_id=sg_id, - cram_analysis_id=analysis_ids[0], - sample_id=sample.id, - sample_external_id=sample.external_id, - participant_id=sample.participant.id, - participant_external_id=sample.participant.external_id, - ) + if not uningested_reads: + return ([], []) + + completed_sgs = [ + sg for sg in sequencing_groups if sg.cram_analysis_id and sg.cram_file_path + ] + completed_sgs_by_sample_external_id = { + sg.sample.external_id: sg for sg in completed_sgs + } + completed_sgs_by_participant_external_id = { + sg.sample.participant.external_id: sg for sg in completed_sgs + } + + reads_to_delete_report_entries = [] + reads_to_ingest_report_entries = [] + for read in uningested_reads: + known_sample_id = None + known_participant_id = None + for sample_ext_id in completed_sgs_by_sample_external_id.keys(): + if sample_ext_id in read.filepath: + known_sample_id = sample_ext_id + break + if not known_sample_id: + for ( + participant_ext_id + ) in completed_sgs_by_participant_external_id.keys(): + if participant_ext_id in read.filepath: + known_sample_id = completed_sgs_by_participant_external_id[ + participant_ext_id + ].sample.external_id + known_participant_id = participant_ext_id + break + else: + known_participant_id = completed_sgs_by_sample_external_id[ + known_sample_id + ].sample.participant.external_id + + if known_sample_id and known_participant_id: + sg = completed_sgs_by_sample_external_id[known_sample_id] + reads_to_delete_report_entries.append( + AuditReportEntry( + filepath=read.filepath, + filesize=read.filesize, + sg_id=sg.id, + assay_id=None, # Not relevant for uningested reads being deleted + cram_analysis_id=sg.cram_analysis_id, + cram_file_path=sg.cram_file_path, + sample_id=sg.sample.id, + sample_external_id=sg.sample.external_id, + participant_id=sg.sample.participant.id, + participant_external_id=sg.sample.participant.external_id, + ) + ) + + else: + # If no known ID was detected in the filename, add the path with no further checks + reads_to_ingest_report_entries.append( + AuditReportEntry( + filepath=read.filepath, + filesize=read.filesize, ) - except KeyError: - logging.warning( - f'{sg_id} from analyses: {analysis_ids} not found in SG-sample map.' ) - - return uningested_reads - - + + return reads_to_delete_report_entries, reads_to_ingest_report_entries + async def get_reads_to_delete_or_ingest( self, - bucket_name: str, sequencing_groups: list[SequencingGroupData], - completed_sgs: dict[SequencingGroupId, list[AnalysisId]], - assay_id_to_paths_and_sizes: dict[AssayId, list[tuple[str, int]]], ) -> tuple[list[AuditReportEntry], list[AuditReportEntry]]: """ - Inputs: - - bucket_name: The name of the GCS bucket to check + Inputs: - sequencing_groups: A list of SequencingGroupData objects - - completed_sgs: A dict mapping sg_ids to analysis_ids for completed CRAM analyses - - assay_id_to_paths_and_sizes: A dict mapping assay IDs to lists of tuples of read file paths and sizes - - Returns two lists, each containing AuditReportEntry objects. 
- The first containins reads which can be deleted, the second containing reads to ingest. - The sample id, assay id, and analysis id (of completed cram) are included in the delete list. + + Returns two lists, each containing AuditReportEntry objects: + 1. Reads which can be deleted. + 2. Reads which should be ingested. + + The reads to delete are those that are associated with SGs that have completed CRAMs. + The reads to ingest are those that are not associated with SGs that have completed CRAMs. """ - # Check for uningested assay data that may be hiding or assay data that has been moved ( - reads_to_ingest, - moved_assay_report_entries, - metamist_paths_to_nowhere, - ) = await self.check_for_uningested_or_moved_assays( - bucket_name, + reads_to_delete_report_entries, + reads_to_ingest_report_entries, + ) = await self.get_audit_report_records_for_reads_to_delete_and_reads_to_ingest( sequencing_groups, - completed_sgs, - assay_id_to_paths_and_sizes, ) - # Create a mapping of sg id: assay ids - sg_assays_id_map = {sg.id: [assay.id for assay in sg.assays] for sg in sequencing_groups} - - # Create a list of assay report entries for the moved assays - assay_reads_to_delete: list[AuditReportEntry] = [] - for sg_id, cram_analysis_id in completed_sgs.items(): - if sg_id in self.excluded_sequencing_groups: - continue - sg = self.get_sequencing_group_data_by_id(sg_id, sequencing_groups) - assay_ids = sg_assays_id_map[sg_id] - for assay_id in assay_ids: - assay_read_paths = assay_id_to_paths_and_sizes[assay_id] - for path, size in assay_read_paths: - if path in metamist_paths_to_nowhere: # Already deleted - continue - - assay_reads_to_delete.append( - AuditReportEntry( - file_path=path, - filesize=size, - sg_id=sg_id, - assay_id=assay_id, - cram_analysis_id=cram_analysis_id, - sample_id=sg.sample.id, - sample_external_id=sg.sample.external_id, - participant_id=sg.sample.participant.id, - participant_external_id=sg.sample.participant.external_id, - ) - ) - - reads_to_delete = assay_reads_to_delete + moved_assay_report_entries - - return reads_to_delete, reads_to_ingest - - - @staticmethod - def find_crams_for_reads_to_ingest( - reads_to_ingest: list[AuditReportEntry], - sg_cram_paths: dict[str, dict[int, str]], - ) -> list[AuditReportEntry]: - """ - Compares the external sample IDs for SGs with completed CRAMs against the - uningested read files. This may turn up results for cases where multiple read types - have been provided for a sample, but only one type was ingested and used for alignment. 
- """ - possible_assay_ingests: list[AuditReportEntry] = [] - for read_to_ingest in reads_to_ingest: - if not read_to_ingest.sample_id: - # If no sample id was detected in the filename, add the path with no further checks - possible_assay_ingests.append( - AuditReportEntry( - file_path=read_to_ingest.file_path, - filesize=read_to_ingest.filesize, - ) - ) - continue - - # Else get the completed CRAM analysis id - sg_cram = sg_cram_paths[read_to_ingest.sg_id] - cram_path = sg_cram[read_to_ingest.cram_analysis_id] - possible_assay_ingests.append( - AuditReportEntry( - file_path=read_to_ingest.file_path, - filesize=read_to_ingest.filesize, - sg_id=read_to_ingest.sg_id, - assay_id=read_to_ingest.assay_id, - cram_analysis_id=read_to_ingest.cram_analysis_id, - cram_file_path=cram_path, - sample_id=read_to_ingest.sample_id, - sample_external_id=read_to_ingest.sample_external_id, - participant_id=read_to_ingest.participant_id, - participant_external_id=read_to_ingest.participant_external_id, - ) - ) - - return possible_assay_ingests + return reads_to_delete_report_entries, reads_to_ingest_report_entries + + # @staticmethod + # def find_crams_for_reads_to_ingest( + # reads_to_ingest: list[AuditReportEntry], + # sg_cram_paths: dict[str, dict[int, str]], + # ) -> list[AuditReportEntry]: + # """ + # Compares the external sample IDs for SGs with completed CRAMs against the + # uningested read files. This may turn up results for cases where multiple read types + # have been provided for a sample, but only one type was ingested and used for alignment. + # """ + # possible_assay_ingests: list[AuditReportEntry] = [] + # for read_to_ingest in reads_to_ingest: + # if not read_to_ingest.sample_id: + # # If no sample id was detected in the filename, add the path with no further checks + # possible_assay_ingests.append( + # AuditReportEntry( + # filepath=read_to_ingest.filepath, + # filesize=read_to_ingest.filesize, + # ) + # ) + # continue + + # # Else get the completed CRAM analysis id + # sg_cram = sg_cram_paths[read_to_ingest.sg_id] + # cram_path = sg_cram[read_to_ingest.cram_analysis_id] + # possible_assay_ingests.append( + # AuditReportEntry( + # filepath=read_to_ingest.filepath, + # filesize=read_to_ingest.filesize, + # sg_id=read_to_ingest.sg_id, + # assay_id=read_to_ingest.assay_id, + # cram_analysis_id=read_to_ingest.cram_analysis_id, + # cram_file_path=cram_path, + # sample_id=read_to_ingest.sample_id, + # sample_external_id=read_to_ingest.sample_external_id, + # participant_id=read_to_ingest.participant_id, + # participant_external_id=read_to_ingest.participant_external_id, + # ) + # ) + + # return possible_assay_ingests diff --git a/metamist/parser/cloudhelper.py b/metamist/parser/cloudhelper.py index a03c89b20..ee6cf47a4 100644 --- a/metamist/parser/cloudhelper.py +++ b/metamist/parser/cloudhelper.py @@ -1,6 +1,7 @@ # pylint: disable=no-member import logging import os +import re from collections import defaultdict from datetime import datetime from typing import Callable, Iterable, TypeVar @@ -205,3 +206,25 @@ def _list_gcs_directory(self, gcs_path) -> list[str]: path.bucket, prefix=remaining_path, delimiter='/' ) return [f'gs://{path.bucket}/{blob.name}' for blob in blobs] + + def get_path_components_from_gcp_path(self, path: str) -> dict[str, str]: + """ + Return the {bucket_name}, {dataset}, {bucket_type}, {prefix}, and {file} for GS only paths + Uses regex to match the full bucket name, dataset name, bucket type (e.g. 'test', 'main-upload', 'release'), + subdirectory, and the file name. 
+ """ + + bucket_types = ['archive', 'hail', 'main', 'test', 'release'] + + # compile pattern matching all CPG bucket formats + gspath_pattern = re.compile( + r'gs://(?Pcpg-(?P[\w-]+)-(?P[' + + '|'.join(s for s in bucket_types) + + r']+[-\w]*))/(?P.+/)?(?P.*)$', + ) + + # if a match succeeds, return the key: value dictionary + if path_match := gspath_pattern.match(path): + return path_match.groupdict() + + raise ValueError('The input String did not match a valid GCP path') From 7ff4e5fcaf13a6c590a13ec29499673bc9bcc74c Mon Sep 17 00:00:00 2001 From: EddieLF Date: Wed, 15 Jan 2025 14:07:24 +1100 Subject: [PATCH 08/10] Correct audithelper prefix getting --- metamist/audit/audithelper.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/metamist/audit/audithelper.py b/metamist/audit/audithelper.py index b10921817..e6da04986 100644 --- a/metamist/audit/audithelper.py +++ b/metamist/audit/audithelper.py @@ -220,9 +220,7 @@ def get_gcs_buckets_and_prefixes_from_paths( logging.warning(f'{path} invalid') continue bucket = pc['bucket'] - prefix = pc[ - 'suffix' - ] # This is the prefix (i.e. the "subdirectory" in the bucket) + prefix = pc['prefix'] if prefix and prefix not in buckets_prefixes[bucket]: buckets_prefixes[bucket].append(prefix) From 76fa3285f0e727c4444071ef645431e8d5c3a1be Mon Sep 17 00:00:00 2001 From: EddieLF Date: Thu, 16 Jan 2025 10:17:54 +1100 Subject: [PATCH 09/10] Move some methods to AuditHelper, add comments --- metamist/audit/audit_upload_bucket.py | 7 +- metamist/audit/audithelper.py | 76 +++++++++-- metamist/audit/generic_auditor.py | 177 ++++++++++---------------- 3 files changed, 136 insertions(+), 124 deletions(-) diff --git a/metamist/audit/audit_upload_bucket.py b/metamist/audit/audit_upload_bucket.py index 4ff7cae25..43d1503d0 100644 --- a/metamist/audit/audit_upload_bucket.py +++ b/metamist/audit/audit_upload_bucket.py @@ -11,11 +11,8 @@ import click -from metamist.audit.generic_auditor import ( - GenericAuditor, - SequencingGroupData, - AuditReportEntry, -) +from metamist.audit.audithelper import SequencingGroupData, AuditReportEntry +from metamist.audit.generic_auditor import GenericAuditor def audit_upload_bucket( diff --git a/metamist/audit/audithelper.py b/metamist/audit/audithelper.py index e6da04986..f0fc58694 100644 --- a/metamist/audit/audithelper.py +++ b/metamist/audit/audithelper.py @@ -3,9 +3,8 @@ import logging from collections import defaultdict from io import StringIO -from typing import Any -from cpg_utils.config import config_retrieve, get_gcp_project +from cpg_utils.config import config_retrieve, dataset_path, get_gcp_project from metamist.graphql import gql, query from metamist.parser.cloudhelper import CloudHelper @@ -99,6 +98,9 @@ def __init__( # pylint: disable=too-many-arguments self.participant_id = participant_id self.participant_external_id = participant_external_id + def __repr__(self) -> str: + return f'AuditReportEntry({self.__dict__})' + class ParticipantData: """Class to hold the data for a participant""" @@ -201,9 +203,64 @@ def __init__( self.all_sequencing_types = all_sequencing_types or get_enums('sequencingType') self.excluded_sequencing_groups = excluded_sequencing_groups or config_retrieve( - ['metamist', 'audit', 'excluded_sequencing_groups'] + ['metamist', 'audit', 'excluded_sequencing_groups'], default=[] ) + def validate_dataset(self, dataset: str) -> str: + """Validate the input dataset""" + if not dataset: + dataset = config_retrieve(['workflow', 'dataset']) + if not dataset: + raise ValueError('Metamist 
dataset is required') + return dataset + + def validate_sequencing_types(self, sequencing_types: list[str]) -> list[str]: + """Validate the input sequencing types""" + if sequencing_types == ('all',): + return self.all_sequencing_types + invalid_types = [ + st for st in sequencing_types if st not in self.all_sequencing_types + ] + if invalid_types: + raise ValueError( + f'Input sequencing types "{invalid_types}" must be in the allowed types: {self.all_sequencing_types}' + ) + return sequencing_types + + def validate_file_types(self, file_types: tuple[str]) -> tuple[str]: + """Validate the input file types""" + if file_types in (('all',), ('all_reads',)): + return FILE_TYPES_MAP[file_types[0]] + invalid_file_types = [ft for ft in file_types if ft not in FILE_TYPES_MAP] + if invalid_file_types: + raise ValueError( + f'Input file types "{invalid_file_types}" must be in the allowed types: {", ".join(FILE_TYPES_MAP.keys())}' + ) + return file_types + + def get_bucket_name(self, dataset: str, category: str) -> str: + """Get the bucket name for the given dataset and category""" + test = config_retrieve(['workflow', 'access_level']) == 'test' + bucket: str = dataset_path( + suffix='', dataset=dataset, category=category, test=test + ) + if not bucket: + raise ValueError( + f'No bucket found for dataset {dataset} and category {category}' + ) + return bucket.removeprefix('gs://').removesuffix('/') + + def get_sequencing_group_data_by_id( + self, + sg_id: str, + sequencing_groups: list[SequencingGroupData], + ): + """Get the sequencing group data for a given sg_id""" + for sg in sequencing_groups: + if sg.id == sg_id: + return sg + return None + def get_gcs_buckets_and_prefixes_from_paths( self, paths: list[str] | set[str], @@ -230,7 +287,7 @@ def get_all_files_in_gcs_bucket_with_prefix_and_extensions( self, bucket_name: str, prefix: str, file_extensions: tuple[str] ): """Iterate through a gcp bucket/prefix and get all the blobs with the specified file extension(s)""" - bucket = self.gcs_client.bucket(bucket_name, user_project=self.user_project) + bucket = self.gcs_client.bucket(bucket_name, user_project=self.gcp_project) files_in_bucket_prefix = [] for blob in self.gcs_client.list_blobs(bucket, prefix=prefix, delimiter='/'): @@ -288,7 +345,7 @@ def get_read_file_blobs_in_gcs_bucket( blob.reload() read_files.append( ReadFileData( - filepath=blob.name, + filepath=f'gs://{bucket_name}/{blob.name}', filesize=blob.size, checksum=blob.crc32c, ) @@ -343,7 +400,7 @@ def get_audit_report_records_from_sgs( def write_report_to_cloud( self, - data_to_write: list[dict[str, Any]] | None, + data_to_write: list[AuditReportEntry] | None, bucket_name: str, blob_path: str, ) -> None: @@ -378,11 +435,12 @@ def write_report_to_cloud( buffer = StringIO() writer = csv.DictWriter( - buffer, fieldnames=data_to_write[0].keys(), delimiter=delimiter + buffer, fieldnames=data_to_write[0].__dict__.keys(), delimiter=delimiter ) + rows_to_write = [entry.__dict__ for entry in data_to_write] writer.writeheader() - writer.writerows(data_to_write) + writer.writerows(rows_to_write) storage_client = storage.Client() bucket = storage_client.bucket(bucket_name, user_project=self.gcp_project) @@ -396,6 +454,6 @@ def write_report_to_cloud( buffer.close() logging.info( - f'Wrote {len(data_to_write)} lines to gs://{bucket_name}/{blob_path}' + f'Wrote {len(rows_to_write)} lines to gs://{bucket_name}/{blob_path}' ) return diff --git a/metamist/audit/generic_auditor.py b/metamist/audit/generic_auditor.py index 7896fbe69..27e0f93bf 100644 --- 
a/metamist/audit/generic_auditor.py +++ b/metamist/audit/generic_auditor.py @@ -3,12 +3,10 @@ from datetime import datetime from typing import Any -from cpg_utils.config import config_retrieve, dataset_path from gql.transport.requests import log as requests_logger from metamist.audit.audithelper import ( AuditHelper, - FILE_TYPES_MAP, ReadFileData, AssayData, ParticipantData, @@ -88,7 +86,7 @@ QUERY_SG_ANALYSES = gql( """ - query sgAnalyses($dataset: String!, $sgIds: [String!], $analysisTypes: [String!], $seqTechs: [String!]) { + query sgAnalyses($dataset: String!, $sgIds: [String!], $analysisTypes: [String!]) { sequencingGroups(id: {in_: $sgIds}, project: {eq: $dataset}) { id analyses(status: {eq: COMPLETED}, type: {in_: $analysisTypes}, project: {eq: $dataset}) { @@ -117,6 +115,7 @@ def __init__( default_analysis_type='cram', default_analysis_status='completed', ): + super().__init__() # Initialize the auditor self.dataset = self.validate_dataset(dataset) self.sequencing_types = self.validate_sequencing_types(sequencing_types) @@ -129,54 +128,8 @@ def __init__( # Calculate bucket name self.bucket_name = self.get_bucket_name(self.dataset, 'upload') - - super().__init__() requests_logger.setLevel(logging.WARNING) - def validate_dataset(self, dataset: str) -> str: - """Validate the input dataset""" - if not dataset: - dataset = config_retrieve(['workflow', 'dataset']) - if not dataset: - raise ValueError('Metamist dataset is required') - return dataset - - def validate_sequencing_types(self, sequencing_types: list[str]) -> list[str]: - """Validate the input sequencing types""" - if sequencing_types == ('all',): - return self.all_sequencing_types - invalid_types = [ - st for st in sequencing_types if st not in self.all_sequencing_types - ] - if invalid_types: - raise ValueError( - f'Input sequencing types "{invalid_types}" must be in the allowed types: {self.all_sequencing_types}' - ) - return sequencing_types - - def validate_file_types(self, file_types: tuple[str]) -> tuple[str]: - """Validate the input file types""" - if file_types in (('all',), ('all_reads',)): - return FILE_TYPES_MAP[file_types[0]] - invalid_file_types = [ft for ft in file_types if ft not in FILE_TYPES_MAP] - if invalid_file_types: - raise ValueError( - f'Input file types "{invalid_file_types}" must be in the allowed types: {", ".join(FILE_TYPES_MAP.keys())}' - ) - return file_types - - def get_bucket_name(self, dataset: str, category: str) -> str: - """Get the bucket name for the given dataset and category""" - test = config_retrieve(['workflow', 'access_level']) == 'test' - bucket: str = dataset_path( - suffix='', dataset=dataset, category=category, test=test - ) - if not bucket: - raise ValueError( - f'No bucket found for dataset {dataset} and category {category}' - ) - return bucket.removeprefix('gs://').removesuffix('/') - async def get_sgs_for_dataset(self) -> list[SequencingGroupData]: """ Fetches all sequencing groups for the given dataset, including the assays for each sequencing group. @@ -184,7 +137,7 @@ async def get_sgs_for_dataset(self) -> list[SequencingGroupData]: Returns a list of SequencingGroupData objects. 
""" logger.info( - f'{self.dataset} :: Fetching SG assays for {self.sequencing_types} sequencing types' + f'{self.dataset} :: Fetching sequencing groups for \n Sequencing Types:\n {", ".join(sorted(self.sequencing_types))}\n Sequencing Technologies:\n {", ".join(sorted(self.sequencing_technologies))}' ) dataset_sgs_query_result = await query_async( QUERY_DATASET_SGS, @@ -223,22 +176,22 @@ def parse_assay_data(self, assay: dict[str, Any]) -> AssayData: read_files = [] for read in reads: - read_files.append( - ReadFileData( - filepath=read['path'], - filesize=read['size'], - checksum=read['checksum'], + try: + read_files.append(self.parse_read_file(read)) + except KeyError: + logger.warning( + f'Failed to parse read file: {read} for assay: {assay["id"]}' ) - ) + continue if read.get('secondaryFiles'): for secondary_file in read['secondaryFiles']: - read_files.append( - ReadFileData( - filepath=secondary_file['path'], - filesize=secondary_file['size'], - checksum=secondary_file['checksum'], + try: + read_files.append(self.parse_read_file(secondary_file)) + except KeyError: + logger.warning( + f'Failed to parse secondary read file: {secondary_file} for assay: {assay["id"]}' ) - ) + continue return AssayData( id_=assay['id'], @@ -253,6 +206,14 @@ def parse_assay_data(self, assay: dict[str, Any]) -> AssayData: ), ) + def parse_read_file(self, read: dict) -> ReadFileData: + """Parse a list of read files from an assay dictionary""" + return ReadFileData( + filepath=read['location'], + filesize=read['size'], + checksum=read['checksum'], + ) + def get_latest_analyses_by_sg( self, all_sg_analyses: list[dict[str, Any]], @@ -268,10 +229,6 @@ def get_latest_analyses_by_sg( analyses = sg_analyses['analyses'] if not analyses: continue - if len(analyses) == 1: - latest_analysis_by_sg[sg_id] = analyses[0] - continue - sorted_analyses = sorted( analyses, key=lambda x: datetime.strptime( @@ -314,7 +271,7 @@ async def update_sequencing_groups_with_crams( sg_analyses_query_result = await query_async( QUERY_SG_ANALYSES, - {'dataset': self.dataset, 'sgId': sg_ids, 'analysisTypes': ['CRAM']}, + {'dataset': self.dataset, 'sgIds': sg_ids, 'analysisTypes': ['CRAM']}, ) crams_by_sg = self.get_latest_analyses_by_sg( all_sg_analyses=sg_analyses_query_result['sequencingGroups'] @@ -326,7 +283,10 @@ async def update_sequencing_groups_with_crams( if sg_id not in crams_by_sg: continue seq_group.cram_analysis_id = crams_by_sg[sg_id]['id'] - seq_group.cram_file_path = crams_by_sg[sg_id]['outputs']['path'] + if isinstance(crams_by_sg[sg_id]['outputs'], str): + seq_group.cram_file_path = crams_by_sg[sg_id]['outputs'] + else: + seq_group.cram_file_path = crams_by_sg[sg_id]['outputs']['path'] async def check_for_non_cram_analyses(self, sgs_without_crams: list[str]) -> None: """Checks if other completed analyses exist for sequencing groups without a completed cram analysis""" @@ -376,20 +336,19 @@ async def check_sg_crams( file_types=('.cram',), ) - # Incomplete SGs initialised as the SGs without a completed CRAM - incomplete_sgs = [sg for sg in sequencing_groups if not sg.cram_file_path] - # Completed SGs have a CRAM file in the bucket that matches the path in Metamist analysis record # Incomplete SGs have a CRAM analysis record in Metamist but are not found at that path in the bucket completed_sgs = [] + incomplete_sgs = [] for sg in sequencing_groups: - if sg.cram_file_path in crams_in_bucket: + if sg.cram_file_path and sg.cram_file_path in crams_in_bucket: completed_sgs.append(sg) - else: + continue + if sg.cram_file_path and 
sg.cram_file_path not in crams_in_bucket: logging.warning( f'{self.dataset} :: {sg.id} has CRAM analysis: {sg.cram_analysis_id} - but file not found at path: {sg.cram_file_path}' ) - incomplete_sgs.append(sg) + incomplete_sgs.append(sg) if incomplete_sgs: logging.warning( @@ -437,7 +396,9 @@ async def get_audit_report_records_for_reads_to_delete_and_reads_to_ingest( # p read_files_in_bucket, read_files_in_metamist ) uningested_reads = self.get_uningested_reads( - read_files_in_metamist, read_files_in_bucket, ingested_reads_that_were_moved + read_files_in_metamist, + read_files_in_bucket, + ingested_reads_that_were_moved.values(), ) # Report the reads that can be deleted @@ -494,6 +455,7 @@ def report_ingested_reads_to_delete( sg_id=sg.id, assay_id=assay.id, cram_analysis_id=sg.cram_analysis_id, + cram_file_path=sg.cram_file_path, sample_id=sg.sample.id, sample_external_id=sg.sample.external_id, participant_id=sg.sample.participant.id, @@ -562,8 +524,6 @@ def find_moved_reads( ) continue - logging.warning(f'File {read_file.filepath} not found in Metamist') - logging.info(f'Found {len(moved_files)} ingested files that have been moved') return moved_files @@ -609,6 +569,7 @@ def report_ingested_files_that_have_been_moved( sg_id=sg.id, assay_id=assay_id, cram_analysis_id=cram_analysis_id, + cram_file_path=sg.cram_file_path, sample_id=sg.sample.id, sample_external_id=sg.sample.external_id, participant_id=sg.sample.participant.id, @@ -618,22 +579,11 @@ def report_ingested_files_that_have_been_moved( return moved_reads - def get_sequencing_group_data_by_id( - self, - sg_id: str, - sequencing_groups: list[SequencingGroupData], - ): - """Get the sequencing group data for a given sg_id""" - for sg in sequencing_groups: - if sg.id == sg_id: - return sg - return None - def get_uningested_reads( self, read_files_in_metamist: list[ReadFileData], read_files_in_bucket: list[ReadFileData], - read_files_to_exclude: list[ReadFileData] = None, + read_files_to_exclude: list[ReadFileData] | None = None, ) -> list[ReadFileData]: """ Get a list of read files in the bucket that are not in Metamist @@ -683,6 +633,10 @@ def report_uningested_reads( reads_to_delete_report_entries = [] reads_to_ingest_report_entries = [] for read in uningested_reads: + # Check if the read file path contains a sample or participant external ID associated with a completed SG + # This is a naive check which assumes the external ID is in the file path + # TODO: Improve this check to use a more robust method that uses known file naming conventions, e.g. 
+ # - extract the VCGS or Garvan format sample ID from the file path and match this specifically known_sample_id = None known_participant_id = None for sample_ext_id in completed_sgs_by_sample_external_id.keys(): @@ -706,6 +660,9 @@ def report_uningested_reads( if known_sample_id and known_participant_id: sg = completed_sgs_by_sample_external_id[known_sample_id] + logger.info( + f'{self.dataset} :: uningested file: {read.filepath} may match to completed SG: {sg.id} with sample: {sg.sample.external_id} and participant: {sg.sample.participant.external_id}' + ) reads_to_delete_report_entries.append( AuditReportEntry( filepath=read.filepath, @@ -732,29 +689,29 @@ def report_uningested_reads( return reads_to_delete_report_entries, reads_to_ingest_report_entries - async def get_reads_to_delete_or_ingest( - self, - sequencing_groups: list[SequencingGroupData], - ) -> tuple[list[AuditReportEntry], list[AuditReportEntry]]: - """ - Inputs: - - sequencing_groups: A list of SequencingGroupData objects - - Returns two lists, each containing AuditReportEntry objects: - 1. Reads which can be deleted. - 2. Reads which should be ingested. + # async def get_reads_to_delete_or_ingest( + # self, + # sequencing_groups: list[SequencingGroupData], + # ) -> tuple[list[AuditReportEntry], list[AuditReportEntry]]: + # """ + # Inputs: + # - sequencing_groups: A list of SequencingGroupData objects - The reads to delete are those that are associated with SGs that have completed CRAMs. - The reads to ingest are those that are not associated with SGs that have completed CRAMs. - """ - ( - reads_to_delete_report_entries, - reads_to_ingest_report_entries, - ) = await self.get_audit_report_records_for_reads_to_delete_and_reads_to_ingest( - sequencing_groups, - ) + # Returns two lists, each containing AuditReportEntry objects: + # 1. Reads which can be deleted. + # 2. Reads which should be ingested. - return reads_to_delete_report_entries, reads_to_ingest_report_entries + # The reads to delete are those that are associated with SGs that have completed CRAMs. + # The reads to ingest are those that are not associated with SGs that have completed CRAMs. + # """ + # ( + # reads_to_delete_report_entries, + # reads_to_ingest_report_entries, + # ) = await self.get_audit_report_records_for_reads_to_delete_and_reads_to_ingest( + # sequencing_groups, + # ) + + # return reads_to_delete_report_entries, reads_to_ingest_report_entries # @staticmethod # def find_crams_for_reads_to_ingest( From b1ae5f37e1b904c3072d5e5a705512a72fd1f8b6 Mon Sep 17 00:00:00 2001 From: EddieLF Date: Thu, 16 Jan 2025 10:18:38 +1100 Subject: [PATCH 10/10] Remove unnecessary methods --- metamist/audit/generic_auditor.py | 66 ------------------------------- 1 file changed, 66 deletions(-) diff --git a/metamist/audit/generic_auditor.py b/metamist/audit/generic_auditor.py index 27e0f93bf..eea5a4951 100644 --- a/metamist/audit/generic_auditor.py +++ b/metamist/audit/generic_auditor.py @@ -688,69 +688,3 @@ def report_uningested_reads( ) return reads_to_delete_report_entries, reads_to_ingest_report_entries - - # async def get_reads_to_delete_or_ingest( - # self, - # sequencing_groups: list[SequencingGroupData], - # ) -> tuple[list[AuditReportEntry], list[AuditReportEntry]]: - # """ - # Inputs: - # - sequencing_groups: A list of SequencingGroupData objects - - # Returns two lists, each containing AuditReportEntry objects: - # 1. Reads which can be deleted. - # 2. Reads which should be ingested. 
- - # The reads to delete are those that are associated with SGs that have completed CRAMs. - # The reads to ingest are those that are not associated with SGs that have completed CRAMs. - # """ - # ( - # reads_to_delete_report_entries, - # reads_to_ingest_report_entries, - # ) = await self.get_audit_report_records_for_reads_to_delete_and_reads_to_ingest( - # sequencing_groups, - # ) - - # return reads_to_delete_report_entries, reads_to_ingest_report_entries - - # @staticmethod - # def find_crams_for_reads_to_ingest( - # reads_to_ingest: list[AuditReportEntry], - # sg_cram_paths: dict[str, dict[int, str]], - # ) -> list[AuditReportEntry]: - # """ - # Compares the external sample IDs for SGs with completed CRAMs against the - # uningested read files. This may turn up results for cases where multiple read types - # have been provided for a sample, but only one type was ingested and used for alignment. - # """ - # possible_assay_ingests: list[AuditReportEntry] = [] - # for read_to_ingest in reads_to_ingest: - # if not read_to_ingest.sample_id: - # # If no sample id was detected in the filename, add the path with no further checks - # possible_assay_ingests.append( - # AuditReportEntry( - # filepath=read_to_ingest.filepath, - # filesize=read_to_ingest.filesize, - # ) - # ) - # continue - - # # Else get the completed CRAM analysis id - # sg_cram = sg_cram_paths[read_to_ingest.sg_id] - # cram_path = sg_cram[read_to_ingest.cram_analysis_id] - # possible_assay_ingests.append( - # AuditReportEntry( - # filepath=read_to_ingest.filepath, - # filesize=read_to_ingest.filesize, - # sg_id=read_to_ingest.sg_id, - # assay_id=read_to_ingest.assay_id, - # cram_analysis_id=read_to_ingest.cram_analysis_id, - # cram_file_path=cram_path, - # sample_id=read_to_ingest.sample_id, - # sample_external_id=read_to_ingest.sample_external_id, - # participant_id=read_to_ingest.participant_id, - # participant_external_id=read_to_ingest.participant_external_id, - # ) - # ) - - # return possible_assay_ingests
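Reviewer note on the `get_path_components_from_gcp_path` helper added to metamist/parser/cloudhelper.py earlier in this series: in this copy of the patch the `(?P...)` regex groups appear to have lost their angle-bracket names (likely markup stripping), so the pattern as shown would not yield the `pc['bucket']` / `pc['prefix']` lookups used in audithelper.py. The sketch below is a reconstruction under stated assumptions, not the committed code: the group names are inferred from the function's docstring ({bucket_name}, {dataset}, {bucket_type}, {prefix}, {file}) and from the later dictionary lookups, and the bucket-type alternation replaces the original character-class construction, which is a guess at the intent. The example path at the end is made up for illustration.

    import re

    # Allowed CPG bucket categories, as listed in the patch.
    BUCKET_TYPES = ['archive', 'hail', 'main', 'test', 'release']

    # Reconstructed pattern: the <name> parts of the named groups are inferred,
    # not copied from the patch text.
    GSPATH_PATTERN = re.compile(
        r'gs://(?P<bucket>cpg-(?P<dataset>[\w-]+)-(?P<bucket_type>(?:'
        + '|'.join(BUCKET_TYPES)
        + r')[-\w]*))/(?P<prefix>.+/)?(?P<file>.*)$'
    )

    def get_path_components_from_gcp_path(path: str) -> dict[str, str]:
        """Split a CPG 'gs://' path into bucket, dataset, bucket type, prefix and file name."""
        if match := GSPATH_PATTERN.match(path):
            return match.groupdict()
        raise ValueError('The input string did not match a valid GCP path')

    # Illustrative call with a hypothetical path (not taken from the patch):
    # get_path_components_from_gcp_path('gs://cpg-mydataset-main-upload/batch1/sampleA_R1.fastq.gz')
    # -> {'bucket': 'cpg-mydataset-main-upload', 'dataset': 'mydataset',
    #     'bucket_type': 'main-upload', 'prefix': 'batch1/', 'file': 'sampleA_R1.fastq.gz'}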