From fa8b798f1057ce17c058ccba6edf0907e3686ea4 Mon Sep 17 00:00:00 2001 From: j-uranic <117292295+j-uranic@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:45:22 -0400 Subject: [PATCH 1/4] Create geomx-ngs-v2.1.yaml Add lab_processed\/primary_analysis\/.* and lab_processed\/primary_analysis\/Q3\s{1}Normalized\.xlsx --- .../directory-schemas/geomx-ngs-v2.1.yaml | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 src/ingest_validation_tools/directory-schemas/geomx-ngs-v2.1.yaml diff --git a/src/ingest_validation_tools/directory-schemas/geomx-ngs-v2.1.yaml b/src/ingest_validation_tools/directory-schemas/geomx-ngs-v2.1.yaml new file mode 100644 index 00000000..9178a5e6 --- /dev/null +++ b/src/ingest_validation_tools/directory-schemas/geomx-ngs-v2.1.yaml @@ -0,0 +1,115 @@ +files: + - + pattern: extras\/.* + required: True + description: Folder for general lab-specific files related to the dataset. + - + pattern: extras\/microscope_hardware\.json + required: True + description: A file generated by the micro-meta app that contains a description of the hardware components of the microscope. Email HuBMAP Consortium Help Desk if help is required in generating this document. + is_qa_qc: True + - + pattern: extras\/microscope_settings\.json + required: False + description: A file generated by the micro-meta app that contains a description of the settings that were used to acquire the image data. Email HuBMAP Consortium Help Desk if help is required in generating this document. + is_qa_qc: True + - + pattern: raw\/.* + required: True + description: All raw data files for the experiment. + - + pattern: raw\/[^\/]+_LabWorksheet.txt + required: True + description: An Excel spreadsheet to refer to in setting up the library. This file documents all of the samples from a single collection plate. Generated by DSP run, prior to sequencing. 
+ - + pattern: raw\/[^\/]+_config\.ini + required: True + description: Needed to generate the DCC file from the fastq file. Contains pipeline processing parameters. Generated by DSP run, prior to sequencing. + - + pattern: raw\/[^\/]+_SeqCodeIndices\.csv + required: True + description: A file with sample information needed by the Illumina software. Use the contents of the SeqCodeIndices.csv file to create a SampleSheet.csv for input to the Illumina sequencer. (NextSeq 1000/2000 users download a SampleSheet.csv and whitelist.txt instead of SeqCodeIndices.csv.) Generated by DSP run. + - + pattern: raw\/markers\.csv + required: False + description: A csv file describing any morphology markers used to guide ROI and/or AOI selection [this should be similar in structure to the antibodies file] + - + pattern: raw\/[^\/]*targets\.pkc + required: True + description: The file listing probe barcode sequence and corresponding gene symbol or proteins targeted by that probe. This should be consistent for the same probe panel. + - + pattern: raw\/additional_panels_used\.csv + required: False + description: If multiple commercial probe panels were used, then the primary probe panel should be selected in the "oligo_probe_panel" metadata field. The additional panels must be included in this file. Each panel record should include:manufacturer, model/name, product code. + - + pattern: raw\/custom_probe_set\.csv + required: True + description: This file should contain any custom probes used and must be included if the metadata field "is_custom_probes_used" is "Yes". The file should minimally include:target gene id, probe seq, probe id. The contents of this file are modeled after the 10x Genomics probe set file (see ). + - + pattern: raw\/fastq\/.* + required: True + description: Raw sequencing files for the experiment + - + pattern: raw\/fastq\/oligo\/.* + required: True + description: Directory containing fastq files pertaining to oligo sequencing. 
+ - + pattern: raw\/fastq\/oligo\/[^\/]+\.fastq\.gz + required: True + description: This is a gzip version of the fastq file. This file contains the cell barcode and unique molecular identifier (technical). + is_qa_qc: False + - + pattern: raw\/images\/.* + required: False + description: Directory containing raw image files. This directory should include at least one raw file. + - + pattern: raw\/images\/overlay\.(?:jpeg|tiff) + required: False + description: State whether an overlay image was used to guide ROI selection. If an overlay is used, then the overlay details will be provided in the protocols.io protocol. If used, this needs to be uploaded. It is not included in the OME TIFF. This can be a JPEG or TIFF file + - + pattern: lab_processed\/.* + required: True + description: Experiment files that were processed by the lab generating the data. + - + pattern: lab_processed\/Initial\s{1}Dataset\.xlsx + required: True + description: An excel spreadsheet that is downloaded from the GeoMx DSP Data Analysis Suite containing QA/QC metrics based on raw, unprocessed target counts. This file contains one row per AOI/segment and no analyses span AOI. The AOIs included in this file can come from different GeoMx runs and hence span Globus uploads. So care must be taken to make sure the appropriate AOIs are included in the file. + is_qa_qc: True + - + pattern: lab_processed\/annotations\.xlsx + required: False + description: AOI specific annotations. This might include cell type and anatomical information. + - + pattern: lab_processed\/dcc\/.* + required: True + description: DCC files generated from fastq by the Nanostring GeoMx NGS Pipeline. + - + pattern: lab_processed\/dcc\/[^\/]+\.dcc + required: True + description: DCC files containing target probe counts, generated from fastq by the Nanostring GeoMx NGS Pipeline. 
+ - + pattern: lab_processed\/images\/.* + required: True + description: Processed image files + - + pattern: lab_processed\/images\/[^\/]+\.ome\.tiff + required: True + description: OME-TIFF files (multichannel, multi-layered) produced by the microscopy experiment. If compressed, must use loss-less compression algorithm. For Visium this stitched file should only include the single capture area relevant to the current dataset. For GeoMx there will be one OME TIFF file per slide, with each slide including multiple AOIs. See the following link for the set of fields that are required in the OME TIFF file XML header. + is_qa_qc: False + example: HBM892.MDXS.293.ome.tiff + - + pattern: lab_processed\/images\/[^\/]*ome-tiff\.channels\.csv + required: True + description: This file provides essential documentation pertaining to each channel of the accompanying OME TIFF. The file should contain one row per OME TIFF channel. The required fields are detailed + is_qa_qc: False + - + pattern: lab_processed\/primary_analysis\/.* + required: True + description: Primary analysis results + - + pattern: lab_processed\/primary_analysis\/Q3\s{1}Normalized\.xlsx + required: True + description: Results from initial processing by GeoMx DSP Data Analysis Suite. The collection of datasets were normalized using Q3 normalization after target genes below the limit of quantitation (LOQ) are removed. 
+ is_qa_qc: True + + From 597dc3e47e8767935f119ed9c251cf4430ca26cd Mon Sep 17 00:00:00 2001 From: j-uranic <117292295+j-uranic@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:46:37 -0400 Subject: [PATCH 2/4] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea7d39bc..616e4201 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ - Update Visium with probes directory schema - Update Visium no probes directory schema - Change to EntityTypeInfo constraint format to support constraints endpoint +- Update GeoMx NGS directory schema ## v0.0.23 - Add token to validation_utils.get_assaytype_data, replace URL string concatenation with urllib From 9b2bd957e429304f96766f06df86a54f4e2ba41d Mon Sep 17 00:00:00 2001 From: Juan Puerto <=> Date: Tue, 10 Sep 2024 16:13:14 -0400 Subject: [PATCH 3/4] Documentation: Update geomx-ngs --- docs/geomx-ngs/current/index.md | 33 ++++++++++++++++++- .../directory-schemas/geomx-ngs-v2.1.yaml | 2 +- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/docs/geomx-ngs/current/index.md b/docs/geomx-ngs/current/index.md index 390834c2..df1028d0 100644 --- a/docs/geomx-ngs/current/index.md +++ b/docs/geomx-ngs/current/index.md @@ -28,7 +28,38 @@ Related files:
## Directory schemas -Version 2.0 (use this one) +Version 2.1 (use this one) + +| pattern | required? | description | +| --- | --- | --- | +| extras\/.* | ✓ | Folder for general lab-specific files related to the dataset. | +| extras\/microscope_hardware\.json | ✓ | **[QA/QC]** A file generated by the micro-meta app that contains a description of the hardware components of the microscope. Email HuBMAP Consortium Help Desk if help is required in generating this document. | +| extras\/microscope_settings\.json | | **[QA/QC]** A file generated by the micro-meta app that contains a description of the settings that were used to acquire the image data. Email HuBMAP Consortium Help Desk if help is required in generating this document. | +| raw\/.* | ✓ | All raw data files for the experiment. | +| raw\/[^\/]+_LabWorksheet.txt | ✓ | An Excel spreadsheet to refer to in setting up the library. This file documents all of the samples from a single collection plate. Generated by DSP run, prior to sequencing. | +| raw\/[^\/]+_config\.ini | ✓ | Needed to generate the DCC file from the fastq file. Contains pipeline processing parameters. Generated by DSP run, prior to sequencing. | +| raw\/[^\/]+_SeqCodeIndices\.csv | ✓ | A file with sample information needed by the Illumina software. Use the contents of the SeqCodeIndices.csv file to create a SampleSheet.csv for input to the Illumina sequencer. (NextSeq 1000/2000 users download a SampleSheet.csv and whitelist.txt instead of SeqCodeIndices.csv.) Generated by DSP run. | +| raw\/markers\.csv | | A csv file describing any morphology markers used to guide ROI and/or AOI selection [this should be similar in structure to the antibodies file] | +| raw\/[^\/]*targets\.pkc | ✓ | The file listing probe barcode sequence and corresponding gene symbol or proteins targeted by that probe. This should be consistent for the same probe panel. 
| +| raw\/additional_panels_used\.csv | | If multiple commercial probe panels were used, then the primary probe panel should be selected in the "oligo_probe_panel" metadata field. The additional panels must be included in this file. Each panel record should include:manufacturer, model/name, product code. | +| raw\/custom_probe_set\.csv | ✓ | This file should contain any custom probes used and must be included if the metadata field "is_custom_probes_used" is "Yes". The file should minimally include:target gene id, probe seq, probe id. The contents of this file are modeled after the 10x Genomics probe set file (see ). | +| raw\/fastq\/.* | ✓ | Raw sequencing files for the experiment | +| raw\/fastq\/oligo\/.* | ✓ | Directory containing fastq files pertaining to oligo sequencing. | +| raw\/fastq\/oligo\/[^\/]+\.fastq\.gz | ✓ | This is a gzip version of the fastq file. This file contains the cell barcode and unique molecular identifier (technical). | +| raw\/images\/.* | | Directory containing raw image files. This directory should include at least one raw file. | +| raw\/images\/overlay\.(?:jpeg|tiff) | | State whether an overlay image was used to guide ROI selection. If an overlay is used, then the overlay details will be provided in the protocols.io protocol. If used, this needs to be uploaded. It is not included in the OME TIFF. This can be a JPEG or TIFF file | +| lab_processed\/.* | ✓ | Experiment files that were processed by the lab generating the data. | +| lab_processed\/Initial\s{1}Dataset\.xlsx | ✓ | **[QA/QC]** An excel spreadsheet that is downloaded from the GeoMx DSP Data Analysis Suite containing QA/QC metrics based on raw, unprocessed target counts. This file contains one row per AOI/segment and no analyses span AOI. The AOIs included in this file can come from different GeoMx runs and hence span Globus uploads. So care must be taken to make sure the appropriate AOIs are included in the file. 
| +| lab_processed\/annotations\.xlsx | | AOI specific annotations. This might include cell type and anatomical information. | +| lab_processed\/dcc\/.* | ✓ | DCC files generated from fastq by the Nanostring GeoMx NGS Pipeline. | +| lab_processed\/dcc\/[^\/]+\.dcc | ✓ | DCC files containing target probe counts, generated from fastq by the Nanostring GeoMx NGS Pipeline. | +| lab_processed\/images\/.* | ✓ | Processed image files | +| lab_processed\/images\/[^\/]+\.ome\.tiff (example: lab_processed/images/HBM892.MDXS.293.ome.tiff) | ✓ | OME-TIFF files (multichannel, multi-layered) produced by the microscopy experiment. If compressed, must use loss-less compression algorithm. For Visium this stitched file should only include the single capture area relevant to the current dataset. For GeoMx there will be one OME TIFF file per slide, with each slide including multiple AOIs. See the following link for the set of fields that are required in the OME TIFF file XML header. | +| lab_processed\/images\/[^\/]*ome-tiff\.channels\.csv | ✓ | This file provides essential documentation pertaining to each channel of the accompanying OME TIFF. The file should contain one row per OME TIFF channel. The required fields are detailed | +| lab_processed\/primary_analysis\/.* | ✓ | Primary analysis results | +| lab_processed\/primary_analysis\/Q3\s{1}Normalized\.xlsx | ✓ | **[QA/QC]** Results from initial processing by GeoMx DSP Data Analysis Suite. The collection of datasets were normalized using Q3 normalization after target genes below the limit of quantitation (LOQ) are removed. | + +Version 2.0 | pattern | required? 
| description | | --- | --- | --- | diff --git a/src/ingest_validation_tools/directory-schemas/geomx-ngs-v2.1.yaml b/src/ingest_validation_tools/directory-schemas/geomx-ngs-v2.1.yaml index 9178a5e6..48a3bf5e 100644 --- a/src/ingest_validation_tools/directory-schemas/geomx-ngs-v2.1.yaml +++ b/src/ingest_validation_tools/directory-schemas/geomx-ngs-v2.1.yaml @@ -96,7 +96,7 @@ files: required: True description: OME-TIFF files (multichannel, multi-layered) produced by the microscopy experiment. If compressed, must use loss-less compression algorithm. For Visium this stitched file should only include the single capture area relevant to the current dataset. For GeoMx there will be one OME TIFF file per slide, with each slide including multiple AOIs. See the following link for the set of fields that are required in the OME TIFF file XML header. is_qa_qc: False - example: HBM892.MDXS.293.ome.tiff + example: lab_processed/images/HBM892.MDXS.293.ome.tiff - pattern: lab_processed\/images\/[^\/]*ome-tiff\.channels\.csv required: True From a22f48e28831089deb436d1662faf1c174be2dfa Mon Sep 17 00:00:00 2001 From: Juan Puerto <=> Date: Tue, 10 Sep 2024 16:14:20 -0400 Subject: [PATCH 4/4] General: Update CHANGELOG --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 616e4201..841d1f89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## v0.0.25 (in progress) +- Update GeoMx NGS directory schema + ## v0.0.24 - Release MERFISH - Add MERFISH directory schema @@ -18,7 +21,6 @@ - Update Visium with probes directory schema - Update Visium no probes directory schema - Change to EntityTypeInfo constraint format to support constraints endpoint -- Update GeoMx NGS directory schema ## v0.0.23 - Add token to validation_utils.get_assaytype_data, replace URL string concatenation with urllib