20240612 refdata, docs update and msigs2tsv

sigven · Jun 13, 2024 · e96d3e9 · e96d3e9
1 parent 19f2e7d
commit e96d3e9
Show file tree

Hide file tree

Showing 7 changed files with 126 additions and 41 deletions.
diff --git a/pcgr/pcgr_vars.py b/pcgr/pcgr_vars.py
@@ -3,7 +3,7 @@
 from pcgr._version import __version__
 
 PCGR_VERSION = __version__
-DB_VERSION = '20240610'
+DB_VERSION = '20240612'
 
 ## MISCELLANEOUS
 NCBI_BUILD_MAF = 'GRCh38'

diff --git a/pcgrr/R/main.R b/pcgrr/R/main.R
@@ -788,23 +788,55 @@ generate_tier_tsv <- function(variant_set,
 #' Function that writes contents of PCGR object to a TSV file
 #'
 #' @param report List object with all report data, settings etc.
-#' @param variant_type character indicating variant type for output TSV,
-#' i.e. 'snv_indel' or 'cna_gene'
+#' @param output_type character indicating output type for TSV,
+#' i.e. 'snv_indel', 'cna_gene', or 'msigs'
 #' @export
 #'
-write_report_tsv <- function(report = NULL, variant_type = 'snv_indel'){
+write_report_tsv <- function(report = NULL, output_type = 'snv_indel'){
 
   fname <- paste0(
-    report$settings$output_prefix, ".", variant_type, "_ann.tsv.gz")
+    report$settings$output_prefix, ".", output_type, "_ann.tsv.gz")
+
+  if(output_type == "msigs"){
+    fname <- paste0(
+      report$settings$output_prefix, ".", output_type, ".tsv.gz")
+  }
 
   output_data <- data.frame()
-  pcgrr::log4r_info("------")
-  pcgrr::log4r_info(paste0(
-    "Writing tab-separated output file with PCGR annotations - '",
-    variant_type, "'"))
+  eval_output <- FALSE
+
+  if(output_type == "msigs" &
+     report$content$mutational_signatures$eval == TRUE){
+    eval_output <- TRUE
+  }
+  if(output_type == "cna" &
+     report$content$cna$eval == TRUE){
+    eval_output <- TRUE
+  }
+  if(output_type == "snv_indel" &
+     report$content$snv_indel$eval == TRUE){
+    eval_output <- TRUE
+  }
+
+
+  ## Mutational signatures
+  if(output_type == 'msigs' &
+     !is.null(report$content$mutational_signatures)){
+
+    if(report$content$mutational_signatures$eval == TRUE &
+       report$content$mutational_signatures$missing_data == FALSE){
+
+      if(!is.null(report$content$mutational_signatures$result$tsv)){
+        if(is.data.frame(report$content$mutational_signatures$result$tsv)){
+          output_data <- as.data.frame(
+            report$content$mutational_signatures$result$tsv)
+        }
+      }
+    }
+  }
 
   ## Copy number alterations
-  if(variant_type == 'cna_gene' &
+  if(output_type == 'cna_gene' &
      !is.null(report$content$cna) &
      report$content$cna$eval == TRUE){
 
@@ -819,7 +851,7 @@ write_report_tsv <- function(report = NULL, variant_type = 'snv_indel'){
   }
 
   ## SNVs/InDels
-  if(variant_type == 'snv_indel' &
+  if(output_type == 'snv_indel' &
      !is.null(report$content$snv_indel) &
      report$content$snv_indel$eval == TRUE){
 
@@ -841,13 +873,19 @@ write_report_tsv <- function(report = NULL, variant_type = 'snv_indel'){
   }
 
   if(NROW(output_data) > 0){
+    pcgrr::log4r_info("------")
+    pcgrr::log4r_info(paste0(
+      "Writing tab-separated output file with PCGR annotations - '",
+      output_type, "'"))
     readr::write_tsv(
       output_data, file = fname,
       col_names = TRUE, append = FALSE,
       na = ".", quote = "none")
   } else {
-    pcgrr::log4r_info(
-      paste0("No data to write to TSV file - '", variant_type,"'"))
+    if(eval_output == TRUE){
+      pcgrr::log4r_info(
+        paste0("No data to write to TSV file - '", output_type,"'"))
+    }
   }
 
 }

diff --git a/pcgrr/man/write_report_tsv.Rd b/pcgrr/man/write_report_tsv.Rd
diff --git a/pcgrr/vignettes/CHANGELOG.Rmd b/pcgrr/vignettes/CHANGELOG.Rmd
@@ -92,7 +92,8 @@ pdiakumis <- user("pdiakumis")
   - `--preserved_info_tags` is now named `--retained_info_tags`
   - `--basic` is now named `--no_reporting`
   - `--target_size_mb` is now named `--effective_target_size_mb`
-- LOFTEE plugin in VEP removed as LOF-annotator (due to low level of maintenance, and outdated dependency requirements). For now, a very simplified LoF-annotation is used as a replacement, simply looking at CSQ types - this will be improved in future releases (considering potentially false positives near the transcript end)
+- LOFTEE plugin in VEP removed as loss-of-function variant classifier (due to low level of maintenance, and outdated dependency requirements). For now, a simplified LoF-annotation is used as a replacement, looking primarily at CSQ types (`stop_gained`, `frameshift_variant`, `splice_acceptor_variant`, `splice_donor_variant`). Furthermore, frameshift/stop-gain variants that are found within the last 5% of the coding sequence length are deemed non-LOF, as are splice donor variants not disrupting the canonical site (GC>GT). An even more advanced LoF-annotation is planned for a future release.
+- Biomarkers are matched much more comprehensively than in previous versions, matching at the genomic level, codon, exon, amino acid and gene level (both principal and non-principal transcript matches)
 
 ##### Removed
 

diff --git a/pcgrr/vignettes/installation.Rmd b/pcgrr/vignettes/installation.Rmd
@@ -67,14 +67,14 @@ Here's an example scenario that will be used in the following sections
 
 **A)** Download and unpack the assembly-specific reference data bundle needed for PCGR:
 
-- [grch37 data bundle - 20240610](https://insilico.hpc.uio.no/pcgr/pcgr_ref_data.20240610.grch37.tgz) (approx 4.8Gb)
-- [grch38 data bundle - 20240610](https://insilico.hpc.uio.no/pcgr/pcgr_ref_data.20240610.grch38.tgz) (approx 4.8Gb)
+- [grch37 data bundle - 20240612](https://insilico.hpc.uio.no/pcgr/pcgr_ref_data.20240612.grch37.tgz) (approx 4.8Gb)
+- [grch38 data bundle - 20240612](https://insilico.hpc.uio.no/pcgr/pcgr_ref_data.20240612.grch38.tgz) (approx 4.8Gb)
 
 - Example:
 
 ```bash
 GENOME="grch38" # or "grch37"
-BUNDLE_VERSION="20240610"
+BUNDLE_VERSION="20240612"
 BUNDLE="pcgr_ref_data.${BUNDLE_VERSION}.${GENOME}.tgz"
 
 wget https://insilico.hpc.uio.no/pcgr/${BUNDLE}

diff --git a/pcgrr/vignettes/output.Rmd b/pcgrr/vignettes/output.Rmd
@@ -89,6 +89,7 @@ A VCF file containing annotated, somatic calls (single nucleotide variants and i
 | `Feature` | Ensembl stable ID of feature (picked by VEP's `--flag_pick_allele` option) |
 | `cDNA_position` | Relative position of base pair in cDNA sequence (picked by VEP's `--flag_pick_allele` option) |
 | `CDS_position` | Relative position of base pair in coding sequence (picked by VEP's `--flag_pick_allele` option) |
+| `CDS_RELATIVE_POSITION` | Ratio of variant coding position to length of coding sequence |
 | `CDS_CHANGE` | Coding, transcript-specific sequence annotation (picked by VEP's `--flag_pick_allele` option) |
 | `AMINO_ACID_START` | Protein position indicating absolute start of amino acid altered (fetched from `Protein_position`) |
 | `AMINO_ACID_END` |  Protein position indicating absolute end of amino acid altered (fetched from `Protein_position`) |
@@ -119,6 +120,7 @@ A VCF file containing annotated, somatic calls (single nucleotide variants and i
 | `CCDS` | The CCDS identifier for this transcript, where applicable (picked by VEP's `--flag_pick_allele` option) |
 | `INTRON` | The intron number (out of total number) (picked by VEP's `--flag_pick_allele` option) |
 | `EXON` | The exon number (out of total number) (picked by VEP's `--flag_pick_allele` option) |
+| `EXON_AFFECTED` | The exon affected by the variant (picked by VEP's `--flag_pick_allele` option) |
 | `LAST_EXON` | Logical indicator for last exon of transcript (picked by VEP's `--flag_pick_allele` option) |
 | `LAST_INTRON` | Logical indicator for last intron of transcript (picked by VEP's `--flag_pick_allele` option) |
 | `INTRON_POSITION` | Relative position of intron variant to nearest exon/intron junction (NearestExonJB VEP plugin) |
@@ -132,9 +134,14 @@ A VCF file containing annotated, somatic calls (single nucleotide variants and i
 | `ALLELE_NUM` | Allele number from input; 0 is reference, 1 is first alternate etc - VEP |
 | `REFSEQ_MATCH` | The RefSeq transcript match status; contains a number of flags indicating whether this RefSeq transcript matches the underlying reference sequence and/or an Ensembl transcript (picked by VEP's `--flag_pick_allele` option) |
 | `PICK` | Indicates if this block of consequence data was picked by VEP's `--flag_pick_allele` option |
-| `VEP_ALL_CONSEQUENCE` | All transcript consequences (`Consequence:SYMBOL:Feature_type:Feature:BIOTYPE`) - VEP |
+| `VEP_ALL_CSQ` | All transcript consequences (`Consequence:SYMBOL:Feature_type:Feature:BIOTYPE`) - VEP |
 | `EXONIC_STATUS` | Indicates if variant consequence type is 'exonic' or 'nonexonic'. We here define 'exonic' as any variant with either of the following consequences: `stop_gained / stop_lost`, `start_lost`, `frameshift_variant`, `missense_variant`, `splice_donor_variant`, `splice_acceptor_variant`, `inframe_insertion / inframe_deletion`, `synonymous_variant`, `start_retained`, `stop_retained`, `protein_altering` |
 | `CODING_STATUS` | Indicates if primary variant consequence type is 'coding' or 'noncoding' (wrt. protein-alteration). 'coding' variants are here defined as those with an 'exonic' status, with the exception of synonymous variants |
+| `NULL_VARIANT` | Primary variant consequence type is `frameshift` or `stop_gained`/`stop_lost` |
+| `LOSS_OF_FUNCTION` | Loss-of-function variant |
+| `LOF_FILTER` | Loss-of-function filter |
+| `SPLICE_DONOR_RELEVANT` | Logical indicating if variant is located at a particular location near the splice donor site (`+3A/G`, `+4A` or `+5G`) |
+| `REGULATORY_ANNOTATION` | Comma-separated list of all variant annotations of `Feature_type`, `RegulatoryFeature`, and `MotifFeature`. Format (separated by a `|`): `<Consequence>`, `<Feature_type>`, `<Feature>`, `<BIOTYPE>`, `<MOTIF_NAME>`, `<MOTIF_POS>`, `<HIGH_INF_POS>`, `<MOTIF_SCORE_CHANGE>`, `<TRANSCRIPTION_FACTORS>` |
 
 ##### _Gene information_
 
@@ -238,10 +245,10 @@ A VCF file containing annotated, somatic calls (single nucleotide variants and i
 
 | Tag | Description |
 |-----|-------------|
-| `CHEMBL_COMPOUND_ID` | antineoplastic drugs targeting the encoded protein (from [Open Targets Platform](https://www.targetvalidation.org/), drugs are listed as [ChEMBL](https://www.ebi.ac.uk/chembl/) compound identifiers) |
-| `CIVIC_ID`, `CIVIC_ID_SEGMENT` | Variant/segment (exon, codon) identifiers in the [CIViC database](http://civic.genome.wustl.edu) |
-| `CGI_ID`, `CGI_ID_SEGMENT` | Variant/segment (exon, codon) identifier in the [Cancer Genome Interpreter Cancer Biomarkers Database](https://www.cancergenomeinterpreter.org/biomarkers) |
-
+| `BIOMARKER_MATCH` | Variant matches with biomarker evidence in CIViC/CGI. Format: <db_source>\|<db_variant_id>\|<db_evidence_id>:<tumor_site>:<clinical_significance>:<evidence_level>:<evidence_type>:<germline_somatic>\|<matching_type>. Multiple evidence items are separated by '&'. Example: civic\|174\|EID445:Colon/Rectum:Sensitivity/Response:D:Predictive:Somatic&EID446:Colon/Rectum:Sensitivity/Response:D:Predictive:Somatic\|by_gene_mut. Matching type can be any of `by_genomic_coord`, `by_hgvsp_principal`, `by_hgvsc_principal`, `by_hgvsp_nonprincipal`, `by_hgvsc_nonprincipal`, `by_codon_principal`, `by_exon_mut_principal`, `by_gene_mut_lof`, `by_gene_mut`. |
+| `ONCOGENICITY` | Oncogenicity annotation - VICC/ClinGen SOP implementation |
+| `ONCOGENICITY_CODE` | Oncogenicity code - VICC/ClinGen SOP implementation |
+| `ONCOGENICITY_SCORE` | Oncogenicity score - VICC/ClinGen SOP implementation |
 #### 2. Tab-separated values (TSV)
 
 We provide a tab-separated values file with most important annotations for SNVs/InDels. The file has the following naming convention:
@@ -316,25 +323,51 @@ The following variables are included in the TSV file (VCF tags issued by the use
 | 62. `CALL_CONFIDENCE` | Call confidence |
 
 
-#### 3. Mutational signature contributions
+### Tumor mutational burden (TSV)
 
-We provide a tab-separated values file with information about mutational signatures detected in the tumor sample. The file has the following naming convention:
+We provide a tab-separated values (TSV) file with information about mutational burden detected in the tumor sample. The file has the following naming convention:
 
-- `<sample_id>.pcgr.<genome_assembly>.mutational_signatures.tsv`
+- `<sample_id>.pcgr.<genome_assembly>.tmb.tsv`
 
 The format of the TSV file is the following:
 
 | Variable | Description |
 |----------|-------------|
-| 1. `signature_id` | identifier for signature |
-| 2. `sample_id` | sample identifier |
-| 3. `prop_signature` | relative contribution of mutational signature |
-| 4. `group` | keyword for signature aetiology |
-| 5. `all_reference_signatures` | logical indicating if all reference signatures were used for reconstruction/inference |
-| 6. `tumor_type` | tumor type (used for retrieval of reference signatures) |
-| 7. `reference_collection` | collection used for reference signatures |
-| 8. `reference_signatures` | signatures present in reference collection |
-| 9. `fitting_accuracy` | accuracy of mutational signature fitting |
+| 1. `sample_id` | sample identifier |
+| 2. `n_somatic_variants` | number of somatic variants in total for sample |
+| 3. `tmb_measure` | TMB measure - type of variants included |
+| 4. `tmb_csq_regex` | VEP consequence regex for variants included in TMB calculation |
+| 5. `tmb_target_size_mb` | target size in megabases |
+| 6. `tmb_dp_min` | minimum depth of coverage for variant to be included in TMB calculation |
+| 7. `tmb_af_min` | minimum allele frequency for variant to be included in TMB calculation |
+| 8. `tmb_n_variants` | number of variants included in TMB calculation |
+| 9. `tmb_estimate` | TMB estimate |
+| 10. `tmb_unit` | TMB unit (i.e. mutations/Mb) |
+
+### Mutational signature contributions (TSV)
+
+We provide a tab-separated values (TSV) file with information about mutational signatures detected in the tumor sample. The file has the following naming convention:
+
+- `<sample_id>.pcgr.<genome_assembly>.msigs.tsv.gz`
+
+The format of the TSV file is the following:
+
+| Variable | Description |
+|----------|-------------|
+| 1. `sample_id` | sample identifier |
+| 2. `signature_id` | identifier for signature |
+| 3. `n_bs_iterations` | number of bootstrap iterations |
+| 4. `prop_signature` | relative contribution of mutational signature |
+| 5. `prop_signature_ci_lower` | lower bound of confidence interval for relative contribution of mutational signature |
+| 6. `prop_signature_ci_upper` | upper bound of confidence interval for relative contribution of mutational signature |
+| 7. `aetiology` | underlying aetiology of mutational signature |
+| 8. `comments` | additional comments regarding aetiology |
+| 9. `group` | keyword for signature aetiology |
+| 10. `all_reference_signatures` | logical indicating if all reference signatures were used for reconstruction/inference |
+| 11. `tumor_type` | tumor type (used for retrieval of reference signatures) |
+| 12. `reference_collection` | collection used for reference signatures |
+| 13. `reference_signatures` | signatures present in reference collection |
+| 14. `fitting_accuracy` | accuracy of mutational signature fitting |
 
 
 ### Copy number aberrations
@@ -374,6 +407,18 @@ The format of the compressed TSV file is the following:
 | 21. `BIOMARKER_MATCH` | Biomarker match |
 | 22. `TARGETED_INHIBITORS_ALL2` | Molecularly targeted inhibitors - indicated for any tumor type|
 
-### Excel workbook
+### Gene expression data
+
+If users provide bulk RNA-seq expression data as input, PCGR will attach basic gene annotations for the affected transcripts, and perform similarity analysis and outlier detection if configured by the user. The naming conventions of the compressed TSV files are as follows:
+
+* `<sample_id>.pcgr.<genome_assembly>.expression.tsv.gz`
+  - __NOTE__: This file is organized according to the _affected transcripts_ (i.e. one line/record per affected transcript). Contains basic annotations of the affected transcripts.
+* `<sample_id>.pcgr.<genome_assembly>.expression_similarity.tsv.gz`
+  - __NOTE__: This file is organized according to the _samples_  of other gene expression cohorts (i.e. similarity level, one line/record per sample).
+* `<sample_id>.pcgr.<genome_assembly>.expression_outliers.tsv.gz`
+  - __NOTE__: This file is organized according to the _affected genes_ which are considered outliers with respect to the distribution found in reference cohorts (one line/record per affected outlier/gene).
+
+
+### Excel workbook (XLSX)
 
-The Excel workbook contains multiple sheets with data tables, mostly self-explainable, with annotated datasets pending on the analysis performed (SNVs/InDels, CNAs, biomarker evidence, TMB, MSI, mutational signatures, immune contexture profiling etc). The naming convention of the Excel workbook is as follows: `<sample_id>.pcgr.<genome_assembly>.xlsx`
+The Excel workbook contains multiple sheets with data tables, mostly self-explanatory, with annotated datasets depending on the analysis performed (assay/sample data, SNVs/InDels, CNAs, biomarker evidence, TMB, MSI, mutational signatures, immune contexture profiling etc). The naming convention of the Excel workbook is as follows: `<sample_id>.pcgr.<genome_assembly>.xlsx`
diff --git a/scripts/pcgrr.R b/scripts/pcgrr.R
@@ -39,6 +39,7 @@ pcg_report <- pcgrr::generate_report(
 if (!is.null(pcg_report)) {
   pcgrr::write_report_quarto_html(report = pcg_report)
   pcgrr::write_report_excel(report = pcg_report)
-  pcgrr::write_report_tsv(report = pcg_report, variant_type = 'snv_indel')
-  pcgrr::write_report_tsv(report = pcg_report, variant_type = 'cna_gene')
+  pcgrr::write_report_tsv(report = pcg_report, output_type = 'snv_indel')
+  pcgrr::write_report_tsv(report = pcg_report, output_type = 'cna_gene')
+  pcgrr::write_report_tsv(report = pcg_report, output_type = 'msigs')
 }