From 7c7d38b2c5064f43bc3e9b0abfff3b899fc99ff5 Mon Sep 17 00:00:00 2001 From: Christopher Mohr Date: Fri, 6 Oct 2023 10:18:05 +0200 Subject: [PATCH 1/4] transform NA strings to NA to avoid warnings --- R/cvo.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/cvo.R b/R/cvo.R index fb8cc58..dc427ce 100644 --- a/R/cvo.R +++ b/R/cvo.R @@ -221,7 +221,8 @@ parse_cvo_record <- function(record_string){ stringr::str_split("\n") %>% unlist() %>% stringr::str_remove("\\t$") %>% - stringr::str_split("\\t") + stringr::str_split("\\t") %>% + replace(., .=="NA", NA) # replace all string NAs with NA to avoid warnings from as.numeric if(stringr::str_detect(record_string, "TMB|MSI")){ record <- purrr::map(intermediate, ~ as.numeric(.x[2])) From de331191df0d4d4ec72236e216a35e1c4a0b2d42 Mon Sep 17 00:00:00 2001 From: Christopher Mohr Date: Fri, 6 Oct 2023 10:51:25 +0200 Subject: [PATCH 2/4] avoid warnings in nested list --- R/cvo.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/cvo.R b/R/cvo.R index dc427ce..f5750e1 100644 --- a/R/cvo.R +++ b/R/cvo.R @@ -222,7 +222,7 @@ parse_cvo_record <- function(record_string){ unlist() %>% stringr::str_remove("\\t$") %>% stringr::str_split("\\t") %>% - replace(., .=="NA", NA) # replace all string NAs with NA to avoid warnings from as.numeric + rapply(., function(x) ifelse(x=="NA",NA,x), how = "replace") # replace all string NAs with NA to avoid warnings from as.numeric if(stringr::str_detect(record_string, "TMB|MSI")){ record <- purrr::map(intermediate, ~ as.numeric(.x[2])) From 34e77b55b9a90324108369fe1767559e38b44585 Mon Sep 17 00:00:00 2001 From: Christopher Mohr Date: Fri, 10 Nov 2023 16:45:01 +0100 Subject: [PATCH 3/4] add functionality for reading raw CNV data --- NAMESPACE | 8 +++++ R/cnv.R | 75 +++++++++++++++++++++++++++++++++++++++ R/wrangle.R | 28 +++++++++++++++ man/cnv.Rd | 17 +++++++++ man/new_cnv_output.Rd | 21 +++++++++++ man/parse_vcf_to_df.Rd | 17 +++++++++ man/read_cnv_data.Rd | 19 
++++++++++ man/summarize_cnv_data.Rd | 18 ++++++++++ 8 files changed, 203 insertions(+) create mode 100755 R/cnv.R create mode 100644 man/cnv.Rd create mode 100644 man/new_cnv_output.Rd create mode 100644 man/parse_vcf_to_df.Rd create mode 100644 man/read_cnv_data.Rd create mode 100644 man/summarize_cnv_data.Rd diff --git a/NAMESPACE b/NAMESPACE index f0d3115..20cd9c0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,7 @@ export(add_amplification_data) export(add_annotation_data) export(add_common_theme_elements) export(add_tmb_variant_data) +export(cnv) export(cvo) export(filter_consequences) export(filter_depth) @@ -53,6 +54,7 @@ export(process_and_filter_small_variant_data) export(qualitymetrics) export(read_analysis_status) export(read_annotation_data) +export(read_cnv_data) export(read_cvo_data) export(read_dna_expanded_metrics) export(read_dna_qc_metrics) @@ -70,7 +72,13 @@ export(read_splice_variants) export(read_tmb_details_data) export(read_tmb_details_data_csv) export(read_tmb_trace_data) +export(summarize_cnv_data) export(tmb) export(write_multiqc_data) export(write_rdata_file) export(write_workbook) +importFrom(dplyr,left_join) +importFrom(dplyr,mutate) +importFrom(stringr,str_split_i) +importFrom(vcfR,read.vcfR) +importFrom(vcfR,vcfR2tidy) diff --git a/R/cnv.R b/R/cnv.R new file mode 100755 index 0000000..ef15412 --- /dev/null +++ b/R/cnv.R @@ -0,0 +1,75 @@ +#' Read in a *CopyNumberVariants.vcf file and store as an object +#' +#' @description Read in a *CopyNumberVariants.vcf file +#' +#' @param cnv_file_path a file path to a *CopyNumberVariants.vcf file +#' +#' @return A cnv.output object +#' +#' @export +cnv <- function(cnv_file_path, local_app=FALSE){ + new_cnv_output(cnv_file_path) +} + +#' Constructor function for combined.cnv.output objects +#' Not to be called directly +#' +#' @param cnv_file_path a file path to a *CopyNumberVariants.vcf file +#' @param local_app specifies whether quality metrics are coming from local app +#' +#' @return A 
combined.cnv.output object +new_cnv_output <- function(cnv_file_path, local_app=FALSE) { + + cnv_data = tibble(file = cnv_file_path) %>% + mutate(data = lapply(file, parse_vcf_to_df)) %>% + unnest(data) %>% + mutate(sample_id = str_replace(basename(file), "_CopyNumberVariants.vcf", "")) %>% + select(-file) %>% + relocate(sample_id) + + return(structure(cnv_data, class = "combined.cnv.output")) +} + +#' Read in a batch of *CopyNumberVariants.vcf files into a list of CNV objects +#' +#' @param cnv_directory a file path to a directory containing one or more *CopyNumberVariants.vcf files +#' @param local_app specifies whether quality metrics are coming from local app +#' +#' @return A named list of combined.cnv.output objects +#' +#' @export +read_cnv_data <- function(cnv_directory, local_app=FALSE){ + cnv_files <- list.files( + path = cnv_directory, + pattern = "*CopyNumberVariants.vcf", + full.names = TRUE + ) + cnv_data <- map(cnv_files, cnv, local_app) %>% + set_names(str_remove(basename(cnv_files), "\\.vcf$")) + cnv_data +} + +#' Read in a batch of *CopyNumberVariants.vcf files into one dataframe +#' +#' @param cnv_directory a file path to a directory containing one or more +#' *CopyNumberVariants.vcf files +#' +#' @return A dataframe with the read CNV data +#' +#' @export +summarize_cnv_data <- function(cnv_directory){ + cnv_files <- list.files( + path = cnv_directory, + pattern = "*CopyNumberVariants.vcf", + full.names = TRUE + ) + + cnv_data = tibble(file = cnv_files) %>% + mutate(data = lapply(file, parse_vcf_to_df)) %>% + unnest(data) %>% + mutate(sample_id = str_replace(basename(file), "_CopyNumberVariants.vcf", "")) %>% + select(-file) %>% + relocate(sample_id) + + cnv_data +} \ No newline at end of file diff --git a/R/wrangle.R b/R/wrangle.R index 9f5bf3d..c838c79 100644 --- a/R/wrangle.R +++ b/R/wrangle.R @@ -219,6 +219,34 @@ read_rna_expanded_metrics <- function(qmo_list){ return(rna_expanded_metrics) } +#' Parse VCF files for a provided path and construct data frame. 
+#' +#' @param path path to VCF file in `*.vcf` or `*.vcf.gz` format +#' @return {tibble} new data frame with all variants (fixed field and genotype information) +#' @importFrom dplyr mutate left_join +#' @importFrom vcfR read.vcfR vcfR2tidy +#' @importFrom stringr str_split_i +parse_vcf_to_df <- function(path) { +# parse VCF file + vcf_content <- read.vcfR(path) + + # fixed field content to data frame + fixed_df <- vcfR2tidy(vcf_content)$fix + + # GT content to data frame + gt_df <- vcfR2tidy(vcf_content)$gt + + # create addition column with observed nucleotides in order to avoid collisions when we do the left_join + #gt_df <- gt_df %>% + # dplyr::mutate(ALT = str_split_i(gt_GT_alleles, "/", 2)) + + # next use ChromKey, POS and ALT for joining vcf content data frames + joined_vcf_df <- fixed_df %>% + dplyr::left_join(gt_df, by = c("ChromKey", "POS")) + + as_tibble(joined_vcf_df) +} + #' Process and filter small variant data-frame to requirements #' #' @description Processes small-variant data to comply with requirements for diff --git a/man/cnv.Rd b/man/cnv.Rd new file mode 100644 index 0000000..44f6b6e --- /dev/null +++ b/man/cnv.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cnv.R +\name{cnv} +\alias{cnv} +\title{Read in a *CopyNumberVariants.vcf file and store as an object} +\usage{ +cnv(cnv_file_path, local_app = FALSE) +} +\arguments{ +\item{cnv_file_path}{a file path to a *CopyNumberVariants.vcf file} +} +\value{ +A cnv.output object +} +\description{ +Read in a *CopyNumberVariants.vcf file +} diff --git a/man/new_cnv_output.Rd b/man/new_cnv_output.Rd new file mode 100644 index 0000000..f4c890a --- /dev/null +++ b/man/new_cnv_output.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cnv.R +\name{new_cnv_output} +\alias{new_cnv_output} +\title{Constructor function for combined.cnv.output objects +Not to be called directly} +\usage{ 
+new_cnv_output(cnv_file_path, local_app = FALSE) +} +\arguments{ +\item{cnv_file_path}{a file path to a *CopyNumberVariants.vcf file} + +\item{local_app}{specifies whether quality metrics are coming from local app} +} +\value{ +A combined.cnv.output object +} +\description{ +Constructor function for combined.cnv.output objects +Not to be called directly +} diff --git a/man/parse_vcf_to_df.Rd b/man/parse_vcf_to_df.Rd new file mode 100644 index 0000000..fff418f --- /dev/null +++ b/man/parse_vcf_to_df.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wrangle.R +\name{parse_vcf_to_df} +\alias{parse_vcf_to_df} +\title{Parse VCF files for a provided path and construct data frame.} +\usage{ +parse_vcf_to_df(path) +} +\arguments{ +\item{path}{path to VCF file in `*.vcf` or `*.vcf.gz` format} +} +\value{ +{tibble} new data frame with all variants (fixed field and genotype information) +} +\description{ +Parse VCF files for a provided path and construct data frame. 
+} diff --git a/man/read_cnv_data.Rd b/man/read_cnv_data.Rd new file mode 100644 index 0000000..8af30bf --- /dev/null +++ b/man/read_cnv_data.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cnv.R +\name{read_cnv_data} +\alias{read_cnv_data} +\title{Read in a batch of *CopyNumberVariants.vcf files into a list of CNV objects} +\usage{ +read_cnv_data(cnv_directory, local_app = FALSE) +} +\arguments{ +\item{cnv_directory}{a file path to a directory containing one or more *CopyNumberVariants.vcf files} + +\item{local_app}{specifies whether quality metrics are coming from local app} +} +\value{ +A named list of combined.cnv.output objects +} +\description{ +Read in a batch of *CopyNumberVariants.vcf files into a list of CNV objects +} diff --git a/man/summarize_cnv_data.Rd b/man/summarize_cnv_data.Rd new file mode 100644 index 0000000..9ca19b9 --- /dev/null +++ b/man/summarize_cnv_data.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cnv.R +\name{summarize_cnv_data} +\alias{summarize_cnv_data} +\title{Read in a batch of *CopyNumberVariants.vcf files into one dataframe} +\usage{ +summarize_cnv_data(cnv_directory) +} +\arguments{ +\item{cnv_directory}{a file path to a directory containing one or more +*CopyNumberVariants.vcf files} +} +\value{ +A dataframe with the read CNV data +} +\description{ +Read in a batch of *CopyNumberVariants.vcf files into one dataframe +} From 754dc23155fde0c1e0f324563b0a28018fb41c9f Mon Sep 17 00:00:00 2001 From: Christopher Mohr Date: Mon, 13 Nov 2023 11:48:44 +0100 Subject: [PATCH 4/4] add pattern for non-localapp CNV files --- R/cnv.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/cnv.R b/R/cnv.R index ef15412..dbb0e32 100755 --- a/R/cnv.R +++ b/R/cnv.R @@ -60,7 +60,7 @@ read_cnv_data <- function(cnv_directory, local_app=FALSE){ summarize_cnv_data <- function(cnv_directory){ cnv_files <- list.files( path = cnv_directory, - pattern = 
"*CopyNumberVariants.vcf", + pattern = "*cnv.vcf|*CopyNumberVariants.vcf", full.names = TRUE )