From 9aa2a50ea8ab2bc02d02680e1bf733816d30a624 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Thu, 7 Mar 2024 14:16:03 +0100 Subject: [PATCH 1/9] Vcf (#5) * Initial commit * Update DESCRIPTION * bootstrap tests * Test CI * Prototype function to load gene expression data to SE * Stub functions * Add progress bar * Add pre-commit config * Add personalis functions to package and run roxygen * Add relevant documentation on personalis outputs * Update exploratory notebook * Fix pre-commit prettier * Format files according to pre-commit checks * Draft function to read small variant data * configure lintr * Implement basic functionality to read variant and GEX data into MAE * Implement basic error handling for missing samples * Generalize warning for missing samplese * Read CNV data * Fix CNV IO function in case of empty tables * Add function to read personalis HLA data * Add function to read in TCR data * Scrape TCR summary statistics from HTML * Implement function to read somatic variant statistics * Read summary stats for somatic variants * Implement bumpy_matrix_to_df * Read CNV summary statistics * Read MSI info * refactor * Workaround for samples with no col in bumpy matrix * Apply the fix also to small variant data * Use "Genomic Variant" instead of pos as unique variant identifier * Fix issue with reading non-somatic variants * Handle case when there are no samples for a modality * Fix duplicated mutation ids * Fix column name incompatibility in newer HTML report versions * stub vignette * Add vignette * Update vignette * Ensure bumpy matrix, row and coldata have consistent order * Fix alternative gex filename and CNV import * Support alternative TCR path * Fix column conversion in CNV reader * Fix paths * add function for parsing VCF files * add functionality for reading and storing VCF data * add/change comments * add option to read small variant reports of type all * Angewendeter Vorschlag * Angewendeter Vorschlag * add sample type check * Angewendeter Vorschlag * Angewendeter Vorschlag * add report_type parameter * Update README * Fix reading CNV report * Roxygenize * Fix parse copy number report --------- Co-authored-by: Christopher Mohr Co-authored-by: grst --- R/personalis.R | 278 ++++++++++++++++++++++++++++++++++++++++++------- R/util.R | 28 +++++ 2 files changed, 268 insertions(+), 38 deletions(-) diff --git a/R/personalis.R b/R/personalis.R index f97cb04..9ba2742 100644 --- a/R/personalis.R +++ b/R/personalis.R @@ -3,30 +3,51 @@ GUESS_MAX <- 21474836 # largest value without warning - this is anyway more than an excel sheet can have rows #' Read a set of personalis results folders into a MultiAssayExperiment object -#' @param pathlist List with paths to Personalis results folders #' @importFrom MultiAssayExperiment MultiAssayExperiment +#' @param pathlist List with paths to Personalis results folders +#' @param report_type Type of small variant report that should be read (default: preferred) +#' @param read_vcf_data Boolean to specify if VCF file should be read in as well (default: FALSE) #' @return MultiAssayExperiment #' @export -read_personalis <- function(pathlist) { +read_personalis <- function(pathlist, report_type="preferred", read_vcf_data=FALSE) { + stopifnot("`report_type` must be one of 'preferred' or 'all'." = report_type %in% c("preferred", "all")) + experiments <- list() message(">>> Reading gene expression...") experiments[["gene_expression"]] <- read_personalis_gene_expression(pathlist) message(">>> Reading DNA small variant data... ") message("Reading DNA small variant data (somatic)... ") - experiments[["dna_small_variants_somatic"]] <- read_personalis_small_variant_reports(pathlist, "DNA", "somatic") + experiments[["dna_small_variants_somatic"]] <- read_personalis_small_variant_reports(pathlist, "DNA", "somatic", report_type) message("Reading DNA small variant data (tumor)... ") - experiments[["dna_small_variants_tumor"]] <- read_personalis_small_variant_reports(pathlist, "DNA", "tumor") + experiments[["dna_small_variants_tumor"]] <- read_personalis_small_variant_reports(pathlist, "DNA", "tumor", report_type) message("Reading DNA small variant data (normal)... ") - experiments[["dna_small_variants_normalr"]] <- read_personalis_small_variant_reports(pathlist, "DNA", "normal") - + experiments[["dna_small_variants_normal"]] <- read_personalis_small_variant_reports(pathlist, "DNA", "normal", report_type) + + if (read_vcf_data) { + message("Reading DNA small variant data (somatic) from VCF files... ") + experiments[["dna_small_variants_somatic_vcf"]] <- read_personalis_vcf_files(pathlist, "DNA", "somatic") + message("Reading DNA small variant data (tumor) from VCF files... ") + experiments[["dna_small_variants_tumor_vcf"]] <- read_personalis_vcf_files(pathlist, "DNA", "tumor") + message("Reading DNA small variant data (normal) from VCF files... ") + experiments[["dna_small_variants_normal_vcf"]] <- read_personalis_vcf_files(pathlist, "DNA", "normal") + } + message(">>> Reading RNA small variant data... ") message("Reading RNA small variant data (somatic)... ") - experiments[["rna_small_variants_somatic"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "somatic") + experiments[["rna_small_variants_somatic"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "somatic", report_type) message("Reading RNA small variant data (tumor)... ") - experiments[["rna_small_variants_tumor"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "tumor") + experiments[["rna_small_variants_tumor"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "tumor", report_type) message("Reading RNA small variant data (normal)... ") - experiments[["rna_small_variants_normal"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "normal") + experiments[["rna_small_variants_normal"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "normal", report_type) + + # for RNA we only have tumor or somatic variant VCF data + if (read_vcf_data) { + message("Reading RNA small variant data (somatic) from VCF files... ") + experiments[["rna_small_variants_somatic_vcf"]] <- read_personalis_vcf_files(pathlist, "RNA", "somatic") + message("Reading RNA small variant data (tumor) from VCF files... ") + experiments[["rna_small_variants_tumor_vcf"]] <- read_personalis_vcf_files(pathlist, "RNA", "tumor") + } message(">>> Reading microsatellite stability (MSI) data... ") experiments[["msi"]] <- read_personalis_msi_reports(pathlist) @@ -118,11 +139,12 @@ read_personalis_gene_expression_sample <- function(sample_folder) { #' 'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and #' 'somatic' refers to tumor vs. normal (i.e. somatic mutations only). #' @param modality modality from which the variants were called. Can be either 'DNA' or 'RNA' +#' @param report_type report_type which should be read. Can be either 'preferred' or 'all' #' @return SummarizedExperiment #' @importFrom dplyr select distinct bind_rows #' @importFrom purrr map #' @export -read_personalis_small_variant_reports <- function(sample_paths, modality, sample_type) { +read_personalis_small_variant_reports <- function(sample_paths, modality, sample_type, report_type) { read_all <- function(sample, ...) { list( variant_report = read_personalis_small_variant_report_sample(sample, ...), @@ -134,7 +156,8 @@ read_personalis_small_variant_reports <- function(sample_paths, modality, sample read_all, sprintf("%s small variant (%s)", modality, sample_type), modality = modality, - sample_type = sample_type + sample_type = sample_type, + report_type = report_type ) if (!length(variant_list)) { @@ -149,7 +172,7 @@ read_personalis_small_variant_reports <- function(sample_paths, modality, sample row_data <- all_variants |> select( mut_id, - Sequence, + Chromosome, POS, `Variant Type`, `Genomic Variant` @@ -194,23 +217,27 @@ read_personalis_small_variant_reports <- function(sample_paths, modality, sample #' We also do not read the `cancer_clinical_research`, the `cancer_research` and the `lowpupulationfreq` reports, #' because they are subsets of the full report. #' -#' In addition, Personalis also provides raw VCF files with all (unfiltered) variants. We currently don't -#' read them in because without additional filtering they are not very useful. If you need this level of information, -#' feel free to start from the raw vcf files or even run your own variant calling based on the FASTQ files. +#' In addition, Personalis also provides raw VCF files with all (unfiltered) variants. If you need this level of information, +#' please specify read_vcf_data=TRUE in your read_personalis() call. #' #' @param sample_folder path to personalis folder -#' @param sample_types Can be ome or multiple of of 'tumor', 'normal', or 'somatic'. +#' @param sample_types Can be one or multiple of of 'tumor', 'normal', or 'somatic'. #' 'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations), #' 'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and #' 'somatic' refers to tumor vs. normal (i.e. somatic mutations only). #' @param modality modality from which the variants were called. Can be either 'DNA' or 'RNA' +#' @param report_type report_type which should be read. Can be either 'preferred' or 'all' #' @return data frame with all variants #' @importFrom dplyr mutate #' @keywords internal -read_personalis_small_variant_report_sample <- function(sample_folder, modality, sample_type) { +read_personalis_small_variant_report_sample <- function(sample_folder, modality, sample_type, report_type) { stopifnot("`modality` must be one of 'DNA' or 'RNA'." = modality %in% c("DNA", "RNA")) stopifnot("`sample_types` must be in 'tumor', 'normal', or 'somatic'." = sample_type %in% c("tumor", "normal", "somatic")) + stopifnot("`report_type` must be one of 'preferred' or 'full'." = report_type %in% c("preferred", "all")) sample_name <- basename(sample_folder) + # determine report folder based on parameter + report_folder <- ifelse(report_type == "preferred", "Preferred_Transcripts", "All_Transcripts") + # the prefix of the normal filename ends with N rather than T, e.g. K13T -> tumor, K13N -> normal # even though they are all in the K13T folder. tmp_sample_name <- if_else(sample_type == "normal", sub("T$", "N", sample_name), sample_name) @@ -219,13 +246,15 @@ read_personalis_small_variant_report_sample <- function(sample_folder, modality, sample_folder, sprintf("%s_Pipeline", modality), "Annotated_SmallVariant_Reports", - "Preferred_Transcripts", - sprintf("%s_%s_%s_%s_small_variant_report_preferred.xlsx", modality, tmp_sample_name, sample_type, tolower(modality)) + report_folder, + sprintf("%s_%s_%s_%s_small_variant_report_%s.xlsx", modality, tmp_sample_name, sample_type, tolower(modality), report_type) ), guess_max = GUESS_MAX ) |> mutate(sample = sample_name) |> - mutate(mut_id = sprintf("%s_%s_%s", Sequence, `Genomic Variant`, `Variant Type`)) + # in older versions, the "Chromosome" column is called "Sequence" + rename_with(\(x) if_else(x == "Sequence", "Chromosome", x)) |> + mutate(mut_id = sprintf("%s_%s_%s", Chromosome, `Genomic Variant`, `Variant Type`)) variant_table } @@ -233,8 +262,9 @@ read_personalis_small_variant_report_sample <- function(sample_folder, modality, #' #' @importFrom tidyr pivot_longer #' @importFrom dplyr bind_rows +#' @importFrom purrr keep #' @keywords internal -read_personalis_somatic_variants_summary_statistics <- function(sample_folder, modality, sample_type) { +read_personalis_somatic_variants_summary_statistics <- function(sample_folder, modality, sample_type, report_type) { stopifnot("`modality` must be one of 'DNA' or 'RNA'." = modality %in% c("DNA", "RNA")) sample_name <- basename(sample_folder) @@ -252,7 +282,9 @@ read_personalis_somatic_variants_summary_statistics <- function(sample_folder, m html_elements("#somatic_variant_annotation") |> html_elements("table") |> html_table(na.strings = "N/A") - tables[2:3] |> + tables |> + # some reports contain two such tables, some only one + keep(\(x) "SNVs" %in% colnames(x)) |> lapply(function(df) { colnames(df) <- make.names(colnames(df)) colnames(df)[1] <- "metric" @@ -267,6 +299,158 @@ read_personalis_somatic_variants_summary_statistics <- function(sample_folder, m pivot_wider(id_cols = sample, names_from = "var_name", values_from = "value") } +#' Read HTML report on variant calling summary statistics and store information +#' based on given modality and sample_type: +#' DNA -> Normal DNA or Tumor DNA "Summary Small Variants" table +#' RNA -> Tumor RNA "Summary Small Variants" table +#' somatic -> "Tumor/Normal Concordance" table +#' +#' @param sample_folder path to personalis folder +#' @param sample_type Can be one or multiple of of 'tumor', 'normal', or 'somatic'. +#' 'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations), +#' 'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and +#' 'somatic' refers to tumor vs. normal (i.e. somatic mutations only). +#' @param modality modality from which the variants were called. Can be either 'DNA' or 'RNA' +#' @importFrom rvest read_html html_elements html_nodes html_text +#' @importFrom tidyr pivot_longer +#' @importFrom stringr str_to_title +read_personalis_variant_calling_summary_statistics <- function(sample_folder, modality, sample_type) { + stopifnot("`modality` must be one of 'DNA' or 'RNA'." = modality %in% c("DNA", "RNA")) + stopifnot("`sample_type` must be one of 'tumor', 'normal', or 'somatic'." = sample_type %in% c("tumor", "normal", "somatic")) + sample_name <- basename(sample_folder) + + html_file <- file.path( + sample_folder, + "QC_REPORT", + sprintf("%s_%s_%s_statistics.html", modality, sample_name, tolower(modality)) + ) + + # decide based on modality and sample_type which content to read from html + html_section <- if_else(sample_type == "somatic", "#concordance", sprintf("#%s_%s", str_to_title(sample_type), modality)) + table_number <- if_else(sample_type == "somatic", 1, 2) + columns_to_fix <- if(sample_type == 'somatic') c() else c('SNVs', 'Indels', 'Total') + + tables <- read_html(html_file) |> + html_elements(html_section) |> + html_elements("table") |> + html_table(na.strings = "N/A") + tes <- tables[table_number] |> + lapply(function(df) { + colnames(df) <- make.names(colnames(df)) + colnames(df)[1] <- "metric" + df |> + mutate(across(all_of(columns_to_fix), fix_thousands_separator)) + }) |> + bind_rows() |> + pivot_longer(-metric, names_to = "mut_type", values_to = "value") |> + mutate(sample = sample_name) |> + mutate(var_name = sprintf("%s (%s)", metric, mut_type)) |> + select(sample, var_name, value) |> + pivot_wider(id_cols = sample, names_from = "var_name", values_from = "value") |> + mutate(across(contains("Number"), fix_thousands_separator)) +} + +#' Read in (unfiltered) small variant data from VCF files from personalis folders +#' @param sample_list A vector of paths to personalis folders +#' @return SummarizedExperiment +#' @export +read_personalis_vcf_files <- function(sample_paths, modality, sample_type) { + stopifnot("`modality` must be one of 'DNA' or 'RNA'." = modality %in% c("DNA", "RNA")) + + read_all <- function(sample, ...) { + list( + vcf_data = read_personalis_vcf_files_sample(sample, ...), + summary_stats = read_personalis_variant_calling_summary_statistics(sample, ...) + ) + } + + variant_list <- read_samples( + sample_paths, + read_all, + sprintf("%s small variant VCF data (%s)", modality, sample_type), + modality = modality, + sample_type = sample_type + ) + + if (!length(variant_list)) { + return(NULL) + } + + col_data <- map(variant_list, "summary_stats") |> + bind_rows() |> + tibble::column_to_rownames("sample") + + all_variants <- map(variant_list, "vcf_data") |> bind_rows() + row_data <- all_variants |> + select( + mut_id, + CHROM, + POS, + REF, + ALT + ) |> + distinct() + stopifnot("mut_id is not a unique identifier" = !anyDuplicated(row_data$mut_id)) + + # ignore columns that are already in `row_data` except for the `mut_id` identifier + row_data_cols <- colnames(row_data)[colnames(row_data) != "mut_id"] + all_variants <- all_variants[, !(colnames(all_variants) %in% row_data_cols)] + all_variants <- add_dummy_entry(all_variants, col_data) + variants_bm <- splitAsBumpyMatrix( + all_variants, + row = all_variants$mut_id, + column = all_variants$sample, + sparse = TRUE + ) + # sort row and col data to be consistent with BM + row_data <- tibble::column_to_rownames(row_data, "mut_id")[rownames(variants_bm), ] + col_data <- col_data[colnames(variants_bm), ] + + se <- SummarizedExperiment( + assays = list( + variants = variants_bm + ), + rowData = row_data, + colData = col_data + ) + +} + +#' Read Personalis (raw) VCF files for a provided folders/multiple samples and merge to one table. +#' +#' @param sample_folder base path to personalis folders +#' @param sample_type Can be one or multiple of of 'tumor', 'normal', or 'somatic'. +#' 'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations), +#' 'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and +#' 'somatic' refers to tumor vs. normal (i.e. somatic mutations only). +#' @param modality modality from which the variants were called. Can be either 'DNA' or 'RNA' +#' @return data frame with all variants +read_personalis_vcf_files_sample <- function(sample_folder, modality, sample_type) { + stopifnot("`modality` must be one of 'DNA'/'dna' or 'RNA'/'rna'." = modality %in% c("DNA", "RNA", "dna", "rna")) + stopifnot("`sample_type` must be in 'tumor', 'normal', or 'somatic'." = sample_type %in% c("tumor", "normal", "somatic")) + sample_name <- basename(sample_folder) + # the prefix of the normal filename ends with N rather than T, e.g. K13T -> tumor, K13N -> normal + # even though they are all in the K13T folder. + tmp_sample_name <- if_else(sample_type == "normal", sub("T$", "N", sample_name), sample_name) + + # tumor DNA VCF files are gzipped, not totally clear if this rule applies to all cases + file_ending <- if_else(sample_type == "tumor" & modality == "DNA", "vcf.gz", "vcf") + + variant_table <- parse_vcf_to_df( + file.path( + sample_folder, + sprintf("%s_Pipeline", modality), + "Variants", + sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), file_ending + ) + ) + ) %>% + mutate(sample = sample_name) %>% + mutate(mut_id = sprintf("%s_%s_%s_%s", CHROM, POS, REF, ALT)) + + variant_table +} + # # -------------------- CNV -------------------------- @@ -291,6 +475,14 @@ read_personalis_cnv_reports <- function(sample_paths) { return(NULL) } + col_data <- bind_rows(map(cnv_list, "summary_stats")) + if (nrow(col_data)) { + col_data <- col_data |> tibble::column_to_rownames("sample") + } + + all_cnv <- bind_rows(map(cnv_list, "cnv_report")) + row_data <- all_cnv |> + select(cnv_id, `Gene Symbol`, `Chromosome`, `Segment Start`, `Segment End`) |> col_data <- bind_rows(map(cnv_list, "summary_stats")) |> tibble::column_to_rownames("sample") @@ -339,11 +531,11 @@ read_personalis_cnv_report_sample <- function(sample_folder) { "Gene Symbol" = as.character, "CNA Type" = as.character, "AbsoluteCN" = as.numeric, - "Sequence" = as.character, + "Chromosome" = as.character, "Segment Start" = as.numeric, "Segment End" = as.numeric, - "Estimated Sample purity" = as.numeric, - "Estimated Sample Ploidy" = as.numeric, + # "Estimated Sample purity" = as.numeric, + # "Estimated Sample Ploidy" = as.numeric, "Percent of Gene in Event" = \(x) as.numeric(sub("%", "", x)) ) suppressWarnings({ @@ -353,9 +545,13 @@ read_personalis_cnv_report_sample <- function(sample_folder) { # we also can't specify the columns at import time, because in some personalis versions, some columns # are omitted. amp = read_excel(cnv_file, sheet = "AMP", col_types = NULL) |> + # In older reports the "Chromosome" column is called sequence + rename_with(\(x) if_else(x == "Sequence", "Chromosome", x)) |> select(-any_of(c("log posterior probability", "B-allele Frequency", "Allelotype", "Mean_log2Ratio"))) |> mutate(across(names(COL_TYPES), \(x) COL_TYPES[[cur_column()]](x))), del = read_excel(cnv_file, sheet = "DEL", col_types = NULL) |> + # In older reports the "Chromosome" column is called sequence + rename_with(\(x) if_else(x == "Sequence", "Chromosome", x)) |> select(-any_of(c("Wilcoxon pvalue", "KS pvalue"))) |> mutate(across(names(COL_TYPES), \(x) COL_TYPES[[cur_column()]](x))) ) @@ -368,7 +564,7 @@ read_personalis_cnv_report_sample <- function(sample_folder) { cnv_table <- cnv_table |> mutate(sample = sample_name) |> # if a segment spans multiple genes, there will be multiple rows per gene - mutate(cnv_id = sprintf("%s_%i_%i_%s", Sequence, `Segment Start`, `Segment End`, `Gene Symbol`)) + mutate(cnv_id = sprintf("%s_%i_%i_%s", Chromosome, `Segment Start`, `Segment End`, `Gene Symbol`)) } cnv_table @@ -386,18 +582,24 @@ read_personalis_cnv_summary_statistics <- function(sample_folder) { sprintf("DNA_%s_dna_statistics.html", sample_name) ) # unfortunately, this is not a table, but a div of divs that looks like a table - table <- (read_html(html_file) |> html_elements("#copy_number"))[[1]] - titles <- table |> - html_nodes(".title") |> - html_text() - values <- table |> - html_nodes(".value") |> - html_text() - - cnv_metrics <- tibble(metric = titles[1:5], value = values[1:5]) |> - mutate(sample = sample_name) |> - pivot_wider(id_cols = sample, names_from = metric, values_from = value) - cnv_metrics + table <- (read_html(html_file) |> html_elements("#copy_number")) + # unfortunately, it seems missing in newer versions of the report + if (!length(table)) { + return(tibble()) + } else { + table <- table[[1]] + titles <- table |> + html_nodes(".title") |> + html_text() + values <- table |> + html_nodes(".value") |> + html_text() + + cnv_metrics <- tibble(metric = titles[1:5], value = values[1:5]) |> + mutate(sample = sample_name) |> + pivot_wider(id_cols = sample, names_from = metric, values_from = value) + cnv_metrics + } } diff --git a/R/util.R b/R/util.R index ce14b0a..1528b9e 100644 --- a/R/util.R +++ b/R/util.R @@ -114,3 +114,31 @@ add_dummy_entry <- function(df, col_data, sample_col = "sample") { dummy_entries ) } + +#' Parse VCF files for a provided path and construct data frame. +#' +#' @param path path to VCF file in `*.vcf` or `*.vcf.gz` format +#' @return {tibble} new data frame with all variants (fixed field and genotype information) +#' @importFrom dplyr mutate left_join +#' @importFrom vcfR read.vcfR vcfR2tidy +#' @importFrom stringr str_split_i +parse_vcf_to_df <- function(path) { + # parse VCF file + vcf_content <- read.vcfR(path) + + # fixed field content to data frame + fixed_df <- vcfR2tidy(vcf_content)$fix + + # GT content to data frame + gt_df <- vcfR2tidy(vcf_content)$gt + + # create addition column with observed nucleotides in order to avoid collisions when we do the left_join + gt_df <- gt_df %>% + dplyr::mutate(ALT = str_split_i(gt_GT_alleles, "/", 2)) + + # next use ChromKey, POS and ALT for joining vcf content data frames + joined_vcf_df <- fixed_df %>% + dplyr::left_join(gt_df, by = c("ChromKey", "POS", "ALT")) + + as_tibble(joined_vcf_df) +} From eb2328a54608eab338f60dc3dd36b132da4a1740 Mon Sep 17 00:00:00 2001 From: grst Date: Thu, 7 Mar 2024 13:27:20 +0000 Subject: [PATCH 2/9] Roxygenize --- NAMESPACE | 6 ++++ man/parse_vcf_to_df.Rd | 17 ++++++++++ man/read_personalis.Rd | 6 +++- ..._personalis_small_variant_report_sample.Rd | 12 ++++--- man/read_personalis_small_variant_reports.Rd | 9 ++++- ...alis_variant_calling_summary_statistics.Rd | 33 +++++++++++++++++++ man/read_personalis_vcf_files.Rd | 17 ++++++++++ man/read_personalis_vcf_files_sample.Rd | 24 ++++++++++++++ 8 files changed, 117 insertions(+), 7 deletions(-) create mode 100644 man/parse_vcf_to_df.Rd create mode 100644 man/read_personalis_variant_calling_summary_statistics.Rd create mode 100644 man/read_personalis_vcf_files.Rd create mode 100644 man/read_personalis_vcf_files_sample.Rd diff --git a/NAMESPACE b/NAMESPACE index 7cff0c9..99bf535 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(read_personalis_hla_reports) export(read_personalis_msi_reports) export(read_personalis_small_variant_reports) export(read_personalis_tcr_reports) +export(read_personalis_vcf_files) importFrom(MultiAssayExperiment,MultiAssayExperiment) importFrom(SummarizedExperiment,SummarizedExperiment) importFrom(dplyr,across) @@ -15,6 +16,7 @@ importFrom(dplyr,bind_rows) importFrom(dplyr,cur_column) importFrom(dplyr,distinct) importFrom(dplyr,if_else) +importFrom(dplyr,left_join) importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(purrr,keep) @@ -25,6 +27,10 @@ importFrom(rvest,html_nodes) importFrom(rvest,html_table) importFrom(rvest,html_text) importFrom(rvest,read_html) +importFrom(stringr,str_split_i) +importFrom(stringr,str_to_title) importFrom(tibble,tibble) importFrom(tidyr,pivot_longer) importFrom(tidyr,pivot_wider) +importFrom(vcfR,read.vcfR) +importFrom(vcfR,vcfR2tidy) diff --git a/man/parse_vcf_to_df.Rd b/man/parse_vcf_to_df.Rd new file mode 100644 index 0000000..1f667a9 --- /dev/null +++ b/man/parse_vcf_to_df.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/util.R +\name{parse_vcf_to_df} +\alias{parse_vcf_to_df} +\title{Parse VCF files for a provided path and construct data frame.} +\usage{ +parse_vcf_to_df(path) +} +\arguments{ +\item{path}{path to VCF file in \verb{*.vcf} or \verb{*.vcf.gz} format} +} +\value{ +{tibble} new data frame with all variants (fixed field and genotype information) +} +\description{ +Parse VCF files for a provided path and construct data frame. +} diff --git a/man/read_personalis.Rd b/man/read_personalis.Rd index 40e43ab..f9c8e7d 100644 --- a/man/read_personalis.Rd +++ b/man/read_personalis.Rd @@ -4,10 +4,14 @@ \alias{read_personalis} \title{Read a set of personalis results folders into a MultiAssayExperiment object} \usage{ -read_personalis(pathlist) +read_personalis(pathlist, report_type = "preferred", read_vcf_data = FALSE) } \arguments{ \item{pathlist}{List with paths to Personalis results folders} + +\item{report_type}{Type of small variant report that should be read (default: preferred)} + +\item{read_vcf_data}{Boolean to specify if VCF file should be read in as well (default: FALSE)} } \value{ MultiAssayExperiment diff --git a/man/read_personalis_small_variant_report_sample.Rd b/man/read_personalis_small_variant_report_sample.Rd index a4f9bc8..167f544 100644 --- a/man/read_personalis_small_variant_report_sample.Rd +++ b/man/read_personalis_small_variant_report_sample.Rd @@ -7,7 +7,8 @@ read_personalis_small_variant_report_sample( sample_folder, modality, - sample_type + sample_type, + report_type ) } \arguments{ @@ -15,7 +16,9 @@ read_personalis_small_variant_report_sample( \item{modality}{modality from which the variants were called. Can be either 'DNA' or 'RNA'} -\item{sample_types}{Can be ome or multiple of of 'tumor', 'normal', or 'somatic'. +\item{report_type}{report_type which should be read. Can be either 'preferred' or 'all'} + +\item{sample_types}{Can be one or multiple of of 'tumor', 'normal', or 'somatic'. 'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations), 'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and 'somatic' refers to tumor vs. normal (i.e. somatic mutations only).} @@ -36,8 +39,7 @@ COSMIC, the default transcript would be the one corresponding to the longest CDS We also do not read the \code{cancer_clinical_research}, the \code{cancer_research} and the \code{lowpupulationfreq} reports, because they are subsets of the full report. -In addition, Personalis also provides raw VCF files with all (unfiltered) variants. We currently don't -read them in because without additional filtering they are not very useful. If you need this level of information, -feel free to start from the raw vcf files or even run your own variant calling based on the FASTQ files. +In addition, Personalis also provides raw VCF files with all (unfiltered) variants. If you need this level of information, +please specify read_vcf_data=TRUE in your read_personalis() call. } \keyword{internal} diff --git a/man/read_personalis_small_variant_reports.Rd b/man/read_personalis_small_variant_reports.Rd index 82c7b45..dbfb9f0 100644 --- a/man/read_personalis_small_variant_reports.Rd +++ b/man/read_personalis_small_variant_reports.Rd @@ -5,7 +5,12 @@ \title{Read in small variant data from personalis folder We only read in the "Preferred Transcript" report here:} \usage{ -read_personalis_small_variant_reports(sample_paths, modality, sample_type) +read_personalis_small_variant_reports( + sample_paths, + modality, + sample_type, + report_type +) } \arguments{ \item{sample_paths}{A vector of paths to personalis folders} @@ -16,6 +21,8 @@ read_personalis_small_variant_reports(sample_paths, modality, sample_type) 'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations), 'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and 'somatic' refers to tumor vs. normal (i.e. somatic mutations only).} + +\item{report_type}{report_type which should be read. Can be either 'preferred' or 'all'} } \value{ SummarizedExperiment diff --git a/man/read_personalis_variant_calling_summary_statistics.Rd b/man/read_personalis_variant_calling_summary_statistics.Rd new file mode 100644 index 0000000..463cecf --- /dev/null +++ b/man/read_personalis_variant_calling_summary_statistics.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/personalis.R +\name{read_personalis_variant_calling_summary_statistics} +\alias{read_personalis_variant_calling_summary_statistics} +\title{Read HTML report on variant calling summary statistics and store information +based on given modality and sample_type: +DNA -> Normal DNA or Tumor DNA "Summary Small Variants" table +RNA -> Tumor RNA "Summary Small Variants" table +somatic -> "Tumor/Normal Concordance" table} +\usage{ +read_personalis_variant_calling_summary_statistics( + sample_folder, + modality, + sample_type +) +} +\arguments{ +\item{sample_folder}{path to personalis folder} + +\item{modality}{modality from which the variants were called. Can be either 'DNA' or 'RNA'} + +\item{sample_type}{Can be one or multiple of of 'tumor', 'normal', or 'somatic'. +'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations), +'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and +'somatic' refers to tumor vs. normal (i.e. somatic mutations only).} +} +\description{ +Read HTML report on variant calling summary statistics and store information +based on given modality and sample_type: +DNA -> Normal DNA or Tumor DNA "Summary Small Variants" table +RNA -> Tumor RNA "Summary Small Variants" table +somatic -> "Tumor/Normal Concordance" table +} diff --git a/man/read_personalis_vcf_files.Rd b/man/read_personalis_vcf_files.Rd new file mode 100644 index 0000000..361f23a --- /dev/null +++ b/man/read_personalis_vcf_files.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/personalis.R +\name{read_personalis_vcf_files} +\alias{read_personalis_vcf_files} +\title{Read in (unfiltered) small variant data from VCF files from personalis folders} +\usage{ +read_personalis_vcf_files(sample_paths, modality, sample_type) +} +\arguments{ +\item{sample_list}{A vector of paths to personalis folders} +} +\value{ +SummarizedExperiment +} +\description{ +Read in (unfiltered) small variant data from VCF files from personalis folders +} diff --git a/man/read_personalis_vcf_files_sample.Rd b/man/read_personalis_vcf_files_sample.Rd new file mode 100644 index 0000000..7af2d74 --- /dev/null +++ b/man/read_personalis_vcf_files_sample.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/personalis.R +\name{read_personalis_vcf_files_sample} +\alias{read_personalis_vcf_files_sample} +\title{Read Personalis (raw) VCF files for a provided folders/multiple samples and merge to one table.} +\usage{ +read_personalis_vcf_files_sample(sample_folder, modality, sample_type) +} +\arguments{ +\item{sample_folder}{base path to personalis folders} + +\item{modality}{modality from which the variants were called. Can be either 'DNA' or 'RNA'} + +\item{sample_type}{Can be one or multiple of of 'tumor', 'normal', or 'somatic'. +'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations), +'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and +'somatic' refers to tumor vs. normal (i.e. somatic mutations only).} +} +\value{ +data frame with all variants +} +\description{ +Read Personalis (raw) VCF files for a provided folders/multiple samples and merge to one table. +} From 04bb27ebeca68416c6cb69771afee9e72b7b762a Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Thu, 7 Mar 2024 14:29:24 +0100 Subject: [PATCH 3/9] trigger ci From 7c9503ad8ebb2ec2a6a7151e3c8dda5e0c5f6923 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 11 Mar 2024 09:15:09 +0100 Subject: [PATCH 4/9] add missing dependencies --- DESCRIPTION | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 324de69..92302b8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,7 +17,9 @@ Depends: purrr, dplyr, BumpyMatrix, - rvest + rvest, + stringr, + vcfR Suggests: knitr, rmarkdown, From 6ae3f7fbb1526a437d9fafb6f4e419ed121a5585 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 11 Mar 2024 09:25:19 +0100 Subject: [PATCH 5/9] Depends -> Imports --- DESCRIPTION | 4 ++-- R/personalis.R | 7 ++++++- R/util.R | 2 ++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 92302b8..01b03a1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -7,7 +7,8 @@ Description: This package provides convenience functions for reading real-world Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.1 -Depends: +Imports: + dplyr SummarizedExperiment, readxl, MultiAssayExperiment, @@ -15,7 +16,6 @@ Depends: pbapply, tidyr, purrr, - dplyr, BumpyMatrix, rvest, stringr, diff --git a/R/personalis.R b/R/personalis.R index 6d9b5ed..079a218 100644 --- a/R/personalis.R +++ b/R/personalis.R @@ -351,7 +351,12 @@ read_personalis_variant_calling_summary_statistics <- function(sample_folder, mo } #' Read in (unfiltered) small variant data from VCF files from personalis folders -#' @param sample_list A vector of paths to personalis folders +#' @param sample_paths A vector of paths to personalis folders +#' @param sample_type Can be one or multiple of of 'tumor', 'normal', or 'somatic'. +#' 'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations), +#' 'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and +#' 'somatic' refers to tumor vs. normal (i.e. somatic mutations only). +#' @param modality modality from which the variants were called. Can be either 'DNA' or 'RNA' #' @return SummarizedExperiment #' @export read_personalis_vcf_files <- function(sample_paths, modality, sample_type) { diff --git a/R/util.R b/R/util.R index 1528b9e..e75b128 100644 --- a/R/util.R +++ b/R/util.R @@ -98,6 +98,7 @@ read_samples <- function(sample_paths, io_func, description, ...) { #' @param col_data {data.frame} data frame that is used as colData (must have rownames that are sample identifiers!) #' @param sample_col {character} column in `df` that contains the sample identifier #' @return {tibble} new data frame with dummy entries added +#' @importFrom tibble as_tibble #' @keywords internal add_dummy_entry <- function(df, col_data, sample_col = "sample") { missing_samples <- setdiff(rownames(col_data), unique(df[[sample_col]])) @@ -122,6 +123,7 @@ add_dummy_entry <- function(df, col_data, sample_col = "sample") { #' @importFrom dplyr mutate left_join #' @importFrom vcfR read.vcfR vcfR2tidy #' @importFrom stringr str_split_i +#' @importFrom tibble as_tibble parse_vcf_to_df <- function(path) { # parse VCF file vcf_content <- read.vcfR(path) From 9bf8134395d13f0846a735bb7c38dd8741ba654a Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 11 Mar 2024 09:27:13 +0100 Subject: [PATCH 6/9] missing comma --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 01b03a1..0a44a79 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,7 +8,7 @@ Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.1 Imports: - dplyr + dplyr, SummarizedExperiment, readxl, MultiAssayExperiment, From 9c2d32f42930b0e98725dd5f0807400baf70aecd Mon Sep 17 00:00:00 2001 From: grst Date: Mon, 11 Mar 2024 08:28:42 +0000 Subject: [PATCH 7/9] Roxygenize --- NAMESPACE | 1 + man/read_personalis_vcf_files.Rd | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 99bf535..2181dee 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -29,6 +29,7 @@ importFrom(rvest,html_text) importFrom(rvest,read_html) importFrom(stringr,str_split_i) importFrom(stringr,str_to_title) +importFrom(tibble,as_tibble) importFrom(tibble,tibble) importFrom(tidyr,pivot_longer) importFrom(tidyr,pivot_wider) diff --git a/man/read_personalis_vcf_files.Rd b/man/read_personalis_vcf_files.Rd index 361f23a..df2e755 100644 --- a/man/read_personalis_vcf_files.Rd +++ b/man/read_personalis_vcf_files.Rd @@ -7,7 +7,14 @@ read_personalis_vcf_files(sample_paths, modality, sample_type) } \arguments{ -\item{sample_list}{A vector of paths to personalis folders} +\item{sample_paths}{A vector of paths to personalis folders} + +\item{modality}{modality from which the variants were called. Can be either 'DNA' or 'RNA'} + +\item{sample_type}{Can be one or multiple of of 'tumor', 'normal', or 'somatic'. +'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations), +'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and +'somatic' refers to tumor vs. normal (i.e. somatic mutations only).} } \value{ SummarizedExperiment From a14e7d09abddabaa24861fc5fb6c66f1059fb760 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 11 Mar 2024 09:30:26 +0100 Subject: [PATCH 8/9] trigger ci From e9576db1af6e8e8fa0f95299bb123cad4ec68451 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 11 Mar 2024 14:13:19 +0100 Subject: [PATCH 9/9] Fix missing imports --- DESCRIPTION | 2 +- NAMESPACE | 4 ++++ R/personalis.R | 55 ++++++++++++++++++++++++++------------------------ R/util.R | 14 ++++++------- 4 files changed, 41 insertions(+), 34 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0a44a79..7eabd69 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: PersonalisIO Title: Read Personalis data into MultiAssayExperiment objects -Version: 0.2.0.9000 +Version: 0.3.0.9000 Authors@R: person("Gregor", "Sturm", , "gregor.sturm@boehringer-ingelheim.com", role = c("aut", "cre")) Description: This package provides convenience functions for reading real-world evidence data provided by Personalis into Bioconductor MultiAssayExperiment objects. diff --git a/NAMESPACE b/NAMESPACE index 2181dee..72157b1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,16 +8,20 @@ export(read_personalis_msi_reports) export(read_personalis_small_variant_reports) export(read_personalis_tcr_reports) export(read_personalis_vcf_files) +importFrom(BumpyMatrix,splitAsBumpyMatrix) importFrom(MultiAssayExperiment,MultiAssayExperiment) importFrom(SummarizedExperiment,SummarizedExperiment) importFrom(dplyr,across) +importFrom(dplyr,all_of) importFrom(dplyr,any_of) importFrom(dplyr,bind_rows) +importFrom(dplyr,contains) importFrom(dplyr,cur_column) importFrom(dplyr,distinct) importFrom(dplyr,if_else) importFrom(dplyr,left_join) importFrom(dplyr,mutate) +importFrom(dplyr,rename_with) importFrom(dplyr,select) importFrom(purrr,keep) importFrom(purrr,map) diff --git a/R/personalis.R b/R/personalis.R index 079a218..eb6780b 100644 --- a/R/personalis.R +++ b/R/personalis.R @@ -9,9 +9,9 @@ GUESS_MAX <- 21474836 # largest value without warning - this is anyway more than #' @param read_vcf_data Boolean to specify if VCF file should be read in as well (default: FALSE) #' @return MultiAssayExperiment #' @export -read_personalis <- function(pathlist, report_type="preferred", read_vcf_data=FALSE) { +read_personalis <- function(pathlist, report_type = "preferred", read_vcf_data = FALSE) { stopifnot("`report_type` must be one of 'preferred' or 'all'." = report_type %in% c("preferred", "all")) - + experiments <- list() message(">>> Reading gene expression...") experiments[["gene_expression"]] <- read_personalis_gene_expression(pathlist) @@ -32,7 +32,7 @@ read_personalis <- function(pathlist, report_type="preferred", read_vcf_data=FAL message("Reading DNA small variant data (normal) from VCF files... ") experiments[["dna_small_variants_normal_vcf"]] <- read_personalis_vcf_files(pathlist, "DNA", "normal") } - + message(">>> Reading RNA small variant data... ") message("Reading RNA small variant data (somatic)... ") experiments[["rna_small_variants_somatic"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "somatic", report_type) @@ -40,7 +40,7 @@ read_personalis <- function(pathlist, report_type="preferred", read_vcf_data=FAL experiments[["rna_small_variants_tumor"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "tumor", report_type) message("Reading RNA small variant data (normal)... ") experiments[["rna_small_variants_normal"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "normal", report_type) - + # for RNA we only have tumor or somatic variant VCF data if (read_vcf_data) { message("Reading RNA small variant data (somatic) from VCF files... ") @@ -142,6 +142,7 @@ read_personalis_gene_expression_sample <- function(sample_folder) { #' @param report_type report_type which should be read. Can be either 'preferred' or 'all' #' @return SummarizedExperiment #' @importFrom dplyr select distinct bind_rows +#' @importFrom BumpyMatrix splitAsBumpyMatrix #' @importFrom purrr map #' @export read_personalis_small_variant_reports <- function(sample_paths, modality, sample_type, report_type) { @@ -228,7 +229,7 @@ read_personalis_small_variant_reports <- function(sample_paths, modality, sample #' @param modality modality from which the variants were called. Can be either 'DNA' or 'RNA' #' @param report_type report_type which should be read. Can be either 'preferred' or 'all' #' @return data frame with all variants -#' @importFrom dplyr mutate +#' @importFrom dplyr mutate rename_with #' @keywords internal read_personalis_small_variant_report_sample <- function(sample_folder, modality, sample_type, report_type) { stopifnot("`modality` must be one of 'DNA' or 'RNA'." = modality %in% c("DNA", "RNA")) @@ -237,7 +238,7 @@ read_personalis_small_variant_report_sample <- function(sample_folder, modality, sample_name <- basename(sample_folder) # determine report folder based on parameter report_folder <- ifelse(report_type == "preferred", "Preferred_Transcripts", "All_Transcripts") - + # the prefix of the normal filename ends with N rather than T, e.g. K13T -> tumor, K13N -> normal # even though they are all in the K13T folder. tmp_sample_name <- if_else(sample_type == "normal", sub("T$", "N", sample_name), sample_name) @@ -314,21 +315,22 @@ read_personalis_somatic_variants_summary_statistics <- function(sample_folder, m #' @importFrom rvest read_html html_elements html_nodes html_text #' @importFrom tidyr pivot_longer #' @importFrom stringr str_to_title +#' @importFrom dplyr all_of across contains read_personalis_variant_calling_summary_statistics <- function(sample_folder, modality, sample_type) { stopifnot("`modality` must be one of 'DNA' or 'RNA'." = modality %in% c("DNA", "RNA")) stopifnot("`sample_type` must be one of 'tumor', 'normal', or 'somatic'." = sample_type %in% c("tumor", "normal", "somatic")) sample_name <- basename(sample_folder) - + html_file <- file.path( sample_folder, "QC_REPORT", sprintf("%s_%s_%s_statistics.html", modality, sample_name, tolower(modality)) ) - + # decide based on modality and sample_type which content to read from html html_section <- if_else(sample_type == "somatic", "#concordance", sprintf("#%s_%s", str_to_title(sample_type), modality)) table_number <- if_else(sample_type == "somatic", 1, 2) - columns_to_fix <- if(sample_type == 'somatic') c() else c('SNVs', 'Indels', 'Total') + columns_to_fix <- if (sample_type == "somatic") c() else c("SNVs", "Indels", "Total") tables <- read_html(html_file) |> html_elements(html_section) |> @@ -358,6 +360,7 @@ read_personalis_variant_calling_summary_statistics <- function(sample_folder, mo #' 'somatic' refers to tumor vs. normal (i.e. somatic mutations only). #' @param modality modality from which the variants were called. Can be either 'DNA' or 'RNA' #' @return SummarizedExperiment +#' @importFrom BumpyMatrix splitAsBumpyMatrix #' @export read_personalis_vcf_files <- function(sample_paths, modality, sample_type) { stopifnot("`modality` must be one of 'DNA' or 'RNA'." = modality %in% c("DNA", "RNA")) @@ -368,7 +371,7 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) { summary_stats = read_personalis_variant_calling_summary_statistics(sample, ...) ) } - + variant_list <- read_samples( sample_paths, read_all, @@ -376,11 +379,11 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) { modality = modality, sample_type = sample_type ) - + if (!length(variant_list)) { return(NULL) } - + col_data <- map(variant_list, "summary_stats") |> bind_rows() |> tibble::column_to_rownames("sample") @@ -389,14 +392,14 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) { row_data <- all_variants |> select( mut_id, - CHROM, - POS, + CHROM, + POS, REF, ALT ) |> distinct() stopifnot("mut_id is not a unique identifier" = !anyDuplicated(row_data$mut_id)) - + # ignore columns that are already in `row_data` except for the `mut_id` identifier row_data_cols <- colnames(row_data)[colnames(row_data) != "mut_id"] all_variants <- all_variants[, !(colnames(all_variants) %in% row_data_cols)] @@ -410,7 +413,7 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) { # sort row and col data to be consistent with BM row_data <- tibble::column_to_rownames(row_data, "mut_id")[rownames(variants_bm), ] col_data <- col_data[colnames(variants_bm), ] - + se <- SummarizedExperiment( assays = list( variants = variants_bm @@ -418,7 +421,6 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) { rowData = row_data, colData = col_data ) - } #' Read Personalis (raw) VCF files for a provided folders/multiple samples and merge to one table. @@ -437,22 +439,21 @@ read_personalis_vcf_files_sample <- function(sample_folder, modality, sample_typ # the prefix of the normal filename ends with N rather than T, e.g. K13T -> tumor, K13N -> normal # even though they are all in the K13T folder. tmp_sample_name <- if_else(sample_type == "normal", sub("T$", "N", sample_name), sample_name) - + # tumor DNA VCF files are gzipped, not totally clear if this rule applies to all cases file_ending <- if_else(sample_type == "tumor" & modality == "DNA", "vcf.gz", "vcf") - + variant_table <- parse_vcf_to_df( file.path( sample_folder, sprintf("%s_Pipeline", modality), - "Variants", - sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), file_ending - ) - ) - ) %>% - mutate(sample = sample_name) %>% + "Variants", + sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), file_ending) + ) + ) |> + mutate(sample = sample_name) |> mutate(mut_id = sprintf("%s_%s_%s_%s", CHROM, POS, REF, ALT)) - + variant_table } @@ -466,6 +467,7 @@ read_personalis_vcf_files_sample <- function(sample_folder, modality, sample_typ #' @param sample_paths List of directories with Personalis samples #' @importFrom purrr map #' @importFrom dplyr bind_rows +#' @importFrom BumpyMatrix splitAsBumpyMatrix #' @export read_personalis_cnv_reports <- function(sample_paths) { read_all <- function(path) { @@ -655,6 +657,7 @@ read_personalis_hla_sample <- function(sample_folder) { #' @return SummarizedExperiment #' @export #' @importFrom purrr map +#' @importFrom BumpyMatrix splitAsBumpyMatrix read_personalis_tcr_reports <- function(sample_paths) { read_all <- function(path) { list( diff --git a/R/util.R b/R/util.R index e75b128..292492e 100644 --- a/R/util.R +++ b/R/util.R @@ -127,20 +127,20 @@ add_dummy_entry <- function(df, col_data, sample_col = "sample") { parse_vcf_to_df <- function(path) { # parse VCF file vcf_content <- read.vcfR(path) - + # fixed field content to data frame fixed_df <- vcfR2tidy(vcf_content)$fix - + # GT content to data frame gt_df <- vcfR2tidy(vcf_content)$gt - + # create addition column with observed nucleotides in order to avoid collisions when we do the left_join - gt_df <- gt_df %>% + gt_df <- gt_df |> dplyr::mutate(ALT = str_split_i(gt_GT_alleles, "/", 2)) - + # next use ChromKey, POS and ALT for joining vcf content data frames - joined_vcf_df <- fixed_df %>% + joined_vcf_df <- fixed_df |> dplyr::left_join(gt_df, by = c("ChromKey", "POS", "ALT")) - + as_tibble(joined_vcf_df) }