Skip to content

Commit

Permalink
plot_dms_heatmap()
Browse files Browse the repository at this point in the history
- added function and Rd
- used function in vignette
  • Loading branch information
tram-nguyen-n committed Dec 3, 2024
1 parent 35282b0 commit 894bd32
Show file tree
Hide file tree
Showing 4 changed files with 272 additions and 63 deletions.
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@ export(available_models)
export(benchmark_models)
export(dms_corr_plot)
export(dms_substitutions)
export(plot_dms_heatmap)
export(zeroshot_DMS_metrics)
importFrom(ComplexHeatmap,Heatmap)
importFrom(ExperimentHub,ExperimentHub)
importFrom(dplyr,arrange)
importFrom(dplyr,as_tibble)
importFrom(dplyr,bind_rows)
importFrom(dplyr,case_when)
Expand Down Expand Up @@ -43,6 +46,8 @@ importFrom(queryup,query_uniprot)
importFrom(spdl,info)
importFrom(stats,cor.test)
importFrom(stats,na.omit)
importFrom(stringr,str_sub)
importFrom(tidyr,pivot_longer)
importFrom(tidyr,pivot_wider)
importFrom(tidyselect,all_of)
importFrom(tidyselect,everything)
195 changes: 195 additions & 0 deletions R/plot_dms_heatmap.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
#' @rdname plot_dms_heatmap
#'
#' @noRd
#'
#' @importFrom dplyr filter
#'

filter_by_pos <-
function(df, start_pos = NULL, end_pos = NULL)
{
## Check pos column
if (!"pos" %in% colnames(df)) {
stop("The dataframe must contain a 'pos' column.")
}

if (!is.integer(df$pos)) {
stop("The 'pos' column must be an integer vector.")
}

## If start_pos is NULL, default to the minimum value in "pos"
if (is.null(start_pos)) {
start_pos <- min(df$pos, na.rm = TRUE)
}

## If end_pos is NULL, default to the maximum value in "pos"
if (is.null(end_pos)) {
end_pos <- max(df$pos, na.rm = TRUE)
}

## Filter the dataframe based on the specified positions
filtered_df <- df |>
filter(pos >= start_pos & pos <= end_pos)

return(filtered_df)
}

#' @rdname plot_dms_heatmap
#'
#' @title Visualize DMS Scores Along a Protein
#'
#' @description `plot_dms_heatmap()` plots DMS scores for amino acid
#' substitutions along a protein in a defined DMS assay.
#'
#' @param assay_name `character()` a valid DMS assay name. For the full list of
#' available assays, run `names()` on the list object loaded with
#' `ProteinGymR::dms_substitutions()`. Alternatively, the name of a
#' user-defined DMS assay.
#'
#' @param dms_data `list()` object of DMS assays loaded with
#' `ProteinGymR::dms_substitutions()`.
#' Alternatively, a user-defined list of DMS assays with names corresponding
#' to `assay_name` param.
#'
#' @param start_pos `integer()` first amino acid position to plot. If missing,
#' default start is at the first position along the protein where DMS scores
#' are available.
#'
#' @param end_pos `integer()` last amino acid position to plot. If missing,
#' default end is at the last position along the protein where DMS scores
#' are available.
#'
#' @details
#'
#' For `plot_dms_heatmap()`,
#' `dms_data` must be a `list()` object with set names for each assay
#' element matching `assay_name` parameter.
#'
#' Each assay in the `dms_data()` must include the following columns:
#'
#' - `mutant`: Mutant identifier string matching.
#' Specifically, the set of substitutions to apply on the reference sequence
#' to obtain the mutated sequence (e.g., A1P:D2N implies the amino acid 'A'
#' at position 1 should be replaced by 'P', and 'D' at position 2 should be
#' replaced by 'N').
#' - `DMS_score`: Experimental measurement in the DMS assay.
#' Higher values indicate higher fitness of the mutated protein.
#'
#' @return `plot_dms_heatmap()` returns a [`ComplexHeatmap::Heatmap-class`]
#' object of DMS scores for each position along a protein in a chosen DMS
#' assay. The x-axis shows amino acid positions where a DMS mutation exist,
#' and the y-axis represents possible amino acid residues, ordered by default
#' based on the physiochemical groupings. Higher and lower DMS scores
#' indicate a more positive or negative fitness effect after the mutation,
#' respectively.
#'
#' @examples
#'
#' dms_data <- dms_substitutions()
#'
#' plot_dms_heatmap(assay_name = "A0A192B1T2_9HIV1_Haddox_2018",
#' dms_data = dms_data,
#' start_pos = 10,
#' end_pos = 80)
#'
#'
#' @importFrom dplyr filter pull as_tibble rename_with mutate
#' arrange select
#'
#' @importFrom tidyr pivot_wider
#'
#' @importFrom ComplexHeatmap Heatmap
#'
#' @importFrom stringr str_sub
#'
#' @export
plot_dms_heatmap <-
function(assay_name, dms_data, start_pos, end_pos, ...)
{
## Extract the specified assay
assay_df <- dms_data[[assay_name]]

## Filter out multiple aa sites
assay_df <- assay_df |>
filter(!grepl(":", .data$mutant))

## Stop if all rows are multiple sites
if (nrow(assay_df) == 0){
stop("Unable to plot DMS substitution heatmap; ",
"assay: '", assay_name, "' contains only ",
"multiple amino acid sites."
)
}

## Wrangle the data
assay_df <- assay_df |>
mutate(
ref = str_sub(.data$mutant, 1, 1),
pos = as.integer(
gsub(".*?([0-9]+).*", "\\1", .data$mutant)
),
alt = str_sub(.data$mutant, -1)
)

assay_df <- assay_df |> dplyr::select("ref", "pos", "alt", "DMS_score")

## Reshape to wide format
assay_wide <- assay_df |>
select(-ref) |>
pivot_wider(names_from = alt, values_from = DMS_score) |>
arrange(pos)

## Subset to start_pos and end_pos, or default to first and last sites.
if (missing(start_pos)) {
message(paste(
"'start_pos' not provided,",
"using the first position in the protein."
))

if (missing(end_pos)) {
message(paste(
"'end_pos' not provided,",
"using the last data point in the protein."
))
}

assay_pos <- filter_by_pos(
df = assay_wide)

} else {

assay_pos <- filter_by_pos(
df = assay_wide,
start_pos = start_pos,
end_pos = end_pos
)
}

## Convert to matrix
pos <- assay_pos$pos
alt <- colnames(assay_pos)
alt <- alt[-c(1)]

heatmap_matrix <- assay_pos |>
select(2:length(assay_pos)) |> as.matrix()

## Set aa pos as rownames of matrix and transpose
rownames(heatmap_matrix) <- pos
heatmap_matrix <- t(heatmap_matrix)

## Reorder rows based on physiochemical properties
phyiochem_order <- "DEKRHNQSTPGAVILMCFYW"
phyiochem_order <- unlist(strsplit(phyiochem_order, split = ""))

reordered_matrix <- heatmap_matrix[match(phyiochem_order,
rownames(heatmap_matrix)), ]

## Create the heatmap
ComplexHeatmap::Heatmap(reordered_matrix,
name = "DMS Score",
cluster_rows = FALSE,
cluster_columns = FALSE,
show_row_names = TRUE,
show_column_names = TRUE,
...)
}
67 changes: 67 additions & 0 deletions man/plot_dms_heatmap.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

68 changes: 5 additions & 63 deletions vignettes/data_import_and_representation.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -172,70 +172,12 @@ about the information, see the [ProteinGym publication][pg_publication].

## Visualization of DMS data with ComplexHeatmap

Explore an assay and create a heatmap of the DMS scores.
```{r ACE2}
ACE2 <- dms_data[["ACE2_HUMAN_Chan_2020"]]
```
Explore an assay and create a heatmap of the DMS scores with
`plot_dms_heatmap()`.

We want to grab the reference amino acid, protein position, and mutant residue
from the "mutant" column of the dataset.

```{r heatmap, warning = FALSE, fig.wide = TRUE}
ACE2 <-
ACE2 |>
dplyr::mutate(
ref = str_sub(ACE2$mutant, 1, 1),
pos = as.integer(
gsub(".*?([0-9]+).*", "\\1", ACE2$mutant)
),
alt = str_sub(ACE2$mutant, -1)
)
ACE2 <- ACE2 |> select("ref", "pos", "alt", "DMS_score")
head(ACE2)
## Reshape the data to wide format
ACE2_wide <- ACE2 |>
select(-ref) |>
pivot_wider(names_from = alt, values_from = DMS_score) |>
arrange(pos)
## Subset to first 100 position
ACE2_wide <- ACE2_wide |>
filter(pos <= 100)
head(ACE2_wide)
## Convert to matrix
pos <- ACE2_wide$pos
alt <- colnames(ACE2_wide)
alt <- alt[-c(1)]
heatmap_matrix <- ACE2_wide |>
select(2:length(ACE2_wide)) |>
as.matrix()
## Set amino acid position as rownames of matrix
rownames(heatmap_matrix) <- pos
## Transpose so position is x-axis
heatmap_matrix <- t(heatmap_matrix)
## Reorder rows based on physiochemical properties
phyiochem_order <- "DEKRHNQSTPGAVILMCFYW"
phyiochem_order <- unlist(strsplit(phyiochem_order, split = ""))
reordered_matrix <- heatmap_matrix[match(phyiochem_order,
rownames(heatmap_matrix)), ]
## Create the heatmap
ComplexHeatmap::Heatmap(reordered_matrix,
name = "DMS Score",
cluster_rows = FALSE,
cluster_columns = FALSE,
show_row_names = TRUE,
show_column_names = TRUE)
```{r ACE2 heatmap}
plot_dms_heatmap(assay_name = "ACE2_HUMAN_Chan_2020",
dms_data = dms_data, start_pos = 10, end_pos = 100)
```

The heatmap shows the DMS score at each position along the given protein
Expand Down

0 comments on commit 894bd32

Please sign in to comment.