From 2279c848e4816ea01ed15dea844e0ada574a393a Mon Sep 17 00:00:00 2001
From: hyunhwan-bcm <166857764+hyunhwan-bcm@users.noreply.github.com>
Date: Fri, 26 Jul 2024 23:48:21 -0500
Subject: [PATCH] Update VarTierDiseaseDBFalse.R

`VarTierDiseaseDBFalse.R` is the bottleneck for the current pipeline. This PR contains the updated  `VarTierDiseaseDBFalse.R`  from the current `nextflow_branch`, this will help to resolve the processing delay issue.
---
 run/VarTierDiseaseDBFalse.R | 553 +++++++++++++++++-------------------
 1 file changed, 262 insertions(+), 291 deletions(-)

diff --git a/run/VarTierDiseaseDBFalse.R b/run/VarTierDiseaseDBFalse.R
index e53439c..b23659f 100644
--- a/run/VarTierDiseaseDBFalse.R
+++ b/run/VarTierDiseaseDBFalse.R
@@ -1,359 +1,323 @@
-# Run Tier
-
-# Updates history ####
-# Update on 2021.Sep13
-#' Updates:
-#' 1. corrected chromosome number 23->X, 24-> Y
-#' 2. add interaction term of phenotype and Tier. 
-#' 3. for non-coding var, if ClinVarClinSig == conflict ->  push the Tier higher (if IMPACT == MODIFIER/LOW and ClinVarClinSig == conflict, Tier = Tier + 1)
-#' 4. Keep all variant that in-trans (sudo) with a strong Var in the same gene, adjust the Tier to 3.5?
-# Updates on 2022.Apr5
-#' 1. remove the pre-processing of OMIM inheritance to VarTier_preprocessOMIM.R
-#' 2. Add the number of variants (H/M/L impact) per gene: "No.Var.HM","No.Var.H","No.Var.M","No.Var.L"
-#' 3. Add the option if IncludeClinVarHGMD 
-#' 
-#' Updates on 2022.Sep29
-#' ClinVar clinvarSignDesc is in dif format after optimizing the pipeline, update the code to be compatible for that 
-#' gnomadAF, gnomadAFg is "-" not "NA", update the code for that
-#' add IncludeClinVarHGMD for line 101
-
-
-#' change genemap2.Inh into 2 features, AD, AR, save to tsv file
-#' keep GeneID == "-" assinge Tier value
-#' input  output /out/explain/ "ID" + "_Tier"
-#' 
-
-# parameter settings ####
+#!/usr/bin/env Rscript
 
+library(tidyr)
+library(readr)
+library(tibble)
+library(dplyr)
+library(data.table)
 # set adj.k value to adjust Tier
 adj.k <- 1.05
 
 # set if diseases DB will be included
-IncludeClinVarHGMD <- FALSE
 
-# load database: OMIM inheritance ####
-# genemap2.Inh.F <- readRDS(file.path("/run/genemap2.Inh.F.rds"))
-#genemap2.Inh.F <- read.delim2(file.path("/run/data_dependencies/var_tier/hg19/genemap2.Inh.F.txt"))
-# genemap2.Inh.F <- read.delim2("/houston_10t/dongxue/MARRVEL_AI/resource/OMIM/out/genemap2.Inh.F.txt")
-args = commandArgs(trailingOnly=TRUE)
+args <- commandArgs(trailingOnly = TRUE)
 refg <- args[2]
-# load database: OMIM inheritance ####
-# genemap2.Inh.F <- readRDS(file.path("/run/genemap2.Inh.F.rds"))
-genemap2.Inh.F <- read.delim2(file.path(paste0('/run/data_dependencies/var_tier/',refg,'/genemap2.Inh.F.txt')))
 
-# set the input output file / directory ####
+cat("STEP0. load database: OMIM inheritance ####\n")
+omim_inheritance <- file.path(paste0("var_tier/", refg, "/genemap2.Inh.F.txt"))
 
-
-
-# in.f.path <- "/out/input/"
 in.f.path <- "/out/rami-test/"
 out.f.path <- "/out/tier-test-false/"
 
-# When test the code without docker
-#in.f.path <- out.f.path <- "/out/"
-
-# in.f <- paste0(args[1],"_Tier")
-# in.f <- paste0(args[1],"_scores.csv")
 in.f <- paste0(args[1],"_scores.csv")
 out.f <- paste0(args[1],"_Tier.v2.tsv")
 
-# load libraries ####
-library("data.table")
-library("dplyr")
-
-
+genemap2.Inh.F <- readr::read_tsv(omim_inheritance)
 
 # load the input ####
-anno.columns <- c('varId_dash','zyg','geneSymbol','geneEnsId','gnomadAF','gnomadAFg','omimSymptomSimScore','hgmdSymptomSimScore',
-                  'IMPACT','Consequence','hgmdVarFound','clinvarSignDesc','spliceAImax')
-
-anno.orig <- read.table(file.path(in.f.path,in.f), stringsAsFactors = F, sep = ",", header = T)
-# To test:
-# score.path <- "/houston_20t/chaozhong/MARRVEL_AI_2022/explain/906246_raw.csv"
-# score.path <- "/houston_20t/chaozhong/MARRVEL_AI_2022/opt_09302022/output/rami-test/906246_scores.csv"
-# score.path <- "/houston_30t/chaozhong/NDV_model/fake_VCF_test/out/rami-test/NDVG_causal_scores.csv"
-# anno.orig <- read.csv(score.path)
-anno <- anno.orig[,anno.columns]
+anno.columns <- c(
+  "varId_dash",
+  "zyg",
+  "geneSymbol",
+  "geneEnsId",
+  "gnomadAF",
+  "gnomadAFg",
+  "omimSymptomSimScore",
+  "hgmdSymptomSimScore",
+  "IMPACT",
+  "Consequence",
+  "hgmdVarFound",
+  "clinvarSignDesc",
+  "spliceAImax"
+)
+
+anno.orig <- readr::read_csv(in.f) 
+anno <- anno.orig[, anno.columns]
 
 # rename the col
-colnames(anno) <- c("Uploaded_variation","GT","SYMBOL","Gene","gnomadAF","gnomadAFg",
-                      "omimSymptomSimScore", "hgmdSymptomSimScore","IMPACT",
-                      "Consequence","hgmdVarFound","clinvarSignDesc","spliceAImax")
-
-if ("-" %in% anno$Gene) {
-  # for rows without gene ID -> Assign Tier now:
-  anno_noGeneID <- anno[anno$Gene == "-",c("Uploaded_variation","Gene","IMPACT")]
-  # convert IMPACT to numeric value
-  anno_noGeneID$IMPACT.max <- ifelse(anno_noGeneID$IMPACT == "MODIFIER", 1, ifelse(anno_noGeneID$IMPACT == "LOW", 2, ifelse(anno_noGeneID$IMPACT == "MODERATE",3,4)) )
-  anno_noGeneID$IMPACT <- NULL
-  # "TierAD","TierAR","TierAR.adj"
-  anno_noGeneID$TierAD <- 5 - anno_noGeneID$IMPACT.max
-  anno_noGeneID[,c("TierAR","TierAR.adj")] <- ifelse(anno_noGeneID$IMPACT.max >= 3, 3,4)
-  anno_noGeneID$No.Var.HM <- as.numeric(anno_noGeneID$IMPACT.max >= 3)
-  anno_noGeneID$No.Var.H <- as.numeric(anno_noGeneID$IMPACT.max == 4)
-  anno_noGeneID$No.Var.M <- as.numeric(anno_noGeneID$IMPACT.max == 3)
-  anno_noGeneID$No.Var.L <- as.numeric(anno_noGeneID$IMPACT.max <=2)
-}
-
-
+colnames(anno) <- c(
+  "Uploaded_variation",
+  "GT",
+  "SYMBOL",
+  "Gene",
+  "gnomadAF",
+  "gnomadAFg",
+  "omimSymptomSimScore",
+  "hgmdSymptomSimScore",
+  "IMPACT",
+  "Consequence",
+  "hgmdVarFound",
+  "clinvarSignDesc",
+  "spliceAImax"
+)
+
+anno_noGeneID <- anno |> 
+  mutate(
+    IMPACT.max = case_when(
+      IMPACT == "MODIFIER" ~ 1,
+      IMPACT == "LOW" ~ 2,
+      IMPACT == "MODERATE" ~ 3,
+      T ~ 4
+    )
+  ) |> 
+  mutate(
+    IMPACT = NULL,
+    TierAD = 5 - IMPACT.max,
+    TierAR = ifelse(IMPACT.max >= 3, 3, 4),
+    TierAR.adj = ifelse(IMPACT.max >= 3, 3, 4),
+    No.Var.HM = as.numeric(IMPACT.max >= 3),
+    No.Var.H = as.numeric(IMPACT.max == 4),
+    No.Var.M = as.numeric(IMPACT.max == 3),
+    No.Var.L = as.numeric(IMPACT.max <= 2)
+  ) 
+  
 
 # For rows with Gene ID ->  Tier analysis
-anno <- anno[anno$Gene != "-",]
-anno <- unique(anno)
+anno <- anno |> filter(Gene != "-") |> distinct() |> 
+  mutate(
+    IMPACT = ifelse(spliceAImax >= 0.8, "HIGH", IMPACT),
+    IMPACT.no = case_when(
+      IMPACT == "MODIFIER" ~ 1,
+      IMPACT == "LOW" ~ 2,
+      IMPACT == "MODERATE" ~ 3, 
+      T ~ 4)
+    ) 
   
 
-# Tier ####
-
-
-# STEP 0. Adjust variant IMPACT for ClinVar, HGMD, SpliceAImax >= 0.8  ####
-
-if (IncludeClinVarHGMD){
-  anno$IMPACT <- ifelse(grepl("Pathogenic|Likely_pathogenic",anno$clinvarSignDesc) | anno$hgmdVarFound == 1 , "HIGH",anno$IMPACT)
-}
-
-# label Splice AI > 0.8 as HIGH
-anno$IMPACT <- ifelse(anno$spliceAImax >=0.8, "HIGH",anno$IMPACT)
-# convert IMPACT to numeric value
-anno$IMPACT.no <- ifelse(anno$IMPACT == "MODIFIER", 1, ifelse(anno$IMPACT == "LOW", 2, ifelse(anno$IMPACT == "MODERATE",3,4)) )
-  
 
 # Implement the rule based Tier classification ####
-  
+
 # take IMPACT HIGH, MODERATE, LOW, or MODIFIER as H,M,L,L
 # Tier at Gene level
 
-# STEP 1. classify T4: genes with only L variants, nonCodingVar2keep ####
+cat("STEP 1. classify T4: genes with only L variants, nonCodingVar2keep\n")
 
 # STEP 1.1 Keep Genes with IMPACT H or M variants
 Gene.all <- unique(anno$Gene)
-Gene.IMPACT <- unique(anno[,c("Gene","IMPACT")])
-Gene.IMPACT.HM <- Gene.IMPACT[Gene.IMPACT$IMPACT %in% c("HIGH","MODERATE"),]
-  
+Gene.IMPACT <- unique(anno[, c("Gene", "IMPACT")])
+
+Gene.IMPACT <- anno |> select(Gene, IMPACT) |> distinct()
+Gene.IMPACT.HM <- Gene.IMPACT |> filter(IMPACT %in% c("HIGH", "MODERATE"))
+
 # GeneTier4 will be removed for remaining Tier analysis
 GeneTier4 <- Gene.all[!(Gene.all %in% Gene.IMPACT.HM$Gene)]
-anno$TierAD[anno$Gene %in% GeneTier4] = anno$TierAR[anno$Gene %in% GeneTier4] <- 4
 
-# STEP 1.2 Keep Var with max IMPACT == MODIFIER/LOW and ClinVarClinSig == conflict 
+anno <- anno |> 
+  mutate(TierAD = ifelse(Gene %in% GeneTier4, 4, NA)) |> 
+  mutate(TierAR = ifelse(Gene %in% GeneTier4, 4, NA)) 
+
+
+# STEP 1.2 Keep Var with max IMPACT == MODIFIER/LOW and ClinVarClinSig == conflict
 # note: many variants with “Conflicting” are non-coding and coding for different transcripts, only keep the var with max impact as MODIFIER
 
 # find the non-Coding Var to keep
 nonCodingVar2keep <- c()
 
-if(IncludeClinVarHGMD){
-  nonCodingModifier.idx <- which(grepl("Conflicting_interpretations_of_pathogenicity", anno$clinvarSignDesc) & (anno$IMPACT %in% c("MODIFIER")))
-  nonCodingModifier <- anno$Uploaded_variation[nonCodingModifier.idx]
-  nonCodingModifier.df <- anno[anno$Uploaded_variation %in% nonCodingModifier,]
-  
-  for(var in unique(nonCodingModifier.df$Uploaded_variation)){
-    var.df <- nonCodingModifier.df[nonCodingModifier.df$Uploaded_variation == var,]
-    var.impact <- unique(var.df$IMPACT)
-    if(length(var.impact) == 1 ){
-      if(var.impact == "MODIFIER"){
-        nonCodingVar2keep <- c(nonCodingVar2keep,var)
-      }
-    }
-  }
-}
-
-
 # split var to two groups:  ####
 
 # Run Full Tier analysis:
 # non - GeneTier4 and nonCodingVar2keep
-idx <- is.na(anno$TierAD) | (anno$Uploaded_variation %in% nonCodingVar2keep)
+idx <- is.na(anno$TierAD) 
 Var.RunTier <- unique(anno$Uploaded_variation[idx])
 
 # Only Count No.Var.L：
 # Others:
 # check if there are variants without Tier
 ParseVar.NoTier <- F
-if(F %in% idx){
+if (F %in% idx) {
   ParseVar.NoTier <- T
   Var.NoTier <- unique(anno$Uploaded_variation[!idx])
 }
 
-anno.f1 <- anno[idx,]
-
-# set nonCodingVar2keep IMPACT as 3
-anno.f1$IMPACT.no[anno.f1$Uploaded_variation %in% nonCodingVar2keep] <- 3
-
-
-# STEP 2: Get the most severe IMPACT for each variant ####
+anno.f1 <- anno[idx, ]
 
+cat("STEP 2: Get the most severe IMPACT for each variant ####\n")
 VEP.f1 <- anno.f1
 
-# for Gene-Var pair with multiple IMPACT, take the MAX
-VEP.f1.maxIMAPCT <- data.frame(matrix(ncol = ncol(VEP.f1) + 1, nrow = 0))
-colnames(VEP.f1.maxIMAPCT) <- c(colnames(VEP.f1),"IMPACT.max")
-for (var in Var.RunTier){
-  var.df <- VEP.f1[VEP.f1$Uploaded_variation == var,]
-  # check no. of gene for this variant
-  if(length(unique(var.df$Gene)) == 1){
-    var.df$IMPACT.max <- max(var.df$IMPACT.no)
-    VEP.f1.maxIMAPCT <- rbind(VEP.f1.maxIMAPCT,var.df)
-  } else {
-    for (gene in unique(var.df$Gene)){
-      var.g.df <- var.df[var.df$Gene == gene,]
-      var.g.df$IMPACT.max <- max(var.g.df$IMPACT.no)
-      VEP.f1.maxIMAPCT <- rbind(VEP.f1.maxIMAPCT,var.g.df)
-    }
-  }
-}
-  
+# it won't be the same order but will work
+VEP.f1.maxIMAPCT <- VEP.f1 |> 
+  filter(is.na(TierAD)) |> 
+  group_by(Uploaded_variation, Gene) |> 
+  mutate(
+    IMPACT.max = max(IMPACT.no)
+  ) |> 
+  ungroup()
+
 # find all intronic Var with AF == 0, will adjust the Tier if in trans with a high impact variant. ####
 
-rareVar <- VEP.f1.maxIMAPCT$gnomadAF == "-" & VEP.f1.maxIMAPCT$gnomadAFg == "-"
-intronicVar <- grepl("intron_variant",VEP.f1.maxIMAPCT$Consequence)  & VEP.f1.maxIMAPCT$IMPACT.max == 1
-rareIntronicVar <- VEP.f1.maxIMAPCT[rareVar & intronicVar, c("Uploaded_variation","Gene")]
+rareVar <- VEP.f1.maxIMAPCT$gnomadAF == "-" &
+  VEP.f1.maxIMAPCT$gnomadAFg == "-"
+intronicVar <- grepl("intron_variant", VEP.f1.maxIMAPCT$Consequence) &
+  VEP.f1.maxIMAPCT$IMPACT.max == 1
+rareIntronicVar <- VEP.f1.maxIMAPCT[rareVar &
+                                      intronicVar, c("Uploaded_variation", "Gene")]
 rareIntronicVar <- unique(rareIntronicVar)
 
 # for each Variant keep the most severe IMPACT only
-VEP.f2 <- unique(VEP.f1.maxIMAPCT[,c("Uploaded_variation", "SYMBOL","Gene","GT","IMPACT.max","omimSymptomSimScore","hgmdSymptomSimScore")])
+VEP.f2 <- distinct(
+  VEP.f1.maxIMAPCT[, c("Uploaded_variation",
+  "SYMBOL",
+  "Gene",
+  "GT",
+  "IMPACT.max",
+  "omimSymptomSimScore",
+  "hgmdSymptomSimScore")])
+
+cat("STEP 3: assign Tiers\n")
 
-# STEP 3: assign Tiers ####
-  
 # VEP.f2$GT <- ifelse(VEP.f2$GT=="HET", "0/1","1/1")
+# though Tier classification is Gene based, Tier should be assigned to each Variant
 
-# though Tier classification is Gene based, Tier should be assigned to each Variant 
-  
 VEP.Tier.part1 <- data.frame(matrix(ncol = ncol(VEP.f2) + 6, nrow = 0))
-colnames(VEP.Tier.part1) <- c(colnames(VEP.f2),"TierAR","TierAD","No.Var.HM","No.Var.H","No.Var.M","No.Var.L")
+colnames(VEP.Tier.part1) <- c(
+  colnames(VEP.f2),
+  "TierAR",
+  "TierAD",
+  "No.Var.HM",
+  "No.Var.H",
+  "No.Var.M",
+  "No.Var.L"
+)
 
-for (g in unique(VEP.f2$Gene)){
-  
-  g.df <- VEP.f2[VEP.f2$Gene == g,]
-  
-  # X linked, seudo autosomal region -> treat as autosomal
-  # if (sum(grepl("X",g.df$Uploaded_variation)) >0) {
-  #   g.df$TierAD = g.df$TierAR = ifelse(g.df$IMPACT.max <3, 4, 5-g.df$IMPACT.max)
-  # }
-  
-  # AD regardless of the GT
-  g.df$TierAD <- ifelse(g.df$IMPACT.max <3, 4, 5-g.df$IMPACT.max)
-  
-  # AR
-  g.df2 <- g.df
-  
-  # duplicate rows with HOM var
-  if ("HOM" %in% g.df$GT){
-    g.df2 <- rbind(g.df, g.df[g.df$GT == "1/1",])
-  }
-  IMPACT.max <- g.df2$IMPACT.max
-  if(sum(IMPACT.max==4) >=2){
-    # >=2 strong alleles
-    g.df$TierAR <- 5-g.df$IMPACT.max
-    # update 202221005: H+H: M-> change Tie from 2 to 1.5
-    g.df$TierAR[g.df$IMPACT.max ==3] <- 1.5
-  } else if (sum(IMPACT.max==4) ==1 & sum(IMPACT.max==3) >=1) {
-    # H + M
-    g.df$TierAR[g.df$IMPACT.max >=3] <- 1.5 # (need more thought, 2 or 1.5 ???)
-    g.df$TierAR[g.df$IMPACT.max <3] <- 5-g.df$IMPACT.max[g.df$IMPACT.max <3]
-  } else if (sum(IMPACT.max==4) ==1 ) {
-    # H only
-    g.df$TierAR[g.df$IMPACT.max ==4] <- 3 # (need more thought, 2 or 3 ???)
-    g.df$TierAR[g.df$IMPACT.max <=3] <- 5-g.df$IMPACT.max[g.df$IMPACT.max <=3]
-  } else if (sum(IMPACT.max==3) >=2 ) {
-    # M + M
-    g.df$TierAR[g.df$IMPACT.max ==3] <- 2 # (need more thought, 2 or 3 ???)
-    g.df$TierAR[g.df$IMPACT.max < 3] <- 5-g.df$IMPACT.max[g.df$IMPACT.max < 3]
-  }  else if (sum(IMPACT.max==3) == 1 ) {
-    # M only
-    g.df$TierAR[g.df$IMPACT.max ==3] <- 3 # (need more thought, 2 or 3 ???)
-    g.df$TierAR[g.df$IMPACT.max < 3] <- 5-g.df$IMPACT.max[g.df$IMPACT.max < 3]
-  }
-  
-  # count the number of high IMPACT var per gene. 
-  No.Var.HM <- sum(IMPACT.max >=3)
-  No.Var.H <- sum(IMPACT.max == 4)
-  No.Var.M <- sum(IMPACT.max ==3)
-  No.Var.L <- sum(IMPACT.max == 2)
-  g.df$No.Var.HM <- No.Var.HM
-  g.df$No.Var.H <- No.Var.H
-  g.df$No.Var.M <- No.Var.M
-  g.df$No.Var.L <- No.Var.L
-  
-  VEP.Tier.part1 <- rbind(VEP.Tier.part1,g.df)
-}
-  
-# STEP 4: Find all genes with No.Var.H == 1, adjust the Tier for rareIntronicVar ####
+VEP.Tier.part1 <- data.frame(matrix(ncol = ncol(VEP.f2) + 6, nrow = 0))
+colnames(VEP.Tier.part1) <- c(
+  colnames(VEP.f2),
+  "TierAR",
+  "TierAD",
+  "No.Var.HM",
+  "No.Var.H",
+  "No.Var.M",
+  "No.Var.L"
+)
+
+cat("STEP 4: Find all genes with No.Var.H == 1, adjust the Tier for rareIntronicVar\n")
+
+VEP.Tier.part1 <- VEP.f2 |> 
+  group_by(Gene) |>
+  group_modify(~{
+    g.df <- .x
+    
+    # AD regardless of the GT
+    g.df <- g.df |> mutate(TierAD = if_else(IMPACT.max < 3, 4, 5 - IMPACT.max))
+    
+    # AR
+    g.df2 <- g.df
+    
+    # duplicate rows with HOM var
+    if ("HOM" %in% g.df$GT) {
+      g.df2 <- bind_rows(g.df2, filter(g.df, GT == "1/1"))
+    }
+    
+    IMPACT.max <- g.df2$IMPACT.max
+    # Calculate TierAR
+    if (sum(IMPACT.max == 4) >= 2) {
+      g.df$TierAR <- 5 - g.df$IMPACT.max
+      g.df$TierAR[g.df$IMPACT.max == 3] <- 1.5
+    } else if (sum(IMPACT.max == 4) == 1 & sum(IMPACT.max == 3) >= 1) {
+      g.df$TierAR[g.df$IMPACT.max >= 3] <- 1.5
+      g.df$TierAR[g.df$IMPACT.max < 3] <- 5 - g.df$IMPACT.max[g.df$IMPACT.max < 3]
+    } else if (sum(IMPACT.max == 4) == 1) {
+      g.df$TierAR[g.df$IMPACT.max == 4] <- 3
+      g.df$TierAR[g.df$IMPACT.max <= 3] <- 5 - g.df$IMPACT.max[g.df$IMPACT.max <= 3]
+    } else if (sum(IMPACT.max == 3) >= 2) {
+      g.df$TierAR[g.df$IMPACT.max == 3] <- 2
+      g.df$TierAR[g.df$IMPACT.max < 3] <- 5 - g.df$IMPACT.max[g.df$IMPACT.max < 3]
+    } else if (sum(IMPACT.max == 3) == 1) {
+      g.df$TierAR[g.df$IMPACT.max == 3] <- 3
+      g.df$TierAR[g.df$IMPACT.max < 3] <- 5 - g.df$IMPACT.max[g.df$IMPACT.max < 3]
+    } else {
+      g.df$TierAR <- 5 - g.df$IMPACT.max
+    }
+    # Calculate the count columns
+    g.df$No.Var.HM <- sum(IMPACT.max >= 3)
+    g.df$No.Var.H <- sum(IMPACT.max == 4)
+    g.df$No.Var.M <- sum(IMPACT.max == 3)
+    g.df$No.Var.L <- sum(IMPACT.max == 2)
+    
+    g.df
 
-if (IncludeClinVarHGMD){
-  gene.w1HighVar <- VEP.Tier.part1$Gene[VEP.Tier.part1$No.Var.H == 1]
-  rareIntronicVar.2adj <- rareIntronicVar[rareIntronicVar$Gene %in% gene.w1HighVar,]
-  
-  # adjust the Tier for rareIntronicVar based on similarity
-  phenoSimi <- unique(VEP.f1.maxIMAPCT[,c("Gene","omimSymptomSimScore","hgmdSymptomSimScore")])
-  
-  # check for non numeric values and convert to 0
-  omim_NA_idx <- which(is.na(as.numeric(phenoSimi$omimSymptomSimScore)))
-  hgmd_NA_idx <- which(is.na(as.numeric(phenoSimi$hgmdSymptomSimScore)))
-  # print(hgmd_NA_idx)
-  phenoSimi$omimSymptomSimScore[omim_NA_idx] <- 0
-  phenoSimi$hgmdSymptomSimScore[hgmd_NA_idx] <- 0
-  
-  phenoSimi$omimSymptomSimScore <- as.numeric(phenoSimi$omimSymptomSimScore)
-  phenoSimi$hgmdSymptomSimScore <- as.numeric(phenoSimi$hgmdSymptomSimScore)
-  
-  phenoSimi <- phenoSimi[phenoSimi$omimSymptomSimScore != 0 | phenoSimi$hgmdSymptomSimScore != 0,]
-  
-  rareIntronicVar.2adj.wPhenoSimi <- merge(rareIntronicVar.2adj, phenoSimi, by.x = "Gene", by.y = "Gene")
-  rareIntronicVar.2adj.wPhenoSimi$simi <- apply(rareIntronicVar.2adj.wPhenoSimi[, 3:4], 1, max)
-  rareIntronicVar.2adj.wPhenoSimi$TierAR.adj <- 5-(5-4)*exp(adj.k*rareIntronicVar.2adj.wPhenoSimi$simi)
-  
-  rareIntronicVar.df <- rareIntronicVar.2adj.wPhenoSimi[,c("Uploaded_variation","Gene","TierAR.adj")]
-  VEP.Tier.part1 <- merge(VEP.Tier.part1,rareIntronicVar.df,
-                          by.x <- c("Uploaded_variation","Gene"),
-                          by.y = c("Uploaded_variation","Gene"),
-                          all.x = T)
-} else {
-  VEP.Tier.part1$TierAR.adj <- VEP.Tier.part1$TierAR
-}
+  }) |>
+  ungroup()
+
+# If TierAR.adj is needed and not calculated earlier, you can add it here
+# For example, if it's the same as TierAR:
+ VEP.Tier.part1 <- VEP.Tier.part1 |>
+  mutate(TierAR.adj = TierAR)
+
+VEP.Tier.part1$TierAR.adj <- VEP.Tier.part1$TierAR
 
-# add the TierAR.adj to the VEP.Tier df
 
-VEP.Tier.part1 <- VEP.Tier.part1[,c("Uploaded_variation","Gene","GT","IMPACT.max","TierAD","TierAR","TierAR.adj","No.Var.HM","No.Var.H","No.Var.M","No.Var.L")]
+# add the TierAR.adj to the VEP.Tier df
 
-# STEP 5: add the No.Var for Var.NoTier  ####
-if (ParseVar.NoTier){
-  VEP.Tier.part2 <- anno[anno$Uploaded_variation %in% Var.NoTier,c("Uploaded_variation","Gene","GT")]
+VEP.Tier.part1 <- VEP.Tier.part1[, c(
+  "Uploaded_variation",
+  "Gene",
+  "GT",
+  "IMPACT.max",
+  "TierAD",
+  "TierAR",
+  "TierAR.adj",
+  "No.Var.HM",
+  "No.Var.H",
+  "No.Var.M",
+  "No.Var.L"
+)]
+
+cat("STEP 5: add the No.Var for Var.NoTier\n")
+if (ParseVar.NoTier) {
+  VEP.Tier.part2 <- anno[anno$Uploaded_variation %in% Var.NoTier, c("Uploaded_variation", "Gene", "GT")]
   VEP.Tier.part2$IMPACT.max <- 1
-  VEP.Tier.part2[,c("TierAD","TierAR","TierAR.adj")] <- 4
-  VEP.Tier.part2[,c("No.Var.HM","No.Var.H","No.Var.M","No.Var.L")] <- NA
+  VEP.Tier.part2[, c("TierAD", "TierAR", "TierAR.adj")] <- 4
+  VEP.Tier.part2[, c("No.Var.HM", "No.Var.H", "No.Var.M", "No.Var.L")] <- NA
   
   VEP.Tier <- rbind(VEP.Tier.part1, VEP.Tier.part2)
 } else {
   VEP.Tier <- VEP.Tier.part1
 }
 
-VEP.Tier.wGene <- data.frame(matrix(ncol = ncol(VEP.Tier), nrow = 0))
-for (g in unique(VEP.Tier$Gene)){
 
-  g.df <- VEP.Tier[VEP.Tier$Gene == g,]
-  # AR
-  g.df2 <- g.df
-  
-  # duplicate rows with HOM var
-  if ("HOM" %in% g.df$GT){
-    g.df2 <- rbind(g.df, g.df[g.df$GT == "1/1",])
-  }
-  IMPACT.max <- g.df2$IMPACT.max
-  
-  No.Var.HM <- sum(IMPACT.max >=3)
-  No.Var.H <- sum(IMPACT.max == 4)
-  No.Var.M <- sum(IMPACT.max ==3)
-  No.Var.L <- sum(IMPACT.max == 2)
-  g.df$No.Var.HM <- No.Var.HM
-  g.df$No.Var.H <- No.Var.H
-  g.df$No.Var.M <- No.Var.M
-  g.df$No.Var.L <- No.Var.L
-  
-  VEP.Tier.wGene <- rbind(VEP.Tier.wGene,g.df)
-}
-# remove the GT column
-VEP.Tier.wGene$GT <- NULL
-
-
-# STEP 6: add the Tiers from anno_noGeneID
+VEP.Tier.wGene <- VEP.Tier |>
+  group_by(Gene) |>
+  group_modify(~{
+    g.df <- .x
+    
+    # Duplicate rows with HOM var
+    g.df2 <- if ("HOM" %in% g.df$GT) {
+      bind_rows(g.df, filter(g.df, GT == "1/1"))
+    } else {
+      g.df
+    }
+    
+    IMPACT.max <- g.df2$IMPACT.max
+    
+    g.df |>
+      mutate(
+        No.Var.HM = sum(IMPACT.max >= 3),
+        No.Var.H = sum(IMPACT.max == 4),
+        No.Var.M = sum(IMPACT.max == 3),
+        No.Var.L = sum(IMPACT.max == 2)
+      )
+  }) |>
+  ungroup() |>
+  select(-GT)  # Remove the GT column
+
+# If you need to ensure the same column order as the original VEP.Tier
+VEP.Tier.wGene <- VEP.Tier.wGene |>
+  select(names(VEP.Tier)[names(VEP.Tier) != "GT"])
+
+VEP.Tier.wGene <- as_tibble(VEP.Tier.wGene)
+
+cat("STEP 6: add the Tiers from anno_noGeneID\n")
 if ("-" %in% anno$Gene) {
   VEP.Tier.final <- rbind(VEP.Tier.wGene, anno_noGeneID)
 }
@@ -361,21 +325,28 @@ VEP.Tier.final <- VEP.Tier.wGene
 
 colnames(VEP.Tier.final)[colnames(VEP.Tier.final) == "IMPACT.max"] <- "IMPACT.from.Tier"
 VEP.Tier.final$TierAR.adj[is.na(VEP.Tier.final$TierAR.adj)] <- VEP.Tier.final$TierAR[is.na(VEP.Tier.final$TierAR.adj)]
-  
 
 # add inheritance information
 colnames(genemap2.Inh.F)[1] <- "Gene"
-VEP.Tier.wInh <- merge(VEP.Tier.final, genemap2.Inh.F, by.x = "Gene", by.y = "Gene", all.x = T)
+VEP.Tier.wInh <- merge(
+  VEP.Tier.final,
+  genemap2.Inh.F,
+  by.x = "Gene",
+  by.y = "Gene",
+  all.x = T
+)
 VEP.Tier.wInh$dominant[is.na(VEP.Tier.wInh$dominant)] <- 0
 VEP.Tier.wInh$recessive[is.na(VEP.Tier.wInh$recessive)] <- 0
-VEP.Tier.wInh$AD.matched <- ifelse(VEP.Tier.wInh$TierAD <=2 & (VEP.Tier.wInh$dominant == 1),1,0)
-VEP.Tier.wInh$AR.matched <- ifelse(VEP.Tier.wInh$TierAR <=2 & (VEP.Tier.wInh$recessive == 1),1,0)
-
-write.table(VEP.Tier.wInh,file.path(out.f.path,out.f), sep = "\t", quote = F, col.names = T, row.names = F)
-
-# write.table(VEP.Tier.wInh,"/houston_20t/dongxue/MARRVEL_AI/V2/FeatureEng/TestOut/906246_Tier.tsv", sep = "\t", quote = F, col.names = T, row.names = F)
-# write.table(VEP.Tier.wInh,"/houston_30t/dongxue/AI/UDN_Sample_2023/ModifiedAIMCode/NDVG_causal_Tier.v2.tsv", sep = "\t", quote = F, col.names = T, row.names = F)
-### Plot for Tier adjustment ####
-
-
-
+VEP.Tier.wInh$AD.matched <- ifelse(VEP.Tier.wInh$TierAD <= 2 &
+                                     (VEP.Tier.wInh$dominant == 1), 1, 0)
+VEP.Tier.wInh$AR.matched <- ifelse(VEP.Tier.wInh$TierAR <= 2 &
+                                     (VEP.Tier.wInh$recessive == 1), 1, 0)
+
+write.table(
+  VEP.Tier.wInh,
+  out.f,
+  sep = "\t",
+  quote = F,
+  col.names = T,
+  row.names = F
+)