ubiome_quickstart.Rmd

---
title: "uBiome Quickstart"
author: "Chirag Patel"
date: "`r Sys.Date()`"
output: html_document
---


```{r}
library(tidyverse)
library(DBI)
library(survey)
library(tictoc)
devtools::load_all(".")
```

```{r connect}
# Open the NHANES SQLite file (relative path to the sqlite database).
con <- DBI::dbConnect(
  drv = RSQLite::SQLite(),
  dbname = "./db/nhanes_112824.sqlite"
)

# Table references used below to look up variables and the tables holding them.
table_names <- dplyr::tbl(con, "table_names_epcf")
varnames <- dplyr::tbl(con, "variable_names_epcf")
```


```{r}
# Look up the variable-name entry for one family-level count variable.
filter(varnames, Variable.Name == "RB_family1_count")

# Table references for the family-level counts and the alpha-diversity table.
rb_count <- tbl(con, "DADA2RB_FAMILY_COUNT_F")
alpha <- tbl(con, "ALPHADIVERSITY_F")

colnames(rb_count)

# Pull the counts into memory and print the structure:
# which ones actually have variation?
str(collect(rb_count))

# Histogram of a single family count (+.01); looks like it is right skewed.
p <- ggplot(rb_count, aes((RB_family29_count + .01)))
p + geom_histogram()


```


# variation of the relative abundance information
```{r}
# Stack the genus-level relative-abundance tables (the "_F" and "_G" tables)
# into a single in-memory data frame.
rsv_relative <- rbind(
  collect(tbl(con, "DADA2RSV_GENUS_RELATIVE_F")),
  collect(tbl(con, "DADA2RSV_GENUS_RELATIVE_G"))
)
# Alternative input (family-level RB tables), kept for reference:
#rsv_relative <- tbl(con, 'DADA2RB_FAMILY_RELATIVE_F') |> collect() |> rbind(tbl(con, 'DADA2RB_FAMILY_RELATIVE_G') |> collect())

# Standard deviation of every numeric column, reshaped to one row per column
# (columns: name, value).
rsv_sd <- rsv_relative |>
  summarise(across(where(is.numeric), sd)) |>
  pivot_longer(cols = everything())


#' Centered log-ratio (CLR) transform of the abundance columns, row by row
#'
#' For every row, each abundance value `x` becomes
#' `log((x + pseudocount) / g)`, where `g` is the geometric mean of
#' `x + pseudocount` over that row's abundance columns. The transformed values
#' are appended as new `clr_<column>` columns; the input columns are kept.
#'
#' Fixes relative to the previous version:
#' * the ratio is now `(x + pseudocount) / g`; the old expression
#'   `log(.x + .25 / geo_mean)` divided only the pseudocount by `g`
#'   because `/` binds tighter than `+`;
#' * identifier columns (`id_cols`) are excluded from the geometric mean and
#'   are not transformed, and no `clr_geo_mean` helper column can leak into
#'   the result.
#'
#' @param data A data frame / tibble with one row per sample. Every numeric
#'   column not listed in `id_cols` is treated as an abundance column.
#' @param pseudocount Value added before taking logs so that zeros are
#'   defined. Defaults to 0.25, the constant previously hard-coded.
#' @param id_cols Names of numeric columns that are identifiers, not
#'   abundances. Defaults to "SEQN" (assumed to be the respondent id --
#'   confirm); names not present in `data` are ignored.
#' @return `data` (same class as the input) with one extra `clr_<column>`
#'   column per abundance column. Returned unchanged when there are no
#'   abundance columns.
compute_clr <- function(data, pseudocount = 0.25, id_cols = "SEQN") {
  is_num <- vapply(data, is.numeric, logical(1))
  taxa_cols <- setdiff(names(data)[is_num], id_cols)
  if (length(taxa_cols) == 0) {
    return(data)
  }

  log_abund <- log(as.matrix(data[taxa_cols]) + pseudocount)

  # log of the row-wise geometric mean == row mean of the logs. NAs are
  # skipped, matching the na.rm = TRUE of the previous implementation.
  log_geo_mean <- rowMeans(log_abund, na.rm = TRUE)

  # A length-nrow vector recycles down the columns of the matrix, so each row
  # has its own log geometric mean subtracted:
  # log((x + p) / g) == log(x + p) - log(g).
  clr <- log_abund - log_geo_mean

  data[paste0("clr_", taxa_cols)] <- as.data.frame(clr)
  data
}

rsv_relative_clr <- compute_clr(rsv_relative)

# Per-column summaries of the stacked relative-abundance table, each reshaped
# to long form (one row per column: name, value).
rsv_mean <- rsv_relative |>
  summarise(across(where(is.numeric), mean)) |>
  pivot_longer(cols = everything())

# Number of rows in which each column is strictly positive.
rsv_non_zero <- rsv_relative |>
  summarise(across(where(is.numeric), \(x) sum(x > 0))) |>
  pivot_longer(cols = everything())

# Column-wise maximum. This was previously computed with mean(), which made
# rsv_max an exact copy of rsv_mean.
rsv_max <- rsv_relative |>
  summarise(across(where(is.numeric), max)) |>
  pivot_longer(cols = everything())


# Long table of sd and mean, tagged by statistic.
rsv_measure <- rbind(
  rsv_sd |> mutate(statistic = "sd"),
  rsv_mean |> mutate(statistic = "mean")
)

# SEQN is numeric, so it shows up in the summaries; drop it before plotting.

# Distribution of the non-zero counts across columns.
p <- ggplot(rsv_non_zero |> filter(name != "SEQN"), aes(value)) +
  geom_histogram()
p

# Non-zero count per column, bars ordered from most to least prevalent.
p <- rsv_non_zero |>
  filter(name != "SEQN") |>
  mutate(name = fct_reorder(name, desc(value))) |>
  ggplot(aes(name, value)) +
  geom_col()
p

# Distribution of the per-column maxima.
p <- ggplot(rsv_max |> filter(name != "SEQN"), aes(value)) +
  geom_histogram()
p

# Distribution of the per-column standard deviations.
p <- ggplot(rsv_sd |> filter(name != "SEQN"), aes(value)) +
  geom_histogram()
p

# TODO: filter out those that have a SD of zero, then also add .25 and
# ... (this note was left unfinished)

```


```{r correlation with sex and age?}

# Attach the DEMO_F table to the family-level counts. No `by` is given, so the
# join uses whatever columns the two tables have in common.
rb_count_demog <- left_join(rb_count, tbl(con, "DEMO_F"))

# log10(count + 1) of one family, split by RIAGENDR. # maybe?
p <- ggplot(
  rb_count_demog,
  aes(as.factor(RIAGENDR), I(log10(RB_family29_count + 1)))
) +
  geom_boxplot()
p

# Same family (count + 1) against RIDAGEYR; points are made mostly
# transparent because of overplotting.
p <- ggplot(rb_count_demog, aes(RIDAGEYR, (RB_family29_count + 1))) +
  geom_point(alpha = .1)
p


```

Pseudo code from Braden:
```{r}
# outlist = list()
# for(c in c('phenotype','exposure','','')){
# 	dat = chiragadata %>% filter(category == c)
# 	for(val in dat$variable){
# 		for(microbe in microbes){
# 		#join microbe data in 
# 		# run weighted regression
# 		outlist[paste(c,val,microbe)] = lm(variable ~ microbe,weight=SRVYWEGHT) %>% broom::tidy() %>% mutate(category = c, variable = val, microbe = microbe)
# 		}
# 	}
# }

#out = bind_rows(outlist)
# out is a dataframe with the columns:
### term, beta, se, df,pvalue, category, variable, microbe, adjusted_pvalue

```


# Associate RB_family29_count (P) and Cotinine (E)
```{r serum cotinine and RB29 count}

varnames |> filter(Variable.Name == "LBXCOT") # indicator of nicotine (smoke exposure)
varnames |> filter(Variable.Name == "RB_family29_count") # Actinomycetaceae
# NOTE(review): the lookup above is for RB_family29_count, but the model below
# is fit on RSV_genus29_relative -- confirm which variable is intended.


# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, so they are not safe as logical constants.
rb_cot <- pe_flex_adjust(
  "RSV_genus29_relative", "LBXCOT", adjustment_models, con,
  scale_p = TRUE, logxform_p = FALSE, scale_clr = TRUE
)
# print out the tidied summary stats for the fully adjusted model
rb_cot$models[[2]]$tidied
# Adjusted R2 for the fully adjusted model
rb_cot$models[[2]]$r2
# R2 for base model
rb_cot$base_models[[2]]$r2
# full - base: R2 gained over the base model
rb_cot$models[[2]]$r2$rsq - rb_cot$base_models[[2]]$r2$rsq
```


BMI association 
```{r}
# Three more models: BMXBMI, BMXARMC and LBXGLU, each paired with a
# genus-level relative-abundance variable. TRUE/FALSE are spelled out because
# T and F are reassignable variables, not constants.
rb_bmi <- pe_flex_adjust(
  "BMXBMI", "RSV_genus29_relative", adjustment_models, con,
  scale_p = TRUE, logxform_p = FALSE, scale_clr = FALSE
)
rb_arm <- pe_flex_adjust(
  "BMXARMC", "RSV_genus29_relative", adjustment_models, con,
  scale_p = TRUE, logxform_p = FALSE, scale_clr = FALSE
)
rb_glu <- pe_flex_adjust(
  "LBXGLU", "RB_genus1267_relative", adjustment_models, con,
  scale_p = TRUE, scale_e = TRUE,
  logxform_e = FALSE, logxform_p = FALSE, scale_clr = FALSE
)
```


```{r}
# Find the tables holding BMXBMI and RSV_genus39_relative and keep the pairs
# that share a Begin.Year.
ptables <- get_table_names_for_varname(con, varname = "BMXBMI") |>
  rename(p_name = Data.File.Name)
etables <- get_table_names_for_varname(con, varname = "RSV_genus39_relative") |>
  rename(e_name = Data.File.Name)
table_set <- ptables |> inner_join(etables, by = "Begin.Year")
# Same guard as the "check the CLR" chunk: stop early with a clear message
# instead of failing further down on an empty table list.
if (nrow(table_set) == 0) {
  stop("Y and X variables not collected in the same survey")
}
tab_obj <- get_x_y_tables_as_list(con, table_set$p_name, table_set$e_name)
## weight
tab_obj <- figure_out_multiyear_weight(tab_obj)


# Compare a log-ratio transform of the genus (relative to its geometric mean
# over all rows, pseudocount .25) with plain z-scoring of the same column.
genus_rel <- tab_obj$merged_tab$RSV_genus39_relative
geo_mean <- exp(mean(log(genus_rel + .25)))
# The parentheses matter: without them only the pseudocount is divided by
# geo_mean, because `/` binds tighter than `+`.
x <- log((genus_rel + .25) / geo_mean)
plot(
  x, scale(genus_rel),
  xlab = "log((RSV_genus39_relative + .25) / geo_mean)",
  ylab = "scale(RSV_genus39_relative)"
)


```

# check the CLR 
```{r}

# Find the tables holding the phenotype (RSV_genus29_relative) and the
# exposure (LBXCOT) and keep the pairs that share a Begin.Year.
ptables <- get_table_names_for_varname(con, varname = "RSV_genus29_relative") |>
  rename(p_name = Data.File.Name)
etables <- get_table_names_for_varname(con, varname = "LBXCOT") |>
  rename(e_name = Data.File.Name)
table_set <- ptables |> inner_join(etables, by = "Begin.Year")
if (nrow(table_set) == 0) {
  stop("Y and X variables not collected in the same survey")
}


## get table names for each series
tab_obj <- get_x_y_tables_as_list(con, table_set$p_name, table_set$e_name)

## weight
tab_obj <- figure_out_multiyear_weight(tab_obj)

# TRUE/FALSE are spelled out: T and F are reassignable variables.
tab_obj <- name_and_xform_pheno_expo(
  "RSV_genus29_relative", "LBXCOT", tab_obj,
  logxform_p = FALSE, logxform_e = TRUE
)

# Keep rows with a positive weight and no missing values in the phenotype,
# the exposure, or any adjustment variable listed in adjustment_models.
potential_adjusters <- setdiff(unique(adjustment_models$variables), NA)
dat <- tab_obj$merged_tab |>
  dplyr::filter(
    !is.na(wt), wt > 0, !is.na(expo), !is.na(pheno),
    dplyr::if_all(tidyselect::all_of(potential_adjusters), ~ !is.na(.))
  )
dsn <- create_svydesign(dat)
logger::log_info("Phenotype scaled using CLR")

# Reproduce the transform by hand: log(pheno + .25), centred on its
# survey-weighted mean.
dsn <- stats::update(dsn, logpheno = log(pheno + .25))
mn <- mean(survey::svymean(~logpheno, dsn, na.rm = TRUE))
dsn <- stats::update(dsn, pheno_test = logpheno - mn)

# Check 1: logpheno stored in the design equals log(pheno + .25) recomputed
# directly (points should sit on the identity line).
plot(dsn$variables$logpheno, log(dsn$variables$pheno + .25))
abline(0, 1)

# Check 2: survey-weighted centring vs. unweighted centring. Any constant
# offset from the identity line is the difference between the two means.
plot(dsn$variables$pheno_test, log(dsn$variables$pheno + .25) - mean(log(dsn$variables$pheno + .25)))
abline(0, 1)

# Centred log value against the raw relative abundance.
plot(dsn$variables$pheno_test, dsn$variables$RSV_genus29_relative)

hist(dsn$variables$RSV_genus29_relative)


```


# What is the sample-size overlap between some E (e.g., diet) and the uBiome counts?
# What is the sample-size overlap between some P (e.g., glucose) and the uBiome counts?
```{r}
# Sample-size tables read from zipped CSV files.
# Exposures: evarname is the exposure, e_table_name is where it is in .sqlite
sample_size_e <- read_csv(
  file = "./select/sample_size_ubiome_e_112424.csv.zip"
)
# Phenotypes: xvarname is the pheno, x_table_name is where it is in .sqlite
sample_size_p <- read_csv(
  file = "./select/sample_size_ubiome_y_112424.csv.zip"
)
```


```{r}
# Read the tab-separated beta-diversity file (presumably unweighted UniFrac
# for the RB data, going by the file name -- confirm).
beta_diversity <- read_tsv(
  file = "./download/beta/dada2rb-unwunifrac-beta.txt"
)
```