diff --git a/01_DAPC_heatmaps.R b/01_DAPC_heatmaps.R new file mode 100644 index 0000000..c496037 --- /dev/null +++ b/01_DAPC_heatmaps.R @@ -0,0 +1,317 @@ +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +lib<-c("adagenet","ggplot2", "cowplot", "factoextra", "readr", "tidyr", "stringr", "data.table") +lapply(lib,library,character.only=T) + +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/phenotype_data_current/") + +output_folder = "figs/" +extracted_color = "extracted_colour/" + +#bind the UV csv files with the Vis ones and then revise code belows to analyze 4 channels instead of 3 + +IF6_Vis_4T_1R = read.csv(paste(extracted_color, "IF6_Vis_4T_1R.csv", sep="")) + +IF6_UV_4T_1R = read.csv(paste(extracted_color, "IF6_UV_4T_1R.csv", sep="")) + +IF6_Vis_4T_1R = cbind(IF6_Vis_4T_1R, IF6_UV_4T_1R[,4:2479]) + +# write.csv(IF6_Vis_4T_1R, paste(extracted_color, "IF6_Vis_UV_4T_1R.csv"), row.names = FALSE) + + +IF8_Vis_4T_1R = read.csv(paste(extracted_color, "IF8_Vis_4T_1R.csv", sep="")) + +IF8_UV_4T_1R = read.csv(paste(extracted_color, "IF8_UV_4T_1R.csv", sep="")) + +IF8_Vis_4T_1R = cbind(IF8_Vis_4T_1R, IF8_UV_4T_1R[,4:2479]) #specifying columns is to not include the column names in the UV file + +# write.csv(IF8_Vis_4T_1R, paste(extracted_color, "IF8_Vis_UV_4T_1R.csv"), row.names = FALSE) + + + +IF9_Vis_4T_1R = read.csv(paste(extracted_color, "IF9_Vis_4T_1R.csv", sep="")) #waiting on landmarking for IF9 UV and Vis to do this one + +IF9_UV_4T_1R = read.csv(paste(extracted_color, "IF9_UV_4T_1R.csv", sep="")) + +IF9_Vis_4T_1R = cbind(IF9_Vis_4T_1R, IF9_UV_4T_1R[,4:2479]) + +# write.csv(write.csv(IF9_Vis_4T_1R, paste(extracted_color, "IF9_Vis_UV_4T_1R.csv"), row.names = FALSE)) + + +IF10_Vis_4T_1R = read.csv(paste(extracted_color, "IF10_Vis_4T_1R.csv", sep="")) #waiting on landmarking for IF10 UV and Vis to do this one + +IF10_UV_4T_1R = read.csv(paste(extracted_color, "IF10_UV_4T_1R.csv", sep="")) + +IF10_Vis_4T_1R = cbind(IF10_Vis_4T_1R, IF10_UV_4T_1R[,4:2479]) + +# write.csv(write.csv(IF10_Vis_4T_1R, paste(extracted_color, "IF10_Vis_UV_4T_1R.csv"), row.names = FALSE)) + + + +# write.csv(IF10_Vis_4T_1R, paste(extracted_color, "IF10_Vis_UV_4T_1R.csv"), row.names = FALSE) + +# # Just read in the clean files +# IF10_Vis_4T_1R = read.csv("data/Iso-Y_phenotype_data-main/ IF10_Vis_UV_4T_1R.csv", sep="") +# IF6_Vis_4T_1R = read.csv("data/Iso-Y_phenotype_data-main/ IF6_Vis_UV_4T_1R.csv", sep="") +# IF8_Vis_4T_1R = read.csv("data/Iso-Y_phenotype_data-main/ IF8_Vis_UV_4T_1R.csv", sep="") +# IF9_Vis_4T_1R = read.csv("data/Iso-Y_phenotype_data-main/IF9_Vis_UV_4T_1R.csv", sep="") + + +#unwarp all populations to the same, global consensus so that there is only 1 set of coords for below - need to update the Sampling_locations.csv file to be global consensus once I know how to make that file + + +## X Y coords of sampling points, saved from image processing/sample collection custom code +## The X Y coords from the image processing may need to have the y axis flipped. If the fish shown in the plots at the end of this code are upside-down, just multiply your y coords by -1 +X_Y_coords = read.csv("extracted_colour//Sampling_locations.csv") + +IF6_name = rep("Iso-Y6", times = 41) +IF8_name = rep("Iso-Y8", times = 48) +IF9_name = rep("Iso-Y9", times = 42) +IF10_name = rep("Iso-Y10", times = 42) + + +## Combine the above vectors into one vector. This vector is in the appropriate order so that the population label aligns with the dataframe rows +#dapc.all_0p_4DT.pops = c(IF6_name, IF8_name, IF10_name) +dapc.all_0p_4DT.pops = c(IF6_name, IF8_name, IF9_name, IF10_name) + +dapc.all_0p_4DT = rbind(IF6_Vis_4T_1R, IF8_Vis_4T_1R, IF9_Vis_4T_1R, IF10_Vis_4T_1R) + + +dapc_4DT.2_noPRLP17 = dapc(dapc.all_0p_4DT[,c(4:9907)], dapc.all_0p_4DT.pops, var.contrib = TRUE, scale = TRUE, n.pca = 10, n.da = 10, var.loadings = TRUE) +#n.da is the number of discriminant functions to use, which defaults to 1 less than number of groups + + +IF_line = c(rep(6,41),rep(8,48),rep(10,42),rep(10,42)) + +dapc_4DT.2_noPRLP17$ind.coord_lin_names = cbind(IF_line, dapc_4DT.2_noPRLP17$ind.coord) + + + + +fish_dapc_loading_heatmap = function(dapc_object, axis, filename, ylim, X_Y_coords, color.channel = "sum",plot.legend=T){ + + + variable.loadings <- dapc_object$var.load[,axis] + + + variable.names <- names(dapc_object$var.load[,axis]) + + + + variable.loadings.data.frame = as.data.frame(variable.loadings) + variable.names.data.frame = as.data.frame(variable.names) + + + loadings.and.names = cbind(variable.loadings.data.frame, variable.names.data.frame) + + + colnames(loadings.and.names) = c("corrs", "point") + + + red.loadings.and.names = loadings.and.names[1:2476, ] #the number of rows in each needs to match number of rows in X_Y_coords - if I want to use thresholds >0 above, need to find a way to workaround this issue + green.loadings.and.names = loadings.and.names[2477:4952, ] + blue.loadings.and.names = loadings.and.names[4953:7428, ] + UV.loadings.and.names = loadings.and.names[7429:9904, ] + + all.color.loadings = as.data.frame(cbind(red.loadings.and.names$corrs, green.loadings.and.names$corrs, blue.loadings.and.names$corrs, UV.loadings.and.names$corrs)) + colnames(all.color.loadings) = c("red", "gre", "blu", "UV") + all.color.loadings[1:5, ] + + + coordinates.and.loadings = cbind(X_Y_coords, all.color.loadings) + + coordinates.and.loadings$mean = rowMeans(coordinates.and.loadings[,c("red","gre","blu", "UV")]) + + if(color.channel == "mean"){ + all.color.loadings = cbind(X_Y_coords, coordinates.and.loadings[7]) + low.color = "white" + high.color = "black" + } else if(color.channel == "red"){ + all.color.loadings = cbind(X_Y_coords, coordinates.and.loadings$red) + low.color = "black" + high.color = "red" + }else if(color.channel == "green"){ + all.color.loadings = cbind(X_Y_coords, coordinates.and.loadings$gre) + low.color = "black" + high.color = "green" + }else if(color.channel == "blue"){ + all.color.loadings = cbind(X_Y_coords, coordinates.and.loadings$blu) + low.color = "black" + high.color = "blue" + }else { #default is UV + all.color.loadings = cbind(X_Y_coords, coordinates.and.loadings$UV) + low.color = "black" + high.color = "white" + } + names(all.color.loadings) = c("V1","V2", "response") + + if(plot.legend){ + ggplot(all.color.loadings, aes(V1,-V2, color = response)) + + ggtitle("") + + geom_point(size=1.5) + + coord_fixed() + + scale_color_gradient2(low = low.color, mid = "grey", high = high.color,breaks = c(-0.006,0,0.006),labels = c(-0.006,0,0.006), limits = ylim, name = toupper(color.channel)) + + theme(axis.line=element_blank(),axis.text.x=element_blank(), + plot.title = element_text(size = 24, family = "Avenir", hjust = 0.5, vjust=5), + axis.text.y=element_blank(),axis.ticks=element_blank(), + axis.title.x=element_blank(), + axis.title.y=element_blank(), + panel.background=element_blank(),panel.border=element_blank(),panel.grid.major=element_blank(), + panel.grid.minor=element_blank(),plot.background=element_blank(), + legend.position = "right", + legend.text = element_text(size=10, family = "Avenir"), + legend.title = element_text(size=12, family = "Avenir")) + + } else { + ggplot(all.color.loadings, aes(V1,-V2, color = response)) + + ggtitle("") + + geom_point(size=1.5) + + coord_fixed() + + scale_color_gradient2(low = low.color, mid = "grey", high = high.color, limits = ylim, name = toupper(color.channel)) + + theme(axis.line=element_blank(),axis.text.x=element_blank(), + plot.title = element_text(size = 24, family = "Avenir", hjust = 0.5, vjust=5), + axis.text.y=element_blank(),axis.ticks=element_blank(), + axis.title.x=element_blank(), + axis.title.y=element_blank(), + panel.background=element_blank(),panel.border=element_blank(),panel.grid.major=element_blank(), + panel.grid.minor=element_blank(),plot.background=element_blank(), + legend.position = "none") + } + +} + +# List colours +cols <- c("red","green","blue","UV") +dapc1_fish <- lapply(cols,function(colour){ + fish_dapc_loading_heatmap(dapc_object = dapc_4DT.2_noPRLP17, axis = 1, filename = paste(output_folder, "DAPC_ax1_red_corrs.png", sep = ""), ylim = c(-0.006,0.006), X_Y_coords = X_Y_coords, color.channel = colour,plot.legend = F) +}) +dapc1_fish[[1]] <- dapc1_fish[[1]]+ggtitle("DF1") +dapc1_fish_fig <- cowplot::plot_grid(plotlist = dapc1_fish,ncol=1) + +dapc2_fish <- lapply(cols,function(colour){ + fish_dapc_loading_heatmap(dapc_object = dapc_4DT.2_noPRLP17, axis = 2, filename = paste(output_folder, "DAPC_ax1_red_corrs.png", sep = ""), ylim = c(-0.006,0.006), X_Y_coords = X_Y_coords, color.channel = colour,plot.legend = T) +}) +dapc2_fish[[1]] <- dapc2_fish[[1]]+ggtitle("DF2") +dapc2_fish_fig <- cowplot::plot_grid(plotlist = dapc2_fish,ncol=1) + +dapc22_fish <- lapply(cols,function(colour){ + fish_dapc_loading_heatmap(dapc_object = dapc_4DT.2_noPRLP17, axis = 2, filename = paste(output_folder, "DAPC_ax1_red_corrs.png", sep = ""), ylim = c(-0.006,0.006), X_Y_coords = X_Y_coords, color.channel = colour,plot.legend = F) +}) +dapc22_fish[[1]] <- dapc22_fish[[1]]+ggtitle("DF2") +dapc22_fish_fig <- cowplot::plot_grid(plotlist = dapc22_fish,ncol=1) + +dapc3_fish <- lapply(cols,function(colour){ + fish_dapc_loading_heatmap(dapc_object = dapc_4DT.2_noPRLP17, axis = 3, filename = paste(output_folder, "DAPC_ax1_red_corrs.png", sep = ""), ylim = c(-0.006,0.006), X_Y_coords = X_Y_coords, color.channel = colour,plot.legend = T) +}) +dapc3_fish[[1]] <- dapc3_fish[[1]]+ggtitle("DF3") +dapc3_fish_fig <- cowplot::plot_grid(plotlist = dapc3_fish,ncol=1) + +# Read in DAPC figure and plot alongside... +# dapc_fig <- readRDS("data/complete_dapc_fig.rds") + +###### MAKE DAPC FIGURE HERE ######## +library(adegenet) +library(ggplot2) +DAPC_data <- readRDS("DAPC_results.rds") + +# Plot with ggplot +plot_dd <- data.frame(DAPC_data$ind.coord) +plot_dd$IF_line <- DAPC_data$grp +plot_dd$IF_line <- gsub('\\s+', '', plot_dd$IF_line) + +# Get all the label info... +cols <- data.frame(line=paste0("Iso-Y",c(10,6,8,9)), + col=c("#3CBB75","#B22222","#2D708E","#481567")) + +plot_labs <- data.frame(DAPC_data$grp.coord) +plot_labs$IF_line <- rownames(plot_labs) +plot_labs$IF_line <- c("Iso-Y10", "Iso-Y6", "Iso-Y8", "Iso-Y9") + +# And get the eigenvals +eigenvals <- round((DAPC_data$eig/sum(DAPC_data$eig)) * 100,2) + + +dapc_fig <- ggplot(plot_dd,aes(LD1,LD2,colour=IF_line))+ + theme_classic()+ + geom_vline(xintercept = 0,alpha=0.3)+ + geom_hline(yintercept = 0,alpha=0.3)+ + geom_point(show.legend = F,size=3, alpha=0.7)+ + scale_colour_manual(breaks = cols$line, + values = cols$col)+ + geom_segment(data=plot_dd[plot_dd$IF_line == "Iso-Y6",],aes(x=LD1,xend=plot_labs[plot_labs$IF_line == "Iso-Y6","LD1"], + y=LD2,yend=plot_labs[plot_labs$IF_line == "Iso-Y6","LD2"]))+ + geom_segment(data=plot_dd[plot_dd$IF_line == "Iso-Y8",],aes(x=LD1,xend=plot_labs[plot_labs$IF_line == "Iso-Y8","LD1"], + y=LD2,yend=plot_labs[plot_labs$IF_line == "Iso-Y8","LD2"]))+ + geom_segment(data=plot_dd[plot_dd$IF_line == "Iso-Y9",],aes(x=LD1,xend=plot_labs[plot_labs$IF_line == "Iso-Y9","LD1"], + y=LD2,yend=plot_labs[plot_labs$IF_line == "Iso-Y9","LD2"]))+ + geom_segment(data=plot_dd[plot_dd$IF_line == "Iso-Y10",],aes(x=LD1,xend=plot_labs[plot_labs$IF_line == "Iso-Y10","LD1"], + y=LD2,yend=plot_labs[plot_labs$IF_line == "Iso-Y10","LD2"]))+ + + geom_label(data=plot_labs,aes(x=LD1,y=LD2,label=IF_line),show.legend = F,size=8, family = "Avenir", alpha=0.8)+ + theme(axis.title = element_text(size=20, family = "Avenir"), + axis.text = element_text(size=18, famil = "Avenir"), + legend.position = "none")+ + scale_x_continuous(breaks=c(-6,0,6))+ + scale_y_continuous(breaks=c(-6,0,6))+ + labs(x=paste0("Discriminant Function 1 (",eigenvals[1],"%)"), + y=paste0("Discriminant Function 2 (",eigenvals[2],"%)"))+ + stat_ellipse() + +png("figs/Figure1_full_dapc_only.png",width = 40, height = 20, units = "cm", res = 400) +# plot_grid(dapc_fig,dapc1_fish_fig,dapc2_fish_fig,ncol = 3,rel_widths = c(2.5,0.75,1.05),labels=c("a","b"),label_size = 20) +plot_grid(dapc_fig,ncol = 1,rel_widths = c(2.5)) +dev.off() + +## Make figure for DAPC Axes 2 & 3 +dapc23_fig <- ggplot(plot_dd,aes(LD2,LD3,colour=IF_line))+ + theme_classic()+ + geom_vline(xintercept = 0,alpha=0.5)+ + geom_hline(yintercept = 0,alpha=0.5)+ + geom_point(show.legend = F,size=2)+ + scale_colour_manual(breaks = cols$line, + values = cols$col)+ + geom_segment(data=plot_dd[plot_dd$IF_line == "Iso-Y6",],aes(x=LD2,xend=plot_labs[plot_labs$IF_line == "Iso-Y6","LD2"], + y=LD3,yend=plot_labs[plot_labs$IF_line == "Iso-Y6","LD3"]))+ + geom_segment(data=plot_dd[plot_dd$IF_line == "Iso-Y8",],aes(x=LD2,xend=plot_labs[plot_labs$IF_line == "Iso-Y8","LD2"], + y=LD3,yend=plot_labs[plot_labs$IF_line == "Iso-Y8","LD3"]))+ + geom_segment(data=plot_dd[plot_dd$IF_line == "Iso-Y9",],aes(x=LD2,xend=plot_labs[plot_labs$IF_line == "Iso-Y9","LD2"], + y=LD3,yend=plot_labs[plot_labs$IF_line == "Iso-Y9","LD3"]))+ + geom_segment(data=plot_dd[plot_dd$IF_line == "Iso-Y10",],aes(x=LD2,xend=plot_labs[plot_labs$IF_line == "Iso-Y10","LD2"], + y=LD3,yend=plot_labs[plot_labs$IF_line == "Iso-Y10","LD3"]))+ + + geom_label(data=plot_labs,aes(x=LD2,y=LD3,label=IF_line),show.legend = F,size=8,alpha=0.8, family = "Avenir")+ + theme(axis.title = element_text(size=20, family = "Avenir"), + axis.text = element_text(size=18, family = "Avenir"), + legend.position = "none")+ + labs(x=paste0("Discriminant Function 2 (",eigenvals[2],"%)"), + y=paste0("Discriminant Function 3 (",eigenvals[3],"%)"))+ + stat_ellipse() + + + + +##################################### +empty <- ggplot()+theme_void() + +# Compose +library(cowplot) +# pdf("figs/full_dapc_heatmaps.pdf",width = 14,height=10) +# plot_grid(plot_grid(dapc2_fish_fig,dapc_fig,ncol=2,rel_widths = c(1,3)), +# plot_grid(empty,dapc1_fish_fig,ncol=2,rel_widths = c(1,3)), +# nrow = 2,ncol = 1,rel_heights = c(3,1)) +# dev.off() + +png("figs/Figure1_full_dapc_heatmaps.png",width = 40, height = 20, units = "cm", res = 400) +# plot_grid(dapc_fig,dapc1_fish_fig,dapc2_fish_fig,ncol = 3,rel_widths = c(2.5,0.75,1.05),labels=c("a","b"),label_size = 20) +plot_grid(dapc_fig,dapc1_fish_fig,dapc2_fish_fig,ncol = 3,rel_widths = c(2.5,0.75,1.05)) +dev.off() + +pdf("figs/FigureSX_full_dapc_DF2_DF3_heatmaps.pdf",width = 14,height=8) +plot_grid(dapc23_fig,dapc22_fish_fig,dapc3_fish_fig,ncol = 3,rel_widths = c(2.5,0.75,1.15),labels=c("A","B"),label_size = 20) +dev.off() + + + + diff --git a/02_phenotype_PCA_plots.R b/02_phenotype_PCA_plots.R new file mode 100644 index 0000000..ffccce8 --- /dev/null +++ b/02_phenotype_PCA_plots.R @@ -0,0 +1,114 @@ +# new general run +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +lib<-c("lemon", "grid", "gridExtra", "ggplot2", "cowplot", "factoextra", "readr", "tidyr", "stringr", "data.table") +lapply(lib,library,character.only=T) + +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/phenotype_data_current/extracted_colour/") + +PC.for.ADONIS <- read.csv(file = "PC_for_ADONIS.csv", header = TRUE, sep = ",") + +#adding line labels +## create a vector with the line ID; the number is how many males in that group +IF6_name = rep("Iso-Y 6", times = 41) +IF8_name = rep("Iso-Y 8", times = 48) +IF9_name = rep("Iso-Y 9", times = 42) +IF10_name = rep("Iso-Y 10", times = 42) + +Line = factor(c(IF6_name, IF8_name, IF9_name, IF10_name), levels = c("Iso-Y 6", "Iso-Y 8", "Iso-Y 9", "Iso-Y 10") ) + +PC.for.ADONIS <- cbind(Line, PC.for.ADONIS) + + +element_textbox <- function(...) { + el <- element_text(...) + class(el) <- c("element_textbox", class(el)) + el +} + +element_grob.element_textbox <- function(element, ...) { + text_grob <- NextMethod() + rect_grob <- element_grob(calc_element("strip.background", theme_bw())) + + ggplot2:::absoluteGrob( + grid::gList( + element_grob(calc_element("strip.background", theme_bw())), + text_grob + ), + height = grid::grobHeight(text_grob), + width = grid::unit(1, "npc") + ) +} + +scaleFUN <- function(x) sprintf("%.3f", x) + +myCol <- c("#B22222", "#2D708E", "#481567", "#3CBB75") # 6,8,9,10 + +c("#3CBB75", "#B22222", "#2D708E", "#481567") + + +pc_plotter = function(dataset, column, title, ylabel = " ", legend.position = "none") { + + + + a <- ggplot(dataset, aes(x = dataset[,column])) + a = a + geom_density(aes(fill = Line), alpha = 0.4) + + labs(title = title) + + ylab(ylabel) + + theme(plot.title = element_textbox( + family="Avenir", size = 24, hjust = 0.5, margin = margin(t = 5, b = 5) + ), axis.title.x=element_blank(), axis.text.x = element_text(family="Avenir", size = 20), axis.text.y = element_text(family="Avenir", size = 24), axis.title.y = element_text(family="Avenir", size = 24), + panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"), legend.position = legend.position) + + scale_y_continuous(labels=scaleFUN) + + scale_fill_manual( values = myCol) + a +} + + + +a = pc_plotter(PC.for.ADONIS, 3, "PC1", legend.position = "top") + +b = pc_plotter(PC.for.ADONIS, 4, "PC2") + +c = pc_plotter(PC.for.ADONIS, 5, "PC3") + +d = pc_plotter(PC.for.ADONIS, 6, "PC4") + +e = pc_plotter(PC.for.ADONIS, 7, "PC5") + +f = pc_plotter(PC.for.ADONIS, 8, "PC6") + +g = pc_plotter(PC.for.ADONIS, 9, "PC7") + +h = pc_plotter(PC.for.ADONIS, 10, "PC8") + +i = pc_plotter(PC.for.ADONIS, 11, "PC9") + +j = pc_plotter(PC.for.ADONIS, 12, "PC10") + +k = pc_plotter(PC.for.ADONIS, 13, "PC11") + +l = pc_plotter(PC.for.ADONIS, 14, "PC12") + +m = pc_plotter(PC.for.ADONIS, 15, "PC13") + +n = pc_plotter(PC.for.ADONIS, 16, "PC14") + +o = pc_plotter(PC.for.ADONIS, 17, "PC15") + +p = pc_plotter(PC.for.ADONIS, 18, "PC16") + +q = pc_plotter(PC.for.ADONIS, 19, "PC17") + + + + +pp <- grid_arrange_shared_legend(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q, nrow = 6, ncol = 3, position = c("top")) #ncol determines number of columns +#nrow determines number of rows + + +ggsave("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/Figures/phenotype_PCs_SIFigure1.png", pp, width = 40, height = 40, units = "cm", dpi = 400) + diff --git a/03_phenotype_perm.R b/03_phenotype_perm.R new file mode 100644 index 0000000..fe050af --- /dev/null +++ b/03_phenotype_perm.R @@ -0,0 +1,287 @@ +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +lib<-c("dplyr","ggplot2", "devtools", "magrittr", "gridExtra", "readr", "tidyr", "stringr", "data.table") +lapply(lib,library,character.only=T) + +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/phenotype_data_current/") + +D_all_lines1 <- read.csv(file = "./extracted_colour/IF_lines_trace.csv") +extracted_color = "extracted_colour/" + +#bind the UV csv files with the Vis ones and then revise code belows to analyze 4 channels instead of 3 + +IF6_Vis_4T_1R = read.csv(paste(extracted_color, "IF6_Vis_4T_1R.csv", sep="")) + +IF6_UV_4T_1R = read.csv(paste(extracted_color, "IF6_UV_4T_1R.csv", sep="")) + +IF6_Vis_4T_1R = cbind(IF6_Vis_4T_1R, IF6_UV_4T_1R[,4:ncol(IF6_UV_4T_1R)]) + +IF8_Vis_4T_1R = read.csv(paste(extracted_color, "IF8_Vis_4T_1R.csv", sep="")) + +IF8_UV_4T_1R = read.csv(paste(extracted_color, "IF8_UV_4T_1R.csv", sep="")) + +IF8_Vis_4T_1R = cbind(IF8_Vis_4T_1R, IF8_UV_4T_1R[,4:ncol(IF8_UV_4T_1R)]) #specifying columns is to not include the column names in the UV file + + +IF9_Vis_4T_1R = read.csv(paste(extracted_color, "IF9_Vis_4T_1R.csv", sep="")) #waiting on landmarking for IF9 UV and Vis to do this one + +IF9_UV_4T_1R = read.csv(paste(extracted_color, "IF9_UV_4T_1R.csv", sep="")) + +IF9_Vis_4T_1R = cbind(IF9_Vis_4T_1R, IF9_UV_4T_1R[,4:ncol(IF9_UV_4T_1R)]) + +IF10_Vis_4T_1R = read.csv(paste(extracted_color, "IF10_Vis_4T_1R.csv", sep="")) + +IF10_UV_4T_1R = read.csv(paste(extracted_color, "IF10_UV_4T_1R.csv", sep="")) + +IF10_Vis_4T_1R = cbind(IF10_Vis_4T_1R, IF10_UV_4T_1R[,4:ncol(IF10_UV_4T_1R)]) + +IF6_vars = apply(IF6_Vis_4T_1R[c(4:9904)], 2, var) + +#write.csv(data.frame(names(IF6_vars), IF6_vars), paste(extracted_color, "IF6_vars.csv"), row.names = FALSE, col.names =c("color channel and sampling location", "variance ")) + +write.table(data.frame(names(IF6_vars), IF6_vars), paste(extracted_color, "IF6_vars.csv"), sep = ",", row.names = FALSE, col.names =c("color channel and sampling location", "variance ")) + + +IF8_vars = apply(IF8_Vis_4T_1R[c(4:9904)], 2, var) + +#write.csv(IF8_vars, paste(extracted_color, "IF8_vars.csv"), col.names = c("color channel and sampling location", "variance ")) + +write.table(data.frame(names(IF8_vars), IF8_vars), paste(extracted_color, "IF8_vars.csv"), sep = ",", row.names = FALSE, col.names =c("color channel and sampling location", "variance ")) + + +IF9_vars = apply(IF9_Vis_4T_1R[c(4:9904)], 2, var) + +IF10_vars = apply(IF10_Vis_4T_1R[c(4:9904)], 2, var) + +#write.csv(IF10_vars, paste(extracted_color, "IF10_vars.csv"), col.names = c("color channel and sampling location", "variance ")) + +write.table(data.frame(names(IF10_vars), IF10_vars), paste(extracted_color, "IF10_vars.csv"), sep = ",", row.names = FALSE, col.names =c("color channel and sampling location", "variance ")) + +#red variances for each line +IF6_r_vars = sum(IF6_vars[1:2476]) + +IF8_r_vars = sum(IF8_vars[1:2476]) + +IF9_r_vars = sum(IF9_vars[1:2476]) + +IF10_r_vars = sum(IF10_vars[1:2476]) + + +#blue variances for each line +IF6_blue_vars = sum(IF6_vars[2477:4952]) + +IF8_blue_vars = sum(IF8_vars[2477:4952]) + +IF9_blue_vars = sum(IF9_vars[2477:4952]) + +IF10_blue_vars = sum(IF10_vars[2477:4952]) + +#green variances for each line +IF6_green_vars = sum(IF6_vars[4953:7428]) + +IF8_green_vars = sum(IF8_vars[4953:7428]) + +IF9_green_vars = sum(IF9_vars[4953:7428]) + +IF10_green_vars = sum(IF10_vars[4953:7428]) + +#UV variances for each line +IF6_UV_vars = sum(IF6_vars[7429:9904]) + +IF8_UV_vars = sum(IF8_vars[7429:9904]) + +IF9_UV_vars = sum(IF9_vars[7429:9904]) + +IF10_UV_vars = sum(IF10_vars[7429:9904]) + +#total variance for each line +IF6_total_vars = sum(IF6_vars) + +IF8_total_vars = sum(IF8_vars) + +IF9_total_vars = sum(IF9_vars) + +IF10_total_vars = sum(IF10_vars) + + +IF9_IF10_diff = IF9_total_vars - IF10_total_vars + +IF8_IF10_diff = IF8_total_vars - IF10_total_vars + +IF6_IF10_diff = IF6_total_vars - IF10_total_vars + +IF8_IF9_diff = IF8_total_vars - IF9_total_vars + +IF6_IF9_diff = IF6_total_vars - IF9_total_vars + +IF6_IF8_diff = IF6_total_vars - IF8_total_vars + + +## Load comparisons +IF9_IF10 = D_all_lines1[,7] +IF8_IF10 = D_all_lines1[,6] +IF8_IF9 = D_all_lines1[,5] +IF6_IF10 = D_all_lines1[,4] +IF6_IF9 = D_all_lines1[,3] +IF6_IF8 = D_all_lines1[,2] + +## Create all the permutation plots + +AR_PERM_plot6 = ggplot(D_all_lines1, aes(x = IF6_IF8)) + + geom_density(fill="gray30", alpha = 0.8)+ + scale_x_continuous(name = "Iso-Y6 - Iso-Y8", limits=c(-16.5,16.5)) + + scale_y_continuous(name = "")+ + #ggtitle("Proportion of permutations less than the observed")+ + theme_classic()+ + theme(axis.line = element_line(size=1, colour = "black"), + panel.border = element_blank(), panel.background = element_blank(), + plot.title = element_text(hjust = 0,size = 12, family = "Avenir"), + text=element_text(family="Avenir"), + axis.title=element_text(size=12,face="bold"), + axis.text.x=element_text(colour="black", size = 12), + axis.text.y=element_text(colour="black", size = 12)) + + geom_vline(xintercept=IF6_IF8_diff, linetype="dashed", size=1.5) + + #geom_rug(data=observ_diff_trace_ar, color = "red", lwd = 2.5) + ggtitle("a") +AR_PERM_plot6 + + +IF6_IF8_prop = sum(abs(IF6_IF8) >= abs(IF6_IF8_diff)) #count the number of traces that were larger than observed +IF6_IF8_P = IF6_IF8_prop/1000 + + +AR_PERM_plot5 = ggplot(D_all_lines1, aes(x = IF6_IF9)) + + geom_density(fill="gray30", alpha = 0.8)+ + scale_x_continuous(name = "Iso-Y6 - Iso-Y9", limits=c(-16.5,16.5)) + + scale_y_continuous(name = "")+ + #ggtitle("Proportion of permutations less than the observed")+ + theme_classic()+ + theme(axis.line = element_line(size=1, colour = "black"), + panel.border = element_blank(), panel.background = element_blank(), + plot.title = element_text(hjust = 0,size = 12, family = "Avenir"), + text=element_text(family="Avenir"), + axis.title=element_text(size=12,face="bold"), + axis.text.x=element_text(colour="black", size = 12), + axis.text.y=element_text(colour="black", size = 12)) + + geom_vline(xintercept=IF6_IF9_diff, linetype="dashed", size=1.5) + + #geom_rug(data=observ_diff_trace_ar, color = "red", lwd = 2.5) + ggtitle("b") +AR_PERM_plot5 + + + +IF6_IF9_prop = sum(abs(IF6_IF9) >= abs(IF6_IF9_diff)) #count the number of traces that were larger than observed +IF6_IF9_P = IF6_IF9_prop/1000 + + +AR_PERM_plot4 = ggplot(D_all_lines1, aes(x = IF8_IF9)) + + geom_density(fill="gray30", alpha = 0.8)+ + scale_x_continuous(name = "Iso-Y8 - Iso-Y9", limits=c(-16.5,16.5)) + + scale_y_continuous(name = "")+ + #ggtitle("Proportion of permutations less than the observed")+ + theme_classic()+ + theme(axis.line = element_line(size=1, colour = "black"), + panel.border = element_blank(), panel.background = element_blank(), + plot.title = element_text(hjust = 0,size = 12, family = "Avenir"), + text=element_text(family="Avenir"), + axis.title=element_text(size=12,face="bold"), + axis.text.x=element_text(colour="black", size = 12), + axis.text.y=element_text(colour="black", size = 12)) + + geom_vline(xintercept=IF8_IF9_diff, linetype="dashed", size=1.5) + #need to make line show up + #geom_rug(data=observ_diff_trace_ar, color = "red", lwd = 2.5) + ggtitle("c") +AR_PERM_plot4 + + + +IF8_IF9_prop = sum(abs(IF8_IF9) >= abs(IF8_IF9_diff)) #count the number of traces that were larger than observed +IF8_IF9_P = IF8_IF9_prop/1000 + + +AR_PERM_plot2 = ggplot(D_all_lines1, aes(x = IF6_IF10)) + + geom_density(fill="gray30", alpha = 0.8)+ + scale_x_continuous(name = "Iso-Y6 - Iso-Y10", limits=c(-16.5,16.5)) + + scale_y_continuous(name = "")+ + #ggtitle("Proportion of permutations less than the observed")+ + theme_classic()+ + theme(axis.line = element_line(size=1, colour = "black"), + panel.border = element_blank(), panel.background = element_blank(), + plot.title = element_text(hjust = 0,size = 12, family = "Avenir"), + text=element_text(family="Avenir"), + axis.title=element_text(size=12,face="bold"), + axis.text.x=element_text(colour="black", size = 12), + axis.text.y=element_text(colour="black", size = 12)) + + geom_vline(xintercept=IF6_IF10_diff, linetype="dashed", size=1.5) + + #geom_rug(data=observ_diff_trace_ar, color = "red", lwd = 2.5) + ggtitle("d") +AR_PERM_plot2 + + + +IF6_IF10_prop = sum(abs(IF6_IF10) >= abs(IF6_IF10_diff)) #count the number of traces that were larger than observed +IF6_IF10_P = IF6_IF10_prop/1000 + + +AR_PERM_plot = ggplot(D_all_lines1, aes(x = IF8_IF10)) + + geom_density(fill="gray30", alpha = 0.8)+ + scale_x_continuous(name = "Iso-Y8 - Iso-Y10", limits=c(-16.5,16.5)) + + scale_y_continuous(name = "")+ + #ggtitle("Proportion of permutations less than the observed")+ + theme_classic()+ + theme(axis.line = element_line(size=1, colour = "black"), + panel.border = element_blank(), panel.background = element_blank(), + plot.title = element_text(hjust = 0,size = 12, family = "Avenir"), + text=element_text(family="Avenir"), + axis.title=element_text(size=12,face="bold"), + axis.text.x=element_text(colour="black", size = 12), + axis.text.y=element_text(colour="black", size = 12)) + + geom_vline(xintercept=IF8_IF10_diff, linetype="dashed", size=1.5) + + #geom_rug(data=observ_diff_trace_ar, color = "red", lwd = 2.5) + ggtitle("e") +AR_PERM_plot + + + +IF8_IF10_prop = sum(abs(IF8_IF10) >= abs(IF8_IF10_diff)) #count the number of traces that were larger than observed +IF8_IF10_P = IF8_IF10_prop/1000 #turns this into a p-value + + +AR_PERM_plot3 = ggplot(D_all_lines1, aes(x = IF9_IF10)) + + geom_density(fill="gray30", alpha = 0.8)+ + scale_x_continuous(name = "Iso-Y9 - Iso-Y10", limits=c(-16.5,16.5)) + + scale_y_continuous(name = "")+ + #ggtitle("Proportion of permutations less than the observed")+ + theme_classic()+ + theme(axis.line = element_line(size=1, colour = "black"), + panel.border = element_blank(), panel.background = element_blank(), + plot.title = element_text(hjust = 0,size = 12, family = "Avenir"), + text=element_text(family="Avenir"), + axis.title=element_text(size=12,face="bold"), + axis.text.x=element_text(colour="black", size = 12), + axis.text.y=element_text(colour="black", size = 12)) + + geom_vline(xintercept=IF9_IF10_diff, linetype="dashed", size=1.5) + + #geom_rug(data=observ_diff_trace_ar, color = "red", lwd = 2.5) + ggtitle("f") +AR_PERM_plot3 + + + +IF9_IF10_prop = sum(abs(IF9_IF10) >= abs(IF9_IF10_diff)) #count the number of traces that were larger than observed +IF9_IF10_P = IF9_IF10_prop/1000 +#P = 0, indicating that Iso-Y 9 is significantly more variable than Iso-Y 10 + +## Put them all together +figure_grid <- grid.arrange(AR_PERM_plot6, AR_PERM_plot5, AR_PERM_plot4, AR_PERM_plot2, AR_PERM_plot, AR_PERM_plot3, ncol = 3, nrow = 3, + layout_matrix = rbind(c(1, NA, NA), + c(2, 3, NA), + c(4, 5, 6) + #C(AR_PERM_plot2, AR_PERM_plot, AR_PERM_plot3, NA)) + ), + bottom = "Difference in phenotypic variance between lines", + left = "Density", + heights = unit(c(2,2,2),c("in","in","in")), + widths = unit(c(2,2,2),c("in","in","in"))) + +ggsave(figure_grid, file = "~/Dropbox/Sussex_Guppies/Analyses/pool-seq/Figures/perm_tests_phenotypes.tiff", dpi = 1200, height=160, width=200, units="mm") diff --git a/CPD_analysis.R b/CPD_analysis.R new file mode 100644 index 0000000..6de7675 --- /dev/null +++ b/CPD_analysis.R @@ -0,0 +1,67 @@ +# new general run +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +#install.packages("changepoint") + +lib<-as.vector(c("changepoint","data.table","devtools","rlang","tidyr","ggplot2")) +lapply(lib,library,character.only=TRUE) + +## working directory +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/change-point/") + +### Read in the chr12 freqs +### from here: +chr12 <- data.frame(fread("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/poolfstat/outputs/final/chr12_AFs_final.tsv", header=TRUE)) + +## Polarise to IF9: +## remove pos +tmp <- chr12 %>% select(IF10_AF, IF6_AF, IF8_AF, IF9_AF) + +## save the IF9 freqs: +IF9_freqs<-tmp$IF9_AF + +for(i in colnames(tmp)){ + tmp[IF9_freqs < 0.5,i]<-(1-tmp[IF9_freqs < 0.5,i]) +} + +## add IF9 polarised back in back in: +chr12_dd <- cbind(chr12,tmp) + +colnames(chr12_dd) <- c("chr", "pos", "REF", "ALT", "IF10_AF", "IF6_AF", "IF8_AF", "IF9_AF", + "IF10_polar", "IF6_polar", "IF8_polar", "IF9_polar") + + +## take out polarised +chr12 <- chr12_dd %>% select(pos,IF10_polar,IF9_polar,IF8_polar,IF6_polar) + +## Perform CPD +IF6_chr12 <- chr12 %>% select(IF6_polar) +IF6_chr12 <- as.numeric(IF6_chr12$IF6_polar) + +IF8_chr12 <- chr12 %>% select(IF8_polar) +IF8_chr12 <- as.numeric(IF8_chr12$IF8_polar) + +IF9_chr12 <- chr12 %>% select(IF9_polar) +IF9_chr12 <- as.numeric(IF9_chr12$IF9_polar) + +IF10_chr12 <- chr12 %>% select(IF10_polar) +IF10_chr12 <- as.numeric(IF10_chr12$IF10_polar) + +# change in mean allele frequencies +IF6mean=cpt.mean(IF6_chr12, method = "BinSeg", penalty = "SIC", Q=10) + +# change in mean allele frequencies +IF8mean=cpt.mean(IF8_chr12, method = "BinSeg", penalty = "SIC", Q=10) + +# change in mean allele frequencies +IF9mean=cpt.mean(IF9_chr12, method = "BinSeg", penalty = "SIC", Q=10) + +# change in mean allele frequencies +IF10mean=cpt.mean(IF10_chr12, method = "BinSeg", penalty = "SIC", Q=10) + +IF6mean@cpts +IF8mean@cpts +IF9mean@cpts +IF10mean@cpts diff --git a/GO_KEGG_analysis_LG1.R b/GO_KEGG_analysis_LG1.R new file mode 100644 index 0000000..7905e0c --- /dev/null +++ b/GO_KEGG_analysis_LG1.R @@ -0,0 +1,167 @@ +# new general run +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/colour_gene_exploration/") + +lib<-c("clusterProfiler", "enrichplot", "parallel","ggplot2", "cowplot", "ggrepel", "gridExtra", "biomaRt", "AnnotationHub") +lapply(lib,library,character.only=T) + +## use ensembl annotations +ensembl=useMart("ensembl") + +### list species available: +# ensembl_dd <- as.data.frame(listDatasets(ensembl)) + +## get guppy +guppy <- useDataset("preticulata_gene_ensembl",mart=ensembl) + +## check it out if you fancy +# listAttributes(guppy) + +## Grab the guppy universe +guppy_universe<-getBM(attributes = c("ensembl_gene_id","entrezgene_id","kegg_enzyme"), + mart=guppy) + +guppy_entrez_universe<-as.character(unique(guppy_universe$entrezgene_id)) + + +# Fetch the Uniprot IDs... +ensembl_digger<-function(outliers=NULL,genome1=NULL,genome2=NULL,aln_block_length=1000,aln_qual=40, + biomart_attributes=c("ensembl_gene_id","entrezgene_id","uniprot_gn_id", "description", "external_gene_name", "kegg_enzyme","chromosome_name","start_position","end_position", "go_id", "name_1006", "definition_1006"),n.cores=1){ + # Firstly, take all the outlier regions and make a multi-fasta + lapply(outliers,function(x){ + system(paste0("samtools faidx ",genome1," ",x," >> outputs/tmp_multi.fa")) + }) + # Align to old genome + system2('/usr/local/bin/minimap2', + args=c(paste0("-t ",n.cores),genome2,"outputs/tmp_multi.fa"), + stdout ="outputs/tmp_multi.paf",wait=T) + # Fetch regions + aln<-read.table("outputs/tmp_multi.paf",fill=T) + # Keep tidy! + system(paste0("rm -f outputs/tmp_multi*")) + # Only keep "high support alignment regions" + aln<-aln[aln$V11 > aln_block_length & aln$V12 >= aln_qual,] + regions<-paste0(aln$V6,":",as.integer(aln$V8),":",as.integer(aln$V9)) + # Pull uniprot genes from biomaRt for each region + tmp_biomart<-getBM(attributes = biomart_attributes, + filters= "chromosomal_region", + values=regions, + mart=guppy) + # Return + return(tmp_biomart) +} + +## Coordinates for analysis +## Region 1 genes: 4,079,988 - 5,984,584 bp +### Region 2 genes 9,627,619 - 17,074,870 bp +### Region 3 genes: 21,944,840 - 24,959,750 bp +## Region of interest: chr1:12210000-12220000 +## LG12 MDS2 coordinates: 5,608,703 - 7,063,435 bp +## LG12 MDS1 coordinates: 21,913,633 - 25,664,533 bp + +outliers<- "chr1" + +male_genome <- "~/Dropbox/Sussex_Guppies/genomes/STAR.chromosomes.release.fasta" + +female_genome <- "~/Dropbox/Sussex_Guppies/genomes/Poecilia_reticulata.Guppy_female_1.0_MT.dna.toplevel.fa" + +LG1 <- ensembl_digger(outliers=outliers, genome1 = male_genome, genome2 = female_genome) + +## write results: +write.table(file = "./outputs/LG12_MDS1_genes.tsv", LG12_MDS1, sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE) + +## Read results in +LG1_region1 <- read_tsv("outputs/LG1_region1_genes.tsv") +LG1_region2 <- read_tsv("outputs/LG1_region2_genes.tsv") +LG1_region3 <- read_tsv("outputs/LG1_region3_genes.tsv") + + +### First try KEGG enrichment: +LG1_region3_enrich<-enrichKEGG(gene=na.omit(unique(LG1_region3$entrezgene_id)), + keyType = 'kegg', + organism = 'pret', + pvalueCutoff = 0.05, + pAdjustMethod = "fdr", + universe = guppy_entrez_universe) + +## Add to a table, and write results: +LG1_region1_enrich_results <- LG1_region1_enrich@result +write.table(file="./outputs/LG1_region1_KEGG_results.tsv", LG1_region1_enrich_results, sep="\t", quote=FALSE,col.names=TRUE,row.names=FALSE) + +LG1_region2_enrich_results <- LG1_region2_enrich@result +write.table(file="./outputs/LG1_region2_KEGG_results.tsv", LG1_region2_enrich_results, sep="\t", quote=FALSE,col.names=TRUE,row.names=FALSE) + +LG1_region3_enrich_results <- LG1_region3_enrich@result +write.table(file="./outputs/LG1_region3_KEGG_results.tsv", LG1_region3_enrich_results, sep="\t", quote=FALSE,col.names=TRUE,row.names=FALSE) + +# Try with all genes in the identified regions on LG1: + +region1 <- LG1_region1$entrezgene_id +region2 <- LG1_region2$entrezgene_id +region3 <- LG1_region3$entrezgene_id + +## append +LG1_regions <- c(region1,region2,region3) + +LG1_regions_enrich<-enrichKEGG(gene=na.omit(unique(LG1_regions)), + keyType = 'kegg', + organism = 'pret', + pvalueCutoff = 0.05, + pAdjustMethod = "fdr", + universe = guppy_entrez_universe) + +# Write results +LG1_regions_enrich_results <- LG1_regions_enrich@result +write.table(file="./outputs/LG1_regions_KEGG_results.tsv", LG1_regions_enrich_results, sep="\t", quote=FALSE,col.names=TRUE,row.names=FALSE) + +## Ok and now GO analysis ... + +library(AnnotationHub) + +## Get the hub +hub <- AnnotationHub() + +## Take a look for latest OrgDB +d <- display(hub) + +## on 14/06/2021, this was the latest hub for PRET +PRET <- hub[["AH86018"]] + +## You can look at what info the annotation has (if you so desire): +# biomaRt::keytypes(PRET) + +## Now run for each region +LG1_region3_enrich_GO <-enrichGO(gene=na.omit(unique(LG1_region3$entrezgene_id)), + keyType = 'ENTREZID', + OrgDb = PRET, + pvalueCutoff = 0.05, + pAdjustMethod = "fdr", + universe = guppy_entrez_universe) + +## Add to a table, and write results: +LG1_region1_enrich_GO_results <- LG1_region1_enrich_GO@result +write.table(file="./outputs/LG1_region1_GO_results.tsv", LG1_region1_enrich_GO_results, sep="\t", quote=FALSE,col.names=TRUE,row.names=FALSE) + +LG1_region2_enrich_GO_results <- LG1_region2_enrich_GO@result +write.table(file="./outputs/LG1_region2_GO_results.tsv", LG1_region2_enrich_GO_results, sep="\t", quote=FALSE,col.names=TRUE,row.names=FALSE) + +LG1_region3_enrich_GO_results <- LG1_region3_enrich_GO@result +write.table(file="./outputs/LG1_region3_GO_results.tsv", LG1_region3_enrich_GO_results, sep="\t", quote=FALSE,col.names=TRUE,row.names=FALSE) + + +## And finally run for all the regions +LG1_regions_enrich_GO <-enrichGO(gene=na.omit(unique(LG1_regions)), + keyType = 'ENTREZID', + OrgDb = PRET, + pvalueCutoff = 0.05, + pAdjustMethod = "fdr", + universe = guppy_entrez_universe) + +LG1_regions_enrich_GO_results <- LG1_regions_enrich_GO@result +write.table(file="./outputs/LG1_regions_GO_results.tsv", LG1_regions_enrich_GO_results, sep="\t", quote=FALSE,col.names=TRUE,row.names=FALSE) + + + diff --git a/LD_heatmap.R b/LD_heatmap.R new file mode 100644 index 0000000..124aa08 --- /dev/null +++ b/LD_heatmap.R @@ -0,0 +1,190 @@ +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/linkage/") +lib<-c("ggplot2","data.table","viridis","parallel", "LDheatmap", "reshape2", "cowplot", "grid") +lapply(lib,library,character.only=T) + +############################## +## LG1 +############################## + +## read in the LD matrix from plink for LG1: +chr1<-read.table("./inputs/chr1.nat_data.POLY.maf0.1.5kTHIN.SQUARE.ld",header = F) + +# read in a sites table for the matrix +sites<-read.table("inputs/chr1.nat_data.POLY.maf0.1.5kTHIN_sites.tsv",header = F) + +## add the positions to the LD matrix (colnames and rownames) +colnames(chr1)<-sites$V2 +rownames(chr1)<-sites$V2 + +## remove NAs +chr1[chr1 =='NaN']<- NA +## remove Na columns and rows +chr1 <-chr1[rowSums(is.na(chr1)) != ncol(chr1), ] +chr1 <-chr1[,colSums(is.na(chr1)) != nrow(chr1)] + +## get a list of the snps +snps<-colnames(chr1) + +## list of SNPs every 100 SNPs +snps2<-snps[seq(1,length(snps),100)] + +## Keep it as a matrix format: +chr1_ld<-as.matrix(chr1) + + +chr1_map<-LDheatmap(chr1_ld,add.map = F,color=plasma(100), add.key = TRUE, flip=TRUE, + title = "LG1") + +png("./figs/LG1_LD.png",res=300, width=600, height=600) +grid.newpage() +grid.draw(chr1_map$LDheatmapGrob) +dev.off() + +############################## +## LG12 +############################## + +## read in the LD matrix from plink +chr12<-read.table("inputs/chr12.nat_data_STAR_update.POLY.maf.5kTHIN.SQUARE.ld",header = F) + +# read in a sites table for the matrix +sites<-read.table("./inputs/chr12.nat_data_STAR_update.POLY.maf.5kTHIN.sites.tsv",header = F) + +## add sites to the matrix +colnames(chr12)<-sites$V1 +rownames(chr12)<-sites$V1 + +## remove NAs +chr12[chr12 =='NaN']<- NA + +## remove Na columns and rows +chr12<-chr12[rowSums(is.na(chr12)) != ncol(chr12), ] +chr12<-chr12[,colSums(is.na(chr12)) != nrow(chr12)] + +## get a list of the snps every 100 SNPs +snps<-colnames(chr12) +snps2<-snps[seq(1,length(snps),100)] + +chr12_ld<-as.matrix(chr12) + +## Plot +chr12_map<-LDheatmap(chr12_ld,add.map = F,color=plasma(100), SNP.name = snps2, add.key=FALSE,title = "Chromosome 12") + +png("./figs/LG12_LD.png",res=300, width=600, height=600) +grid.newpage() +grid.draw(chr12_map$LDheatmapGrob) +dev.off() + + +############################## +## LG1 Males only +############################## + +## read in male only linkage +chr1_males <-read.table("./inputs/chr1.nat_data.POLY.maf0.1.5kTHIN.males.SQUARE.ld",header = F) +# read in a sites table for the matrix +males_sites<-read.table("inputs/chr1.nat_data.POLY.maf0.1.5kTHIN.males.sites.tsv",header = F) + +## add the positions to the LD matrix (colnames and rownames) +colnames(chr1_males)<-males_sites$V2 +rownames(chr1_males)<-males_sites$V2 + +## remove NAs +chr1_males[chr1_males =='NaN']<- NA +## remove Na columns and rows +chr1_males <-chr1_males[rowSums(is.na(chr1_males)) != ncol(chr1_males), ] +chr1_males <-chr1_males[,colSums(is.na(chr1_males)) != nrow(chr1_males)] + +## get a list of the snps +snps<-colnames(chr1_males) + +## list of SNPs every 10 +snps2<-snps[seq(1,length(snps),10)] + +## Keep it as a matrix format: +chr1_males_ld<-as.matrix(chr1_males) + +chr1_males_map<-LDheatmap(chr1_males_ld,add.map = T,color=plasma(10), add.key = FALSE, SNP.name = snps2, + title = "LG1 Males") + +png("./figs/LG1_males_only.png",res=300, width=600, height=600) +grid.newpage() +grid.draw(high_map$LDheatmapGrob) +dev.off() + + + +############################## +## LG1 Females only +############################## + +## read in male only linkage +chr1_females <-read.table("./inputs/chr1.nat_data.POLY.maf0.1.5kTHIN.females.SQUARE.ld",header = F) +# read in a sites table for the matrix +females_sites<-read.table("inputs/chr1.nat_data.POLY.maf0.1.5kTHIN.females.sites.tsv",header = F) + +## add the positions to the LD matrix (colnames and rownames) +colnames(chr1_females)<-females_sites$V2 +rownames(chr1_females)<-females_sites$V2 + +## remove NAs +chr1_females[chr1_females =='NaN']<- NA +## remove Na columns and rows +chr1_females <-chr1_females[rowSums(is.na(chr1_females)) != ncol(chr1_females), ] +chr1_females <-chr1_females[,colSums(is.na(chr1_females)) != nrow(chr1_females)] + +## get a list of the snps +snps<-colnames(chr1_females) + +## list of SNPs every 10 +snps2<-snps[seq(1,length(snps),10)] + +## Keep it as a matrix format: +chr1_females_ld<-as.matrix(chr1_females) + +chr1_females_map<-LDheatmap(chr1_females_ld,add.map = T,color=plasma(10), add.key = FALSE, SNP.name = snps2, + title = "LG1 Males") + +png("./figs/LG1_females_only.png",res=300, width=600, height=600) +grid.newpage() +grid.draw(high_map$LDheatmapGrob) +dev.off() + + +############################################################################################################# +## For the high linkage region ### +############################################################################################################# + +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/linkage/") + +high<-read.table("inputs/chr1.10-16.highLD_region.SQUARE.ld",header = F) + +# read in a sites table for the matrix +sites<-read.table("inputs/chr1.10-16.highLD_region.sites.tsv",header = F) + +## add sites to the matrix +colnames(high)<-sites$V1 +rownames(high)<-sites$V1 + +## remove NAs +high[high =='NaN']<- NA + +## remove Na columns and rows +high<-high[rowSums(is.na(high)) != ncol(high), ] +high<-high[,colSums(is.na(high)) != nrow(high)] + +## get a list of the snps +snps<-colnames(high) +snps2<-snps[seq(1,length(snps),100)] + +high_ld<-as.matrix(high) +high_map<-LDheatmap(high_ld,add.map = F,color=plasma(100), add.key=FALSE, title=NULL) + +png("./figs/highLD_LG1.png",res=300, width=600, height=600) +grid.newpage() +grid.draw(high_map$LDheatmapGrob) +dev.off() diff --git a/Z_FST_PCA_analysis.R b/Z_FST_PCA_analysis.R new file mode 100644 index 0000000..eff82a0 --- /dev/null +++ b/Z_FST_PCA_analysis.R @@ -0,0 +1,362 @@ +# new general run +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/poolfstat") + +lib<-c("poolfstat","ggplot2", "cowplot", "readr", "tidyr", "stringr", "data.table", "ggridges") +lapply(lib,library,character.only=T) + + +## Read in so we don't have to repeat the above: +dd <- data.frame(fread("./outputs/poolseq_lifted_pairwise_fst.tsv", header = TRUE)) + +## remove NA +dd <- drop_na(dd) + +# PCA with scale and center (same as Zscores) +Z_PCA<-prcomp(dd[,5:10],scale=T,center=T) + +# Add PC scores to dd +dd$Z_PC1<-Z_PCA$x[,1] +dd$Z_PC2<-Z_PCA$x[,2] +dd$Z_PC3<-Z_PCA$x[,3] +dd$chr<-factor(dd$chr,levels=unique(dd$chr)) + +# Summary +summary(Z_PCA) + +# Summarise by chr (PC1) +FST_sum <- dd %>% group_by(chr) %>% dplyr::summarise(Z_FST=mean(Z_PC1)) + +write.table(file = "./outputs/Z_FST_1_SUM_whole_genome.tsv", FST_sum, sep = "\t", quote = FALSE, col.names = TRUE) + + +# Set up bits for plotting (don't need all of this) +dd <- dd %>% + mutate(chr_num = str_extract(chr, "[^r]+$")) %>% + mutate(chr_name = ifelse(str_detect(chr, "chr"), chr, "unplaced_chr0")) %>% + mutate(color = ifelse(chr_num %in% seq(0, 23, 2), "even", "odd")) %>% + mutate(number = 1) %>% # this is a column to be used latter for a running number by cumsum + filter(str_detect(chr,"chr")) %>% + group_by(chr) %>% mutate(rownum = cumsum(number)) + +dd$chr_num <- factor(dd$chr_num, + levels = c("1", "2", "3", "4", "5", "6", "7", "8", + "9", "10", "11", "12", "13", "14", "15", + "16", "17", "18", "19", "20", "21", "22", "23")) + +## plot PCA scores +g1 <- ggplot(dd)+ + geom_density_ridges_gradient(aes(x=`Z_PC1`,y=`chr_num`, group=chr_num, fill = stat(x)), + rel_min_height = 0.01, scale=3, quantile_lines = TRUE, quantiles = 0.5)+ + geom_vline(xintercept = 3, linetype="dashed", alpha=0.8)+ + scale_fill_viridis_c(name = expression(paste("Z-F" [ST], " PC1")), option = "plasma", direction=-1) + + scale_x_continuous(breaks=seq(-2, 10, by = 1), labels=seq(-2, 10,1), limits=c(-2,10)) + + xlab(expression(paste("Z-F" [ST], " PC1")))+ + ylab("Chromosome")+ + theme_classic()+ + theme(axis.text=element_text(size=12), + axis.title.y = element_text(family="Avenir",size=40,vjust=0.5), + legend.position = "none", + legend.title = element_blank(), + legend.text = element_text(family = "Avenir", size = 18), + legend.key.size = unit(0.8, "cm"), + axis.ticks.x = element_line(), + axis.text.y = element_text(family="Avenir", size = 24), + axis.text.x = element_text(family="Avenir", size = 24), + axis.ticks.y = element_blank(), + axis.title = element_text(size=28, family="Avenir")) + +ggsave(g1, width = 40, height = 20, units = "cm", dpi = 400, + file = "~/Dropbox/Sussex_Guppies/Analyses/pool-seq/Figures/Figure_2A_density_ZPC.png") + + +## pull out LG1 +chr1 <- dd %>% filter(chr == "chr1") + +chr1 <- drop_na(chr1) + +# PCA with scale and center (same as Zscores) ## try with no scale +Z_PCA<-prcomp(chr1[,5:10],scale=T,center=T) + +chr1$Z_PC1<-Z_PCA$x[,1] +chr1$Z_PC2<-Z_PCA$x[,2] +chr1$Z_PC3<-Z_PCA$x[,3] + +summary(Z_PCA) + +Z_PCA$rotation + +## set up spline +x=chr1$pos +y=chr1$Z_PC1 +spline=predict(smooth.spline(x,y,spar=0.5),x)$y +chr1$spline_PC1 <- spline + +x=chr1$pos +y=chr1$Z_PC2 +spline=predict(smooth.spline(x,y,spar=0.5),x)$y +chr1$spline_PC2 <- spline + +x=chr1$pos +y=chr1$Z_PC3 +spline=predict(smooth.spline(x,y,spar=0.5),x)$y +chr1$spline_PC3 <- spline + +## Add in shading for regions 1, 2 and 3 where regions have been calculated using changepoint detection +chr1_regions <- read.table("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/chr1_regions/chr1_region_coords.txt", header=T) +chr1_regions <- as_tibble(chr1_regions) + + +LG1_PC1 <- ggplot(chr1)+ + geom_point(aes(x=pos,y=Z_PC1),alpha=0.4, size = 0.1,colour = "#1e272c")+ + geom_line(aes(x=pos,y=spline_PC1),colour="#f4b41a", size = 2.2, alpha=0.8)+ + geom_rect(data=chr1_regions,alpha = 0.3, fill=c("grey", "grey", "grey"), colour = "white", + linetype=1, size = 0, + mapping=aes(xmin=start,xmax=end, + ymin=c(-2,-2,-2), ymax=c(10, 10, 10)))+ + theme_classic()+ + ylab(expression(paste("Z-F" [ST], " PC1")))+ + xlab("LG1 (Mb)")+ + theme(plot.title = element_blank(), + axis.text.x=element_blank(), + axis.title.y=element_text(size=30, family="Avenir"), + axis.title.x=element_blank(), + axis.text.y=element_text(size=18, family = "Avenir"), + panel.border = element_blank(), + panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ + scale_y_continuous(breaks=seq(-2, 10, by = 1), labels=seq(-2,10,1), limits=c(-2,10)) + + scale_x_continuous(breaks=seq(0,34000000, by = 1000000), + labels=seq(0,34, by = 1)) + + +LG1_PC2 <- ggplot(chr1)+ + geom_point(aes(x=pos,y=Z_PC2),alpha=0.4, size = 0.1,colour = "#1e272c")+ + geom_line(aes(x=pos,y=spline_PC2),colour="#f4b41a", size = 2.2, alpha=0.8)+ + geom_rect(data=chr1_regions,alpha = 0.3, fill=c("grey", "grey", "grey"), colour = "white", + linetype=1, size = 0, + mapping=aes(xmin=start,xmax=end, + ymin=c(-6,-6,-6), ymax=c(3, 3, 3)))+ + theme_classic()+ + ylab(expression(paste("Z-F" [ST], " PC2")))+ + xlab("LG1 (Mb)")+ + theme(plot.title = element_blank(), + axis.text.x=element_blank(), + axis.title.y=element_text(size=30, family="Avenir"), + axis.title.x=element_blank(), + axis.text.y=element_text(size=18, family = "Avenir"), + panel.border = element_blank(), + panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ + scale_y_continuous(breaks=seq(-6, 3, by = 1), labels=seq(-6,3,1), limits=c(-6,3)) + + scale_x_continuous(breaks=seq(0,34000000, by = 1000000), + labels=seq(0,34, by = 1)) + + +LG1_PC3 <- ggplot(chr1)+ + geom_point(aes(x=pos,y=Z_PC3),alpha=0.4, size = 0.1,colour = "#1e272c")+ + geom_line(aes(x=pos,y=spline_PC3),colour="#f4b41a", size = 2.2, alpha=0.8)+ + geom_rect(data=chr1_regions,alpha = 0.3, fill=c("grey", "grey", "grey"), colour = "white", + linetype=1, size = 0, + mapping=aes(xmin=start,xmax=end, + ymin=c(-7,-7,-7), ymax=c(5, 5, 5)))+ + theme_classic()+ + ylab(expression(paste("Z-F" [ST], " PC3")))+ + xlab("LG1 (Mb)")+ + theme(plot.title = element_blank(), + axis.text.x=element_text(family = "Avenir", size=18), + axis.title.y=element_text(size=30, family="Avenir"), + axis.title.x=element_blank(), + axis.text.y=element_text(size=18, family = "Avenir"), + panel.border = element_blank(), + panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ + scale_y_continuous(breaks=seq(-7, 5, by = 1), labels=seq(-7,5,1), limits=c(-7,5)) + + scale_x_continuous(breaks=seq(0,34000000, by = 1000000), + labels=seq(0,34, by = 1)) + + +LG1_PC2_PC3 <- cowplot::plot_grid(LG1_PC2,LG1_PC3, ncol=1) + +ggsave("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/Figures/PC2_PC3_LG1_loadings.png", LG1_PC2_PC3 , width = 40, height = 20, units = "cm", dpi = 400) + + +################################################ +# pull out LG12 +################################################ +chr12 <- dd %>% filter(chr == "chr12") +chr12 <- drop_na(chr12) + +# PCA with scale and center (same as Zscores) ## try with no scale +Z_PCA<-prcomp(chr12[,5:10],scale=T,center=T) + +chr12$Z_PC1<-Z_PCA$x[,1] +chr12$Z_PC2<-Z_PCA$x[,2] +chr12$Z_PC3<-Z_PCA$x[,3] + +summary(Z_PCA) + +Z_PCA$rotation + +## set up spline +x=chr12$pos +y=chr12$Z_PC1 +spline=predict(smooth.spline(x,y,spar=0.5),x)$y +chr12$spline_PC1 <- spline + +## set up spline +x=chr12$pos +y=chr12$Z_PC2 +spline=predict(smooth.spline(x,y,spar=0.5),x)$y +chr12$spline_PC2 <- spline + +## set up spline +x=chr12$pos +y=chr12$Z_PC3 +spline=predict(smooth.spline(x,y,spar=0.5),x)$y +chr12$spline_PC3 <- spline + + +LG12_PC1 <- ggplot(chr12,aes(x=pos,y=spline_PC1))+ + geom_point(aes(y=Z_PC1),alpha=0.4, size = 0.1,colour = "#1e272c")+ + geom_line(colour="#f4b41a", size = 2.2, alpha=0.8)+ + theme_classic()+ + ylab(expression(paste("Z-F" [ST], " PC2")))+ + xlab("LG12 (Mb)")+ + theme(plot.title = element_blank(), + axis.text.x=element_blank(), + axis.title.y=element_text(size=30, family="Avenir"), + axis.title.x=element_blank(), + axis.text.y=element_text(size=18, family = "Avenir"), + panel.border = element_blank(), + panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ + scale_y_continuous(breaks=seq(-2, 10, by = 1), labels=seq(-2,10,1), limits=c(-2,10)) + + scale_x_continuous(breaks=seq(0,34000000, by = 1000000), + labels=seq(0,34, by = 1)) + +LG12_PC2 <- ggplot(chr12,aes(x=pos,y=spline_PC2))+ + geom_point(aes(y=Z_PC2),alpha=0.4, size = 0.1,colour = "#1e272c")+ + geom_line(colour="#f4b41a", size = 2.2, alpha=0.8)+ + theme_classic()+ + ylab(expression(paste("Z-F" [ST], " PC2")))+ + xlab("LG12 (Mb)")+ + theme(plot.title = element_blank(), + axis.text.x=element_blank(), + axis.title.y=element_text(size=30, family="Avenir"), + axis.title.x=element_blank(), + axis.text.y=element_text(size=18, family = "Avenir"), + panel.border = element_blank(), + panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ + scale_y_continuous(breaks=seq(-3, 5, by = 1), labels=seq(-3,5,1), limits=c(-3,5)) + + scale_x_continuous(breaks=seq(0,34000000, by = 1000000), + labels=seq(0,34, by = 1)) + +LG12_PC3 <- ggplot(chr12,aes(x=pos,y=spline_PC3))+ + geom_point(aes(y=Z_PC3),alpha=0.4, size = 0.1,colour = "#1e272c")+ + geom_line(colour="#f4b41a", size = 2.2, alpha=0.8)+ + theme_classic()+ + ylab(expression(paste("Z-F" [ST], " PC3")))+ + xlab("LG12 (Mb)")+ + theme(plot.title = element_blank(), + axis.text.x=element_text(size=14, family="Avenir"), + axis.title.y=element_text(size=30, family="Avenir"), + axis.title.x=element_blank(), + axis.text.y=element_text(size=18, family = "Avenir"), + panel.border = element_blank(), + panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ + scale_y_continuous(breaks=seq(-6, 2, by = 1), labels=seq(-6,2,1), limits=c(-6,2)) + + scale_x_continuous(breaks=seq(0,34000000, by = 1000000), + labels=seq(0,34, by = 1)) + +LG12_PC2_PC3 <- cowplot::plot_grid(LG12_PC2,LG12_PC3, ncol=1) + + +ggsave("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/Figures/PC2_PC3_LG12_loadings.png", LG12_PC2_PC3 , width = 40, height = 20, units = "cm", dpi = 400) + +############################################################ +####### Calculate alphas for sig SNPs ##################### +############################################################ + +## calculate alpha PC1 +my_threshold_lower <- quantile(dd$Z_PC1, 0.05) +my_threshold_upper <- quantile(dd$Z_PC1, 0.95) + +## use the upper 95%, 2.94, rounded to 3 as your top limit + +PC1 <- dd %>% mutate(PC1_outlier = ifelse(Z_PC1 > 3, "outlier", "no")) %>% + dplyr::filter(PC1_outlier == "outlier") + +## count per chromosomes: +PC1_sum <- PC1 %>% group_by(chr) %>% dplyr::summarise(Z_FST=dplyr::count(PC1_outlier)) + +## count per chromosomes: +PC1_sum <- PC1 %>% group_by(chr) %>% count(Z_FST=(PC1_outlier)) + +write.table(file = "./outputs/PC1_loadings_per_chr_scaff_new.tsv", PC1_sum, sep = "\t", quote = FALSE, col.names = TRUE) + +## Get sum of total outliers: +tot <- sum(PC1_sum$n) # 190259 + +options(scipen = 999) +PC1_sum <- PC1_sum %>% mutate(perct_r=round((n/tot)*100)) + +write.table(file = "./outputs/PC1_loadings_per_chr_scaff_new.tsv", PC1_sum, sep = "\t", quote = FALSE, col.names = TRUE) + +### Read in table with each percentage and expected + +dd <- read.table(file="./outputs/PC_zscores_percentage_chrs.txt", header = TRUE, sep="\t") + +## lock in chr order: +dd$chr <- factor(dd$chr, levels = dd$chr) + +dd2 <- tidyr::gather(dd,key = type, value = PC,PC1:PC3) + +ggplot(dd2)+ + geom_col(aes(x=chr,y=PC, fill = type), position = "dodge") + + +### Make a plot with all the scaffolds and chrs: + +dd <- read.table("LGs_scaffolds_outilers.txt", header=TRUE, sep="\t") + +chr <- dd %>% filter(grepl("LG",Chromosome)) + +# lock in factor level order +chr$Chromosome <- factor(chr$Chromosome, levels = chr$Chromosome) + + +## chrs: +chrs <- ggplot(chr)+ + geom_col(aes(x=Chromosome,y=Proportion.of.high.Z.FST.PC1.above.3))+ + ylab(expression(paste("% SNPs > critical ", alpha)))+ + xlab("Chromosome")+ + theme_bw()+ + theme(axis.text.x=element_text(family="Avenir", size=12), + axis.text.y=element_text(family="Avenir", size=16), + axis.title = element_text(family = "Avenir", size = 20), + panel.grid.minor = element_blank(), + panel.grid.major.x = element_blank())+ + scale_y_continuous(breaks = seq(0,30, by=1), labels = seq(0,30, by=1)) + +ggsave("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/Figures/chromosomes_proportion_alpha.png", chrs, width = 40, height = 20, units = "cm", dpi = 400) + + +scaff <- dd %>% filter(!grepl("LG",Chromosome)) + +# lock in factor level order +chr$Chromosome <- factor(chr$Chromosome, levels = chr$Chromosome) + +## scafs: +scaf <- ggplot(scaff)+ + geom_col(aes(x=Chromosome,y=Proportion.of.high.Z.FST.PC1.above.3))+ + ylab(expression(paste("% SNPs > critical ", alpha)))+ + xlab("Unplaced genomic scaffolds")+ + theme_bw()+ + theme(axis.text.x=element_text(family="Avenir", size=4, angle = 90), + axis.text.y=element_text(family="Avenir", size=16), + axis.title = element_text(family = "Avenir", size = 20), + panel.grid.minor = element_blank(), + panel.grid.major.x = element_blank())+ + scale_y_continuous(breaks = seq(0,1, by=0.1), labels = seq(0,1, by=0.1)) + +ggsave("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/Figures/scaffolds_proportion_alpha.png", scaf, width = 40, height = 20, units = "cm", dpi = 400) diff --git a/Z_pi_PCA_analysis.R b/Z_pi_PCA_analysis.R new file mode 100644 index 0000000..b40feac --- /dev/null +++ b/Z_pi_PCA_analysis.R @@ -0,0 +1,183 @@ +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +lib<-as.vector(c("vcfR","pbapply","PopGenome","cowplot","gridExtra","viridis","grid","data.table","ggplot2","parallel","dplyr","tidyr", "stringr")) +lapply(lib,library,character.only=TRUE) + +source("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/R_scripts/pool_pi.R") + +######################################### +#### LG12 ##### +########################################## + +vcf_path <- "~/Dropbox/Sussex_Guppies/Analyses/pool-seq/vcf_files/chr1_variant_filtered.recode.vcf.gz" +vcf_in <- read.vcfR(vcf_path) + +# Usage +chr1_pi <- pool_pi(pool_vcf = vcf_in,wind_size=10000,n_cores=2,min_snp = 10) +chr1_pi <- na.omit(chr1_pi) + +# Keep only pi in all pools +wind_counts <- table(chr1_pi$start) +to_keep <- names(wind_counts[wind_counts==4]) +# Make a new dd +pca_dd_LG1 <- data.frame(IF10=chr1_pi[chr1_pi$pool=="IF10" & chr1_pi$start %in% to_keep,"pi"], + IF9=chr1_pi[chr1_pi$pool=="IF9" & chr1_pi$start %in% to_keep,"pi"], + IF8=chr1_pi[chr1_pi$pool=="IF8" & chr1_pi$start %in% to_keep,"pi"], + IF6=chr1_pi[chr1_pi$pool=="IF6" & chr1_pi$start %in% to_keep,"pi"]) + +pi_pca_LG1 <- prcomp(pca_dd_LG1,scale. = T,center = T) +summary(pi_pca_LG1) +pi_pca_LG1 + +# Scores +LG1_scores <- data.frame(pi_pca_LG1$x) +LG1_scores$start <- as.integer(to_keep) + +PC1 <- ggplot(LG1_scores,aes(y=PC1,x=start))+ + geom_line()+ + geom_point(data=LG1_PC1,aes(x=start,y=10), colour="red")+ + ylab(expression(paste("Z-",pi," PC1 (74%)")))+ + xlab("LG1 Position (Mb)")+ + theme_classic()+ + theme(plot.title = element_text(family = "Avenir", size=20), + axis.text.x=element_text(family = "Avenir", size=18), + axis.title.y=element_text(size=30, family="Avenir"), + axis.title.x=element_text(size=30, family = "Avenir"), + axis.text.y=element_text(size=18, family = "Avenir"), + panel.border = element_blank(), + panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ + scale_y_continuous(breaks=seq(-4, 10, by = 1), labels=seq(-4, 10,1), limits=c(-4,10)) + + scale_x_continuous(breaks=seq(0,34000000, by = 1000000), + labels=seq(0,34, by = 1)) + +## save +ggsave("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/Figures/z-pi_PC1_LG1.png", PC1, width = 40, height = 20, units = "cm", dpi = 400) + + +## calculate alpha PC1 +my_threshold_lower <- quantile(LG1_scores$PC1, 0.01) # 5% = -2.419399 +my_threshold_upper <- quantile(LG1_scores$PC1, 0.99) #95% = 2.821794 + +## use the upper 95%, i.e. 3 as your top limit + +LG1_PC1 <- LG1_scores %>% mutate(PC1_outlier = ifelse(PC1 > my_threshold_upper, "outlier", "no")) %>% + dplyr::filter(PC1_outlier == "outlier") + + +######################################### +#### LG12 ##### +########################################## + +vcf_path <- "~/Dropbox/Sussex_Guppies/Analyses/pool-seq/vcf_files/chr12_variant_filtered_STAR_updated.vcf.gz" +vcf_in <- read.vcfR(vcf_path) + +# Usage eg +chr12_pi <- pool_pi(pool_vcf = vcf_in,wind_size=10000,n_cores=4,min_snp = 1) +chr12_pi <- na.omit(chr12_pi) + +# Keep only pi in all pools +wind_counts <- table(chr12_pi$start) +to_keep <- names(wind_counts[wind_counts==4]) +# Make a new dd +pca_dd_LG12 <- data.frame(IF10=chr12_pi[chr12_pi$pool=="IF10" & chr12_pi$start %in% to_keep,"pi"], + IF9=chr12_pi[chr12_pi$pool=="IF9" & chr12_pi$start %in% to_keep,"pi"], + IF8=chr12_pi[chr12_pi$pool=="IF8" & chr12_pi$start %in% to_keep,"pi"], + IF6=chr12_pi[chr12_pi$pool=="IF6" & chr12_pi$start %in% to_keep,"pi"]) + +pi_pca_LG12 <- prcomp(pca_dd_LG12,scale. = T,center = T) +summary(pi_pca_LG12) +pi_pca_LG12 + +# Scores +LG12_scores <- data.frame(pi_pca_LG12$x) +LG12_scores$start <- as.integer(to_keep) + + +## top PC2 in positive direction +head(LG12_scores[order(-LG12_scores$PC2),]) # 24840000 +## top PC2 in negative direction +head(LG12_scores[order(LG12_scores$PC2),]) # 24280000 + +## top PC1 in positive direction +head(LG12_scores[order(-LG12_scores$PC1),]) # 24270000, also 6180000 +## top PC1 in negative direction +head(LG12_scores[order(LG12_scores$PC1),]) # 880000 + +## calculate alpha PC1 +my_threshold_lower <- quantile(LG12_scores$PC1, 0.01) # 1% = -3.361235 +my_threshold_upper <- quantile(LG12_scores$PC1, 0.99) #99% = 4.583852 + +LG12_PC1 <- LG12_scores %>% mutate(PC1_outlier = ifelse(PC1 > my_threshold_upper, "outlier", "no")) %>% + dplyr::filter(PC1_outlier == "outlier") + +## calculate alpha PC2 +my_threshold_lower <- quantile(LG12_scores$PC2, 0.01) # 1% = -1.356175 +my_threshold_upper <- quantile(LG12_scores$PC2, 0.99) #99% = 1.308372 + +LG12_PC2 <- LG12_scores %>% mutate(PC2_outlier = ifelse(PC2 < my_threshold_lower, "outlier", "no")) %>% + dplyr::filter(PC2_outlier == "outlier") + +# Plot the scores - tells us general diversity (PC1), and IF9 diversity (PC2) + + +PC1 <- ggplot(LG12_scores,aes(y=PC1,x=start))+ + geom_line()+ + ylab(expression(paste("Z-",pi," PC1 (88%)")))+ + xlab("LG12 Position (Mb)")+ + geom_point(data=LG12_PC1,aes(x=start, y=10.5), colour="red")+ + theme_classic()+ + theme(plot.title = element_text(family = "Avenir", size=20), + axis.text.x=element_text(family = "Avenir", size=18), + axis.title.y=element_text(size=30, family="Avenir"), + axis.title.x=element_blank(), + axis.text.y=element_text(size=18, family = "Avenir"), + panel.border = element_blank(), + panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ + scale_y_continuous(breaks=seq(-4, 11, by = 1), labels=seq(-4, 11,1), limits=c(-4,11)) + + scale_x_continuous(breaks=seq(0,34000000, by = 1000000), + labels=seq(0,34, by = 1)) + + +PC2 <- ggplot(LG12_scores,aes(y=PC2,x=start))+ + geom_line()+ + ylab(expression(paste("Z-",pi," PC2 (7%)")))+ + xlab("LG12 Position (Mb)")+ + geom_point(data=LG12_PC2,aes(x=start, y=-3.5), colour="red")+ + theme_classic()+ + theme(plot.title = element_text(family = "Avenir", size=20), + axis.text.x=element_text(family = "Avenir", size=18), + axis.title.y=element_text(size=30, family="Avenir"), + axis.title.x=element_text(size=30, family = "Avenir"), + axis.text.y=element_text(size=18, family = "Avenir"), + panel.border = element_blank(), + panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ + scale_y_continuous(breaks=seq(-3, 3, by = 1), labels=seq(-3, 3,1), limits=c(-4,3)) + + scale_x_continuous(breaks=seq(0,34000000, by = 1000000), + labels=seq(0,34, by = 1)) + + +PC1_PC2 <- cowplot::plot_grid(PC1,PC2, ncol=1) + +## save +ggsave("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/Figures/z-pi_PC1_PC2LG12.png", PC1_PC2, width = 40, height = 20, units = "cm", dpi = 400) + +cowplot::plot_grid( + ggplot(scores,aes(y=PC1,x=start))+geom_line()+ggtitle("PC1 (87.79%) - General Pi"), + ggplot(scores,aes(y=PC2,x=start))+geom_line()+ggtitle("PC2 (7.05%) - IF9 Pi"), + ncol=1) + + + +######################## +# If you want to look at the Z-transformation analysis as well.. +# Get PC1 +pi_PC1 <- pi_pca$x[,1] +IF9_piZ <- scale(pca_dd$IF9) +plot_dd <- data.frame(start=as.integer(to_keep), + IF9_diff=scale(pca_dd$IF9)-pi_PC1, + IF10_diff=scale(pca_dd$IF10)-pi_PC1, + IF8_diff=scale(pca_dd$IF8)-pi_PC1, + IF6_diff=scale(pca_dd$IF6)-pi_PC1) +ggplot(plot_dd,aes(start,IF9_diff))+geom_line() diff --git a/chr12_liftover.R b/chr12_liftover.R new file mode 100644 index 0000000..a410e12 --- /dev/null +++ b/chr12_liftover.R @@ -0,0 +1,92 @@ +### chromosome 12 liftover +# new general run +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +source("~/Dropbox/Sussex_Guppies/Analyses/update_chr12_liftover.R") + +lib<-c("data.table","vcfR","tidyverse","parallel","ggplot2") +lapply(lib,library,character.only=T) + +############################################################################################ +#### poolseq data ###### +############################################################################################ + +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/vcf_files/") +vcf_file <- "poolseq_variant_filtered.vcf.gz" + +vcf<-read.vcfR(vcf_file) +meta<-data.frame(vcf@fix) + +# Run the liftover function for the new chr12 STAR positions: +meta[meta$CHROM == "chr12","POS"]<-update_STAR(as.integer(meta[meta$CHROM == "chr12","POS"])) +# Replace metadata +vcf@fix<-as.matrix(meta) + +# Write new chr12 VCF +write.vcf(file = "poolseq_variant_filtered_update_STAR.vcf.gz", vcf) + +## need to sort the vcf file afterwards using: +# bcftools sort poolseq_variant_filtered_update_STAR.vcf.gz > poolseq_variant_filtered_update_STAR.sorted.vcf.gz +# Then rename to poolseq_variant_filtered_update_STAR.vcf.gz + + +############################################################################################ +#### natural data ###### +############################################################################################ + +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/natural_data") +vcf_file <- "chr12.nat_data.recode.vcf.gz" + +vcf<-read.vcfR(vcf_file) +meta<-data.frame(vcf@fix) + +# Run the liftover function for the new chr12 STAR positions: +meta[meta$CHROM == "chr12","POS"]<-update_STAR(as.integer(meta[meta$CHROM == "chr12","POS"])) +# Replace metadata +vcf@fix<-as.matrix(meta) + +# Write new chr12 VCF +write.vcf(file = "chr12_update_STAR_natural.vcf.gz", vcf) + +## Sort vcf file using code above + +######################################################################################################################### + +## liftover for annotation file +######################################################################################################################### + +setwd("/Users/jrp228/Dropbox/Sussex_Guppies/Analyses/pool-seq/chr12_interest_examination") +gff_file <- read_tsv("chr12_genes_OHR95_validated_gene_annotations.gff3", col_names = TRUE) + +gff <- as.data.frame(gff_file) + +gff$start <- as.character(gff$start) +gff$end <- as.character(gff$end) + +# Run the liftover function for the new chr12 STAR positions: +gff[gff$seqID == "chr12","start"]<-update_STAR(as.integer(gff[gff$seqID == "chr12","start"])) +gff[gff$seqID == "chr12","end"]<-update_STAR(as.integer(gff[gff$seqID == "chr12","end"])) + + +# Find rows that have been inverted +inverted <- which(gff$start > gff$end) +# Save the start pos somewhere +new_start <- gff$start[inverted] +# Now replace the end and start +gff$start[inverted] <- gff$end[inverted] +gff$end[inverted] <- new_start +# And finally fix the strands +inverted_strands <- gff$strand[inverted] +new_strands <- sapply(inverted_strands,function(x){ifelse(x == "+",return("-"),return("+"))}) +gff$strand[inverted] <- new_strands + +## now sort based on start position: +gff$start <- as.numeric(gff$start) +gff$end <- as.numeric(gff$end) + +gff <- gff[order(gff$start),] + +## write the new gff: +write.table(gff, "chr12_genes_OHR95_validated_gene_annotations_STAR_liftover.gff3", sep ="\t", quote=F, row.names = F) \ No newline at end of file diff --git a/lostruct_localPCA.R b/lostruct_localPCA.R new file mode 100644 index 0000000..d818e74 --- /dev/null +++ b/lostruct_localPCA.R @@ -0,0 +1,463 @@ +########################################################################################## +# Script runs local PCA across a chromosomal PCA to infer windows of shared ancestry + +# Version 2 of this script runs off a whole VCF and subsets in-line +######################################################################################### + +#install.packages("ggpubr") +#install.packages("data.table") +#devtools::install_github("petrelharp/local_pca/lostruct") +#library(lostruct) + +# new general run +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +# Set working directory +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/lostruct") + +## Load packages +lib = c("parallel","data.table","tidyr","dplyr","ggplot2","lostruct","ggpubr","vcfR", "gridExtra", "extrafont") +lapply(lib, library, character.only=TRUE) + +# Define various groupings, can either be split into males and females, or all +female_vec<-c("Paria_F12", "Paria_F16", "Paria_F2", "Paria_F2_RNA", "Paria_F8", "UM10", "UM11", "UM12", "UM13", "UM14", "UM15", "UM16", "UM17", "UM18", "UM8", "UM9") +male_vec<-c("Paria_M12", "Paria_M2", "Paria_M5", "Paria_M9", "UM1", "UM2", "UM3", "UM5", "UM6", "UM7") + + +# Get parameters from commandline to find outputs + +# This is the name of the original vcf +vcf<- "~/Dropbox/Sussex_Guppies/Analyses/pool-seq/natural_data/chr1.nat_data_39606.DEFAULT.vcf.gz" +## Use both 10 and 100bp windows +window_N<-10 +chr<-"chr1" +pops<-c("Paria", "UM") +## change output depending on whether or not you're doing 10bp or 100bp windows +output<-"chr1_nat_data_10bp" + +# Make the input +system(paste0("bcftools view -r ",chr," ",vcf," > localPCA_tmp.vcf"),wait=TRUE) + +# Read in VCF in popGenome for info on BP locations for unphased vcf +vcf_PG<-read.vcfR("localPCA_tmp.vcf") +BP1<-as.integer(vcf_PG@fix[,2]) +window_BP<-BP1[seq((window_N/2),length(BP1),by=window_N)] +inds_vec<-colnames(vcf_PG@gt)[2:length(colnames(vcf_PG@gt))] + +# Filter for inds we want +#inds_to_keep<-unlist(lapply(pops,function(x){ +# inds2<-inds_vec[inds_vec %in% male_vec] +#return(inds2[grep(x,inds2)]) +#})) + + +# vcf_PG@gt<-vcf_PG@gt[,c(colnames(vcf_PG@gt)[1],inds_to_keep)] +# Get males and female columns +#females<-match(female_vec,inds_vec)[is.na(match(female_vec,inds_vec))==FALSE] +#males<-setdiff(1:length(inds_vec),females) + +# Return logical vector for polymorphic sites +invariant_filter<-is.polymorphic(vcf_PG,na.omit=T) + +# Read in VCF for localPCA +vcf_in<-read_vcf("localPCA_tmp.vcf") + +# And tidy up... +system("rm -f localPCA_tmp.vcf") + +# Filter for invariants and individuals +#vcf_in<-vcf_in[invariant_filter,(inds_vec %in% male_vec)] +BP1<-BP1[invariant_filter] +window_BP<-BP1[seq((window_N/2),length(BP1),by=window_N)] + +# Read in eigen_windows, win=number of rows of matrix, k = number of eigenvector/value pairs to return, mc.cores compatible but does seem to crash sometimes with mc.cores +eigenstuff <- eigen_windows(vcf_in, win=window_N, k=3) + +# Calculate the distances, npc=N of PCs computed, mc.core compatible +windist <- pc_dist(eigenstuff, npc=3,mc.cores=detectCores()-1) + +# Remove any windows for which we are getting NA values +to_keep_logical<-rowSums(is.na(windist)) != ncol(windist) +to_keep<-(1:ncol(windist))[to_keep_logical] +windist2<-windist[to_keep,to_keep] + +# Principal Coordinates Analysis, k = dimension of space for data to represented in, eig = return eigenvalues +fit2d <- cmdscale(windist2, eig=TRUE, k=3) + +eigenvals <- fit2d$eig/sum(fit2d$eig) +## check out sig eigvalues +eigenvals_10 <- eigenvals[eigenvals > 0.01] ## first 3 MDS + +# Plot along a chromosome +plot_dd<-data.frame(mds1=fit2d$points[,1], + mds2=fit2d$points[,2], + mds3=fit2d$points[,3], + window_N=1:length(fit2d$points[,1]), + window_pos=window_BP[to_keep]) + + +# Calculate cut-offs for outliers by trimming distributions and taking 3*SD of trimmed distribution +trim_mds1<-plot_dd[plot_dd$mds1 > quantile(plot_dd$mds1,probs=0.05) & plot_dd$mds1 < quantile(plot_dd$mds1,probs=0.95),"mds1"] +mds1_cutoffs<-c(mean(trim_mds1)+3*sd(trim_mds1), + mean(trim_mds1)-3*sd(trim_mds1)) + +trim_mds2<-plot_dd[plot_dd$mds2 > quantile(plot_dd$mds2,probs=0.05) & plot_dd$mds2 < quantile(plot_dd$mds2,probs=0.95),"mds2"] +mds2_cutoffs<-c(mean(trim_mds2)+3*sd(trim_mds2), + mean(trim_mds2)-3*sd(trim_mds2)) + +trim_mds3<-plot_dd[plot_dd$mds3 > quantile(plot_dd$mds3,probs=0.05) & plot_dd$mds3 < quantile(plot_dd$mds3,probs=0.95),"mds3"] +mds3_cutoffs<-c(mean(trim_mds3)+3*sd(trim_mds3), + mean(trim_mds3)-3*sd(trim_mds3)) + +################## +# We also want to return the windows at the extremes of the distribution +################## +# Give window IDs +window_ids<-rep(0) +for(j in 1:length(to_keep)){ + i <- to_keep[j] + window_ids[j]<-paste0(chr,":",((BP1[(i-1)*window_N])+1),"-",BP1[(i)*window_N]) +} +######## This was shortening the vector by filtering again on "to_keep" +plot_dd$window_id <- window_ids +######################################################## + +windows_to_keep1<-unique(c(plot_dd[plot_dd$mds1 < mds1_cutoffs[2],"window_id"], + plot_dd[plot_dd$mds1 > mds1_cutoffs[1],"window_id"])) + +windows_to_keep2<-unique(c(plot_dd[plot_dd$mds2 < mds2_cutoffs[2],"window_id"], + plot_dd[plot_dd$mds2 > mds2_cutoffs[1],"window_id"])) + +windows_to_keep3<-unique(c(plot_dd[plot_dd$mds3 < mds3_cutoffs[2],"window_id"], + plot_dd[plot_dd$mds3 > mds3_cutoffs[1],"window_id"])) + +# Reassign to plot_dd +plot_dd$outlier<-rep("N") +plot_dd[plot_dd$window_id %in% windows_to_keep1,"outlier"]<-"mds1" +plot_dd[plot_dd$window_id %in% windows_to_keep2,"outlier"]<-"mds2" +plot_dd[plot_dd$window_id %in% windows_to_keep3,"outlier"]<-"mds3" + +# Throw in the colour vals +cols1 <- c("mds1" = "#FCD225", "mds2" = "#C92D59", "mds3" = "#300060", "N" = "black", "mds4" = "#a52a2a", "mds5" = "green", "mds6" = "blue") +cols2 <- c("mds2" = "#C92D59", "mds1" = "#FCD225", "mds3" = "#300060", "N" = "black", "mds4" = "#a52a2a", "mds5" = "green", "mds6" = "blue") +cols3 <- c("mds3" = "#300060", "mds2" = "#C92D59", "mds1" = "#FCD225", "N" = "black", "mds4" = "#a52a2a", "mds5" = "green", "mds6" = "blue") +cols4 <- c("mds4" = "#a52a2a", "mds3" = "#300060", "mds2" = "#C92D59", "mds1" = "#FCD225", "N" = "black", "mds5" = "green", "mds6" = "blue") +cols5 <- c("mds5" = "green", "mds4" = "#a52a2a", "mds3" = "#300060", "mds2" = "#C92D59", "mds1" = "#FCD225", "N" = "black", "mds6" = "blue") +cols6 <- c("mds6" = "blue", "mds4" = "#a52a2a", "mds3" = "#300060", "mds2" = "#C92D59", "mds1" = "#FCD225", "N" = "black", "mds6" = "blue") + +plot_dd<-na.omit(plot_dd) + +## p1 is coordinates 1 and 2 +p1<-ggplot(plot_dd,aes(x=mds1,y=mds2, colour=outlier))+ + geom_point(alpha=0.7, size=2)+ + xlab("Coordinate 1")+ + ylab("Coordinate 2")+ + scale_colour_manual(values=cols1)+ + theme_bw()+ + theme(panel.grid = element_blank(), + axis.title = element_text(size=20, family = "Avenir"), + axis.text = element_text(size=18, family = "Avenir"), + legend.position="none", + strip.text = element_text(size=20))+ + geom_hline(yintercept=mds2_cutoffs,linetype="dashed")+ + geom_vline(xintercept=mds1_cutoffs,linetype="dashed") + + +## p1.1 is coordinates 1 and 3 +p1.1<-ggplot(plot_dd,aes(x=mds1,y=mds3, colour=outlier))+ + geom_point(alpha=0.7, size=2)+ + xlab("Coordinate 1")+ + ylab("Coordinate 3")+ + scale_colour_manual(values=cols1)+ + theme_bw()+ + theme(panel.grid = element_blank(), + axis.title = element_text(size=20, family = "Avenir"), + axis.text = element_text(size=18, family = "Avenir"), + legend.position="none", + strip.text = element_text(size=20))+ + geom_hline(yintercept=mds3_cutoffs,linetype="dashed")+ + geom_vline(xintercept=mds1_cutoffs,linetype="dashed") + + + +## p2 is mds1 coordinates along the window position +p2<-ggplot(plot_dd,aes(x=window_pos,y=mds1,colour=outlier))+ + geom_point(alpha=0.7, size=1.5)+ + scale_x_continuous(name="Position (Mb)", + breaks=seq(0,46000000, by = 1000000),expand=c(0,0), + labels=c("0", "1", "2", "3", "4", "5", "6", + "7", "8", "9", "10", "11", "12", "13", "14", + "15", "16", "17", "18", "19", "20", "21", + "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", + "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", + "42", "43", "44", "45", "46"))+ + scale_y_continuous(labels = scales::number_format(accuracy = 0.01))+ + ylab("MDS 1")+ + theme_bw()+ + theme(panel.grid = element_blank(), + axis.title.y=element_text(family="Avenir", size=30, vjust=0.5), + axis.title.x = element_blank(), + axis.text.y=element_text(family = "Avenir", size=20), + axis.text.x = element_blank(), + # strip.text = element_text(size=24), + legend.position="none")+ + scale_colour_manual(values=cols1) + + +## p2.2 is MDS2 coordinates along window position +p2.2<-ggplot(plot_dd,aes(x=window_pos,y=mds2,colour=outlier))+ + geom_point(alpha=0.7, size=1.5)+ + scale_x_continuous(name="Position (Mb)", + breaks=seq(0,46000000, by = 1000000),expand=c(0,0), + labels=c("0", "1", "2", "3", "4", "5", "6", + "7", "8", "9", "10", "11", "12", "13", "14", + "15", "16", "17", "18", "19", "20", "21", + "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", + "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", + "42", "43", "44", "45", "46"))+ + ylab("MDS 2")+ + theme_bw()+ + theme(panel.grid = element_blank(), + axis.title.y=element_text(family="Avenir", size=30, vjust=0.5), + axis.title.x = element_blank(), + axis.text.y=element_text(family = "Avenir", size=20), + axis.text.x = element_blank(), + strip.text = element_text(size=20), + legend.position="none")+ + scale_colour_manual(values=cols2) + + +## p2.3 is MDS3 along window position +p2.3<-ggplot(plot_dd,aes(x=window_pos,y=mds3,colour=outlier))+ + geom_point(alpha=0.7, size=1.5)+ + scale_x_continuous(name="Position (Mb)", + breaks=seq(0,46000000, by = 1000000),expand=c(0,0), + labels=c("0", "1", "2", "3", "4", "5", "6", + "7", "8", "9", "10", "11", "12", "13", "14", + "15", "16", "17", "18", "19", "20", "21", + "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", + "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", + "42", "43", "44", "45", "46"))+ + ylab("MDS 3")+ + theme_bw()+ + theme(panel.grid = element_blank(), + axis.title.y=element_text(family="Avenir", size=30, vjust=0.5), + axis.title.x=element_blank(), + axis.text.y=element_text(family = "Avenir", size=20), + axis.text.x=element_text(family = "Avenir", size=12), + strip.text = element_text(size=20), + legend.position="none")+ + scale_colour_manual(values=cols3) + + +## Try out various plot combinations +plot_1 <- grid.arrange(ggarrange(p1,p1.1,nrow=2), ggarrange(p2,p2.2,p2.3,nrow=3),ncol=2,widths=c(2,3)) + +ggsave(plot_1, width = 40, height = 30, units = "cm", file = "~/Dropbox/Sussex_Guppies/Analyses/pool-seq/Figures/lostruct_LG1_10bp_windows.png") + + +############################### Chromosome 12 ################################### +## new general run +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/lostruct") + +## Load packages +lib = c("parallel","data.table","tidyr","dplyr","ggplot2","lostruct","ggpubr","vcfR", "gridExtra", "extrafont") +lapply(lib, library, character.only=TRUE) + +# Define various groupings, can either be split into males and females, or all +female_vec<-c("Paria_F12", "Paria_F16", "Paria_F2", "Paria_F2_RNA", "Paria_F8", "UM10", "UM11", "UM12", "UM13", "UM14", "UM15", "UM16", "UM17", "UM18", "UM8", "UM9") +male_vec<-c("Paria_M12", "Paria_M2", "Paria_M5", "Paria_M9", "UM1", "UM2", "UM3", "UM5", "UM6", "UM7") + +vcf<- "~/Dropbox/Sussex_Guppies/Analyses/pool-seq/natural_data/chr12.nat_data_31893_updateSTAR.DEFAULT.vcf.gz" +window_N<-10 +chr<-"chr12" +output<-"chr12_nat_data_10bp" + +# Make the input +system(paste0("bcftools view -r ",chr," ",vcf," > localPCA_tmp.vcf"),wait=TRUE) + +# Read in VCF in popGenome for info on BP locations for unphased vcf +vcf_PG<-read.vcfR("localPCA_tmp.vcf") +BP1<-as.integer(vcf_PG@fix[,2]) +window_BP<-BP1[seq((window_N/2),length(BP1),by=window_N)] +inds_vec<-colnames(vcf_PG@gt)[2:length(colnames(vcf_PG@gt))] + +# Filter for inds we want +#inds_to_keep<-unlist(lapply(pops,function(x){ +# inds2<-inds_vec[inds_vec %in% male_vec] +#return(inds2[grep(x,inds2)]) +#})) + + +# vcf_PG@gt<-vcf_PG@gt[,c(colnames(vcf_PG@gt)[1],inds_to_keep)] +# Get males and female columns +#females<-match(female_vec,inds_vec)[is.na(match(female_vec,inds_vec))==FALSE] +#males<-setdiff(1:length(inds_vec),females) + +# Return logical vector for polymorphic sites +invariant_filter<-is.polymorphic(vcf_PG,na.omit=T) + +# Read in VCF for localPCA +vcf_in<-read_vcf("localPCA_tmp.vcf") + +# And tidy up... +system("rm -f localPCA_tmp.vcf") + +# Filter for invariants and individuals +#vcf_in<-vcf_in[invariant_filter,(inds_vec %in% male_vec)] +BP1<-BP1[invariant_filter] +window_BP<-BP1[seq((window_N/2),length(BP1),by=window_N)] + +# Read in eigen_windows, win=number of rows of matrix, k = number of eigenvector/value pairs to return, mc.cores compatible but does seem to crash sometimes with mc.cores +eigenstuff <- eigen_windows(vcf_in, win=window_N, k=3) + +# Calculate the distances, npc=N of PCs computed, mc.core compatible +windist <- pc_dist(eigenstuff, npc=3,mc.cores=detectCores()-1) + +# Remove any windows for which we are getting NA values +to_keep_logical<-rowSums(is.na(windist)) != ncol(windist) +to_keep<-(1:ncol(windist))[to_keep_logical] +windist2<-windist[to_keep,to_keep] + +# Principal Coordinates Analysis, k = dimension of space for data to represented in, eig = return eigenvalues +fit2d <- cmdscale(windist2, eig=TRUE, k=3) + +## Look for drop in eigvals +eigenvals <- fit2d$eig/sum(fit2d$eig) +eigenvals_10 <- eigenvals[eigenvals > 0.01] + +# Plot along a chromosome +plot_dd<-data.frame(mds1=fit2d$points[,1], + mds2=fit2d$points[,2], + mds3=fit2d$points[,3], + window_N=1:length(fit2d$points[,1]), + window_pos=window_BP[to_keep]) + + +# Calculate cut-offs for outliers by trimming distributions and taking 3*SD of trimmed distribution +trim_mds1<-plot_dd[plot_dd$mds1 > quantile(plot_dd$mds1,probs=0.05) & plot_dd$mds1 < quantile(plot_dd$mds1,probs=0.95),"mds1"] +mds1_cutoffs<-c(mean(trim_mds1)+3*sd(trim_mds1), + mean(trim_mds1)-3*sd(trim_mds1)) + +trim_mds2<-plot_dd[plot_dd$mds2 > quantile(plot_dd$mds2,probs=0.05) & plot_dd$mds2 < quantile(plot_dd$mds2,probs=0.95),"mds2"] +mds2_cutoffs<-c(mean(trim_mds2)+3*sd(trim_mds2), + mean(trim_mds2)-3*sd(trim_mds2)) + +trim_mds3<-plot_dd[plot_dd$mds3 > quantile(plot_dd$mds3,probs=0.05) & plot_dd$mds3 < quantile(plot_dd$mds3,probs=0.95),"mds3"] +mds3_cutoffs<-c(mean(trim_mds3)+3*sd(trim_mds3), + mean(trim_mds3)-3*sd(trim_mds3)) + +################## +# We also want to return the windows at the extremes of the distribution +################## + +# Give window IDs +window_ids<-rep(0) +for(j in 1:length(to_keep)){ + i <- to_keep[j] + window_ids[j]<-paste0(chr,":",((BP1[(i-1)*window_N])+1),"-",BP1[(i)*window_N]) +} +######## This was shortening the vector by filtering again on "to_keep" +plot_dd$window_id <- window_ids +######################################################## + +windows_to_keep1<-unique(c(plot_dd[plot_dd$mds1 < mds1_cutoffs[2],"window_id"], + plot_dd[plot_dd$mds1 > mds1_cutoffs[1],"window_id"])) + +windows_to_keep2<-unique(c(plot_dd[plot_dd$mds2 < mds2_cutoffs[2],"window_id"], + plot_dd[plot_dd$mds2 > mds2_cutoffs[1],"window_id"])) + +windows_to_keep3<-unique(c(plot_dd[plot_dd$mds3 < mds3_cutoffs[2],"window_id"], + plot_dd[plot_dd$mds3 > mds3_cutoffs[1],"window_id"])) + +# Reassign to plot_dd +plot_dd$outlier<-rep("N") +plot_dd[plot_dd$window_id %in% windows_to_keep1,"outlier"]<-"mds1" +plot_dd[plot_dd$window_id %in% windows_to_keep2,"outlier"]<-"mds2" +plot_dd[plot_dd$window_id %in% windows_to_keep3,"outlier"]<-"mds3" + +# Throw in colour values +cols1 <- c("mds1" = "#FCD225", "mds2" = "#C92D59", "mds3" = "black", "N" = "black", "mds4" = "#a52a2a", "mds5" = "green", "mds6" = "blue") +cols2 <- c("mds2" = "#C92D59", "mds1" = "#FCD225", "mds3" = "black", "N" = "black", "mds4" = "#a52a2a", "mds5" = "green", "mds6" = "blue") +cols3 <- c("mds3" = "#300060", "mds2" = "#C92D59", "mds1" = "#FCD225", "N" = "black", "mds4" = "#a52a2a", "mds5" = "green", "mds6" = "blue") + +plot_dd<-na.omit(plot_dd) + +## Plotting follows LG1 above +p1<-ggplot(plot_dd,aes(x=mds1,y=mds2, colour=outlier))+ + geom_point(alpha=0.7, size=2)+ + xlab("Coordinate 1")+ + ylab("Coordinate 2")+ + scale_colour_manual(values=cols1)+ + theme_bw()+ + theme(panel.grid = element_blank(), + axis.title = element_text(size=20, family = "Avenir"), + axis.text = element_text(size=18, family = "Avenir"), + legend.position="none", + strip.text = element_text(size=20))+ + geom_hline(yintercept=mds2_cutoffs,linetype="dashed")+ + geom_vline(xintercept=mds1_cutoffs,linetype="dashed") + +p1.1<-ggplot(plot_dd,aes(x=mds1,y=mds3, colour=outlier))+ + geom_point(alpha=0.7, size=2)+ + xlab("Coordinate 1")+ + ylab("Coordinate 3")+ + scale_colour_manual(values=cols1)+ + theme_bw()+ + theme(panel.grid = element_blank(), + axis.title = element_text(size=20, family = "Avenir"), + axis.text = element_text(size=18, family = "Avenir"), + legend.position="none", + strip.text = element_text(size=20))+ + geom_hline(yintercept=mds3_cutoffs,linetype="dashed")+ + geom_vline(xintercept=mds1_cutoffs,linetype="dashed") + +p2<-ggplot(plot_dd,aes(x=window_pos,y=mds1,colour=outlier))+ + geom_point(alpha=0.7, size=1.5)+ + scale_x_continuous(name="Position (Mb)", + breaks=seq(0,46000000, by = 1000000),expand=c(0,0), + labels=c("0", "1", "2", "3", "4", "5", "6", + "7", "8", "9", "10", "11", "12", "13", "14", + "15", "16", "17", "18", "19", "20", "21", + "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", + "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", + "42", "43", "44", "45", "46"))+ + scale_y_continuous(labels = scales::number_format(accuracy = 0.01))+ + ylab("MDS 1")+ + theme_bw()+ + theme(panel.grid = element_blank(), + axis.title.y=element_text(family="Avenir", size=30, vjust =0.5), + axis.title.x = element_blank(), + axis.text.y=element_text(family = "Avenir", size=20), + axis.text.x = element_blank(), + # strip.text = element_text(size=24), + legend.position="none")+ + scale_colour_manual(values=cols1) + +p2.2<-ggplot(plot_dd,aes(x=window_pos,y=mds2,colour=outlier))+ + geom_point(alpha=0.7, size=1.5)+ + scale_x_continuous(name="Position (Mb)", + breaks=seq(0,46000000, by = 1000000),expand=c(0,0), + labels=c("0", "1", "2", "3", "4", "5", "6", + "7", "8", "9", "10", "11", "12", "13", "14", + "15", "16", "17", "18", "19", "20", "21", + "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", + "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", + "42", "43", "44", "45", "46"))+ + scale_y_continuous(labels = scales::number_format(accuracy = 0.01))+ + ylab("MDS 2")+ + theme_bw()+ + theme(panel.grid = element_blank(), + axis.title.y=element_text(family="Avenir", size=30, vjust =0.5), + axis.title.x = element_blank(), + axis.text.y=element_text(family = "Avenir", size=20), + axis.text.x = element_text(family = "Avenir", size=14), + strip.text = element_text(size=20), + legend.position="none")+ + scale_colour_manual(values=cols2) diff --git a/polarise_plot_AFs.R b/polarise_plot_AFs.R new file mode 100644 index 0000000..e80576d --- /dev/null +++ b/polarise_plot_AFs.R @@ -0,0 +1,277 @@ +# new general run +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +lib<-as.vector(c("ggplot2","dplyr","tidyr","stringr","data.table","plyr","viridis")) +lapply(lib,library,character.only=TRUE) + +## working directory +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/geno_plots/") + +######################################### +## LG1 plot ## +######################################### + +## read in the frequencies +chr1 <- data.frame(fread("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/poolfstat/outputs/final/chr1_AFs_final.tsv", header=TRUE)) + +## Polarise to IF9: +## remove pos +tmp <- chr1 %>% dplyr::select(IF10_AF, IF6_AF, IF8_AF, IF9_AF) + +## save the IF9 freqs: +IF9_freqs<-tmp$IF9_AF + +# Polarise to IF9 + +for(i in colnames(tmp)){ + tmp[IF9_freqs < 0.5,i]<-(1-tmp[IF9_freqs < 0.5,i]) +} + +## add IF9 polarised back in back in: +chr1_dd <- cbind(chr1,tmp) + +colnames(chr1_dd) <- c("chr", "pos", "REF", "ALT", "IF10_AF", "IF6_AF", "IF8_AF", "IF9_AF", + "IF10_polar", "IF6_polar", "IF8_polar", "IF9_polar") + + +## take out normal +# chr1 <- chr1_dd %>% select(pos,IF10_AF,IF9_AF,IF8_AF,IF6_AF) + +## take out polarised +chr1 <- chr1_dd %>% dplyr::select(pos,IF10_polar,IF9_polar,IF8_polar,IF6_polar) + +################################# +# Within each pool, create the segments +max_pos<-max(chr1$pos)+10 +long_AFs<-data.frame(rbindlist(lapply(2:5,function(x){ + + # Get the pool + tmp<-data.frame(chr1[,c(1,x)]) + + # Bin the 0s + tmp<-tmp[tmp[,2] >= 0,] + + # Set up xend + tmp$xend<-c(tmp$pos[2:nrow(tmp)],max_pos) + + # Set up y/yend + tmp$y<-x-1.5 + tmp$yend<-x-0.5 + + # Name pool + tmp$pool<-colnames(chr1)[x] + + # Tidy + colnames(tmp)[2]<-"AFs" + return(tmp) +}))) + +## convert AFs to factor for plotting +long_AFs$chr1<-as.factor(long_AFs$AFs) + +# rename for ease +chr1_long_AFs<- long_AFs + +# Plot +LG1_geno <- ggplot(chr1_long_AFs,aes(x=pos,y=y))+ + geom_segment(aes(xend=xend,yend=yend,colour=AFs))+ + ## CP detection + # geom_segment(data = CP,aes(x =x, y = y, xend = xend, yend = yend), colour="black", size = 1)+ + viridis::scale_color_viridis(name = 'AF', option="magma")+ + # ggtitle("IF9 polarised AFs")+ + theme_classic()+ + theme(axis.text.x=element_text(family = "Avenir", size=18), + axis.ticks.x=element_line(), + axis.ticks.y = element_blank(), + axis.text.y=element_text(family = "Avenir", size=20, angle=90, hjust=0.5, + colour=c("#3CBB75", "#481567", "#2D708E", "#B22222")), + axis.title.x=element_text(family="Avenir", size=30, vjust =-2), + axis.title.y=element_text(size=30, family="Avenir"), + legend.title = element_blank(), + # legend.key.size = unit(2,"line"), + # legend.text = element_text("Avenir", size=20), + legend.position = "none")+ + scale_x_continuous(name="LG1 (Mb)", + breaks=seq(0,34000000, by = 1000000), + labels=seq(0,34, by = 1))+ + scale_y_continuous(name = "AFs", + breaks = seq(1,4, by = 1), + labels=c("Iso\nY10", "Iso\nY9", "Iso\nY8", "Iso\nY6")) + +ggsave(width = 30, height = 12, units = "cm", dpi=800, "~/Dropbox/Sussex_Guppies/Analyses/pool-seq/Figures/LG1_AF_plot_Isoy9-polarised.png", LG1_geno) + + +######################################### +## LG12 plot ## +######################################### + +## read in LG12 data: +chr12 <- data.frame(fread("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/poolfstat/outputs/final/chr12_AFs_final.tsv", header=TRUE)) + +## Polarise to IF9: +## remove pos +tmp <- chr12 %>% dplyr::select(IF10_AF, IF6_AF, IF8_AF, IF9_AF) + +## save the IF9 freqs: +IF9_freqs<-tmp$IF9_AF + +## polarise to Iso-Y9 +for(i in colnames(tmp)){ + tmp[IF9_freqs < 0.5,i]<-(1-tmp[IF9_freqs < 0.5,i]) +} + +## add IF9 polarised back in back in: +chr12_dd <- cbind(chr12,tmp) + +colnames(chr12_dd) <- c("chr", "pos", "REF", "ALT", "IF10_AF", "IF6_AF", "IF8_AF", "IF9_AF", + "IF10_polar", "IF6_polar", "IF8_polar", "IF9_polar") + +## take out polarised +chr12 <- chr12_dd %>% dplyr::select(pos,IF10_polar,IF9_polar,IF8_polar,IF6_polar) + +################################# +# create segments +max_pos<-max(chr12$pos)+10 +long_AFs<-data.frame(rbindlist(lapply(2:5,function(x){ + + # Get the pool + tmp<-data.frame(chr12[,c(1,x)]) + + # Bin the 0s + tmp<-tmp[tmp[,2] >= 0,] + + # Set up xend + tmp$xend<-c(tmp$pos[2:nrow(tmp)],max_pos) + + # Set up y/yend + tmp$y<-x-1.5 + tmp$yend<-x-0.5 + + # Name pool + tmp$pool<-colnames(chr12)[x] + + # Tidy + colnames(tmp)[2]<-"AFs" + return(tmp) +}))) + +## convert AFs to factor for plotting +long_AFs$chr12<-as.factor(long_AFs$AFs) + +# rename for plotting +chr12_long_AFs <- long_AFs + + +## plot +LG12_geno <- ggplot(chr12_long_AFs,aes(x=pos,y=y))+ + geom_segment(aes(xend=xend,yend=yend,colour=AFs))+ + viridis::scale_color_viridis(name = 'AF', option="magma")+ + theme_classic()+ + theme(axis.text.x=element_text(family = "Avenir", size=18), + axis.ticks.x=element_line(), + axis.ticks.y = element_blank(), + axis.text.y=element_text(family = "Avenir", size=20, angle=90, hjust=0.5, + colour=c("#3CBB75", "#481567", "#2D708E", "#B22222")), + axis.title.x=element_text(family="Avenir", size=30, vjust =-2), + axis.title.y=element_text(size=30, family="Avenir"), + legend.title = element_blank(), + legend.position = "none")+ + scale_x_continuous(name="LG12 (Mb)", + breaks=seq(0,34000000, by = 1000000), + labels=seq(0,34, by = 1))+ + scale_y_continuous(name = "AFs", + breaks = seq(1,4, by = 1), + labels=c("Iso\nY10", "Iso\nY9", "Iso\nY8", "Iso\nY6")) + +ggsave(width = 30, height = 12, units = "cm", dpi=800, "~/Dropbox/Sussex_Guppies/Analyses/pool-seq/Figures/LG12_AF_plot_Isoy9-polarised.png", LG1_geno) + + +## Can also look at folded AFs, e.g.: + +chr12 <- data.frame(fread("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/poolfstat/outputs/final/chr12_AFs_final.tsv", header=TRUE)) + +## fold AFs +chr12$IF10_folded = ifelse(chr12$IF10_AF < 0.5, abs(chr12$IF10_AF - 1), chr12$IF10_AF ) +chr12$IF6_folded = ifelse(chr12$IF6_AF < 0.5, abs(chr12$IF6_AF - 1), chr12$IF6_AF ) +chr12$IF8_folded = ifelse(chr12$IF8_AF < 0.5, abs(chr12$IF8_AF - 1), chr12$IF8_AF ) +chr12$IF9_folded = ifelse(chr12$IF9_AF < 0.5, abs(chr12$IF9_AF - 1), chr12$IF9_AF ) + + +## Polarise to IF9: +## remove pos +tmp <- chr12 %>% dplyr::select(IF10_AF, IF6_AF, IF8_AF, IF9_AF) + +## save the IF9 freqs: +IF9_freqs<-tmp$IF9_AF + +for(i in colnames(tmp)){ + tmp[IF9_freqs < 0.5,i]<-(1-tmp[IF9_freqs < 0.5,i]) +} + +## add IF9 polarised back in back in: +chr12_dd <- cbind(chr12,tmp) + +colnames(chr12_dd) <- c("chr", "pos", "REF", "ALT", "IF10_AF", "IF6_AF", "IF8_AF", "IF9_AF", + "IF10_folded", "IF6_folded", "IF8_folded", "IF9_folded", + "IF10_polar", "IF6_polar", "IF8_polar", "IF9_polar") + + +## take out folded +chr12 <- chr12_dd %>% dplyr::select(pos,IF10_folded,IF9_folded,IF8_folded,IF6_folded) + + +################################# +# create segments +max_pos<-max(chr12$pos)+10 +long_AFs<-data.frame(rbindlist(lapply(2:5,function(x){ + + # Get the pool + tmp<-data.frame(chr12[,c(1,x)]) + + # Bin the 0s + tmp<-tmp[tmp[,2] >= 0,] + + # Set up xend + tmp$xend<-c(tmp$pos[2:nrow(tmp)],max_pos) + + # Set up y/yend + tmp$y<-x-1.5 + tmp$yend<-x-0.5 + + # Name pool + tmp$pool<-colnames(chr12)[x] + + # Tidy + colnames(tmp)[2]<-"AFs" + return(tmp) +}))) + + +long_AFs$chr12<-as.factor(long_AFs$AFs) +chr12_long_AFs <- long_AFs + + +## plot +LG12_AFs_folded <- ggplot(chr12_long_AFs,aes(x=pos,y=y))+ + geom_segment(aes(xend=xend,yend=yend,colour=AFs))+ + viridis::scale_color_viridis(name = 'AF', option="magma")+ + theme_classic()+ + theme(axis.text.x=element_text(family = "Avenir", size=18), + axis.ticks.x=element_line(), + axis.ticks.y = element_blank(), + axis.text.y=element_text(family = "Avenir", size=20, angle=90, hjust=0.5, + colour=c("#3CBB75", "#481567", "#2D708E", "#B22222")), + axis.title.x=element_text(family="Avenir", size=30, vjust =-2), + axis.title.y=element_text(size=30, family="Avenir"), + legend.title = element_blank(), + legend.position = "none")+ + scale_x_continuous(name="LG12 (Mb)", + breaks=seq(0,34000000, by = 1000000), + labels=seq(0,34, by = 1))+ + scale_y_continuous(name = "AFs", + breaks = seq(1,4, by = 1), + labels=c("Iso\nY10", "Iso\nY9", "Iso\nY8", "Iso\nY6")) + + diff --git a/polarised_AF_densities.R b/polarised_AF_densities.R new file mode 100644 index 0000000..e7eb1d0 --- /dev/null +++ b/polarised_AF_densities.R @@ -0,0 +1,297 @@ +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +lib<-as.vector(c("ggpmisc","readr","tidyverse","datatable","gridExtra","dplyr","tidyr","stringr")) +lapply(lib,library,character.only=TRUE) + +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/polarising_freqs/") + +AF_dd <- data.frame(fread("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/poolfstat/outputs/final/chr1_AFs_final.tsv")) + +############################################################################################################# +## Region 2: 9,627,619 - 17,074,870 bp +############################################################################################################# + +dd2 <- AF_dd %>% dplyr::filter(pos >= 9627619 & pos <= 17074870) + +## remove pos +dd3 <- dd2 %>% dplyr::select(IF10_AF, IF6_AF, IF8_AF, IF9_AF) + +## save the IF9 freqs: +IF9_freqs<-dd3$IF9_AF + +for(i in colnames(dd3)){ + dd3[IF9_freqs < 0.5,i]<-(1-dd3[IF9_freqs < 0.5,i]) +} + +## add pos back in: +dd3 <- add_column(dd3,pos=dd2$pos) + +dd <- dd3 + +## IF6 +densityCurve <- ggplot(dd, aes(x=IF6_AF)) + geom_density() + +# extract the data from the graph +densityCurveData <- ggplot_build(densityCurve) +# get the indices of the local minima +localMins <- which(ggpmisc:::find_peaks(-densityCurveData$data[[1]]$density) == TRUE) + +# get the value of the local minima +localMins <- densityCurveData$data[[1]]$x[localMins] +localMins <- c(-Inf, localMins, +Inf) + +local <- localMins[2:3] + +IF6 <- ggplot(dd, aes(x=IF6_AF))+ + geom_density(fill="#B22222", alpha=0.5, lwd=2) + + geom_vline(xintercept = local, color="grey3", linetype = "dashed", lwd=3)+ + ylab("Density")+ + xlab("AF")+ + ggtitle("Iso-Y6")+ + theme_classic()+ + theme(plot.title = element_text(hjust = 0.5, family="Avenir", size=50), + panel.grid.major.y=element_line(), + panel.grid.minor.y=element_line(), + axis.text = element_text(family="Avenir", size=24), + axis.title = element_text(family="Avenir", size=34)) + +ggsave("./figs/IF6_AF_densities_Region2.png", IF6, width = 15, height = 20, units = "cm", dpi = 400) + + +## IF8 +densityCurve <- ggplot(dd, aes(x=IF8_AF)) + geom_density() + +# extract the data from the graph +densityCurveData <- ggplot_build(densityCurve) +# get the indices of the local minima +localMins <- which(ggpmisc:::find_peaks(-densityCurveData$data[[1]]$density) == TRUE) + +# get the value of the local minima +localMins <- densityCurveData$data[[1]]$x[localMins] +localMins <- c(-Inf, localMins, +Inf) + +IF8 <- ggplot(dd, aes(x=IF8_AF))+ + geom_density(fill="#2d708e", alpha=0.5, lwd=2) + + geom_vline(xintercept = 0.6973848, color="grey3", linetype = "dashed", lwd=3)+ + ylab("Density")+ + xlab("AF")+ + ggtitle("Iso-Y8")+ + theme_classic()+ + theme(plot.title = element_text(hjust = 0.5, family="Avenir", size=50), + panel.grid.major.y=element_line(), + panel.grid.minor.y=element_line(), + axis.text = element_text(family="Avenir", size=24), + axis.title = element_text(family="Avenir", size=34)) + +ggsave("./figs/IF8_AF_densities_Region2.png", IF8, width = 15, height = 20, units = "cm", dpi = 400) + + +## IF10 +densityCurve <- ggplot(dd, aes(x=IF10_AF)) + geom_density() + +# extract the data from the graph +densityCurveData <- ggplot_build(densityCurve) +# get the indices of the local minima +localMins <- which(ggpmisc:::find_peaks(-densityCurveData$data[[1]]$density) == TRUE) + +# get the value of the local minima +localMins <- densityCurveData$data[[1]]$x[localMins] +localMins <- c(-Inf, localMins, +Inf) + +local <- localMins[5] + +IF10 <- ggplot(dd, aes(x=IF10_AF))+ + geom_density(fill="#3cbb75", alpha=0.5, lwd=2) + + geom_vline(xintercept = local, color="grey3", linetype = "dashed", lwd=3)+ + ylab("Density")+ + xlab("AF")+ + ggtitle("Iso-Y10")+ + theme_classic()+ + theme(plot.title = element_text(hjust = 0.5, family="Avenir", size=50), + panel.grid.major.y=element_line(), + panel.grid.minor.y=element_line(), + axis.text = element_text(family="Avenir", size=24), + axis.title = element_text(family="Avenir", size=34)) + +ggsave("./figs/IF10_AF_densities_Region2.png", IF10, width = 15, height = 20, units = "cm", dpi = 400) + +## IF9 +densityCurve <- ggplot(dd, aes(x=IF9_AF)) + geom_density() + +# extract the data from the graph +densityCurveData <- ggplot_build(densityCurve) +# get the indices of the local minima +localMins <- which(ggpmisc:::find_peaks(-densityCurveData$data[[1]]$density) == TRUE) + +# get the value of the local minima +localMins <- densityCurveData$data[[1]]$x[localMins] +localMins <- c(-Inf, localMins, +Inf) + +IF9 <- ggplot(dd, aes(x=IF9_AF))+ + geom_density(fill="#481567", alpha=0.5, lwd=1) + + #geom_vline(xintercept = localMins, color="grey3", linetype = "dashed")+ + ylab("Density")+ + xlab("AF")+ + ggtitle("Iso-Y9")+ + theme_classic()+ + theme(plot.title = element_text(hjust = 0.5, family="Avenir", size=50), + panel.grid.major.y=element_line(), + panel.grid.minor.y=element_line(), + axis.text = element_text(family="Avenir", size=24), + axis.title = element_text(family="Avenir", size=34)) + +ggsave("./figs/IF9_AF_densities_Region2.png", IF9, width = 15, height = 20, units = "cm", dpi = 400) + +## plot together +plots <-list(IF6,IF8,IF9,IF10) + +arranged <- grid.arrange(grobs = plots, nrow = 1) + +ggsave("./figs/All_density_plots_Region2.png", arranged, width = 60, height = 20, units = "cm", dpi = 400) + +############################################################################################################# +## Region 3:21,944,840 - 24,959,750 bp +############################################################################################################# +AF_dd <- data.frame(fread("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/poolfstat/outputs/final/chr1_AFs_final.tsv")) + +dd2 <- AF_dd %>% filter(pos >= 21944840 & pos <= 24959750) + +## remove pos +dd3 <- dd2 %>% dplyr::select(IF10_AF, IF6_AF, IF8_AF, IF9_AF) + +## save the IF9 freqs: +IF9_freqs<-dd3$IF9_AF + +for(i in colnames(dd3)){ + dd3[IF9_freqs < 0.5,i]<-(1-dd3[IF9_freqs < 0.5,i]) +} + +## add pos back in: +dd3 <- add_column(dd3,pos=dd2$pos) + +dd <- dd3 + +## IF6 +densityCurve <- ggplot(dd, aes(x=IF6_AF)) + geom_density() + +# extract the data from the graph +densityCurveData <- ggplot_build(densityCurve) +# get the indices of the local minima +localMins <- which(ggpmisc:::find_peaks(-densityCurveData$data[[1]]$density) == TRUE) + +# get the value of the local minima +localMins <- densityCurveData$data[[1]]$x[localMins] +localMins <- c(-Inf, localMins, +Inf) + +local <- localMins[2:3] + +IF6 <- ggplot(dd, aes(x=IF6_AF))+ + geom_density(fill="#B22222", alpha=0.5, lwd=2) + + geom_vline(xintercept = 0.59, color="grey3", linetype = "dashed", lwd=3)+ + ylab("Density")+ + xlab("Allele Frequency")+ + ggtitle("Iso-Y6")+ + theme_classic()+ + theme(plot.title = element_text(hjust = 0.5, family="Avenir", size=50), + panel.grid.major.y=element_line(), + panel.grid.minor.y=element_line(), + axis.text = element_text(family="Avenir", size=20), + axis.title = element_text(family="Avenir", size=30)) + + +ggsave("./figs/IF6_AF_densities_region3.png", IF6, width = 20, height = 20, units = "cm", dpi = 400) + +## IF8 +densityCurve <- ggplot(dd, aes(x=IF8_AF)) + geom_density() + +# extract the data from the graph +densityCurveData <- ggplot_build(densityCurve) +# get the indices of the local minima +localMins <- which(ggpmisc:::find_peaks(-densityCurveData$data[[1]]$density) == TRUE) + +# get the value of the local minima +localMins <- densityCurveData$data[[1]]$x[localMins] +localMins <- c(-Inf, localMins, +Inf) + +IF8 <- ggplot(dd, aes(x=IF8_AF))+ + geom_density(fill="#2d708e", alpha=0.5, lwd=2) + + # geom_vline(xintercept = localMins, color="grey3", linetype = "dashed", lwd=3)+ + ylab("Density")+ + xlab("Allele Frequency")+ + ggtitle("Iso-Y8")+ + theme_classic()+ + theme(plot.title = element_text(hjust = 0.5, family="Avenir", size=50), + panel.grid.major.y=element_line(), + panel.grid.minor.y=element_line(), + axis.text = element_text(family="Avenir", size=20), + axis.title = element_text(family="Avenir", size=30)) + +ggsave("./figs/IF8_AF_densities_region3.png", IF8, width = 20, height = 20, units = "cm", dpi = 400) + + +## IF10 +densityCurve <- ggplot(dd, aes(x=IF10_AF)) + geom_density() + +# extract the data from the graph +densityCurveData <- ggplot_build(densityCurve) +# get the indices of the local minima +localMins <- which(ggpmisc:::find_peaks(-densityCurveData$data[[1]]$density) == TRUE) + +# get the value of the local minima +localMins <- densityCurveData$data[[1]]$x[localMins] +localMins <- c(-Inf, localMins, +Inf) + +local <- localMins[5] + +IF10 <- ggplot(dd, aes(x=IF10_AF))+ + geom_density(fill="#3cbb75", alpha=0.5, lwd=2) + + # geom_vline(xintercept = local, color="grey3", linetype = "dashed", lwd=3)+ + ylab("Density")+ + xlab("Allele Frequency")+ + ggtitle("Iso-Y10")+ + theme_classic()+ + theme(plot.title = element_text(hjust = 0.5, family="Avenir", size=50), + panel.grid.major.y=element_line(), + panel.grid.minor.y=element_line(), + axis.text = element_text(family="Avenir", size=20), + axis.title = element_text(family="Avenir", size=30)) + +ggsave("./figs/IF10_AF_densities_Region3.png", IF10, width = 20, height = 20, units = "cm", dpi = 400) + + +## IF9 +densityCurve <- ggplot(dd, aes(x=IF9_AF)) + geom_density() + +# extract the data from the graph +densityCurveData <- ggplot_build(densityCurve) +# get the indices of the local minima +localMins <- which(ggpmisc:::find_peaks(-densityCurveData$data[[1]]$density) == TRUE) + +# get the value of the local minima +localMins <- densityCurveData$data[[1]]$x[localMins] +localMins <- c(-Inf, localMins, +Inf) + +IF9 <- ggplot(dd, aes(x=IF9_AF))+ + geom_density(fill="#481567", alpha=0.5, lwd=1) + + #geom_vline(xintercept = localMins, color="grey3", linetype = "dashed")+ + ylab("Density")+ + xlab("Allele Frequency")+ + ggtitle("Iso-Y9")+ + theme_classic()+ + theme(plot.title = element_text(hjust = 0.5, family="Avenir", size=50), + panel.grid.major.y=element_line(), + panel.grid.minor.y=element_line(), + axis.text = element_text(family="Avenir", size=20), + axis.title = element_text(family="Avenir", size=30)) + +ggsave("./figs/IF9_AF_densities_region3.png", IF9, width = 20, height = 20, units = "cm", dpi = 400) + +## plot together +plots <-list(IF6,IF8,IF9,IF10) + +arranged <- grid.arrange(grobs = plots, nrow = 1) + +ggsave("./figs/All_density_plots_region3.png", arranged, width = 60, height = 20, units = "cm", dpi = 400) + diff --git a/polarised_AFs_LG1_regions.R b/polarised_AFs_LG1_regions.R new file mode 100644 index 0000000..f9168d5 --- /dev/null +++ b/polarised_AFs_LG1_regions.R @@ -0,0 +1,308 @@ +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +lib<-as.vector(c("cowplot","data.table","ggplot2","parallel","dplyr","tidyr", "stringr")) +lapply(lib,library,character.only=TRUE) + +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/polarising_freqs/") + +AF_dd <- data.frame(fread("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/poolfstat/outputs/final/chr1_AFs_final.tsv")) + +## read in CPD regions file +chr1_regions <- read.table("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/chr1_regions/chr1_region_coords.txt", header=T) +chr1_regions <- as_tibble(chr1_regions) + +## extract region 1 (IF6 fixed) from frequencies: +dd2 <- AF_dd %>% filter(pos >= 4079988 & pos <= 5984584) + +## remove pos +dd3 <- dd2 %>% dplyr::select(IF10_AF, IF6_AF, IF8_AF, IF9_AF) + +## save the IF6 freqs and polarise to this +IF6_freqs<-dd3$IF6_AF + +for(i in colnames(dd3)){ + dd3[IF6_freqs < 0.5,i]<-(1-dd3[IF6_freqs < 0.5,i]) +} + +## add pos back in: +dd3 <- add_column(dd3,pos=dd2$pos) + +## make long +dd3 <- dd3 %>% gather(pool, AF,IF10_AF:IF9_AF) + +## read in gtf for gene locations +genes <- read.table("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/gene_density/chr1_OHR95_genes.tsv") +chr1_genes <- genes[genes$V1 == "chr1",] +chr1_genes <- as_tibble(chr1_genes) +region1_genes <- filter(chr1_genes, V4 >= 4079988 & V5 <= 5984584) +region1_genes <- add_column(region1_genes, y = 1.02) + + +## plot region 1: +af_plot_region1 <- ggplot(dd3)+ + geom_jitter(aes(x=pos, y=AF, colour=pool), size=1.5, alpha=0.2)+ + geom_rug(data = region1_genes, aes(V4), sides = "top", length = unit(0.06, "npc"), size=0.5)+ + theme_bw()+ + theme(legend.position = "none", + axis.text.x = element_text(family="Avenir",size=20), + axis.title.x = element_text(family="Avenir", size=30), + axis.ticks.x = element_line(), + axis.line.x = element_blank(), + axis.line.y = element_blank(), + axis.text.y = element_text(family = "Avenir", size = 20), + axis.title.y = element_text(family = "Avenir", size = 30), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_rect(colour = "black", fill=NA, size=1.4))+ + scale_colour_manual(values = c("#3CBB75", "#B22222", "#2D708E", "#481567"))+ + scale_y_continuous(name = "Allele Frequency", + breaks=seq(0,1, by=0.1), + limits =c(0,1.02))+ + scale_x_continuous(name="LG1 Position (Mb)", + breaks=seq(4000000,6000000, by = 500000), + labels=c("4.0", "4.5", "5.0", "5.5", "6.0")) + + +ggsave("./figs/whole_region1_polarised.png", af_plot_region1, width = 20, height = 20, units = "cm", dpi = 400) + +############################################################################################## +### pull out individual AFs for each line for region 2: +head(AF_dd) + +## and now the same for region 2: +## extract region 1 (IF9 fixed) from frequencies: +dd2 <- AF_dd %>% filter(pos >= 9627619 & pos <= 17074870) + +## remove pos +dd3 <- dd2 %>% dplyr::select(IF10_AF, IF6_AF, IF8_AF, IF9_AF) + +## save the IF9 freqs and polarise to this: +IF9_freqs<-dd3$IF9_AF + +for(i in colnames(dd3)){ + dd3[IF9_freqs < 0.5,i]<-(1-dd3[IF9_freqs < 0.5,i]) +} + +## add pos back in: +dd3 <- add_column(dd3,pos=dd2$pos) + +## make long +dd3 <- dd3 %>% gather(pool, AF,IF10_AF:IF9_AF) + +## genes +region2_genes <- filter(chr1_genes, V4 >= 9627619 & V5 <= 17074870) +region2_genes <- add_column(region2_genes, y = 1.02) + + +## plot region 2: +af_plot_region2 <- ggplot(dd3)+ + geom_jitter(aes(x=pos, y=AF, colour=pool), size=1.2, alpha=0.2)+ + geom_rug(data = region2_genes, aes(V4), sides = "top", length = unit(0.06, "npc"), size=0.5)+ + theme_bw()+ + theme(legend.position = "none", + axis.text.x = element_text(family="Avenir",size=20), + axis.title.x = element_text(family="Avenir", size=30), + axis.ticks.x = element_line(), + axis.line.x = element_blank(), + axis.line.y = element_blank(), + axis.text.y = element_text(family = "Avenir", size = 20), + axis.title.y = element_text(family = "Avenir", size = 30), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_rect(colour = "black", fill=NA, size=1.4))+ + scale_colour_manual(values = c("#3CBB75", "#B22222", "#2D708E", "#481567"))+ + scale_y_continuous(name = "Allele Frequency", + breaks=seq(0,1, by=0.1), + limits =c(0,1.02))+ + scale_x_continuous(name="LG1 Position (Mb)", + breaks=seq(9500000,17500000, by = 1000000), + labels=c("9.5", "10.5", "11.5", "12.5", "13.5", "14.5", "15.5", + "16.5", "17.5")) + +ggsave("./figs/whole_region2_polarised.png", af_plot_region2, width = 20, height = 20, units = "cm", dpi = 400) + + +########################################################################################################### +## and now the same for region 3: +## extract region 3 (IF9 fixed) from frequencies: +dd2 <- AF_dd %>% filter(pos >= 21944840 & pos <= 24959750) + +## remove pos +dd3 <- dd2 %>% dplyr::select(IF10_AF, IF6_AF, IF8_AF, IF9_AF) + +## save the IF9 freqs and polarise to this: +IF9_freqs<-dd3$IF9_AF + +for(i in colnames(dd3)){ + dd3[IF9_freqs < 0.5,i]<-(1-dd3[IF9_freqs < 0.5,i]) +} + +## add pos back in: +dd3 <- add_column(dd3,pos=dd2$pos) + +## make long +dd3 <- dd3 %>% gather(pool, AF,IF10_AF:IF9_AF) + +## genes +region3_genes <- filter(chr1_genes, V4 >= 21944840 & V5 <= 24959750) +region3_genes <- add_column(region3_genes, y = 1.02) + +## plot region 3: +af_plot_region3 <- ggplot(dd3)+ + geom_jitter(aes(x=pos, y=AF, colour=pool), size=1.5, alpha=0.2)+ + geom_rug(data = region3_genes, aes(V4), sides = "top", length = unit(0.06, "npc"), size=0.5)+ + theme_bw()+ + theme(legend.position = "none", + axis.text.x = element_text(family="Avenir",size=20), + axis.title.x = element_text(family="Avenir", size=30), + axis.ticks.x = element_line(), + axis.line.x = element_blank(), + axis.line.y = element_blank(), + axis.text.y = element_text(family = "Avenir", size = 20), + axis.title.y = element_text(family = "Avenir", size = 30), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_rect(colour = "black", fill=NA, size=1.4))+ + scale_colour_manual(values = c("#3CBB75", "#B22222", "#2D708E", "#481567"))+ + scale_y_continuous(name = "Allele Frequency", + breaks=seq(0,1, by=0.1), + limits =c(0,1.02))+ + scale_x_continuous(name="LG1 Position (Mb)", + breaks=seq(22000000,25000000, by = 1000000), + labels=c("22.0", "23.0", "24.0", "25.0")) + +ggsave("./figs/whole_region3_polarised.png", af_plot_region3, width = 20, height = 20, units = "cm", dpi = 400) + +plot_grid(af_plot_region1,af_plot_region2,af_plot_region3) + +################################################################################################################## +### pull out individual AFs for each line for region 2 or region 3: +head(AF_dd) + +## and now the same for region 2: +## extract region 1 (IF9 fixed) from frequencies: +dd2 <- AF_dd %>% filter(pos >= 9627619 & pos <= 17074870) +dd2 <- AF_dd %>% filter(pos >= 21944840 & pos <= 24959750) + +## remove pos +dd3 <- dd2 %>% dplyr::select(IF10_AF, IF6_AF, IF8_AF, IF9_AF) + +## save the IF6 freqs: +IF9_freqs<-dd3$IF9_AF + +for(i in colnames(dd3)){ + dd3[IF9_freqs < 0.5,i]<-(1-dd3[IF9_freqs < 0.5,i]) +} + +## add pos back in: +dd3 <- add_column(dd3,pos=dd2$pos) + + +## genes +region2_genes <- filter(chr1_genes, V4 >= 9627619 & V5 <= 17074870) +region2_genes <- add_column(region2_genes, y = 1.02) + + +## plot IF9: +IF9_plot <- ggplot(dd3)+ + geom_jitter(aes(x=pos, y=IF9_AF), colour="#481567", size=0.5, alpha=0.6)+ + theme_bw()+ + theme(legend.position = "none", + axis.text.x = element_text(family="Avenir",size=20), + axis.title.x = element_text(family="Avenir",size=20), + axis.ticks.x = element_line(), + axis.line.x = element_blank(), + axis.line.y = element_blank(), + axis.text.y = element_text(family = "Avenir", size = 20), + axis.title.y = element_text(family = "Avenir", size = 30), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_rect(colour = "black", fill=NA, size=1.4))+ + scale_y_continuous(name = "Allele Frequency", + breaks=seq(0,1, by=0.05), + limits =c(0,1))+ + scale_x_continuous(name="LG1 Position (Mb)", + breaks=seq(22000000,25000000, by = 1000000), + labels=c("22", "23", "24", "25")) + +## plot IF6: +IF6_plot <- ggplot(dd3)+ + geom_jitter(aes(x=pos, y=IF6_AF), colour="#B22222", size=0.5, alpha=0.6)+ + theme_bw()+ + theme(legend.position = "none", + axis.text.x = element_text(family="Avenir",size=20), + axis.title.x = element_text(family="Avenir",size=20), + axis.ticks.x = element_line(), + axis.line.x = element_blank(), + axis.line.y = element_blank(), + axis.text.y = element_text(family = "Avenir", size = 20), + axis.title.y = element_text(family = "Avenir", size = 30), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_rect(colour = "black", fill=NA, size=1.4))+ + scale_y_continuous(name = "Allele Frequency", + breaks=seq(0,1, by=0.05), + limits =c(0,1))+ + scale_x_continuous(name="LG1 Position (Mb)", + breaks=seq(22000000,25000000, by = 1000000), + labels=c("22", "23", "24", "25")) + +## plot region: +IF8_plot <- ggplot(dd3)+ + geom_jitter(aes(x=pos, y=IF8_AF), colour="#2d708eff", size=0.5, alpha=0.6)+ + theme_bw()+ + theme(legend.position = "none", + axis.text.x = element_text(family="Avenir",size=20), + axis.title.x = element_text(family="Avenir",size=20), + axis.ticks.x = element_line(), + axis.line.x = element_blank(), + axis.line.y = element_blank(), + axis.text.y = element_text(family = "Avenir", size = 20), + axis.title.y = element_text(family = "Avenir", size = 30), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_rect(colour = "black", fill=NA, size=1.4))+ + scale_y_continuous(name = "Allele Frequency", + breaks=seq(0,1, by=0.05), + limits =c(0,1))+ + scale_x_continuous(name="LG1 Position (Mb)", + breaks=seq(22000000,25000000, by = 1000000), + labels=c("22", "23", "24", "25")) +## plot region: +IF10_plot <- ggplot(dd3)+ + geom_jitter(aes(x=pos, y=IF10_AF), colour="#3cbb75ff", size=0.5, alpha=0.6)+ + theme_bw()+ + theme(legend.position = "none", + axis.text.x = element_text(family="Avenir",size=20), + axis.title.x = element_text(family="Avenir",size=20), + axis.ticks.x = element_line(), + axis.line.x = element_blank(), + axis.line.y = element_blank(), + axis.text.y = element_text(family = "Avenir", size = 20), + axis.title.y = element_text(family = "Avenir", size = 30), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_rect(colour = "black", fill=NA, size=1.4))+ + scale_y_continuous(name = "Allele Frequency", + breaks=seq(0,1, by=0.05), + limits =c(0,1))+ + scale_x_continuous(name="LG1 Position (Mb)", + breaks=seq(22000000,25000000, by = 1000000), + labels=c("22", "23", "24", "25")) + + +## plot together +plots <-list(IF6_plot,IF8_plot,IF9_plot,IF10_plot) + +arranged <- grid.arrange(grobs = plots, nrow = 1) + +ggsave("./figs/region3_AFs_seperataely_plots.png", arranged, width = 60, height = 20, units = "cm", dpi = 400) + + +ggsave("./figs/IF9_ONLY_polarised_region3.png", IF9_plot, width = 20, height = 20, units = "cm", dpi = 400) +ggsave("./figs/IF6_ONLY_polarised_region3.png", IF6_plot, width = 20, height = 20, units = "cm", dpi = 400) +ggsave("./figs/IF8_ONLY_polarised._region3.png", IF8_plot, width = 20, height = 20, units = "cm", dpi = 400) +ggsave("./figs/IF10_ONLY_polarised_region3.png", IF10_plot, width = 20, height = 20, units = "cm", dpi = 400) + diff --git a/pool_pi.R b/pool_pi.R new file mode 100644 index 0000000..128fe8b --- /dev/null +++ b/pool_pi.R @@ -0,0 +1,64 @@ +pool_pi <- function(pool_vcf,wind_size=10000,n_cores=1,min_snp=10){ + # Add a chr error + if(length(unique(pool_vcf@fix[,1])) > 1){ + stop("Too many chromosomes in VCF") + } + # Get all the window starts and end + bp <- as.integer(pool_vcf@fix[,2]) + winds <- seq(0,max(bp),wind_size) + winds2 <- winds+wind_size + winds2[length(winds2)] <- max(bp) + # Get the pool info + pools <- colnames(pool_vcf@gt)[2:ncol(pool_vcf@gt)] + # Work through windows + winds_out <- data.frame(rbindlist(mclapply(1:length(winds),function(x){ + # Subset VCF + vcf_sub <- pool_vcf[bp < winds2[x] & bp >= winds[x],] + # Get the AD + allele_counts <- extract.gt(vcf_sub,element="AD") + # Get counts for each pool + pool_pi <- sapply(pools,function(pool){ + if(nrow(vcf_sub@fix) < min_snp){ + return(NA) + } else { + # Take column + tmp_counts <- allele_counts[,pool] + # Separate + split_counts <- str_split(tmp_counts,",") + # Merge to data.frame + count_dd <- data.frame(count1 = as.integer(sapply(split_counts,function(y){return(y[1])})), + count2 = as.integer(sapply(split_counts,function(y){return(y[2])}))) + # Sum + count_dd$sum <- rowSums(count_dd) + # Get Pi + count_dd$pi <- (2*(count_dd$count1 * count_dd$count2))/(count_dd$sum*(count_dd$sum-1)) + # Avg Pi + pi_avg <- sum(count_dd$pi)/(winds2[x]-winds[x]) + return(pi_avg) + } + }) + # Out format + out <- data.frame(pool = pools, + pi = pool_pi, + chr = unique(pool_vcf@fix[,1]), + start = as.integer(winds[x]), + end = as.integer(winds2[x])) + rownames(out) <- NULL + return(out) + },mc.cores=n_cores))) + return(winds_out) +} + + + +# Usage eg: + +#vcf_path <- "~/Dropbox/Sussex_Guppies/Analyses/pool-seq/vcf_files/chr12_variant_filtered_STAR_updated.vcf.gz" + +#vcf<-read.vcfR(vcf_path) + +#chr12_pi <- pool_pi(pool_vcf = vcf,wind_size=10000,n_cores=2,min_snp = 10) + +# Libraries required! +#lib<-as.vector(c("vcfR","pbapply","PopGenome","cowplot","gridExtra","viridis","grid","data.table","ggplot2","parallel","dplyr","tidyr", "stringr")) +#lapply(lib,library,character.only=TRUE) \ No newline at end of file diff --git a/poolfstat_FST_AFs.R b/poolfstat_FST_AFs.R new file mode 100644 index 0000000..a010a3b --- /dev/null +++ b/poolfstat_FST_AFs.R @@ -0,0 +1,57 @@ +# new general run +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/poolfstat") + +lib<-c("poolfstat","ggplot2", "cowplot", "readr", "tidyr", "stringr", "data.table") +lapply(lib,library,character.only=T) + + +pool_data <- vcf2pooldata(vcf.file = "../vcf_files/poolseq_variant_filtered_update_STAR.vcf.gz", + poolsizes=rep(96,4), + poolnames =c("IF10", "IF6", "IF8", "IF9")) +min.cov.per.pool = 30, +max.cov.per.pool = 500, +min.rc = 10) + + +## check it's been read properly +is.pooldata(pool_data) + +## Calculate pairwise FST +pairwise_FST <- compute.pairwiseFST(pool_data, output.snp.values = TRUE, method = "Anova") + +## Write: +write.table(pairwise_FST, "./outputs/poolseq_lifted_pairwise_fst.tsv", sep ="\t", quote=F, row.names = F) + +## Allele frequencies + +# take info we want from the pool_data object +dd <- cbind(pool_data@snp.info, pool_data@refallele.readcount, pool_data@readcoverage) + +## Add column titles +colnames(dd) <- c("chr","pos","REF","ALT", + "refallelereadcount_IF10", + "refallelereadcount_IF6", + "refallelereadcount_IF8", + "refallelereadcount_IF9", + "readcoverage_IF10", + "readcoverage_IF6", + "readcoverage_IF8", + "readcoverage_IF9") + + +write.table(dd, "./outputs/whole_genome_raw_read_counts.tsv", sep ="\t", quote=F, row.names = F) + +dd <- data.frame(fread("./outputs/whole_genome_raw_read_counts.tsv", header=TRUE)) + +## calculate ref allele frequencies +AF_dd <- dd %>% mutate(IF10_AF = refallelereadcount_IF10 / readcoverage_IF10) %>% + mutate(IF6_AF = refallelereadcount_IF6 / readcoverage_IF6) %>% + mutate(IF8_AF = refallelereadcount_IF8 / readcoverage_IF8) %>% + mutate(IF9_AF = refallelereadcount_IF9 / readcoverage_IF9) %>% + select(chr, pos, REF, ALT, IF10_AF, IF6_AF, IF8_AF, IF9_AF) + +write.table(AF_dd, "./outputs/whole_genome_AFs_final.tsv", sep ="\t", quote=F, row.names = F) diff --git a/popgenome_nat_data.R b/popgenome_nat_data.R new file mode 100644 index 0000000..35ee9ed --- /dev/null +++ b/popgenome_nat_data.R @@ -0,0 +1,97 @@ +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/natural_data/") + +# Get libs +lib<-as.vector(c("vcfR","pbapply","PopGenome","cowplot","gridExtra","viridis","grid","data.table","ggplot2","parallel","dplyr","tidyr")) +lapply(lib,library,character.only=TRUE) + +# Get VCF (you need to put the full path here otherwise it freaks out) +vcf_path <- "/Users/jrp228/Dropbox/Sussex_Guppies/Analyses/pool-seq/natural_data/chr12.nat_data_31893_updateSTAR.DEFAULT.vcf.gz" + +vcf_path <- "/Users/jrp228/Dropbox/Sussex_Guppies/Analyses/pool-seq/natural_data/chr1.nat_data_39606.DEFAULT.vcf.gz" + +# Get chrs +vcf<-read.vcfR(vcf_path) +chrs<-unique(vcf@fix[,1]) +fai<-read.table("~/Dropbox/Sussex_Guppies/genomes/STAR.chromosomes.release.fasta.fai", header=F) +fai<-fai[fai$V1 %in% chrs,] + +# Get males and females +inds<-colnames(vcf@gt)[2:ncol(vcf@gt)] +females<-c("UM10","UM11","UM12","UM13","UM14","UM15", "UM16", "UM17", "UM18", "UM8", "UM9", inds[grep("F",inds)]) +males<-inds[!(inds %in% females)] + +# Run popgenome over them in different window sizes +wind_sizes=c(1000) +wind_sizes=c(10000) + +i<-1 +# Set window size +wind_size<-wind_sizes[i] + +# Run over chrs for comparisons +all_chr<-data.frame(rbindlist(pblapply(c(1:length(chrs)),function(chr){ + #all_chr<-data.frame(rbindlist(lapply(141:length(chrs),function(chr){ + + # chr name + tid = chrs[chr] + print(tid) + # chr length + topos = fai[fai$V1 == tid,]$V2 + + # Read in the VCF + snp <- readVCF(file = vcf_path, tid=tid, frompos = 1, topos = topos, numcols=1000000, include.unknown=TRUE) + + # Set up males and females + snp<-set.populations(snp,do.call("list",list(males,females)),diploid=TRUE) + snp<-set.outgroup(snp,FALSE) + + #split into windows + win_SNP<-sliding.window.transform(snp,width=wind_size,jump=wind_size,type=2) + + # Do stats + win_SNP <-F_ST.stats(win_SNP,mode="nucleotide") + win_SNP <-neutrality.stats(win_SNP,FAST=FALSE) + win_SNP <-diversity.stats.between(win_SNP,nucleotide.mode=TRUE) + + # Get centre of the window + genome.pos_1K <- sapply(win_SNP@region.names, function(x){ + split <- strsplit(x," ")[[1]][c(1,3)] + val <- mean(as.numeric(split)) + return(val) + }) + + #output results matrix + PG_out<-data.frame(chr = tid, + FST=win_SNP@nucleotide.F_ST, + n.seg.M=win_SNP@n.segregating.sites[,1], + n.seg.F=win_SNP@n.segregating.sites[,2], + dxy=(win_SNP@nuc.diversity.between/wind_size), + pi.M=(win_SNP@nuc.diversity.within/wind_size)[,1], + pi.F=(win_SNP@nuc.diversity.within/wind_size)[,2], + Taj_all=win_SNP@Tajima.D, + Taj.M=win_SNP@Tajima.D[,1], + Taj.F=win_SNP@Tajima.D[,2], + Watt.M=win_SNP@theta_Watterson[,1], + start=genome.pos_1K -(wind_size/2)-1.5, + end=genome.pos_1K +(wind_size/2)-1.5, + pop="Paria/Marianne") + colnames(PG_out)[c(2,5)]<-c("FST","DXY") + #PG_out<-na.omit(PG_out) + PG_out[PG_out$FST < 0,"FST"]<-0 + rownames(PG_out)<-NULL + + return(na.omit(PG_out)) +}))) + +# Calc resid Dxy +all_chr$Da<-all_chr$DXY-all_chr$pi.F + + +# Write to output +write.table(all_chr, + "~/Dropbox/Sussex_Guppies/Analyses/pool-seq/natural_data/popgen_stats/chr12_nat_data_popgen_1kb_STAR_lift.txt", + row.names = F,quote=F,sep="\t") \ No newline at end of file diff --git a/raw_pairwise_fst.R b/raw_pairwise_fst.R new file mode 100644 index 0000000..9bc684f --- /dev/null +++ b/raw_pairwise_fst.R @@ -0,0 +1,212 @@ +# new general run +rm(list=ls()) #clears all variables +objects() # clear all objects +graphics.off() #close all figures + +setwd("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/poolfstat") + +lib<-c("ggplot2", "cowplot", "readr", "tidyr", "stringr", "data.table") +lapply(lib,library,character.only=T) + +############ +## Chr1 ## +############ + +## read in data: +chr1 <- data.frame(fread("./outputs/final/chr1_raw_pairwise_fst.tsv", header=TRUE)) + +chr1[is.na(chr1)] <- 0 + +## now subset for each line: +IF10_vs_IF6 <- chr1 %>% dplyr::select(chr,pos,IF10_vs_IF6) %>% set_colnames(c("chr","pos","fst")) +IF10_vs_IF8 <- chr1 %>% dplyr::select(chr,pos,IF10_vs_IF8) %>% set_colnames(c("chr","pos","fst")) +IF10_vs_IF9 <- chr1 %>% dplyr::select(chr,pos,IF10_vs_IF9) %>% set_colnames(c("chr","pos","fst")) +IF6_vs_IF8 <- chr1 %>% dplyr::select(chr,pos,IF6_vs_IF8) %>% set_colnames(c("chr","pos","fst")) +IF6_vs_IF9 <- chr1 %>% dplyr::select(chr,pos,IF6_vs_IF9) %>% set_colnames(c("chr","pos","fst")) +IF8_vs_IF9 <- chr1 %>% dplyr::select(chr,pos,IF8_vs_IF9) %>% set_colnames(c("chr","pos","fst")) + +## add spline chr1 +x=IF10_vs_IF6$pos +y=IF10_vs_IF6$fst +spline_fst=predict(smooth.spline(x,y,spar=0.5),x)$y +IF10_vs_IF6$spline <- spline_fst + +x=IF10_vs_IF8$pos +y=IF10_vs_IF8$fst +spline_fst=predict(smooth.spline(x,y,spar=0.5),x)$y +IF10_vs_IF8$spline <- spline_fst + +x=IF10_vs_IF9$pos +y=IF10_vs_IF9$fst +spline_fst=predict(smooth.spline(x,y,spar=0.5),x)$y +IF10_vs_IF9$spline <- spline_fst + +x=IF6_vs_IF8$pos +y=IF6_vs_IF8$fst +spline_fst=predict(smooth.spline(x,y,spar=0.5),x)$y +IF6_vs_IF8$spline <- spline_fst + +x=IF6_vs_IF9$pos +y=IF6_vs_IF9$fst +spline_fst=predict(smooth.spline(x,y,spar=0.5),x)$y +IF6_vs_IF9$spline <- spline_fst + +x=IF8_vs_IF9$pos +y=IF8_vs_IF9$fst +spline_fst=predict(smooth.spline(x,y,spar=0.5),x)$y +IF8_vs_IF9$spline <- spline_fst + +## Load chr1 regions of differentation +chr1_regions <- read.table("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/chr1_regions/chr1_region_coords.txt", header=T) +chr1_regions <- as_tibble(chr1_regions) + +# Function for plotting all pairwise +plot_fst_chr1 <- function(dataset1,pop){ + ggplot(dataset1) + + geom_point(mapping=aes(x =pos, y = fst), colour = "#1e272c",size = 0.1, alpha = 0.2) + + geom_line(mapping=aes(x=pos, y=spline),colour="#fab201", size = 1.8)+ + # geom_smooth(mapping=aes(x=pos, y=fst),colour="gold2", size = 1, span=0.000001)+ + geom_rect(alpha = 0.3, fill=c("grey", "grey", "grey"), colour = "white", + linetype=1, size = 0, + data=chr1_regions, + mapping=aes(xmin=start, + xmax=end,ymin=ymin, + ymax=c(0.99, 0.99, 0.99)))+ + ylab(expression(F[ST]))+ + xlab("Position") + + scale_y_continuous(breaks=seq(-0.2, 1, by = 0.2), limits = c(0,1)) + + scale_color_manual(values=c("#f4b41a")) + guides(colour=FALSE)+ + scale_x_continuous(name="Position (Mb)", + breaks=seq(0,46000000, by = 5000000),expand=c(0,0), + labels=seq(0,46, by = 5))+ + theme_bw() + + theme(plot.title = element_text(family ="Avenir", size=30, hjust =0.5), + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line = element_line(colour = "black"), + panel.border = element_rect(colour = "black", fill=NA, size=1.4), + axis.title.y = element_blank(), + axis.text.y = element_text(family = "Avenir", size=30), + axis.ticks.y = element_blank(), + axis.ticks.x = element_line(), + axis.text.x = element_text(family = "Avenir", size=30), + axis.title.x = element_blank(), + panel.spacing = unit(0, "lines")) +} + + +IF10_IF6_plot1 <- plot_fst_chr1(IF10_vs_IF6) + ggtitle ("Iso-Y6 vs Iso-Y10") + theme(axis.text.x = element_blank()) +IF10_IF8_plot1 <- plot_fst_chr1(IF10_vs_IF8) + ggtitle ("Iso-Y10 vs Iso-Y8") +IF10_IF9_plot1 <- plot_fst_chr1(IF10_vs_IF9) + ggtitle ("Iso-Y10 vs Iso-Y9") + theme(axis.text.x = element_blank()) +IF6_IF8_plot1 <- plot_fst_chr1(IF6_vs_IF8) + ggtitle ("Iso-Y6 vs Iso-Y8") +IF6_IF9_plot1 <- plot_fst_chr1(IF6_vs_IF9) + ggtitle ("Iso-Y6 vs Iso-Y9") + theme(axis.text.x = element_blank()) +IF8_IF9_plot1 <- plot_fst_chr1(IF8_vs_IF9) + ggtitle ("Iso-Y8 vs Iso-Y9") + + +plots_chr1 <-list(IF10_IF6_plot1,IF6_IF9_plot1,IF6_IF8_plot1,IF10_IF9_plot1, IF10_IF8_plot1, IF8_IF9_plot1) + +## Create empty matrix for chr1 to plot in pairwise triangles +m1 <- matrix(NA, 3, 3) +m1[lower.tri(m1, diag = T)] <- 1:6 +m1 +tri_fst_plot_chr1 <- grid.arrange(grobs = plots_chr1, layout_matrix = m1) + +ggsave("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/Figures/chr1_pairwise_plot_nonwindowed.png", tri_fst_plot_chr1, width = 40, height = 40, units = "cm", dpi = 400) + +############ +## Chr12 ## +############ + +## read in again: +chr12 <- data.frame(fread("./outputs/final/chr12_raw_pairwise_fst.tsv", header=TRUE)) + +chr12[is.na(chr12)] <- 0 + +## now subset for each line: +IF10_vs_IF6 <- chr12 %>% dplyr::select(chr,pos,IF10_vs_IF6) %>% set_colnames(c("chr","pos","fst")) +IF10_vs_IF8 <- chr12 %>% dplyr::select(chr,pos,IF10_vs_IF8) %>% set_colnames(c("chr","pos","fst")) +IF10_vs_IF9 <- chr12 %>% dplyr::select(chr,pos,IF10_vs_IF9) %>% set_colnames(c("chr","pos","fst")) +IF6_vs_IF8 <- chr12 %>% dplyr::select(chr,pos,IF6_vs_IF8) %>% set_colnames(c("chr","pos","fst")) +IF6_vs_IF9 <- chr12 %>% dplyr::select(chr,pos,IF6_vs_IF9) %>% set_colnames(c("chr","pos","fst")) +IF8_vs_IF9 <- chr12 %>% dplyr::select(chr,pos,IF8_vs_IF9) %>% set_colnames(c("chr","pos","fst")) + +## add spline chr12 +x=IF10_vs_IF6$pos +y=IF10_vs_IF6$fst +spline_fst=predict(smooth.spline(x,y,spar=0.5),x)$y +IF10_vs_IF6$spline <- spline_fst + +x=IF10_vs_IF8$pos +y=IF10_vs_IF8$fst +spline_fst=predict(smooth.spline(x,y,spar=0.5),x)$y +IF10_vs_IF8$spline <- spline_fst + +x=IF10_vs_IF9$pos +y=IF10_vs_IF9$fst +spline_fst=predict(smooth.spline(x,y,spar=0.5),x)$y +IF10_vs_IF9$spline <- spline_fst + +x=IF6_vs_IF8$pos +y=IF6_vs_IF8$fst +spline_fst=predict(smooth.spline(x,y,spar=0.5),x)$y +IF6_vs_IF8$spline <- spline_fst + +x=IF6_vs_IF9$pos +y=IF6_vs_IF9$fst +spline_fst=predict(smooth.spline(x,y,spar=0.5),x)$y +IF6_vs_IF9$spline <- spline_fst + +x=IF8_vs_IF9$pos +y=IF8_vs_IF9$fst +spline_fst=predict(smooth.spline(x,y,spar=0.5),x)$y +IF8_vs_IF9$spline <- spline_fst + +# Function for plotting all pairwise +plot_fst_chr12 <- function(dataset1,pop){ + ggplot(dataset1) + + geom_point(mapping=aes(x =pos, y = fst), colour = "#1e272c",size = 0.1, alpha = 0.2) + + geom_line(mapping=aes(x=pos, y=spline),colour="#fab201", size = 1.8)+ + ylab(expression(F[ST]))+ + xlab("Position") + + scale_y_continuous(breaks=seq(-0.2, 1, by = 0.2), limits = c(0,1)) + + scale_color_manual(values=c("#f4b41a")) + guides(colour=FALSE)+ + scale_x_continuous(name="Position (Mb)", + breaks=seq(0,46000000, by = 5000000),expand=c(0,0), + labels=seq(0,46, by = 5))+ + theme_bw() + + theme(plot.title = element_text(family ="Avenir", size=30, hjust =0.5), + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line = element_line(colour = "black"), + panel.border = element_rect(colour = "black", fill=NA, size=1.4), + axis.title.y = element_blank(), + axis.text.y = element_text(family = "Avenir", size=30), + axis.ticks.y = element_blank(), + axis.ticks.x = element_line(), + axis.text.x = element_text(family = "Avenir", size=30), + axis.title.x = element_blank(), + panel.spacing = unit(0, "lines")) +} + + + +IF10_IF6_plot1 <- plot_fst_chr12(IF10_vs_IF6) + ggtitle ("Iso-Y6 vs Iso-Y10") + theme(axis.text.x = element_blank()) +IF10_IF8_plot1 <- plot_fst_chr12(IF10_vs_IF8) + ggtitle ("Iso-Y10 vs Iso-Y8") +IF10_IF9_plot1 <- plot_fst_chr12(IF10_vs_IF9) + ggtitle ("Iso-Y10 vs Iso-Y9") + theme(axis.text.x = element_blank()) +IF6_IF8_plot1 <- plot_fst_chr12(IF6_vs_IF8) + ggtitle ("Iso-Y6 vs Iso-Y8") +IF6_IF9_plot1 <- plot_fst_chr12(IF6_vs_IF9) + ggtitle ("Iso-Y6 vs Iso-Y9") + theme(axis.text.x = element_blank()) +IF8_IF9_plot1 <- plot_fst_chr12(IF8_vs_IF9) + ggtitle ("Iso-Y8 vs Iso-Y9") + + +plots_chr12 <-list(IF10_IF6_plot1,IF6_IF9_plot1,IF6_IF8_plot1,IF10_IF9_plot1, IF10_IF8_plot1, IF8_IF9_plot1) + +## Create empty matrix for chr12 in pairwise triangle +m1 <- matrix(NA, 3, 3) +m1[lower.tri(m1, diag = T)] <- 1:6 +m1 +tri_fst_plot_chr12 <- grid.arrange(grobs = plots_chr12, layout_matrix = m1) + +ggsave("~/Dropbox/Sussex_Guppies/Analyses/pool-seq/Figures/chr12_pairwise_plot_nonwindowed.png", tri_fst_plot_chr12, width = 40, height = 40, units = "cm", dpi = 400) + diff --git a/sample_metadata.txt b/sample_metadata.txt new file mode 100644 index 0000000..df1d00b --- /dev/null +++ b/sample_metadata.txt @@ -0,0 +1,28 @@ +Study Accession Sample Accession Sample Name Nominal Name Sex Read1 Read2 +PRJEB10680 SAMEA3649957 UM1 NAT21 M TruSeqnanogenomic_S2_L004_R1_001.fastq TruSeqnanogenomic_S2_L004_R2_001.fastq +PRJEB10680 SAMEA3649958 UM10 NAT06 F UM10_GTGAAA_L008_R1_001.fastq UM10_GTGAAA_L008_R2_001.fastq +PRJEB10680 SAMEA3649959 UM11 NAT07 F genomicNanolib_S1_L003_R1_001.fastq genomicNanolib_S1_L003_R2_001.fastq +PRJEB10680 SAMEA3649960 UM12 NAT08 F genomicNanolib_S2_L003_R1_001.fastq genomicNanolib_S2_L003_R2_001.fastq +PRJEB10680 SAMEA3649961 UM13 NAT09 F genomicNanolib_S3_L003_R1_001.fastq genomicNanolib_S3_L003_R2_001.fastq +PRJEB10680 SAMEA3649962 UM14 NAT10 F genomicNanolib_S4_L003_R1_001.fastq genomicNanolib_S4_L003_R2_001.fastq +PRJEB10680 SAMEA3649963 UM15 NAT11 F genomicNanolib_S5_L003_R1_001.fastq genomicNanolib_S5_L003_R2_001.fastq +PRJEB10680 SAMEA3649964 UM16 NAT12 F genomicNanolib_S6_L003_R1_001.fastq genomicNanolib_S6_L003_R2_001.fastq +PRJEB10680 SAMEA3649965 UM17 NAT13 F genomicNanolib_S7_L003_R1_001.fastq genomicNanolib_S7_L003_R2_001.fastq +PRJEB10680 SAMEA3649966 UM18 NAT14 F genomicNanolib_S8_L003_R1_001.fastq genomicNanolib_S8_L003_R2_001.fastq +PRJEB10680 SAMEA3649967 UM2 NAT22 M UM2_TGACCA_L008_R1_001.fastq UM2_TGACCA_L008_R2_001.fastq +PRJEB10680 SAMEA3649968 UM3 NAT23 M genomicNanolib_S7_L001_R1_001.fastq genomicNanolib_S7_L001_R2_001.fastq +PRJEB10680 SAMEA3649968 UM3 NAT23 M UM3_ACAGTG_L008_R1_001.fastq UM3_ACAGTG_L008_R2_001.fastq +PRJEB10680 SAMEA3649969 UM5 NAT24 M TruSeqnanogenomic_S3_L004_R1_001.fastq TruSeqnanogenomic_S3_L004_R2_001.fastq +PRJEB10680 SAMEA3649970 UM6 NAT25 M UM6_CTTGTA_L008_R1_001.fastq UM6_CTTGTA_L008_R2_001.fastq +PRJEB10680 SAMEA3649971 UM7 NAT26 M UM7_ATGTCA_L008_R1_001.fastq UM7_ATGTCA_L008_R2_001.fastq +PRJEB10680 SAMEA3649972 UM8 NAT15 F TruSeqnanogenomic_S4_L004_R1_001.fastq TruSeqnanogenomic_S4_L004_R2_001.fastq +PRJEB10680 SAMEA3649973 UM9 NAT16 F TruSeqnanogenomic_S5_L004_R1_001.fastq TruSeqnanogenomic_S5_L004_R2_001.fastq +PRJEB36506 SAMEA8750557 Paria1 NAT01 F Paria1_R1.fastq.gz Paria1_R2.fastq.gz +PRJEB36506 SAMEA8750558 Paria2 NAT02 F Paria2_R1.fastq.gz Paria2_R2.fastq.gz +PRJEB36506 SAMEA8750559 Paria3 NAT03 F Paria3_R1.fastq.gz Paria3_R2.fastq.gz +PRJEB36506 SAMEA8750560 Paria4 NAT04 F Paria4_R1.fastq.gz Paria4_R2.fastq.gz +PRJEB36506 SAMEA8750561 Paria5 NAT05 F Paria5_R1.fastq.gz Paria5_R2.fastq.gz +PRJEB36506 SAMEA8750562 Paria6 NAT17 M Paria6_R1.fastq.gz Paria6_R2.fastq.gz +PRJEB36506 SAMEA8750563 Paria7 NAT18 M Paria7_R1.fastq.gz Paria7_R2.fastq.gz +PRJEB36506 SAMEA8750564 Paria8 NAT19 M Paria8_R1.fastq.gz Paria8_R2.fastq.gz +PRJEB36506 SAMEA8750565 Paria9 NAT20 M Paria9_R1.fastq.gz Paria9_R2.fastq.gz \ No newline at end of file diff --git a/update_chr12_liftover.R b/update_chr12_liftover.R new file mode 100644 index 0000000..fd15773 --- /dev/null +++ b/update_chr12_liftover.R @@ -0,0 +1,39 @@ +update_STAR <- function(bp_vec=NULL){ + vec_out <- sapply(bp_vec,function(x){ + # Invert I and II... + if(x <= 3258871){ + bp_out <- 3258871 - x + } + # Move Contig 6... + else if (x >= 5331818 & x <= 7997830){ + bp_out <- x + 433864 + 10000 + } else if (x >= 8007831 & x <= 8441694){ + bp_out <- x - 2666013 - 10000 + } + # Invert Contig XI + else if (x >= 20209082 & x <= 24252560){ + bp_out <- 24252560 - x + 20209082 + } + # Swap Contig XIII and XIV + else if (x >= 24511363 & x <= 25631364){ + bp_out <- x + 963876 + 10000 + } else if (x >= 25641365 & x <= 26605240){ + bp_out <- x - 1120002 - 10000 + } else { + bp_out <- x + } + return(bp_out) + }) + return(vec_out) +} + +## Loop for if you are running it over intervals or windows because it inverts your start and end will flip round +# ranges$start <- update_STAR(ranges$start) +# ranges$end <- update_STAR(ranges$end) +# for(i in 1:nrow(ranges)){ +# if(ranges$start[i] > ranges$end[i]){ +# tmp <- ranges$start[i] +# ranges$start[i] <- ranges$end[i] +# ranges$end[i] <- tmp +# } +#} \ No newline at end of file