-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdotplot.R
140 lines (113 loc) · 5.05 KB
/
dotplot.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# Generate a dot-plot of two sequences
# Not optimized: using 2Gb RAM for 3K sequences - don't go over 5K
#
# Aleix Lafita - November 2019
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(seqinr))
suppressPackageStartupMessages(library(argparse))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(stringi))
suppressPackageStartupMessages(library(ggplot2))
theme_set(theme_bw() + theme(panel.grid.minor = element_blank()))
###################### Argparse #############################
seq1 = "examples/SasG_prot.fa"
seq2 = "examples/SasG_prot.fa"
window = 5
size = 1.0
prefix = "examples/SasG_prot"
type = "prot"
# create parser object
parser = ArgumentParser(description = 'Generate a dot-plot of two sequences')
# specify our desired options
parser$add_argument("-x", "--seq1", default=seq1,
help="First sequence (horizontal) in FASTA format [default \"%(default)s\"]")
parser$add_argument("-y", "--seq2", default=seq2,
help="Second sequence (vertical) in FASTA format [default \"%(default)s\"]")
parser$add_argument("-w", "--window", default=window,
help="Window size of the comparison [default \"%(default)s\"]")
parser$add_argument("-s", "--size", default=size,
help="Scaling size for the plot dimensions [default \"%(default)s\"]")
parser$add_argument("-t", "--type", default=type,
help="Sequence type, either 'prot' or 'nucl' [default \"%(default)s\"]")
parser$add_argument("-p", "--prefix", default=prefix,
help="Prefix for the output dot-plot file [default \"%(default)s\"]")
# get command line options, if help option encountered print help and exit,
# otherwise if options not found on command line then set defaults,
args = parser$parse_args()
seq1 = args$seq1
seq2 = args$seq2
window = as.integer(args$window)
size = as.integer(args$size)
prefix = args$prefix
type = args$type
if (!(type == "prot" | type == "nucl"))
stop(sprintf("Option 'type' must be either 'prot' or 'nucl', but it is: '%s'", type))
###################### Parsing #############################
message("# Parsing sequences...")
# Parse the sequences and convert to DF
seq1.fasta = read.fasta(seq1)[1]
seq1.len = length(getSequence(seq1.fasta)[[1]])
seq1.name = names(seq1.fasta)
seq1.df = data.frame(
id = seq1.name,
pos = seq(1, seq1.len, 1),
seq = unlist(getSequence(seq1.fasta)),
stringsAsFactors = T
) %>% rowwise() %>%
mutate(
#seq = toupper(seq),
seqw = paste0(unlist(getSequence(seq1.fasta))[pos:min((pos+window)-1, seq1.len)], collapse = ""),
# If it is not DNA the complement gets assigned the same sequence
seqwc = ifelse(type != "nucl", seqw, paste0(comp(unlist(getSequence(seq1.fasta))[pos:min((pos+window)-1, seq1.len)]), collapse = "")),
) %>% filter(nchar(seqw) == window)
seq2.fasta = read.fasta(seq2)[1]
seq2.len = length(getSequence(seq2.fasta)[[1]])
seq2.name = names(seq2.fasta)
seq2.df = data.frame(
id = seq2.name,
pos = seq(1, seq2.len, 1),
seq = unlist(getSequence(seq2.fasta)),
stringsAsFactors = T
) %>% rowwise() %>%
mutate(
#seq = toupper(seq),
seqw = paste0(unlist(getSequence(seq2.fasta))[pos:min((pos+window)-1, seq2.len)], collapse = ""),
seqwc = ifelse(type != "nucl", seqw, paste0(comp(unlist(getSequence(seq2.fasta))[pos:min((pos+window)-1, seq1.len)]), collapse = "")),
) %>% filter(nchar(seqw) == window)
###################### Calculate #############################
message(sprintf("# Generating dotplot matrix (%ix%i), this might take a while...", nrow(seq1.df), nrow(seq2.df)))
dotplot = merge(seq1.df, seq2.df, by =c())
message("# Extracting dotplot points, also takes a while...")
# First calculate the values, not memory efficient but cleaner
dotplot = dotplot %>%
mutate(
ident = seqw.x == seqw.y,
rev = seqw.x == stri_reverse(seqw.y),
comp = seqw.x == seqwc.y,
revcomp = seqw.x == stri_reverse(seqwc.y)
)
# Now filter the points that do not match any
dotplot = dotplot %>%
filter(ident | rev | comp | revcomp)
# The seqinr package also has a dotplot function, but takes equally long or longer than this script
# dotplot(as.character(seq1.df$seq), as.character(seq2.df$seq))
###################### Plot #############################
# Plot the distribution of GC content
p = ggplot(dotplot, aes(x = pos.x, y = pos.y)) +
#geom_tile(fill = "black") +
geom_point(shape = "square", size = 1) +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)
) +
xlab(seq1.name) +
ylab(seq2.name) +
#scale_fill_gradient(low = "white", high = "black") +
scale_x_continuous(limits = c(1, seq1.len), expand = c(0, 0), breaks = c(1, seq(100, seq1.len, 100))) +
scale_y_reverse(limits = c(seq2.len, 1), expand = c(0, 0), breaks = c(1, seq(100, seq2.len, 100))) +
coord_fixed()
pdf(sprintf("%s_dotplot.pdf", prefix), size * max(8, seq1.len/200), size * max(8, seq2.len/200))
plot(p)
log = dev.off()
message(sprintf("# Dotplot saved to %s_dotplot.pdf", prefix))