-
Notifications
You must be signed in to change notification settings - Fork 1
/
Makefile
165 lines (136 loc) · 6.6 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# This directory is laid out assuming that code and data are not necessarily
# included, so they will be obtained if needed by the rules here. If both are
# provided, only the analysis and figure-generating rules should need to run.
# Where to clone CHIIMP from, and which revision of it to check out, when no
# local copy is present yet
REPO   := "https://github.com/ShawHahnLab/chiimp.git"
COMMIT := 0.1.0
# Illumina adapter sequences (forward, then reverse) present in the raw
# sequence data. Both are listed even though only R1 is relied on here.
ADAPT_R1 := "CTGTCTCTTATACACATCTCCGAGCCCACGAGAC"
ADAPT_R2 := "CTGTCTCTTATACACATCTGACGCTGCCGACGA"
# Final product is the figures.
# "all" names an action rather than a file, so it is declared phony; otherwise
# a stray file called "all" in this directory would make the goal look done.
.PHONY: all
all: results/figures.html
# Storing the sample download code separately so the Makefile can parse the
# resulting filenames.  The script is run through $(shell ...) while the
# Makefile is being read, unless an "n" appears in the first word of MAKEFLAGS
# (the check from the link below, used to skip the download on a dry run).
# https://stackoverflow.com/a/41303169/6073858
ifneq (n,$(findstring n,$(firstword -$(MAKEFLAGS))))
# The script's stdout is redirected to stderr, so nothing is captured and
# data_result stays empty; the assignment only serves to run the script.
data_result := $(shell bash fetch_data.sh >&2)
endif
# Maps one raw path, held in $(gz) by the foreach calls below, to the matching
# path under prepared/
pat_data = $(subst raw/,prepared/,$(gz))
# Maps the current target ($@) under data/prepared back to its data/raw source
pat_raw = $(subst data/prepared,data/raw,$@)
# Raw R1 and R2 read files found on disk when these variables are expanded
d_raw_R1 = $(wildcard data/raw/*R1_001.fastq.gz)
d_raw_R2 = $(wildcard data/raw/*R2_001.fastq.gz)
# The prepared-file paths corresponding to each raw file
d_R1 = $(foreach gz, $(d_raw_R1), $(pat_data))
d_R2 = $(foreach gz, $(d_raw_R2), $(pat_data))
# Data prep: trim adapters and keep the R1 reads
# Trim adapters from the R1 reads.
# Each prepared file is built from the raw file of the same name ($(pat_raw)).
# The pipeline runs under bash with pipefail so a cutadapt failure is not
# masked by gzip's exit status, and the output goes to a temporary name that is
# renamed only on success.  The prepared R1 files are order-only prerequisites
# of the report rules in this file, so a truncated file left under the final
# name would never be rebuilt.
$(d_R1):
	mkdir -p $(dir $@)
	bash -o pipefail -c 'cutadapt -a $(ADAPT_R1) $(pat_raw) | gzip > $@.tmp' || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
# We don't use the R2 reads, but here's a rule for them.
# Each prepared file is built from the raw file of the same name ($(pat_raw)).
# The pipeline runs under bash with pipefail so a cutadapt failure is not
# masked by gzip's exit status, and the output goes to a temporary name that is
# renamed only on success, so a failed run never leaves a truncated file under
# the final name.
$(d_R2):
	mkdir -p $(dir $@)
	bash -o pipefail -c 'cutadapt -a $(ADAPT_R2) $(pat_raw) | gzip > $@.tmp' || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
# The figures are rendered from figures.Rmd, and every dataset's full analysis
# report has to be in place before that can happen.
figure_reports := \
  results/gombe-round24/report.html \
  results/gombe-blinded-test-2/report.html \
  results/gombe-blinded-test-2-simple/report.html \
  results/gombe-blinded-test-2-full/report.html \
  results/gme/report.html \
  results/gme-mplex/report.html
# figures.Rmd stays first in the list so that $< names the document to render.
results/figures.html: figures.Rmd $(figure_reports)
	R --slave --vanilla -e "rmarkdown::render('$<', output_file = '$@', quiet = TRUE)"
### Full reports
#
# Every report needs the trimmed forward reads, the input spreadsheets, and
# the chiimp package.
# known_genotypes and known_alleles were generated separately, from the
# combined results of many samples and replicates across our genotyped Gombe
# chimps, and validated with additional information such as known inheritance.
# locus_attrs holds the known attributes of our selected loci.
spreadsheets = \
  metadata/known_alleles.csv \
  metadata/known_alleles_combined.csv \
  metadata/known_genotypes.csv \
  metadata/locus_attrs.csv
# Enable secondary expansion for the rules that follow.  Their prerequisite
# lists write the prepared-data variable as $$(d_R1), so it is expanded again
# when the target is considered instead of only while the Makefile is read.
# This allows the $$ below to find the data files after the download script
# runs.
.SECONDEXPANSION:
# First dataset, replicate 1 files only.
# Rebuilt when the config, sample sheet, or shared spreadsheets change; the
# CHIIMP checkout and the trimmed R1 reads are order-only (they must exist).
# The config file is listed first so that $< hands it to chiimp.
results/gombe-blinded-test-2/report.html: \
    config/config-gmblind2.yml \
    metadata/samples-gmblind2.csv \
    $(spreadsheets) \
    | chiimp $$(d_R1)
	chiimp/inst/bin/chiimp $<
# First dataset, replicate 1 files only, analyzed with no artifact/stutter
# filter.
# Rebuilt when the config, sample sheet, or shared spreadsheets change; the
# CHIIMP checkout and the trimmed R1 reads are order-only (they must exist).
# The config file is listed first so that $< hands it to chiimp.
results/gombe-blinded-test-2-simple/report.html: \
    config/config-gmblind2-simple.yml \
    metadata/samples-gmblind2.csv \
    $(spreadsheets) \
    | chiimp $$(d_R1)
	chiimp/inst/bin/chiimp $<
# First dataset, all files.
# Rebuilt when the config, sample sheet, or shared spreadsheets change; the
# CHIIMP checkout and the trimmed R1 reads are order-only (they must exist).
# The config file is listed first so that $< hands it to chiimp.
results/gombe-blinded-test-2-full/report.html: \
    config/config-gmblind2-full.yml \
    metadata/samples-gmblind2-full.csv \
    $(spreadsheets) \
    | chiimp $$(d_R1)
	chiimp/inst/bin/chiimp $<
# Second dataset.
# Rebuilt when the config, sample sheet, or shared spreadsheets change; the
# CHIIMP checkout and the trimmed R1 reads are order-only (they must exist).
# The config file is listed first so that $< hands it to chiimp.
results/gombe-round24/report.html: \
    config/config-round24.yml \
    metadata/samples-round24.csv \
    $(spreadsheets) \
    | chiimp $$(d_R1)
	chiimp/inst/bin/chiimp $<
# Third dataset (GME, singleplex).
# Rebuilt when the config, sample sheet, or shared spreadsheets change; the
# CHIIMP checkout and the trimmed R1 reads are order-only (they must exist).
# The config file is listed first so that $< hands it to chiimp.
results/gme/report.html: \
    config/config-gme.yml \
    metadata/samples-gme.csv \
    $(spreadsheets) \
    | chiimp $$(d_R1)
	chiimp/inst/bin/chiimp $<
# Fourth dataset (GME, multiplex).
# Rebuilt when the config, sample sheet, or shared spreadsheets change; the
# CHIIMP checkout and the trimmed R1 reads are order-only (they must exist).
# The config file is listed first so that $< hands it to chiimp.
results/gme-mplex/report.html: \
    config/config-gme-mplex.yml \
    metadata/samples-gme-mplex.csv \
    $(spreadsheets) \
    | chiimp $$(d_R1)
	chiimp/inst/bin/chiimp $<
# Sample sheet for the first dataset, replicate 1 files only.
# Keeps the header row plus every row whose 9th and 12th comma-separated
# fields both equal 1, then retains fields 9, 11, 12, 13, 15 and 31.
metadata/samples-gmblind2.csv: metadata/sample_attrs.csv
	awk -F, 'NR == 1 || ($$9 == 1 && $$12 == 1)' $< \
		| cut -d , -f 9,11,12,13,15,31 > $@
# Sample sheet for the first dataset, all files.
# Keeps the header row plus every row whose 9th comma-separated field equals
# 1, then retains fields 9, 11, 12, 13, 15 and 31.
metadata/samples-gmblind2-full.csv: metadata/sample_attrs.csv
	awk -F, 'NR == 1 || $$9 == 1' $< \
		| cut -d , -f 9,11,12,13,15,31 > $@
# Sample sheet for the second dataset.
# Keeps the header row plus every row whose 9th comma-separated field equals
# 2, then retains fields 9, 11, 12, 13, 15 and 31.
metadata/samples-round24.csv: metadata/sample_attrs.csv
	awk -F, 'NR == 1 || $$9 == 2' $< \
		| cut -d , -f 9,11,12,13,15,31 > $@
# Sample sheet for the third dataset.
# Keeps the header row plus every row whose 9th comma-separated field equals
# 3, then retains fields 9, 11, 12, 14, 15 and 31 (field 14 rather than the 13
# used for the first two datasets).
metadata/samples-gme.csv: metadata/sample_attrs.csv
	awk -F, 'NR == 1 || $$9 == 3' $< \
		| cut -d , -f 9,11,12,14,15,31 > $@
# Sample sheet for the fourth dataset.
# Keeps the header row plus every row whose 9th comma-separated field equals
# 4, then retains fields 9, 11, 12, 14, 15 and 31 (field 14 rather than the 13
# used for the first two datasets).
metadata/samples-gme-mplex.csv: metadata/sample_attrs.csv
	awk -F, 'NR == 1 || $$9 == 4' $< \
		| cut -d , -f 9,11,12,14,15,31 > $@
# The chiimp software
# Clones the repository into ./chiimp at the pinned revision, then installs
# what R needs to run it:
#  - devtools, roxygen2 and testthat (the development packages are required
#    for devtools::load_all()), skipping any already installed
#  - msa from Bioconductor, if not already installed.  The package check looks
#    at the Package column only, matching the check on the line above it.
#    On R >= 3.5 this goes through BiocManager, since the biocLite.R script
#    is no longer supported there; older R keeps the original biocLite path.
#  - the dependencies declared by the chiimp package itself
# The clone destination is given explicitly as $@ so the directory name always
# matches this target, whatever the last component of REPO is.
chiimp:
	git clone $(REPO) $@ && cd $@ && git checkout $(COMMIT)
	R --slave --vanilla -e "pkgs<-c('devtools','roxygen2','testthat');to_inst<-pkgs[!pkgs%in%installed.packages()[,'Package']];if(length(to_inst)>0)install.packages(to_inst,repos='https://cloud.r-project.org')"
	R --slave --vanilla -e "if (!'msa' %in% installed.packages()[,'Package']) { if (getRversion() >= '3.5.0') { if (!requireNamespace('BiocManager', quietly = TRUE)) install.packages('BiocManager', repos = 'https://cloud.r-project.org'); BiocManager::install('msa', update = FALSE, ask = FALSE) } else { source('https://bioconductor.org/biocLite.R'); biocLite(); biocLite('msa') } }"
	R --slave --vanilla -e "devtools::install_deps('$@')"
# Removes the targets of rules above, but not input data or the CHIIMP
# directory.
# "clean" names an action rather than a file, so it is declared phony;
# otherwise a file called "clean" would make this rule a silent no-op.
.PHONY: clean
clean:
# Figures
	rm -f results/figures.html
# Sample sheets
	rm -f metadata/samples-gmblind2.csv
	rm -f metadata/samples-gmblind2-full.csv
	rm -f metadata/samples-gme.csv
	rm -f metadata/samples-gme-mplex.csv
	rm -f metadata/samples-round24.csv
# Reports
	rm -f results/gme-mplex/report.html
	rm -f results/gme/report.html
	rm -f results/gombe-blinded-test-2-full/report.html
	rm -f results/gombe-blinded-test-2-simple/report.html
	rm -f results/gombe-blinded-test-2/report.html
	rm -f results/gombe-round24/report.html
# Summary spreadsheets
	rm -f results/gme-mplex/summary.csv
	rm -f results/gme/summary.csv
	rm -f results/gombe-blinded-test-2-full/summary.csv
	rm -f results/gombe-blinded-test-2-simple/summary.csv
	rm -f results/gombe-blinded-test-2/summary.csv
	rm -f results/gombe-round24/summary.csv
# Runs "clean" first, then also removes all input data, the CHIIMP directory,
# and the whole results directory.
# The data directories are removed with rm -rf so the rule succeeds whether or
# not they exist: rmdir fails on a missing or non-empty directory, which would
# stop make before chiimp/ and results/ were removed.
# Declared phony because it names an action rather than a file.
.PHONY: veryclean
veryclean: clean
	rm -rf data/raw data/prepared
	rm -rf chiimp/
	rm -rf results/
### Other stuff
# The SRA metadata table. This info should already be in the local copy, but
# here's a way to fetch it.
# The download goes to a temporary name and is renamed only on success, since
# wget -O creates its output file even when the request fails and make would
# then treat an empty or partial RunInfoTable.csv as up to date.
SRP = SRP132984
RunInfoTable.csv:
	wget -O $@.tmp "http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term=$(SRP)" || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@