benchmark

#!/usr/bin/make -rRf
# Absolute path of this makefile and of the directory that contains it.
# my_current_dir always ends in exactly one '/', so path components are
# appended to it without an extra separator.
my_mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
my_current_dir := $(patsubst %/,%,$(dir $(my_mkfile_path)))/
# Pull in the main Makefile that lives next to this file.
include $(my_current_dir)Makefile
# Prints its single argument with backslash escapes (such as \t) expanded,
# followed by a newline. printf '%b' is used instead of `echo -e` because
# `-e` is not portable: shells such as dash print it literally.
ECHO := printf '%b\n'
RAW_AWK := awk
# awk invocation that takes its program from a script file.
AWK := $(RAW_AWK) -f
# Directory holding the helper scripts and third-party tools used below.
benchmark_aux_path = $(my_current_dir)aux
GNU_TIME?=$(benchmark_aux_path)/other_tools/time-1.9/time
BWA_DIR := $(my_current_dir)consensus/bwa_v0.7.17
# awk scripts run on the GNU time report / accuracy file of a benchmark run.
GET_USER_TIME := $(benchmark_aux_path)/get_user_time.awk
GET_WALL_TIME := $(benchmark_aux_path)/get_wall_time.awk
GET_MEM := $(benchmark_aux_path)/get_mem.awk
GET_ACCURACY := $(benchmark_aux_path)/get_accuracy.awk
BWA?=$(BWA_DIR)/bwa
# Expanded immediately: $(reference) must already be set at this point
# (presumably by the included Makefile or on the command line).
reference_bwa_index := $(reference).bwt
# Scripts whose standard output is redirected into each tool's cluster file.
CONVERT_CDHITEST := $(benchmark_aux_path)/convert_cdhitest_to_cluster.sh
CONVERT_STARCODE := $(benchmark_aux_path)/convert_starcode_to_cluster.sh
CONVERT_RAINBOW := $(benchmark_aux_path)/convert_rainbow_to_cluster.sh
CONVERT_UMITOOLS := $(benchmark_aux_path)/convert_umitools_to_cluster.sh
CONVERT_DUNOVO := $(benchmark_aux_path)/convert_dunovo_to_cluster.sh

# Calib run configuration. Every Calib output file name starts with
# $(calib_output_prefix), which embeds the parameter values of the run.
calib_params ?= l_$(barcode_length).m_$(minimizers_num).k_$(kmer_size).e_$(barcode_error_tolerance).t_$(minimizers_threshold).tc_$(thread_count).
calib_output_prefix = $(input_reads_prefix)calib.$(calib_params)
calib_time = $(calib_output_prefix)time
calib_cluster = $(calib_output_prefix)cluster
calib_accuracy = $(calib_output_prefix)accuracy

# CD-HIT-EST configuration: location of the binary, the value handed to its
# -c option, and the file names derived from them.
cdhitest ?= $(benchmark_aux_path)/other_tools/cdhit/cd-hit-est
cdhitest_dist ?= 0.95
cdhitest_params = dist_$(cdhitest_dist).
cdhitest_output_prefix = $(input_reads_prefix)cdhitest.$(cdhitest_params)
cdhitest_time = $(cdhitest_output_prefix)time
cdhitest_output = $(cdhitest_output_prefix)out
cdhitest_cluster = $(cdhitest_output_prefix)cluster
cdhitest_accuracy = $(cdhitest_output_prefix)accuracy

# Starcode configuration: locations of starcode and its starcode-umi wrapper,
# the distance/ratio/trim values passed to starcode-umi, and the file names
# derived from them.
starcode ?= $(benchmark_aux_path)/other_tools/starcode/starcode
starcode_umi ?= $(benchmark_aux_path)/other_tools/starcode/starcode-umi
starcode_umi_dist ?= 2
starcode_umi_ratio ?= 5
starcode_seq_dist ?= 4
starcode_seq_ratio ?= 5
starcode_seq_trim ?= 0
starcode_params = umi_dist_$(starcode_umi_dist).umi_ratio_$(starcode_umi_ratio).seq_dist_$(starcode_seq_dist).seq_ratio_$(starcode_seq_ratio).seq_trim_$(starcode_seq_trim).
starcode_output_prefix = $(input_reads_prefix)starcode.$(starcode_params)
starcode_time = $(starcode_output_prefix)time
starcode_output = $(starcode_output_prefix)out
starcode_cluster = $(starcode_output_prefix)cluster
starcode_accuracy = $(starcode_output_prefix)accuracy

# Rainbow configuration: location of the binary and of the wrapper script
# that runs it, the mismatch/div values passed to that wrapper, and the file
# names derived from them.
rainbow ?= $(benchmark_aux_path)/other_tools/rainbow/rainbow
RAINBOW_RUN = $(benchmark_aux_path)/run_rainbow.sh
rainbow_div ?= false
rainbow_mismatch ?= 4
rainbow_params = mismatch_$(rainbow_mismatch).div_$(rainbow_div).
rainbow_output_prefix = $(input_reads_prefix)rainbow.$(rainbow_params)
rainbow_time = $(rainbow_output_prefix)time
rainbow_output = $(rainbow_output_prefix)out
rainbow_cluster = $(rainbow_output_prefix)cluster
rainbow_accuracy = $(rainbow_output_prefix)accuracy

# Du Novo configuration.
# dunovo_prefix is the directory where dunovo is cloned; DUNOVO_RUN is the
# wrapper script that drives it. dunovo_barcode_length defaults to the global
# $(barcode_length).
dunovo_prefix ?= $(benchmark_aux_path)/other_tools/dunovo
DUNOVO_RUN = $(benchmark_aux_path)/run_dunovo.sh
dunovo_dist ?= 2
dunovo_barcode_length ?= $(barcode_length)
dunovo_invariant ?= 0
dunovo_params = dist_$(dunovo_dist).
dunovo_output_prefix = $(input_reads_prefix)dunovo.$(dunovo_params)
dunovo_temp_directory ?= $(dunovo_output_prefix)temp
dunovo_time = $(dunovo_output_prefix)time
dunovo_output = $(dunovo_output_prefix)out
dunovo_cluster = $(dunovo_output_prefix)cluster
dunovo_accuracy = $(dunovo_output_prefix)accuracy

# UMI-tools configuration: the wrapper script that runs the pipeline and the
# file names of its outputs. The parameter tag in the file names is the fixed
# string "edit_2.".
UMITOOLS_RUN = $(benchmark_aux_path)/run_umitools.sh
umitools_params = edit_2.
umitools_output_prefix = $(input_reads_prefix)umitools.$(umitools_params)
umitools_temp_directory ?= $(umitools_output_prefix)temp
umitools_time = $(umitools_output_prefix)time
umitools_output = $(umitools_output_prefix)out
umitools_cluster = $(umitools_output_prefix)cluster
umitools_accuracy = $(umitools_output_prefix)accuracy

# Free-text label written as the first column of every log row. Each tool
# gets the common label with its own suffix unless overridden individually.
log_comment ?= no_comment
calib_log_comment ?= $(log_comment)_calib
cdhitest_log_comment ?= $(log_comment)_cdhitest
starcode_log_comment ?= $(log_comment)_starcode
rainbow_log_comment ?= $(log_comment)_rainbow
dunovo_log_comment ?= $(log_comment)_dunovo
umitools_log_comment ?= $(log_comment)_umitools

# Tab-separated result tables, one per tool; rows are appended by the
# corresponding *_log targets.
calib_log_file ?= $(input_reads_prefix)calib_benchmarks.tsv
cdhitest_log_file ?= $(input_reads_prefix)cdhitest_benchmarks.tsv
starcode_log_file ?= $(input_reads_prefix)starcode_benchmarks.tsv
rainbow_log_file ?= $(input_reads_prefix)rainbow_benchmarks.tsv
dunovo_log_file ?= $(input_reads_prefix)dunovo_benchmarks.tsv
umitools_log_file ?= $(input_reads_prefix)umitools_benchmarks.tsv
# Aggregate and alias targets. None of them names a file, so they are
# declared phony to keep a stray file of the same name from masking them.
.PHONY: run log_files calib_log_file cdhitest_log_file starcode_log_file \
        dunovo_log_file rainbow_log_file umitools_log_file

# Run the benchmarks and append their results to the per-tool log files.
# NOTE(review): dunovo_log is not part of this list — confirm that leaving it
# out is intentional.
run: calib_log cdhitest_log starcode_log rainbow_log umitools_log

# Create every per-tool log file (header row only) that does not exist yet.
log_files: calib_log_file cdhitest_log_file starcode_log_file dunovo_log_file rainbow_log_file umitools_log_file
# Short aliases for the individual log files.
calib_log_file: $(calib_log_file)
cdhitest_log_file: $(cdhitest_log_file)
starcode_log_file: $(starcode_log_file)
dunovo_log_file: $(dunovo_log_file)
rainbow_log_file: $(rainbow_log_file)
umitools_log_file: $(umitools_log_file)

# Append one row describing the Calib run to $(calib_log_file), then delete
# the per-run time, accuracy and cluster files.
# The $(eval ...) lines execute when make expands this recipe, which happens
# after the prerequisites have been built, so the files they read exist.
# calib_log never names a file, hence the .PHONY declaration.
.PHONY: calib_log
calib_log: $(calib_log_file) $(calib_accuracy)
	$(eval my_user_time:= $(shell $(AWK) $(GET_USER_TIME) $(calib_time)))
	$(eval my_wall_time:= $(shell $(AWK) $(GET_WALL_TIME) $(calib_time)))
	$(eval my_mem := $(shell $(AWK) $(GET_MEM) $(calib_time)))
	$(eval my_accuracy := $(shell $(AWK) $(GET_ACCURACY) $(calib_accuracy)))
	$(eval my_timestamp := $(shell /bin/date "+%Y-%m-%d---%H-%M-%S"))
	$(ECHO) "$(calib_log_comment)\t$(my_timestamp)\t$(my_user_time)\t$(my_wall_time)\t$(my_mem)\t$(my_accuracy)\t$(barcode_length)\t$(minimizers_num)\t$(kmer_size)\t$(barcode_error_tolerance)\t$(minimizers_threshold)" >> $(calib_log_file)
	rm $(calib_accuracy) $(calib_time) $(calib_cluster)

# Start the Calib results table by writing its tab-separated header row.
$(calib_log_file):
	$(ECHO) "log_comment\tmy_timestamp\tuser_time\twall_time\tmem\tARI\tbarcode_length\tminimizers_num\tkmer_size\tbarcode_error_tolerance\tminimizers_threshold" > $@

# Score the Calib clustering: rand_index.py is given the true and predicted
# cluster files and writes its results to the target.
# $(calib_cluster) is presumably produced by the calib run itself — confirm.
$(calib_accuracy): $(calib_time)
	$(python3) $(simulating_path)rand_index.py \
		--true-cluster-file $(true_cluster) \
		--predicted-cluster-file $(calib_cluster) \
		--output-accuracy-results $@

# Run ./calib under GNU time; the verbose resource report is written to the
# target. $(silent) and $(sort) are plain variable references whose values
# are appended to the command line as-is.
$(calib_time): $(GNU_TIME) calib simulate
	$(GNU_TIME) -v -o $@ ./calib \
		--input-forward $(forward_reads) --input-reverse $(reverse_reads) \
		--output-prefix $(calib_output_prefix) \
		--barcode-length $(barcode_length) --minimizer-count $(minimizers_num) \
		--kmer-size $(kmer_size) --error-tolerance $(barcode_error_tolerance) \
		--minimizer-threshold $(minimizers_threshold) --threads $(thread_count) \
		$(silent) $(sort)

# CD-HIT-EST
# Append one row describing the CD-HIT-EST run to $(cdhitest_log_file), then
# delete the per-run time, accuracy, cluster and raw output files.
# The $(eval ...) lines execute when make expands this recipe, which happens
# after the prerequisites have been built, so the files they read exist.
# cdhitest_log never names a file, hence the .PHONY declaration.
.PHONY: cdhitest_log
cdhitest_log: $(cdhitest_log_file) $(cdhitest_accuracy)
	$(eval my_user_time:= $(shell $(AWK) $(GET_USER_TIME) $(cdhitest_time)))
	$(eval my_wall_time:= $(shell $(AWK) $(GET_WALL_TIME) $(cdhitest_time)))
	$(eval my_mem := $(shell $(AWK) $(GET_MEM) $(cdhitest_time)))
	$(eval my_accuracy := $(shell $(AWK) $(GET_ACCURACY) $(cdhitest_accuracy)))
	$(eval my_timestamp := $(shell /bin/date "+%Y-%m-%d---%H-%M-%S"))
	$(ECHO) "$(cdhitest_log_comment)\t$(my_timestamp)\t$(my_user_time)\t$(my_wall_time)\t$(my_mem)\t$(my_accuracy)\t$(cdhitest_dist)" >> $(cdhitest_log_file)
	rm $(cdhitest_accuracy) $(cdhitest_time) $(cdhitest_cluster) $(cdhitest_output).1.fastq.clstr $(cdhitest_output).1.fastq $(cdhitest_output).2.fastq

# Start the CD-HIT-EST results table by writing its tab-separated header row.
$(cdhitest_log_file):
	$(ECHO) "log_comment\tmy_timestamp\tuser_time\twall_time\tmem\tARI\tcdhitest_dist" > $@

# Convert the .clstr file written by cd-hit-est into $(cdhitest_cluster),
# then have rand_index.py score it against the true clusters and write its
# results to the target.
$(cdhitest_accuracy): $(cdhitest_time)
	$(CONVERT_CDHITEST) $(cdhitest_output).1.fastq.clstr > $(cdhitest_cluster)
	$(python3) $(simulating_path)rand_index.py \
		--true-cluster-file $(true_cluster) \
		--predicted-cluster-file $(cdhitest_cluster) \
		--output-accuracy-results $@

# Run cd-hit-est on the read pair under GNU time; the verbose resource report
# is written to the target.
$(cdhitest_time): $(GNU_TIME) cdhitest simulate
	$(GNU_TIME) -v -o $@ $(cdhitest) -P 1 \
		-i $(forward_reads) -j $(reverse_reads) \
		-o $(cdhitest_output).1.fastq -op $(cdhitest_output).2.fastq \
		-c $(cdhitest_dist) -d 0 -M 0

# Build CD-HIT from the sources under aux/other_tools/cdhit.
# $(MAKE) is used instead of a literal `make` so that the jobserver and
# flags such as -n are handed to the sub-make correctly.
.PHONY: cdhitest
cdhitest:
	$(MAKE) -C $(benchmark_aux_path)/other_tools/cdhit

# Starcode
# Append one row describing the starcode run to $(starcode_log_file), then
# delete the per-run time, accuracy, cluster and raw output files.
# The $(eval ...) lines execute when make expands this recipe, which happens
# after the prerequisites have been built, so the files they read exist.
# starcode_log never names a file, hence the .PHONY declaration.
.PHONY: starcode_log
starcode_log: $(starcode_log_file) $(starcode_accuracy)
	$(eval my_user_time:= $(shell $(AWK) $(GET_USER_TIME) $(starcode_time)))
	$(eval my_wall_time:= $(shell $(AWK) $(GET_WALL_TIME) $(starcode_time)))
	$(eval my_mem := $(shell $(AWK) $(GET_MEM) $(starcode_time)))
	$(eval my_accuracy := $(shell $(AWK) $(GET_ACCURACY) $(starcode_accuracy)))
	$(eval my_timestamp := $(shell /bin/date "+%Y-%m-%d---%H-%M-%S"))
	$(ECHO) "$(starcode_log_comment)\t$(my_timestamp)\t$(my_user_time)\t$(my_wall_time)\t$(my_mem)\t$(my_accuracy)\t$(starcode_umi_dist)\t$(starcode_umi_ratio)\t$(starcode_seq_dist)\t$(starcode_seq_ratio)\t$(starcode_seq_trim)" >> $(starcode_log_file)
	rm $(starcode_accuracy) $(starcode_time) $(starcode_cluster) $(starcode_output)


# Start the starcode results table by writing its tab-separated header row.
$(starcode_log_file):
	$(ECHO) "log_comment\tmy_timestamp\tuser_time\twall_time\tmem\tARI\tstarcode_umi_dist\tstarcode_umi_ratio\tstarcode_seq_dist\tstarcode_seq_ratio\tstarcode_seq_trim" > $@

# Convert the raw starcode output into $(starcode_cluster), then have
# rand_index.py score it against the true clusters and write its results to
# the target.
$(starcode_accuracy): $(starcode_time)
	$(CONVERT_STARCODE) $(starcode_output) > $(starcode_cluster)
	$(python3) $(simulating_path)rand_index.py \
		--true-cluster-file $(true_cluster) \
		--predicted-cluster-file $(starcode_cluster) \
		--output-accuracy-results $@

# Run starcode-umi on the read pair under GNU time. The verbose resource
# report is written to the target; the tool's standard output is redirected
# to $(starcode_output).
$(starcode_time): $(GNU_TIME) starcode simulate
	$(GNU_TIME) -v -o $@ $(starcode_umi) \
		--umi-len $(barcode_length) --starcode-path $(starcode) \
		--umi-d $(starcode_umi_dist) --seq-d $(starcode_seq_dist) \
		--umi-cluster-ratio $(starcode_umi_ratio) \
		--seq-cluster-ratio $(starcode_seq_ratio) \
		--seq-trim $(starcode_seq_trim) --seq-id \
		$(forward_reads) $(reverse_reads) \
		> $(starcode_output)

# Build starcode from the sources under aux/other_tools/starcode.
# $(MAKE) is used instead of a literal `make` so that the jobserver and
# flags such as -n are handed to the sub-make correctly.
.PHONY: starcode
starcode:
	$(MAKE) -C $(benchmark_aux_path)/other_tools/starcode

# Du Novo
# Append one row describing the Du Novo run to $(dunovo_log_file), then
# delete the per-run time, accuracy, cluster and raw output files.
# The $(eval ...) lines execute when make expands this recipe, which happens
# after the prerequisites have been built, so the files they read exist.
# dunovo_log never names a file, hence the .PHONY declaration.
.PHONY: dunovo_log
dunovo_log: $(dunovo_log_file) $(dunovo_accuracy)
	$(eval my_user_time:= $(shell $(AWK) $(GET_USER_TIME) $(dunovo_time)))
	$(eval my_wall_time:= $(shell $(AWK) $(GET_WALL_TIME) $(dunovo_time)))
	$(eval my_mem := $(shell $(AWK) $(GET_MEM) $(dunovo_time)))
	$(eval my_accuracy := $(shell $(AWK) $(GET_ACCURACY) $(dunovo_accuracy)))
	$(eval my_timestamp := $(shell /bin/date "+%Y-%m-%d---%H-%M-%S"))
	$(ECHO) "$(dunovo_log_comment)\t$(my_timestamp)\t$(my_user_time)\t$(my_wall_time)\t$(my_mem)\t$(my_accuracy)\t$(dunovo_dist)" >> $(dunovo_log_file)
	rm $(dunovo_accuracy) $(dunovo_time) $(dunovo_cluster) $(dunovo_output)

# Start the Du Novo results table by writing its tab-separated header row.
$(dunovo_log_file):
	$(ECHO) "log_comment\tmy_timestamp\tuser_time\twall_time\tmem\tARI\tdunovo_dist" > $@

# Convert the raw Du Novo output into $(dunovo_cluster), then have
# rand_index.py score it against the true clusters and write its results to
# the target.
$(dunovo_accuracy): $(dunovo_time)
	$(CONVERT_DUNOVO) $(dunovo_output) > $(dunovo_cluster)
	$(python3) $(simulating_path)rand_index.py \
		--true-cluster-file $(true_cluster) \
		--predicted-cluster-file $(dunovo_cluster) \
		--output-accuracy-results $@

# Run the Du Novo wrapper under GNU time inside a throw-away conda
# environment; the verbose resource report is written to $(dunovo_time).
#  * The environment name gets a random numeric suffix (drawn once, when the
#    recipe is expanded) so that runs do not share an environment name.
#  * The barcode length handed to the wrapper is $(dunovo_barcode_length),
#    which defaults to $(barcode_length) and can be overridden on its own.
#  * The environment is removed even when the benchmark command fails; the
#    recipe then still exits with the benchmark's failure status.
$(dunovo_time): $(GNU_TIME) dunovo simulate
	$(eval rand := $(shell $(RAW_AWK) 'BEGIN{srand();printf("%d", 65536*rand())}'))
	conda create --name calib_dunovo$(rand) -c faircloth-lab -c bioconda samtools=0.1.19 python=2.7 networkx bowtie=1.2.2 -y
	status=0; \
	$(GNU_TIME) -v -o $(dunovo_time) \
		$(DUNOVO_RUN) \
			$(dunovo_temp_directory) \
			$(forward_reads) \
			$(reverse_reads) \
			$(dunovo_prefix) \
			$(dunovo_invariant) \
			$(dunovo_barcode_length) \
			$(dunovo_dist) \
			$(dunovo_output) \
			calib_dunovo$(rand) || status=$$?; \
	conda remove --name calib_dunovo"$(rand)" --all -y && exit $$status

# Build Du Novo from the sources under aux/other_tools/dunovo.
# $(MAKE) is used instead of a literal `make` so that the jobserver and
# flags such as -n are handed to the sub-make correctly.
.PHONY: dunovo
dunovo:
	$(MAKE) -C $(benchmark_aux_path)/other_tools/dunovo

# Rainbow
# Append one row describing the rainbow run to $(rainbow_log_file), then
# delete the per-run time, accuracy, cluster and raw output files.
# The $(eval ...) lines execute when make expands this recipe, which happens
# after the prerequisites have been built, so the files they read exist.
# rainbow_log never names a file, hence the .PHONY declaration.
.PHONY: rainbow_log
rainbow_log: $(rainbow_log_file) $(rainbow_accuracy)
	$(eval my_user_time:= $(shell $(AWK) $(GET_USER_TIME) $(rainbow_time)))
	$(eval my_wall_time:= $(shell $(AWK) $(GET_WALL_TIME) $(rainbow_time)))
	$(eval my_mem := $(shell $(AWK) $(GET_MEM) $(rainbow_time)))
	$(eval my_accuracy := $(shell $(AWK) $(GET_ACCURACY) $(rainbow_accuracy)))
	$(eval my_timestamp := $(shell /bin/date "+%Y-%m-%d---%H-%M-%S"))
	$(ECHO) "$(rainbow_log_comment)\t$(my_timestamp)\t$(my_user_time)\t$(my_wall_time)\t$(my_mem)\t$(my_accuracy)\t$(rainbow_mismatch)\t$(rainbow_div)" >> $(rainbow_log_file)
	rm $(rainbow_accuracy) $(rainbow_time) $(rainbow_cluster) $(rainbow_output)

# Start the rainbow results table by writing its tab-separated header row.
$(rainbow_log_file):
	$(ECHO) "log_comment\tmy_timestamp\tuser_time\twall_time\tmem\tARI\trainbow_mismatch\trainbow_div" > $@

# Convert the raw rainbow output into $(rainbow_cluster), then have
# rand_index.py score it against the true clusters and write its results to
# the target.
$(rainbow_accuracy): $(rainbow_time)
	$(CONVERT_RAINBOW) $(rainbow_output) > $(rainbow_cluster)
	$(python3) $(simulating_path)rand_index.py \
		--true-cluster-file $(true_cluster) \
		--predicted-cluster-file $(rainbow_cluster) \
		--output-accuracy-results $@


# Run the rainbow wrapper script under GNU time; the verbose resource report
# is written to the target. The wrapper receives, in order: the rainbow
# binary, both read files, the mismatch and div settings, and the output path.
$(rainbow_time): $(GNU_TIME) rainbow simulate
	$(GNU_TIME) -v -o $@ $(RAINBOW_RUN) $(rainbow) \
		$(forward_reads) $(reverse_reads) \
		$(rainbow_mismatch) $(rainbow_div) \
		$(rainbow_output)

# Build rainbow from the sources under aux/other_tools/rainbow.
# $(MAKE) is used instead of a literal `make` so that the jobserver and
# flags such as -n are handed to the sub-make correctly.
.PHONY: rainbow
rainbow:
	$(MAKE) -C $(benchmark_aux_path)/other_tools/rainbow

# UMI-tools
# Append one row describing the UMI-tools run to $(umitools_log_file), then
# delete the per-run time, accuracy, cluster and output files together with
# the temporary directory. The last column of the row is the constant 2,
# written under the umitools_edit header.
# The $(eval ...) lines execute when make expands this recipe, which happens
# after the prerequisites have been built, so the files they read exist.
# rm gets -r before its operands: options placed after the operands are only
# honoured by implementations that permute arguments.
# umitools_log never names a file, hence the .PHONY declaration.
.PHONY: umitools_log
umitools_log: $(umitools_log_file) $(umitools_accuracy)
	$(eval my_user_time:= $(shell $(AWK) $(GET_USER_TIME) $(umitools_time)))
	$(eval my_wall_time:= $(shell $(AWK) $(GET_WALL_TIME) $(umitools_time)))
	$(eval my_mem := $(shell $(AWK) $(GET_MEM) $(umitools_time)))
	$(eval my_accuracy := $(shell $(AWK) $(GET_ACCURACY) $(umitools_accuracy)))
	$(eval my_timestamp := $(shell /bin/date "+%Y-%m-%d---%H-%M-%S"))
	$(ECHO) "$(umitools_log_comment)\t$(my_timestamp)\t$(my_user_time)\t$(my_wall_time)\t$(my_mem)\t$(my_accuracy)\t2" >> $(umitools_log_file)
	rm -r $(umitools_accuracy) $(umitools_time) $(umitools_cluster) $(umitools_output) $(umitools_temp_directory)

# Start the UMI-tools results table by writing its tab-separated header row.
$(umitools_log_file):
	$(ECHO) "log_comment\tmy_timestamp\tuser_time\twall_time\tmem\tARI\tumitools_edit" > $@

# Convert the raw UMI-tools output into $(umitools_cluster), then have
# rand_index.py score it against the true clusters and write its results to
# the target.
$(umitools_accuracy): $(umitools_time)
	$(CONVERT_UMITOOLS) $(umitools_output) > $(umitools_cluster)
	$(python3) $(simulating_path)rand_index.py \
		--true-cluster-file $(true_cluster) \
		--predicted-cluster-file $(umitools_cluster) \
		--output-accuracy-results $@

# Run the UMI-tools wrapper script under GNU time; the verbose resource
# report is written to the target. The wrapper receives, in order: the two
# renumbered read files, the output path, the temporary directory, the bwa
# binary, the reference and the samtools command name.
# The renumbered read files are deleted once the run has finished.
$(umitools_time): $(GNU_TIME) umitools simulate $(BWA) $(reference_bwa_index) $(reference) \
                  $(forward_reads)_umitools.1.fq $(reverse_reads)_umitools.2.fq
	$(GNU_TIME) -v -o $@ $(UMITOOLS_RUN) \
		$(forward_reads)_umitools.1.fq $(reverse_reads)_umitools.2.fq \
		$(umitools_output) $(umitools_temp_directory) \
		$(BWA) $(reference) samtools
	rm $(forward_reads)_umitools.1.fq $(reverse_reads)_umitools.2.fq

# Copy the forward reads, replacing the first line of every 4-line record
# with "@" followed by a running counter that starts at 0.
$(forward_reads)_umitools.1.fq:
	$(RAW_AWK) 'BEGIN{n=0} NR % 4 == 1 {print "@"n++} NR % 4 != 1 {print}' $(forward_reads) > $@
# Copy the reverse reads, replacing the first line of every 4-line record
# with "@" followed by a running counter that starts at 0.
$(reverse_reads)_umitools.2.fq:
	$(RAW_AWK) 'BEGIN{n=0} NR % 4 == 1 {print "@"n++} NR % 4 != 1 {print}' $(reverse_reads) > $@

# Make sure UMI-tools is available. The version check runs while the
# makefile is parsed: unless `umi_tools --version` reports exactly
# "UMI-tools version: 0.5.4", the copy under aux/other_tools/UMI-tools is
# installed with --user.
# The install steps are chained with && so that a failed step stops the
# recipe instead of letting the following commands run anyway (possibly in
# the wrong directory).
.PHONY: umitools
umitools:
ifeq ("$(shell umi_tools --version)",  "UMI-tools version: 0.5.4")
	@echo "UMI-tools v0.5.4 is installed"
else
	cd $(benchmark_aux_path)/other_tools/UMI-tools && pip install -r requirements.txt && python setup.py install --user --prefix=
endif

# Build the bwa index of the reference. The rule is keyed on the .bwt file
# named by $(reference_bwa_index); reference_bwa_index (the bare word) is a
# convenience alias and is declared phony because it never names a file.
.PHONY: reference_bwa_index
reference_bwa_index: $(reference_bwa_index)
$(reference_bwa_index): $(BWA) $(reference)
	$(BWA) index $(reference)

# Build the bundled bwa in $(BWA_DIR). `bwa` is a phony alias for the binary.
# $(MAKE) is used instead of a literal `make` so that the jobserver and
# flags such as -n are handed to the sub-make correctly.
.PHONY: bwa
bwa: $(BWA)
$(BWA):
	$(MAKE) -C $(BWA_DIR)

# Download, unpack, configure and build GNU time 1.9 under
# aux/other_tools/time-1.9. `gnu_time` is a phony alias for the binary.
#  * Each recipe line runs in its own shell, so `cd` and ./configure are
#    joined with &&: configure only runs if the directory change succeeded.
#  * rm gets -f before its operand: options placed after the operands are
#    only honoured by implementations that permute arguments.
#  * $(MAKE) is used instead of a literal `make` so that the jobserver and
#    flags such as -n are handed to the sub-make correctly.
.PHONY: gnu_time
gnu_time: $(GNU_TIME)
$(GNU_TIME):
	wget https://ftp.gnu.org/gnu/time/time-1.9.tar.gz \
		-O $(benchmark_aux_path)/other_tools/time-1.9.tar.gz
	tar -xzf $(benchmark_aux_path)/other_tools/time-1.9.tar.gz \
		-C $(benchmark_aux_path)/other_tools/
	rm -f $(benchmark_aux_path)/other_tools/time-1.9.tar.gz
	cd $(benchmark_aux_path)/other_tools/time-1.9 && ./configure
	$(MAKE) -C $(benchmark_aux_path)/other_tools/time-1.9