From d3c90c751696a089e85538825639bc2be131b4b4 Mon Sep 17 00:00:00 2001 From: Brian Pardy Date: Tue, 25 Feb 2020 08:42:58 -0500 Subject: [PATCH 1/6] Add normalize_gisaid_fasta.sh for issue #53 --- scripts/normalize_gisaid_fasta.sh | 35 +++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100755 scripts/normalize_gisaid_fasta.sh diff --git a/scripts/normalize_gisaid_fasta.sh b/scripts/normalize_gisaid_fasta.sh new file mode 100755 index 000000000..7a29f4ad5 --- /dev/null +++ b/scripts/normalize_gisaid_fasta.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +set -e +GISAID_SARSCOV2_IN=$1 +GISAID_SARSCOV2_OUT=$2 +MIN_LENGTH=$3 + +if [[ ! -r "$GISAID_SARSCOV2_IN" ]] +then + echo "$0: input $GISAID_SARSCOV2_IN not found" + exit 1 +fi + +if [[ -z "$MIN_LENGTH" ]] +then + echo "Using default minimum length of 15000" + MIN_LENGTH=15000 +fi + +echo "Normalizing GISAID file $GISAID_SARSCOV2_IN to $GISAID_SARSCOV2_OUT (min length $MIN_LENGTH)" + +# Remove leading 'BetaCoV' and 'BetaCov' from sequence names +# Remove embedded spaces in sequence names (Hong Kong sequences) +# Remove trailing |EPI_ISL_id|datestamp from sequence names +# Remove sequences shorter than minimum length +# Eliminate duplicate sequences (keep only the first seen) + +cat $GISAID_SARSCOV2_IN | + sed 's/^>BetaCoV\//>/gi' | # remove leading BetaCo[vV] + sed 's/ //g' | # remove embedded spaces + sed 's/|.*$//' | # remove trailing metadata + awk "BEGIN{RS=\">\";FS=\"\n\"}length>$MIN_LENGTH{print \">\"\$0}" | # remove short seqs + awk 'BEGIN{RS=">";FS="\n"}!x[$1]++{print ">"$0}' | # remove duplicates + grep -v '^>*$' > $GISAID_SARSCOV2_OUT + +exit 0 From 4a123001469e15103c4db36457f72e956bbea663 Mon Sep 17 00:00:00 2001 From: Brian Pardy Date: Wed, 26 Feb 2020 08:24:06 -0500 Subject: [PATCH 2/6] Useless use of cat reported by Annatar removed --- scripts/normalize_gisaid_fasta.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/normalize_gisaid_fasta.sh 
b/scripts/normalize_gisaid_fasta.sh index 7a29f4ad5..8d4249214 100755 --- a/scripts/normalize_gisaid_fasta.sh +++ b/scripts/normalize_gisaid_fasta.sh @@ -24,10 +24,10 @@ echo "Normalizing GISAID file $GISAID_SARSCOV2_IN to $GISAID_SARSCOV2_OUT (min l # Remove sequences shorter than minimum length # Eliminate duplicate sequences (keep only the first seen) -cat $GISAID_SARSCOV2_IN | - sed 's/^>BetaCoV\//>/gi' | # remove leading BetaCo[vV] - sed 's/ //g' | # remove embedded spaces - sed 's/|.*$//' | # remove trailing metadata +#cat $GISAID_SARSCOV2_IN | + sed 's/^>BetaCoV\//>/gi' $GISAID_SARSCOV2_IN | # remove leading BetaCo[vV] + sed 's/ //g' | # remove embedded spaces + sed 's/|.*$//' | # remove trailing metadata awk "BEGIN{RS=\">\";FS=\"\n\"}length>$MIN_LENGTH{print \">\"\$0}" | # remove short seqs awk 'BEGIN{RS=">";FS="\n"}!x[$1]++{print ">"$0}' | # remove duplicates grep -v '^>*$' > $GISAID_SARSCOV2_OUT From bc85d4b91dba2da69ffedf36853ac53011b92681 Mon Sep 17 00:00:00 2001 From: Brian Pardy Date: Wed, 26 Feb 2020 12:02:46 -0500 Subject: [PATCH 3/6] Add gisaid rule to Snakefile --- Snakefile | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Snakefile b/Snakefile index 5f8bb0ac3..365427d58 100644 --- a/Snakefile +++ b/Snakefile @@ -420,3 +420,19 @@ rule clean: "auspice" shell: "rm -rfv {params}" + +rule gisaid: + message: "Normalizing GISAID download" + input: + gisaid_fasta = "data/gisaid_cov2020_sequences.fasta" + output: + sequences = "data/sequences.fasta" + params: + min_length = 15000 + shell: + """ + if [[ ! 
-f "data/sequences.fasta" && -f "data/gisaid_cov2020_sequences.fasta" ]] + then + scripts/normalize_gisaid_fasta.sh data/gisaid_cov2020_sequences.fasta data/sequences.fasta {params.min_length} + fi + """ From 1cf4613c80f99aed6ae21ba8c2215d86a03d2859 Mon Sep 17 00:00:00 2001 From: Brian Pardy Date: Sun, 8 Mar 2020 13:38:57 -0400 Subject: [PATCH 4/6] Support GISAID renaming to hCoV-19 --- scripts/normalize_gisaid_fasta.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/normalize_gisaid_fasta.sh b/scripts/normalize_gisaid_fasta.sh index 8d4249214..4668daacd 100755 --- a/scripts/normalize_gisaid_fasta.sh +++ b/scripts/normalize_gisaid_fasta.sh @@ -25,7 +25,7 @@ echo "Normalizing GISAID file $GISAID_SARSCOV2_IN to $GISAID_SARSCOV2_OUT (min l # Eliminate duplicate sequences (keep only the first seen) #cat $GISAID_SARSCOV2_IN | - sed 's/^>BetaCoV\//>/gi' $GISAID_SARSCOV2_IN | # remove leading BetaCo[vV] + sed 's/^>hCoV-19\//>/gi' $GISAID_SARSCOV2_IN | # remove leading BetaCo[vV] sed 's/ //g' | # remove embedded spaces sed 's/|.*$//' | # remove trailing metadata awk "BEGIN{RS=\">\";FS=\"\n\"}length>$MIN_LENGTH{print \">\"\$0}" | # remove short seqs From 4f6c225fa0068cef7a012dd57bfbd158090a1dd2 Mon Sep 17 00:00:00 2001 From: Trevor Bedford Date: Sat, 14 Mar 2020 18:02:49 -0700 Subject: [PATCH 5/6] Fix typo in normalize_gisaid_fasta.sh --- scripts/normalize_gisaid_fasta.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/normalize_gisaid_fasta.sh b/scripts/normalize_gisaid_fasta.sh index 4668daacd..0f8271662 100755 --- a/scripts/normalize_gisaid_fasta.sh +++ b/scripts/normalize_gisaid_fasta.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -set -e +set -e GISAID_SARSCOV2_IN=$1 GISAID_SARSCOV2_OUT=$2 MIN_LENGTH=$3 @@ -24,8 +24,8 @@ echo "Normalizing GISAID file $GISAID_SARSCOV2_IN to $GISAID_SARSCOV2_OUT (min l # Remove sequences shorter than minimum length # Eliminate duplicate sequences (keep only the first seen) -#cat 
$GISAID_SARSCOV2_IN | - sed 's/^>hCoV-19\//>/gi' $GISAID_SARSCOV2_IN | # remove leading BetaCo[vV] +#cat $GISAID_SARSCOV2_IN | + sed 's/^>hCoV-19\//>/g' $GISAID_SARSCOV2_IN | # remove leading BetaCo[vV] sed 's/ //g' | # remove embedded spaces sed 's/|.*$//' | # remove trailing metadata awk "BEGIN{RS=\">\";FS=\"\n\"}length>$MIN_LENGTH{print \">\"\$0}" | # remove short seqs From a869ac3873f61150fba3882795cb0706c7cdf171 Mon Sep 17 00:00:00 2001 From: Trevor Bedford Date: Sat, 14 Mar 2020 18:14:18 -0700 Subject: [PATCH 6/6] Remove gisaid rule from snakefile Asking snakemake for "data/sequences.fasta" was throwing an error due to an ambiguous DAG. This file can be created by either rule download or rule gisaid. There might be a better solution here, but in order to merge, I'm just going to remove the gisaid rule from the Snakefile. --- Snakefile | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/Snakefile b/Snakefile index 365427d58..5f8bb0ac3 100644 --- a/Snakefile +++ b/Snakefile @@ -420,19 +420,3 @@ rule clean: "auspice" shell: "rm -rfv {params}" - -rule gisaid: - message: "Normalizing GISAID download" - input: - gisaid_fasta = "data/gisaid_cov2020_sequences.fasta" - output: - sequences = "data/sequences.fasta" - params: - min_length = 15000 - shell: - """ - if [[ ! -f "data/sequences.fasta" && -f "data/gisaid_cov2020_sequences.fasta" ]] - then - scripts/normalize_gisaid_fasta.sh data/gisaid_cov2020_sequences.fasta data/sequences.fasta {params.min_length} - fi - """