Skip to content

Commit

Permalink
Merge pull request #59 from brianpardy/gisaid_fasta
Browse files Browse the repository at this point in the history
Add normalize_gisaid_fasta.sh for issue #53
  • Loading branch information
trvrb authored Mar 15, 2020
2 parents de8fd86 + a869ac3 commit 4ee884d
Showing 1 changed file with 35 additions and 0 deletions.
35 changes: 35 additions & 0 deletions scripts/normalize_gisaid_fasta.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env bash
#
# Normalize a GISAID SARS-CoV-2 FASTA file.
#
# Usage: normalize_gisaid_fasta.sh <input_fasta> <output_fasta> [min_length]
#   input_fasta   FASTA file to read
#   output_fasta  file the normalized FASTA is written to
#   min_length    optional length threshold; a default of 15000 is applied
#                 further down when it is omitted
#
# Strict mode:
#   -e           abort on an unhandled command failure
#   -u           treat expansion of an unset variable as an error
#   -o pipefail  a pipeline fails if any stage fails, not only the last one
set -euo pipefail

# Positional arguments default to the empty string so that `set -u` does not
# abort before the script can print its own diagnostics for missing arguments.
GISAID_SARSCOV2_IN=${1:-}
GISAID_SARSCOV2_OUT=${2:-}
MIN_LENGTH=${3:-}

# Validate the command-line arguments before doing any work.
# Diagnostics go to stderr so they never end up in captured stdout.

# Both the input and the output path are mandatory.
if [[ -z "$GISAID_SARSCOV2_IN" || -z "$GISAID_SARSCOV2_OUT" ]]; then
  echo "Usage: $0 <input_fasta> <output_fasta> [min_length]" >&2
  exit 1
fi

# The input must exist and be readable by the current user.
if [[ ! -r "$GISAID_SARSCOV2_IN" ]]; then
  echo "$0: input $GISAID_SARSCOV2_IN not found" >&2
  exit 1
fi

# Resolve the minimum-length threshold.
# An empty value selects the default. Anything else must be a plain
# non-negative integer, because the value is compared numerically by awk
# further down and a malformed value would silently change that comparison.
if [[ -z "$MIN_LENGTH" ]]; then
  echo "Using default minimum length of 15000"
  MIN_LENGTH=15000
elif [[ ! "$MIN_LENGTH" =~ ^[0-9]+$ ]]; then
  echo "$0: minimum length must be a non-negative integer, got '$MIN_LENGTH'" >&2
  exit 1
fi

echo "Normalizing GISAID file $GISAID_SARSCOV2_IN to $GISAID_SARSCOV2_OUT (min length $MIN_LENGTH)"

#######################################
# Normalize a GISAID FASTA file.
#
# Steps, in order:
#   1. sed: strip the leading 'hCoV-19/' from sequence names, remove every
#      space (e.g. "Hong Kong" names), and cut everything from the first '|'
#      to the end of the line (trailing |EPI_ISL_id|datestamp metadata).
#   2. awk: split the stream into records on '>', keep a record only if its
#      length exceeds the threshold and its name (first line of the record)
#      has not been seen before, so the first occurrence of a name wins.
#   3. grep: drop empty lines and lines consisting solely of '>' that the
#      record splitting leaves behind.
#
# NOTE(review): the length test is applied to the whole record, i.e. the
# name line and the newlines are counted together with the sequence, and the
# comparison is strict (>). Confirm whether a residue-only count is wanted.
#
# Arguments:
#   $1 - path of the FASTA file to read
#   $2 - path of the file to write (overwritten)
#   $3 - minimum record length
# Returns:
#   status of the pipeline; grep exits non-zero when no line survives.
#######################################
normalize_gisaid_fasta() {
  local fasta_in=$1
  local fasta_out=$2
  local min_length=$3

  # The threshold is handed to awk with -v instead of being spliced into the
  # program text, so it can never be parsed as awk code.
  sed \
    -e 's/^>hCoV-19\//>/' \
    -e 's/ //g' \
    -e 's/|.*$//' \
    -- "$fasta_in" |
    awk -v min="$min_length" '
      BEGIN { RS = ">"; FS = "\n" }
      length($0) > min && !seen[$1]++ { print ">" $0 }
    ' |
    grep -v '^>*$' > "$fasta_out"
}

normalize_gisaid_fasta "$GISAID_SARSCOV2_IN" "$GISAID_SARSCOV2_OUT" "$MIN_LENGTH"

# Report success explicitly. Because `set -e` is enabled near the top of the
# script, this line is not reached when the normalization pipeline returns a
# failure status.
exit 0

0 comments on commit 4ee884d

Please sign in to comment.