Add a script that can batch import and explain Lancet data in the Nov…

…ember format
vgteam · May 3, 2024 · 09c3ddb · 09c3ddb
1 parent 234a75c
commit 09c3ddb
Show file tree

Hide file tree

Showing 2 changed files with 121 additions and 32 deletions.
diff --git a/scripts/prepare_lancet_output.sh b/scripts/prepare_lancet_output.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+# prepare_lancet_input.sh: Turn a directory of Lancet output into a Tube Map BED file and directory that can be hosted on the web.
+set -x
+
+function usage() {
+    echo >&2 "${0}: Prepare a directory of Lancet data ."
+    echo >&2
+    echo >&2 "The input directory should have in it:"
+    echo >&2 "- variant_calling_regions.with_calls.bed, an extended BED file with biallelic VCF REF in column 6, ALT in column 7, and INFO in column 9 with a TYPE field"
+    echo >&2 "- giraffe_alignments/, a directory with, for each VCF record:"
+    echo >&2 "    - <CONTIG>_<START>_<END>.giraffe.gbz"
+    echo >&2 "    - normal.<CONTIG>_<START>_<END>.gam"
+    echo >&2 "    - tumor.<CONTIG>_<START>_<END>.gam"
+    echo >&2 "Note that it is tolerated for the END used in the file names to not match the one used in the BED records."
+    echo >&2
+    echo >&2 "Usage: ${0} <INPUT_DIR> <OUTPUT_DIR>"
+    exit 1
+}
+
+INPUT_DIR="${1}"
+OUTPUT_DIR="${2}"
+
+if [[ -z "${OUTPUT_DIR}" || -z "${INPUT_DIR}" || ! -d "${INPUT_DIR}" ]] ; then
+    usage
+fi
+
+# Make all the paths absolute
+INPUT_DIR="$(realpath "${INPUT_DIR}")"
+OUTPUT_DIR="$(realpath "${OUTPUT_DIR}")"
+
+INPUT_BED="${INPUT_DIR}/variant_calling_regions.with_calls.bed"
+INPUT_DATA_DIR="${INPUT_DIR}/giraffe_alignments"
+
+if [[ ! -f "${INPUT_BED}" || ! -d "${INPUT_DATA_DIR}" ]] ; then
+    usage
+fi
+
+
+SCRIPTS_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
+
+mkdir -p "${OUTPUT_DIR}"
+
+OUTPUT_BED="${OUTPUT_DIR}/index.bed"
+rm -f "${OUTPUT_BED}"
+
+# We need to ber in the output directory to use relative paths in the generated BED.
+cd "${OUTPUT_DIR}"
+
+while IFS="" read -r INPUT_LINE || [[ -n "${INPUT_LINE}" ]] ; do
+    # Read all the lines, even if the newline is missing. See <https://stackoverflow.com/a/1521498>
+
+    # Parse the BED
+    CONTIG="$(echo "${INPUT_LINE}" | cut -f1)"
+    START_POS="$(echo "${INPUT_LINE}" | cut -f2)"
+    END_POS="$(echo "${INPUT_LINE}" | cut -f3)"
+    REF_SEQ="$(echo "${INPUT_LINE}" | cut -f6)"
+    ALT_SEQ="$(echo "${INPUT_LINE}" | cut -f7)"
+    INFO="$(echo "${INPUT_LINE}" | cut -f9)"
+
+    # Fetch out the type (MNP, INS, etc.)
+    VARIANT_TYPE="$(echo "${INFO}" | grep -o "TYPE=[A-Z]*" | cut -f2 -d'=')"
+
+
+    # End positions on the variants are not actually the end positions used in
+    # the filenames. So find the right graph based only on the start position.
+    INPUT_GRAPH="$(find "${INPUT_DATA_DIR}" -name "${CONTIG}_${START_POS}_*.giraffe.gbz")"
+    INPUT_GRAPH_BASENAME="$(basename "${INPUT_GRAPH}")"
+    SLUG="${INPUT_GRAPH_BASENAME%.giraffe.gbz}"
+
+    INPUT_NORMAL_GAM="${INPUT_DATA_DIR}/normal.${SLUG}.gam"
+    INPUT_TUMOR_GAM="${INPUT_DATA_DIR}/tumor.${SLUG}.gam"
+
+    # Make the chunk directory
+    "${SCRIPTS_DIR}/prepare_local_chunk.sh" \
+         -x "${INPUT_GRAPH}" \
+         -g "${INPUT_NORMAL_GAM}" \
+         -p '{"mainPalette": "blues", "auxPalette": "blues"}' \
+         -g "${INPUT_TUMOR_GAM}" \
+         -p '{"mainPalette": "reds", "auxPalette": "reds"}' \
+         -r "${CONTIG}:${START_POS}-${END_POS}" \
+         -d "Tumor-specific ${REF_SEQ} -> ${ALT_SEQ} ${VARIANT_TYPE} variant" \
+         -o "${SLUG}" \
+         >>"${OUTPUT_BED}"
+
+    # We want to hide the path cover paths from Giraffe and only show the paths from the original Lancet GFA
+    vg paths --drop-paths --paths-by "path_cover_" -x "${SLUG}/chunk.vg" >"${SLUG}/chunk.vg.new"
+    mv "${SLUG}/chunk.vg.new" "${SLUG}/chunk.vg"
+
+done <"${INPUT_BED}"
diff --git a/scripts/prepare_local_chunk.sh b/scripts/prepare_local_chunk.sh
@@ -15,11 +15,11 @@ NODE_COLORS=()
 while getopts x:g:p:r:o:d:n: flag
 do
     case "${flag}" in
-        x) GRAPH_FILE=${OPTARG};;
+        x) GRAPH_FILE="${OPTARG}";;
         g) GAM_FILES+=("$OPTARG");;
         p) GAM_PALETTES+=("$OPTARG");;
-        r) REGION=${OPTARG};;
-        o) OUTDIR=${OPTARG};;
+        r) REGION="${OPTARG}";;
+        o) OUTDIR="${OPTARG}";;
         d) DESC="${OPTARG}";;
         n) NODE_COLORS+=("${OPTARG}");;
         *)
@@ -57,45 +57,45 @@ if [[ -z "${DESC}" ]] ; then
     DESC="Region ${REGION}"
 fi
 
-echo >&2 "Graph File: " $GRAPH_FILE
-echo >&2 "Region: " $REGION
-echo >&2 "Output Directory: " $OUTDIR
-echo >&2 "Node colors: " ${NODE_COLORS[@]}
+echo >&2 "Graph File: $GRAPH_FILE"
+echo >&2 "Region: $REGION"
+echo >&2 "Output Directory: $OUTDIR"
+echo >&2 "Node colors: ${NODE_COLORS[@]}"
 
-rm -fr $OUTDIR
-mkdir -p $OUTDIR
+rm -fr "$OUTDIR"
+mkdir -p "$OUTDIR"
 
 TEMP="$(mktemp -d)"
 
 
 # Covert GAF files to GAM
 for i in "${!GAM_FILES[@]}"; do
-    if [[ ${GAM_FILES[$i]} == *.gaf ]]; then
+    if [[ "${GAM_FILES[$i]}" == *.gaf ]]; then
         # Filename without path
-        filename=$(basename -- ${GAM_FILES[$i]})
+        filename="$(basename -- "${GAM_FILES[$i]}")"
         # Remove file extension
-        filename=${filename%.*}
-        vg convert --gaf-to-gam ${GAM_FILES[$i]} ${GRAPH_FILE} > $TEMP/${filename}.gam
+        filename="${filename%.*}"
+        vg convert --gaf-to-gam "${GAM_FILES[$i]}" "${GRAPH_FILE}" > "$TEMP/${filename}.gam"
         GAM_FILES[$i]="$TEMP/${filename}.gam"
     fi
 done
 
 # Parse the region
-REGION_END="$(echo ${REGION} | rev | cut -f1 -d'-' | rev)"
-REGION_START="$(echo ${REGION} | rev | cut -f2 -d'-' | cut -f1 -d':' | rev)"
-REGION_CONTIG="$(echo ${REGION} | rev| cut -f2- -d':' | rev)"
+REGION_END="$(echo "${REGION}" | rev | cut -f1 -d'-' | rev)"
+REGION_START="$(echo "${REGION}" | rev | cut -f2 -d'-' | cut -f1 -d':' | rev)"
+REGION_CONTIG="$(echo "${REGION}" | rev| cut -f2- -d':' | rev)"
 
 # get path relative to directory above the scripts directory
-GRAPH_FILE_PATH=$(realpath --relative-to $(dirname ${BASH_SOURCE[0]})/../ $GRAPH_FILE)
+GRAPH_FILE_PATH="$(realpath --relative-to "$(dirname "${BASH_SOURCE[0]}")/../" "$GRAPH_FILE")"
 
 # construct track JSON for graph file
-GRAPH_PALETTE="$(cat "$(dirname ${BASH_SOURCE[0]})/../src/config.json" | jq '.defaultGraphColorPalette')"
-jq -n --arg trackFile "${GRAPH_FILE_PATH}" --arg trackType "graph" --argjson trackColorSettings "$GRAPH_PALETTE" '$ARGS.named' >> $OUTDIR/temp.json
+GRAPH_PALETTE="$(cat "$(dirname "${BASH_SOURCE[0]}")/../src/config.json" | jq '.defaultGraphColorPalette')"
+jq -n --arg trackFile "${GRAPH_FILE_PATH}" --arg trackType "graph" --argjson trackColorSettings "$GRAPH_PALETTE" '$ARGS.named' >> "$OUTDIR/temp.json"
 
 # Put the graphy file in place
-vg convert -p "${GRAPH_FILE}" > $OUTDIR/chunk.vg
+vg convert -p "${GRAPH_FILE}" > "$OUTDIR/chunk.vg"
 # Start the region BED inside the chunk
-printf "${REGION_CONTIG}\t${REGION_START}\t${REGION_END}" > $OUTDIR/regions.tsv
+printf "${REGION_CONTIG}\t${REGION_START}\t${REGION_END}" > "$OUTDIR/regions.tsv"
 
 
 echo >&2 "Gam Files:"
@@ -107,9 +107,9 @@ for GAM_FILE in "${GAM_FILES[@]}"; do
     if [[ -z "${GAM_PALETTE}" ]] ; then
         GAM_PALETTE="${DEFAULT_READ_PALETTE}"
     fi
-    GAM_FILE_PATH=$(realpath --relative-to $(dirname ${BASH_SOURCE[0]})/../ $GAM_FILE)
+    GAM_FILE_PATH=$(realpath --relative-to "$(dirname "${BASH_SOURCE[0]}")/../" "$GAM_FILE")
     # construct track JSON for each gam file
-    jq -n --arg trackFile "${GAM_FILE_PATH}" --arg trackType "read" --argjson trackColorSettings "$GAM_PALETTE" '$ARGS.named' >> $OUTDIR/temp.json
+    jq -n --arg trackFile "${GAM_FILE_PATH}" --arg trackType "read" --argjson trackColorSettings "$GAM_PALETTE" '$ARGS.named' >> "$OUTDIR/temp.json"
     # Work out a chunk-internal GAM name with the same leading numbering vg chunk uses
     if [[ "${GAM_NUM}" == "0" ]] ; then
         GAM_LEADER="chunk"
@@ -120,35 +120,35 @@ for GAM_FILE in "${GAM_FILES[@]}"; do
     # Put the chunk in place
     cp "${GAM_FILE}" "${GAM_CHUNK_NAME}"
     # List it in the regions TSV like vg would
-    printf "\t$(basename "${GAM_CHUNK_NAME}")" >> $OUTDIR/regions.tsv
+    printf "\t$(basename "${GAM_CHUNK_NAME}")" >> "$OUTDIR/regions.tsv"
     GAM_NUM=$((GAM_NUM + 1))
 done
 
 # put all tracks objects into an array
-(jq -s '.' < $OUTDIR/temp.json) > $OUTDIR/tracks.json
+(jq -s '.' < "$OUTDIR/temp.json") > "$OUTDIR/tracks.json"
 
-rm $OUTDIR/temp.json
+rm "$OUTDIR/temp.json"
 
 # construct node file
 if [[ ! -z "${NODE_COLORS}" ]] ; then
     for NODENAME in "${NODE_COLORS[@]}"; do
         echo >&2 "Highlighted node: $NODENAME"
-        printf "$NODENAME\n" >> $OUTDIR/nodeColors.tsv
+        printf "$NODENAME\n" >> "$OUTDIR/nodeColors.tsv"
     done
 fi
 
 # Make the empty but required annotation file. We have no haplotypes to put in it.
 touch "${OUTDIR}/chunk_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.annotate.txt"
-printf "\tchunk_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.annotate.txt\n" >> $OUTDIR/regions.tsv
+printf "\tchunk_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.annotate.txt\n" >> "$OUTDIR/regions.tsv"
 
-for file in `ls $OUTDIR/`
+for file in `ls "$OUTDIR/"`
 do
-    printf "$file\n" >> $OUTDIR/chunk_contents.txt
+    printf "$file\n" >> "$OUTDIR/chunk_contents.txt"
 done
 
 # Print BED line
-cat $OUTDIR/regions.tsv | cut -f1-3 | tr -d "\n"
+cat "$OUTDIR/regions.tsv" | cut -f1-3 | tr -d "\n"
 printf "\t${DESC}\t${OUTDIR}\n"
 
-rm -fr $TEMP
+rm -fr "$TEMP"