From 09c3ddb849c64201b301e0d11ef03bc57591684b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 3 May 2024 16:51:34 -0400 Subject: [PATCH] Add a script that can batch import and explain Lancet data in the November format --- scripts/prepare_lancet_output.sh | 89 ++++++++++++++++++++++++++++++++ scripts/prepare_local_chunk.sh | 64 +++++++++++------------ 2 files changed, 121 insertions(+), 32 deletions(-) create mode 100755 scripts/prepare_lancet_output.sh diff --git a/scripts/prepare_lancet_output.sh b/scripts/prepare_lancet_output.sh new file mode 100755 index 00000000..70327fa1 --- /dev/null +++ b/scripts/prepare_lancet_output.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +# prepare_lancet_input.sh: Turn a directory of Lancet output into a Tube Map BED file and directory that can be hosted on the web. +set -x + +function usage() { + echo >&2 "${0}: Prepare a directory of Lancet data ." + echo >&2 + echo >&2 "The input directory should have in it:" + echo >&2 "- variant_calling_regions.with_calls.bed, an extended BED file with biallelic VCF REF in column 6, ALT in column 7, and INFO in column 9 with a TYPE field" + echo >&2 "- giraffe_alignments/, a directory with, for each VCF record:" + echo >&2 " - __.giraffe.gbz" + echo >&2 " - normal.__.gam" + echo >&2 " - tumor.__.gam" + echo >&2 "Note that it is tolerated for the END used in the file names to not match the one used in the BED records." + echo >&2 + echo >&2 "Usage: ${0} " + exit 1 +} + +INPUT_DIR="${1}" +OUTPUT_DIR="${2}" + +if [[ -z "${OUTPUT_DIR}" || -z "${INPUT_DIR}" || ! -d "${INPUT_DIR}" ]] ; then + usage +fi + +# Make all the paths absolute +INPUT_DIR="$(realpath "${INPUT_DIR}")" +OUTPUT_DIR="$(realpath "${OUTPUT_DIR}")" + +INPUT_BED="${INPUT_DIR}/variant_calling_regions.with_calls.bed" +INPUT_DATA_DIR="${INPUT_DIR}/giraffe_alignments" + +if [[ ! -f "${INPUT_BED}" || ! -d "${INPUT_DATA_DIR}" ]] ; then + usage +fi + + +SCRIPTS_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" + +mkdir -p "${OUTPUT_DIR}" + +OUTPUT_BED="${OUTPUT_DIR}/index.bed" +rm -f "${OUTPUT_BED}" + +# We need to ber in the output directory to use relative paths in the generated BED. +cd "${OUTPUT_DIR}" + +while IFS="" read -r INPUT_LINE || [[ -n "${INPUT_LINE}" ]] ; do + # Read all the lines, even if the newline is missing. See + + # Parse the BED + CONTIG="$(echo "${INPUT_LINE}" | cut -f1)" + START_POS="$(echo "${INPUT_LINE}" | cut -f2)" + END_POS="$(echo "${INPUT_LINE}" | cut -f3)" + REF_SEQ="$(echo "${INPUT_LINE}" | cut -f6)" + ALT_SEQ="$(echo "${INPUT_LINE}" | cut -f7)" + INFO="$(echo "${INPUT_LINE}" | cut -f9)" + + # Fetch out the type (MNP, INS, etc.) + VARIANT_TYPE="$(echo "${INFO}" | grep -o "TYPE=[A-Z]*" | cut -f2 -d'=')" + + + # End positions on the variants are not actually the end positions used in + # the filenames. So find the right graph based only on the start position. + INPUT_GRAPH="$(find "${INPUT_DATA_DIR}" -name "${CONTIG}_${START_POS}_*.giraffe.gbz")" + INPUT_GRAPH_BASENAME="$(basename "${INPUT_GRAPH}")" + SLUG="${INPUT_GRAPH_BASENAME%.giraffe.gbz}" + + INPUT_NORMAL_GAM="${INPUT_DATA_DIR}/normal.${SLUG}.gam" + INPUT_TUMOR_GAM="${INPUT_DATA_DIR}/tumor.${SLUG}.gam" + + # Make the chunk directory + "${SCRIPTS_DIR}/prepare_local_chunk.sh" \ + -x "${INPUT_GRAPH}" \ + -g "${INPUT_NORMAL_GAM}" \ + -p '{"mainPalette": "blues", "auxPalette": "blues"}' \ + -g "${INPUT_TUMOR_GAM}" \ + -p '{"mainPalette": "reds", "auxPalette": "reds"}' \ + -r "${CONTIG}:${START_POS}-${END_POS}" \ + -d "Tumor-specific ${REF_SEQ} -> ${ALT_SEQ} ${VARIANT_TYPE} variant" \ + -o "${SLUG}" \ + >>"${OUTPUT_BED}" + + # We want to hide the path cover paths from Giraffe and only show the paths from the original Lancet GFA + vg paths --drop-paths --paths-by "path_cover_" -x "${SLUG}/chunk.vg" >"${SLUG}/chunk.vg.new" + mv "${SLUG}/chunk.vg.new" "${SLUG}/chunk.vg" + +done <"${INPUT_BED}" diff --git a/scripts/prepare_local_chunk.sh b/scripts/prepare_local_chunk.sh index 7734f0f2..63d813c5 100755 --- a/scripts/prepare_local_chunk.sh +++ b/scripts/prepare_local_chunk.sh @@ -15,11 +15,11 @@ NODE_COLORS=() while getopts x:g:p:r:o:d:n: flag do case "${flag}" in - x) GRAPH_FILE=${OPTARG};; + x) GRAPH_FILE="${OPTARG}";; g) GAM_FILES+=("$OPTARG");; p) GAM_PALETTES+=("$OPTARG");; - r) REGION=${OPTARG};; - o) OUTDIR=${OPTARG};; + r) REGION="${OPTARG}";; + o) OUTDIR="${OPTARG}";; d) DESC="${OPTARG}";; n) NODE_COLORS+=("${OPTARG}");; *) @@ -57,45 +57,45 @@ if [[ -z "${DESC}" ]] ; then DESC="Region ${REGION}" fi -echo >&2 "Graph File: " $GRAPH_FILE -echo >&2 "Region: " $REGION -echo >&2 "Output Directory: " $OUTDIR -echo >&2 "Node colors: " ${NODE_COLORS[@]} +echo >&2 "Graph File: $GRAPH_FILE" +echo >&2 "Region: $REGION" +echo >&2 "Output Directory: $OUTDIR" +echo >&2 "Node colors: ${NODE_COLORS[@]}" -rm -fr $OUTDIR -mkdir -p $OUTDIR +rm -fr "$OUTDIR" +mkdir -p "$OUTDIR" TEMP="$(mktemp -d)" # Covert GAF files to GAM for i in "${!GAM_FILES[@]}"; do - if [[ ${GAM_FILES[$i]} == *.gaf ]]; then + if [[ "${GAM_FILES[$i]}" == *.gaf ]]; then # Filename without path - filename=$(basename -- ${GAM_FILES[$i]}) + filename="$(basename -- "${GAM_FILES[$i]}")" # Remove file extension - filename=${filename%.*} - vg convert --gaf-to-gam ${GAM_FILES[$i]} ${GRAPH_FILE} > $TEMP/${filename}.gam + filename="${filename%.*}" + vg convert --gaf-to-gam "${GAM_FILES[$i]}" "${GRAPH_FILE}" > "$TEMP/${filename}.gam" GAM_FILES[$i]="$TEMP/${filename}.gam" fi done # Parse the region -REGION_END="$(echo ${REGION} | rev | cut -f1 -d'-' | rev)" -REGION_START="$(echo ${REGION} | rev | cut -f2 -d'-' | cut -f1 -d':' | rev)" -REGION_CONTIG="$(echo ${REGION} | rev| cut -f2- -d':' | rev)" +REGION_END="$(echo "${REGION}" | rev | cut -f1 -d'-' | rev)" +REGION_START="$(echo "${REGION}" | rev | cut -f2 -d'-' | cut -f1 -d':' | rev)" +REGION_CONTIG="$(echo "${REGION}" | rev| cut -f2- -d':' | rev)" # get path relative to directory above the scripts directory -GRAPH_FILE_PATH=$(realpath --relative-to $(dirname ${BASH_SOURCE[0]})/../ $GRAPH_FILE) +GRAPH_FILE_PATH="$(realpath --relative-to "$(dirname "${BASH_SOURCE[0]}")/../" "$GRAPH_FILE")" # construct track JSON for graph file -GRAPH_PALETTE="$(cat "$(dirname ${BASH_SOURCE[0]})/../src/config.json" | jq '.defaultGraphColorPalette')" -jq -n --arg trackFile "${GRAPH_FILE_PATH}" --arg trackType "graph" --argjson trackColorSettings "$GRAPH_PALETTE" '$ARGS.named' >> $OUTDIR/temp.json +GRAPH_PALETTE="$(cat "$(dirname "${BASH_SOURCE[0]}")/../src/config.json" | jq '.defaultGraphColorPalette')" +jq -n --arg trackFile "${GRAPH_FILE_PATH}" --arg trackType "graph" --argjson trackColorSettings "$GRAPH_PALETTE" '$ARGS.named' >> "$OUTDIR/temp.json" # Put the graphy file in place -vg convert -p "${GRAPH_FILE}" > $OUTDIR/chunk.vg +vg convert -p "${GRAPH_FILE}" > "$OUTDIR/chunk.vg" # Start the region BED inside the chunk -printf "${REGION_CONTIG}\t${REGION_START}\t${REGION_END}" > $OUTDIR/regions.tsv +printf "${REGION_CONTIG}\t${REGION_START}\t${REGION_END}" > "$OUTDIR/regions.tsv" echo >&2 "Gam Files:" @@ -107,9 +107,9 @@ for GAM_FILE in "${GAM_FILES[@]}"; do if [[ -z "${GAM_PALETTE}" ]] ; then GAM_PALETTE="${DEFAULT_READ_PALETTE}" fi - GAM_FILE_PATH=$(realpath --relative-to $(dirname ${BASH_SOURCE[0]})/../ $GAM_FILE) + GAM_FILE_PATH=$(realpath --relative-to "$(dirname "${BASH_SOURCE[0]}")/../" "$GAM_FILE") # construct track JSON for each gam file - jq -n --arg trackFile "${GAM_FILE_PATH}" --arg trackType "read" --argjson trackColorSettings "$GAM_PALETTE" '$ARGS.named' >> $OUTDIR/temp.json + jq -n --arg trackFile "${GAM_FILE_PATH}" --arg trackType "read" --argjson trackColorSettings "$GAM_PALETTE" '$ARGS.named' >> "$OUTDIR/temp.json" # Work out a chunk-internal GAM name with the same leading numbering vg chunk uses if [[ "${GAM_NUM}" == "0" ]] ; then GAM_LEADER="chunk" @@ -120,35 +120,35 @@ for GAM_FILE in "${GAM_FILES[@]}"; do # Put the chunk in place cp "${GAM_FILE}" "${GAM_CHUNK_NAME}" # List it in the regions TSV like vg would - printf "\t$(basename "${GAM_CHUNK_NAME}")" >> $OUTDIR/regions.tsv + printf "\t$(basename "${GAM_CHUNK_NAME}")" >> "$OUTDIR/regions.tsv" GAM_NUM=$((GAM_NUM + 1)) done # put all tracks objects into an array -(jq -s '.' < $OUTDIR/temp.json) > $OUTDIR/tracks.json +(jq -s '.' < "$OUTDIR/temp.json") > "$OUTDIR/tracks.json" -rm $OUTDIR/temp.json +rm "$OUTDIR/temp.json" # construct node file if [[ ! -z "${NODE_COLORS}" ]] ; then for NODENAME in "${NODE_COLORS[@]}"; do echo >&2 "Highlighted node: $NODENAME" - printf "$NODENAME\n" >> $OUTDIR/nodeColors.tsv + printf "$NODENAME\n" >> "$OUTDIR/nodeColors.tsv" done fi # Make the empty but required annotation file. We have no haplotypes to put in it. touch "${OUTDIR}/chunk_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.annotate.txt" -printf "\tchunk_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.annotate.txt\n" >> $OUTDIR/regions.tsv +printf "\tchunk_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.annotate.txt\n" >> "$OUTDIR/regions.tsv" -for file in `ls $OUTDIR/` +for file in `ls "$OUTDIR/"` do - printf "$file\n" >> $OUTDIR/chunk_contents.txt + printf "$file\n" >> "$OUTDIR/chunk_contents.txt" done # Print BED line -cat $OUTDIR/regions.tsv | cut -f1-3 | tr -d "\n" +cat "$OUTDIR/regions.tsv" | cut -f1-3 | tr -d "\n" printf "\t${DESC}\t${OUTDIR}\n" -rm -fr $TEMP +rm -fr "$TEMP"