Skip to content

Commit

Permalink
Add a script that can batch import and explain Lancet data in the Nov…
Browse files Browse the repository at this point in the history
…ember format
  • Loading branch information
adamnovak committed May 3, 2024
1 parent 234a75c commit 09c3ddb
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 32 deletions.
89 changes: 89 additions & 0 deletions scripts/prepare_lancet_output.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/usr/bin/env bash
# prepare_lancet_input.sh: Turn a directory of Lancet output into a Tube Map BED file and directory that can be hosted on the web.
set -x

function usage() {
echo >&2 "${0}: Prepare a directory of Lancet data ."
echo >&2
echo >&2 "The input directory should have in it:"
echo >&2 "- variant_calling_regions.with_calls.bed, an extended BED file with biallelic VCF REF in column 6, ALT in column 7, and INFO in column 9 with a TYPE field"
echo >&2 "- giraffe_alignments/, a directory with, for each VCF record:"
echo >&2 " - <CONTIG>_<START>_<END>.giraffe.gbz"
echo >&2 " - normal.<CONTIG>_<START>_<END>.gam"
echo >&2 " - tumor.<CONTIG>_<START>_<END>.gam"
echo >&2 "Note that it is tolerated for the END used in the file names to not match the one used in the BED records."
echo >&2
echo >&2 "Usage: ${0} <INPUT_DIR> <OUTPUT_DIR>"
exit 1
}

INPUT_DIR="${1}"
OUTPUT_DIR="${2}"

if [[ -z "${OUTPUT_DIR}" || -z "${INPUT_DIR}" || ! -d "${INPUT_DIR}" ]] ; then
usage
fi

# Make all the paths absolute
INPUT_DIR="$(realpath "${INPUT_DIR}")"
OUTPUT_DIR="$(realpath "${OUTPUT_DIR}")"

INPUT_BED="${INPUT_DIR}/variant_calling_regions.with_calls.bed"
INPUT_DATA_DIR="${INPUT_DIR}/giraffe_alignments"

if [[ ! -f "${INPUT_BED}" || ! -d "${INPUT_DATA_DIR}" ]] ; then
usage
fi


SCRIPTS_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"

mkdir -p "${OUTPUT_DIR}"

OUTPUT_BED="${OUTPUT_DIR}/index.bed"
rm -f "${OUTPUT_BED}"

# We need to ber in the output directory to use relative paths in the generated BED.
cd "${OUTPUT_DIR}"

while IFS="" read -r INPUT_LINE || [[ -n "${INPUT_LINE}" ]] ; do
# Read all the lines, even if the newline is missing. See <https://stackoverflow.com/a/1521498>

# Parse the BED
CONTIG="$(echo "${INPUT_LINE}" | cut -f1)"
START_POS="$(echo "${INPUT_LINE}" | cut -f2)"
END_POS="$(echo "${INPUT_LINE}" | cut -f3)"
REF_SEQ="$(echo "${INPUT_LINE}" | cut -f6)"
ALT_SEQ="$(echo "${INPUT_LINE}" | cut -f7)"
INFO="$(echo "${INPUT_LINE}" | cut -f9)"

# Fetch out the type (MNP, INS, etc.)
VARIANT_TYPE="$(echo "${INFO}" | grep -o "TYPE=[A-Z]*" | cut -f2 -d'=')"


# End positions on the variants are not actually the end positions used in
# the filenames. So find the right graph based only on the start position.
INPUT_GRAPH="$(find "${INPUT_DATA_DIR}" -name "${CONTIG}_${START_POS}_*.giraffe.gbz")"
INPUT_GRAPH_BASENAME="$(basename "${INPUT_GRAPH}")"
SLUG="${INPUT_GRAPH_BASENAME%.giraffe.gbz}"

INPUT_NORMAL_GAM="${INPUT_DATA_DIR}/normal.${SLUG}.gam"
INPUT_TUMOR_GAM="${INPUT_DATA_DIR}/tumor.${SLUG}.gam"

# Make the chunk directory
"${SCRIPTS_DIR}/prepare_local_chunk.sh" \
-x "${INPUT_GRAPH}" \
-g "${INPUT_NORMAL_GAM}" \
-p '{"mainPalette": "blues", "auxPalette": "blues"}' \
-g "${INPUT_TUMOR_GAM}" \
-p '{"mainPalette": "reds", "auxPalette": "reds"}' \
-r "${CONTIG}:${START_POS}-${END_POS}" \
-d "Tumor-specific ${REF_SEQ} -> ${ALT_SEQ} ${VARIANT_TYPE} variant" \
-o "${SLUG}" \
>>"${OUTPUT_BED}"

# We want to hide the path cover paths from Giraffe and only show the paths from the original Lancet GFA
vg paths --drop-paths --paths-by "path_cover_" -x "${SLUG}/chunk.vg" >"${SLUG}/chunk.vg.new"
mv "${SLUG}/chunk.vg.new" "${SLUG}/chunk.vg"

done <"${INPUT_BED}"
64 changes: 32 additions & 32 deletions scripts/prepare_local_chunk.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ NODE_COLORS=()
while getopts x:g:p:r:o:d:n: flag
do
case "${flag}" in
x) GRAPH_FILE=${OPTARG};;
x) GRAPH_FILE="${OPTARG}";;
g) GAM_FILES+=("$OPTARG");;
p) GAM_PALETTES+=("$OPTARG");;
r) REGION=${OPTARG};;
o) OUTDIR=${OPTARG};;
r) REGION="${OPTARG}";;
o) OUTDIR="${OPTARG}";;
d) DESC="${OPTARG}";;
n) NODE_COLORS+=("${OPTARG}");;
*)
Expand Down Expand Up @@ -57,45 +57,45 @@ if [[ -z "${DESC}" ]] ; then
DESC="Region ${REGION}"
fi

echo >&2 "Graph File: " $GRAPH_FILE
echo >&2 "Region: " $REGION
echo >&2 "Output Directory: " $OUTDIR
echo >&2 "Node colors: " ${NODE_COLORS[@]}
echo >&2 "Graph File: $GRAPH_FILE"
echo >&2 "Region: $REGION"
echo >&2 "Output Directory: $OUTDIR"
echo >&2 "Node colors: ${NODE_COLORS[@]}"

rm -fr $OUTDIR
mkdir -p $OUTDIR
rm -fr "$OUTDIR"
mkdir -p "$OUTDIR"

TEMP="$(mktemp -d)"


# Covert GAF files to GAM
for i in "${!GAM_FILES[@]}"; do
if [[ ${GAM_FILES[$i]} == *.gaf ]]; then
if [[ "${GAM_FILES[$i]}" == *.gaf ]]; then
# Filename without path
filename=$(basename -- ${GAM_FILES[$i]})
filename="$(basename -- "${GAM_FILES[$i]}")"
# Remove file extension
filename=${filename%.*}
vg convert --gaf-to-gam ${GAM_FILES[$i]} ${GRAPH_FILE} > $TEMP/${filename}.gam
filename="${filename%.*}"
vg convert --gaf-to-gam "${GAM_FILES[$i]}" "${GRAPH_FILE}" > "$TEMP/${filename}.gam"
GAM_FILES[$i]="$TEMP/${filename}.gam"
fi
done

# Parse the region
REGION_END="$(echo ${REGION} | rev | cut -f1 -d'-' | rev)"
REGION_START="$(echo ${REGION} | rev | cut -f2 -d'-' | cut -f1 -d':' | rev)"
REGION_CONTIG="$(echo ${REGION} | rev| cut -f2- -d':' | rev)"
REGION_END="$(echo "${REGION}" | rev | cut -f1 -d'-' | rev)"
REGION_START="$(echo "${REGION}" | rev | cut -f2 -d'-' | cut -f1 -d':' | rev)"
REGION_CONTIG="$(echo "${REGION}" | rev| cut -f2- -d':' | rev)"

# get path relative to directory above the scripts directory
GRAPH_FILE_PATH=$(realpath --relative-to $(dirname ${BASH_SOURCE[0]})/../ $GRAPH_FILE)
GRAPH_FILE_PATH="$(realpath --relative-to "$(dirname "${BASH_SOURCE[0]}")/../" "$GRAPH_FILE")"

# construct track JSON for graph file
GRAPH_PALETTE="$(cat "$(dirname ${BASH_SOURCE[0]})/../src/config.json" | jq '.defaultGraphColorPalette')"
jq -n --arg trackFile "${GRAPH_FILE_PATH}" --arg trackType "graph" --argjson trackColorSettings "$GRAPH_PALETTE" '$ARGS.named' >> $OUTDIR/temp.json
GRAPH_PALETTE="$(cat "$(dirname "${BASH_SOURCE[0]}")/../src/config.json" | jq '.defaultGraphColorPalette')"
jq -n --arg trackFile "${GRAPH_FILE_PATH}" --arg trackType "graph" --argjson trackColorSettings "$GRAPH_PALETTE" '$ARGS.named' >> "$OUTDIR/temp.json"

# Put the graphy file in place
vg convert -p "${GRAPH_FILE}" > $OUTDIR/chunk.vg
vg convert -p "${GRAPH_FILE}" > "$OUTDIR/chunk.vg"
# Start the region BED inside the chunk
printf "${REGION_CONTIG}\t${REGION_START}\t${REGION_END}" > $OUTDIR/regions.tsv
printf "${REGION_CONTIG}\t${REGION_START}\t${REGION_END}" > "$OUTDIR/regions.tsv"


echo >&2 "Gam Files:"
Expand All @@ -107,9 +107,9 @@ for GAM_FILE in "${GAM_FILES[@]}"; do
if [[ -z "${GAM_PALETTE}" ]] ; then
GAM_PALETTE="${DEFAULT_READ_PALETTE}"
fi
GAM_FILE_PATH=$(realpath --relative-to $(dirname ${BASH_SOURCE[0]})/../ $GAM_FILE)
GAM_FILE_PATH=$(realpath --relative-to "$(dirname "${BASH_SOURCE[0]}")/../" "$GAM_FILE")
# construct track JSON for each gam file
jq -n --arg trackFile "${GAM_FILE_PATH}" --arg trackType "read" --argjson trackColorSettings "$GAM_PALETTE" '$ARGS.named' >> $OUTDIR/temp.json
jq -n --arg trackFile "${GAM_FILE_PATH}" --arg trackType "read" --argjson trackColorSettings "$GAM_PALETTE" '$ARGS.named' >> "$OUTDIR/temp.json"
# Work out a chunk-internal GAM name with the same leading numbering vg chunk uses
if [[ "${GAM_NUM}" == "0" ]] ; then
GAM_LEADER="chunk"
Expand All @@ -120,35 +120,35 @@ for GAM_FILE in "${GAM_FILES[@]}"; do
# Put the chunk in place
cp "${GAM_FILE}" "${GAM_CHUNK_NAME}"
# List it in the regions TSV like vg would
printf "\t$(basename "${GAM_CHUNK_NAME}")" >> $OUTDIR/regions.tsv
printf "\t$(basename "${GAM_CHUNK_NAME}")" >> "$OUTDIR/regions.tsv"
GAM_NUM=$((GAM_NUM + 1))
done

# put all tracks objects into an array
(jq -s '.' < $OUTDIR/temp.json) > $OUTDIR/tracks.json
(jq -s '.' < "$OUTDIR/temp.json") > "$OUTDIR/tracks.json"

rm $OUTDIR/temp.json
rm "$OUTDIR/temp.json"

# construct node file
if [[ ! -z "${NODE_COLORS}" ]] ; then
for NODENAME in "${NODE_COLORS[@]}"; do
echo >&2 "Highlighted node: $NODENAME"
printf "$NODENAME\n" >> $OUTDIR/nodeColors.tsv
printf "$NODENAME\n" >> "$OUTDIR/nodeColors.tsv"
done
fi

# Make the empty but required annotation file. We have no haplotypes to put in it.
touch "${OUTDIR}/chunk_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.annotate.txt"
printf "\tchunk_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.annotate.txt\n" >> $OUTDIR/regions.tsv
printf "\tchunk_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.annotate.txt\n" >> "$OUTDIR/regions.tsv"

for file in `ls $OUTDIR/`
for file in `ls "$OUTDIR/"`
do
printf "$file\n" >> $OUTDIR/chunk_contents.txt
printf "$file\n" >> "$OUTDIR/chunk_contents.txt"
done

# Print BED line
cat $OUTDIR/regions.tsv | cut -f1-3 | tr -d "\n"
cat "$OUTDIR/regions.tsv" | cut -f1-3 | tr -d "\n"
printf "\t${DESC}\t${OUTDIR}\n"

rm -fr $TEMP
rm -fr "$TEMP"

0 comments on commit 09c3ddb

Please sign in to comment.