Merge pull request #337 from vgteam/graph-to-chunk

Fix track JSON load, make sure chunk ref path is first, and add script to help make a chunk from a graph
vgteam · Sep 15, 2023 · 938683f · 938683f
2 parents 6267403 + 0cfc9b3
commit 938683f
Show file tree

Hide file tree

Showing 8 changed files with 249 additions and 43 deletions.
diff --git a/README.md b/README.md
@@ -137,28 +137,51 @@ That can sometimes up to 10-20 seconds.
 If you already know of regions/subgraphs that you will be looking at, you can pre-fetch the data in advance. 
 This will save some time during the interactive visualization, especially if there are a lot of regions to visualize.
 
-This is a 2 step process that involves creating the chunk and linking it to a bed file
+The net result needs to be one or more chunk directories on disk, referenced from a BED file.
 
-1. 
-The subgraphs need to be pre-fetched using `vg chunk` like shown in [`prepare_chunks.sh`](scripts/prepare_chunks.sh). For example:
+To generate each chunk, you can use the `prepare_chunks.sh` script. You ought to run it from the directory containing your input files and where your output chunks will be stored (i.e. the `dataPath` in `sequenceTubeMap/src/config.json`), which defaults to the `exampleData` directory in the repo.
+
+For example:
 
 ```
-./prepare_chunk.sh -x mygraph.xg -h mygraph.gbwt -r chr1:1-100 -o chunk-chr1-1-100 -g mygam1.gam -g mygam2.gam
+cd exampleData/
+../scripts/prepare_chunks.sh -x mygraph.xg -h mygraph.gbwt -r chr1:1-100 -d 'Region A' -o chunk-chr1-1-100 -g mygam1.gam -g mygam2.gam >> mychunks.bed
+../scripts/prepare_chunks.sh -x mygraph.xg -h mygraph.gbwt -r chr1:101-200 -d 'Region B' -o chunk-chr1-101-200 -g mygam1.gam -g mygam2.gam >> mychunks.bed
 ```
 
-2.
-Then compile those regions in a BED file with two additional columns: 
+The BED file linking to the chunks has two additional nonstandard columns: 
 
 - a description of the region (column 4)
 - the path to the output directory of the chunk, `chunk-chr1-1-100` in the example above, (column 5). 
 
 ```
-ref	1	10	region one to ten	chunk-ref-1-20
-ref	10	20	region ten to twenty	chunk-ref-1-20
+chr1	1	100	Region A	chunk-chr1-1-100
+chr1	101	200	Region B	chunk-chr1-101-200
 ```
 Note that each column is separated by tabs.
 
-This BED file will be read if placed in the `dataPath` directory, like for other files to mount (see above).
+This BED file needs to be in the `dataPath` directory, or it can be hosted on the web along with its chunk directories and accessed via URL.
+
+##### Pre-made subgraphs
+
+You may want to look at a graph that has already been extracted from a larger graph.
+To support this, there is a `prepare_local_chunk.sh` script, which takes a subgraph rather than a full graph.
+It supports most of the options that `prepare_chunks.sh` does, with the notable exception of haplotype files.
+It assumes that the graph represents some region along some reference path that is present in the graph, and expects that region to be provided with the `-r` option.
+It assumes that path names in the subgraph *don't* use subregion suffixes (bracket-enclosed numbers).
+The path name used in the region should *exactly* match the name of one of the paths in the graph.
+
+For example, you can run it like:
+
+```
+cd exampleData/
+../scripts/prepare_local_chunk.sh -x subgraph.gfa -r chr5:1023911-1025911 -g subgraph_reads.gam -g other_sample_reads.gam -o subgraph1 >> subgraphs.bed
+```
+
+If the original subgraph file does not remain in place under the configured `dataPath` and accessible by the tube map, errors may occur complaining that it couldn't be accessed when the tube map attempts to list its contained paths.
+
+The net result will be that you can select the BED file, select the region it specifies, and view a precomputed view of the subgraph, with coordinates computed assuming it covers the region provided to `prepare_local_chunk.sh`.
+
 
 #### Development Mode
 

diff --git a/docker/config.json b/docker/config.json
@@ -51,14 +51,14 @@
   "internalDataPath": "./exampleData/internal",
 
   "defaultHaplotypeColorPalette" : {
-    "mainPalette": "ygreys", 
-    "auxPalette": "ygreys", 
+    "mainPalette": "greys", 
+    "auxPalette": "greys", 
     "colorReadsByMappingQuality": false
   },
 
   "defaultReadColorPalette" : {
-    "mainPalette": "reds", 
-    "auxPalette": "blues", 
+    "mainPalette": "blues", 
+    "auxPalette": "reds", 
     "colorReadsByMappingQuality": false
   },
 

diff --git a/exampleData/chunk-ref-1-20/tracks.json b/exampleData/chunk-ref-1-20/tracks.json
@@ -1,15 +1,15 @@
 {
     "trackFile": "cactus.vg", 
     "trackType": "graph", 
-    "trackColorSettings": {"mainPalette": "greys", "auxPalette": "ygreys"}
+    "trackColorSettings": {"mainPalette": "plainColors", "auxPalette": "greys"}
 },
 {
     "trackFile": "cactus0_10.sorted.gam", 
     "trackType": "read", 
-    "trackColorSettings": {"mainPalette": "greys", "auxPalette": "ygreys"}
+    "trackColorSettings": {"mainPalette": "blues", "auxPalette": "reds"}
 },
 {
     "trackFile": "cactus10_20.sorted.gam", 
     "trackType": "read", 
-    "trackColorSettings": {"mainPalette": "greys", "auxPalette": "ygreys"}
-}
+    "trackColorSettings": {"mainPalette": "blues", "auxPalette": "reds"}
+}
diff --git a/scripts/prepare_chunks.sh b/scripts/prepare_chunks.sh
@@ -1,53 +1,91 @@
 #!/usr/bin/env bash
 set -e
 
-while getopts x:h:g:r:o: flag
+function usage() {
+    echo >&2 "${0}: Extract graph and read chunks for a region, producing a referencing line for a BED file on standard output"
+    echo >&2
+    echo >&2 "Usage: ${0} -x mygraph.xg [-h mygraph.gbwt] -r chr1:1-100 [-d 'Description of region'] -o chunk-chr1-1-100 [-g mygam1.gam [-g mygam2.gam ...]] >> regions.bed"
+    exit 1
+}
+
+while getopts x:h:g:r:o:d: flag
 do
     case "${flag}" in
-        x) XG_FILE=${OPTARG};;
-        h) GBWT=${OPTARG};;
+        x) GRAPH_FILE=${OPTARG};;
+        h) HAPLOTYPE_FILE=${OPTARG};;
         g) GAM_FILES+=("$OPTARG");;
         r) REGION=${OPTARG};;
         o) OUTDIR=${OPTARG};;
+        d) DESC="${OPTARG}";;
+        *)
+            usage
+            ;;
+
     esac
 done
 
 if ! command -v jq &> /dev/null
 then
-    echo "This script requires jq, exiting..."
-    exit
+    echo >&2 "This script requires jq, exiting..."
+    exit 1
+fi
+
+if [[ -z "${REGION}" ]] ; then
+    echo >&2 "You must specify a region with -r"
+    echo >&2
+    usage
+fi
+
+if [[ -z "${GRAPH_FILE}" ]] ; then
+    echo >&2 "You must specify a graph with -x"
+    echo >&2
+    usage
 fi
 
-echo "XG File: " $XG_FILE
-echo "Haplotype File: " $GBWT
-echo "Region: " $REGION
-echo "Output Directory: " $OUTDIR
+if [[ -z "${OUTDIR}" ]] ; then
+    echo >&2 "You must specify an output directory with -o"
+    echo >&2
+    usage
+fi
+
+if [[ -z "${DESC}" ]] ; then
+    DESC="Region ${REGION}"
+fi
+
+echo >&2 "Graph File: " $GRAPH_FILE
+echo >&2 "Haplotype File: " $HAPLOTYPE_FILE
+echo >&2 "Region: " $REGION
+echo >&2 "Output Directory: " $OUTDIR
 
 rm -fr $OUTDIR
 mkdir -p $OUTDIR
 
-vg_chunk_params="-x $XG_FILE -g -c 20 -p $REGION -T -b $OUTDIR/chunk -E $OUTDIR/regions.tsv"
+vg_chunk_params=(-x $GRAPH_FILE -g -c 20 -p $REGION -T -b $OUTDIR/chunk -E $OUTDIR/regions.tsv)
 
-# construct track JSON for xg file
-jq -n --arg trackFile "${XG_FILE}" --arg trackType "graph" --argjson trackColorSettings '{"mainPalette": "greys", "auxPalette": "ygreys"}' '$ARGS.named' >> $OUTDIR/tracks.json
+# construct track JSON for graph file
+jq -n --arg trackFile "${GRAPH_FILE}" --arg trackType "graph" --argjson trackColorSettings '{"mainPalette": "plainColors", "auxPalette": "greys"}' '$ARGS.named' >> $OUTDIR/tracks.json
 
-# construct track JSON for gbwt file; if not any specific gbwt file, then default would be haplotype
-if [[ ! -z "${GBWT}" ]] ; then
-    jq -n --arg trackFile "${GBWT}" --arg trackType "haplotype" --argjson trackColorSettings '{"mainPalette": "blues", "auxPalette": "reds"}' '$ARGS.named' >> $OUTDIR/tracks.json
+# construct track JSON for haplotype file, if provided
+if [[ ! -z "${HAPLOTYPE_FILE}" ]] ; then
+    jq -n --arg trackFile "${HAPLOTYPE_FILE}" --arg trackType "haplotype" --argjson trackColorSettings '{"mainPalette": "blues", "auxPalette": "reds"}' '$ARGS.named' >> $OUTDIR/tracks.json
 fi
 
 # construct track JSON for each gam file
-echo "Gam Files:"
+echo >&2 "Gam Files:"
 for GAM_FILE in "${GAM_FILES[@]}"; do
-    echo " - $GAM_FILE"
+    echo >&2 " - $GAM_FILE"
     jq -n --arg trackFile "${GAM_FILE}" --arg trackType "read" --argjson trackColorSettings '{"mainPalette": "blues", "auxPalette": "reds"}' '$ARGS.named' >> $OUTDIR/tracks.json
-    vg_chunk_params=" $vg_chunk_params -a $GAM_FILE"
+    vg_chunk_params+=(-a $GAM_FILE)
 done
 
 # Call vg chunk
-vg chunk $vg_chunk_params > $OUTDIR/chunk.vg
+vg chunk "${vg_chunk_params[@]}" > $OUTDIR/chunk.vg
 
 for file in `ls $OUTDIR/`
 do
     printf "$file\n" >> $OUTDIR/chunk_contents.txt
-done
+done
+
+# Print BED line
+cat $OUTDIR/regions.tsv | cut -f1-3 | tr -d "\n"
+printf "\t${DESC}\t${OUTDIR}\n"
diff --git a/scripts/prepare_local_chunk.sh b/scripts/prepare_local_chunk.sh
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+set -e
+
+function usage() {
+    echo >&2 "${0}: Prepare a tube map chunk and BED line on standard output from a pre-made subgraph. Only supports paths, not haplotypes."
+    echo >&2
+    echo >&2 "Usage: ${0} -x subgraph.xg -r chr1:1-100 [-d 'Description of region'] -o chunk-chr1-1-100 [-g mygam1.gam [-g mygam2.gam ...]] >> regions.bed"
+    exit 1
+}
+
+while getopts x:g:r:o:d: flag
+do
+    case "${flag}" in
+        x) GRAPH_FILE=${OPTARG};;
+        g) GAM_FILES+=("$OPTARG");;
+        r) REGION=${OPTARG};;
+        o) OUTDIR=${OPTARG};;
+        d) DESC="${OPTARG}";;
+        *)
+            usage
+            ;;
+
+    esac
+done
+
+if ! command -v jq &> /dev/null
+then
+    echo >&2 "This script requires jq, exiting..."
+    exit 1
+fi
+
+if [[ -z "${REGION}" ]] ; then
+    echo >&2 "You must specify a region with -r"
+    echo >&2
+    usage
+fi
+
+if [[ -z "${GRAPH_FILE}" ]] ; then
+    echo >&2 "You must specify a graph with -x"
+    echo >&2
+    usage
+fi
+
+if [[ -z "${OUTDIR}" ]] ; then
+    echo >&2 "You must specify an output directory with -o"
+    echo >&2
+    usage
+fi
+
+if [[ -z "${DESC}" ]] ; then
+    DESC="Region ${REGION}"
+fi
+
+echo >&2 "Graph File: " $GRAPH_FILE
+echo >&2 "Region: " $REGION
+echo >&2 "Output Directory: " $OUTDIR
+
+rm -fr $OUTDIR
+mkdir -p $OUTDIR
+
+# Parse the region
+REGION_END="$(echo ${REGION} | rev | cut -f1 -d'-' | rev)"
+REGION_START="$(echo ${REGION} | rev | cut -f2 -d'-' | cut -f1 -d':' | rev)"
+REGION_CONTIG="$(echo ${REGION} | rev| cut -f2- -d':' | rev)"
+
+# construct track JSON for graph file
+jq -n --arg trackFile "${GRAPH_FILE}" --arg trackType "graph" --argjson trackColorSettings '{"mainPalette": "plainColors", "auxPalette": "greys"}' '$ARGS.named' >> $OUTDIR/tracks.json
+
+# Put the graph file in place
+vg convert -p "${GRAPH_FILE}" > $OUTDIR/chunk.vg
+# Start the region BED inside the chunk
+printf "${REGION_CONTIG}\t${REGION_START}\t${REGION_END}" > $OUTDIR/regions.tsv
+
+
+echo >&2 "Gam Files:"
+GAM_NUM=0
+for GAM_FILE in "${GAM_FILES[@]}"; do
+    echo >&2 " - $GAM_FILE"
+    # construct track JSON for each gam file
+    jq -n --arg trackFile "${GAM_FILE}" --arg trackType "read" --argjson trackColorSettings '{"mainPalette": "blues", "auxPalette": "reds"}' '$ARGS.named' >> $OUTDIR/tracks.json
+    # Work out a chunk-internal GAM name with the same leading numbering vg chunk uses
+    if [[ "${GAM_NUM}" == "0" ]] ; then
+        GAM_LEADER="chunk"
+    else
+        GAM_LEADER="chunk-${GAM_NUM}"
+    fi
+    GAM_CHUNK_NAME="${OUTDIR}/${GAM_LEADER}_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.gam"
+    # Put the chunk in place
+    cp "${GAM_FILE}" "${GAM_CHUNK_NAME}"
+    # List it in the regions TSV like vg would
+    printf "\t$(basename "${GAM_CHUNK_NAME}")" >> $OUTDIR/regions.tsv
+    GAM_NUM=$((GAM_NUM + 1))
+done
+
+# Make the empty but required annotation file. We have no haplotypes to put in it.
+touch "${OUTDIR}/chunk_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.annotate.txt"
+printf "\tchunk_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.annotate.txt\n" >> $OUTDIR/regions.tsv
+
+for file in `ls $OUTDIR/`
+do
+    printf "$file\n" >> $OUTDIR/chunk_contents.txt
+done
+
+# Print BED line
+cat $OUTDIR/regions.tsv | cut -f1-3 | tr -d "\n"
+printf "\t${DESC}\t${OUTDIR}\n"
+
diff --git a/src/components/TubeMapContainer.js b/src/components/TubeMapContainer.js
@@ -30,7 +30,7 @@ class TubeMapContainer extends Component {
 
   handleFetchError(error, message) {
     if (!this.cancelSignal.aborted) {
-      console.log(message, error.name, error.message);
+      console.error(message, error);
       this.setState({ error: error, isLoading: false });
     } else {
       console.log("fetch canceled by componentWillUnmount", error.message);
@@ -199,7 +199,7 @@ class TubeMapContainer extends Component {
     } catch (error) {
       this.handleFetchError(
         error,
-        `POST to ${this.props.apiUrl}/getChunkedData failed:`
+        `Fetching and parsing POST to ${this.props.apiUrl}/getChunkedData failed:`
       );
     }
   };