From 250aa660d8b515806d035f638ee6cbab111e75e7 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 15 Sep 2023 13:23:30 -0400 Subject: [PATCH 1/2] Fix track JSON load, make sure chunk ref path is first, and add script to help make a chunk from a graph --- docker/config.json | 8 +- exampleData/chunk-ref-1-20/tracks.json | 8 +- scripts/prepare_chunks.sh | 78 +++++++++++++----- scripts/prepare_local_chunk.sh | 107 +++++++++++++++++++++++++ src/components/TubeMapContainer.js | 4 +- src/server.mjs | 39 ++++++++- src/util/tubemap.js | 7 ++ 7 files changed, 217 insertions(+), 34 deletions(-) create mode 100755 scripts/prepare_local_chunk.sh diff --git a/docker/config.json b/docker/config.json index 36a020aa..7fb8a7ff 100644 --- a/docker/config.json +++ b/docker/config.json @@ -51,14 +51,14 @@ "internalDataPath": "./exampleData/internal", "defaultHaplotypeColorPalette" : { - "mainPalette": "ygreys", - "auxPalette": "ygreys", + "mainPalette": "greys", + "auxPalette": "greys", "colorReadsByMappingQuality": false }, "defaultReadColorPalette" : { - "mainPalette": "reds", - "auxPalette": "blues", + "mainPalette": "blues", + "auxPalette": "reds", "colorReadsByMappingQuality": false }, diff --git a/exampleData/chunk-ref-1-20/tracks.json b/exampleData/chunk-ref-1-20/tracks.json index 9d806afa..44c6fd5f 100644 --- a/exampleData/chunk-ref-1-20/tracks.json +++ b/exampleData/chunk-ref-1-20/tracks.json @@ -1,15 +1,15 @@ { "trackFile": "cactus.vg", "trackType": "graph", - "trackColorSettings": {"mainPalette": "greys", "auxPalette": "ygreys"} + "trackColorSettings": {"mainPalette": "plainColors", "auxPalette": "greys"} }, { "trackFile": "cactus0_10.sorted.gam", "trackType": "read", - "trackColorSettings": {"mainPalette": "greys", "auxPalette": "ygreys"} + "trackColorSettings": {"mainPalette": "blues", "auxPalette": "reds"} }, { "trackFile": "cactus10_20.sorted.gam", "trackType": "read", - "trackColorSettings": {"mainPalette": "greys", "auxPalette": "ygreys"} -} \ No newline at end of file + "trackColorSettings": {"mainPalette": "blues", "auxPalette": "reds"} +} diff --git a/scripts/prepare_chunks.sh b/scripts/prepare_chunks.sh index 01af814c..af4c00dd 100755 --- a/scripts/prepare_chunks.sh +++ b/scripts/prepare_chunks.sh @@ -1,53 +1,91 @@ #!/usr/bin/env bash set -e -while getopts x:h:g:r:o: flag +function usage() { + echo >&2 "${0}: Extract graph and read chunks for a region, producing a referencing line for a BED file on standard output" + echo >&2 + echo >&2 "Usage: ${0} -x mygraph.xg [-h mygraph.gbwt] -r chr1:1-100 [-d 'Description of region'] -o chunk-chr1-1-100 [-g mygam1.gam [-g mygam2.gam ...]] >> regions.bed" + exit 1 +} + +while getopts x:h:g:r:o:d: flag do case "${flag}" in - x) XG_FILE=${OPTARG};; - h) GBWT=${OPTARG};; + x) GRAPH_FILE=${OPTARG};; + h) HAPLOTYPE_FILE=${OPTARG};; g) GAM_FILES+=("$OPTARG");; r) REGION=${OPTARG};; o) OUTDIR=${OPTARG};; + d) DESC="${OPTARG}";; + *) + usage + ;; + esac done if ! command -v jq &> /dev/null then - echo "This script requires jq, exiting..." - exit + echo >&2 "This script requires jq, exiting..." + exit 1 +fi + +if [[ -z "${REGION}" ]] ; then + echo >&2 "You must specify a region with -r" + echo >&2 + usage +fi + +if [[ -z "${GRAPH_FILE}" ]] ; then + echo >&2 "You must specify a graph with -x" + echo >&2 + usage fi -echo "XG File: " $XG_FILE -echo "Haplotype File: " $GBWT -echo "Region: " $REGION -echo "Output Directory: " $OUTDIR +if [[ -z "${OUTDIR}" ]] ; then + echo >&2 "You must specify an output directory with -o" + echo >&2 + usage +fi + +if [[ -z "${DESC}" ]] ; then + DESC="Region ${REGION}" +fi + +echo >&2 "Graph File: " $GRAPH_FILE +echo >&2 "Haplotype File: " $HAPLOTYPE_FILE +echo >&2 "Region: " $REGION +echo >&2 "Output Directory: " $OUTDIR rm -fr $OUTDIR mkdir -p $OUTDIR -vg_chunk_params="-x $XG_FILE -g -c 20 -p $REGION -T -b $OUTDIR/chunk -E $OUTDIR/regions.tsv" +vg_chunk_params=(-x $GRAPH_FILE -g -c 20 -p $REGION -T -b $OUTDIR/chunk -E $OUTDIR/regions.tsv) -# construct track JSON for xg file -jq -n --arg trackFile "${XG_FILE}" --arg trackType "graph" --argjson trackColorSettings '{"mainPalette": "greys", "auxPalette": "ygreys"}' '$ARGS.named' >> $OUTDIR/tracks.json +# construct track JSON for graph file +jq -n --arg trackFile "${GRAPH_FILE}" --arg trackType "graph" --argjson trackColorSettings '{"mainPalette": "plainColors", "auxPalette": "greys"}' '$ARGS.named' >> $OUTDIR/tracks.json -# construct track JSON for gbwt file; if not any specific gbwt file, then default would be haplotype -if [[ ! -z "${GBWT}" ]] ; then - jq -n --arg trackFile "${GBWT}" --arg trackType "haplotype" --argjson trackColorSettings '{"mainPalette": "blues", "auxPalette": "reds"}' '$ARGS.named' >> $OUTDIR/tracks.json +# construct track JSON for haplotype file, if provided +if [[ ! -z "${HAPLOTYPE_FILE}" ]] ; then + jq -n --arg trackFile "${HAPLOTYPE_FILE}" --arg trackType "haplotype" --argjson trackColorSettings '{"mainPalette": "blues", "auxPalette": "reds"}' '$ARGS.named' >> $OUTDIR/tracks.json fi # construct track JSON for each gam file -echo "Gam Files:" +echo >&2 "Gam Files:" for GAM_FILE in "${GAM_FILES[@]}"; do - echo " - $GAM_FILE" + echo >&2 " - $GAM_FILE" jq -n --arg trackFile "${GAM_FILE}" --arg trackType "read" --argjson trackColorSettings '{"mainPalette": "blues", "auxPalette": "reds"}' '$ARGS.named' >> $OUTDIR/tracks.json - vg_chunk_params=" $vg_chunk_params -a $GAM_FILE" + vg_chunk_params+=(-a $GAM_FILE) done # Call vg chunk -vg chunk $vg_chunk_params > $OUTDIR/chunk.vg +vg chunk "${vg_chunk_params[@]}" > $OUTDIR/chunk.vg for file in `ls $OUTDIR/` do printf "$file\n" >> $OUTDIR/chunk_contents.txt -done \ No newline at end of file +done + +# Print BED line +cat $OUTDIR/regions.tsv | cut -f1-3 | tr -d "\n" +printf "\t${DESC}\t${OUTDIR}\n" diff --git a/scripts/prepare_local_chunk.sh b/scripts/prepare_local_chunk.sh new file mode 100755 index 00000000..63cad98a --- /dev/null +++ b/scripts/prepare_local_chunk.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash +set -e + +function usage() { + echo >&2 "${0}: Prepare a tube map chunk and BED line on standard output from a pre-made subgraph. Only supports paths, not haplotypes." + echo >&2 + echo >&2 "Usage: ${0} -x subgraph.xg -r chr1:1-100 [-d 'Description of region'] -o chunk-chr1-1-100 [-g mygam1.gam [-g mygam2.gam ...]] >> regions.bed" + exit 1 +} + +while getopts x:g:r:o:d: flag +do + case "${flag}" in + x) GRAPH_FILE=${OPTARG};; + g) GAM_FILES+=("$OPTARG");; + r) REGION=${OPTARG};; + o) OUTDIR=${OPTARG};; + d) DESC="${OPTARG}";; + *) + usage + ;; + + esac +done + +if ! command -v jq &> /dev/null +then + echo >&2 "This script requires jq, exiting..." + exit 1 +fi + +if [[ -z "${REGION}" ]] ; then + echo >&2 "You must specify a region with -r" + echo >&2 + usage +fi + +if [[ -z "${GRAPH_FILE}" ]] ; then + echo >&2 "You must specify a graph with -x" + echo >&2 + usage +fi + +if [[ -z "${OUTDIR}" ]] ; then + echo >&2 "You must specify an output directory with -o" + echo >&2 + usage +fi + +if [[ -z "${DESC}" ]] ; then + DESC="Region ${REGION}" +fi + +echo >&2 "Graph File: " $GRAPH_FILE +echo >&2 "Region: " $REGION +echo >&2 "Output Directory: " $OUTDIR + +rm -fr $OUTDIR +mkdir -p $OUTDIR + +# Parse the region +REGION_END="$(echo ${REGION} | rev | cut -f1 -d'-' | rev)" +REGION_START="$(echo ${REGION} | rev | cut -f2 -d'-' | cut -f1 -d':' | rev)" +REGION_CONTIG="$(echo ${REGION} | rev| cut -f2- -d':' | rev)" + +# construct track JSON for graph file +jq -n --arg trackFile "${GRAPH_FILE}" --arg trackType "graph" --argjson trackColorSettings '{"mainPalette": "plainColors", "auxPalette": "greys"}' '$ARGS.named' >> $OUTDIR/tracks.json + +# Put the graphy file in place +vg convert -p "${GRAPH_FILE}" > $OUTDIR/chunk.vg +# Start the region BED inside the chunk +printf "${REGION_CONTIG}\t${REGION_START}\t${REGION_END}" > $OUTDIR/regions.tsv + + +echo >&2 "Gam Files:" +GAM_NUM=0 +for GAM_FILE in "${GAM_FILES[@]}"; do + echo >&2 " - $GAM_FILE" + # construct track JSON for each gam file + jq -n --arg trackFile "${GAM_FILE}" --arg trackType "read" --argjson trackColorSettings '{"mainPalette": "blues", "auxPalette": "reds"}' '$ARGS.named' >> $OUTDIR/tracks.json + # Work out a chunk-internal GAM name with the same leading numbering vg chunk uses + if [[ "${GAM_NUM}" == "0" ]] ; then + GAM_LEADER="chunk" + else + GAM_LEADER="chunk-${GAM_NUM}" + fi + GAM_CHUNK_NAME="${OUTDIR}/${GAM_LEADER}_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.gam" + # Put the chunk in place + cp "${GAM_FILE}" "${GAM_CHUNK_NAME}" + # List it in the regions TSV like vg would + printf "\t$(basename "${GAM_CHUNK_NAME}")" >> $OUTDIR/regions.tsv + GAM_NUM=$((GAM_NUM + 1)) +done + +# Make the empty but required annotation file. We have no haplotypes to put in it. +touch "${OUTDIR}/chunk_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.annotate.txt" +printf "\tchunk_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.annotate.txt\n" >> $OUTDIR/regions.tsv + +for file in `ls $OUTDIR/` +do + printf "$file\n" >> $OUTDIR/chunk_contents.txt +done + +# Print BED line +cat $OUTDIR/regions.tsv | cut -f1-3 | tr -d "\n" +printf "\t${DESC}\t${OUTDIR}\n" + diff --git a/src/components/TubeMapContainer.js b/src/components/TubeMapContainer.js index a11a4b0c..8c063a6a 100644 --- a/src/components/TubeMapContainer.js +++ b/src/components/TubeMapContainer.js @@ -30,7 +30,7 @@ class TubeMapContainer extends Component { handleFetchError(error, message) { if (!this.cancelSignal.aborted) { - console.log(message, error.name, error.message); + console.error(message, error); this.setState({ error: error, isLoading: false }); } else { console.log("fetch canceled by componentWillUnmount", error.message); @@ -199,7 +199,7 @@ class TubeMapContainer extends Component { } catch (error) { this.handleFetchError( error, - `POST to ${this.props.apiUrl}/getChunkedData failed:` + `Fetching and parsing POST to ${this.props.apiUrl}/getChunkedData failed:` ); } }; diff --git a/src/server.mjs b/src/server.mjs index be3e4826..674785bb 100644 --- a/src/server.mjs +++ b/src/server.mjs @@ -537,6 +537,7 @@ async function getChunkedData(req, res, next) { } req.graph = JSON.parse(graphAsString); req.region = [rangeRegion.start, rangeRegion.end]; + // vg chunk always puts the path we reference on first automatically if (!sentResponse) { sentResponse = true; processAnnotationFile(req, res, next); @@ -593,6 +594,24 @@ async function getChunkedData(req, res, next) { } req.graph = JSON.parse(graphAsString); req.region = [rangeRegion.start, rangeRegion.end]; + + // We might not have the path we are referencing on appearing first. + if (parsedRegion.contig !== "node") { + // Make sure that path 0 is the path we actually asked about + let refPaths = []; + let otherPaths = []; + for (let path of req.graph.path) { + if (path.name === parsedRegion.contig) { + // This is the path we asked about, so it goes first + refPaths.push(path); + } else { + // Then we put each other path + otherPaths.push(path); + } + } + req.graph.path = refPaths.concat(otherPaths); + } + if (!sentResponse) { sentResponse = true; processAnnotationFile(req, res, next); @@ -669,6 +688,7 @@ function returnErrorMiddleware(err, req, res, next) { result.error += req.error.toString("utf-8"); } console.log("returning error: " + result.error); + console.error(err); if (err.status) { // Error comes with a status res.status(err.status); @@ -922,6 +942,10 @@ function processGamFiles(req, res, next) { } } +// Function to do the step of reading the "region" file, a BED inside the chunk +// that records the path and start offset that were used to define the chunk. +// +// Calls out to the next step, cleanUpAndSendResult function processRegionFile(req, res, next) { try { console.time("processing region file"); @@ -1432,14 +1456,21 @@ async function getBedRegions(bed) { if (fs.existsSync(track_json)) { // Create string of tracks data const string_data = fs.readFileSync(track_json); - const parser = new JSONParser(); - parser.onValue = (value, key, parent, stack) => { - if (stack > 0) return; // ignore inner values + const parser = new JSONParser({separator: ''}); + parser.onValue = ({value, key, parent, stack}) => { + if (stack.length > 0) { + // ignore inner values + return; + } + if (!Object.hasOwn(value, 'trackFile')) { + throw new BadRequestError('Non-track object in tracks.json: ' + JSON.stringify(value)) + } // put tracks in array tracks_array.push(value); }; parser.write(string_data); - tracks = tracks_array; + // Convert to object container like the client component prop types expect + tracks = {...tracks_array}; } } diff --git a/src/util/tubemap.js b/src/util/tubemap.js index 7e1322e7..a7b4e368 100644 --- a/src/util/tubemap.js +++ b/src/util/tubemap.js @@ -26,6 +26,7 @@ const greys = [ "#000000", ]; +// Greys but with a special color for the first thing. const ygreys = [ "#9467bd", "#d9d9d9", @@ -4229,6 +4230,12 @@ export function vgExtractReads( for (let i = 0; i < myReads.length; i += 1) { const read = myReads[i]; + + if (!read.path) { + // Read does not have a path assigned, this is an unmapped read. + continue; + } + const sequence = []; const sequenceNew = []; let firstIndex = -1; // index within mapping of the first node id contained in nodeNames From 0cfc9b3825d8f5eb49e111f56bebdbb0836cfaf7 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 15 Sep 2023 15:34:57 -0400 Subject: [PATCH 2/2] Document the new subgraph to chunk script to fix #326 --- README.md | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index bde4859e..03b2fbda 100644 --- a/README.md +++ b/README.md @@ -137,28 +137,51 @@ That can sometimes up to 10-20 seconds. If you already know of regions/subgraphs that you will be looking at, you can pre-fetch the data in advance. This will save some time during the interactive visualization, especially if there are a lot of regions to visualize. -This is a 2 step process that involves creating the chunk and linking it to a bed file +The net result needs to be one or more chunk directories on disk, referenced from a BED file. -1. -The subgraphs need to be pre-fetched using `vg chunk` like shown in [`prepare_chunks.sh`](scripts/prepare_chunks.sh). For example: +To generate each chunk, you can use the `prepare_chunks.sh` script. You ought to run it from the directory containing your input files and where your output chunks will be stored (i.e. the `dataPath` in `sequenceTubeMpas/src/config.json`), which defaults to the `exampleData` directory in the repo. + +For example: ``` -./prepare_chunk.sh -x mygraph.xg -h mygraph.gbwt -r chr1:1-100 -o chunk-chr1-1-100 -g mygam1.gam -g mygam2.gam +cd exampleData/ +../scripts/prepare_chunk.sh -x mygraph.xg -h mygraph.gbwt -r chr1:1-100 -d 'Region A' -o chunk-chr1-1-100 -g mygam1.gam -g mygam2.gam >> mychunks.bed +../scripts/prepare_chunk.sh -x mygraph.xg -h mygraph.gbwt -r chr1:101-200 -d 'Region B' -o chunk-chr1-100-200 -g mygam1.gam -g mygam2.gam >> mychunks.bed ``` -2. -Then compile those regions in a BED file with two additional columns: +The BED file linking to the chunks has two additional nonstandard columns: - a description of the region (column 4) - the path to the output directory of the chunk, `chunk-chr1-1-100` in the example above, (column 5). ``` -ref 1 10 region one to ten chunk-ref-1-20 -ref 10 20 region ten to twenty chunk-ref-1-20 +chr1 1 100 Region A chunk-chr1-1-100 +chr1 101 200 Region B chunk-chr2-101-200 ``` Note each column is seperated by tabs -This BED file will be read if placed in the `dataPath` directory, like for other files to mount (see above). +This BED file needs to be in the `dataPath` directory, or it can be hosted on the web along with its chunk directories and accessed via URL. + +##### Pre-made subgraphs + +You may want to look at a graph that has already been extracted from a larger graph. +To support this, there is a `prepare_local_chunk.sh` script, which takes a subgraph rather than a full graph. +It supports most of the options that `prepare_chunks.sh` does, with the notable exception of haplotype files. +It assumes that the graph represents some region along some reference path that is present in the graph, and expects that region to be provided with the `-r` option. +It assumes that path names in the subgraph *don't* use subregion suffixes (bracket-enclosed numbers). +The path name used in the region should *exactly* match the name of one of the paths in the graph. + +For example, you can run it like: + +``` +cd exampleData/ +../scripts/prepare_local_chunk.sh -x subgraph.gfa -r chr5:1023911-1025911 -g subgraph_reads.gam -g other_sample_reads.gam -o subgraph1 >> subgraphs.bed +``` + +If the original subgraph file does not remain in place under the configured `dataPath` and accessible by the tube map, errors may occur complaining that it couldn't be accessed when the tube map attempts to list ist contained paths. + +The net result will be that you can select the BED file, select the region it specifies, and view a precomputed view of the subgraph, with coordinates computed assuming it covers the region provided to `prepare_local_chunk.sh`. + #### Development Mode