From 1fc34c4b319add5da38f4022f18dda69f80c4c8d Mon Sep 17 00:00:00 2001 From: Charlotte Herzeel Date: Thu, 23 May 2019 14:52:01 +0200 Subject: [PATCH] Various bug fixes and extensions: * Added --tmp-path option for sfm. * Fixed /dev/stdin and /dev/stdout for sfm. * Added a command for merging intermediate metrics files. * Fixed detection of directory path names in sfm. --- README.md | 64 +++++++++++- cmd/merge-optical-duplicates-metrics.go | 132 ++++++++++++++++++++++++ cmd/merge.go | 4 +- cmd/sfm.go | 45 ++++++-- cmd/util.go | 2 +- filters/mark-optical-duplicates.go | 2 +- filters/print-bqsr.go | 4 +- go.mod | 6 +- go.sum | 12 +-- internal/files.go | 16 +-- main.go | 7 +- sam/split-merge.go | 9 +- 12 files changed, 262 insertions(+), 41 deletions(-) create mode 100644 cmd/merge-optical-duplicates-metrics.go diff --git a/README.md b/README.md index 515d44c..09da548 100644 --- a/README.md +++ b/README.md @@ -289,6 +289,33 @@ This option is used to specify the number of levels for quantizing quality score This option is used to indicate to use static quantized quality scores to a given number of levels during base quality score recalibration (--bqsr). This list should be of the form "[nr, nr, nr]". The default value is []. +### --mark-optical-duplicates-intermediate file + +This option is used in the context of filtering files created using the elprep split command. It is used internally by +the elprep sfm command, but can be used when writing your own split/filter/merge scripts. + +This option tells elPrep to perform optical duplicate marking and to write the result to an intermediate metrics file. +The intermediate metrics file generated this way can later be merged with other intermediate metrics files, see the +merge-optical-duplicates-metrics command. + +### --bqsr-tables-only table-file + +This option is used in the context of filtering files created using the elprep split command. 
It is used internally by +the elprep sfm command, but can be used when writing your own split/filter/merge scripts. + +This option tells elPrep to perform base quality score recalibration and to write the result of the recalibration to an +intermediate table file. This table file will need to be merged with other intermediate recalibration results during the +application of the base quality score recalibration. See the --bqsr-apply option. + +### --bqsr-apply path + +This option is used when filtering files created by the elprep split command. It is used internally by the elprep sfm +command, and can be used when writing your own split/filter/merge scripts. + +This option is used for applying base quality score recalibration on an input file. It expects a path parameter that +refers to a directory that contains intermediate recalibration results for multiple files created using the +--bqsr-tables-only option. + ## Sorting Command Options ### --sorting-order [keep | unknown | unsorted | queryname | coordinate] @@ -384,6 +411,7 @@ The elprep split command can be used to split up .sam files into smaller files t Splitting the .sam file into smaller files for processing "per chromosome" is useful for reducing the memory pressure as these split files are typically significantly smaller than the input file as a whole. Splitting also makes it possible to parallelize the processing of a single .sam file by distributing the different split files across different processing nodes. We provide an sfm command that executes a pipeline while silently using the elprep filter and split/merge tools. It is of course possible to write scripts to combine the filter and split/merge tools yourself. +We provide a recipe for writing your own split/filter/merge scripts on our github wiki. 
## Name @@ -395,8 +423,6 @@ We provide an sfm command that executes a pipeline while silently using the elpr elprep sfm input.bam output.bam --mark-duplicates --mark-optical-duplicates output.metrics --sorting-order coordinate --bqsr output.recal --bqsr-reference hg38.elfasta --known-sites dbsnp_138.hg38.elsites - elprep sfm --mark-duplicates --mark-optical-duplicates output.metrics --sorting-order coordinate --bqsr output.recal --bqsr-reference hg38.elfasta --known-sites dbsnp_138.hg38.elsites - ## Description The elprep sfm command is a drop-in replacement for the elprep filter command that minimises the use of RAM. For this, it silently calls the elprep split and merge tools to split up the data "per chromosome" for processing, which requires less RAM than processing a .sam/.bam file as a whole (see Split and Merge tools). @@ -409,6 +435,10 @@ The elprep sfm command has the same options as the elprep filter command, with t This command option sets the format of the split files. By default, elprep uses the same format as the input file for the split files. Changing the intermediate file output type may improve either runtime (.sam) or reduce peak disk usage (.bam). +### --tmp-path + +This command option is used to specify a path where elPrep can store temporary files that are created (and deleted) by the split and merge commands that are silently called by the elprep sfm command. The default path is the folder from where you call elprep sfm. + ### --single-end Use this command option to indicate the sfm command is processing single-end data. This information is important for the split/merge tools to operate correcly. For more details, see the description of the elprep split and elprep merge commands. 
@@ -439,6 +469,8 @@ Choosing the value 1 for the --contig-group-size tells elprep split to split the The elprep split command requires two arguments: 1) the input file or a path to multiple input files and 2) a path to a directory where elPrep can store the split files. The input file(s) can be .sam or .bam. It is also possible to use /dev/stdin as the input for using Unix pipes. There are no structural requirements on the input file(s) for using elprep split. For example, it is not necessary to sort the input file, nor is it necessary to convert to .bam or index the input file. +Warning: If you pass a path to multiple input files to the elprep split command, elprep assumes that they all have the same (or compatible) headers, and just picks the first header it finds as the header for all input files. elprep currently does not make an attempt to resolve potential conflicts between headers, especially with regard to the @SQ, @RG, or @PG header records. We will include proper merging of different SAM/BAM files in a future version of elprep. In the meantime, if you need proper merging of SAM/BAM files, please use samtools merge, Picard MergeSamFiles, or a similar tool. (If such a tool produces SAM file as output, it can be piped into elprep using Unix pipes.) + elPrep creates the output directory denoted by the output path, unless the directory already exists, in which case elPrep may override the existing files in that directory. Please make sure elPrep has the correct permissions for writing that directory. By default, the elprep split command assumes it is processing pair-end data. The flag --single-end can be used for processing single-end data. The output will look different for paired-end and single-end data. @@ -524,6 +556,34 @@ Sets the path for writing a log file. The --contig-group-size parameter for the elprep merge command is deprecated since version 4.1.1. The elprep merge command now correctly processes the split files without that information. 
+## Name + +### elprep merge-optical-duplicates-metrics - a commandline tool for merging intermediate metrics files created by the --mark-optical-duplicates-intermediate option + +## Synopsis + + elprep merge-optical-duplicates-metrics input-file output-file metrics-file /path/to/intermediate/metrics --remove-duplicates + +## Description + +The elprep merge-optical-duplicates-metrics command requires four arguments: +the names of the original input and output .sam/.bam files for which the metrics are calculated, +the metrics file to which the merged metrics should be written, and a path to the intermediate metrics files that need +to be merged (and were generated using --mark-optical-duplicates-intermediate). + +## Options + +### --nr-of-threads number + +This command option sets the number of threads that elPrep uses during execution for parsing/outputting .sam/.bam data. The default number of threads is equal to the number of cpu threads. + +It is normally not necessary to set this option. elPrep by default allocates the optimal number of threads. + +### --remove-duplicates + +Pass this option if the metrics were generated for a file for which the duplicates were removed. This information will +be included in the merged metrics file. + # Extending elPrep If you wish to extend elPrep, for example by adding your own filters, please consult our [API documentation](https://godoc.org/github.com/ExaScience/elprep). diff --git a/cmd/merge-optical-duplicates-metrics.go b/cmd/merge-optical-duplicates-metrics.go new file mode 100644 index 0000000..119d3ea --- /dev/null +++ b/cmd/merge-optical-duplicates-metrics.go @@ -0,0 +1,132 @@ +// elPrep: a high-performance tool for preparing SAM/BAM files. +// Copyright (c) 2017-2019 imec vzw. 
+ +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version, and Additional Terms +// (see below). + +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Affero General Public License for more details. + +// You should have received a copy of the GNU Affero General Public +// License and Additional Terms along with this program. If not, see +// <http://www.gnu.org/licenses/>. + +package cmd + +import ( + "bytes" + "flag" + "fmt" + "log" + "os" + "path/filepath" + "runtime" + + "github.com/exascience/elprep/v4/filters" +) + +// MergeOpticalDuplicatesMetricsHelp is the help string for this command. +const MergeOpticalDuplicatesMetricsHelp = "\nmerge-optical-duplicates-metrics parameters:\n" + + "elprep merge-optical-duplicates-metrics sam-input-file sam-output-file metrics-file /path/to/intermediate/metrics\n" + + "[--remove-duplicates]\n" + + "[--nr-of-threads nr]\n" + + "[--timed]\n" + + "[--log-path path]\n" + +// MergeOpticalDuplicatesMetrics implements the elprep merge-optical-duplicates-metrics command. 
+func MergeOpticalDuplicatesMetrics() error { + var ( + profile, logPath string + nrOfThreads int + timed, removeDuplicates bool + ) + + var flags flag.FlagSet + + flags.IntVar(&nrOfThreads, "nr-of-threads", 0, "number of worker threads") + flags.BoolVar(&timed, "timed", false, "measure the runtime") + flags.BoolVar(&removeDuplicates, "remove-duplicates", false, "use when duplicates were removed during duplicate marking") + flags.StringVar(&profile, "profile", "", "write a runtime profile to the specified file(s)") + flags.StringVar(&logPath, "log-path", "", "write log files to the specified directory") + + parseFlags(flags, 6, MergeOpticalDuplicatesMetricsHelp) + + input := getFilename(os.Args[2], MergeOpticalDuplicatesMetricsHelp) + output := getFilename(os.Args[3], MergeOpticalDuplicatesMetricsHelp) + metrics := getFilename(os.Args[4], MergeOpticalDuplicatesMetricsHelp) + intermediateMetrics := getFilename(os.Args[5], MergeOpticalDuplicatesMetricsHelp) + + setLogOutput(logPath) + + // sanity checks + + var sanityChecksFailed bool + + if !checkExist("", input) { + log.Println("Warning: Input file does not exist: ", input) + } + + if !checkExist("", intermediateMetrics) { + sanityChecksFailed = true + } + + if profile != "" && !checkCreate("--profile", profile) { + sanityChecksFailed = true + } + + metricsDir, err := filepath.Abs(intermediateMetrics) + if err != nil { + return err + } + + if nrOfThreads < 0 { + sanityChecksFailed = true + log.Println("Error: Invalid nr-of-threads: ", nrOfThreads) + } + + if sanityChecksFailed { + fmt.Fprint(os.Stderr, MergeOpticalDuplicatesMetricsHelp) + os.Exit(1) + } + + // building output command line + + var command bytes.Buffer + fmt.Fprint(&command, os.Args[0], " merge-optical-duplicates-metrics ", input, " ", output, " ", metrics, " ", intermediateMetrics) + if nrOfThreads > 0 { + runtime.GOMAXPROCS(nrOfThreads) + fmt.Fprint(&command, " --nr-of-threads ", nrOfThreads) + } + if timed { + fmt.Fprint(&command, " --timed ") + } 
+ if logPath != "" { + fmt.Fprint(&command, " --log-path ", logPath) + } + if removeDuplicates { + fmt.Fprint(&command, " --remove-duplicates") + } + + // executing command + + log.Println("Executing command:\n", command.String()) + + var ctr filters.DuplicatesCtrMap + + // merge intermediate metrics files + err = timedRun(timed, profile, "Loading and combining duplicate metrics.", 1, func() error { + ctr = filters.LoadAndCombineDuplicateMetrics(metricsDir) + return ctr.Err() + }) + if err != nil { + return err + } + return timedRun(timed, profile, "Printing combined duplicate metrics.", 2, func() error { + return filters.PrintDuplicatesMetrics(input, output, metrics, removeDuplicates, ctr) + }) +} diff --git a/cmd/merge.go b/cmd/merge.go index d950fbe..3179fcf 100644 --- a/cmd/merge.go +++ b/cmd/merge.go @@ -1,5 +1,5 @@ // elPrep: a high-performance tool for preparing SAM/BAM files. -// Copyright (c) 2017, 2018 imec vzw. +// Copyright (c) 2017-2019 imec vzw. // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as @@ -84,7 +84,7 @@ func Merge() error { if err != nil { return err } - filesToMerge, err := internal.Directory(fullInputPath) + fullInputPath, filesToMerge, err := internal.Directory(fullInputPath) if err != nil { log.Printf("Given directory %v causes error %v.\n", input, err) sanityChecksFailed = true diff --git a/cmd/sfm.go b/cmd/sfm.go index e1894ef..b7ce962 100644 --- a/cmd/sfm.go +++ b/cmd/sfm.go @@ -67,6 +67,7 @@ const SfmHelp = "\nsfm parameters:\n" + "[--log-path path]\n" + "[--intermediate-files-output-prefix name]\n" + "[--intermediate-files-output-type [sam | bam]]\n" + + "[--tmp-path path]\n" + "[--single-end]\n" + "[--contig-group-size nr]\n" @@ -99,6 +100,7 @@ const CombinedSfmFilterHelp = "filter/sfm parameters:\n" + "[--log-path path]\n" + "[--intermediate-files-output-prefix name] (sfm only)\n" + "[--intermediate-files-output-type [sam | bam]] (sfm only)\n" + 
+ "[--tmp-path path]\n" + "[--single-end] (sfm only)\n" + "[--contig-group-size nr] (sfm only)\n" @@ -128,7 +130,7 @@ func Sfm() error { nrOfThreads int timed bool profile string - logPath string + logPath, tmpPath string renameChromosomes bool outputPrefix, outputType string contigGroupSize int @@ -165,6 +167,7 @@ func Sfm() error { flags.BoolVar(&timed, "timed", false, "measure the runtime") flags.StringVar(&profile, "profile", "", "write a runtime profile to the specified file(s)") flags.StringVar(&logPath, "log-path", "", "write log files to the specified directory") + flags.StringVar(&tmpPath, "tmp-path", "", "write split files to a specified directory") flags.BoolVar(&renameChromosomes, "rename-chromosomes", false, "") //split/merge flags flags.StringVar(&outputPrefix, "intermediate-files-output-prefix", "", "prefix for the output files") @@ -400,6 +403,13 @@ func Sfm() error { mergeArgs = append(mergeArgs, "--log-path", logPath) } + if tmpPath != "" { + fmt.Fprint(&command, " --tmp-path ", tmpPath) + if err := os.MkdirAll(tmpPath, 0700); err != nil { + log.Fatal(err, ", while trying to create directories for split files ", tmpPath) + } + } + if deterministic { fmt.Fprint(&command, " --deterministic") filterArgs = append(filterArgs, "--deterministic") @@ -415,7 +425,11 @@ func Sfm() error { splitArgs = append(splitArgs, "--output-prefix", outputPrefix) if outputType == "" { - outputType = ext[1:] + if ext == "" { + outputType = "sam" + } else { + outputType = ext[1:] + } } fmt.Fprint(&command, " --intermediate-files-output-type ", outputType) splitArgs = append(splitArgs, "--output-type", outputType) @@ -438,16 +452,19 @@ func Sfm() error { // split command timeStamp := time.Now().Format(time.RFC3339) - splitsName := fmt.Sprintf("elprep-splits-%s", timeStamp) + splitsName := filepath.Join(tmpPath, fmt.Sprintf("elprep-splits-%s", timeStamp)) splitsDir, err := filepath.Abs(splitsName) if err != nil { return err } splitsDir = splitsDir + 
string(filepath.Separator) - splitOpt := []string{"split", os.Args[2], splitsDir} + splitOpt := []string{"split", input, splitsDir} splitArgs = append(splitOpt, splitArgs...) log.Println("Splitting...") splitCmd := exec.Command(os.Args[0], splitArgs...) + if input == "/dev/stdin" { + splitCmd.Stdin = os.Stdin + } splitCmd.Stderr = os.Stderr err = splitCmd.Run() if err != nil { @@ -455,9 +472,9 @@ func Sfm() error { } // set up directory for metrics - metricsName := fmt.Sprintf("elprep-metrics-%s", timeStamp) metricsDir := "" if markOpticalDuplicates != "" { + metricsName := filepath.Join(tmpPath, fmt.Sprintf("elprep-metrics-%s", timeStamp)) metricsDir, err = filepath.Abs(metricsName) if err != nil { return err @@ -470,7 +487,7 @@ func Sfm() error { } // filter commands - mergeName := fmt.Sprintf("elprep-splits-processed-%s", timeStamp) + string(filepath.Separator) + mergeName := filepath.Join(tmpPath, fmt.Sprintf("elprep-splits-processed-%s", timeStamp)) mergeDir, err := filepath.Abs(mergeName) if err != nil { return err @@ -478,16 +495,17 @@ func Sfm() error { mergeDir = mergeDir + string(filepath.Separator) splitFilesDir := splitsDir if !singleEnd { - splitFilesDir = path.Join(splitsDir, "splits") + string(filepath.Separator) + splitFilesDir = path.Join(splitsDir, "splits") } - files, err := internal.Directory(splitFilesDir) + splitFilesDir, files, err := internal.Directory(splitFilesDir) if err != nil { return err } log.Println("Filtering...") if bqsr != "" { // phase 1: Recalibration - tabsDir, err := filepath.Abs("elprep-tabs-" + timeStamp) + tabsName := filepath.Join(tmpPath, fmt.Sprintf("elprep-tabs-%s", timeStamp)) + string(filepath.Separator) + tabsDir, err := filepath.Abs(tabsName) if err != nil { return err } @@ -599,7 +617,8 @@ func Sfm() error { log.Println("Filtering...") filterOpt2 := []string{"filter", "/dev/stdin", output} filterArgs2 = append(filterOpt2, filterArgs2...) 
- tabsDir, err := filepath.Abs("elprep-tabs-" + timeStamp) + tabsName := filepath.Join(tmpPath, fmt.Sprintf("elprep-tabs-%s", timeStamp)) + string(filepath.Separator) + tabsDir, err := filepath.Abs(tabsName) if err != nil { return err } @@ -607,6 +626,9 @@ func Sfm() error { filterArgs2 = append(filterArgs2, "--bqsr-apply", tabsDir, "--recal-file", bqsr) applyBqsrCommand := exec.Command(os.Args[0], filterArgs2...) applyBqsrCommand.Stdin = outPipe + if output == "/dev/stdout" { + applyBqsrCommand.Stdout = os.Stdout + } applyBqsrCommand.Stderr = os.Stderr err = mergeCmd.Start() if err != nil { @@ -632,6 +654,9 @@ func Sfm() error { mergeOpt := []string{"merge", mergeDir, output} mergeArgs = append(mergeOpt, mergeArgs...) mergeCmd := exec.Command(os.Args[0], mergeArgs...) + if output == "/dev/stdout" { + mergeCmd.Stdout = os.Stdout + } mergeCmd.Stderr = os.Stderr err = mergeCmd.Run() if err != nil { diff --git a/cmd/util.go b/cmd/util.go index 04e9b73..e6ad684 100644 --- a/cmd/util.go +++ b/cmd/util.go @@ -37,7 +37,7 @@ const ( // ProgramName is "elprep" ProgramName = "elprep" // ProgramVersion is the version of the elprep binary - ProgramVersion = "4.1.3" + ProgramVersion = "4.1.4" // ProgramURL is the repository for the elprep source code ProgramURL = "http://github.com/exascience/elprep" ) diff --git a/filters/mark-optical-duplicates.go b/filters/mark-optical-duplicates.go index 5ee02bc..b04444e 100644 --- a/filters/mark-optical-duplicates.go +++ b/filters/mark-optical-duplicates.go @@ -845,7 +845,7 @@ func LoadAndCombineDuplicateMetrics(metricsPath string) DuplicatesCtrMap { // create ctr ctrs := DuplicatesCtrMap{Map: make(map[string]*DuplicatesCtr)} // go through the files, loading intermediate metrics - files, err := internal.Directory(metricsPath) + metricsPath, files, err := internal.Directory(metricsPath) if err != nil { ctrs.err = err return ctrs diff --git a/filters/print-bqsr.go b/filters/print-bqsr.go index 8deff24..7313dba 100644 --- 
a/filters/print-bqsr.go +++ b/filters/print-bqsr.go @@ -1,5 +1,5 @@ // elPrep: a high-performance tool for preparing SAM/BAM files. -// Copyright (c) 2017, 2018 imec vzw. +// Copyright (c) 2017-2019 imec vzw. // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as @@ -338,7 +338,7 @@ func LoadAndCombineBQSRTables(bqsrPath string) (BaseRecalibratorTables, error) { // create bqsr tables result := NewBaseRecalibratorTables() // go through the files, loading intermediate tables - files, err := internal.Directory(bqsrPath) + bqsrPath, files, err := internal.Directory(bqsrPath) if err != nil { return BaseRecalibratorTables{}, err } diff --git a/go.mod b/go.mod index 0d9f885..dcd09ec 100644 --- a/go.mod +++ b/go.mod @@ -2,8 +2,8 @@ module github.com/exascience/elprep/v4 require ( github.com/exascience/pargo v1.0.1 - golang.org/x/exp v0.0.0-20190426190305-956cc1757749 // indirect - golang.org/x/sys v0.0.0-20190429094411-2cc0cad0ac78 - gonum.org/v1/gonum v0.0.0-20190424212039-2a1643c79af2 // indirect + golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522 // indirect + golang.org/x/sys v0.0.0-20190522044717-8097e1b27ff5 + gonum.org/v1/gonum v0.0.0-20190520094443-a5f8f3a4840b // indirect gonum.org/v1/netlib v0.0.0-20190331212654-76723241ea4e // indirect ) diff --git a/go.sum b/go.sum index 5bb3c36..4828150 100644 --- a/go.sum +++ b/go.sum @@ -5,21 +5,21 @@ github.com/remyoudompheng/bigfft v0.0.0-20170806203942-52369c62f446/go.mod h1:uY golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190312203227-4b39c73a6495/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= -golang.org/x/exp v0.0.0-20190426190305-956cc1757749 h1:Bduxdpx1O6126WsH6F6NwKywZ/FPncphlTduoPxFG78= -golang.org/x/exp 
v0.0.0-20190426190305-956cc1757749/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= +golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522 h1:OeRHuibLsmZkFj773W4LcfAGsSxJgfPONhr8cmO+eLA= +golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190429094411-2cc0cad0ac78 h1:ddnrbGOgFiC0zV+uaYoSkl0f47vnII6Zu426zWQrWkg= -golang.org/x/sys v0.0.0-20190429094411-2cc0cad0ac78/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190522044717-8097e1b27ff5 h1:f005F/Jl5JLP036x7QIvUVhNTqxvSYwFIiyOh2q12iU= +golang.org/x/sys v0.0.0-20190522044717-8097e1b27ff5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= gonum.org/v1/gonum v0.0.0-20190331200053-3d26580ed485/go.mod h1:2ltnJ7xHfj0zHS40VVPYEAAMTa3ZGguvHGBSJeRWqE0= -gonum.org/v1/gonum v0.0.0-20190424212039-2a1643c79af2 h1:KTRD63fFTJiXuYJfxAI7BLujKCVAi7s9QD7rgzfY7MU= -gonum.org/v1/gonum v0.0.0-20190424212039-2a1643c79af2/go.mod h1:2ltnJ7xHfj0zHS40VVPYEAAMTa3ZGguvHGBSJeRWqE0= +gonum.org/v1/gonum v0.0.0-20190520094443-a5f8f3a4840b h1:6TIR0pAaLZhSQXUKh+yxFO4qaPuWt+ZlC8cEVzeME5I= +gonum.org/v1/gonum 
v0.0.0-20190520094443-a5f8f3a4840b/go.mod h1:zXcK6UmEkbNk22MqyPrZPx3T6fsE/O56XzkDfeYUF+Y= gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= gonum.org/v1/netlib v0.0.0-20190331212654-76723241ea4e h1:jRyg0XfpwWlhEV8mDfdNGBeSJM2fuyh9Yjrnd8kF2Ts= gonum.org/v1/netlib v0.0.0-20190331212654-76723241ea4e/go.mod h1:kS+toOQn6AQKjmKJ7gzohV1XkqsFehRA2FbsbkopSuQ= diff --git a/internal/files.go b/internal/files.go index df6822b..42125e2 100644 --- a/internal/files.go +++ b/internal/files.go @@ -1,5 +1,5 @@ // elPrep: a high-performance tool for preparing SAM/BAM files. -// Copyright (c) 2017, 2018 imec vzw. +// Copyright (c) 2017-2019 imec vzw. // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as @@ -27,17 +27,17 @@ import ( // refers to a directory, return a slice of names of files that are in // this directory. If the given filename does not refer to a // directory, return a slice with this filename as the only entry. -func Directory(file string) (files []string, err error) { +func Directory(file string) (directory string, files []string, err error) { info, err := os.Stat(file) if err != nil { - return nil, err + return "", nil, err } if !info.IsDir() { - return []string{filepath.Base(file)}, nil + return filepath.Dir(file), []string{filepath.Base(file)}, nil } f, err := os.Open(file) if err != nil { - return nil, err + return "", nil, err } defer func() { nerr := f.Close() @@ -45,5 +45,9 @@ func Directory(file string) (files []string, err error) { err = nerr } }() - return f.Readdirnames(0) + files, err = f.Readdirnames(0) + if err != nil { + return "", nil, err + } + return filepath.Clean(file), files, err } diff --git a/main.go b/main.go index 2e9f2f6..cd0957a 100755 --- a/main.go +++ b/main.go @@ -1,5 +1,5 @@ // elPrep: a high-performance tool for preparing SAM/BAM files. -// Copyright (c) 2017, 2018 imec vzw. 
+// Copyright (c) 2017-2019 imec vzw. // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as @@ -42,11 +42,12 @@ func printHelp() { } func prinExtendedHelp() { - fmt.Fprintln(os.Stderr, "Available commands: filter, split, merge, sfm, vcf-to-elsites, bed-to-elsites, fasta-to-elfasta") + fmt.Fprintln(os.Stderr, "Available commands: filter, split, merge, merge-optical-duplicates-metrics, sfm, vcf-to-elsites, bed-to-elsites, fasta-to-elfasta") fmt.Fprint(os.Stderr, "\n", cmd.FilterExtendedHelp) fmt.Fprint(os.Stderr, "\n", cmd.SplitHelp) fmt.Fprint(os.Stderr, "\n", cmd.MergeHelp) fmt.Fprint(os.Stderr, "\n", cmd.SfmHelp) + fmt.Fprint(os.Stderr, "\n", cmd.MergeOpticalDuplicatesMetricsHelp) fmt.Fprint(os.Stderr, "\n", cmd.VcfToElsitesHelp) fmt.Fprint(os.Stderr, "\n", cmd.BedToElsitesHelp) fmt.Fprint(os.Stderr, "\n", cmd.FastaToElfastaHelp) @@ -69,6 +70,8 @@ func main() { err = cmd.Split() case "merge": err = cmd.Merge() + case "merge-optical-duplicates-metrics": + err = cmd.MergeOpticalDuplicatesMetrics() case "vcf-to-elsites": err = cmd.VcfToElsites() case "bed-to-elsites": diff --git a/sam/split-merge.go b/sam/split-merge.go index 14a66b6..04d939b 100644 --- a/sam/split-merge.go +++ b/sam/split-merge.go @@ -1,5 +1,5 @@ // elPrep: a high-performance tool for preparing SAM/BAM files. -// Copyright (c) 2017, 2018 imec vzw. +// Copyright (c) 2017-2019 imec vzw. // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as @@ -78,11 +78,10 @@ func computeContigGroups(SQ []utils.StringMap, contigGroupSize int) (groups []st // pairs where the reads map to that chromosome. There are no // requirements on the input file for splitting. 
func SplitFilePerChromosome(input, outputPath, outputPrefix, outputExtension string, contigGroupSize int) (funcErr error) { - files, err := internal.Directory(input) + inputPath, files, err := internal.Directory(input) if err != nil { return fmt.Errorf("%v, while attempting to fetch file(s) %v in SplitFilePerChromosome", err, input) } - inputPath := filepath.Dir(input) firstFile := filepath.Join(inputPath, files[0]) firstIn, err := Open(firstFile) if err != nil { @@ -624,12 +623,10 @@ func MergeUnsortedFilesSplitPerChromosome(inputPath, output, inputPrefix, inputE // chromosome, containing all reads that map to that chromosome. There // are no requirements on the input file for splitting. func SplitSingleEndFilePerChromosome(input, outputPath, outputPrefix, outputExtension string, contigGroupSize int) (funcErr error) { - - files, err := internal.Directory(input) + inputPath, files, err := internal.Directory(input) if err != nil { return fmt.Errorf("%v, while attempting to fetch file(s) %v in SplitSingleEndFilePerChromosome", err, input) } - inputPath := filepath.Dir(input) firstFile := filepath.Join(inputPath, files[0]) firstIn, err := Open(firstFile) if err != nil {