From 8b3aeb4d503b3676b498ed3b7a78f975450af4ef Mon Sep 17 00:00:00 2001 From: Charlotte Herzeel Date: Fri, 1 Oct 2021 11:45:16 +0200 Subject: [PATCH] Added merging of headers when using the sfm command with a directory containing multiple files as input. The merging respects the requirements outlined in the SAM spec: - order of sequence dictionaries is kept - read group identifiers are made unique if needed + optional rg tags of reads are updated accordingly - program line identifiers are made unique if needed + optional pg tags of reads are updated accordingly - comment lines are merged - the sorting order is set to Unknown If elprep cannot merge while guaranteeing the above constraints, an error is produced. --- README.md | 2 +- sam/sam-types.go | 5 +++++ sam/split-merge.go | 32 ++++++++++++++++++++++++++++++-- utils/programinfo.go | 2 +- 4 files changed, 37 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 24ad459..c2d6878 100644 --- a/README.md +++ b/README.md @@ -772,7 +772,7 @@ file is created for each entry in the sequence dictionary. The elprep split command requires two arguments: 1) the input file or a path to multiple input files and 2) a path to a directory where elPrep can store the split files. The input file(s) can be .sam or .bam. It is also possible to use /dev/stdin as the input for using Unix pipes. There are no structural requirements on the input file(s) for using elprep split. For example, it is not necessary to sort the input file, nor is it necessary to convert to .bam or index the input file. -Warning: If you pass a path to multiple input files to the elprep split command, elprep assumes that they all have the same (or compatible) headers, and just picks the first header it finds as the header for all input files. elprep currently does not make an attempt to resolve potential conflicts between headers, especially with regard to the @SQ, @RG, or @PG header records. We will include proper merging of different SAM/BAM files in a future version of elprep. In the meantime, if you need proper merging of SAM/BAM files, please use samtools merge, Picard MergeSamFiles, or a similar tool. (If such a tool produces SAM file as output, it can be piped into elprep using Unix pipes.) +If you pass a path to multiple input files to the elprep split command, elprep attempts to merge the headers, resolving potential conflicts by adhering to the SAM specification. Specifically, while merging headers: 1) the order of sequence dictionaries must be kept (@sq); 2) read group identifiers must be unique (@rg) and in case of collisions elprep makes the IDs unique and updates optional @rg tags in alignments accordingly; 3) program identifiers must be unique (@pg) and elprep changes IDs to be unique in case of collisions and updates optional @pg tags in alignments accordingly; 4) comment lines are all merged (any order); 5) the order of the header is updated (GO entry). In case the headers are incompatible and merging violates any of the SAM specification requirements, elPrep produces an error and aborts the execution of the command. elPrep creates the output directory denoted by the output path, unless the directory already exists, in which case elPrep may override the existing files in that directory. Please make sure elPrep has the correct permissions for writing that directory. diff --git a/sam/sam-types.go b/sam/sam-types.go index ae3a94b..f4f7b8d 100644 --- a/sam/sam-types.go +++ b/sam/sam-types.go @@ -358,6 +358,11 @@ func (aln *Alignment) SetRG(rg interface{}) { aln.TAGS.Set(RG, rg) } +// SetPG sets the PG optional field. +func (aln *Alignment) SetPG(pg interface{}) { + aln.TAGS.Set(PG, pg) +} + // REFID returns the REFID temporary field. // // If REFID field is not set, this will panic with a log message. The diff --git a/sam/split-merge.go b/sam/split-merge.go index 6c3f9c0..c45f432 100644 --- a/sam/split-merge.go +++ b/sam/split-merge.go @@ -229,9 +229,16 @@ func (hdr *Header) Contigs() (contigs []string, ok bool) { // requirements on the input file for splitting. func SplitFilePerChromosome(input, outputPath, outputPrefix, outputExtension string, contigGroupSize int) { inputPath, files := internal.Directory(input) + var header *Header + var filters []AlignmentFilter firstFile := filepath.Join(inputPath, files[0]) firstIn := Open(firstFile) - header := firstIn.ParseHeader() + if len(files) > 1 { + header, filters = MergeInputs(inputPath, files) + firstIn.SkipHeader() // skip header because it won't be used as it is replaced by the merged header + } else { + header = firstIn.ParseHeader() + } groups, contigToGroup, groupToContigs := computeContigGroups(header.SQ, contigGroupSize) splitsPath := filepath.Join(outputPath, "splits") internal.MkdirAll(splitsPath, 0700) @@ -266,8 +273,15 @@ func SplitFilePerChromosome(input, outputPath, outputPrefix, outputExtension str pipeline.LimitedPar(0, BytesToAlignment(in)), pipeline.StrictOrd(pipeline.Receive(func(_ int, data interface{}) interface{} { for _, aln := range data.([]*Alignment) { + //execute the filters that may update optional rg and pg tags in reads + for _, filter := range filters { + filter(aln) + } group := contigToGroup[aln.RNAME] out := groupFiles[group] + if out == nil { + log.Panic("Attempting to output a read mapped to a region not present in the header: QNAME: ", aln.QNAME, "RNAME: ", aln.RNAME) + } buf = out.FormatAlignment(aln, buf[:0]) if aln.RNEXT == "=" || aln.RNAME == "*" || contigToGroup[aln.RNEXT] == group { out.Write(buf) // untagged @@ -649,9 +663,16 @@ func MergeSortedFilesSplitPerChromosomeWithoutSpreadFile(inputPath, output, inpu // are no requirements on the input file for splitting. func SplitSingleEndFilePerChromosome(input, outputPath, outputPrefix, outputExtension string, contigGroupSize int) { inputPath, files := internal.Directory(input) + var header *Header + var filters []AlignmentFilter firstFile := filepath.Join(inputPath, files[0]) firstIn := Open(firstFile) - header := firstIn.ParseHeader() + if len(files) > 1 { + header, filters = MergeInputs(inputPath, files) + firstIn.SkipHeader() + } else { + header = firstIn.ParseHeader() + } groups, contigToGroup, _ := computeContigGroups(header.SQ, contigGroupSize) groupFiles := make(map[string]*OutputFile) header.AddUserRecord("@sr", utils.StringMap{"co": "This file was created using elprep split --single-end."}) @@ -673,7 +694,14 @@ func SplitSingleEndFilePerChromosome(input, outputPath, outputPrefix, outputExte pipeline.LimitedPar(0, BytesToAlignment(in)), pipeline.StrictOrd(pipeline.Receive(func(_ int, data interface{}) interface{} { for _, aln := range data.([]*Alignment) { + //execute the filters that may update optional rg and pg tags in reads + for _, filter := range filters { + filter(aln) + } out := groupFiles[contigToGroup[aln.RNAME]] + if out == nil { + log.Panic("Attempting to output a read mapped to a region not present in the header: QNAME: ", aln.QNAME, " RNAME: ", aln.RNAME) + } buf = out.FormatAlignment(aln, buf[:0]) out.Write(buf) // untagged } diff --git a/utils/programinfo.go b/utils/programinfo.go index aaaf585..9d7a153 100644 --- a/utils/programinfo.go +++ b/utils/programinfo.go @@ -23,7 +23,7 @@ const ( ProgramName = "elprep" // ProgramVersion is the version of the elprep binary - ProgramVersion = "5.0.2" + ProgramVersion = "5.1.0" // ProgramURL is the repository for the elprep source code ProgramURL = "http://github.com/exascience/elprep"