diff --git a/CMakeLists.txt b/CMakeLists.txt index 093d437..52fb339 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,10 +4,10 @@ enable_testing() project (Sailfish) -set(CPACK_PACKAGE_VERSION "0.6.0") +set(CPACK_PACKAGE_VERSION "0.6.1") set(CPACK_PACKAGE_VERSION_MAJOR "0") set(CPACK_PACKAGE_VERSION_MINOR "6") -set(CPACK_PACKAGE_VERSION_PATCH "0") +set(CPACK_PACKAGE_VERSION_PATCH "1") set(CPACK_GENERATOR "TGZ") set(CPACK_SOURCE_GENERATOR "TGZ") set(CPACK_PACKAGE_VENDOR "Carnegie Mellon University") @@ -323,7 +323,7 @@ ExternalProject_Add(libshark URL http://www.cs.cmu.edu/~robp/files/Shark.tar.gz SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/Shark INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install - CMAKE_ARGS -DOPT_DYNAMIC_LIBRARY=TRUE -DBoost_NO_SYSTEM_PATHS=TRUE -DBOOST_INCLUDEDIR=${Boost_INCLUDE_DIRS} + CMAKE_ARGS -DOPT_DYNAMIC_LIBRARY=FALSE -DBoost_NO_SYSTEM_PATHS=TRUE -DBOOST_INCLUDEDIR=${Boost_INCLUDE_DIRS} -DBOOST_LIBRARYDIR=${Boost_LIBRARY_DIRS} -DOPT_MAKE_TESTS=OFF -DCMAKE_INSTALL_PREFIX=${SHARK_INSTALL_DIR} BUILD_COMMAND sh -c "make" INSTALL_COMMAND sh -c "make install" diff --git a/include/BinaryLiteral.hpp b/include/BinaryLiteral.hpp index c07d1e0..f111634 100644 --- a/include/BinaryLiteral.hpp +++ b/include/BinaryLiteral.hpp @@ -1,3 +1,25 @@ +/** +>HEADER + Copyright (c) 2013 Rob Patro robp@cs.cmu.edu + + This file is part of Sailfish. + + Sailfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Sailfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Sailfish. If not, see . +
HEADER + Copyright (c) 2013 Rob Patro robp@cs.cmu.edu + + This file is part of Sailfish. + + Sailfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Sailfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Sailfish. If not, see . +
HEADER + Copyright (c) 2013 Rob Patro robp@cs.cmu.edu + + This file is part of Sailfish. + + Sailfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Sailfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Sailfish. If not, see . +
HEADER + Copyright (c) 2013 Rob Patro robp@cs.cmu.edu + + This file is part of Sailfish. + + Sailfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Sailfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Sailfish. If not, see . +
+class ReadProducer { +}; + +template <> +class ReadProducer { +public: + ReadProducer(jellyfish::parse_read& parser) : s_(new ReadSeq), stream_(parser.new_thread()) {} + ~ReadProducer() { delete s_; } + + bool nextRead(ReadSeq*& s) { + if ((read_ = stream_.next_read())) { + // we iterate over the entire read + const char* start = read_->seq_s; + const char* const end = read_->seq_e; + uint32_t readLen = std::distance(start, end); + + s_->seq = const_cast(read_->seq_s); + s_->len = readLen; + s_->name = const_cast(read_->header); + s_->nlen = read_->hlen; + s = s_; + return true; + } else { + return false; + } + } + + void finishedWithRead(ReadSeq*& s) { s = nullptr; } + +private: + ReadSeq* s_; + jellyfish::parse_read::read_t* read_;//{parser.new_thread()}; + jellyfish::parse_read::thread stream_;//{parser.new_thread()}; +}; + +template <> +class ReadProducer { +public: + ReadProducer(StreamingReadParser& parser) : parser_(parser) {} + + bool nextRead(ReadSeq*& s) { + if (parser_.nextRead(s)) { + return true; + } else { + s = nullptr; + return false; + } + } + + void finishedWithRead(ReadSeq*& s) { parser_.finishedWithRead(s); } + +private: + StreamingReadParser& parser_; +}; + +#endif // __READPRODUCER_HPP__ diff --git a/include/SailfishConfig.hpp b/include/SailfishConfig.hpp index 16aba1d..24ee037 100644 --- a/include/SailfishConfig.hpp +++ b/include/SailfishConfig.hpp @@ -28,8 +28,8 @@ namespace Sailfish { constexpr char majorVersion[] = "0"; constexpr char minorVersion[] = "6"; - constexpr char patchVersion[] = "0"; - constexpr char version[] = "0.6.0"; + constexpr char patchVersion[] = "1"; + constexpr char version[] = "0.6.1"; } #endif // SAILFISH_CONFIG_HPP diff --git a/include/StreamingSequenceParser.hpp b/include/StreamingSequenceParser.hpp index 69d28d4..0959208 100644 --- a/include/StreamingSequenceParser.hpp +++ b/include/StreamingSequenceParser.hpp @@ -1,3 +1,25 @@ +/** +>HEADER + Copyright (c) 2013 Rob Patro robp@cs.cmu.edu + + This file is part of Sailfish. + + Sailfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Sailfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Sailfish. If not, see . +
HEADER + Copyright (c) 2013 Rob Patro robp@cs.cmu.edu + + This file is part of Sailfish. + + Sailfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Sailfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Sailfish. If not, see . +
#include #include diff --git a/src/BuildLUT.cpp b/src/BuildLUT.cpp index d58f251..d8272b7 100644 --- a/src/BuildLUT.cpp +++ b/src/BuildLUT.cpp @@ -62,6 +62,8 @@ #include "CountDBNew.hpp" #include "ezETAProgressBar.hpp" #include "PartitionRefiner.hpp" +#include "StreamingSequenceParser.hpp" +#include "ReadProducer.hpp" using TranscriptID = uint32_t; using KmerID = uint64_t; @@ -95,6 +97,7 @@ int buildLUTs( using tbb::blocked_range; + /* char** fnames = new char*[transcriptFiles.size()]; size_t z{0}; size_t numFnames{0}; @@ -105,9 +108,13 @@ int buildLUTs( fnames[numFnames] = const_cast(s.c_str()); ++numFnames; } - // Create a jellyfish parser jellyfish::parse_read parser( fnames, fnames+numFnames, 1000); + */ + + vector paths{transcriptFiles[0]}; + StreamingReadParser parser(paths); + parser.start(); vector threads; vector transcriptsForKmer; @@ -119,17 +126,17 @@ int buildLUTs( auto merLen = transcriptHash.kmerLength(); bool done {false}; atomic numRes {0}; - atomic nworking{numThreads-1}; + atomic nworking{(numThreads > 1) ? (numThreads - 1) : 1}; // Start the thread that will print the progress bar std::cerr << "Number of kmers : " << transcriptHash.size() << "\n"; - std::cerr << "Parsing transcripts an building k-mer equivalence classes\n"; + std::cerr << "Parsing transcripts and building k-mer equivalence classes\n"; threads.push_back( std::thread( [&numRes, &nworking, numTranscripts] () { size_t lastCount = numRes; ez::ezETAProgressBar show_progress(numTranscripts); show_progress.start(); - while ( nworking > 0 ) { + while ( numRes < numTranscripts ) { auto diff = numRes - lastCount; if ( diff > 0 ) { show_progress += static_cast(diff); @@ -147,38 +154,42 @@ int buildLUTs( // Start the desired number of threads to parse the transcripts // and build our data structure. - for (size_t i = 0; i < numThreads - 1; ++i) { + size_t numWorkers = (numThreads > 1) ? numThreads -1 : 1; + for (size_t i = 0; i < numWorkers; ++i) { threads.push_back( std::thread( [&numRes, &tgmap, &parser, &transcriptHash, &nworking, &transcripts, &transcriptIndex, &transcriptsForKmer, &refiner, &refinerMutex, merLen]() -> void { // Each thread gets it's own stream - jellyfish::parse_read::thread stream = parser.new_thread(); - jellyfish::parse_read::read_t* read; + //jellyfish::parse_read::thread stream = parser.new_thread(); + //jellyfish::parse_read::read_t* read; + ReadProducer producer(parser); + ReadSeq* s; + auto INVALID = transcriptHash.INVALID; bool useCanonical{transcriptIndex.canonical()}; // while there are transcripts left to process - while ( (read = stream.next_read()) ) { - + while (producer.nextRead(s)) { // The transcript name - std::string fullHeader(read->header, read->hlen); + std::string fullHeader(s->name, s->nlen); std::string header = fullHeader.substr(0, fullHeader.find(' ')); // The transcript sequence; strip the newlines from the // sequence and put it into a string (newSeq) - std::string seq(read->seq_s, std::distance(read->seq_s, read->seq_e) - 1); - auto newEnd = std::remove( seq.begin(), seq.end(), '\n' ); - auto readLen = std::distance( seq.begin(), newEnd ); - std::string newSeq(seq.begin(), seq.begin() + readLen); + //std::string seq(s->seq_s, std::distance(read->seq_s, read->seq_e) - 1); + //auto newEnd = std::remove( seq.begin(), seq.end(), '\n' ); + //auto readLen = std::distance( seq.begin(), newEnd ); + auto readLen = s->len; + std::string newSeq(s->seq, s->len);//seq.begin(), seq.begin() + readLen); // Lookup the ID of this transcript in our transcript -> gene map auto transcriptIndex = tgmap.findTranscriptID(header); bool valid = (transcriptIndex != tgmap.INVALID); auto geneIndex = tgmap.gene(transcriptIndex); - if ( not valid ) { continue; } + if ( not valid ) { ++numRes; continue; } ++numRes; size_t numKmers {(readLen >= merLen) ? static_cast(readLen) - merLen + 1 : 0}; @@ -216,6 +227,7 @@ int buildLUTs( refinerMutex.unlock(); transcripts[transcriptIndex] = tinfo; + producer.finishedWithRead(s); } --nworking; diff --git a/src/ComputeBiasFeatures.cpp b/src/ComputeBiasFeatures.cpp index a30d67b..433c655 100644 --- a/src/ComputeBiasFeatures.cpp +++ b/src/ComputeBiasFeatures.cpp @@ -1,3 +1,4 @@ + /** >HEADER Copyright (c) 2013 Rob Patro robp@cs.cmu.edu @@ -43,6 +44,8 @@ #include #include "CommonTypes.hpp" +#include "ReadProducer.hpp" +#include "StreamingSequenceParser.hpp" // holding 2-mers as a uint64_t is a waste of space, // but using Jellyfish makes life so much easier, so @@ -51,91 +54,29 @@ using Kmer = uint64_t; using Sailfish::TranscriptFeatures; namespace bfs = boost::filesystem; - -int computeBiasFeatures( - std::vector& transcriptFiles, - bfs::path outFilePath, - size_t numThreads) { - - using std::string; - - std::vector alphabet{{"AA", "AC", "AG", "AT", - "CA", "CC", "CG", "CT", - "GA", "GC", "GG", "GT", - "TA", "TC", "TG", "TT"}}; - - std::vector diNucleotides(16); - for (auto i : boost::irange(size_t{0}, diNucleotides.size())) { - diNucleotides[i] = jellyfish::parse_dna::mer_string_to_binary(alphabet[i].c_str(), 2); - } - - - std::vector readFiles = transcriptFiles; - for( auto rf : readFiles ) { - std::cerr << "readFile: " << rf << ", "; - } - std::cerr << "\n"; - - char** fnames = new char*[readFiles.size()]; - size_t z{0}; - size_t numFnames{0}; - for ( auto& s : readFiles ){ - // Ugly, yes? But this is not as ugly as the alternatives. - // The char*'s contained in fname are owned by the readFiles - // vector and need not be manually freed. - fnames[numFnames] = const_cast(s.c_str()); - ++numFnames; - } - - // Create a jellyfish parser - jellyfish::parse_read parser( fnames, fnames+numFnames, 5000); - - - size_t merLen = 2; - Kmer lshift(2 * (merLen - 1)); - Kmer masq((1UL << (2 * merLen)) - 1); - std::atomic readNum{0}; - - - size_t numActors = numThreads; - size_t numComplete = 0; - std::vector threads; - auto tstart = std::chrono::steady_clock::now(); - - tbb::concurrent_bounded_queue featQueue; - - std::ofstream ofile(outFilePath.string()); - - auto outputThread = std::thread( - [&ofile, &numComplete, &featQueue, numActors]() -> void { - TranscriptFeatures tf{}; - while( numComplete < numActors or !featQueue.empty() ) { - while(featQueue.try_pop(tf)) { - ofile << tf.name << '\t'; - ofile << tf.length << '\t'; - ofile << tf.gcContent << '\t'; - for (auto i : boost::irange(size_t{0}, tf.diNucleotides.size())) { - ofile << tf.diNucleotides[i]; - char end = (i == tf.diNucleotides.size() - 1) ? '\n' : '\t'; - ofile << end; - } - } - boost::this_thread::sleep_for(boost::chrono::milliseconds(100)); - - } - ofile.close(); - } - ); - - for (auto i : boost::irange(size_t{0}, numActors)) { - threads.push_back(std::thread( +template +bool computeBiasFeaturesHelper(ParserT& parser, + tbb::concurrent_bounded_queue& featQueue, + size_t& numComplete, size_t numThreads) { + size_t merLen = 2; + Kmer lshift(2 * (merLen - 1)); + Kmer masq((1UL << (2 * merLen)) - 1); + std::atomic readNum{0}; + + size_t numActors = numThreads; + std::vector threads; + auto tstart = std::chrono::steady_clock::now(); + + for (auto i : boost::irange(size_t{0}, numActors)) { + threads.push_back(std::thread( [&featQueue, &numComplete, &parser, &readNum, &tstart, lshift, masq, merLen, numActors]() -> void { - jellyfish::parse_read::read_t* read; - jellyfish::parse_read::thread stream = parser.new_thread(); + ReadProducer producer(parser); + + ReadSeq* s; size_t cmlen, kmer, numKmers; - while ( (read = stream.next_read()) ) { - ++readNum; //++locallyProcessedReads; + while (producer.nextRead(s)) { + ++readNum; if (readNum % 1000 == 0) { auto tend = std::chrono::steady_clock::now(); auto sec = std::chrono::duration_cast(tend-tstart); @@ -144,20 +85,23 @@ int computeBiasFeatures( std::cerr << "processed " << readNum << " transcripts (" << rate << ") transcripts/s\r\r"; } - // we iterate over the entire read - const char *start = read->seq_s; - const char * const end = read->seq_e; + const char* start = s->seq; + uint32_t readLen = s->len; + const char* const end = s->seq + readLen; - TranscriptFeatures tfeat{}; + TranscriptFeatures tfeat{}; // reset all of the counts numKmers = 0; cmlen = kmer = 0; - uint32_t readLen = std::distance(start, end); + // the maximum number of kmers we'd have to store + uint32_t maxNumKmers = (readLen >= merLen) ? readLen - merLen + 1 : 0; + if (maxNumKmers == 0) { featQueue.push(tfeat); continue; } + // The transcript name - std::string fullHeader(read->header, read->hlen); + std::string fullHeader(s->name, s->nlen); tfeat.name = fullHeader.substr(0, fullHeader.find(' ')); tfeat.length = readLen; auto nfact = 1.0 / readLen; @@ -186,9 +130,9 @@ int computeBiasFeatures( // form the new kmer kmer = ((kmer << 2) & masq) | c; if (++cmlen >= merLen) { - tfeat.diNucleotides[kmer]++; - if (base == 'G' or base == 'C') { tfeat.gcContent += nfact; } - } + tfeat.diNucleotides[kmer]++; + if (base == 'G' or base == 'C') { tfeat.gcContent += nfact; } + } } // end switch @@ -198,15 +142,88 @@ int computeBiasFeatures( if (lastBase == 'G' or lastBase == 'C') { tfeat.gcContent += nfact; } featQueue.push(tfeat); + producer.finishedWithRead(s); + } // end reads } // end lambda )); - } // actor loop + } // actor loop - for (auto& t : threads) { t.join(); ++numComplete; } - std::cerr << "\n"; - outputThread.join(); + for (auto& t : threads) { t.join(); ++numComplete; } + return true; +} + +int computeBiasFeatures( + std::vector& transcriptFiles, + bfs::path outFilePath, + bool useStreamingParser, + size_t numThreads) { + using std::string; + using std::vector; + using std::cerr; + + size_t numActors = numThreads; + size_t numComplete = 0; + tbb::concurrent_bounded_queue featQueue; + + std::ofstream ofile(outFilePath.string()); + + auto outputThread = std::thread( + [&ofile, &numComplete, &featQueue, numActors]() -> void { + TranscriptFeatures tf{}; + while( numComplete < numActors or !featQueue.empty() ) { + while(featQueue.try_pop(tf)) { + ofile << tf.name << '\t'; + ofile << tf.length << '\t'; + ofile << tf.gcContent << '\t'; + for (auto i : boost::irange(size_t{0}, tf.diNucleotides.size())) { + ofile << tf.diNucleotides[i]; + char end = (i == tf.diNucleotides.size() - 1) ? '\n' : '\t'; + ofile << end; + } + } + boost::this_thread::sleep_for(boost::chrono::milliseconds(100)); + + } + ofile.close(); + }); + + bool tryJellyfish = !useStreamingParser; + + std::vector readFiles = transcriptFiles; + for( auto rf : readFiles ) { + std::cerr << "readFile: " << rf << ", "; + } + std::cerr << "\n"; + + for (auto& readFile : readFiles) { + std::cerr << "file " << readFile << ": \n"; + + namespace bfs = boost::filesystem; + bfs::path filePath(readFile); + + // If this is a regular file, then use the Jellyfish parser + if (tryJellyfish and bfs::is_regular_file(filePath)) { + + char** fnames = new char*[1];// fnames[1]; + fnames[0] = const_cast(readFile.c_str()); + + jellyfish::parse_read parser(fnames, fnames+1, 5000); + + computeBiasFeaturesHelper( + parser, featQueue, numComplete, numActors); + + } else { // If this is a named pipe, then use the kseq-based parser + vector paths{readFile}; + StreamingReadParser parser(paths); + parser.start(); + computeBiasFeaturesHelper( + parser, featQueue, numComplete, numActors); + } + } + std::cerr << "\n"; + outputThread.join(); } diff --git a/src/HeptamerIndex.cpp b/src/HeptamerIndex.cpp index 0b7b6f5..ae48e29 100644 --- a/src/HeptamerIndex.cpp +++ b/src/HeptamerIndex.cpp @@ -1,9 +1,30 @@ +/** +>HEADER + Copyright (c) 2013 Rob Patro robp@cs.cmu.edu + + This file is part of Sailfish. + + Sailfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Sailfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Sailfish. If not, see . +
//#include "BinaryLiteral.hpp" -HeptamerIndex::HeptamerIndex() : +HeptamerIndex::HeptamerIndex() : heptamers_(std::vector(HeptamerIndex::PossibleHeptamers)) {} @@ -14,7 +35,7 @@ std::size_t HeptamerIndex::index(uint64_t heptamer) { idx += mult_[1] * ((heptamer & 0x0000000C) >> 2); // base 3 idx += mult_[2] * ((heptamer & 0x00000030) >> 4); - // base 4 + // base 4 idx += mult_[3] * ((heptamer & 0x000000C0) >> 6); // base 5 idx += mult_[4] * ((heptamer & 0x00000300) >> 8); @@ -36,5 +57,5 @@ std::size_t HeptamerIndex::index(uint64_t heptamer) { HeptamerIndex::incHeptamer(uint64_t heptamer) { auto idx = index_(heptamer); - -} \ No newline at end of file + +} diff --git a/src/IndexedCounter.cpp b/src/IndexedCounter.cpp index 1a31963..28d850a 100644 --- a/src/IndexedCounter.cpp +++ b/src/IndexedCounter.cpp @@ -49,6 +49,8 @@ #include "tbb/parallel_for.h" #include "tbb/task_scheduler_init.h" +#include "ReadProducer.hpp" + #include "jellyfish/parse_dna.hpp" #include "jellyfish/mapped_file.hpp" #include "jellyfish/parse_read.hpp" @@ -66,66 +68,6 @@ enum class MerDirection : std::int8_t { FORWARD = 1, REVERSE = 2, BOTH = 3 }; -template -class ReadProducer { -}; - -template <> -class ReadProducer { -public: - ReadProducer(jellyfish::parse_read& parser) : s_(new ReadSeq), stream_(parser.new_thread()) {} - ~ReadProducer() { delete s_; } - - bool nextRead(ReadSeq*& s) { - if ((read_ = stream_.next_read())) { - // we iterate over the entire read - const char* start = read_->seq_s; - const char* const end = read_->seq_e; - uint32_t readLen = std::distance(start, end); - - s_->seq = const_cast(read_->seq_s); - s_->len = readLen; - s = s_; - return true; - } else { - return false; - } - } - - void finishedWithRead(ReadSeq*& s) { s = nullptr; } - -private: - ReadSeq* s_; - jellyfish::parse_read::read_t* read_;//{parser.new_thread()}; - jellyfish::parse_read::thread stream_;//{parser.new_thread()}; -}; - - - -template <> -class ReadProducer { -public: - ReadProducer(StreamingReadParser& parser) : parser_(parser) {} - - bool nextRead(ReadSeq*& s) { - if (parser_.nextRead(s)) { - return true; - } else { - s = nullptr; - return false; - } - } - - void finishedWithRead(ReadSeq*& s) { parser_.finishedWithRead(s); } - -private: - StreamingReadParser& parser_; -}; - - - - - template bool countKmers(ParserT& parser, PerfectHashIndex& phi, CountDBNew& rhash, size_t merLen, bool discardPolyA, MerDirection direction, std::atomic& numReadsProcessed, diff --git a/src/LookUpTableUtils.cpp b/src/LookUpTableUtils.cpp index c7fe065..72ac80c 100644 --- a/src/LookUpTableUtils.cpp +++ b/src/LookUpTableUtils.cpp @@ -1,3 +1,25 @@ +/** +>HEADER + Copyright (c) 2013 Rob Patro robp@cs.cmu.edu + + This file is part of Sailfish. + + Sailfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Sailfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Sailfish. If not, see . +
#include #include diff --git a/src/PCA.cpp b/src/PCA.cpp index 0a821b9..b5cc1b8 100644 --- a/src/PCA.cpp +++ b/src/PCA.cpp @@ -1,3 +1,25 @@ +/** +>HEADER + Copyright (c) 2013 Rob Patro robp@cs.cmu.edu + + This file is part of Sailfish. + + Sailfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Sailfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Sailfish. If not, see . +
#ifdef DEBUG @@ -9,11 +31,11 @@ using namespace std; using namespace Eigen; -int Pca::Calculate(vector &x, - const unsigned int &nrows, - const unsigned int &ncols, - const bool is_corr, - const bool is_center, +int Pca::Calculate(vector &x, + const unsigned int &nrows, + const unsigned int &ncols, + const bool is_corr, + const bool is_center, const bool is_scale) { _ncols = ncols; _nrows = nrows; @@ -43,7 +65,7 @@ int Pca::Calculate(vector &x, for (unsigned int i = 0; i < _ncols; ++i) { VectorXf curr_col = VectorXf::Constant(_nrows, mean_vector(i)); // mean(x) for column x curr_col = _xXf.col(i) - curr_col; // x - mean(x) - curr_col = curr_col.array().square(); // (x-mean(x))^2 + curr_col = curr_col.array().square(); // (x-mean(x))^2 sd_vector(i) = sqrt((curr_col.sum())/denom); if (0 == sd_vector(i)) { zero_sd_num++; @@ -112,13 +134,13 @@ int Pca::Calculate(vector &x, } #ifdef DEBUG cout << "\n\nStandard deviations for PCs:\n"; - copy(_sd.begin(), _sd.end(),std::ostream_iterator(std::cout," ")); + copy(_sd.begin(), _sd.end(),std::ostream_iterator(std::cout," ")); cout << "\n\nKaiser criterion: PC #" << _kaiser << endl; #endif tmp_vec.resize(0); // PC's cumulative proportion _thresh95 = 1; - _cum_prop.push_back(_prop_of_var[0]); + _cum_prop.push_back(_prop_of_var[0]); for (unsigned int i = 1; i < _prop_of_var.size(); ++i) { _cum_prop.push_back(_cum_prop[i-1]+_prop_of_var[i]); if (_cum_prop[i] < 0.95) { @@ -127,7 +149,7 @@ int Pca::Calculate(vector &x, } #ifdef DEBUG cout << "\nCumulative proportion:\n"; - copy(_cum_prop.begin(), _cum_prop.end(),std::ostream_iterator(std::cout," ")); + copy(_cum_prop.begin(), _cum_prop.end(),std::ostream_iterator(std::cout," ")); cout << "\n\nThresh95 criterion: PC #" << _thresh95 << endl; #endif // Scores @@ -144,8 +166,8 @@ int Pca::Calculate(vector &x, eigen_scores.resize(0, 0); #ifdef DEBUG cout << "\n\nScores in vector:\n"; - copy(_scores.begin(), _scores.end(),std::ostream_iterator(std::cout," ")); - cout << "\n"; + copy(_scores.begin(), _scores.end(),std::ostream_iterator(std::cout," ")); + cout << "\n"; #endif } else { // COR OR COV MATRICES ARE HERE _method = "cor"; @@ -167,14 +189,14 @@ int Pca::Calculate(vector &x, #ifdef DEBUG cout << endl << eigen_eigenvalues.transpose() << endl; #endif - MatrixXf eigen_eigenvectors = edc.eigenvectors().real(); + MatrixXf eigen_eigenvectors = edc.eigenvectors().real(); #ifdef DEBUG cout << endl << eigen_eigenvectors << endl; #endif // The eigenvalues and eigenvectors are not sorted in any particular order. // So, we should sort them typedef pair eigen_pair; - vector ep; + vector ep; for (unsigned int i = 0 ; i < _ncols; ++i) { ep.push_back(make_pair(eigen_eigenvalues(i), i)); } @@ -191,11 +213,11 @@ int Pca::Calculate(vector &x, #ifdef DEBUG cout << endl << eigen_eigenvalues_sorted.transpose() << endl; cout << endl << eigen_eigenvectors_sorted << endl; - #endif + #endif // We don't need not sorted arrays anymore eigen_eigenvalues.resize(0); eigen_eigenvectors.resize(0, 0); - + _sd.clear(); _prop_of_var.clear(); _kaiser = 0; float tmp_sum = eigen_eigenvalues_sorted.sum(); for (unsigned int i = 0; i < _ncols; ++i) { @@ -207,9 +229,9 @@ int Pca::Calculate(vector &x, } #ifdef DEBUG cout << "\nStandard deviations for PCs:\n"; - copy(_sd.begin(), _sd.end(), std::ostream_iterator(std::cout," ")); + copy(_sd.begin(), _sd.end(), std::ostream_iterator(std::cout," ")); cout << "\nProportion of variance:\n"; - copy(_prop_of_var.begin(), _prop_of_var.end(), std::ostream_iterator(std::cout," ")); + copy(_prop_of_var.begin(), _prop_of_var.end(), std::ostream_iterator(std::cout," ")); cout << "\nKaiser criterion: PC #" << _kaiser << endl; #endif // PC's cumulative proportion @@ -220,10 +242,10 @@ int Pca::Calculate(vector &x, if (_cum_prop[i] < 0.95) { _thresh95 = i+1; } - } + } #ifdef DEBUG cout << "\n\nCumulative proportions:\n"; - copy(_cum_prop.begin(), _cum_prop.end(), std::ostream_iterator(std::cout," ")); + copy(_cum_prop.begin(), _cum_prop.end(), std::ostream_iterator(std::cout," ")); cout << "\n\n95% threshold: PC #" << _thresh95 << endl; #endif // Scores for PCA with correlation matrix @@ -246,14 +268,14 @@ int Pca::Calculate(vector &x, eigen_scores.resize(0, 0); #ifdef DEBUG cout << "\n\nScores in vector:\n"; - copy(_scores.begin(), _scores.end(), std::ostream_iterator(std::cout," ")); - cout << "\n"; + copy(_scores.begin(), _scores.end(), std::ostream_iterator(std::cout," ")); + cout << "\n"; #endif } return 0; } -std::vector Pca::sd(void) { return _sd; }; +std::vector Pca::sd(void) { return _sd; }; std::vector Pca::prop_of_var(void) {return _prop_of_var; }; std::vector Pca::cum_prop(void) { return _cum_prop; }; std::vector Pca::scores(void) { return _scores; }; @@ -270,7 +292,7 @@ Pca::Pca(void) { _ncols = 0; // Variables will be scaled by default _is_center = true; - _is_scale = true; + _is_scale = true; // By default will be used singular value decomposition _method = "svd"; _is_corr = false; @@ -278,7 +300,7 @@ Pca::Pca(void) { _kaiser = 0; _thresh95 = 1; } -Pca::~Pca(void) { +Pca::~Pca(void) { _xXf.resize(0, 0); _x.clear(); } diff --git a/src/PCAUtils.cpp b/src/PCAUtils.cpp index 445be43..ce4d5fd 100644 --- a/src/PCAUtils.cpp +++ b/src/PCAUtils.cpp @@ -1,3 +1,25 @@ +/** +>HEADER + Copyright (c) 2013 Rob Patro robp@cs.cmu.edu + + This file is part of Sailfish. + + Sailfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Sailfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Sailfish. If not, see . +
#include diff --git a/src/PartitionRefiner.cpp b/src/PartitionRefiner.cpp index 5c4cab0..fa4dc27 100644 --- a/src/PartitionRefiner.cpp +++ b/src/PartitionRefiner.cpp @@ -1,3 +1,25 @@ +/** +>HEADER + Copyright (c) 2013 Rob Patro robp@cs.cmu.edu + + This file is part of Sailfish. + + Sailfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Sailfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Sailfish. If not, see . +
#include diff --git a/src/PerfectHashIndexer.cpp b/src/PerfectHashIndexer.cpp index c8cda91..34d0862 100644 --- a/src/PerfectHashIndexer.cpp +++ b/src/PerfectHashIndexer.cpp @@ -234,6 +234,7 @@ void buildLUTs( int computeBiasFeatures( std::vector& transcriptFiles, boost::filesystem::path outFilePath, + bool useStreamingParser, size_t numThreads); int mainIndex( int argc, char *argv[] ) { @@ -241,6 +242,7 @@ int mainIndex( int argc, char *argv[] ) { namespace po = boost::program_options; uint32_t maxThreads = std::thread::hardware_concurrency(); + bool useStreamingParser = true; po::options_description generic("Command Line Options"); generic.add_options() @@ -265,7 +267,7 @@ int mainIndex( int argc, char *argv[] ) { ; po::variables_map vm; - + try { po::store(po::command_line_parser(argc, argv).options(generic).run(), vm); @@ -321,7 +323,7 @@ the Jellyfish database [thash] of the transcripts. // First, compute the transcript features in case the user // ever wants to bias-correct his / her results bfs::path transcriptBiasFile(outputPath); transcriptBiasFile /= "bias_feats.txt"; - computeBiasFeatures(transcriptFiles, transcriptBiasFile, numThreads); + computeBiasFeatures(transcriptFiles, transcriptBiasFile, useStreamingParser, numThreads); bfs::path jfHashFile(outputPath); jfHashFile /= "jf.counts_0"; @@ -381,6 +383,7 @@ the Jellyfish database [thash] of the transcripts. std::cerr << "building transcript to gene map using transcript fasta file [" << transcriptFiles[0] << "] . . .\n"; tgmap = sailfish::utils::transcriptToGeneMapFromFasta(transcriptFiles[0]); + std::cerr << "there are " << tgmap.numTranscripts() << " transcripts . . . "; std::cerr << "done\n"; } diff --git a/src/PerformBiasCorrection.cpp b/src/PerformBiasCorrection.cpp index f17d378..58481aa 100644 --- a/src/PerformBiasCorrection.cpp +++ b/src/PerformBiasCorrection.cpp @@ -1,3 +1,25 @@ +/** +>HEADER + Copyright (c) 2013 Rob Patro robp@cs.cmu.edu + + This file is part of Sailfish. + + Sailfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Sailfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Sailfish. If not, see . +
#include #include diff --git a/src/PerformBiasCorrection_old.cpp b/src/PerformBiasCorrection_old.cpp index 22d2dac..cf2117b 100644 --- a/src/PerformBiasCorrection_old.cpp +++ b/src/PerformBiasCorrection_old.cpp @@ -1,3 +1,25 @@ +/** +>HEADER + Copyright (c) 2013 Rob Patro robp@cs.cmu.edu + + This file is part of Sailfish. + + Sailfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Sailfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Sailfish. If not, see . +
#include #include @@ -134,7 +156,7 @@ int main(int argc, char* argv[]) { auto& tname = features[i].name; auto rpkm = sfres.expressions[tname].rpkm; shark::RealVector v(1); - + if ( rpkm >= 0.001 ) { retainedRows.emplace_back(i); retainedNames.push_back(tname); @@ -147,7 +169,7 @@ int main(int argc, char* argv[]) { minLRPKM = std::min(minLRPKM, v(0)); maxLRPKM = std::max(maxLRPKM, v(0)); avgLRPKM += v(0); - RPKMLabels.emplace_back(v); + RPKMLabels.emplace_back(v); } } avgLRPKM /= retainedRows.size(); @@ -203,7 +225,7 @@ int main(int argc, char* argv[]) { v(0) = std::log(static_cast(features[r].length)); train.X[c][0] = std::log(static_cast(features[r].length)); - train2.X[c][0] = std::log(static_cast(features[r].length)); + train2.X[c][0] = std::log(static_cast(features[r].length)); for (auto j : boost::irange(size_t{1}, dimCutoff+1)) { train.X[c][j] = encodedXsub.element(c)(j-1); train2.X[c][j] = encodedXsub.element(c)(j-1); @@ -270,8 +292,8 @@ int main(int argc, char* argv[]) { ));*/ /* - std::cerr << "there are " << numRetainedSamples << " samples\n"; - std::cerr << "there are " << train.n_features << " features\n"; + std::cerr << "there are " << numRetainedSamples << " samples\n"; + std::cerr << "there are " << train.n_features << " features\n"; reg->build(train.X, train.y, numRetainedSamples); REAL* pred = new REAL[numRetainedSamples]; @@ -283,19 +305,19 @@ int main(int argc, char* argv[]) { std::cerr << "Train RMSE=" << trn_rmse << ", Correlation Coefficient=" << trn_r2 << "\n"; */ - + shark::Data input = shark::createDataFromRange(X); shark::Data regLabels = shark::createDataFromRange(labels); shark::LabeledData regressionData(input, regLabels); - + shark::RFTrainer trainer; shark::RFClassifier model; - + trainer.setNodeSize(20); trainer.setNTrees(500); trainer.setOOBratio(0.001); - + //std::cerr << "number of classes = " << shark::numberOfClasses(regressionSubset) << "\n"; //std::cerr << "number of classes = " << shark::numberOfClasses(regressionSubset) << "\n"; trainer.train(model, regressionData); @@ -316,10 +338,10 @@ int main(int argc, char* argv[]) { /* auto mm = std::minmax_element(train.y, train.y + numRetainedSamples); double minLRPKM = std::get<0>(mm); - double maxLRPKM = std::get<1>(mm); + double maxLRPKM = std::get<1>(mm); */ - for (auto i : boost::irange(size_t{0}, size_t{numRetainedSamples})) { + for (auto i : boost::irange(size_t{0}, size_t{numRetainedSamples})) { pred[i] = RPKMLabels[i](0) - pred[i]; } @@ -333,9 +355,9 @@ int main(int argc, char* argv[]) { std::cerr << "min, max pred : " << minPred << ", " << maxPred << "\n"; std::cerr << "SCALE: " << scale << "\n"; - + minPred = std::numeric_limits::max(); - for (auto i : boost::irange(size_t{0}, size_t{numRetainedSamples})) { + for (auto i : boost::irange(size_t{0}, size_t{numRetainedSamples})) { pred[i] *= scale; minPred = std::min(minPred, pred[i]); } @@ -348,7 +370,7 @@ int main(int argc, char* argv[]) { minPred = std::min(minPred, pred[i]); maxPred = std::max(maxPred, pred[i]); } - + /* trn_rmse=rmse(pred, train.y, numRetainedSamples); trn_r2=R2(pred, train.y, numRetainedSamples); @@ -365,8 +387,8 @@ int main(int argc, char* argv[]) { double rpkm = 0.0; auto& name = features[i].name; - auto& r = sfres.expressions[name]; - + auto& r = sfres.expressions[name]; + if (i == retainedRows[retainedCnt]) { //rpkm = std::exp(pred[retainedCnt]); rpkm = std::exp(pred[retainedCnt]); @@ -381,7 +403,7 @@ int main(int argc, char* argv[]) { } rpkm = std::exp(RPKMLabels[retainedCnt](0));// pred[retainedCnt]); - + ++retainedCnt; } else { rpkm = sfres.expressions[features[i].name].rpkm; @@ -395,4 +417,4 @@ int main(int argc, char* argv[]) { //for ( auto i : retainedRows ) { std::cerr << i << " "; } ofile.close(); -} \ No newline at end of file +} diff --git a/src/SailfishUtils.cpp b/src/SailfishUtils.cpp index b2aaf83..b9136b8 100644 --- a/src/SailfishUtils.cpp +++ b/src/SailfishUtils.cpp @@ -35,6 +35,8 @@ #include "TranscriptGeneMap.hpp" #include "GenomicFeature.hpp" #include "SailfishUtils.hpp" +#include "ReadProducer.hpp" +#include "StreamingSequenceParser.hpp" namespace sailfish { namespace utils { @@ -167,23 +169,24 @@ TranscriptGeneMap readTranscriptToGeneMap( std::ifstream &ifile ) { TranscriptGeneMap transcriptToGeneMapFromFasta( const std::string& transcriptsFile ) { + using std::vector; NameVector transcriptNames; NameVector geneNames {"gene"}; - char* fnames[1] = { const_cast(transcriptsFile.c_str()) }; - // Create a jellyfish parser - jellyfish::parse_read parser( fnames, fnames+1, 1000); + vector paths{transcriptsFile}; + StreamingReadParser parser(paths); + parser.start(); - // Each thread gets it's own stream - jellyfish::parse_read::thread stream = parser.new_thread(); - jellyfish::parse_read::read_t* read; + ReadProducer producer(parser); + ReadSeq* s; // while there are transcripts left to process - while ( (read = stream.next_read()) ) { + while (producer.nextRead(s)) { // The transcript name - std::string fullHeader(read->header, read->hlen); + std::string fullHeader(s->name, s->nlen); std::string header = fullHeader.substr(0, fullHeader.find(' ')); transcriptNames.emplace_back(header); + producer.finishedWithRead(s); } // Sort the transcript names diff --git a/src/StreamingSequenceParser.cpp b/src/StreamingSequenceParser.cpp index 76e00d0..4435d8e 100644 --- a/src/StreamingSequenceParser.cpp +++ b/src/StreamingSequenceParser.cpp @@ -1,3 +1,25 @@ +/** +>HEADER + Copyright (c) 2013 Rob Patro robp@cs.cmu.edu + + This file is part of Sailfish. + + Sailfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Sailfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Sailfish. If not, see . +
@@ -79,11 +101,23 @@ bool StreamingReadParser::start() { int ksv = kseq_read(seq); while (ksv >= 0) { this->seqContainerQueue_.pop(s); + + // Possibly allocate more space for the sequence if (seq->seq.l > s->len) { s->seq = static_cast(realloc(s->seq, seq->seq.l)); } + // Copy the sequence length and sequence over to the ReadSeq struct s->len = seq->seq.l; memcpy(s->seq, seq->seq.s, s->len); + + // Possibly allocate more space for the name + if (seq->name.l > s->nlen) { + s->name = static_cast(realloc(s->name, seq->name.l)); + } + // Copy the name length and name over to the ReadSeq struct + s->nlen = seq->name.l; + memcpy(s->name, seq->name.s, s->nlen); + this->readQueue_.push(s); ksv = kseq_read(seq); } @@ -91,7 +125,6 @@ bool StreamingReadParser::start() { // destroy the parser and close the file kseq_destroy(seq); close(pfd.fd); - } @@ -105,10 +138,10 @@ bool StreamingReadParser::start() { } bool StreamingReadParser::nextRead(ReadSeq*& seq) { - while(parsing_) { + while(parsing_ or !readQueue_.empty()) { if (readQueue_.try_pop(seq)) { return true; } } return false; - } +} void StreamingReadParser::finishedWithRead(ReadSeq*& s) { seqContainerQueue_.push(s); } diff --git a/src/TestHeptamerIndex.cpp b/src/TestHeptamerIndex.cpp index 014ce3d..196f823 100644 --- a/src/TestHeptamerIndex.cpp +++ b/src/TestHeptamerIndex.cpp @@ -1,3 +1,25 @@ +/** +>HEADER + Copyright (c) 2013 Rob Patro robp@cs.cmu.edu + + This file is part of Sailfish. + + Sailfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Sailfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Sailfish. If not, see . +
#include @@ -21,7 +43,7 @@ int main(int argc, char* argv[]) { jellyfish::parse_read::thread stream{parser.new_thread()}; HeptamerIndex hi; - size_t merLen = 7; + size_t merLen = 7; uint64_t lshift{2 * (merLen - 1)}; uint64_t masq{(1UL << (2 * merLen)) - 1}; @@ -48,7 +70,7 @@ int main(int argc, char* argv[]) { cmlen = kmer = rkmer = 0; break; - default: + default: kmer = ((kmer << 2) & masq) | c; rkmer = (rkmer >> 2) | ((0x3 - c) << lshift); // count if the kmer is valid in the forward and