Skip to content

Commit

Permalink
Make indexer use streaming parser; fix bug in streaming parser
Browse files Browse the repository at this point in the history
  • Loading branch information
rob-p committed Nov 19, 2013
1 parent e1a0962 commit 6f15ff7
Show file tree
Hide file tree
Showing 22 changed files with 628 additions and 239 deletions.
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ enable_testing()

project (Sailfish)

set(CPACK_PACKAGE_VERSION "0.6.0")
set(CPACK_PACKAGE_VERSION "0.6.1")
set(CPACK_PACKAGE_VERSION_MAJOR "0")
set(CPACK_PACKAGE_VERSION_MINOR "6")
set(CPACK_PACKAGE_VERSION_PATCH "0")
set(CPACK_PACKAGE_VERSION_PATCH "1")
set(CPACK_GENERATOR "TGZ")
set(CPACK_SOURCE_GENERATOR "TGZ")
set(CPACK_PACKAGE_VENDOR "Carnegie Mellon University")
Expand Down Expand Up @@ -323,7 +323,7 @@ ExternalProject_Add(libshark
URL http://www.cs.cmu.edu/~robp/files/Shark.tar.gz
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/Shark
INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
CMAKE_ARGS -DOPT_DYNAMIC_LIBRARY=TRUE -DBoost_NO_SYSTEM_PATHS=TRUE -DBOOST_INCLUDEDIR=${Boost_INCLUDE_DIRS}
CMAKE_ARGS -DOPT_DYNAMIC_LIBRARY=FALSE -DBoost_NO_SYSTEM_PATHS=TRUE -DBOOST_INCLUDEDIR=${Boost_INCLUDE_DIRS}
-DBOOST_LIBRARYDIR=${Boost_LIBRARY_DIRS} -DOPT_MAKE_TESTS=OFF -DCMAKE_INSTALL_PREFIX=${SHARK_INSTALL_DIR}
BUILD_COMMAND sh -c "make"
INSTALL_COMMAND sh -c "make install"
Expand Down
24 changes: 23 additions & 1 deletion include/BinaryLiteral.hpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
/**
>HEADER
Copyright (c) 2013 Rob Patro [email protected]
This file is part of Sailfish.
Sailfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Sailfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Sailfish. If not, see <http://www.gnu.org/licenses/>.
<HEADER
**/


#ifndef __BINARY_LITERAL_HPP__
#define __BINARY_LITERAL_HPP__

Expand Down Expand Up @@ -52,4 +74,4 @@ constexpr unsigned long long operator "" _binary()

//===============================================================

#endif // __BINARY_LITERAL_HPP__
#endif // __BINARY_LITERAL_HPP__
22 changes: 22 additions & 0 deletions include/HeptamerIndex.hpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
/**
>HEADER
Copyright (c) 2013 Rob Patro [email protected]
This file is part of Sailfish.
Sailfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Sailfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Sailfish. If not, see <http://www.gnu.org/licenses/>.
<HEADER
**/


#ifndef __HEPTAMER_INDEX_HPP__
#define __HEPTAMER_INDEX_HPP__

Expand Down
22 changes: 22 additions & 0 deletions include/PartitionRefiner.hpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
/**
>HEADER
Copyright (c) 2013 Rob Patro [email protected]
This file is part of Sailfish.
Sailfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Sailfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Sailfish. If not, see <http://www.gnu.org/licenses/>.
<HEADER
**/


#ifndef __PARTITION_REFINER_HPP__
#define __PARTITION_REFINER_HPP__

Expand Down
92 changes: 92 additions & 0 deletions include/ReadProducer.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/**
>HEADER
Copyright (c) 2013 Rob Patro [email protected]
This file is part of Sailfish.
Sailfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Sailfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Sailfish. If not, see <http://www.gnu.org/licenses/>.
<HEADER
**/


#ifndef __READPRODUCER_HPP__
#define __READPRODUCER_HPP__

#include "jellyfish/parse_dna.hpp"
#include "jellyfish/mapped_file.hpp"
#include "jellyfish/parse_read.hpp"
#include "jellyfish/sequence_parser.hpp"
#include "jellyfish/dna_codes.hpp"
#include "jellyfish/compacted_hash.hpp"
#include "jellyfish/mer_counting.hpp"
#include "jellyfish/misc.hpp"
#include "StreamingSequenceParser.hpp"

template <typename Parser>
class ReadProducer {
};

template <>
class ReadProducer<jellyfish::parse_read> {
public:
ReadProducer(jellyfish::parse_read& parser) : s_(new ReadSeq), stream_(parser.new_thread()) {}
~ReadProducer() { delete s_; }

bool nextRead(ReadSeq*& s) {
if ((read_ = stream_.next_read())) {
// we iterate over the entire read
const char* start = read_->seq_s;
const char* const end = read_->seq_e;
uint32_t readLen = std::distance(start, end);

s_->seq = const_cast<char*>(read_->seq_s);
s_->len = readLen;
s_->name = const_cast<char*>(read_->header);
s_->nlen = read_->hlen;
s = s_;
return true;
} else {
return false;
}
}

void finishedWithRead(ReadSeq*& s) { s = nullptr; }

private:
ReadSeq* s_;
jellyfish::parse_read::read_t* read_;//{parser.new_thread()};
jellyfish::parse_read::thread stream_;//{parser.new_thread()};
};

template <>
class ReadProducer<StreamingReadParser> {
public:
ReadProducer(StreamingReadParser& parser) : parser_(parser) {}

bool nextRead(ReadSeq*& s) {
if (parser_.nextRead(s)) {
return true;
} else {
s = nullptr;
return false;
}
}

void finishedWithRead(ReadSeq*& s) { parser_.finishedWithRead(s); }

private:
StreamingReadParser& parser_;
};

#endif // __READPRODUCER_HPP__
4 changes: 2 additions & 2 deletions include/SailfishConfig.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
namespace Sailfish {
constexpr char majorVersion[] = "0";
constexpr char minorVersion[] = "6";
constexpr char patchVersion[] = "0";
constexpr char version[] = "0.6.0";
constexpr char patchVersion[] = "1";
constexpr char version[] = "0.6.1";
}

#endif // SAILFISH_CONFIG_HPP
28 changes: 26 additions & 2 deletions include/StreamingSequenceParser.hpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
/**
>HEADER
Copyright (c) 2013 Rob Patro [email protected]
This file is part of Sailfish.
Sailfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Sailfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Sailfish. If not, see <http://www.gnu.org/licenses/>.
<HEADER
**/


#ifndef __STREAMING_READ_PARSER__
#define __STREAMING_READ_PARSER__

Expand All @@ -21,8 +43,10 @@ extern "C" {
namespace bfs = boost::filesystem;

struct ReadSeq {
char* seq = nullptr;
size_t len = 0;
char* seq = nullptr;
size_t len = 0;
char* name = nullptr;
size_t nlen = 0;
};

class StreamingReadParser {
Expand Down
22 changes: 22 additions & 0 deletions src/BiasCorrectionDriver.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
/**
>HEADER
Copyright (c) 2013 Rob Patro [email protected]
This file is part of Sailfish.
Sailfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Sailfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Sailfish. If not, see <http://www.gnu.org/licenses/>.
<HEADER
**/


#include <iostream>
#include <cstdio>
#include <cstdlib>
Expand Down
42 changes: 27 additions & 15 deletions src/BuildLUT.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@
#include "CountDBNew.hpp"
#include "ezETAProgressBar.hpp"
#include "PartitionRefiner.hpp"
#include "StreamingSequenceParser.hpp"
#include "ReadProducer.hpp"

using TranscriptID = uint32_t;
using KmerID = uint64_t;
Expand Down Expand Up @@ -95,6 +97,7 @@ int buildLUTs(

using tbb::blocked_range;

/*
char** fnames = new char*[transcriptFiles.size()];
size_t z{0};
size_t numFnames{0};
Expand All @@ -105,9 +108,13 @@ int buildLUTs(
fnames[numFnames] = const_cast<char*>(s.c_str());
++numFnames;
}

// Create a jellyfish parser
jellyfish::parse_read parser( fnames, fnames+numFnames, 1000);
*/

vector<bfs::path> paths{transcriptFiles[0]};
StreamingReadParser parser(paths);
parser.start();

vector<std::thread> threads;
vector<TranscriptList> transcriptsForKmer;
Expand All @@ -119,17 +126,17 @@ int buildLUTs(
auto merLen = transcriptHash.kmerLength();
bool done {false};
atomic<size_t> numRes {0};
atomic<size_t> nworking{numThreads-1};
atomic<size_t> nworking{(numThreads > 1) ? (numThreads - 1) : 1};

// Start the thread that will print the progress bar
std::cerr << "Number of kmers : " << transcriptHash.size() << "\n";
std::cerr << "Parsing transcripts an building k-mer equivalence classes\n";
std::cerr << "Parsing transcripts and building k-mer equivalence classes\n";

threads.push_back( std::thread( [&numRes, &nworking, numTranscripts] () {
size_t lastCount = numRes;
ez::ezETAProgressBar show_progress(numTranscripts);
show_progress.start();
while ( nworking > 0 ) {
while ( numRes < numTranscripts ) {
auto diff = numRes - lastCount;
if ( diff > 0 ) {
show_progress += static_cast<unsigned int>(diff);
Expand All @@ -147,38 +154,42 @@ int buildLUTs(

// Start the desired number of threads to parse the transcripts
// and build our data structure.
for (size_t i = 0; i < numThreads - 1; ++i) {
size_t numWorkers = (numThreads > 1) ? numThreads -1 : 1;
for (size_t i = 0; i < numWorkers; ++i) {

threads.push_back( std::thread(
[&numRes, &tgmap, &parser, &transcriptHash, &nworking, &transcripts,
&transcriptIndex, &transcriptsForKmer, &refiner, &refinerMutex, merLen]() -> void {

// Each thread gets it's own stream
jellyfish::parse_read::thread stream = parser.new_thread();
jellyfish::parse_read::read_t* read;
//jellyfish::parse_read::thread stream = parser.new_thread();
//jellyfish::parse_read::read_t* read;
ReadProducer<StreamingReadParser> producer(parser);
ReadSeq* s;

auto INVALID = transcriptHash.INVALID;
bool useCanonical{transcriptIndex.canonical()};

// while there are transcripts left to process
while ( (read = stream.next_read()) ) {

while (producer.nextRead(s)) {
// The transcript name
std::string fullHeader(read->header, read->hlen);
std::string fullHeader(s->name, s->nlen);
std::string header = fullHeader.substr(0, fullHeader.find(' '));

// The transcript sequence; strip the newlines from the
// sequence and put it into a string (newSeq)
std::string seq(read->seq_s, std::distance(read->seq_s, read->seq_e) - 1);
auto newEnd = std::remove( seq.begin(), seq.end(), '\n' );
auto readLen = std::distance( seq.begin(), newEnd );
std::string newSeq(seq.begin(), seq.begin() + readLen);
//std::string seq(s->seq_s, std::distance(read->seq_s, read->seq_e) - 1);
//auto newEnd = std::remove( seq.begin(), seq.end(), '\n' );
//auto readLen = std::distance( seq.begin(), newEnd );
auto readLen = s->len;
std::string newSeq(s->seq, s->len);//seq.begin(), seq.begin() + readLen);

// Lookup the ID of this transcript in our transcript -> gene map
auto transcriptIndex = tgmap.findTranscriptID(header);
bool valid = (transcriptIndex != tgmap.INVALID);
auto geneIndex = tgmap.gene(transcriptIndex);

if ( not valid ) { continue; }
if ( not valid ) { ++numRes; continue; }
++numRes;

size_t numKmers {(readLen >= merLen) ? static_cast<size_t>(readLen) - merLen + 1 : 0};
Expand Down Expand Up @@ -216,6 +227,7 @@ int buildLUTs(
refinerMutex.unlock();

transcripts[transcriptIndex] = tinfo;
producer.finishedWithRead(s);
}

--nworking;
Expand Down
Loading

0 comments on commit 6f15ff7

Please sign in to comment.