diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..5d56faf --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,2 @@ +[toolchain] +channel = "nightly" diff --git a/src/fasta.rs b/src/fasta.rs index c3d9104..1076ae5 100644 --- a/src/fasta.rs +++ b/src/fasta.rs @@ -1,58 +1,62 @@ -use nom::{ - bytes::complete::{tag, take_while}, - multi::many0, - IResult, -}; +use std::io::{BufReader, BufRead, Read}; -use crate::dna::Dna; +pub struct FastaReader { + reader: BufReader, + next_id: Option, -pub fn parse_records(input: &str) -> IResult<&str, Vec> { - many0(parse_record)(input) } -fn from_fasta_body(ascii: &str) -> Dna { - let mut dna = Dna { - length: ascii.len(), - nucleotides: vec![0; Dna::bytes_to_store(ascii.len())], - }; - - let mut i = 0; - for c in ascii.chars() { - match c { - '\n' => { - continue; - } - 'C' | 'c' => dna.init_with(i, 1), - 'G' | 'g' => dna.init_with(i, 2), - 'T' | 't' => dna.init_with(i, 3), - _ => dna.init_with(i, 0), - } - - i += 1; +impl FastaReader { + pub fn new(reader: R) -> Self { + Self { reader: BufReader::new(reader), next_id: None } } - - dna.trim(i); - dna } -fn parse_record(input: &str) -> IResult<&str, FastaDna> { - let (input, id) = parse_id(input)?; - let (input, sequence) = parse_sequence(input)?; - - Ok((input, FastaDna { id, sequence })) -} +impl Iterator for FastaReader { + type Item = FastaDna; -fn parse_id(input: &str) -> IResult<&str, String> { - let (input, _) = tag(">")(input)?; - let (input, id) = take_while(|c| c != '\n')(input)?; + fn next(&mut self) -> Option { + let mut id = self.next_id.take().unwrap_or_default(); + let mut sequence = String::with_capacity(1000); + //if let Ok(buffer) = self.reader.fill_buf() { + // + //} else { + // + //} - Ok((&input[1..], id.to_string())) + loop { + let mut line = String::new(); + match self.reader.read_line(&mut line) { + // buffer is completely read + Ok(0) => { + if id.is_empty() { + return None; + } else { + return Some(FastaDna { id, sequence: Dna::from_ascii(&sequence) }); + } + } + // buffer is not empty yet + Ok(_) => { + if line.starts_with('>') { + self.next_id = Some(line[1..].trim().to_string()); + if !id.is_empty() { + return Some(FastaDna { id, sequence: Dna::from_ascii(&sequence) }); + } + id = line[1..].trim().to_string(); + sequence.clear(); + } else { + sequence.push_str(&line.trim()); + } + } + Err(_) => { + return None; + } + } + } + } } -fn parse_sequence(input: &str) -> IResult<&str, Dna> { - let (input, sequence) = take_while(|c| c != '>')(input)?; - Ok((input, from_fasta_body(sequence))) -} +use crate::dna::Dna; #[derive(Debug, PartialEq)] pub struct FastaDna { diff --git a/tests/dna_test.rs b/tests/dna_test.rs index 123c0c0..6def123 100644 --- a/tests/dna_test.rs +++ b/tests/dna_test.rs @@ -52,7 +52,7 @@ fn can_be_appended() { #[test] fn kmer_cache_contains_correct_value() { - let cache = nuc::dna::KMER_CACHE; + let cache = &nuc::dna::KMER_CACHE; assert_eq!(cache[0], [4, 0, 0, 0]); assert_eq!(cache[1], [3, 1, 0, 0]); assert_eq!(cache[2], [3, 0, 1, 0]); diff --git a/tests/fasta_test.rs b/tests/fasta_test.rs index 6e8c7ea..aee0d83 100644 --- a/tests/fasta_test.rs +++ b/tests/fasta_test.rs @@ -1,21 +1,23 @@ - use nuc::{ dna::Dna, - fasta::{self, FastaDna}, + fasta::{self, FastaDna, FastaReader}, }; #[test] fn can_read_an_example_fasta_file() { - let fasta_content = ">Some Identifier\n\ + let reader = FastaReader::new( + ">Some Identifier\n\ ATGCCGTA\n\ >Another Identifier\n\ CTAACGAA\n\ >Yet Another Identifier\n\ ATAC\n\ - ATAAGTAGGG"; - let records = fasta::parse_records(fasta_content).unwrap().1; + ATAAGTAGGG" + .as_bytes() + ); + assert_eq!( - records, + reader.into_iter().collect::>(), vec![ FastaDna { id: "Some Identifier".to_string(), @@ -35,6 +37,6 @@ fn can_read_an_example_fasta_file() { #[test] fn can_read_an_empty_fasta_file() { - let records = fasta::parse_records("").unwrap().1; + let records = FastaReader::new("".as_bytes()).into_iter().collect::>(); assert_eq!(records, vec![]); }