Skip to content

Commit

Permalink
Refactor index to use Record and add helper methods to Record
Browse files Browse the repository at this point in the history
  • Loading branch information
olliecheng committed Jan 1, 2025
1 parent fd51e58 commit f630113
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 40 deletions.
68 changes: 36 additions & 32 deletions src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use thiserror::Error;

use crate::duplicates::RecordIdentifier;
use crate::file::FastqFile;
use crate::io::Record;
use tempfile::tempfile_in;

#[derive(Debug, Serialize, Deserialize)]
Expand Down Expand Up @@ -106,7 +107,7 @@ fn iter_lines_with_regex<W: Write>(
let mut expected_len: Option<usize> = None;

let mut fastq_reader = needletail::parser::FastqReader::new(reader);
let mut total_quality = 0f64;
let mut total_quality = 0u32;
let mut total_len = 0;

while let Some(rec) = fastq_reader.next() {
Expand All @@ -116,29 +117,30 @@ fn iter_lines_with_regex<W: Write>(
info!("Processed: {}", info.read_count)
}

let rec = rec.expect("Invalid record");
let id = std::str::from_utf8(rec.id()).context("Could not convert id to string")?;
let position = rec.position().byte() as usize;
let sequence_rec = rec.expect("Invalid record");
let position = sequence_rec.position().byte() as usize;
let file_len = sequence_rec.all().len() + 1;
let mut rec = Record::try_from(sequence_rec)?;

match extract_bc_from_header(id, re, position) {
match extract_bc_from_header(&rec.id, re, position) {
Ok((len, identifier)) => {
match expected_len {
None => expected_len = Some(len),
Some(expected) => {
if expected != len {
bail!(IndexGenerationErr::DifferentMatchCounts {
header: id.to_string(),
re: re.clone(),
pos: position,
count: len,
expected
})
}
}
let expected_len = *expected_len.get_or_insert(len);

if expected_len != len {
bail!(IndexGenerationErr::DifferentMatchCounts {
header: rec.id,
re: re.clone(),
pos: position,
count: len,
expected: expected_len
})
}

total_quality += write_read(wtr, &rec, identifier.to_string(), position)?;
total_len += rec.num_bases();
rec.id = identifier.to_string();

rec.write_index(wtr, position, file_len)?;
total_quality += rec.phred_quality_total();
total_len += rec.len();
info.matched_read_count += 1;
}
Err(e) => {
Expand All @@ -152,7 +154,7 @@ fn iter_lines_with_regex<W: Write>(

wtr.flush()?;

info.avg_qual = total_quality / (info.matched_read_count as f64);
info.avg_qual = (total_quality as f64) / (info.matched_read_count as f64);
info.avg_len = (total_len as f64) / (info.matched_read_count as f64);
info.gb = (fastq_reader.position().byte() as f64) / (1024u32.pow(3) as f64);

Expand Down Expand Up @@ -218,7 +220,7 @@ fn iter_lines_with_cluster_file<W: Write>(
let mut fastq_reader = needletail::parser::FastqReader::new(reader);

// we store the total quality and length so that we can take an average at the end
let mut total_quality = 0f64;
let mut total_quality = 0u32;
let mut total_len = 0;

while let Some(rec) = fastq_reader.next() {
Expand All @@ -229,29 +231,31 @@ fn iter_lines_with_cluster_file<W: Write>(
info!("Processed: {}", info.read_count);
}

let rec = rec.expect("Invalid record");
let id = std::str::from_utf8(rec.id()).context("Could not convert id to string")?;
let position = rec.position().byte() as usize;
let sequence_rec = rec.expect("Invalid record");
let position = sequence_rec.position().byte() as usize;
let file_len = sequence_rec.all().len() + 1;
let mut rec = Record::try_from(sequence_rec)?;

let Some(identifier) = cluster_map.get(id) else {
let Some(identifier) = cluster_map.get(&rec.id) else {
if !skip_invalid_ids {
bail!(RowNotInClusters {
header: id.to_string()
})
bail!(RowNotInClusters { header: rec.id })
}
info.unmatched_read_count += 1;
continue;
};
info.matched_read_count += 1;

total_quality += write_read(wtr, &rec, identifier.clone(), position)?;
total_len += rec.num_bases();
rec.id = identifier.clone();
rec.write_index(wtr, position, file_len)?;

total_quality += rec.phred_quality_total();
total_len += rec.len();
}

wtr.flush()?;

// compute summary statistics
info.avg_qual = total_quality / (info.matched_read_count as f64);
info.avg_qual = (total_quality as f64) / (info.matched_read_count as f64);
info.avg_len = (total_len as f64) / (info.matched_read_count as f64);
info.gb = (fastq_reader.position().byte() as f64) / (1024u32.pow(3) as f64);

Expand Down
77 changes: 74 additions & 3 deletions src/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@ use needletail::parser::SequenceRecord;
use needletail::{parser::FastqReader, FastxReader};
use std::fmt::Write as FmtWrite;
// needed for write! to be implemented on Strings
use serde::{Deserialize, Serialize};
use std::fs::File;
use std::io::{BufReader, Read, Seek, SeekFrom, Write};
use std::iter::Map;
use std::slice::Iter;

#[derive(PartialEq, Eq)]
pub enum ReadType {
Expand All @@ -25,6 +28,7 @@ pub struct Record {
impl TryFrom<SequenceRecord<'_>> for Record {
type Error = std::string::FromUtf8Error;

/// Attempt to create a Record from a SequenceRecord.
fn try_from(rec: SequenceRecord) -> Result<Self, Self::Error> {
Ok(Record {
id: String::from_utf8(rec.id().to_vec())?,
Expand All @@ -35,10 +39,40 @@ impl TryFrom<SequenceRecord<'_>> for Record {
}

impl Record {
/// Returns an iterator over the decoded PHRED quality scores of the record.
///
/// Each ASCII quality character is decoded to its numeric PHRED score by
/// subtracting the Sanger/Illumina 1.8+ offset of 33 (`!`).
/// https://en.wikipedia.org/wiki/Phred_quality_score
///
/// NOTE(review): assumes every quality byte is >= 33 (`!`); a lower byte
/// would underflow (panic in debug builds) — confirm upstream validation.
pub fn phred_quality(&self) -> Map<Iter<u8>, fn(&u8) -> u32> {
    // `.iter()` rather than `.into_iter()` on the `&[u8]`: same `Iter<u8>`
    // type, but avoids the clippy `into_iter_on_ref` anti-idiom.
    self.qual.as_bytes().iter().map(|&x| (x as u32) - 33u32)
}

/// Returns the mean PHRED quality score across all bases of the record,
/// rounded to two decimal places.
pub fn phred_quality_avg(&self) -> f64 {
    let total = self.phred_quality_total() as f64;
    let mean = total / (self.len() as f64);
    // Round to 2 decimal places by scaling up, rounding, and scaling back.
    (mean * 100.0).round() / 100.0
}

/// Returns the total of all per-base PHRED quality scores in the record.
pub fn phred_quality_total(&self) -> u32 {
    // Accumulate the decoded scores; equivalent to `.sum()`.
    self.phred_quality().fold(0u32, |acc, score| acc + score)
}

/// Returns the sequence length in base count of the record
pub fn len(&self) -> usize {
self.seq.len()
}

/// Writes the Record to `writer` in FASTQ format (no trailing newline).
pub fn write_fastq(&self, writer: &mut impl Write) -> Result<(), std::io::Error> {
    // Emits: @<id>\n<seq>\n+\n<qual> — the separator line is a bare '+'.
    writeln!(writer, "@{}", self.id)?;
    writeln!(writer, "{}", self.seq)?;
    writeln!(writer, "+")?;
    write!(writer, "{}", self.qual)
}

/// Writes the Record to `writer` in FASTA format (no trailing newline).
pub fn write_fasta(&self, writer: &mut impl Write) -> Result<(), std::io::Error> {
    // Emits: ><id>\n<seq>
    writeln!(writer, ">{}", self.id)?;
    write!(writer, "{}", self.seq)
}
Expand All @@ -52,6 +86,9 @@ impl Record {
/// * `group_idx` - The index of the read in the group.
/// * `group_size` - The size of the group.
/// * `avg_qual` - The average quality score of the group.
///
/// # Note
/// This function will modify the Record irreversibly by changing the Record's `id` field
pub fn add_metadata(
&mut self,
umi_group: usize,
Expand Down Expand Up @@ -80,6 +117,40 @@ impl Record {
}
}

/// One row of the on-disk index, serialized through serde by a CSV writer
/// (see `Record::write_index`).
#[derive(Debug, Serialize, Deserialize)]
struct IndexRecord {
    // Identifier of the read.
    id: String,
    // Byte position of the record within the source file.
    pos: usize,
    // Average PHRED quality of the read, rounded to two decimal places.
    avg_qual: f64,
    // Number of bases in the sequence.
    n_bases: usize,
    // Total bytes the record occupies in the source file.
    rec_len: usize,
}

impl Record {
    /// Serializes a summary row for this Record to an external index writer,
    /// combining the Record's own fields with caller-supplied file metadata.
    ///
    /// # Arguments
    ///
    /// * `wtr` - A mutable reference to a CSV writer receiving the index row.
    /// * `pos` - The position of the record in the file.
    /// * `file_len` - The bytes consumed by the record in the file (the _length_ on _file_)
    pub fn write_index<W: Write>(
        &self,
        wtr: &mut csv::Writer<W>,
        pos: usize,
        file_len: usize,
    ) -> csv::Result<()> {
        // Build the row first, then hand it to serde for serialization.
        let row = IndexRecord {
            id: self.id.clone(),
            pos,
            avg_qual: self.phred_quality_avg(),
            n_bases: self.len(),
            rec_len: file_len,
        };
        wtr.serialize(row)
    }
}

pub struct UMIGroup {
/// The "Identifier" of this group, typically a "BC_UMI" string
pub id: RecordIdentifier,
Expand Down Expand Up @@ -114,7 +185,7 @@ pub struct UMIGroup {
/// let record = get_read_at_position(&mut file, 12345)?;
/// println!("Record ID: {}", record.id);
/// ```
pub fn get_read_at_position<R: Read + Seek + Send>(
pub fn get_record_from_position<R: Read + Seek + Send>(
// file: &mut File,
reader: &mut R,
// file_sequential: &mut dyn Read,
Expand Down Expand Up @@ -245,9 +316,9 @@ where
}

let read = if single {
get_read_at_position(reader_seq, pos)
get_record_from_position(reader_seq, pos)
} else {
get_read_at_position(reader_rnd, pos)
get_record_from_position(reader_rnd, pos)
};

match read {
Expand Down
1 change: 1 addition & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ mod index;
mod io;
mod preset;
mod summary;
mod filter;

use cli::{Cli, Commands};

Expand Down
10 changes: 5 additions & 5 deletions tests/correct/index.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#{"nailpolish_version":"0.1.0","file_path":"/Users/olliecheng/Developer/WEHI/consensus-call/nailpolish/tests/data/scmixology2_sample.fastq","index_date":"2024-12-11T14:06:55.279837+08:00","elapsed":0.276829667,"gb":0.02817634679377079,"matched_read_count":14143,"unmatched_read_count":0,"read_count":14143,"avg_qual":21.152054726719978,"avg_len":1030.5793678851728}
#{"nailpolish_version":"0.1.0","file_path":"/Users/olliecheng/Developer/WEHI/consensus-call/nailpolish/tests/data/scmixology2_sample.fastq","index_date":"2025-01-01T21:46:26.373596+08:00","elapsed":0.021128792,"gb":0.02817634679377079,"matched_read_count":14143,"unmatched_read_count":0,"read_count":14143,"avg_qual":22967.658700417167,"avg_len":1030.5793678851728}
id pos avg_qual n_bases rec_len
TCTGGCTCATTCTCCG_GCAGCGAAGCCC 0 19.47 593 1264
GCACTAAGTTGAAGTA_ATCAATGTATTG 1264 9.52 1655 3388
Expand Down Expand Up @@ -6704,7 +6704,7 @@ TGACGCGAGTCTACCA_CTCCCCCTTCCG 14207110 24.26 1506 3090
GATTCGATCAAGCCTA_AAAATCGAGCAG 14210200 24.36 512 1102
AAGTCGTCAGTAGGAC_TTGACCCGGTAG 14211302 21.78 890 1858
ATACTTCTCCCGAGTG_AGGTTCGCTCCT 14213160 25.64 400 878
CAGGTATCATGGGTTT_TTCCAAAGTTTT 14214038 21.17 1840 3758
CAGGTATCATGGGTTT_TTCCAAAGTTTT 14214038 21.18 1840 3758
CACGGGTAGGCCACTC_TAACTTTCCCGT 14217796 21.23 1112 2302
TCAGTTTGTAGCGCTC_GGTTAACTTTAT 14220098 17.68 677 1432
TGACGCGAGTCTACCA_AGTCTCAACTCC 14221530 22.58 779 1636
Expand Down Expand Up @@ -7466,7 +7466,7 @@ AGGGAGTCAGGTCAAG_GGAAGTTTTCGT 15815868 21.98 631 1340
TCCATGCGTTGCTCGG_TAAACTGGTGAG 15817208 22.35 1023 2124
TTACCATCATGTCAGT_TACTGTTTTGTG 15819332 22.25 590 1258
CACGGGTAGGCCACTC_ATGTTACCCACT 15820590 22.64 1180 2438
TTCCTTCAGGGTACGT_TTGCAAATGCGC 15823028 22.67 1080 2238
TTCCTTCAGGGTACGT_TTGCAAATGCGC 15823028 22.68 1080 2238
TGATTCTTCGTGTCAA_TTACCTTAAGGG 15825266 16.98 1274 2626
CGGCAGTTCTGGGCGT_TGTATATTATAA 15827892 19.7 1268 2614
AATCGACAGGGTGGGA_GGATTCTCAATA 15830506 18.33 414 906
Expand Down Expand Up @@ -12934,7 +12934,7 @@ AGACCCGAGTTGGACG_ACAACGGTTTTC 27841400 22.25 1341 2760
CTAAGTGAGAACAGGA_CTCCCAGCTGTC 27844160 19.32 735 1548
CTAAGTGAGAACAGGA_CTCCCAGCTGTC 27845708 21.04 689 1456
CTAAGTGAGAACAGGA_CTCCCAGCTGTC 27847164 22.91 742 1562
TTGAGTGCAGGGCTTC_TGCCACTATGTA 27848726 14.92 560 1198
TTGAGTGCAGGGCTTC_TGCCACTATGTA 27848726 14.93 560 1198
TTGAGTGCAGGGCTTC_TGCCACTATGTA 27849924 20.11 616 1310
TTGAGTGCAGGGCTTC_TGCCACTATGTA 27851234 14.61 575 1228
CAGTTCCCAGAGTCTT_TGCGGTTCCGTT 27852462 19.5 328 734
Expand Down Expand Up @@ -13476,7 +13476,7 @@ AGCGCCACAATCCAGT_TCAGTGCCCTTT 28919244 21.01 656 1390
AGCGCCACAATCCAGT_TCAGTGCCCTTT 28920634 20.56 666 1410
CTAAGTGAGAACAGGA_TCTACATAACCA 28922044 27.7 928 1934
CTAAGTGAGAACAGGA_TCTACATAACCA 28923978 16.09 918 1914
CTAAGTGAGAACAGGA_TCTACATAACCA 28925892 23.42 480 1038
CTAAGTGAGAACAGGA_TCTACATAACCA 28925892 23.43 480 1038
CAGCACGTCTCTAAGG_TTCATAGGCTTT 28926930 20.95 1944 3966
CAGCACGTCTCTAAGG_TTCATAGGCTTT 28930896 22.7 780 1638
CAGCACGTCTCTAAGG_TTCATAGGCTTT 28932534 20.17 773 1624
Expand Down

0 comments on commit f630113

Please sign in to comment.