Skip to content

Commit

Permalink
Refactor index to use Record and add helper methods to Record
Browse files Browse the repository at this point in the history
  • Loading branch information
olliecheng committed Jan 1, 2025
1 parent fd51e58 commit f630113
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 40 deletions.
68 changes: 36 additions & 32 deletions src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use thiserror::Error;

use crate::duplicates::RecordIdentifier;
use crate::file::FastqFile;
use crate::io::Record;
use tempfile::tempfile_in;

#[derive(Debug, Serialize, Deserialize)]
Expand Down Expand Up @@ -106,7 +107,7 @@ fn iter_lines_with_regex<W: Write>(
let mut expected_len: Option<usize> = None;

let mut fastq_reader = needletail::parser::FastqReader::new(reader);
let mut total_quality = 0f64;
let mut total_quality = 0u32;
let mut total_len = 0;

while let Some(rec) = fastq_reader.next() {
Expand All @@ -116,29 +117,30 @@ fn iter_lines_with_regex<W: Write>(
info!("Processed: {}", info.read_count)
}

let rec = rec.expect("Invalid record");
let id = std::str::from_utf8(rec.id()).context("Could not convert id to string")?;
let position = rec.position().byte() as usize;
let sequence_rec = rec.expect("Invalid record");
let position = sequence_rec.position().byte() as usize;
let file_len = sequence_rec.all().len() + 1;
let mut rec = Record::try_from(sequence_rec)?;

match extract_bc_from_header(id, re, position) {
match extract_bc_from_header(&rec.id, re, position) {
Ok((len, identifier)) => {
match expected_len {
None => expected_len = Some(len),
Some(expected) => {
if expected != len {
bail!(IndexGenerationErr::DifferentMatchCounts {
header: id.to_string(),
re: re.clone(),
pos: position,
count: len,
expected
})
}
}
let expected_len = *expected_len.get_or_insert(len);

if expected_len != len {
bail!(IndexGenerationErr::DifferentMatchCounts {
header: rec.id,
re: re.clone(),
pos: position,
count: len,
expected: expected_len
})
}

total_quality += write_read(wtr, &rec, identifier.to_string(), position)?;
total_len += rec.num_bases();
rec.id = identifier.to_string();

rec.write_index(wtr, position, file_len)?;
total_quality += rec.phred_quality_total();
total_len += rec.len();
info.matched_read_count += 1;
}
Err(e) => {
Expand All @@ -152,7 +154,7 @@ fn iter_lines_with_regex<W: Write>(

wtr.flush()?;

info.avg_qual = total_quality / (info.matched_read_count as f64);
info.avg_qual = (total_quality as f64) / (info.matched_read_count as f64);
info.avg_len = (total_len as f64) / (info.matched_read_count as f64);
info.gb = (fastq_reader.position().byte() as f64) / (1024u32.pow(3) as f64);

Expand Down Expand Up @@ -218,7 +220,7 @@ fn iter_lines_with_cluster_file<W: Write>(
let mut fastq_reader = needletail::parser::FastqReader::new(reader);

// we store the total quality and length so that we can take an average at the end
let mut total_quality = 0f64;
let mut total_quality = 0u32;
let mut total_len = 0;

while let Some(rec) = fastq_reader.next() {
Expand All @@ -229,29 +231,31 @@ fn iter_lines_with_cluster_file<W: Write>(
info!("Processed: {}", info.read_count);
}

let rec = rec.expect("Invalid record");
let id = std::str::from_utf8(rec.id()).context("Could not convert id to string")?;
let position = rec.position().byte() as usize;
let sequence_rec = rec.expect("Invalid record");
let position = sequence_rec.position().byte() as usize;
let file_len = sequence_rec.all().len() + 1;
let mut rec = Record::try_from(sequence_rec)?;

let Some(identifier) = cluster_map.get(id) else {
let Some(identifier) = cluster_map.get(&rec.id) else {
if !skip_invalid_ids {
bail!(RowNotInClusters {
header: id.to_string()
})
bail!(RowNotInClusters { header: rec.id })
}
info.unmatched_read_count += 1;
continue;
};
info.matched_read_count += 1;

total_quality += write_read(wtr, &rec, identifier.clone(), position)?;
total_len += rec.num_bases();
rec.id = identifier.clone();
rec.write_index(wtr, position, file_len)?;

total_quality += rec.phred_quality_total();
total_len += rec.len();
}

wtr.flush()?;

// compute summary statistics
info.avg_qual = total_quality / (info.matched_read_count as f64);
info.avg_qual = (total_quality as f64) / (info.matched_read_count as f64);
info.avg_len = (total_len as f64) / (info.matched_read_count as f64);
info.gb = (fastq_reader.position().byte() as f64) / (1024u32.pow(3) as f64);

Expand Down
77 changes: 74 additions & 3 deletions src/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@ use needletail::parser::SequenceRecord;
use needletail::{parser::FastqReader, FastxReader};
use std::fmt::Write as FmtWrite;
// needed for write! to be implemented on Strings
use serde::{Deserialize, Serialize};
use std::fs::File;
use std::io::{BufReader, Read, Seek, SeekFrom, Write};
use std::iter::Map;
use std::slice::Iter;

#[derive(PartialEq, Eq)]
pub enum ReadType {
Expand All @@ -25,6 +28,7 @@ pub struct Record {
impl TryFrom<SequenceRecord<'_>> for Record {
type Error = std::string::FromUtf8Error;

/// Attempt to create a Record from a SequenceRecord.
fn try_from(rec: SequenceRecord) -> Result<Self, Self::Error> {
Ok(Record {
id: String::from_utf8(rec.id().to_vec())?,
Expand All @@ -35,10 +39,40 @@ impl TryFrom<SequenceRecord<'_>> for Record {
}

impl Record {
/// Returns an iterator over the decoded PHRED quality scores of the record.
///
/// Each ASCII quality character is decoded to its numeric PHRED score by
/// subtracting the Sanger/Illumina 1.8+ offset of 33 (`!`).
/// https://en.wikipedia.org/wiki/Phred_quality_score
///
/// NOTE(review): assumes every quality byte is >= 33 (`!`); a lower byte
/// would underflow (panic in debug builds) — confirm upstream validation.
pub fn phred_quality(&self) -> Map<Iter<u8>, fn(&u8) -> u32> {
    // `.iter()` rather than `.into_iter()` on the `&[u8]`: same `Iter<u8>`
    // type, but avoids the clippy `into_iter_on_ref` anti-idiom.
    self.qual.as_bytes().iter().map(|&x| (x as u32) - 33u32)
}

/// Returns the mean PHRED quality score across all bases of the record,
/// rounded to two decimal places.
pub fn phred_quality_avg(&self) -> f64 {
    let total = self.phred_quality_total() as f64;
    let mean = total / (self.len() as f64);
    // Round to 2 decimal places by scaling up, rounding, and scaling back.
    (mean * 100.0).round() / 100.0
}

/// Returns the total of all per-base PHRED quality scores in the record.
pub fn phred_quality_total(&self) -> u32 {
    // Accumulate the decoded scores; equivalent to `.sum()`.
    self.phred_quality().fold(0u32, |acc, score| acc + score)
}

/// Returns the sequence length in base count of the record
pub fn len(&self) -> usize {
self.seq.len()
}

/// Writes the Record to `writer` in FASTQ format (no trailing newline).
pub fn write_fastq(&self, writer: &mut impl Write) -> Result<(), std::io::Error> {
    // Emits: @<id>\n<seq>\n+\n<qual> — the separator line is a bare '+'.
    writeln!(writer, "@{}", self.id)?;
    writeln!(writer, "{}", self.seq)?;
    writeln!(writer, "+")?;
    write!(writer, "{}", self.qual)
}

/// Writes the Record to `writer` in FASTA format (no trailing newline).
pub fn write_fasta(&self, writer: &mut impl Write) -> Result<(), std::io::Error> {
    // Emits: ><id>\n<seq>
    writeln!(writer, ">{}", self.id)?;
    write!(writer, "{}", self.seq)
}
Expand All @@ -52,6 +86,9 @@ impl Record {
/// * `group_idx` - The index of the read in the group.
/// * `group_size` - The size of the group.
/// * `avg_qual` - The average quality score of the group.
///
/// # Note
/// This function will modify the Record irreversibly by changing the Record's `id` field
pub fn add_metadata(
&mut self,
umi_group: usize,
Expand Down Expand Up @@ -80,6 +117,40 @@ impl Record {
}
}

/// One row of the on-disk index, serialized through serde by a CSV writer
/// (see `Record::write_index`).
#[derive(Debug, Serialize, Deserialize)]
struct IndexRecord {
    // Identifier of the read.
    id: String,
    // Byte position of the record within the source file.
    pos: usize,
    // Average PHRED quality of the read, rounded to two decimal places.
    avg_qual: f64,
    // Number of bases in the sequence.
    n_bases: usize,
    // Total bytes the record occupies in the source file.
    rec_len: usize,
}

impl Record {
    /// Serializes a summary row for this Record to an external index writer,
    /// combining the Record's own fields with caller-supplied file metadata.
    ///
    /// # Arguments
    ///
    /// * `wtr` - A mutable reference to a CSV writer receiving the index row.
    /// * `pos` - The position of the record in the file.
    /// * `file_len` - The bytes consumed by the record in the file (the _length_ on _file_)
    pub fn write_index<W: Write>(
        &self,
        wtr: &mut csv::Writer<W>,
        pos: usize,
        file_len: usize,
    ) -> csv::Result<()> {
        // Build the row first, then hand it to serde for serialization.
        let row = IndexRecord {
            id: self.id.clone(),
            pos,
            avg_qual: self.phred_quality_avg(),
            n_bases: self.len(),
            rec_len: file_len,
        };
        wtr.serialize(row)
    }
}

pub struct UMIGroup {
/// The "Identifier" of this group, typically a "BC_UMI" string
pub id: RecordIdentifier,
Expand Down Expand Up @@ -114,7 +185,7 @@ pub struct UMIGroup {
/// let record = get_read_at_position(&mut file, 12345)?;
/// println!("Record ID: {}", record.id);
/// ```
pub fn get_read_at_position<R: Read + Seek + Send>(
pub fn get_record_from_position<R: Read + Seek + Send>(
// file: &mut File,
reader: &mut R,
// file_sequential: &mut dyn Read,
Expand Down Expand Up @@ -245,9 +316,9 @@ where
}

let read = if single {
get_read_at_position(reader_seq, pos)
get_record_from_position(reader_seq, pos)
} else {
get_read_at_position(reader_rnd, pos)
get_record_from_position(reader_rnd, pos)
};

match read {
Expand Down
1 change: 1 addition & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ mod index;
mod io;
mod preset;
mod summary;
mod filter;

use cli::{Cli, Commands};

Expand Down
10 changes: 5 additions & 5 deletions tests/correct/index.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#{"nailpolish_version":"0.1.0","file_path":"/Users/olliecheng/Developer/WEHI/consensus-call/nailpolish/tests/data/scmixology2_sample.fastq","index_date":"2024-12-11T14:06:55.279837+08:00","elapsed":0.276829667,"gb":0.02817634679377079,"matched_read_count":14143,"unmatched_read_count":0,"read_count":14143,"avg_qual":21.152054726719978,"avg_len":1030.5793678851728}
#{"nailpolish_version":"0.1.0","file_path":"/Users/olliecheng/Developer/WEHI/consensus-call/nailpolish/tests/data/scmixology2_sample.fastq","index_date":"2025-01-01T21:46:26.373596+08:00","elapsed":0.021128792,"gb":0.02817634679377079,"matched_read_count":14143,"unmatched_read_count":0,"read_count":14143,"avg_qual":22967.658700417167,"avg_len":1030.5793678851728}
id pos avg_qual n_bases rec_len
TCTGGCTCATTCTCCG_GCAGCGAAGCCC 0 19.47 593 1264
GCACTAAGTTGAAGTA_ATCAATGTATTG 1264 9.52 1655 3388
Expand Down Expand Up @@ -6704,7 +6704,7 @@ TGACGCGAGTCTACCA_CTCCCCCTTCCG 14207110 24.26 1506 3090
GATTCGATCAAGCCTA_AAAATCGAGCAG 14210200 24.36 512 1102
AAGTCGTCAGTAGGAC_TTGACCCGGTAG 14211302 21.78 890 1858
ATACTTCTCCCGAGTG_AGGTTCGCTCCT 14213160 25.64 400 878
CAGGTATCATGGGTTT_TTCCAAAGTTTT 14214038 21.17 1840 3758
CAGGTATCATGGGTTT_TTCCAAAGTTTT 14214038 21.18 1840 3758
CACGGGTAGGCCACTC_TAACTTTCCCGT 14217796 21.23 1112 2302
TCAGTTTGTAGCGCTC_GGTTAACTTTAT 14220098 17.68 677 1432
TGACGCGAGTCTACCA_AGTCTCAACTCC 14221530 22.58 779 1636
Expand Down Expand Up @@ -7466,7 +7466,7 @@ AGGGAGTCAGGTCAAG_GGAAGTTTTCGT 15815868 21.98 631 1340
TCCATGCGTTGCTCGG_TAAACTGGTGAG 15817208 22.35 1023 2124
TTACCATCATGTCAGT_TACTGTTTTGTG 15819332 22.25 590 1258
CACGGGTAGGCCACTC_ATGTTACCCACT 15820590 22.64 1180 2438
TTCCTTCAGGGTACGT_TTGCAAATGCGC 15823028 22.67 1080 2238
TTCCTTCAGGGTACGT_TTGCAAATGCGC 15823028 22.68 1080 2238
TGATTCTTCGTGTCAA_TTACCTTAAGGG 15825266 16.98 1274 2626
CGGCAGTTCTGGGCGT_TGTATATTATAA 15827892 19.7 1268 2614
AATCGACAGGGTGGGA_GGATTCTCAATA 15830506 18.33 414 906
Expand Down Expand Up @@ -12934,7 +12934,7 @@ AGACCCGAGTTGGACG_ACAACGGTTTTC 27841400 22.25 1341 2760
CTAAGTGAGAACAGGA_CTCCCAGCTGTC 27844160 19.32 735 1548
CTAAGTGAGAACAGGA_CTCCCAGCTGTC 27845708 21.04 689 1456
CTAAGTGAGAACAGGA_CTCCCAGCTGTC 27847164 22.91 742 1562
TTGAGTGCAGGGCTTC_TGCCACTATGTA 27848726 14.92 560 1198
TTGAGTGCAGGGCTTC_TGCCACTATGTA 27848726 14.93 560 1198
TTGAGTGCAGGGCTTC_TGCCACTATGTA 27849924 20.11 616 1310
TTGAGTGCAGGGCTTC_TGCCACTATGTA 27851234 14.61 575 1228
CAGTTCCCAGAGTCTT_TGCGGTTCCGTT 27852462 19.5 328 734
Expand Down Expand Up @@ -13476,7 +13476,7 @@ AGCGCCACAATCCAGT_TCAGTGCCCTTT 28919244 21.01 656 1390
AGCGCCACAATCCAGT_TCAGTGCCCTTT 28920634 20.56 666 1410
CTAAGTGAGAACAGGA_TCTACATAACCA 28922044 27.7 928 1934
CTAAGTGAGAACAGGA_TCTACATAACCA 28923978 16.09 918 1914
CTAAGTGAGAACAGGA_TCTACATAACCA 28925892 23.42 480 1038
CTAAGTGAGAACAGGA_TCTACATAACCA 28925892 23.43 480 1038
CAGCACGTCTCTAAGG_TTCATAGGCTTT 28926930 20.95 1944 3966
CAGCACGTCTCTAAGG_TTCATAGGCTTT 28930896 22.7 780 1638
CAGCACGTCTCTAAGG_TTCATAGGCTTT 28932534 20.17 773 1624
Expand Down

0 comments on commit f630113

Please sign in to comment.