Skip to content

Commit

Permalink
Formatting and code cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
olliecheng committed Jan 1, 2025
1 parent d525ff1 commit fd51e58
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 84 deletions.
33 changes: 18 additions & 15 deletions src/duplicates.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ impl RecordIdentifier {
pub fn from_string(s: &str) -> Self {
let split_loc = match s.find('_') {
Some(v) => v,
None => s.len() - 1
None => s.len() - 1,
};

RecordIdentifier {
Expand Down Expand Up @@ -113,7 +113,8 @@ pub fn get_duplicates(index: &str) -> Result<(DuplicateMap, DuplicateStatistics,
let mut header = String::new();

// read the first line, which is NOT in CSV format
file.read_line(&mut header).context("Could not read the first line")?;
file.read_line(&mut header)
.context("Could not read the first line")?;

assert!(header.starts_with('#'));
let info: FastqFile = serde_json::from_str(&header[1..])?;
Expand Down Expand Up @@ -146,21 +147,23 @@ pub fn get_duplicates(index: &str) -> Result<(DuplicateMap, DuplicateStatistics,

// Compute information about the duplicates
stats.duplicate_ids = 0;
stats.duplicate_reads = map.values().map(|v| {
let length = v.len();
if length > 1 {
stats.duplicate_ids += 1;

if let Some(x) = stats.distribution.get_mut(&length) {
*x += 1
stats.duplicate_reads = map
.values()
.map(|v| {
let length = v.len();
if length > 1 {
stats.duplicate_ids += 1;

if let Some(x) = stats.distribution.get_mut(&length) {
*x += 1
} else {
stats.distribution.insert(length, 1);
}
length
} else {
stats.distribution.insert(length, 1);
0
}
length
} else {
0
}
})
})
.sum();

stats
Expand Down
2 changes: 1 addition & 1 deletion src/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ pub struct FastqFile {
pub read_count: usize,
pub avg_qual: f64,
pub avg_len: f64,
}
}
116 changes: 60 additions & 56 deletions src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,9 @@ fn write_read<W: Write>(
position: usize,
) -> Result<f64> {
let len = rec.num_bases();
let qual: u32 = rec.qual().expect(".fastq should not fail here")
let qual: u32 = rec
.qual()
.expect(".fastq should not fail here")
.iter()
.map(|x| *x as u32)
.sum();
Expand All @@ -63,22 +65,17 @@ fn write_read<W: Write>(
// round to 2 decimal places
let phred_qual = (phred_qual * 100.0).round() / 100.0;

// eprintln!("Buffer:\n{}---", std::str::from_utf8(rec.all()).unwrap());

wtr.serialize(
IndexRecord {
id: identifier,
pos: position,
avg_qual: phred_qual,
n_bases: len,
rec_len: rec.all().len() + 1,
}
)?;
wtr.serialize(IndexRecord {
id: identifier,
pos: position,
avg_qual: phred_qual,
n_bases: len,
rec_len: rec.all().len() + 1,
})?;

Ok(phred_qual)
}


/// Iterates over lines in a FASTQ file, extracting barcodes using a regex
/// and writing the results to a CSV writer.
///
Expand Down Expand Up @@ -126,20 +123,16 @@ fn iter_lines_with_regex<W: Write>(
match extract_bc_from_header(id, re, position) {
Ok((len, identifier)) => {
match expected_len {
None => {
expected_len = Some(len)
}
None => expected_len = Some(len),
Some(expected) => {
if expected != len {
bail!(
IndexGenerationErr::DifferentMatchCounts {
header: id.to_string(),
re: re.clone(),
pos: position,
count: len,
expected
}
)
bail!(IndexGenerationErr::DifferentMatchCounts {
header: id.to_string(),
re: re.clone(),
pos: position,
count: len,
expected
})
}
}
}
Expand Down Expand Up @@ -212,15 +205,16 @@ fn iter_lines_with_cluster_file<W: Write>(
3 => format!("{}_{}", &record[1], &record[2]),

// any other row shape is not a valid cluster row
_ => bail!(InvalidClusterRow {row: record.as_slice().to_string()})
_ => bail!(InvalidClusterRow {
row: record.as_slice().to_string()
}),
};

cluster_map.insert(read_id, identifier);
}

info!("Finished reading clusters. ");


let mut fastq_reader = needletail::parser::FastqReader::new(reader);

// we store the total quality and length so that we can take an average at the end
Expand All @@ -241,7 +235,9 @@ fn iter_lines_with_cluster_file<W: Write>(

let Some(identifier) = cluster_map.get(id) else {
if !skip_invalid_ids {
bail!(RowNotInClusters {header: id.to_string()})
bail!(RowNotInClusters {
header: id.to_string()
})
}
info.unmatched_read_count += 1;
continue;
Expand Down Expand Up @@ -291,7 +287,8 @@ fn extract_bc_from_header(
});
};

let captures = captures.iter()
let captures = captures
.iter()
.skip(1)
.flatten()
.map(|m| m.as_str())
Expand All @@ -302,7 +299,7 @@ fn extract_bc_from_header(
RecordIdentifier {
head: captures[0].to_string(),
tail: captures[1..].join("_"),
}
},
))
}

Expand Down Expand Up @@ -373,9 +370,7 @@ pub fn construct_index(
let re = Regex::new(barcode_regex)?;
let mut result = match clusters {
// no cluster file has been used
None => {
iter_lines_with_regex(reader, &mut wtr, &re, skip_unmatched, file_info)
}
None => iter_lines_with_regex(reader, &mut wtr, &re, skip_unmatched, file_info),

// cluster file is being used
Some(filepath) => {
Expand All @@ -384,24 +379,30 @@ pub fn construct_index(
.has_headers(false)
.from_path(filepath)?;

iter_lines_with_cluster_file(reader, &mut wtr, &mut cluster_rdr, skip_unmatched, file_info)
iter_lines_with_cluster_file(
reader,
&mut wtr,
&mut cluster_rdr,
skip_unmatched,
file_info,
)
}
}?;


// amount of time passed
result.elapsed = now.elapsed().as_secs_f64();

// report results
if skip_unmatched {
info!(
"Stats: {} matched reads, {} unmatched reads, {:.1}s runtime",
result.matched_read_count,
result.unmatched_read_count,
result.elapsed,
result.matched_read_count, result.unmatched_read_count, result.elapsed,
)
} else {
info!("Stats: {} reads, {:.1}s runtime", result.matched_read_count, result.elapsed)
info!(
"Stats: {} reads, {:.1}s runtime",
result.matched_read_count, result.elapsed
)
}

info!("Writing to {outfile}...");
Expand All @@ -415,30 +416,35 @@ pub fn construct_index(
temp_file.seek(std::io::SeekFrom::Start(0))?;

// copy from the temporary file into the final output file
std::io::copy(
&mut temp_file,
&mut wtr_out,
)?;
std::io::copy(&mut temp_file, &mut wtr_out)?;

Ok(())
}

#[derive(Error, Debug)]
enum IndexGenerationErr {
#[error("no matches produced:
#[error(
"no matches produced:
position {pos}
`{header}`
with capture group
{re:?}
suggestion: if some of the reads should not produce a barcode, pass the --skip-unmatched flag")]
NoMatch { header: String, re: Regex, pos: usize },
suggestion: if some of the reads should not produce a barcode, pass the --skip-unmatched flag"
)]
NoMatch {
header: String,
re: Regex,
pos: usize,
},

#[error("inconsistent identifier count:
#[error(
"inconsistent identifier count:
position {pos}
`{header}`
has {count} matches, whereas {expected} matches were expected
using capture group
{re:?}")]
{re:?}"
)]
DifferentMatchCounts {
header: String,
re: Regex,
Expand All @@ -447,17 +453,15 @@ using capture group
expected: usize,
},

#[error("invalid cluster row: should be of the format
#[error(
"invalid cluster row: should be of the format
`READ_ID;BC;UMI`
or
`READ_ID;BC`, but instead got
{row}")]
InvalidClusterRow {
row: String
},
{row}"
)]
InvalidClusterRow { row: String },

#[error("Row {header} of input file not present in cluster file")]
RowNotInClusters {
header: String
},
RowNotInClusters { header: String },
}
13 changes: 6 additions & 7 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,14 @@ use anyhow::Result;
use clap::Parser;

mod call;
mod cli;
mod duplicates;
mod file;
mod group;
mod index;
mod cli;
mod io;
mod preset;
mod file;
mod summary;
mod io;
mod group;


use cli::{Cli, Commands};

Expand Down Expand Up @@ -70,7 +69,7 @@ fn try_main() -> Result<()> {
preset,
barcode_regex,
clusters,
skip_unmatched
skip_unmatched,
} => {
let barcode_regex = match barcode_regex {
Some(v) => {
Expand Down Expand Up @@ -116,7 +115,7 @@ fn try_main() -> Result<()> {
Commands::Group {
index,
input,
output
output,
} => {
let (duplicates, _, _) =
duplicates::get_duplicates(index).expect("Could not parse index.");
Expand Down
8 changes: 4 additions & 4 deletions src/preset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ pub enum PresetBarcodeFormats {
/// A `String` containing the regular expression for the specified barcode format.
pub fn get_barcode_regex(preset: &PresetBarcodeFormats) -> String {
// Each pattern only accepts the uppercase bases A, T, C and G, and every
// parenthesised group is a capture group:
//   BcUmi    - anchored at the start: exactly 16 bases, an underscore, then
//              exactly 12 bases (two capture groups; presumably the barcode
//              followed by the UMI - confirm against the caller).
//   UmiTools - anchored at the end: one or more bases following an underscore.
//   Illumina - anchored at the end: one or more bases following a colon.
//
// NOTE(review): each arm is listed twice below because this listing shows the
// arms both with braced bodies and with comma-terminated bodies; the regex
// literals themselves are identical in both forms.
match preset {
PresetBarcodeFormats::BcUmi => { String::from(r"^([ATCG]{16})_([ATCG]{12})") }
PresetBarcodeFormats::UmiTools => { String::from(r"_([ATCG]+)$") }
PresetBarcodeFormats::Illumina => { String::from(r":([ATCG]+)$") }
PresetBarcodeFormats::BcUmi => String::from(r"^([ATCG]{16})_([ATCG]{12})"),
PresetBarcodeFormats::UmiTools => String::from(r"_([ATCG]+)$"),
PresetBarcodeFormats::Illumina => String::from(r":([ATCG]+)$"),
}
}
}
2 changes: 1 addition & 1 deletion src/summary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,4 @@ pub fn summarize(index: &str, output: &str) -> Result<()> {
reg.render_template_to_write(TEMPLATE_HTML, &data, file)?;

Ok(())
}
}

0 comments on commit fd51e58

Please sign in to comment.