Formatting and code cleanup

DavidsonGroup · Jan 1, 2025 · fd51e58
1 parent d525ff1
commit fd51e58
Show file tree

Hide file tree

Showing 6 changed files with 90 additions and 84 deletions.
diff --git a/src/duplicates.rs b/src/duplicates.rs
@@ -62,7 +62,7 @@ impl RecordIdentifier {
     pub fn from_string(s: &str) -> Self {
         let split_loc = match s.find('_') {
             Some(v) => v,
-            None => s.len() - 1
+            None => s.len() - 1,
         };
 
         RecordIdentifier {
@@ -113,7 +113,8 @@ pub fn get_duplicates(index: &str) -> Result<(DuplicateMap, DuplicateStatistics,
     let mut header = String::new();
 
     // read the first line, which is NOT in CSV format
-    file.read_line(&mut header).context("Could not read the first line")?;
+    file.read_line(&mut header)
+        .context("Could not read the first line")?;
 
     assert!(header.starts_with('#'));
     let info: FastqFile = serde_json::from_str(&header[1..])?;
@@ -146,21 +147,23 @@ pub fn get_duplicates(index: &str) -> Result<(DuplicateMap, DuplicateStatistics,
 
     // Compute information about the duplicates
     stats.duplicate_ids = 0;
-    stats.duplicate_reads = map.values().map(|v| {
-        let length = v.len();
-        if length > 1 {
-            stats.duplicate_ids += 1;
-
-            if let Some(x) = stats.distribution.get_mut(&length) {
-                *x += 1
+    stats.duplicate_reads = map
+        .values()
+        .map(|v| {
+            let length = v.len();
+            if length > 1 {
+                stats.duplicate_ids += 1;
+
+                if let Some(x) = stats.distribution.get_mut(&length) {
+                    *x += 1
+                } else {
+                    stats.distribution.insert(length, 1);
+                }
+                length
             } else {
-                stats.distribution.insert(length, 1);
+                0
             }
-            length
-        } else {
-            0
-        }
-    })
+        })
         .sum();
 
     stats

diff --git a/src/file.rs b/src/file.rs
@@ -12,4 +12,4 @@ pub struct FastqFile {
     pub read_count: usize,
     pub avg_qual: f64,
     pub avg_len: f64,
-}
+}
diff --git a/src/index.rs b/src/index.rs
@@ -49,7 +49,9 @@ fn write_read<W: Write>(
     position: usize,
 ) -> Result<f64> {
     let len = rec.num_bases();
-    let qual: u32 = rec.qual().expect(".fastq should not fail here")
+    let qual: u32 = rec
+        .qual()
+        .expect(".fastq should not fail here")
         .iter()
         .map(|x| *x as u32)
         .sum();
@@ -63,22 +65,17 @@ fn write_read<W: Write>(
     // round to 2dp
     let phred_qual = (phred_qual * 100.0).round() / 100.0;
 
-    // eprintln!("Buffer:\n{}---", std::str::from_utf8(rec.all()).unwrap());
-
-    wtr.serialize(
-        IndexRecord {
-            id: identifier,
-            pos: position,
-            avg_qual: phred_qual,
-            n_bases: len,
-            rec_len: rec.all().len() + 1,
-        }
-    )?;
+    wtr.serialize(IndexRecord {
+        id: identifier,
+        pos: position,
+        avg_qual: phred_qual,
+        n_bases: len,
+        rec_len: rec.all().len() + 1,
+    })?;
 
     Ok(phred_qual)
 }
 
-
 /// Iterates over lines in a FASTQ file, extracting barcodes using a regex
 /// and writing the results to a CSV writer.
 ///
@@ -126,20 +123,16 @@ fn iter_lines_with_regex<W: Write>(
         match extract_bc_from_header(id, re, position) {
             Ok((len, identifier)) => {
                 match expected_len {
-                    None => {
-                        expected_len = Some(len)
-                    }
+                    None => expected_len = Some(len),
                     Some(expected) => {
                         if expected != len {
-                            bail!(
-                                IndexGenerationErr::DifferentMatchCounts {
-                                    header: id.to_string(),
-                                    re: re.clone(),
-                                    pos: position,
-                                    count: len,
-                                    expected
-                                }
-                            )
+                            bail!(IndexGenerationErr::DifferentMatchCounts {
+                                header: id.to_string(),
+                                re: re.clone(),
+                                pos: position,
+                                count: len,
+                                expected
+                            })
                         }
                     }
                 }
@@ -212,15 +205,16 @@ fn iter_lines_with_cluster_file<W: Write>(
             3 => format!("{}_{}", &record[1], &record[2]),
 
             // doesn't make sense
-            _ => bail!(InvalidClusterRow {row: record.as_slice().to_string()})
+            _ => bail!(InvalidClusterRow {
+                row: record.as_slice().to_string()
+            }),
         };
 
         cluster_map.insert(read_id, identifier);
     }
 
     info!("Finished reading clusters. ");
 
-
     let mut fastq_reader = needletail::parser::FastqReader::new(reader);
 
     // we store the total quality and length so that we can take an average at the end
@@ -241,7 +235,9 @@ fn iter_lines_with_cluster_file<W: Write>(
 
         let Some(identifier) = cluster_map.get(id) else {
             if !skip_invalid_ids {
-                bail!(RowNotInClusters {header: id.to_string()})
+                bail!(RowNotInClusters {
+                    header: id.to_string()
+                })
             }
             info.unmatched_read_count += 1;
             continue;
@@ -291,7 +287,8 @@ fn extract_bc_from_header(
         });
     };
 
-    let captures = captures.iter()
+    let captures = captures
+        .iter()
         .skip(1)
         .flatten()
         .map(|m| m.as_str())
@@ -302,7 +299,7 @@ fn extract_bc_from_header(
         RecordIdentifier {
             head: captures[0].to_string(),
             tail: captures[1..].join("_"),
-        }
+        },
     ))
 }
 
@@ -373,9 +370,7 @@ pub fn construct_index(
     let re = Regex::new(barcode_regex)?;
     let mut result = match clusters {
         // no cluster file has been used
-        None => {
-            iter_lines_with_regex(reader, &mut wtr, &re, skip_unmatched, file_info)
-        }
+        None => iter_lines_with_regex(reader, &mut wtr, &re, skip_unmatched, file_info),
 
         // cluster file is being used
         Some(filepath) => {
@@ -384,24 +379,30 @@ pub fn construct_index(
                 .has_headers(false)
                 .from_path(filepath)?;
 
-            iter_lines_with_cluster_file(reader, &mut wtr, &mut cluster_rdr, skip_unmatched, file_info)
+            iter_lines_with_cluster_file(
+                reader,
+                &mut wtr,
+                &mut cluster_rdr,
+                skip_unmatched,
+                file_info,
+            )
         }
     }?;
 
-
     // amount of time passed
     result.elapsed = now.elapsed().as_secs_f64();
 
     // report results
     if skip_unmatched {
         info!(
             "Stats: {} matched reads, {} unmatched reads, {:.1}s runtime",
-            result.matched_read_count,
-            result.unmatched_read_count,
-            result.elapsed,
+            result.matched_read_count, result.unmatched_read_count, result.elapsed,
         )
     } else {
-        info!("Stats: {} reads, {:.1}s runtime", result.matched_read_count, result.elapsed)
+        info!(
+            "Stats: {} reads, {:.1}s runtime",
+            result.matched_read_count, result.elapsed
+        )
     }
 
     info!("Writing to {outfile}...");
@@ -415,30 +416,35 @@ pub fn construct_index(
     temp_file.seek(std::io::SeekFrom::Start(0))?;
 
     // copy from the temporary file into the final output file
-    std::io::copy(
-        &mut temp_file,
-        &mut wtr_out,
-    )?;
+    std::io::copy(&mut temp_file, &mut wtr_out)?;
 
     Ok(())
 }
 
 #[derive(Error, Debug)]
 enum IndexGenerationErr {
-    #[error("no matches produced:
+    #[error(
+        "no matches produced:
 position {pos}
     `{header}`
 with capture group
     {re:?}
-suggestion: if some of the reads should not produce a barcode, pass the --skip-unmatched flag")]
-    NoMatch { header: String, re: Regex, pos: usize },
+suggestion: if some of the reads should not produce a barcode, pass the --skip-unmatched flag"
+    )]
+    NoMatch {
+        header: String,
+        re: Regex,
+        pos: usize,
+    },
 
-    #[error("inconsistent identifier count:
+    #[error(
+        "inconsistent identifier count:
 position {pos}
     `{header}`
 has {count} matches, whereas {expected} matches were expected
 using capture group
-    {re:?}")]
+    {re:?}"
+    )]
     DifferentMatchCounts {
         header: String,
         re: Regex,
@@ -447,17 +453,15 @@ using capture group
         expected: usize,
     },
 
-    #[error("invalid cluster row: should be of the format
+    #[error(
+        "invalid cluster row: should be of the format
   `READ_ID;BC;UMI`
 or
   `READ_ID;BC`, but instead got
-{row}")]
-    InvalidClusterRow {
-        row: String
-    },
+{row}"
+    )]
+    InvalidClusterRow { row: String },
 
     #[error("Row {header} of input file not present in cluster file")]
-    RowNotInClusters {
-        header: String
-    },
+    RowNotInClusters { header: String },
 }
diff --git a/src/main.rs b/src/main.rs
@@ -14,15 +14,14 @@ use anyhow::Result;
 use clap::Parser;
 
 mod call;
+mod cli;
 mod duplicates;
+mod file;
+mod group;
 mod index;
-mod cli;
+mod io;
 mod preset;
-mod file;
 mod summary;
-mod io;
-mod group;
-
 
 use cli::{Cli, Commands};
 
@@ -70,7 +69,7 @@ fn try_main() -> Result<()> {
             preset,
             barcode_regex,
             clusters,
-            skip_unmatched
+            skip_unmatched,
         } => {
             let barcode_regex = match barcode_regex {
                 Some(v) => {
@@ -116,7 +115,7 @@ fn try_main() -> Result<()> {
         Commands::Group {
             index,
             input,
-            output
+            output,
         } => {
             let (duplicates, _, _) =
                 duplicates::get_duplicates(index).expect("Could not parse index.");

diff --git a/src/preset.rs b/src/preset.rs
@@ -22,8 +22,8 @@ pub enum PresetBarcodeFormats {
 /// A `String` containing the regular expression for the specified barcode format.
 pub fn get_barcode_regex(preset: &PresetBarcodeFormats) -> String {
     match preset {
-        PresetBarcodeFormats::BcUmi => { String::from(r"^([ATCG]{16})_([ATCG]{12})") }
-        PresetBarcodeFormats::UmiTools => { String::from(r"_([ATCG]+)$") }
-        PresetBarcodeFormats::Illumina => { String::from(r":([ATCG]+)$") }
+        PresetBarcodeFormats::BcUmi => String::from(r"^([ATCG]{16})_([ATCG]{12})"),
+        PresetBarcodeFormats::UmiTools => String::from(r"_([ATCG]+)$"),
+        PresetBarcodeFormats::Illumina => String::from(r":([ATCG]+)$"),
     }
-}
+}
diff --git a/src/summary.rs b/src/summary.rs
@@ -37,4 +37,4 @@ pub fn summarize(index: &str, output: &str) -> Result<()> {
     reg.render_template_to_write(TEMPLATE_HTML, &data, file)?;
 
     Ok(())
-}
+}
-Original file line number
+Diff line change
@@ -37,4 +37,4 @@ pub fn summarize(index: &str, output: &str) -> Result<()> {
         reg.render_template_to_write(TEMPLATE_HTML, &data, file)?;
         Ok(())
-    }
+    }