Kvg phase (#43)
* Remove dust and repetitive noise k-mers during graph cleaning

* Correct reads using a sliding-window approach

* New command (`call`) to emit two fully phased haplotypes (sketched after the change summary below)
kvg authored Nov 28, 2024
1 parent 099fd47 commit 741f1dc
Showing 15 changed files with 907 additions and 119 deletions.
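
The `call` command itself lives in one of the 15 changed files not excerpted below. As a rough illustration of what emitting two fully phased haplotypes involves, here is a minimal greedy read-backed phasing sketch; every name in it (`phase_reads`, the 0/1 allele encoding) is hypothetical and not hidive's actual implementation.

```rust
use std::collections::HashMap;

// Hypothetical sketch of read-backed phasing: each read is summarized as its
// alleles at heterozygous sites (site index -> 0/1). Greedily grow two
// haplotypes by assigning each read to whichever it agrees with more.
fn phase_reads(reads: &[HashMap<usize, u8>]) -> (HashMap<usize, u8>, HashMap<usize, u8>) {
    let mut hap1: HashMap<usize, u8> = HashMap::new();
    let mut hap2: HashMap<usize, u8> = HashMap::new();

    for read in reads {
        // Agreement score of this read against a growing haplotype:
        // +1 per matching site, -1 per mismatching site.
        let score = |hap: &HashMap<usize, u8>| {
            read.iter()
                .filter_map(|(site, allele)| hap.get(site).map(|h| if h == allele { 1i32 } else { -1 }))
                .sum::<i32>()
        };
        let (s1, s2) = (score(&hap1), score(&hap2));

        // Extend the better-matching haplotype; mirror alleles onto the other,
        // since het sites are complementary between the two haplotypes.
        let (chosen, other) = if s1 >= s2 { (&mut hap1, &mut hap2) } else { (&mut hap2, &mut hap1) };
        for (&site, &allele) in read {
            chosen.entry(site).or_insert(allele);
            other.entry(site).or_insert(1 - allele);
        }
    }
    (hap1, hap2)
}
```

Production phasers weigh base qualities and resolve switch errors globally; this greedy pass only conveys the core idea of growing two complementary haplotypes.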
7 changes: 6 additions & 1 deletion docker/Dockerfile
@@ -10,7 +10,7 @@ RUN apt-get update --allow-releaseinfo-change && \
     apt-get install -y --no-install-recommends \
     curl wget bzip2 make gcc cmake g++ keychain git build-essential zlib1g-dev libssl-dev lbzip2 \
     libbz2-dev libcurl4-gnutls-dev libncurses5-dev libstdc++6 libncursesw5-dev liblzma-dev clang \
-    libclang-dev pkg-config libtcmalloc-minimal4 python3-dev python3-setuptools && \
+    libclang-dev pkg-config libtcmalloc-minimal4 python3-dev python3-setuptools fontconfig && \
     rm -rf /var/lib/apt/lists/*

# Reduced gcloud installation (from framegrace: https://github.com/GoogleCloudPlatform/gsutil/issues/1732#issuecomment-2029591598)
@@ -59,6 +59,11 @@ RUN wget -O samtools-1.21.tar.bz2 https://github.com/samtools/samtools/releases/
 # Final stage
 FROM python:3.9-slim
 
+# Install some libraries in final image
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends fontconfig libcurl4-gnutls-dev && \
+    rm -rf /var/lib/apt/lists/*
+
 # Copy necessary files from builder stage
 COPY --from=builder /usr/local/bin/minimap2 /usr/local/bin/minimap2
 COPY --from=builder /usr/local/bin/samtools /usr/local/bin/samtools
6 changes: 4 additions & 2 deletions src/hidive/Cargo.toml
@@ -23,7 +23,8 @@ indicatif = { version = "0.17.8", features = ["rayon"] }
 itertools = "0.13.0"
 # linfa = "0.5"
 linked-hash-map = "0.5.6"
-minimap2 = { version = "0.1.20+minimap2.2.28", features = ["simde"] }
+# minimap2 = { version = "0.1.20+minimap2.2.28", features = ["simde"] }
+minimap2 = "0.1.21"
 needletail = "0.5.1"
 # noodles = "0.78.0"
 # ndarray = "0.15"
@@ -36,10 +37,11 @@ petgraph = "0.6.5"
 rand = "0.8.5"
 rayon = "1.10.0"
 #recgraph = { git = "https://github.com/AlgoLab/RecGraph" }
-# regex = "1.10.5"
+regex = "1.10.5"
 #russcip = "0.3.4"
 rust-htslib = { version = "0.47.0", features = ["gcs", "serde_feature"] }
 # rust-wfa2 = {git = "https://github.com/pairwise-alignment/rust-wfa2.git"}
+sdust = "=0.1.0"
 serde = "1.0.204"
 serde_json = "1.0.120"
 spoa = {git = "https://github.com/nlhepler/spoa-rs.git"}
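
The new `sdust` pin backs the first commit bullet (dust and repetitive noise k-mer removal), and assemble.rs below scores graph nodes with `skydive::utils::shannon_entropy`. The cleaning code itself is not in this excerpt; what follows is a minimal, self-contained sketch of entropy-based k-mer complexity scoring, with an illustrative cutoff that is an assumption rather than the pipeline's actual threshold.

```rust
// A minimal sketch of Shannon entropy over base composition, in the spirit
// of skydive::utils::shannon_entropy (assumed behavior, not its source).
fn shannon_entropy(kmer: &[u8]) -> f64 {
    let mut counts = [0usize; 4];
    for &b in kmer {
        match b.to_ascii_uppercase() {
            b'A' => counts[0] += 1,
            b'C' => counts[1] += 1,
            b'G' => counts[2] += 1,
            b'T' => counts[3] += 1,
            _ => {} // ignore ambiguous bases
        }
    }
    let len: f64 = counts.iter().sum::<usize>() as f64;
    counts
        .iter()
        .filter(|&&c| c > 0)
        .map(|&c| {
            let p = c as f64 / len;
            -p * p.log2()
        })
        .sum()
}

// Illustrative cutoff, not the pipeline's actual value.
fn is_dust(kmer: &[u8], min_entropy: f64) -> bool {
    shannon_entropy(kmer) < min_entropy
}
```

A poly-A k-mer scores 0.0 bits and a maximally mixed one 2.0, so any cutoff between the extremes separates homopolymer dust from informative sequence.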
92 changes: 92 additions & 0 deletions src/hidive/src/assemble.rs
@@ -0,0 +1,92 @@
use std::collections::{HashMap, HashSet};
use std::{fs::File, path::PathBuf, io::Write};

use needletail::Sequence;
use petgraph::graph::NodeIndex;
use rayon::prelude::*;
use indicatif::ParallelProgressIterator;

use skydive::ldbg::LdBG;
use skydive::mldbg::MLdBG;
use skydive::utils::*;

pub fn start(
    output: &PathBuf,
    gfa_output: Option<PathBuf>,
    kmer_size: usize,
    model_path: &PathBuf,
    long_read_fasta_path: &PathBuf,
    short_read_fasta_path: &PathBuf,
) {
    let long_read_seq_urls = skydive::parse::parse_file_names(&[long_read_fasta_path.clone()]);
    let short_read_seq_urls = skydive::parse::parse_file_names(&[short_read_fasta_path.clone()]);

    // Read all long reads.
    skydive::elog!("Processing long-read samples {:?}...", long_read_seq_urls.iter().map(|url| url.as_str()).collect::<Vec<&str>>());
    let all_lr_seqs = skydive::utils::read_fasta(&vec![long_read_fasta_path.clone()]);

    // Read all short reads.
    skydive::elog!("Processing short-read samples {:?}...", short_read_seq_urls.iter().map(|url| url.as_str()).collect::<Vec<&str>>());
    let all_sr_seqs = skydive::utils::read_fasta(&vec![short_read_fasta_path.clone()]);

    // Build a linked de Bruijn graph for each technology.
    let l1 = LdBG::from_sequences("lr".to_string(), kmer_size, &all_lr_seqs);
    let s1 = LdBG::from_sequences("sr".to_string(), kmer_size, &all_sr_seqs);

    // Merge the graphs, score k-mers with the model, collapse, clean noisy
    // k-mers, and thread links from the long reads.
    let m = MLdBG::from_ldbgs(vec![l1, s1])
        .score_kmers(model_path)
        .collapse()
        .clean(0.1)
        .build_links(&all_lr_seqs, false);

    skydive::elog!("Built MLdBG with {} k-mers.", m.kmers.len());

    skydive::elog!("Correcting reads...");
    let corrected_seqs = m.correct_seqs(&all_lr_seqs);

    // Write corrected reads to the output FASTA.
    let mut fa_file = File::create(output).unwrap();
    for (i, corrected_seq) in corrected_seqs.iter().enumerate() {
        let _ = writeln!(fa_file, ">corrected_{}\n{}", i, String::from_utf8(corrected_seq.clone()).unwrap());
    }

    // let read = b"CAGCTGCCCATGCCACCTCCTCCTTCTCTGCCCGCCCCAGTGCCTTATGGGTCCAAGGTTGACTCCTGTCCCTAGGGCAGGCCTGTGGGCCCTGCCTGATCCCTACTGGGAGGATGGTACCTAGGGTTGGAGCCAAACAAGTGTCCTCCTCCAGCGCCAGCCTGGCCCTGAGTGCGAACTCGTCACTGGTCAGGGGTCTGTACAGCAGCGTCCCTGAGGGCCCAGAGAGGTAGCCAGTCCTGTGGTGAGGTGACGAGGCTGAGGGTGGTGGCTCAGTCCTGGGCTTCCATGGGGCCTTCCCAGGGAACGTTCTGGCACCTGCCGACTGAGCCCTGGGAGGTAGGTAGCCCTGGCCTATAGCTCCCTGACGCCATGATTTGTCTTCCGTTTTTGGGGTGTCATATATGAAGGGAGGTGACTGTGATGGTGCTGGCAGGACTGCTGTCCCTGATGTGGGGTGGGCTGAGTTAGGCCTGAAATATGGGCCTCCAGGCTGAGTCCTGCCCTCTCCACCACATCCAGGGCTGACTGACACCTCTAGTCAGCCCATTCTGGCCCCTTCCCCACATGCCAGGACAATGTAGTCCTTGTCATCAATCTGGGCAGTCAGAGTTGGGTCAGTGGGGGACATGGGATTATGGGCAAGGGTAACTGACATCTGCTCAGCCTCAACGTACCCCTGTCTCAAATGCGGCCAGGCGGTGGGGTAAGCAGGAATGAGGCAGGGGTTGGGGTTGCCCTGAGGAGGATGATCCCAACGAGGGCGTGAGCAGGGGACCCGAGTT";
    // let corrected_seqs = m.correct_seqs(&vec![read.to_vec()]);
    // let mut fa_file = File::create(output).unwrap();
    // for (i, corrected_seq) in corrected_seqs.iter().enumerate() {
    //     let _ = writeln!(fa_file, ">corrected_{}\n{}", i, String::from_utf8(corrected_seq.clone()).unwrap());
    // }

    if let Some(gfa_output) = gfa_output {
        skydive::elog!("Writing GFA to {}", gfa_output.display());

        let g = m.traverse_all_kmers();

        let _ = write_gfa(&mut File::create(gfa_output.clone()).unwrap(), &g);

        // Emit a per-node annotation CSV alongside the GFA.
        let csv_output = gfa_output.with_extension("csv");
        let mut csv_file = File::create(&csv_output).unwrap();

        writeln!(csv_file, "node,label,kmer,cov,entropy").unwrap();

        for (node_index, node_label) in g.node_indices().zip(g.node_weights()) {
            let kmer = node_label.as_bytes();
            let cn_kmer = skydive::utils::canonicalize_kmer(kmer);
            let score = (100.0 * *m.scores.get(&cn_kmer).unwrap_or(&0.0)) as u32;
            let cov = if m.kmers.contains_key(&cn_kmer) { m.kmers.get(&cn_kmer).unwrap().coverage() } else { 0 };
            let entropy = skydive::utils::shannon_entropy(kmer);
            let sources = m.sources.get(&cn_kmer).unwrap_or(&vec![]).clone();

            // A k-mer seen in exactly one input graph keeps that source's index;
            // k-mers shared by both graphs are labeled 2.
            let source = if sources.len() == 1 { sources[0] } else { 2 };

            writeln!(
                csv_file,
                "{},{},{},{},{}",
                node_index.index(),
                format!("{} ({})", source, score),
                node_label,
                cov,
                entropy,
            )
            .unwrap();
        }
    }
}
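
`MLdBG::correct_seqs` does the sliding-window correction announced in the commit message, but its body is not part of this excerpt. As a hedged sketch of the idea only: slide a k-wide window along the read and, wherever the window's canonical k-mer is missing from the trusted set, try single-base substitutions that restore a trusted k-mer. The helper names (`correct_read`, `canonicalize`) and the substitution-only strategy are assumptions.

```rust
use std::collections::HashSet;

// Hypothetical sketch of sliding-window read correction against a set of
// trusted canonical k-mers; not the actual MLdBG implementation.
fn correct_read(read: &[u8], k: usize, trusted: &HashSet<Vec<u8>>) -> Vec<u8> {
    let mut corrected = read.to_vec();
    if corrected.len() < k {
        return corrected;
    }
    for i in 0..=corrected.len() - k {
        let window = corrected[i..i + k].to_vec();
        if trusted.contains(&canonicalize(&window)) {
            continue; // window already supported by the graph
        }
        // Try each substitution at the window's last base; keep the first
        // one that yields a trusted k-mer. Later windows see the fixed base.
        for &base in b"ACGT" {
            let mut candidate = window.clone();
            candidate[k - 1] = base;
            if trusted.contains(&canonicalize(&candidate)) {
                corrected[i + k - 1] = base;
                break;
            }
        }
    }
    corrected
}

// Return the lexicographically smaller of a k-mer and its reverse complement.
fn canonicalize(kmer: &[u8]) -> Vec<u8> {
    let rc: Vec<u8> = kmer
        .iter()
        .rev()
        .map(|&b| match b {
            b'A' => b'T',
            b'C' => b'G',
            b'G' => b'C',
            b'T' => b'A',
            other => other,
        })
        .collect();
    std::cmp::min(kmer.to_vec(), rc)
}
```

A real corrector also handles insertions and deletions and chooses fixes by walking the graph; this skeleton only shows the windowing mechanics.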
2 changes: 1 addition & 1 deletion src/hidive/src/build.rs
@@ -971,7 +971,7 @@ pub fn start(output: &PathBuf, k: usize, fasta_path: &PathBuf, reference_name: S
     .collect();
 
     // Filter edges with little read support.
-    let filtered_edges = filter_undersupported_edges(&edge_info, &stem, 4);
+    let filtered_edges = filter_undersupported_edges(&edge_info, &stem, 30);
 
     // Write final graph to disk.
     let _ = write_gfa(
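
`filter_undersupported_edges` is defined elsewhere in build.rs; this commit only raises its support threshold from 4 to 30. For orientation, here is a hedged petgraph sketch of such a filter, assuming edge weights are plain read-support counts (the real function's `&edge_info`/`&stem` signature differs):

```rust
use petgraph::graph::DiGraph;

// Hypothetical sketch: keep only edges whose read support clears a cutoff.
// Nodes pass through unchanged; petgraph's filter_map drops any edge whose
// closure returns None.
fn filter_undersupported_edges(g: &DiGraph<String, u32>, min_support: u32) -> DiGraph<String, u32> {
    g.filter_map(
        |_, label| Some(label.clone()),
        |_, &support| (support >= min_support).then_some(support),
    )
}
```

Raising the cutoff keeps only well-supported edges, at the cost of dropping true edges in low-coverage regions.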

