-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbed_data.rs
234 lines (185 loc) · 7.5 KB
/
bed_data.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
use std::collections::HashMap;
use std::fs::File;
use std::io::{self, Write};
use bam::{BamReader, Record, Header, RecordReader}; // assuming bam crate for reading BAM files
use tokio::runtime::Runtime;
use std::thread::Builder;
use bigtools::BigWigWrite;
use bigtools::beddata::BedParserStreamingIterator;
use std::path::Path;
use crate::data_iter::DataIter;
/// Represents a single value in a bigWig file
#[derive(Copy, Clone, Debug, PartialEq)]
#[cfg_attr(feature = "write", derive(Serialize, Deserialize))]
pub struct Value {
pub start: u32,
pub end: u32,
pub value: f32,
}
impl Value{
pub fn flat(&self) -> ( u32, u32, f32) {
( self.start, self.end, self.value )
}
}
pub struct BedData {
pub genome_info: Vec<(String, usize, usize)>, // (chromosome name, length, bin offset)
pub search: HashMap<String, usize>, // get id for chr
pub coverage_data: Vec<u32>, // coverage data array for bins
pub bin_width: usize, // bin width for coverage
pub threads: usize, // how many worker threads should we use here?
}
impl BedData {
// Constructor to initialize a BedData instance
pub fn new(bam_file: &str , bin_width: usize, threads:usize) -> Self {
let mut bam_reader = match BamReader::from_path(&bam_file, 1){
Ok(f) => f,
Err(e) => {
panic!("We hit an erro in Bam file reading: {e:?}");
}
};
let header = bam_reader.header().clone();
let genome_info:Vec<(String, usize, usize)> = Self::create_ref_id_to_name_map( &header,bin_width );
let search = Self::genome_info_to_search( &genome_info );
let num_bins = genome_info
.iter()
.map(|(_, length, _)| (length + bin_width - 1) / bin_width)
.sum::<usize>();
let mut coverage_data = vec![0u32; num_bins];
let mut record = bam::Record::new();
let mut last_start = 0;
// Process BAM records
loop {
match bam_reader.read_into(&mut record) {
Ok(true) => {},
Ok(false) => {
println!("bam file file read completely");
break
},
Err(e) => panic!("{}", e),
}
let chrom_idx:usize = record.ref_id().try_into().unwrap();
let (chrom_name, chrom_length, chrom_offset) = &genome_info[chrom_idx];
let start:usize = record.start().try_into().unwrap();
let end = start + record.sequence().raw().len();
// Update bins
for pos in (start..end).step_by(bin_width) {
//println!("Trying to add {chrom_name}:{pos} - 1");
if chrom_length >= &pos {
let index= chrom_offset+ pos / bin_width;
coverage_data[index] += 1;
//println!("Adding a one to the data in index {index}");
}else {
//println!( "pos {pos} is outside of the chromsome?! {chrom_length}")
}
}
}
BedData {
genome_info,
search,
coverage_data,
bin_width,
threads,
}
}
pub fn genome_info_to_search(genome_info: &Vec< (String, usize, usize) > ) -> HashMap<String, usize> {
genome_info
.iter()
.enumerate()
.map(|(index, (name, _, _))| (name.clone(), index))
.collect()
}
/// Create a mapping of reference ID to reference name, chromosome length, and bin offset.
/// `bin_width` is the size of each bin.
fn create_ref_id_to_name_map( header_view: &Header, bin_width: usize) -> Vec<(String, usize, usize)> {
let reference_names = header_view.reference_names();
let reference_lengths = header_view.reference_lengths();
let mut total_bins = 0; // This will store the cumulative bin count (offset)
// Zip the reference names and lengths together and compute the bin offsets
reference_names
.iter()
.zip(reference_lengths.iter())
.map(|(name, &length)| {
// Calculate number of bins for this chromosome
let bins = (length as usize + bin_width - 1) / bin_width;
// Create the tuple: (chromosome name, length, bin offset)
let result = (name.to_string(), length as usize, total_bins);
// Update the total_bins to include this chromosome's bins
total_bins += bins;
result
})
.collect()
}
pub fn id_for_chr_start( &self, chr:&str, start:usize ) -> Option<usize> {
match self.search.get( chr ){
Some(id) => {
// Some ( offset + id)
Some( self.genome_info[*id].2 + start / self.bin_width )
},
None => None
}
}
/// get the chromosme name for and value id.
/// This checks the offsets of all chromosomes and returns None if the id exeeds the range.
pub fn current_chr_for_id(&self, id:usize) -> Option<( String, usize, usize)> {
for ( chr, length, offset ) in &self.genome_info{
if id >= *offset && (id - offset) * self.bin_width < *length {
return Some( (chr.clone(), *length, *offset) );
}
}
// the id did not map to any entry here!
return None
}
/// Write the coverage data to a BEDGraph file.
pub fn write_bedgraph(
&self,
file_path: &str, // Output file path
) -> std::io::Result<()> {
// Open the output file
let mut file = File::create(file_path)?;
// Iterate over the genome info and coverage data
let mut bin_start: usize = 0; // Start position of each bin
for (chrom_index, (chrom_name, chrom_length, bin_offset)) in self.genome_info.iter().enumerate() {
let mut bin_end = 0; // End position of each bin
// Iterate through the coverage data for the current chromosome
for bin_index in 0..(chrom_length / self.bin_width) {
// Calculate start and end positions for each bin
bin_start = bin_index * self.bin_width;
bin_end = (bin_index + 1) * self.bin_width;
if bin_end > *chrom_length {
bin_end = *chrom_length; // Adjust for last bin that may not fill up to full width
}
// Get the coverage value for this bin (you'll need to map the correct coverage)
let coverage_value = self.coverage_data.get(bin_offset + bin_index).unwrap_or(&0);
// Write to the BedGraph file
if *coverage_value > 0 {
writeln!(
file,
"{}\t{}\t{}\t{}",
chrom_name, // Chromosome name
bin_start, // Start position (0-based)
bin_end, // End position (1-based)
coverage_value // Coverage value
).unwrap();
}
}
}
Ok(())
}
pub fn write_bigwig( &self, file: &str) -> Result<(),String>{
let runtime = tokio::runtime::Builder::new_multi_thread()
.worker_threads( self.threads )
.build()
.expect("Unable to create runtime.");
let iter = DataIter::new( self );
let data = BedParserStreamingIterator::wrap_infallible_iter(iter, true);
let chrom_map: HashMap<String, u32> = self.genome_info.iter().map(|(chrom, len, _)| (chrom.clone(), *len as u32)).collect();
let outfile = Path::new(file);
// Create the BigWig writer
let outb = BigWigWrite::create_file(outfile, chrom_map)
.map_err(|e| format!("Failed to create BigWig file: {}", e))?;
// Write data using the runtime
outb.write(data, runtime)
.map_err(|e| format!("Failed to write BigWig file: {}", e))?;
Ok(())
}
}