Skip to content

Commit

Permalink
Merge pull request #44 from bacpop/0.3.1-candidate
Browse files Browse the repository at this point in the history
Enchanced filtering options
  • Loading branch information
johnlees authored Jul 7, 2023
2 parents 93b7ecf + 3a3f48d commit 5d6c1dc
Show file tree
Hide file tree
Showing 13 changed files with 248 additions and 35 deletions.
21 changes: 19 additions & 2 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ pub const DEFAULT_KMER: usize = 17;
pub const DEFAULT_STRAND: bool = false;
/// Default repeat masking behaviour
pub const DEFAULT_REPEATMASK: bool = false;
/// Default ambiguous masking behaviour
pub const DEFAULT_AMBIGMASK: bool = false;
/// Default minimum k-mer count for FASTQ files
pub const DEFAULT_MINCOUNT: u16 = 10;
/// Default minimum base quality (PHRED score) for FASTQ files
Expand Down Expand Up @@ -71,14 +73,16 @@ pub enum FileType {
Aln,
}

/// Possible variant filters for align
/// Possible variant filters
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
pub enum FilterType {
/// Output all variants
NoFilter,
/// Filter constant bases
NoConst,
/// Filter constant bases, and any variants where there are ambiguous bases
/// Filter any site with an ambiguous base
NoAmbig,
/// Filter constant bases, and any ambiguous bases
NoAmbigOrConst,
}

Expand All @@ -88,6 +92,7 @@ impl fmt::Display for FilterType {
match *self {
Self::NoFilter => write!(f, "No filtering"),
Self::NoConst => write!(f, "No constant sites"),
Self::NoAmbig => write!(f, "No ambiguous sites"),
Self::NoAmbigOrConst => write!(f, "No constant sites or ambiguous bases"),
}
}
Expand Down Expand Up @@ -171,6 +176,10 @@ pub enum Commands {
#[arg(long, value_enum, default_value_t = FilterType::NoConst)]
filter: FilterType,

/// Mask any ambiguous bases in the alignment with 'N'
#[arg(long, default_value_t = DEFAULT_AMBIGMASK)]
ambig_mask: bool,

/// Number of CPU threads
#[arg(long, value_parser = valid_cpus, default_value_t = 1)]
threads: usize,
Expand All @@ -191,6 +200,10 @@ pub enum Commands {
#[arg(short, long, value_enum, default_value_t = FileType::Aln)]
format: FileType,

/// Mask any ambiguous bases in the alignment with 'N'
#[arg(long, default_value_t = DEFAULT_AMBIGMASK)]
ambig_mask: bool,

/// Mask any repeats in the alignment with 'N'
#[arg(long, default_value_t = DEFAULT_REPEATMASK)]
repeat_mask: bool,
Expand Down Expand Up @@ -266,6 +279,10 @@ pub enum Commands {
/// Filter for constant middle base sites
#[arg(long, value_enum, default_value_t = FilterType::NoFilter)]
filter: FilterType,

/// Mask any ambiguous bases in the alignment with 'N'
#[arg(long, default_value_t = DEFAULT_AMBIGMASK)]
ambig_mask: bool,
},
/// Get the number of k-mers in a split k-mer file, and other information
Nk {
Expand Down
32 changes: 22 additions & 10 deletions src/generic_modes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@ pub fn align<IntT: for<'a> UInt<'a>>(
ska_array: &mut MergeSkaArray<IntT>,
output: &Option<String>,
filter: &FilterType,
mask_ambig: bool,
min_freq: f64,
) {
// In debug mode (cannot be set from CLI, give details)
log::debug!("{ska_array}");

// Apply filters
apply_filters(ska_array, min_freq, filter);
apply_filters(ska_array, min_freq, filter, mask_ambig);

// Write out to file/stdout
log::info!("Writing alignment");
Expand Down Expand Up @@ -102,11 +103,12 @@ pub fn apply_filters<IntT: for<'a> UInt<'a>>(
ska_array: &mut MergeSkaArray<IntT>,
min_freq: f64,
filter: &FilterType,
ambig_mask: bool,
) -> i32 {
let update_kmers = false;
let filter_threshold = f64::ceil(ska_array.nsamples() as f64 * min_freq) as usize;
log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter}");
ska_array.filter(filter_threshold, filter, update_kmers)
log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask}");
ska_array.filter(filter_threshold, filter, ambig_mask, update_kmers)
}

/// Calculate distances between samples
Expand All @@ -122,12 +124,13 @@ pub fn distance<IntT: for<'a> UInt<'a>>(
// In debug mode (cannot be set from CLI, give details)
log::debug!("{ska_array}");

let constant = apply_filters(ska_array, min_freq, &FilterType::NoConst);
let mask_ambig = false;
let constant = apply_filters(ska_array, min_freq, &FilterType::NoConst, mask_ambig);
if filt_ambig || (min_freq * ska_array.nsamples() as f64 >= 1.0) {
if filt_ambig {
apply_filters(ska_array, min_freq, &FilterType::NoAmbigOrConst);
apply_filters(ska_array, min_freq, &FilterType::NoAmbigOrConst, mask_ambig);
} else {
apply_filters(ska_array, min_freq, &FilterType::NoFilter);
apply_filters(ska_array, min_freq, &FilterType::NoFilter, mask_ambig);
}
}

Expand Down Expand Up @@ -181,6 +184,7 @@ pub fn weed<IntT: for<'a> UInt<'a>>(
reverse: bool,
min_freq: f64,
filter: &FilterType,
ambig_mask: bool,
out_file: &str,
) {
if let Some(weed_fasta) = weed_file {
Expand All @@ -189,7 +193,15 @@ pub fn weed<IntT: for<'a> UInt<'a>>(
ska_array.kmer_len(),
ska_array.rc()
);
let ska_weed = RefSka::new(ska_array.kmer_len(), weed_fasta, ska_array.rc(), false);
let repeat_mask = false;
let filter_ambig = false;
let ska_weed = RefSka::new(
ska_array.kmer_len(),
weed_fasta,
ska_array.rc(),
repeat_mask,
filter_ambig,
);

if !reverse {
log::info!("Removing weed k-mers");
Expand All @@ -200,10 +212,10 @@ pub fn weed<IntT: for<'a> UInt<'a>>(
}

let filter_threshold = f64::floor(ska_array.nsamples() as f64 * min_freq) as usize;
if filter_threshold > 0 || *filter != FilterType::NoFilter {
log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter}");
if filter_threshold > 0 || *filter != FilterType::NoFilter || ambig_mask {
log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask}");
let update_kmers = true;
ska_array.filter(filter_threshold, filter, update_kmers);
ska_array.filter(filter_threshold, filter, ambig_mask, update_kmers);
}

log::info!("Saving modified skf file");
Expand Down
12 changes: 12 additions & 0 deletions src/io_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,15 @@ pub fn get_input_list(
None => read_input_fastas(seq_files.as_ref().unwrap()),
}
}

/// Checks if any input files are fastq
pub fn any_fastq(files: &[InputFastx]) -> bool {
let mut fastq = false;
for file in files {
if file.2.is_some() {
fastq = true;
break;
}
}
fastq
}
41 changes: 37 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
//! Various optimisations are used to make this as fast as possible. For a more thorough comparison with version 1.0 of SKA, see the
//! [github description](https://github.com/bacpop/ska.rust/blob/master/README.md).
//!
//! //! *NB As split k-mers are even lengths, it is possible that they are their
//! *NB As split k-mers are even lengths, it is possible that they are their
//! own reverse complement. The original version of ska would randomly pick a strand,
//! possibly introducing a SNP across samples. This version uses an ambiguous middle
//! base (W for A/T; S for C/G) to represent this case.*
Expand All @@ -50,6 +50,18 @@
//! to stream into another compatible program on STDIN. You can also add an output
//! file prefix directly with `-o` (for `ska build` this is required).
//!
//! ## Important notes
//!
//! - In this version of ska the k-mer length is the _total_ split k-mer size,
//! whereas in ska1 the k-mer length is half the split length. So the default of
//! k=15 in ska1 corresponds to choosing `-k 31` in this version.
//! - If you are using FASTQ input files (reads), these must be provided as two
//! deinterleaved files using the `-f` option to `ska build`. Providing them as
//! a single file will treat them as FASTA, and not correct sequencing errors.
//! - If you are running `ska map`, it is more memory efficient to run these one at
//! a time, rather than merging a single `.skf` file. A command for doing this
//! in parallel is listed below.
//!
//! ## Common options
//!
//! Version can be viewed by running `ska -V`.
Expand Down Expand Up @@ -168,6 +180,18 @@
//! ska map ref.fa seq1.fa seq2.fa -f vcf --threads 2 | bgzip -c - > seqs.vcf.gz
//! ```
//!
//! ### Efficiency
//!
//! As `ska map` is independent for each input file, the most efficient way to
//! run this on a number of samples is with gnu parallel, for example:
//! ```bash
//! cat ${FILE_LIST} | parallel -j 8 "ID=\$(basename {} | sed 's/_a.*fasta/_temp_skf/g');
//! ska build -o \$ID -k 31 {} ;
//! MAPOUT=\$(basename {} | sed 's/_a.*fasta/_aln/g');
//! ska map $REFERENCE_LOC \${ID}.skf -o \${MAPOUT}.aln"
//! cat *.aln > ska_map.aln
//! ```
//!
//! ## ska distance
//!
//! Use to calculate distances between all samples within an `.skf` file. The output
Expand Down Expand Up @@ -342,7 +366,8 @@
//! let min_count = 2;
//! let update_kmers = false;
//! let filter = FilterType::NoConst;
//! ska_array.filter(min_count, &filter, update_kmers);
//! let ambig_mask = false;
//! ska_array.filter(min_count, &filter, update_kmers, ambig_mask);
//!
//! // Write out to stdout
//! let mut out_stream = set_ostream(&None);
Expand Down Expand Up @@ -485,17 +510,18 @@ pub fn main() {
output,
min_freq,
filter,
ambig_mask,
threads,
} => {
check_threads(*threads);
if let Ok(mut ska_array) = load_array::<u64>(input, *threads) {
// In debug mode (cannot be set from CLI, give details)
log::debug!("{ska_array}");
align(&mut ska_array, output, filter, *min_freq);
align(&mut ska_array, output, filter, *ambig_mask, *min_freq);
} else if let Ok(mut ska_array) = load_array::<u128>(input, *threads) {
// In debug mode (cannot be set from CLI, give details)
log::debug!("{ska_array}");
align(&mut ska_array, output, filter, *min_freq);
align(&mut ska_array, output, filter, *ambig_mask, *min_freq);
} else {
panic!("Could not read input file(s): {input:?}");
}
Expand All @@ -505,10 +531,12 @@ pub fn main() {
input,
output,
format,
ambig_mask,
repeat_mask,
threads,
} => {
check_threads(*threads);

log::info!("Loading skf as dictionary");
if let Ok(mut ska_array) = load_array::<u64>(input, *threads) {
log::info!(
Expand All @@ -520,6 +548,7 @@ pub fn main() {
ska_array.kmer_len(),
reference,
ska_array.rc(),
*ambig_mask,
*repeat_mask,
);
map(&mut ska_array, &mut ska_ref, output, format, *threads);
Expand All @@ -533,6 +562,7 @@ pub fn main() {
ska_array.kmer_len(),
reference,
ska_array.rc(),
*ambig_mask,
*repeat_mask,
);
map(&mut ska_array, &mut ska_ref, output, format, *threads);
Expand Down Expand Up @@ -607,6 +637,7 @@ pub fn main() {
weed_file,
reverse,
min_freq,
ambig_mask,
filter,
} => {
log::info!("Loading skf file");
Expand All @@ -617,6 +648,7 @@ pub fn main() {
*reverse,
*min_freq,
filter,
*ambig_mask,
skf_file,
);
} else if let Ok(mut ska_array) = MergeSkaArray::<u128>::load(skf_file) {
Expand All @@ -626,6 +658,7 @@ pub fn main() {
*reverse,
*min_freq,
filter,
*ambig_mask,
skf_file,
);
} else {
Expand Down
39 changes: 36 additions & 3 deletions src/merge_ska_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,16 +55,17 @@ use crate::cli::FilterType;
/// // Remove constant sites and save
/// let min_count = 1; // no filtering by minor allele frequency
/// let filter = FilterType::NoAmbigOrConst; // remove sites with no minor allele
/// let mask_ambiguous = false; // leave ambiguous sites as IUPAC codes
/// let update_counts = true; // keep counts updated, as saving
/// ska_array.filter(min_count, &filter, update_counts);
/// ska_array.filter(min_count, &filter, mask_ambiguous, update_counts);
/// ska_array.save(&"no_const_sites.skf");
///
/// // Delete a sample
/// ska_array.delete_samples(&[&"test_1"]);
///
/// // Delete k-mers
/// let mask_repeats = false;
/// let ska_weed = RefSka::new(ska_array.kmer_len(), &"tests/test_files_in/weed.fa", ska_array.rc(), mask_repeats);
/// let ska_weed = RefSka::new(ska_array.kmer_len(), &"tests/test_files_in/weed.fa", ska_array.rc(), mask_repeats, mask_ambiguous);
/// let reverse = false;
/// ska_array.weed(&ska_weed, reverse);
/// ```
Expand Down Expand Up @@ -234,11 +235,18 @@ where
///
/// - `min_count` -- minimum number of samples split k-mer found in.
/// - `const_sites` -- include sites where all middle bases are the same.
/// - `mask_ambig` -- replace any non-ACGTUN- with N
/// - `update_kmers` -- update counts and split k-mers after removing variants.
///
/// The default for `update_kmers` should be `true`, but it can be `false`
/// if [`write_fasta()`] will be called immediately afterwards.
pub fn filter(&mut self, min_count: usize, filter: &FilterType, update_kmers: bool) -> i32 {
pub fn filter(
&mut self,
min_count: usize,
filter: &FilterType,
mask_ambig: bool,
update_kmers: bool,
) -> i32 {
let total = self.names.len();
let mut filtered_variants = Array2::zeros((0, total));
let mut filtered_counts = Vec::new();
Expand All @@ -265,6 +273,16 @@ where
}
keep
}
FilterType::NoAmbig => {
let mut keep = true;
for var in row {
if is_ambiguous(*var) {
keep = false;
break;
}
}
keep
}
FilterType::NoAmbigOrConst => {
let mut keep = false;
let mut first_var = None;
Expand Down Expand Up @@ -300,6 +318,21 @@ where
self.split_kmers = filtered_kmers;
}
log::info!("Filtering removed {removed} split k-mers");

// Replace any ambiguous variants with Ns, if requested
if mask_ambig {
let mut masked = 0;
self.variants.mapv_inplace(|v| {
if is_ambiguous(v) {
masked += 1;
b'N'
} else {
v
}
});
log::info!("Masked {masked} ambiguous bases (non-A/C/G/T/U/N/-) with 'N'");
}

removed
}

Expand Down
Loading

0 comments on commit 5d6c1dc

Please sign in to comment.