Skip to content
This repository has been archived by the owner on Jan 15, 2025. It is now read-only.

Commit

Permalink
Fixing algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
RishabhSaini committed Apr 26, 2023
1 parent 22e27df commit 570e993
Showing 1 changed file with 119 additions and 60 deletions.
179 changes: 119 additions & 60 deletions lib/src/chunking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -417,31 +417,57 @@ fn std_deviation(data: &[u64]) -> Option<f64> {
}
}

/// Computes the median and the median absolute deviation (MAD) of `data`.
///
/// The input is sorted in place (ascending) so the positional median is
/// correct regardless of the order the caller passes the values in; the
/// previous revision had this sort commented out and silently relied on
/// the caller pre-sorting.
///
/// Returns `(median, mad)` as a pair of `f64`.
///
/// # Panics
///
/// Panics if `data` is empty — the median of an empty set is undefined.
fn median_absolute_deviation(data: &mut Vec<u64>) -> (f64, f64) {
    assert!(!data.is_empty(), "median of empty data is undefined");

    // Sort ascending. `u64` is `Ord`, so `sort_unstable` works directly and
    // avoids both the allocation of a stable sort and the `partial_cmp`
    // unwrap a float-style comparator would need.
    data.sort_unstable();

    // Median: middle element for odd lengths, mean of the two middle
    // elements for even lengths.
    let median_data: f64 = match data.len() % 2 {
        1 => data[data.len() / 2] as f64,
        _ => 0.5 * (data[data.len() / 2 - 1] + data[data.len() / 2]) as f64,
    };

    // Absolute deviation of every value from the median.
    let mut absolute_deviations: Vec<f64> = data
        .iter()
        .map(|&size| f64::abs(size as f64 - median_data))
        .collect();

    // MAD = median of the absolute deviations (same even/odd handling).
    absolute_deviations.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let mad: f64 = match absolute_deviations.len() % 2 {
        1 => absolute_deviations[absolute_deviations.len() / 2],
        _ => {
            0.5 * (absolute_deviations[absolute_deviations.len() / 2 - 1]
                + absolute_deviations[absolute_deviations.len() / 2])
        }
    };

    (median_data, mad)
}

//Assumes components is sorted by descending size
//Use MAD as threshold to partition packages [abs(low_limit), high_limit]
fn get_partitions_with_threshold(
components: Vec<&ObjectSourceMetaSized>,
limit_hs_bins: usize,
threshold: f64,
) -> Option<BTreeMap<String, Vec<&ObjectSourceMetaSized>>> {
let mut bins: BTreeMap<String, Vec<&ObjectSourceMetaSized>> = BTreeMap::new();
let mut med_size: Vec<&ObjectSourceMetaSized> = Vec::new();
let mut high_size: Vec<&ObjectSourceMetaSized> = Vec::new();

//Calculate Mean and Stddev for Size
let sizes: Vec<u64> = components.iter().map(|a| a.size).collect();
let mean_size = mean(&sizes)?;
let stddev_size = std_deviation(&sizes)?;
let mut size_low_limit = mean_size - threshold * stddev_size;
if size_low_limit < 0 as f64 {
size_low_limit = 100000_f64;
}
let size_high_limit = mean_size + threshold * stddev_size;
let mut sizes: Vec<u64> = components.iter().map(|a| a.size).collect();
let (median_size, mad_size) = median_absolute_deviation(&mut sizes);

let size_low_limit = 0.5 * f64::abs(median_size - threshold * mad_size);
let size_high_limit = median_size + threshold * mad_size;

for pkg in components {
let size = pkg.size as f64;

//hs
if size >= size_high_limit {
bins.entry("1hs".to_string())
.and_modify(|bin| bin.push(pkg))
.or_insert_with(|| vec![pkg]);
high_size.push(pkg);
}
//ls
else if size <= size_low_limit {
Expand All @@ -455,22 +481,38 @@ fn get_partitions_with_threshold(
}
}

let med_frequencies: Vec<u64> = med_size
//Extra hs packages
let mut remaining_pkgs: Vec<_> = high_size.drain(limit_hs_bins..).collect();
assert_eq!(high_size.len(), limit_hs_bins);

//Concatenate Extra hs packages + med_sizes keeps it still descending sorted
remaining_pkgs.append(&mut med_size);
bins.insert("1hs".to_string(), high_size);

//Ascending sorted by frequency, so each partition within MS is freq sorted
remaining_pkgs.sort_by(|a, b| {
a.meta
.change_frequency
.partial_cmp(&b.meta.change_frequency)
.unwrap()
});
let med_sizes: Vec<u64> = remaining_pkgs.iter().map(|a| a.size).collect();
let med_frequencies: Vec<u64> = remaining_pkgs
.iter()
.map(|a| a.meta.change_frequency.into())
.collect();
let med_sizes: Vec<u64> = med_size.iter().map(|a| a.size).collect();

let med_mean_freq = mean(&med_frequencies)?;
let med_stddev_freq = std_deviation(&med_frequencies)?;
let med_mean_size = mean(&med_sizes)?;
let med_stddev_size = std_deviation(&med_sizes)?;

let med_freq_low_limit = med_mean_freq - threshold * med_stddev_freq;
let med_freq_low_limit = 0.5f64 * f64::abs(med_mean_freq - threshold * med_stddev_freq);
let med_freq_high_limit = med_mean_freq + threshold * med_stddev_freq;
let med_size_low_limit = med_mean_size - threshold * med_stddev_size;
let med_size_low_limit = 0.5f64 * f64::abs(med_mean_size - threshold * med_stddev_size);
let med_size_high_limit = med_mean_size + threshold * med_stddev_size;

for pkg in med_size {
for pkg in remaining_pkgs {
let size = pkg.size as f64;
let freq = pkg.meta.change_frequency as f64;

Expand Down Expand Up @@ -557,6 +599,24 @@ fn get_partitions_with_threshold(
/// and a number of bins (possible container layers) to use, determine which components
/// go in which bin. This algorithm is pretty simple:
// Todo
//
// 2 stats to use:
// - Size
// - Probability[update] = no of changelogs * last buildtime epoch
//
// Total available bins = n
//
// 1 bin for all max_freq pkgs
// 1 bin for all new pkgs
// 1 bin for all low size pkgs
//
// 60% of n-3 bins for HS
// 40% of n-3 bins for MS
//
// If HS bins > limit, spillover to MS to package with LF(LS, MS)
// If MS bins > limit, fold by merging 2 bins from the end
//
fn basic_packing<'a>(
components: &'a [ObjectSourceMetaSized],
bin_size: NonZeroU32,
Expand Down Expand Up @@ -641,7 +701,6 @@ fn basic_packing<'a>(

println!("Creating new packing structure");

components.sort_by(|a, b| a.meta.change_frequency.cmp(&b.meta.change_frequency));
let mut max_freq_components: Vec<&ObjectSourceMetaSized> = Vec::new();
components.retain(|pkg| {
let retain: bool = pkg.meta.change_frequency != u32::MAX;
Expand All @@ -654,52 +713,48 @@ fn basic_packing<'a>(
match components_len_after_max_freq {
0 => (),
_ => {
let partitions = get_partitions_with_threshold(components, 0.5)
.expect("Partitioning components into sets");

// Max_bins -:
// 1 for max_freq
// 1 for new_pkgs
// 1 for ls
// n for hs
// Left for ms

let qty_hs_bins = match partitions.get("1hs") {
Some(n) => n.len(),
None => 0usize,
};
let qty_hs_pkgs = qty_hs_bins.clone();

let qty_ls_bins = 1usize;
let qty_ls_pkgs = match partitions.get("2ls") {
//Defining Limits of each bins
let limit_ls_bins = 1usize;
let limit_new_bins = 1usize;
let _limit_new_pkgs = 0usize;
let limit_max_frequency_bins = 1usize;
let _limit_max_frequency_pkgs = max_freq_components.len();
let limit_hs_bins = (0.6
* (bin_size.get()
- (limit_ls_bins + limit_new_bins + limit_max_frequency_bins) as u32)
as f32)
.floor() as usize;
let limit_ms_bins = (0.4
* (bin_size.get()
- (limit_ls_bins + limit_new_bins + limit_max_frequency_bins) as u32)
as f32)
.floor() as usize;

let partitions =
get_partitions_with_threshold(components, limit_hs_bins as usize, 2f64)
.expect("Partitioning components into sets");

let limit_ls_pkgs = match partitions.get("2ls") {
Some(n) => n.len(),
None => 0usize,
};

let qty_new_bins = 1usize;
let _qty_new_pkgs = 0usize;

let qty_max_frequency_bins = 1usize;
let _qty_max_frequency_pkgs = max_freq_components.len();

//Can be negative or very low if qty_hs_pkgs is very high
let qty_ms_bins = bin_size.get() as usize
- (qty_hs_bins + qty_ls_bins + qty_new_bins + qty_max_frequency_bins);

let pkg_per_bin_ms: usize =
match (components_len_after_max_freq - qty_hs_pkgs - qty_ls_pkgs)
.checked_div(qty_ms_bins)
match (components_len_after_max_freq - limit_hs_bins - limit_ls_pkgs)
.checked_div(limit_ms_bins)
{
Some(n) => {
if n >= 1 {
n
} else {
3usize
if n < 1 {
panic!("Error: No of bins <= 3");
}
n
}
None => {
panic!("Error: No of bins <= 3")
}
None => 6usize,
};

//Bins assignment
for partition in partitions.keys() {
let pkgs = partitions.get(partition).expect("hashset");

Expand Down Expand Up @@ -730,14 +785,17 @@ fn basic_packing<'a>(
}
}
}

println!("Bins before unoptimized build: {}", r.len());
//Leave second last bin for max_freq_components
//Leave last bin for new packages added, so to not disturb
//previous bins.
while r.len() > (bin_size.get() - 2) as usize {
for i in (1..r.len() - 1).step_by(2).rev() {
if r.len() <= (bin_size.get() - 2) as usize {

//Addressing MS breach by wrapping MS layers to follow the limit
while r.len() > (bin_size.get() as usize - limit_new_bins - limit_max_frequency_bins) {
for i in (limit_ls_bins + limit_hs_bins..r.len() - 1)
.step_by(2)
.rev()
{
if r.len()
<= (bin_size.get() as usize - limit_new_bins - limit_max_frequency_bins)
{
break;
}
let prev = &r[i - 1];
Expand All @@ -754,6 +812,7 @@ fn basic_packing<'a>(
}
}
r.push(max_freq_components);

let new_pkgs_bin: Vec<&ObjectSourceMetaSized> = Vec::new();
r.push(new_pkgs_bin);
let mut after_processing_pkgs_len = 0;
Expand Down

0 comments on commit 570e993

Please sign in to comment.