Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run containers part 2 #66

Draft
wants to merge 25 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9b67893
WIP: Run container
josephglanville Mar 23, 2020
3124aa4
Fix some bugs in the run container implementation
Kerollmops Sep 11, 2020
2068bb6
Fix the to_array/bitmap impl for runs, the end bound is inclusive
Kerollmops Sep 11, 2020
e605f64
Rework the array bitmap intersect_with using Vec::retain
Kerollmops Sep 11, 2020
9321618
Implement the array run intersect_with operation
Kerollmops Sep 11, 2020
a62fc7d
Implement the run array intersect_with operation
Kerollmops Sep 11, 2020
d658f28
Implement the run run union_with operation
Kerollmops Sep 11, 2020
0ded028
Implement the run array union_with operation
Kerollmops Sep 12, 2020
fe8a4ab
Implement the array run union_with operation
Kerollmops Sep 12, 2020
613163f
Implement the bitmap run union_with operation
Kerollmops Sep 12, 2020
0a66483
Implement the run run intersect_with operation
Kerollmops Sep 12, 2020
9af4366
Implement the bitmap run intersect_with operation
Kerollmops Sep 12, 2020
9612ae9
Implement the run bitmap intersect_with operation
Kerollmops Sep 12, 2020
4ae8986
Simplify the run run intersect_with operation
Kerollmops Sep 12, 2020
924d4db
Implement the remove_range operation for the run store type
Kerollmops Sep 12, 2020
d7bcad3
Implement the run array and array run is_disjoint operation
Kerollmops Sep 12, 2020
cb69d80
Implement the run run is_disjoint operation
Kerollmops Sep 13, 2020
c77c0f8
Simplify the array bitmap difference_with operation
Kerollmops Sep 13, 2020
3a9eefd
Implement the array run difference_with operation
Kerollmops Sep 13, 2020
183c1bb
Implement the bitmap run difference_with operation
Kerollmops Sep 13, 2020
07d0fcc
Clippy and fmt pass
Kerollmops Sep 13, 2020
3c99804
Implement the run array difference_with operation
Kerollmops Sep 13, 2020
c762f93
Mark array run symmetric_difference_with operation as unimplemented
Kerollmops Sep 13, 2020
9744f12
Implement the array run is_subset operation
Kerollmops Sep 13, 2020
67784ad
Implement the run run difference_with operation
Kerollmops Sep 13, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions src/bitmap/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ use std::fmt;
use super::store::{self, Store};
use super::util;

const ARRAY_LIMIT: u64 = 4096;
pub const ARRAY_LIMIT: u64 = 4096;
pub const RUN_MAX_SIZE: u64 = 2048;

#[derive(PartialEq, Clone)]
pub struct Container {
Expand Down Expand Up @@ -103,14 +104,33 @@ impl Container {
self.store.max()
}

fn ensure_correct_store(&mut self) {
fn ensure_correct_store(&mut self) -> bool {
let new_store = match (&self.store, self.len) {
(store @ &Store::Bitmap(..), len) if len <= ARRAY_LIMIT => Some(store.to_array()),
(store @ &Store::Array(..), len) if len > ARRAY_LIMIT => Some(store.to_bitmap()),
_ => None,
};
if let Some(new_store) = new_store {
self.store = new_store;
true
} else {
false
}
}

/// Re-evaluates which store representation this container should use.
///
/// An array or bitmap store is replaced by a run store when its run count is
/// at most `RUN_MAX_SIZE` and at most `self.len / 2`. In every other case
/// (including a store that is already run-encoded) the decision is delegated
/// to `ensure_correct_store`.
///
/// Returns `true` if the store was replaced, `false` if it was left as is.
pub fn optimize(&mut self) -> bool {
    // Decide first, mutate afterwards: only array and bitmap stores are
    // candidates for run encoding.
    let switch_to_run = match self.store {
        Store::Run(..) => false,
        Store::Array(..) | Store::Bitmap(..) => {
            let run_count = self.store.count_runs();
            run_count <= RUN_MAX_SIZE && run_count <= self.len / 2
        }
    };

    if switch_to_run {
        self.store = self.store.to_run();
        return true;
    }

    self.ensure_correct_store()
}
}
Expand Down
5 changes: 3 additions & 2 deletions src/bitmap/fmt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ impl fmt::Debug for RoaringBitmap {
} else {
write!(
f,
"RoaringBitmap<{:?} values between {:?} and {:?}>",
"RoaringBitmap<{:?} values between {:?} and {:?} in {:?} containers>",
self.len(),
self.min().unwrap(),
self.max().unwrap()
self.max().unwrap(),
self.containers.len(),
)
}
}
Expand Down
18 changes: 18 additions & 0 deletions src/bitmap/inherent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,24 @@ impl RoaringBitmap {
.last()
.map(|tail| util::join(tail.key, tail.max()))
}

// TODO(jpg): come up with an example that actually illustrates the creation of run containers
/// Optimizes the container storage for this bitmap.
///
/// Calls `optimize` on every container of the bitmap. Returns `true` if the
/// storage of at least one container was modified, `false` if not.
///
/// # Examples
///
/// ```rust
/// use roaring::RoaringBitmap;
///
/// let mut rb: RoaringBitmap = (1000..100000).collect();
/// rb.optimize();
/// ```
pub fn optimize(&mut self) -> bool {
    // `|` (not `||`) so that every container is optimized even after an
    // earlier one has already reported a change.
    self.containers
        .iter_mut()
        .fold(false, |changed, container| changed | container.optimize())
}
}

impl Default for RoaringBitmap {
Expand Down
171 changes: 140 additions & 31 deletions src/bitmap/serialization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,25 @@ use std::io;

use super::container::Container;
use super::store::Store;
use crate::bitmap::container::ARRAY_LIMIT;
use crate::bitmap::store::{Interval, BITMAP_LENGTH};
use crate::RoaringBitmap;

const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
const SERIAL_COOKIE: u16 = 12347;
// TODO: Need this once run containers are supported
// const NO_OFFSET_THRESHOLD: u8 = 4;
const NO_OFFSET_THRESHOLD: usize = 4;

// Sizes of header structures
const COOKIE_BYTES: usize = 4;
const SIZE_BYTES: usize = 4;
const DESCRIPTION_BYTES: usize = 4;
const OFFSET_BYTES: usize = 4;

// Sizes of container structures
const BITMAP_BYTES: usize = BITMAP_LENGTH * 8;
const ARRAY_ELEMENT_BYTES: usize = 2;
const RUN_NUM_BYTES: usize = 2;
const RUN_ELEMENT_BYTES: usize = 4;

impl RoaringBitmap {
/// Return the size in bytes of the serialized output.
Expand All @@ -27,17 +40,23 @@ impl RoaringBitmap {
/// assert_eq!(rb1, rb2);
/// ```
pub fn serialized_size(&self) -> usize {
let mut has_run_containers = false;
let size = self.containers.len();
let container_sizes: usize = self
.containers
.iter()
.map(|container| match container.store {
Store::Array(ref values) => 8 + values.len() * 2,
Store::Bitmap(..) => 8 + 8 * 1024,
Store::Array(ref values) => values.len() * ARRAY_ELEMENT_BYTES,
Store::Bitmap(..) => BITMAP_BYTES,
Store::Run(ref intervals) => {
has_run_containers = true;
RUN_NUM_BYTES + (RUN_ELEMENT_BYTES * intervals.len())
}
})
.sum();

// header + container sizes
8 + container_sizes
header_size(size, has_run_containers) + container_sizes
}

/// Serialize this bitmap into [the standard Roaring on-disk format][format].
Expand All @@ -58,27 +77,61 @@ impl RoaringBitmap {
/// assert_eq!(rb1, rb2);
/// ```
pub fn serialize_into<W: io::Write>(&self, mut writer: W) -> io::Result<()> {
writer.write_u32::<LittleEndian>(SERIAL_COOKIE_NO_RUNCONTAINER)?;
writer.write_u32::<LittleEndian>(self.containers.len() as u32)?;
let has_run_containers = self.containers.iter().any(|c| {
if let Store::Run(_) = c.store {
true
} else {
false
}
});
let size = self.containers.len();

// Depending on if run containers are present or not write the appropriate header
if has_run_containers {
// The new format stores the container count in the most significant bits of the header
let cookie = SERIAL_COOKIE as u32 | ((size as u32 - 1) << 16);
writer.write_u32::<LittleEndian>(cookie)?;
// It is then followed by a bitset indicating which containers are run containers
let run_container_bitmap_size = (size + 7) / 8;
let mut run_container_bitmap = vec![0; run_container_bitmap_size];
for (i, container) in self.containers.iter().enumerate() {
if let Store::Run(_) = container.store {
run_container_bitmap[i / 8] |= 1 << (i % 8);
}
}
writer.write_all(&run_container_bitmap)?;
} else {
// Write old format, cookie followed by container count
writer.write_u32::<LittleEndian>(SERIAL_COOKIE_NO_RUNCONTAINER)?;
writer.write_u32::<LittleEndian>(size as u32)?;
}

// Write the container descriptions
for container in &self.containers {
writer.write_u16::<LittleEndian>(container.key)?;
writer.write_u16::<LittleEndian>((container.len - 1) as u16)?;
}

let mut offset = 8 + 8 * self.containers.len() as u32;
for container in &self.containers {
writer.write_u32::<LittleEndian>(offset)?;
match container.store {
Store::Array(ref values) => {
offset += values.len() as u32 * 2;
}
Store::Bitmap(..) => {
offset += 8 * 1024;
// Write offsets if there are no runs or NO_OFFSET_THRESHOLD containers is reached
if !has_run_containers || size >= NO_OFFSET_THRESHOLD {
let mut offset = header_size(size, has_run_containers) as u32;
for container in &self.containers {
writer.write_u32::<LittleEndian>(offset)?;
match container.store {
Store::Array(ref values) => {
offset += (values.len() * ARRAY_ELEMENT_BYTES) as u32;
}
Store::Bitmap(..) => {
offset += BITMAP_BYTES as u32;
}
Store::Run(ref intervals) => {
offset += (RUN_NUM_BYTES + (intervals.len() * RUN_ELEMENT_BYTES)) as u32;
}
}
}
}

// Finally serialize each of the containers
for container in &self.containers {
match container.store {
Store::Array(ref values) => {
Expand All @@ -91,6 +144,13 @@ impl RoaringBitmap {
writer.write_u64::<LittleEndian>(value)?;
}
}
Store::Run(ref intervals) => {
writer.write_u16::<LittleEndian>(intervals.len() as u16)?;
for iv in intervals {
writer.write_u16::<LittleEndian>(iv.start)?;
writer.write_u16::<LittleEndian>(iv.end - iv.start)?;
}
}
}
}

Expand All @@ -116,60 +176,109 @@ impl RoaringBitmap {
/// assert_eq!(rb1, rb2);
/// ```
pub fn deserialize_from<R: io::Read>(mut reader: R) -> io::Result<RoaringBitmap> {
let (size, has_offsets) = {
// First read the cookie to determine which version of the format we are reading
let (size, has_offsets, has_run_containers) = {
let cookie = reader.read_u32::<LittleEndian>()?;
if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
(reader.read_u32::<LittleEndian>()? as usize, true)
(reader.read_u32::<LittleEndian>()? as usize, true, false)
} else if (cookie as u16) == SERIAL_COOKIE {
return Err(io::Error::new(
io::ErrorKind::Other,
"run containers are unsupported",
));
let size = ((cookie >> 16) + 1) as usize;
(size, size >= NO_OFFSET_THRESHOLD, true)
} else {
return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"));
}
};

// Read the run container bitmap if necessary
let run_container_bitmap = if has_run_containers {
let mut bitmap = vec![0u8; (size + 7) / 8];
reader.read_exact(&mut bitmap)?;
Some(bitmap)
} else {
None
};

if size > u16::max_value() as usize {
return Err(io::Error::new(
io::ErrorKind::Other,
"size is greater than supported",
));
}

let mut description_bytes = vec![0u8; size * 4];
// Read the container descriptions
let mut description_bytes = vec![0u8; size * DESCRIPTION_BYTES];
reader.read_exact(&mut description_bytes)?;
let description_bytes = &mut &description_bytes[..];

// Read the offsets if present
if has_offsets {
let mut offsets = vec![0u8; size * 4];
let mut offsets = vec![0u8; size * OFFSET_BYTES];
reader.read_exact(&mut offsets)?;
drop(offsets); // Not useful when deserializing into memory
}

let mut containers = Vec::with_capacity(size);

for _ in 0..size {
// Read each of the containers
for i in 0..size {
let key = description_bytes.read_u16::<LittleEndian>()?;
let len = u64::from(description_bytes.read_u16::<LittleEndian>()?) + 1;
let cardinality = u64::from(description_bytes.read_u16::<LittleEndian>()?) + 1;

let store = if len <= 4096 {
let mut values = Vec::with_capacity(len as usize);
for _ in 0..len {
// If the run container bitmap is present, check if this container is a run container
let is_run_container = match run_container_bitmap {
Some(ref bm) => bm[i / 8] & (1 << (i % 8)) != 0,
None => false,
};

let store = if is_run_container {
let runs = reader.read_u16::<LittleEndian>()?;
let mut intervals = Vec::with_capacity(runs as usize);
for _ in 0..runs {
let start = reader.read_u16::<LittleEndian>()?;
let run_len = reader.read_u16::<LittleEndian>()?;
let end = start + run_len;
intervals.push(Interval { start, end })
}
Store::Run(intervals)
} else if cardinality <= ARRAY_LIMIT {
let mut values = Vec::with_capacity(cardinality as usize);
for _ in 0..cardinality {
values.push(reader.read_u16::<LittleEndian>()?);
}
Store::Array(values)
} else {
let mut values = Box::new([0; 1024]);
let mut values = Box::new([0; BITMAP_LENGTH]);
for value in values.iter_mut() {
*value = reader.read_u64::<LittleEndian>()?;
}
Store::Bitmap(values)
};

containers.push(Container { key, len, store });
containers.push(Container {
key,
len: cardinality,
store,
});
}

Ok(RoaringBitmap { containers })
}
}

/// Returns the number of bytes taken by the serialized header that precedes
/// the container payloads.
///
/// `size` is the number of containers and `has_run_containers` selects the
/// layout:
///
/// * without run containers: cookie, container count, then one description
///   and one offset per container;
/// * with run containers: cookie (which also carries the container count),
///   a bitset with one bit per container marking the run containers, one
///   description per container, and one offset per container only when there
///   are at least `NO_OFFSET_THRESHOLD` containers.
fn header_size(size: usize, has_run_containers: bool) -> usize {
    if !has_run_containers {
        // Old layout: the offsets are always present.
        return COOKIE_BYTES + SIZE_BYTES + ((DESCRIPTION_BYTES + OFFSET_BYTES) * size);
    }

    // One flag bit per container, rounded up to whole bytes.
    let run_flag_bytes = (size + 7) / 8;

    // Offsets are only written once the container count reaches the threshold.
    let bytes_per_container = if size >= NO_OFFSET_THRESHOLD {
        DESCRIPTION_BYTES + OFFSET_BYTES
    } else {
        DESCRIPTION_BYTES
    };

    COOKIE_BYTES + (bytes_per_container * size) + run_flag_bytes
}
Loading