Skip to content

Commit

Permalink
feat: kmeans1d (#533)
Browse files Browse the repository at this point in the history
* chore: do not require clang version =15

Signed-off-by: usamoi <[email protected]>

* refactor: Vec2

Signed-off-by: usamoi <[email protected]>

* feat: kmeans1d

Signed-off-by: usamoi <[email protected]>

* ci: set clang-16 as default clang

Signed-off-by: usamoi <[email protected]>

* fix: kmeans1d boundary check

Signed-off-by: usamoi <[email protected]>

---------

Signed-off-by: usamoi <[email protected]>
  • Loading branch information
usamoi authored Jul 22, 2024
1 parent 97636d7 commit 3cd2ca5
Show file tree
Hide file tree
Showing 19 changed files with 257 additions and 221 deletions.
1 change: 1 addition & 0 deletions .github/workflows/psql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ jobs:
wget --quiet -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
sudo apt-get update
sudo apt-get install -y clang-16
sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-16 128
- name: Set up Pgrx
run: |
# pg_config
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ jobs:
wget --quiet -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
sudo apt-get update
sudo apt-get install -y clang-16
sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-16 128
- name: Set up Pgrx
run: |
# pg_config
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ jobs:
wget --quiet -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
sudo apt-get update
sudo apt-get install -y clang-16
sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-16 128
- name: Set up Pgrx
run: |
# pg_config
Expand Down Expand Up @@ -149,6 +150,7 @@ jobs:
wget --quiet -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
sudo apt-get update
sudo apt-get install -y clang-16
sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-16 128
- name: Set up Pgrx
run: |
# pg_config
Expand Down
7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions crates/base/src/scalar/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ mod f32;
mod half_f16;
mod i8;

use std::iter::Sum;

pub use f32::F32;
pub use half_f16::F16;
pub use i8::I8;
Expand All @@ -19,7 +21,9 @@ pub trait ScalarLike:
+ num_traits::Zero
+ num_traits::NumOps
+ num_traits::NumAssignOps
+ Default
+ crate::pod::Pod
+ Sum
{
fn from_f32(x: f32) -> Self;
fn to_f32(self) -> f32;
Expand Down
2 changes: 1 addition & 1 deletion crates/c/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ fn main() {
println!("cargo:rerun-if-changed=src/f16.h");
println!("cargo:rerun-if-changed=src/f16.c");
cc::Build::new()
.compiler("clang-16")
.compiler("clang")
.file("./src/f16.c")
.opt_level(3)
.flag("-fassociative-math")
Expand Down
4 changes: 4 additions & 0 deletions crates/c/src/f16.c
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
#if !(__clang_major__ >= 16)
#error "clang version must be >= 16"
#endif

#include "f16.h"
#include <math.h>

Expand Down
1 change: 0 additions & 1 deletion crates/common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,3 @@ pub mod remap;
pub mod sample;
pub mod variants;
pub mod vec2;
pub mod vec3;
8 changes: 4 additions & 4 deletions crates/common/src/sample.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ pub fn sample<O: Operator>(vectors: &impl Vectors<O>) -> Vec2<Scalar<O>> {
let n = vectors.len();
let m = std::cmp::min(SAMPLES as u32, n);
let f = super::rand::sample_u32(&mut rand::thread_rng(), n, m);
let mut samples = Vec2::new(vectors.dims(), m as usize);
let mut samples = Vec2::zeros((m as usize, vectors.dims() as usize));
for i in 0..m {
let v = vectors.vector(f[i as usize] as u32).to_vec();
samples[i as usize].copy_from_slice(&v);
samples[(i as usize,)].copy_from_slice(&v);
}
samples
}
Expand All @@ -27,12 +27,12 @@ pub fn sample_subvector_transform<O: Operator>(
let n = vectors.len();
let m = std::cmp::min(SAMPLES as u32, n);
let f = super::rand::sample_u32(&mut rand::thread_rng(), n, m);
let mut samples = Vec2::new((e - s) as u32, m as usize);
let mut samples = Vec2::zeros((m as usize, e - s));
for i in 0..m {
let v = transform(vectors.vector(f[i as usize] as u32))
.as_borrowed()
.to_vec();
samples[i as usize].copy_from_slice(&v[s..e]);
samples[(i as usize,)].copy_from_slice(&v[s..e]);
}
samples
}
85 changes: 45 additions & 40 deletions crates/common/src/vec2.rs
Original file line number Diff line number Diff line change
@@ -1,70 +1,75 @@
use base::pod::Pod;
use serde::{Deserialize, Serialize};
use std::ops::{Deref, DerefMut, Index, IndexMut};
use std::ops::{Index, IndexMut};

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Vec2<T> {
dims: u32,
v: Vec<T>,
shape: (usize, usize),
base: Vec<T>,
}

impl<T: Pod + Ord> Vec2<T> {
pub fn new(dims: u32, n: usize) -> Self {
impl<T: Default + Copy> Vec2<T> {
pub fn zeros(shape: (usize, usize)) -> Self {
Self {
dims,
v: base::pod::zeroed_vec(dims as usize * n),
shape,
base: vec![T::default(); shape.0 * shape.1],
}
}
pub fn dims(&self) -> u32 {
self.dims
pub fn from_vec(shape: (usize, usize), base: Vec<T>) -> Self {
assert_eq!(shape.0 * shape.1, base.len());
Self { shape, base }
}
pub fn len(&self) -> usize {
self.v.len() / self.dims as usize
}

impl<T: Copy> Vec2<T> {
/// Copies the contents of row `l_i` over row `r_i` in place.
///
/// A row is the run of `shape.1` consecutive elements of `base` that starts
/// at offset `i * shape.1`. The first tuple argument is the source row and
/// the second is the destination row.
///
/// # Panics
///
/// Panics if `l_i` or `r_i` is not less than `shape.0`.
pub fn copy_within(&mut self, (l_i,): (usize,), (r_i,): (usize,)) {
// Check both row indices up front so an out-of-range row is reported
// before any element offsets are computed.
assert!(l_i < self.shape.0);
assert!(r_i < self.shape.0);
// Element range [src_from, src_to) covers the whole source row.
let src_from = l_i * self.shape.1;
let src_to = src_from + self.shape.1;
// First element offset of the destination row.
let dest = r_i * self.shape.1;
// Delegates to the slice `copy_within`, which permits the source and
// destination ranges to overlap, so `l_i == r_i` needs no special case.
self.base.copy_within(src_from..src_to, dest);
}
}

impl<T> Vec2<T> {
pub fn shape_0(&self) -> usize {
self.shape.0
}
pub fn is_empty(&self) -> bool {
self.len() == 0
pub fn shape_1(&self) -> usize {
self.shape.1
}
pub fn argsort(&self) -> Vec<usize> {
let mut index: Vec<usize> = (0..self.len()).collect();
index.sort_by_key(|i| &self[*i]);
index
pub fn as_slice(&self) -> &[T] {
self.base.as_slice()
}
pub fn copy_within(&mut self, i: usize, j: usize) {
assert!(i < self.len() && j < self.len());
unsafe {
if i != j {
let src = self.v.as_ptr().add(self.dims as usize * i);
let dst = self.v.as_mut_ptr().add(self.dims as usize * j);
std::ptr::copy_nonoverlapping(src, dst, self.dims as usize);
}
}
pub fn as_mut_slice(&mut self) -> &mut [T] {
self.base.as_mut_slice()
}
}

impl<T> Index<usize> for Vec2<T> {
impl<T> Index<(usize,)> for Vec2<T> {
type Output = [T];

fn index(&self, index: usize) -> &Self::Output {
&self.v[self.dims as usize * index..][..self.dims as usize]
fn index(&self, (i,): (usize,)) -> &Self::Output {
&self.base[i * self.shape.1..][..self.shape.1]
}
}

impl<T> IndexMut<usize> for Vec2<T> {
fn index_mut(&mut self, index: usize) -> &mut Self::Output {
&mut self.v[self.dims as usize * index..][..self.dims as usize]
impl<T> IndexMut<(usize,)> for Vec2<T> {
fn index_mut(&mut self, (i,): (usize,)) -> &mut Self::Output {
&mut self.base[i * self.shape.1..][..self.shape.1]
}
}

impl<T> Deref for Vec2<T> {
type Target = [T];
impl<T> Index<(usize, usize)> for Vec2<T> {
type Output = T;

fn deref(&self) -> &Self::Target {
self.v.deref()
fn index(&self, (i, j): (usize, usize)) -> &Self::Output {
&self.base[i * self.shape.1..][j]
}
}

impl<T> DerefMut for Vec2<T> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.v.deref_mut()
impl<T> IndexMut<(usize, usize)> for Vec2<T> {
fn index_mut(&mut self, (i, j): (usize, usize)) -> &mut Self::Output {
&mut self.base[i * self.shape.1..][j]
}
}
101 changes: 0 additions & 101 deletions crates/common/src/vec3.rs

This file was deleted.

4 changes: 2 additions & 2 deletions crates/ivf/src/ivf_naive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ fn from_nothing<O: Op>(
rayon::check();
let centroids = {
let mut samples = samples;
for i in 0..samples.len() {
O::elkan_k_means_normalize(&mut samples[i]);
for i in 0..samples.shape_0() {
O::elkan_k_means_normalize(&mut samples[(i,)]);
}
k_means(nlist as usize, samples)
};
Expand Down
8 changes: 4 additions & 4 deletions crates/ivf/src/ivf_residual.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ impl<O: Op> IvfResidual<O> {
);
let vectors = lists
.iter()
.map(|&(_, i)| O::vector_sub(vector, &self.centroids[i]))
.map(|&(_, i)| O::vector_sub(vector, &self.centroids[(i,)]))
.collect::<Vec<_>>();
let mut reranker = self
.quantization
Expand Down Expand Up @@ -104,8 +104,8 @@ fn from_nothing<O: Op>(
rayon::check();
let centroids = {
let mut samples = samples;
for i in 0..samples.len() {
O::elkan_k_means_normalize(&mut samples[i]);
for i in 0..samples.shape_0() {
O::elkan_k_means_normalize(&mut samples[(i,)]);
}
k_means(nlist as usize, samples)
};
Expand Down Expand Up @@ -141,7 +141,7 @@ fn from_nothing<O: Op>(
O::elkan_k_means_normalize(&mut vector);
k_means_lookup(&vector, &centroids)
};
O::vector_sub(vector, &centroids[target])
O::vector_sub(vector, &centroids[(target,)])
},
);
let payloads = MmapArray::create(
Expand Down
1 change: 1 addition & 0 deletions crates/k_means/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ rand.workspace = true

base = { path = "../base" }
common = { path = "../common" }
smawk = "0.3.2"
stoppable_rayon = { path = "../stoppable_rayon" }

[lints]
Expand Down
Loading

0 comments on commit 3cd2ca5

Please sign in to comment.