Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(indexing)!: Use UUIDv3 for indexing node ids #277

Merged
merged 2 commits into from
Sep 6, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Use UUIDv3 for indexing node ids
timonv committed Sep 6, 2024
commit 268428ab3c364e34f380cf40fc03187ecd0c46ab
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -46,6 +46,7 @@ lazy_static = { version = "1.5.0" }
chrono = { version = "0.4" }
indoc = { version = "2.0" }
regex = { version = "1.10.6" }
uuid = { version = "1.10", features = ["v3", "v4", "serde"] }

# Integrations
spider = { version = "2.2" }
2 changes: 1 addition & 1 deletion swiftide-core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -28,7 +28,7 @@ lazy_static = { workspace = true }
derive_builder = { workspace = true }

tera = { version = "1.20", default-features = false }
uuid = { version = "1.10", features = ["v4"] }
uuid = { workspace = true, features = ["v4", "v3"] }

[dev-dependencies]
test-case = { workspace = true }
30 changes: 20 additions & 10 deletions swiftide-core/src/node.rs
Original file line number Diff line number Diff line change
@@ -21,6 +21,7 @@ use std::{
collections::HashMap,
fmt::Debug,
hash::{Hash, Hasher},
os::unix::ffi::OsStrExt,
path::PathBuf,
};

@@ -37,7 +38,7 @@ use crate::{metadata::Metadata, Embedding, SparseEmbedding};
#[derive(Default, Clone, Serialize, Deserialize, PartialEq)]
pub struct Node {
/// Optional identifier for the node.
pub id: Option<u64>,
pub id: Option<uuid::Uuid>,
/// File path associated with the node.
pub path: PathBuf,
/// Data chunk contained in the node.
@@ -162,17 +163,26 @@ impl Node {
format!("{}\n{}", metadata, self.chunk)
}

/// Calculates a hash value for the node based on its path and chunk.
/// Retrieve the identifier of the node.
///
/// The hash value is calculated using the default hasher provided by the standard library.
/// Calculates the identifier of the node based on its path and chunk as bytes, returning a
/// UUID (v3).
///
/// # Returns
///
/// A 64-bit hash value representing the node.
pub fn calculate_hash(&self) -> u64 {
let mut hasher = std::collections::hash_map::DefaultHasher::new();
self.hash(&mut hasher);
hasher.finish()
/// If previously calculated, the identifier is returned directly. If not, a new identifier is
/// calculated without storing it.
pub fn id(&self) -> uuid::Uuid {
if let Some(id) = self.id {
return id;
}

let bytes = [self.path.as_os_str().as_bytes(), self.chunk.as_bytes()].concat();
uuid::Uuid::new_v3(&uuid::Uuid::NAMESPACE_OID, &bytes)
}

/// Updates the identifier of the node.
pub fn update_id(&mut self) {
self.id = None;
self.id = Some(self.id());
}
}

4 changes: 3 additions & 1 deletion swiftide-integrations/src/fluvio/loader.rs
Original file line number Diff line number Diff line change
@@ -42,7 +42,7 @@ impl Loader for Fluvio {

#[cfg(test)]
mod tests {
use std::pin::Pin;
use std::{pin::Pin, thread::sleep, time::Duration};

use super::*;
use anyhow::Result;
@@ -84,6 +84,7 @@ mod tests {
.with_wait_for(testcontainers::core::WaitFor::message_on_stdout(
"started successfully",
))
.with_wait_for(testcontainers::core::WaitFor::seconds(1))
.with_network(NETWORK_NAME)
.with_container_name("sc")
.with_cmd("./fluvio-run sc --local /fluvio/metadata".split(' '))
@@ -96,6 +97,7 @@ mod tests {
.with_wait_for(testcontainers::core::WaitFor::message_on_stdout(
"started successfully",
))
.with_wait_for(testcontainers::core::WaitFor::seconds(1))
.with_network(NETWORK_NAME)
.with_container_name("spu")
.with_cmd(format!("./fluvio-run spu -i 5001 -p spu:{SPU_PORT1} -v spu:{SPU_PORT2} --sc-addr sc:9004 --log-base-dir /fluvio/data").split(' '))
9 changes: 8 additions & 1 deletion swiftide-integrations/src/lancedb/mod.rs
Original file line number Diff line number Diff line change
@@ -157,7 +157,14 @@ impl LanceDBBuilder {
fields.push(Field::new(field.field_name(), DataType::Utf8, false));
}
FieldConfig::ID => {
fields.push(Field::new(field.field_name(), DataType::UInt64, false));
fields.push(Field::new(
field.field_name(),
DataType::FixedSizeList(
Arc::new(Field::new("item", DataType::UInt8, true)),
16,
),
false,
));
}
}
}
11 changes: 7 additions & 4 deletions swiftide-integrations/src/lancedb/persist.rs
Original file line number Diff line number Diff line change
@@ -3,12 +3,11 @@ use std::sync::Arc;
use anyhow::Context as _;
use anyhow::Result;
use arrow_array::types::Float32Type;
use arrow_array::types::UInt64Type;
use arrow_array::types::UInt8Type;
use arrow_array::types::Utf8Type;
use arrow_array::Array;
use arrow_array::FixedSizeListArray;
use arrow_array::GenericByteArray;
use arrow_array::PrimitiveArray;
use arrow_array::RecordBatch;
use arrow_array::RecordBatchIterator;
use async_trait::async_trait;
@@ -138,10 +137,14 @@ impl LanceDB {
FieldConfig::ID => {
let mut row = Vec::with_capacity(nodes.len());
for node in nodes {
let data = Some(node.calculate_hash());
let data = Some(node.id().as_bytes().map(Some));
row.push(data);
}
batches.push(Arc::new(PrimitiveArray::<UInt64Type>::from_iter(row)));
batches.push(Arc::new(FixedSizeListArray::from_iter_primitive::<
UInt8Type,
_,
_,
>(row, 16)));
}
}
}
18 changes: 9 additions & 9 deletions swiftide-integrations/src/qdrant/indexing_node.rs
Original file line number Diff line number Diff line change
@@ -35,7 +35,7 @@ impl TryInto<qdrant::PointStruct> for NodeWithVectors<'_> {
fn try_into(self) -> Result<qdrant::PointStruct> {
let node = self.node;
// Calculate a unique identifier for the node.
let id = node.calculate_hash();
let id = node.id();

// Extend the metadata with additional information.
// TODO: The node is already cloned in the `NodeWithVectors` constructor.
@@ -64,7 +64,7 @@ impl TryInto<qdrant::PointStruct> for NodeWithVectors<'_> {
try_create_vectors(&self.vector_fields, vectors, node.sparse_vectors.clone())?;

// Construct the `qdrant::PointStruct` and return it.
Ok(qdrant::PointStruct::new(id, vectors, payload))
Ok(qdrant::PointStruct::new(id.to_string(), vectors, payload))
}
}

@@ -121,10 +121,10 @@ mod tests {

use crate::qdrant::indexing_node::NodeWithVectors;

static EXPECTED_VECTOR_ID: u64 = 17_298_870_094_173_045_322;
static EXPECTED_UUID: &str = "d42d252d-671d-37ef-a157-8e85d0710610";

#[test_case(
Node { id: Some(1), path: "/path".into(), chunk: "data".into(),
Node { id: None, path: "/path".into(), chunk: "data".into(),
vectors: Some(HashMap::from([(EmbeddedField::Chunk, vec![1.0])])),
original_size: 4,
offset: 0,
@@ -133,7 +133,7 @@ mod tests {
..Default::default()
},
HashSet::from([EmbeddedField::Combined]),
PointStruct { id: Some(PointId::from(EXPECTED_VECTOR_ID)), payload: HashMap::from([
PointStruct { id: Some(PointId::from(EXPECTED_UUID)), payload: HashMap::from([
("content".into(), Value::from("data")),
("path".into(), Value::from("/path")),
("m1".into(), Value::from("mv1"))]),
@@ -142,7 +142,7 @@ mod tests {
"Node with single vector creates struct with unnamed vector"
)]
#[test_case(
Node { id: Some(1), path: "/path".into(), chunk: "data".into(),
Node { id: None, path: "/path".into(), chunk: "data".into(),
vectors: Some(HashMap::from([
(EmbeddedField::Chunk, vec![1.0]),
(EmbeddedField::Metadata("m1".into()), vec![2.0])
@@ -154,7 +154,7 @@ mod tests {
..Default::default()
},
HashSet::from([EmbeddedField::Chunk, EmbeddedField::Metadata("m1".into())]),
PointStruct { id: Some(PointId::from(EXPECTED_VECTOR_ID)), payload: HashMap::from([
PointStruct { id: Some(PointId::from(EXPECTED_UUID)), payload: HashMap::from([
("content".into(), Value::from("data")),
("path".into(), Value::from("/path")),
("m1".into(), Value::from("mv1"))]),
@@ -170,7 +170,7 @@ mod tests {
"Node with multiple vectors creates struct with named vectors"
)]
#[test_case(
Node { id: Some(1), path: "/path".into(), chunk: "data".into(),
Node { id: None, path: "/path".into(), chunk: "data".into(),
vectors: Some(HashMap::from([
(EmbeddedField::Chunk, vec![1.0]),
(EmbeddedField::Combined, vec![1.0]),
@@ -184,7 +184,7 @@ mod tests {
..Default::default()
},
HashSet::from([EmbeddedField::Combined]),
PointStruct { id: Some(PointId::from(EXPECTED_VECTOR_ID)), payload: HashMap::from([
PointStruct { id: Some(PointId::from(EXPECTED_UUID)), payload: HashMap::from([
("content".into(), Value::from("data")),
("path".into(), Value::from("/path")),
("m1".into(), Value::from("mv1")),
4 changes: 2 additions & 2 deletions swiftide-integrations/src/redis/mod.rs
Original file line number Diff line number Diff line change
@@ -123,15 +123,15 @@ impl Redis {
///
/// A `String` representing the Redis key for the node.
fn cache_key_for_node(&self, node: &Node) -> String {
format!("{}:{}", self.cache_key_prefix, node.calculate_hash())
format!("{}:{}", self.cache_key_prefix, node.id())
}

/// Generates a key for a given node to be persisted in Redis.
fn persist_key_for_node(&self, node: &Node) -> Result<String> {
if let Some(key_fn) = self.persist_key_fn {
key_fn(node)
} else {
let hash = node.calculate_hash();
let hash = node.id();
Ok(format!("{}:{}", node.path.to_string_lossy(), hash))
}
}
2 changes: 1 addition & 1 deletion swiftide-integrations/src/redis/node_cache.rs
Original file line number Diff line number Diff line change
@@ -99,7 +99,7 @@ mod tests {
cache.reset_cache().await;

let node = Node {
id: Some(1),
id: None,
path: "test".into(),
chunk: "chunk".into(),
..Default::default()
8 changes: 4 additions & 4 deletions swiftide-integrations/src/redis/persist.rs
Original file line number Diff line number Diff line change
@@ -109,7 +109,7 @@ mod tests {
.unwrap();

let node = Node {
id: Some(1),
id: None,
path: "test".into(),
chunk: "chunk".into(),
..Default::default()
@@ -134,12 +134,12 @@ mod tests {
.unwrap();
let nodes = vec![
Node {
id: Some(1),
id: None,
path: "test".into(),
..Default::default()
},
Node {
id: Some(2),
id: None,
path: "other".into(),
..Default::default()
},
@@ -168,7 +168,7 @@ mod tests {
.build()
.unwrap();
let node = Node {
id: Some(1),
id: None,
..Default::default()
};