Skip to content

Commit

Permalink
feat: Embed modes and named vectors (#123)
Browse files Browse the repository at this point in the history
Added named vector support to Qdrant. A pipeline can now have its embed
mode configured: per field, chunk and metadata combined (default),
or both. Vectors need to be configured on the Qdrant client side.

See `examples/store_multiple_vectors.rs` for an example.

Shoutout to @pwalski for the contribution. Closes #62.

---------

Co-authored-by: Przemyslaw Walski <[email protected]>
  • Loading branch information
pwalski and pwalski authored Jul 9, 2024
1 parent 2b13523 commit 699cfe4
Show file tree
Hide file tree
Showing 18 changed files with 887 additions and 150 deletions.
34 changes: 34 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions examples/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,7 @@ path = "scraping_index_to_markdown.rs"
[[example]]
name = "aws-bedrock"
path = "aws_bedrock.rs"

[[example]]
name = "store-multiple-vectors"
path = "store_multiple_vectors.rs"
72 changes: 72 additions & 0 deletions examples/store_multiple_vectors.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
//! # [Swiftide] Ingesting file with multiple metadata stored as named vectors
//!
//! This example demonstrates how to ingest a LICENSE file, generate multiple metadata, and store it all in Qdrant with individual named vectors
//!
//! The pipeline will:
//! - Load the LICENSE file from the current directory
//! - Chunk the file into pieces of 20 to 2048 bytes
//! - Generate questions and answers for each chunk
//! - Generate a summary for each chunk
//! - Generate a title for each chunk
//! - Generate keywords for each chunk
//! - Embed each chunk
//! - Embed each metadata
//! - Store the nodes in Qdrant with chunk and metadata embeds as named vectors
//!
//! [Swiftide]: https://github.com/bosun-ai/swiftide
//! [examples]: https://github.com/bosun-ai/swiftide/blob/master/examples
use swiftide::{
indexing::{self, EmbedMode, EmbeddedField},
integrations::{
self,
qdrant::{Distance, Qdrant, VectorConfig},
},
loaders::FileLoader,
transformers::{
metadata_keywords, metadata_qa_text, metadata_summary, metadata_title, ChunkMarkdown,
Embed, MetadataKeywords, MetadataQAText, MetadataSummary, MetadataTitle,
},
};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    tracing_subscriber::fmt::init();

    // A single OpenAI client is cloned into every prompting and embedding step below.
    let llm = integrations::openai::OpenAI::builder()
        .default_embed_model("text-embedding-3-small")
        .default_prompt_model("gpt-4o")
        .build()?;

    // Load the file, chunk it, attach the four kinds of metadata and embed
    // each field on its own (`EmbedMode::PerField`).
    let pipeline = indexing::Pipeline::from_loader(FileLoader::new("LICENSE"))
        .with_concurrency(1)
        .with_embed_mode(EmbedMode::PerField)
        .then_chunk(ChunkMarkdown::from_chunk_range(20..2048))
        .then(MetadataQAText::new(llm.clone()))
        .then(MetadataSummary::new(llm.clone()))
        .then(MetadataTitle::new(llm.clone()))
        .then(MetadataKeywords::new(llm.clone()))
        .then_in_batch(10, Embed::new(llm.clone()))
        .log_all()
        .filter_errors();

    // The title vector is the only one configured with an explicit distance;
    // the remaining vectors are registered from their `EmbeddedField` alone.
    let title_vector = VectorConfig::builder()
        .embedded_field(EmbeddedField::Metadata(metadata_title::NAME.into()))
        .distance(Distance::Manhattan)
        .build()?;

    // One named vector per embedded field: the chunk plus each metadata entry.
    let storage = Qdrant::builder()
        .batch_size(50)
        .vector_size(1536)
        .collection_name("swiftide-multi-vectors")
        .with_vector(EmbeddedField::Chunk)
        .with_vector(EmbeddedField::Metadata(metadata_qa_text::NAME.into()))
        .with_vector(EmbeddedField::Metadata(metadata_summary::NAME.into()))
        .with_vector(title_vector)
        .with_vector(EmbeddedField::Metadata(metadata_keywords::NAME.into()))
        .build()?;

    pipeline.then_store_with(storage).run().await?;
    Ok(())
}
1 change: 1 addition & 0 deletions swiftide/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ testcontainers = "0.20.0"
mockall = "0.12.1"
temp-dir = "0.1.13"
wiremock = "0.6.0"
test-case = "3.3.1"

[lints.rust]
unsafe_code = "forbid"
Expand Down
78 changes: 69 additions & 9 deletions swiftide/src/indexing/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@
//! individual units of data. It is particularly useful in scenarios where metadata and data chunks
//! need to be processed together.
use std::{
collections::HashMap,
collections::{BTreeMap, HashMap},
fmt::Debug,
hash::{Hash, Hasher},
path::PathBuf,
};

use itertools::Itertools;
use serde::{Deserialize, Serialize};

/// Represents a unit of data in the indexing process.
Expand All @@ -39,10 +40,12 @@ pub struct Node {
pub path: PathBuf,
/// Data chunk contained in the node.
pub chunk: String,
/// Optional vector representation of the data chunk.
pub vector: Option<Vec<f32>>,
/// Optional vector representation of embedded data.
pub vectors: Option<HashMap<EmbeddedField, Vec<f32>>>,
/// Metadata associated with the node.
pub metadata: HashMap<String, String>,
pub metadata: BTreeMap<String, String>,
/// Mode of embedding data Chunk and Metadata
pub embed_mode: EmbedMode,
}

impl Debug for Node {
Expand All @@ -57,9 +60,16 @@ impl Debug for Node {
.field("chunk", &self.chunk)
.field("metadata", &self.metadata)
.field(
"vector",
&self.vector.as_ref().map(|v| format!("[{}]", v.len())),
"vectors",
&self
.vectors
.iter()
.map(HashMap::iter)
.flatten()
.map(|(embed_type, vec)| format!("'{embed_type}': {}", vec.len()))
.join(","),
)
.field("embed_mode", &self.embed_mode)
.finish()
}
}
Expand All @@ -75,15 +85,37 @@ impl Node {
}
}

/// Converts the node into an embeddable string format.
/// Creates embeddable data depending on chosen `EmbedMode`.
///
/// The embeddable format consists of the metadata formatted as key-value pairs, each on a new line,
/// # Returns
///
/// Embeddable data mapped to their `EmbeddedField`.
pub fn as_embeddables(&self) -> Vec<(EmbeddedField, String)> {
    // `Both` enables the combined embeddable as well as the per-field ones.
    let wants_combined = matches!(
        self.embed_mode,
        EmbedMode::SingleWithMetadata | EmbedMode::Both
    );
    let wants_per_field = matches!(self.embed_mode, EmbedMode::PerField | EmbedMode::Both);

    let mut out = Vec::new();

    // The combined entry, when requested, always comes first.
    if wants_combined {
        out.push((EmbeddedField::Combined, self.combine_chunk_with_metadata()));
    }

    // Then the raw chunk, followed by one entry per metadata key in map order.
    if wants_per_field {
        out.push((EmbeddedField::Chunk, self.chunk.clone()));
        out.extend(
            self.metadata
                .iter()
                .map(|(key, text)| (EmbeddedField::Metadata(key.clone()), text.clone())),
        );
    }

    out
}

/// Converts the node into an [self::EmbeddedField::Combined] type of embeddable.
///
/// This embeddable format consists of the metadata formatted as key-value pairs, each on a new line,
/// followed by the data chunk.
///
/// # Returns
///
/// A string representing the embeddable format of the node.
pub fn as_embeddable(&self) -> String {
fn combine_chunk_with_metadata(&self) -> String {
// Metadata formatted by newlines joined with the chunk
let metadata = self
.metadata
Expand Down Expand Up @@ -118,3 +150,31 @@ impl Hash for Node {
self.chunk.hash(state);
}
}

/// Embed mode of the pipeline.
///
/// Selects which embeddables `Node::as_embeddables` produces for a node:
/// the chunk combined with its metadata, the chunk and every metadata entry
/// separately, or all of them.
///
/// The enum is a plain fieldless value, so it is `Copy` and can be compared
/// and hashed cheaply.
///
/// See also [super::pipeline::Pipeline::with_embed_mode].
#[derive(Copy, Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum EmbedMode {
    #[default]
    /// Embedding Chunk of data combined with Metadata (the default).
    SingleWithMetadata,
    /// Embedding Chunk of data and every Metadata separately.
    PerField,
    /// Embedding Chunk of data and every Metadata separately, and also the
    /// Chunk of data combined with Metadata.
    Both,
}

/// Type of Embeddable stored in model.
///
/// Identifies which part of a node an embeddable (and its resulting vector)
/// was created from. The type is hashable so it can be used as a map key,
/// and it derives `Debug` so it can appear in diagnostics and in derived
/// `Debug` output of types that contain it.
#[derive(
    Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq, Hash, strum_macros::Display,
)]
pub enum EmbeddedField {
    #[default]
    /// Embeddable created from Chunk of data combined with Metadata.
    Combined,
    /// Embeddable created from Chunk of data only.
    Chunk,
    /// Embeddable created from Metadata.
    /// String stores Metadata name.
    ///
    /// Displayed as `Metadata: <name>`.
    #[strum(to_string = "Metadata: {0}")]
    Metadata(String),
}
25 changes: 24 additions & 1 deletion swiftide/src/indexing/pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use tracing::Instrument;

use std::{sync::Arc, time::Duration};

use super::{IndexingStream, Node};
use super::{EmbedMode, IndexingStream, Node};

/// A pipeline for indexing files, adding metadata, chunking, transforming, embedding, and then storing them.
///
Expand Down Expand Up @@ -84,6 +84,29 @@ impl Pipeline {
self
}

/// Configures the embed mode used by nodes flowing through the pipeline.
///
/// The mode is written onto every `Ok` node that passes this point in the
/// stream; items that are errors are not modified.
///
/// NOTE(review): presumably this should be called before the embedding step
/// so that the mode is already set when nodes are embedded — confirm.
///
/// See also [super::node::EmbedMode].
///
/// # Arguments
///
/// * `embed_mode` - The desired embed mode.
///
/// # Returns
///
/// An instance of `Pipeline` with the updated embed mode.
pub fn with_embed_mode(mut self, embed_mode: EmbedMode) -> Self {
    // `EmbedMode` is `Copy`, so the closure can own it and stamp each node.
    let stamped = self.stream.map_ok(move |mut node| {
        node.embed_mode = embed_mode;
        node
    });
    self.stream = stamped.boxed().into();
    self
}

/// Filters out cached nodes using the provided cache.
///
/// # Arguments
Expand Down
Loading

0 comments on commit 699cfe4

Please sign in to comment.