Skip to content

Commit

Permalink
feat(indexing): Metadata as first class citizen (#204)
Browse files Browse the repository at this point in the history
Adds our own implementation for metadata, internally still using a
BTreeMap. The Value type is now a `serde_json::Value` enum. This allows
us to store the metadata in the same format as the rest of the document,
and also allows us to use values programmatically later.

For now, all existing metadata is still stored as strings.

Closes #162
  • Loading branch information
timonv authored Jul 28, 2024
1 parent 573aff6 commit ec1fb04
Show file tree
Hide file tree
Showing 14 changed files with 246 additions and 53 deletions.
3 changes: 1 addition & 2 deletions examples/aws_bedrock.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
.get_all_values()
.await
.iter()
.filter_map(|n| n.metadata.get("Summary"))
.cloned()
.filter_map(|n| n.metadata.get("Summary").map(|v| v.to_string()))
.collect::<Vec<_>>()
.join("\n---\n")
);
Expand Down
2 changes: 1 addition & 1 deletion examples/index_groq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
.get_all_values()
.await
.into_iter()
.flat_map(|n| n.metadata.into_values())
.flat_map(|n| n.metadata.into_values().map(|v| v.to_string()))
.collect::<Vec<_>>()
.join("\n")
);
Expand Down
3 changes: 3 additions & 0 deletions swiftide-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,16 @@ pub mod type_aliases;
pub mod prompt;
pub use type_aliases::*;

mod metadata;

/// All traits are available from the root
pub use crate::indexing_traits::*;
pub use crate::query_traits::*;

pub mod indexing {
pub use crate::indexing_stream::IndexingStream;
pub use crate::indexing_traits::*;
pub use crate::metadata::*;
pub use crate::node::*;
}

Expand Down
176 changes: 176 additions & 0 deletions swiftide-core/src/metadata.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
//! Metadata is a key-value store attached to indexing nodes.
//!
//! Metadata typically holds additional information that was extracted from, or generated for, a node.
//!
//! Internally it uses a `BTreeMap` to store the key-value pairs, so entries are kept sorted by key.
use std::collections::{btree_map::IntoValues, BTreeMap};

use serde::Deserializer;

/// Key-value metadata for a node.
///
/// Keys are `String`s and values are `serde_json::Value`s. The pairs live in a
/// `BTreeMap`, so iteration visits them in ascending key order.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct Metadata {
// Backing map; private, so access goes through the methods and trait impls on `Metadata`.
inner: BTreeMap<String, serde_json::Value>,
}

impl Metadata {
    /// Returns an iterator over borrowed `(key, value)` pairs, in key order.
    pub fn iter(&self) -> impl Iterator<Item = (&String, &serde_json::Value)> {
        let entries = &self.inner;
        entries.iter()
    }

    /// Stores `value` under `key`, converting both into their owned forms.
    ///
    /// A value already stored under the same key is replaced; the previous
    /// value is not returned.
    pub fn insert<K, V>(&mut self, key: K, value: V)
    where
        K: Into<String>,
        V: Into<serde_json::Value>,
    {
        let (name, json) = (key.into(), value.into());
        self.inner.insert(name, json);
    }

    /// Looks up the value stored under `key`, or `None` when the key is absent.
    pub fn get(&self, key: impl AsRef<str>) -> Option<&serde_json::Value> {
        let name: &str = key.as_ref();
        self.inner.get(name)
    }

    /// Consumes the metadata and yields only its values, ordered by key.
    pub fn into_values(self) -> IntoValues<String, serde_json::Value> {
        let Self { inner } = self;
        inner.into_values()
    }
}

/// Adds every `(key, value)` pair produced by an iterator to the metadata.
impl<K, V> Extend<(K, V)> for Metadata
where
    K: Into<String>,
    V: Into<serde_json::Value>,
{
    fn extend<T: IntoIterator<Item = (K, V)>>(&mut self, iter: T) {
        // Pairs are inserted one at a time, so a later pair replaces an
        // earlier one that converts to the same key.
        for (key, value) in iter {
            self.inner.insert(key.into(), value.into());
        }
    }
}

/// Builds metadata from an owned list of `(key, value)` pairs.
impl<K, V> From<Vec<(K, V)>> for Metadata
where
    K: Into<String>,
    V: Into<serde_json::Value>,
{
    fn from(items: Vec<(K, V)>) -> Self {
        // Convert lazily; the map is built in a single `collect`.
        let entries = items
            .into_iter()
            .map(|(key, value)| (key.into(), value.into()));

        Self {
            inner: entries.collect(),
        }
    }
}

/// Builds metadata from a borrowed slice of `(key, value)` pairs.
///
/// The slice is left untouched: each key and value is cloned before conversion.
impl<'a, K, V> From<&'a [(K, V)]> for Metadata
where
    K: Into<String> + Clone,
    V: Into<serde_json::Value> + Clone,
{
    fn from(items: &'a [(K, V)]) -> Self {
        Self {
            inner: items
                .iter()
                .map(|(key, value)| (key.clone().into(), value.clone().into()))
                .collect(),
        }
    }
}

/// Builds metadata from a fixed-size array of `(key, value)` pairs.
///
/// Keys and values are converted into `String` and `serde_json::Value`. The
/// array needs no particular order, because the backing `BTreeMap` orders
/// entries by key as it is built. An empty array yields empty metadata.
/// When several pairs convert to the same key, only one value is kept for it.
impl<K, V, const N: usize> From<[(K, V); N]> for Metadata
where
    K: Into<String>,
    V: Into<serde_json::Value>,
{
    fn from(arr: [(K, V); N]) -> Self {
        // No pre-sort and no `N == 0` special case: collecting into a
        // `BTreeMap` handles ordering and empty input, so `K: Ord` is not needed.
        let inner: BTreeMap<String, serde_json::Value> =
            arr.into_iter().map(|(k, v)| (k.into(), v.into())).collect();
        Metadata { inner }
    }
}

/// Consuming iteration: yields owned `(key, value)` pairs in key order.
impl IntoIterator for Metadata {
    type Item = (String, serde_json::Value);
    type IntoIter = std::collections::btree_map::IntoIter<String, serde_json::Value>;

    fn into_iter(self) -> Self::IntoIter {
        // Hand the backing map's own owning iterator to the caller.
        IntoIterator::into_iter(self.inner)
    }
}

/// Borrowing iteration: lets `for (key, value) in &metadata` walk the pairs
/// in key order without consuming the metadata.
impl<'iter> IntoIterator for &'iter Metadata {
    type Item = (&'iter String, &'iter serde_json::Value);
    type IntoIter = std::collections::btree_map::Iter<'iter, String, serde_json::Value>;

    fn into_iter(self) -> Self::IntoIter {
        let entries = &self.inner;
        entries.iter()
    }
}

impl<'de> serde::Deserialize<'de> for Metadata {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
BTreeMap::deserialize(deserializer).map(|inner| Metadata { inner })
}
}

/// Serialization delegates to the backing `BTreeMap`, so `Metadata` is written
/// exactly as the map itself would be, with no wrapper around it.
impl serde::Serialize for Metadata {
    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        serde::Serialize::serialize(&self.inner, serializer)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// A value stored with `insert` can be read back with `get`.
    #[test]
    fn test_insert_and_get() {
        let (key, value) = ("key", "value");
        let mut metadata = Metadata::default();
        metadata.insert(key, value);

        let stored = metadata.get(key).unwrap();
        assert_eq!(stored.as_str(), Some(value));
    }

    /// `iter` yields exactly the inserted pairs, in key order.
    #[test]
    fn test_iter() {
        let mut metadata = Metadata::default();
        metadata.insert("key1", json!("value1"));
        metadata.insert("key2", json!("value2"));

        let (key1, key2) = ("key1".to_string(), "key2".to_string());
        let (value1, value2) = (json!("value1"), json!("value2"));

        let entries: Vec<_> = metadata.iter().collect();
        assert_eq!(entries, vec![(&key1, &value1), (&key2, &value2)]);
    }

    /// `extend` makes every supplied pair retrievable.
    #[test]
    fn test_extend() {
        let pairs = vec![("key1", json!("value1")), ("key2", json!("value2"))];
        let mut metadata = Metadata::default();
        metadata.extend(pairs.clone());

        for (key, value) in &pairs {
            assert_eq!(metadata.get(key), Some(value));
        }
    }

    /// Converting from a `Vec` of pairs makes every pair retrievable.
    #[test]
    fn test_from_vec() {
        let pairs = vec![("key1", json!("value1")), ("key2", json!("value2"))];
        let metadata = Metadata::from(pairs.clone());

        for (key, value) in &pairs {
            assert_eq!(metadata.get(key), Some(value));
        }
    }

    /// `into_values` yields the values in key order.
    #[test]
    fn test_into_values() {
        let mut metadata = Metadata::default();
        metadata.insert("key1", json!("value1"));
        metadata.insert("key2", json!("value2"));

        let values = metadata.into_values().collect::<Vec<_>>();
        let expected = [json!("value1"), json!("value2")];
        assert_eq!(values, expected);
    }
}
19 changes: 15 additions & 4 deletions swiftide-core/src/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
//! individual units of data. It is particularly useful in scenarios where metadata and data chunks
//! need to be processed together.
use std::{
collections::{BTreeMap, HashMap},
collections::HashMap,
fmt::Debug,
hash::{Hash, Hasher},
path::PathBuf,
Expand All @@ -27,6 +27,8 @@ use std::{
use itertools::Itertools;
use serde::{Deserialize, Serialize};

use crate::metadata::Metadata;

/// Represents a unit of data in the indexing process.
///
/// `Node` encapsulates all necessary information for a single unit of data being processed
Expand All @@ -43,7 +45,7 @@ pub struct Node {
/// Optional vector representation of embedded data.
pub vectors: Option<HashMap<EmbeddedField, Vec<f32>>>,
/// Metadata associated with the node.
pub metadata: BTreeMap<String, String>,
pub metadata: Metadata,
/// Mode of embedding data Chunk and Metadata
pub embed_mode: EmbedMode,
}
Expand Down Expand Up @@ -99,7 +101,10 @@ impl Node {
if self.embed_mode == EmbedMode::PerField || self.embed_mode == EmbedMode::Both {
embeddables.push((EmbeddedField::Chunk, self.chunk.clone()));
for (name, value) in &self.metadata {
embeddables.push((EmbeddedField::Metadata(name.clone()), value.clone()));
let value = value
.as_str()
.map_or_else(|| value.to_string(), ToString::to_string);
embeddables.push((EmbeddedField::Metadata(name.clone()), value));
}
}

Expand All @@ -119,7 +124,13 @@ impl Node {
let metadata = self
.metadata
.iter()
.map(|(k, v)| format!("{k}: {v}"))
.map(|(k, v)| {
let v = v
.as_str()
.map_or_else(|| v.to_string(), ToString::to_string);

format!("{k}: {v}")
})
.collect::<Vec<String>>()
.join("\n");

Expand Down
28 changes: 11 additions & 17 deletions swiftide-indexing/src/transformers/embed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,11 @@ impl BatchableTransformer for Embed {

#[cfg(test)]
mod tests {
use swiftide_core::indexing::{EmbedMode, EmbeddedField, Node};
use swiftide_core::indexing::{EmbedMode, EmbeddedField, Metadata, Node};
use swiftide_core::{BatchableTransformer, MockEmbeddingModel};

use super::Embed;

use std::collections::HashMap;

use futures_util::StreamExt;
use mockall::predicate::*;
use test_case::test_case;
Expand All @@ -130,7 +128,7 @@ mod tests {
struct TestData<'a> {
pub embed_mode: EmbedMode,
pub chunk: &'a str,
pub metadata: HashMap<&'a str, &'a str>,
pub metadata: Metadata,
pub expected_embedables: Vec<&'a str>,
pub expected_vectors: Vec<(EmbeddedField, Vec<f32>)>,
}
Expand All @@ -139,14 +137,14 @@ mod tests {
TestData {
embed_mode: EmbedMode::SingleWithMetadata,
chunk: "chunk_1",
metadata: HashMap::from([("meta_1", "prompt_1")]),
metadata: Metadata::from([("meta_1", "prompt_1")]),
expected_embedables: vec!["meta_1: prompt_1\nchunk_1"],
expected_vectors: vec![(EmbeddedField::Combined, vec![1f32])]
},
TestData {
embed_mode: EmbedMode::SingleWithMetadata,
chunk: "chunk_2",
metadata: HashMap::from([("meta_2", "prompt_2")]),
metadata: Metadata::from([("meta_2", "prompt_2")]),
expected_embedables: vec!["meta_2: prompt_2\nchunk_2"],
expected_vectors: vec![(EmbeddedField::Combined, vec![2f32])]
}
Expand All @@ -155,7 +153,7 @@ mod tests {
TestData {
embed_mode: EmbedMode::PerField,
chunk: "chunk_1",
metadata: HashMap::from([("meta_1", "prompt 1")]),
metadata: Metadata::from([("meta_1", "prompt 1")]),
expected_embedables: vec!["chunk_1", "prompt 1"],
expected_vectors: vec![
(EmbeddedField::Chunk, vec![10f32]),
Expand All @@ -165,7 +163,7 @@ mod tests {
TestData {
embed_mode: EmbedMode::PerField,
chunk: "chunk_2",
metadata: HashMap::from([("meta_2", "prompt 2")]),
metadata: Metadata::from([("meta_2", "prompt 2")]),
expected_embedables: vec!["chunk_2", "prompt 2"],
expected_vectors: vec![
(EmbeddedField::Chunk, vec![20f32]),
Expand All @@ -177,7 +175,7 @@ mod tests {
TestData {
embed_mode: EmbedMode::Both,
chunk: "chunk_1",
metadata: HashMap::from([("meta_1", "prompt 1")]),
metadata: Metadata::from([("meta_1", "prompt 1")]),
expected_embedables: vec!["meta_1: prompt 1\nchunk_1", "chunk_1", "prompt 1"],
expected_vectors: vec![
(EmbeddedField::Combined, vec![10f32]),
Expand All @@ -188,7 +186,7 @@ mod tests {
TestData {
embed_mode: EmbedMode::Both,
chunk: "chunk_2",
metadata: HashMap::from([("meta_2", "prompt 2")]),
metadata: Metadata::from([("meta_2", "prompt 2")]),
expected_embedables: vec!["meta_2: prompt 2\nchunk_2", "chunk_2", "prompt 2"],
expected_vectors: vec![
(EmbeddedField::Combined, vec![20f32]),
Expand All @@ -201,7 +199,7 @@ mod tests {
TestData {
embed_mode: EmbedMode::Both,
chunk: "chunk_1",
metadata: HashMap::from([("meta_10", "prompt 10"), ("meta_11", "prompt 11"), ("meta_12", "prompt 12")]),
metadata: Metadata::from([("meta_10", "prompt 10"), ("meta_11", "prompt 11"), ("meta_12", "prompt 12")]),
expected_embedables: vec!["meta_10: prompt 10\nmeta_11: prompt 11\nmeta_12: prompt 12\nchunk_1", "chunk_1", "prompt 10", "prompt 11", "prompt 12"],
expected_vectors: vec![
(EmbeddedField::Combined, vec![10f32]),
Expand All @@ -214,7 +212,7 @@ mod tests {
TestData {
embed_mode: EmbedMode::Both,
chunk: "chunk_2",
metadata: HashMap::from([("meta_20", "prompt 20"), ("meta_21", "prompt 21"), ("meta_22", "prompt 22")]),
metadata: Metadata::from([("meta_20", "prompt 20"), ("meta_21", "prompt 21"), ("meta_22", "prompt 22")]),
expected_embedables: vec!["meta_20: prompt 20\nmeta_21: prompt 21\nmeta_22: prompt 22\nchunk_2", "chunk_2", "prompt 20", "prompt 21", "prompt 22"],
expected_vectors: vec![
(EmbeddedField::Combined, vec![20f32]),
Expand All @@ -232,11 +230,7 @@ mod tests {
.iter()
.map(|data| Node {
chunk: data.chunk.into(),
metadata: data
.metadata
.iter()
.map(|(k, v)| ((*k).to_string(), (*v).to_string()))
.collect(),
metadata: data.metadata.clone(),
embed_mode: data.embed_mode,
..Default::default()
})
Expand Down
2 changes: 1 addition & 1 deletion swiftide-indexing/src/transformers/metadata_keywords.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ impl Transformer for MetadataKeywords {
let prompt = self.prompt_template.to_prompt().with_node(&node);
let response = self.client.prompt(prompt).await?;

node.metadata.insert(NAME.into(), response);
node.metadata.insert(NAME, response);

Ok(node)
}
Expand Down
2 changes: 1 addition & 1 deletion swiftide-indexing/src/transformers/metadata_qa_code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ impl Transformer for MetadataQACode {

let response = self.client.prompt(prompt).await?;

node.metadata.insert(NAME.into(), response);
node.metadata.insert(NAME, response);

Ok(node)
}
Expand Down
Loading

0 comments on commit ec1fb04

Please sign in to comment.