diff --git a/Cargo.lock b/Cargo.lock index 79d804eb..7f24a15b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -685,6 +685,15 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "examples" +version = "0.0.0" +dependencies = [ + "swiftide", + "tokio", + "tracing-subscriber", +] + [[package]] name = "flate2" version = "1.0.30" @@ -2732,6 +2741,7 @@ dependencies = [ "once_cell", "regex", "sharded-slab", + "smallvec", "thread_local", "tracing", "tracing-core", diff --git a/Cargo.toml b/Cargo.toml index 7d011ae8..b0fe0640 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,5 +3,5 @@ incremental = true debug = 0 [workspace] -members = ["swiftide"] +members = ["swiftide", "examples"] resolver = "2" diff --git a/examples/Cargo.toml b/examples/Cargo.toml new file mode 100644 index 00000000..ef4e34e1 --- /dev/null +++ b/examples/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "examples" +version = "0.0.0" +publish = false +edition = "2021" + +[dev-dependencies] +tokio = { version = "1.0", features = ["full"] } +swiftide = { path = "../swiftide/" } +tracing-subscriber = "0.3" + +[[example]] +name = "ingest-codebase" +path = "ingest_codebase.rs" diff --git a/examples/ingest_codebase.rs b/examples/ingest_codebase.rs new file mode 100644 index 00000000..10c6ec4e --- /dev/null +++ b/examples/ingest_codebase.rs @@ -0,0 +1,69 @@ +//! # [Swiftide] Ingesting the Swiftide itself example +//! +//! This example demonstrates how to ingest the Swiftide codebase itself. +//! Note that for it to work correctly you need to have OPENAI_API_KEY set, redis and qdrant +//! running. +//! +//! The pipeline will: +//! - Load all `.rs` files from the current directory +//! - Skip any nodes previously processed; hashes are based on the path and chunk (not the +//! metadata!) +//! - Run metadata QA on each chunk; generating questions and answers and adding metadata +//! - Chunk the code into pieces of 10 to 2048 bytes +//! - Embed the chunks in batches of 10, Metadata is embedded by default +//! - Store the nodes in Qdrant +//! +//! Note that metadata is copied over to smaller chunks when chunking. When making LLM requests +//! with lots of small chunks, consider the rate limits of the API. +//! +//! [Swiftide]: https://github.com/bosun-ai/swiftide +//! [examples]: https://github.com/bosun-ai/swiftide/blob/master/examples + +use swiftide::{ + ingestion, + integrations::{self, qdrant::Qdrant, redis::RedisNodeCache}, + loaders::FileLoader, + transformers::{ChunkCode, MetadataQACode, OpenAIEmbed}, +}; + +#[tokio::main] +async fn main() -> Result<(), Box> { + tracing_subscriber::fmt::init(); + + let openai_client = integrations::openai::OpenAI::builder() + .default_embed_model("text-embedding-3-small") + .default_prompt_model("gpt-3.5-turbo") + .build()?; + + let redis_url = std::env::var("REDIS_URL") + .as_deref() + .unwrap_or("redis://localhost:6379") + .to_owned(); + + let qdrant_url = std::env::var("QDRANT_URL") + .as_deref() + .unwrap_or("http://localhost:6334") + .to_owned(); + + ingestion::IngestionPipeline::from_loader(FileLoader::new(".").with_extensions(&["rs"])) + .filter_cached(RedisNodeCache::try_from_url( + redis_url, + "swiftide-examples", + )?) + .then(MetadataQACode::new(openai_client.clone())) + .then_chunk(ChunkCode::try_for_language_and_chunk_size( + "rust", + 10..2048, + )?) + .then_in_batch(10, OpenAIEmbed::new(openai_client.clone())) + .store_with( + Qdrant::try_from_url(qdrant_url)? + .batch_size(50) + .vector_size(1536) + .collection_name("swiftide-examples".to_string()) + .build()?, + ) + .run() + .await?; + Ok(()) +} diff --git a/swiftide/src/ingestion/ingestion_pipeline.rs b/swiftide/src/ingestion/ingestion_pipeline.rs index acc76572..d6ff5f2a 100644 --- a/swiftide/src/ingestion/ingestion_pipeline.rs +++ b/swiftide/src/ingestion/ingestion_pipeline.rs @@ -143,6 +143,10 @@ impl IngestionPipeline { #[tracing::instrument(skip_all, fields(total_nodes), name = "ingestion_pipeline.run")] pub async fn run(mut self) -> Result<()> { + tracing::info!( + "Starting ingestion pipeline with {} concurrency", + self.concurrency + ); let Some(ref storage) = self.storage else { anyhow::bail!("No storage configured for ingestion pipeline") }; diff --git a/swiftide/src/integrations/openai/mod.rs b/swiftide/src/integrations/openai/mod.rs index 200f8b46..e65ea6e8 100644 --- a/swiftide/src/integrations/openai/mod.rs +++ b/swiftide/src/integrations/openai/mod.rs @@ -42,4 +42,28 @@ impl OpenAIBuilder { self.client = Some(Arc::new(client)); self } + + pub fn default_embed_model(&mut self, model: impl Into) -> &mut Self { + if let Some(options) = self.default_options.as_mut() { + options.embed_model = Some(model.into()); + } else { + self.default_options = Some(Options { + embed_model: Some(model.into()), + ..Default::default() + }); + } + self + } + + pub fn default_prompt_model(&mut self, model: impl Into) -> &mut Self { + if let Some(options) = self.default_options.as_mut() { + options.prompt_model = Some(model.into()); + } else { + self.default_options = Some(Options { + prompt_model: Some(model.into()), + ..Default::default() + }); + } + self + } } diff --git a/swiftide/src/integrations/qdrant/mod.rs b/swiftide/src/integrations/qdrant/mod.rs index 4851315b..e1f662be 100644 --- a/swiftide/src/integrations/qdrant/mod.rs +++ b/swiftide/src/integrations/qdrant/mod.rs @@ -17,7 +17,7 @@ pub struct Qdrant { #[builder(default = "DEFAULT_COLLECTION_NAME.to_string()")] collection_name: String, vector_size: usize, - #[builder(default)] + #[builder(default, setter(strip_option))] batch_size: Option, } @@ -26,8 +26,8 @@ impl Qdrant { QdrantBuilder::default() } - pub fn try_from_url(url: &str) -> Result { - Ok(QdrantBuilder::default().client(QdrantClient::from_url(url).build()?)) + pub fn try_from_url(url: impl AsRef) -> Result { + Ok(QdrantBuilder::default().client(QdrantClient::from_url(url.as_ref()).build()?)) } pub async fn create_index_if_not_exists(&self) -> Result<()> { diff --git a/swiftide/src/integrations/redis/node_cache.rs b/swiftide/src/integrations/redis/node_cache.rs index 6e744513..0b56d543 100644 --- a/swiftide/src/integrations/redis/node_cache.rs +++ b/swiftide/src/integrations/redis/node_cache.rs @@ -13,12 +13,12 @@ pub struct RedisNodeCache { } impl RedisNodeCache { - pub fn try_from_url(url: &str, prefix: &str) -> Result { - let client = redis::Client::open(url).context("Failed to open redis client")?; + pub fn try_from_url(url: impl AsRef, prefix: impl AsRef) -> Result { + let client = redis::Client::open(url.as_ref()).context("Failed to open redis client")?; Ok(Self { client, connection_manager: RwLock::new(None), - key_prefix: prefix.to_string(), + key_prefix: prefix.as_ref().to_string(), }) } diff --git a/swiftide/src/transformers/chunk_code.rs b/swiftide/src/transformers/chunk_code.rs index 28e9fd0c..f6913b5c 100644 --- a/swiftide/src/transformers/chunk_code.rs +++ b/swiftide/src/transformers/chunk_code.rs @@ -20,8 +20,8 @@ impl ChunkCode { }) } - pub fn for_language_and_chunk_size( - lang: impl Into, + pub fn try_for_language_and_chunk_size( + lang: impl TryInto, chunk_size: impl Into, ) -> Result { Ok(Self {