-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Code outlines in chunk metadata (#137)
Added a transformer that generates outlines for code files using tree sitter. And another that compresses the outline to be more relevant to chunks. Additionally added a step to the metadata QA tool that uses the outline to improve the contextual awareness during QA generation.
- Loading branch information
Showing
23 changed files
with
748 additions
and
131 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
//! # [Swiftide] Example: indexing the Swiftide codebase itself with a reduced context size | ||
//! | ||
//! This example demonstrates how to index the Swiftide codebase itself, optimizing for a smaller context size. | ||
//! Note that for it to work correctly you need to have OPENAI_API_KEY set, redis and qdrant | ||
//! running. | ||
//! | ||
//! The pipeline will: | ||
//! - Load all `.rs` files from the current directory | ||
//! - Skip any nodes previously processed; hashes are based on the path and chunk (not the | ||
//! metadata!) | ||
//! - Generate an outline of the symbols defined in each file to be used as context in a later step and store it in the metadata | ||
//! - Run metadata QA on each file, using the outline as context; generating questions and answers and adding metadata | ||
//! - Chunk the code into pieces of 10 to 2048 bytes | ||
//! - For each chunk, generate a condensed subset of the symbols outline tailored for that specific chunk and store that in the metadata | ||
//! - Embed the chunks in batches of 10; metadata is embedded by default | ||
//! - Store the nodes in Qdrant | ||
//! | ||
//! Note that metadata is copied over to smaller chunks when chunking. When making LLM requests | ||
//! with lots of small chunks, consider the rate limits of the API. | ||
//! | ||
//! [Swiftide]: https://github.com/bosun-ai/swiftide | ||
//! [examples]: https://github.com/bosun-ai/swiftide/blob/master/examples | ||
use swiftide::indexing; | ||
use swiftide::indexing::loaders::FileLoader; | ||
use swiftide::indexing::transformers::{ChunkCode, Embed, MetadataQACode}; | ||
use swiftide::integrations::{self, qdrant::Qdrant, redis::Redis}; | ||
|
||
#[tokio::main] | ||
async fn main() -> Result<(), Box<dyn std::error::Error>> { | ||
tracing_subscriber::fmt::init(); | ||
|
||
let openai_client = integrations::openai::OpenAI::builder() | ||
.default_embed_model("text-embedding-3-small") | ||
.default_prompt_model("gpt-3.5-turbo") | ||
.build()?; | ||
|
||
let redis_url = std::env::var("REDIS_URL") | ||
.as_deref() | ||
.unwrap_or("redis://localhost:6379") | ||
.to_owned(); | ||
|
||
let chunk_size = 2048; | ||
|
||
indexing::Pipeline::from_loader(FileLoader::new(".").with_extensions(&["rs"])) | ||
.filter_cached(Redis::try_from_url( | ||
redis_url, | ||
"swiftide-examples-codebase-reduced-context", | ||
)?) | ||
.then( | ||
indexing::transformers::OutlineCodeTreeSitter::try_for_language( | ||
"rust", | ||
Some(chunk_size), | ||
)?, | ||
) | ||
.then(MetadataQACode::new(openai_client.clone())) | ||
.then_chunk(ChunkCode::try_for_language_and_chunk_size( | ||
"rust", | ||
10..chunk_size, | ||
)?) | ||
.then(indexing::transformers::CompressCodeOutline::new( | ||
openai_client.clone(), | ||
)) | ||
.then_in_batch(10, Embed::new(openai_client.clone())) | ||
.then_store_with( | ||
Qdrant::builder() | ||
.batch_size(50) | ||
.vector_size(1536) | ||
.collection_name("swiftide-examples-codebase-reduced-context") | ||
.build()?, | ||
) | ||
.run() | ||
.await?; | ||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
171 changes: 171 additions & 0 deletions
171
swiftide-indexing/src/transformers/compress_code_outline.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
//! `CompressCodeOutline` is a transformer that reduces the size of the outline of the parent file of a chunk to make it more relevant to the chunk. | ||
use derive_builder::Builder; | ||
use std::sync::Arc; | ||
|
||
use anyhow::Result; | ||
use async_trait::async_trait; | ||
use swiftide_core::{indexing::Node, prompt::PromptTemplate, SimplePrompt, Transformer}; | ||
|
||
/// `CompressCodeChunk` rewrites the "Outline" metadata field of a chunk to | ||
/// condense it and make it more relevant to the chunk in question. It is useful as a | ||
/// step after chunking a file that has had outline generated for it with `FileToOutlineTreeSitter`. | ||
#[derive(Debug, Clone, Builder)] | ||
#[builder(setter(into, strip_option))] | ||
pub struct CompressCodeOutline { | ||
#[builder(setter(custom))] | ||
client: Arc<dyn SimplePrompt>, | ||
#[builder(default = "default_prompt()")] | ||
prompt_template: PromptTemplate, | ||
#[builder(default)] | ||
concurrency: Option<usize>, | ||
} | ||
|
||
fn extract_markdown_codeblock(text: String) -> String { | ||
let re = regex::Regex::new(r"(?sm)```\w*\n(.*?)```").unwrap(); | ||
let captures = re.captures(text.as_str()); | ||
captures | ||
.map(|c| c.get(1).unwrap().as_str().to_string()) | ||
.unwrap_or(text) | ||
} | ||
|
||
impl CompressCodeOutline { | ||
pub fn builder() -> CompressCodeOutlineBuilder { | ||
CompressCodeOutlineBuilder::default() | ||
} | ||
|
||
pub fn from_client(client: impl SimplePrompt + 'static) -> CompressCodeOutlineBuilder { | ||
CompressCodeOutlineBuilder::default() | ||
.client(client) | ||
.to_owned() | ||
} | ||
/// Creates a new instance of `CompressCodeOutline`. | ||
/// | ||
/// # Arguments | ||
/// | ||
/// * `client` - An implementation of the `SimplePrompt` trait used to generate questions and answers. | ||
/// | ||
/// # Returns | ||
/// | ||
/// A new instance of `CompressCodeOutline` with a default prompt and a default number of questions. | ||
pub fn new(client: impl SimplePrompt + 'static) -> Self { | ||
Self { | ||
client: Arc::new(client), | ||
prompt_template: default_prompt(), | ||
concurrency: None, | ||
} | ||
} | ||
|
||
#[must_use] | ||
pub fn with_concurrency(mut self, concurrency: usize) -> Self { | ||
self.concurrency = Some(concurrency); | ||
self | ||
} | ||
} | ||
|
||
/// Returns the default prompt template for generating questions and answers. | ||
/// | ||
/// This template includes placeholders for the number of questions and the code chunk. | ||
/// | ||
/// # Returns | ||
/// | ||
/// A string representing the default prompt template. | ||
fn default_prompt() -> PromptTemplate { | ||
include_str!("prompts/compress_code_outline.prompt.md").into() | ||
} | ||
|
||
impl CompressCodeOutlineBuilder { | ||
pub fn client(&mut self, client: impl SimplePrompt + 'static) -> &mut Self { | ||
self.client = Some(Arc::new(client)); | ||
self | ||
} | ||
} | ||
|
||
#[async_trait] | ||
impl Transformer for CompressCodeOutline { | ||
/// Asynchronously transforms an `Node` by reducing the size of the outline to make it more relevant to the chunk. | ||
/// | ||
/// This method uses the `SimplePrompt` client to compress the outline of the `Node` and updates the `Node` with the compressed outline. | ||
/// | ||
/// # Arguments | ||
/// | ||
/// * `node` - The `Node` to be transformed. | ||
/// | ||
/// # Returns | ||
/// | ||
/// A result containing the transformed `Node` or an error if the transformation fails. | ||
/// | ||
/// # Errors | ||
/// | ||
/// This function will return an error if the `SimplePrompt` client fails to generate a response. | ||
#[tracing::instrument(skip_all, name = "transformers.compress_code_outline")] | ||
async fn transform_node(&self, mut node: Node) -> Result<Node> { | ||
let maybe_outline = node.metadata.get("Outline"); | ||
|
||
let Some(outline) = maybe_outline else { | ||
return Ok(node); | ||
}; | ||
|
||
let prompt = self | ||
.prompt_template | ||
.to_prompt() | ||
.with_context_value("outline", outline.as_str()) | ||
.with_context_value("code", node.chunk.as_str()); | ||
|
||
let response = extract_markdown_codeblock(self.client.prompt(prompt).await?); | ||
|
||
node.metadata.insert("Outline".to_string(), response); | ||
|
||
Ok(node) | ||
} | ||
|
||
fn concurrency(&self) -> Option<usize> { | ||
self.concurrency | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod test {
    use swiftide_core::MockSimplePrompt;

    use super::*;

    /// Snapshot test of the default prompt rendered with a fixed outline and chunk.
    #[test_log::test(tokio::test)]
    async fn test_compress_code_template() {
        let prompt = default_prompt()
            .to_prompt()
            .with_context_value("outline", "Relevant Outline")
            .with_context_value("code", "Code using outline");

        insta::assert_snapshot!(prompt.render().await.unwrap());
    }

    /// The transformer replaces the "Outline" metadata with the client's response and
    /// leaves the chunk itself untouched.
    #[tokio::test]
    async fn test_compress_code_outline() {
        let mut llm = MockSimplePrompt::new();
        llm.expect_prompt()
            .returning(|_| Ok("RelevantOutline".to_string()));

        let mut input = Node::new("Some text");
        input.offset = 0;
        input.original_size = 100;
        input
            .metadata
            .insert("Outline".to_string(), "Some outline".to_string());

        let transformer = CompressCodeOutline::builder()
            .client(llm)
            .build()
            .unwrap();

        let output = transformer.transform_node(input).await.unwrap();

        assert_eq!(output.chunk, "Some text");
        assert_eq!(output.metadata.get("Outline").unwrap(), "RelevantOutline");
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.