Skip to content

Commit

Permalink
feat(fluyt/code_ops): add languages to chunker and range for chunk si…
Browse files Browse the repository at this point in the history
…ze (#334)

* feat(fluyt/code_ops): add more treesitter languages

* fix: clippy + fmt

* feat(fluyt/code_ops): implement builder and support range

* feat(fluyt/code_ops): implement range limits for code chunking

* feat(fluyt/indexing): code chunking supports size
  • Loading branch information
timonv authored May 27, 2024
1 parent 7453ddc commit 0986136
Show file tree
Hide file tree
Showing 5 changed files with 161 additions and 43 deletions.
5 changes: 5 additions & 0 deletions crates/code_ops/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ tree-sitter-rust = "0.21.0"
indoc = { workspace = true }
anyhow = { workspace = true }
infrastructure = { path = "../infrastructure/" }
tree-sitter-python = "0.21.0"
tree-sitter-ruby = "0.21.0"
tree-sitter-typescript = "0.21.1"
tree-sitter-javascript = "0.21.3"
derive_builder = { workspace = true }

[build-dependencies]
cc = "1.0.95"
13 changes: 4 additions & 9 deletions crates/code_ops/src/code_parser.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,15 @@
#![allow(dead_code)]
use tree_sitter::{Language, Node, Parser, Tree};
use tree_sitter::{Node, Parser, Tree};

use anyhow::{Context as _, Result};
use infrastructure::supported_languages::SupportedLanguages;

use crate::supported_language_to_tree_sitter;

/// Wrapper around a tree-sitter `Parser` that `try_new` configures for one supported language.
pub struct CodeParser {
// Underlying tree-sitter parser; its language is set once in `try_new`.
parser: Parser,
}

fn try_map_language(language: &SupportedLanguages) -> Result<Language> {
match language {
SupportedLanguages::Rust => Ok(tree_sitter_rust::language()),
_ => anyhow::bail!("Language {language} not supported by code splitter"),
}
}

pub struct CodeNode {
// parent: Option<Box<CodeNode<'a>>>,
children: Vec<CodeNode>,
Expand Down Expand Up @@ -44,7 +39,7 @@ impl CodeTree {
impl CodeParser {
pub fn try_new(language: SupportedLanguages) -> Result<Self> {
let mut parser = Parser::new();
parser.set_language(&try_map_language(&language)?)?;
parser.set_language(&supported_language_to_tree_sitter(&language))?;

Ok(Self { parser })
}
Expand Down
151 changes: 120 additions & 31 deletions crates/code_ops/src/code_splitter.rs
Original file line number Diff line number Diff line change
@@ -1,55 +1,79 @@
#![allow(dead_code)]
extern crate tree_sitter;

use std::ops::Range;

use anyhow::{Context as _, Result};
use derive_builder::Builder;
use infrastructure::supported_languages::SupportedLanguages;
use tree_sitter::{Language, Node, Parser};
use tree_sitter::{Node, Parser};

use crate::supported_language_to_tree_sitter;

// TODO: Instead of counting bytes, count tokens with tiktoken
const DEFAULT_MAX_BYTES: usize = 1500;

#[derive(Debug)]
#[derive(Debug, Builder)]
/// Splits code files into meaningful chunks
///
/// Supports splitting code files into chunks based on a maximum size or a range of bytes.
pub struct CodeSplitter {
max_bytes: usize,
/// Maximum size of a chunk in bytes or a range of bytes
#[builder(default, setter(into))]
chunk_size: ChunkSize,
language: SupportedLanguages,
}

fn try_map_language(language: &SupportedLanguages) -> Result<Language> {
match language {
SupportedLanguages::Rust => Ok(tree_sitter_rust::language()),
_ => anyhow::bail!("Language {language} not supported by code splitter"),
/// Size constraint for the chunks produced by `CodeSplitter`.
///
/// Sizes are measured in bytes. The type derives `PartialEq`/`Eq` so that
/// configured sizes can be compared directly (for example in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ChunkSize {
    /// Upper limit on the chunk size in bytes; no lower limit applies.
    Bytes(usize),
    /// `start` is used as the lower limit and `end` as the upper limit on the
    /// chunk size, both in bytes.
    Range(Range<usize>),
}

impl From<usize> for ChunkSize {
fn from(size: usize) -> Self {
ChunkSize::Bytes(size)
}
}

/// Splits code files into meaningful chunks
impl CodeSplitter {
pub fn try_new(language: SupportedLanguages, max_bytes: Option<usize>) -> Result<Self> {
Ok(Self {
max_bytes: max_bytes.unwrap_or(DEFAULT_MAX_BYTES),
language,
})
impl From<Range<usize>> for ChunkSize {
fn from(range: Range<usize>) -> Self {
ChunkSize::Range(range)
}
}

/// The default size is a plain byte limit of `DEFAULT_MAX_BYTES`.
impl Default for ChunkSize {
    fn default() -> Self {
        Self::Bytes(DEFAULT_MAX_BYTES)
    }
}

pub fn new(language: SupportedLanguages, max_bytes: Option<usize>) -> Self {
impl CodeSplitter {
pub fn new(language: SupportedLanguages) -> Self {
Self {
max_bytes: max_bytes.unwrap_or(DEFAULT_MAX_BYTES),
chunk_size: Default::default(),
language,
}
}

pub fn builder() -> CodeSplitterBuilder {
CodeSplitterBuilder::default()
}

fn chunk_node(&self, node: Node, source: &str, mut last_end: usize) -> Vec<String> {
let mut new_chunks: Vec<String> = Vec::new();
let mut current_chunk = String::new();

for child in node.children(&mut node.walk()) {
if child.end_byte() - child.start_byte() > self.max_bytes {
if child.end_byte() - child.start_byte() > self.max_bytes() {
// Child is too big, recursively chunk the child
if !current_chunk.is_empty() {
if !current_chunk.is_empty() && current_chunk.len() > self.min_bytes() {
new_chunks.push(current_chunk);
}
current_chunk = String::new();
new_chunks.extend(self.chunk_node(child, source, last_end));
} else if current_chunk.len() + child.end_byte() - child.start_byte() > self.max_bytes {
} else if current_chunk.len() + child.end_byte() - child.start_byte() > self.max_bytes()
{
// Child would make the current chunk too big, so start a new chunk
new_chunks.push(current_chunk.trim().to_string());
current_chunk = source[last_end..child.end_byte()].to_string();
Expand All @@ -59,7 +83,7 @@ impl CodeSplitter {
last_end = child.end_byte();
}

if !current_chunk.is_empty() {
if !current_chunk.is_empty() && current_chunk.len() > self.min_bytes() {
new_chunks.push(current_chunk)
}

Expand All @@ -68,7 +92,7 @@ impl CodeSplitter {

pub fn split(&self, code: &str) -> Result<Vec<String>> {
let mut parser = Parser::new();
parser.set_language(&try_map_language(&self.language)?)?;
parser.set_language(&supported_language_to_tree_sitter(&self.language))?;
let tree = parser.parse(code, None).context("No nodes found")?;
let root_node = tree.root_node();

Expand All @@ -78,6 +102,21 @@ impl CodeSplitter {
Ok(self.chunk_node(root_node, code, 0))
}
}

/// Upper size limit in bytes: the plain value for `Bytes`, the range's `end` for `Range`.
fn max_bytes(&self) -> usize {
    match self.chunk_size {
        ChunkSize::Range(ref bounds) => bounds.end,
        ChunkSize::Bytes(limit) => limit,
    }
}

/// Lower size limit in bytes: the range's `start` for `Range`, otherwise 0.
fn min_bytes(&self) -> usize {
    match &self.chunk_size {
        ChunkSize::Range(bounds) => bounds.start,
        ChunkSize::Bytes(_) => 0,
    }
}
}

#[cfg(test)]
Expand All @@ -89,7 +128,7 @@ mod test {
fn test_split_single_chunk() {
let code = "fn hello_world() {}";

let splitter = CodeSplitter::new(SupportedLanguages::Rust, None);
let splitter = CodeSplitter::new(SupportedLanguages::Rust);

let chunks = splitter.split(code);

Expand All @@ -98,7 +137,7 @@ mod test {

#[test]
fn test_chunk_lines() {
let splitter = CodeSplitter::new(SupportedLanguages::Rust, None);
let splitter = CodeSplitter::new(SupportedLanguages::Rust);

let text = indoc! {r#"
fn main() {
Expand All @@ -120,10 +159,11 @@ mod test {

#[test]
fn test_max_bytes_limit() {
let splitter = CodeSplitter::new(
SupportedLanguages::Rust,
Some(50), // Max 50 bytes
);
let splitter = CodeSplitter::builder()
.language(SupportedLanguages::Rust)
.chunk_size(50)
.build()
.unwrap();

let text = indoc! {r#"
fn main() {
Expand All @@ -146,15 +186,64 @@ mod test {

#[test]
fn test_empty_text() {
let splitter = CodeSplitter::new(
SupportedLanguages::Rust,
Some(50), // Max 50 characters
);
let splitter = CodeSplitter::builder()
.language(SupportedLanguages::Rust)
.chunk_size(50)
.build()
.unwrap();

let text = "";
let chunks = splitter.split(text).unwrap();

dbg!(&chunks);
assert_eq!(chunks.len(), 0);
}

// A `0..50` chunk size sets the upper limit to 50 bytes and the lower limit to 0,
// so the short `fn main()` chunk is expected alongside the two body chunks.
#[test]
fn test_range_max() {
let splitter = CodeSplitter::builder()
.language(SupportedLanguages::Rust)
.chunk_size(0..50)
.build()
.unwrap();

let text = indoc! {r#"
fn main() {
println!("Hello, World!");
println!("Goodbye, World!");
}
"#};
let chunks = splitter.split(text).unwrap();
assert_eq!(
chunks,
vec![
"fn main()",
"{\n println!(\"Hello, World!\");",
"\n println!(\"Goodbye, World!\");\n}",
]
)
}

// With a `20..50` chunk size the lower limit is 20 bytes: the 9-byte `fn main()`
// chunk is absent from the expected output, leaving only the two body chunks.
#[test]
fn test_range_min_and_max() {
let splitter = CodeSplitter::builder()
.language(SupportedLanguages::Rust)
.chunk_size(20..50)
.build()
.unwrap();
let text = indoc! {r#"
fn main() {
println!("Hello, World!");
println!("Goodbye, World!");
}
"#};
let chunks = splitter.split(text).unwrap();
assert_eq!(
chunks,
vec![
"{\n println!(\"Hello, World!\");",
"\n println!(\"Goodbye, World!\");\n}",
]
)
}
}
14 changes: 13 additions & 1 deletion crates/code_ops/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,16 @@
mod code_parser;
mod code_splitter;

pub use {code_parser::CodeParser, code_splitter::CodeSplitter};
use infrastructure::SupportedLanguages;
use tree_sitter::Language;
pub use {code_parser::CodeParser, code_splitter::ChunkSize, code_splitter::CodeSplitter};

/// Maps a supported language to the tree-sitter grammar used to parse it.
///
/// The match has no catch-all arm, so every `SupportedLanguages` variant must
/// be listed here explicitly.
pub(crate) fn supported_language_to_tree_sitter(language: &SupportedLanguages) -> Language {
    match *language {
        SupportedLanguages::Javascript => tree_sitter_javascript::language(),
        SupportedLanguages::Python => tree_sitter_python::language(),
        SupportedLanguages::Ruby => tree_sitter_ruby::language(),
        SupportedLanguages::Rust => tree_sitter_rust::language(),
        SupportedLanguages::Typescript => tree_sitter_typescript::language_typescript(),
    }
}
21 changes: 19 additions & 2 deletions crates/indexing/src/transformers/chunk_code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::{
ingestion_node::IngestionNode, ingestion_pipeline::IngestionStream, traits::ChunkerTransformer,
};
use async_trait::async_trait;
use code_ops::CodeSplitter;
use code_ops::{ChunkSize, CodeSplitter};
use futures_util::{stream, StreamExt};
use infrastructure::SupportedLanguages;

Expand All @@ -15,7 +15,24 @@ impl ChunkCode {
pub fn for_language(lang: impl Into<SupportedLanguages>) -> Self {
let lang = lang.into();
Self {
chunker: CodeSplitter::new(lang, None),
chunker: CodeSplitter::builder()
.language(lang)
.build()
.expect("Failed to build code splitter"),
}
}

/// Creates a `ChunkCode` for `lang` whose splitter uses the given chunk size.
///
/// `chunk_size` accepts anything convertible into `ChunkSize`.
///
/// # Panics
///
/// Panics with "Failed to build code splitter" if the builder returns an error.
pub fn for_language_and_chunk_size(
    lang: impl Into<SupportedLanguages>,
    chunk_size: impl Into<ChunkSize>,
) -> Self {
    let language: SupportedLanguages = lang.into();
    let size: ChunkSize = chunk_size.into();
    let chunker = CodeSplitter::builder()
        .language(language)
        .chunk_size(size)
        .build()
        .expect("Failed to build code splitter");

    Self { chunker }
}
}
Expand Down

0 comments on commit 0986136

Please sign in to comment.