Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(loaders): CSV Loader to Document Loaders #30

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
407 changes: 406 additions & 1 deletion Cargo.lock

Large diffs are not rendered by default.

13 changes: 9 additions & 4 deletions rig-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ repository = "https://github.com/0xPlaygrounds/rig"
name="rig"
path="src/lib.rs"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
reqwest = { version = "0.11.22", features = ["json"] }
serde = { version = "1.0.193", features = ["derive"] }
Expand All @@ -22,8 +20,15 @@ futures = "0.3.29"
ordered-float = "4.2.0"
schemars = "0.8.16"
thiserror = "1.0.61"
async-trait = "0.1.68"
csv = "1.3.0"
lopdf = "0.34.0"
html5ever = "0.29.0"
markdown = "0.3.0"
tokio = { version = "1.34.0", features = ["fs", "io-util"] }
anyhow = "1.0.75"
walkdir = "2.5.0"

[dev-dependencies]
anyhow = "1.0.75"
tokio = { version = "1.34.0", features = ["full"] }
tracing-subscriber = "0.3.18"
87 changes: 87 additions & 0 deletions rig-core/examples/document_loaders.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
use rig::{
completion::Prompt,
document_loaders::PdfLoader,
embeddings::EmbeddingsBuilder,
providers::openai::{Client, TEXT_EMBEDDING_ADA_002},
vector_store::{in_memory_store::InMemoryVectorStore, VectorStore},
};
use std::env;
use std::path::PathBuf;

/// Example: build a RAG agent over the contents of a PDF file.
///
/// Loads the sample PDF through `PdfLoader`, embeds it with the OpenAI
/// `TEXT_EMBEDDING_ADA_002` model, stores the embeddings in an in-memory vector
/// store, and asks a `gpt-4` agent to summarize the document.
///
/// The PDF path is relative, so it is resolved against the current working
/// directory (printed at startup to make path problems easy to diagnose).
///
/// # Errors
/// Propagates any failure from reading the working directory, canonicalizing
/// the path, building the embeddings, adding documents to the store, or
/// prompting the agent. A missing file is reported on stderr and the program
/// exits with `Ok(())`.
#[tokio::main]
async fn main() -> Result<(), anyhow::Error> {
    // Print current working directory
    println!("Current working directory: {:?}", env::current_dir()?);

    // Path to the PDF file. Kept as a `&str` as well so it can be handed to the
    // loader without a fallible `Path` -> `str` conversion.
    let pdf_path_str = "rig-core/examples/sample_data/moores_law_for_everything.pdf";
    let pdf_path = PathBuf::from(pdf_path_str);

    // Check existence BEFORE canonicalizing: `canonicalize` fails on a missing
    // path, which would otherwise skip this friendlier message entirely.
    if !pdf_path.exists() {
        eprintln!("Error: The file {} does not exist.", pdf_path.display());
        return Ok(());
    }

    // Print absolute path
    println!(
        "Attempting to access file at: {:?}",
        pdf_path.canonicalize()?
    );

    println!("File found successfully!");

    // Initialize OpenAI client
    let openai = Client::from_env();
    let embedding_model = openai.embedding_model(TEXT_EMBEDDING_ADA_002);

    // Create vector store
    let mut vector_store = InMemoryVectorStore::default();

    // Build embeddings
    let embeddings = EmbeddingsBuilder::new(embedding_model.clone())
        .add_loader(PdfLoader::new(pdf_path_str))
        .build()
        .await?;

    println!(
        "Embeddings created successfully. Count: {}",
        embeddings.len()
    );
    for emb in &embeddings {
        println!("Number of embeddings: {}", emb.embeddings.len());
        println!(
            "First embedding vector length: {}",
            emb.embeddings.first().map_or(0, |e| e.vec.len())
        );
        println!("--------------------");
    }

    // Add documents to vector store
    vector_store.add_documents(embeddings).await?;

    // Create vector store index
    let index = vector_store.index(embedding_model);

    // Create RAG agent
    let rag_agent = openai
        .agent("gpt-4")
        .preamble(
            "
You are a knowledgeable assistant.
Use the information provided to you to answer questions.
",
        )
        .dynamic_context(5, index)
        .build();

    // Prompt the agent and print the response
    let response = rag_agent
        .prompt("give me a summary of the document.")
        .await?;

    println!("Agent Response:\n{}", response);

    Ok(())
}
85 changes: 85 additions & 0 deletions rig-core/examples/rag_with_csv.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
use rig::{
completion::Prompt,
document_loaders::CsvLoader,
embeddings::EmbeddingsBuilder,
providers::openai::{Client, TEXT_EMBEDDING_ADA_002},
vector_store::{in_memory_store::InMemoryVectorStore, VectorStore},
};
use std::env;
use std::path::PathBuf;

/// Example: build a RAG agent over the contents of a CSV file.
///
/// Loads the sample CSV through `CsvLoader`, embeds it with the OpenAI
/// `TEXT_EMBEDDING_ADA_002` model, stores the embeddings in an in-memory vector
/// store, and asks a `gpt-4` agent to summarize the data.
///
/// The CSV path is relative, so it is resolved against the current working
/// directory (printed at startup to make path problems easy to diagnose).
///
/// # Errors
/// Propagates any failure from reading the working directory, canonicalizing
/// the path, building the embeddings, adding documents to the store, or
/// prompting the agent. A missing file is reported on stderr and the program
/// exits with `Ok(())`.
#[tokio::main]
async fn main() -> Result<(), anyhow::Error> {
    // Print current working directory
    println!("Current working directory: {:?}", env::current_dir()?);

    // Path to the CSV file. Kept as a `&str` as well so it can be handed to the
    // loader without a fallible `Path` -> `str` conversion.
    let csv_path_str = "rig-core/examples/sample_data/top_rated_movies.csv";
    let csv_path = PathBuf::from(csv_path_str);

    // Check existence BEFORE canonicalizing: `canonicalize` fails on a missing
    // path, which would otherwise skip this friendlier message entirely.
    if !csv_path.exists() {
        eprintln!("Error: The file {} does not exist.", csv_path.display());
        return Ok(());
    }

    // Print absolute path
    println!(
        "Attempting to access file at: {:?}",
        csv_path.canonicalize()?
    );

    println!("File found successfully!");

    // Initialize OpenAI client
    let openai = Client::from_env();
    let embedding_model = openai.embedding_model(TEXT_EMBEDDING_ADA_002);

    // Create vector store
    let mut vector_store = InMemoryVectorStore::default();

    // Build embeddings
    let embeddings = EmbeddingsBuilder::new(embedding_model.clone())
        .add_loader(CsvLoader::new(csv_path_str))
        .build()
        .await?;

    println!(
        "Embeddings created successfully. Count: {}",
        embeddings.len()
    );
    for emb in &embeddings {
        println!("Number of embeddings: {}", emb.embeddings.len());
        println!(
            "First embedding vector length: {}",
            emb.embeddings.first().map_or(0, |e| e.vec.len())
        );
        println!("--------------------");
    }

    // Add documents to vector store
    vector_store.add_documents(embeddings).await?;

    // Create vector store index
    let index = vector_store.index(embedding_model);

    // Create RAG agent
    let rag_agent = openai
        .agent("gpt-4")
        .preamble(
            "
You are a knowledgeable assistant.
Use the information provided to you to answer questions about the CSV data.
",
        )
        .dynamic_context(5, index)
        .build();

    // Prompt the agent and print the response
    let response = rag_agent
        .prompt("Give me a summary of the CSV data.")
        .await?;

    println!("Agent Response:\n{}", response);

    Ok(())
}
Binary file not shown.
110 changes: 110 additions & 0 deletions rig-core/examples/sample_data/top_rated_movies.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
popularity,release_date,title,vote_average
174.522,9/23/1994,The Shawshank Redemption,8.706
165.677,3/14/1972,The Godfather,8.69
174.522,9/23/1994,The Shawshank Redemption,8.706
165.677,3/14/1972,The Godfather,8.69
47.916,12/20/1997,Life Is Beautiful,8.449
197.569,11/5/2014,Interstellar,8.44
42.629,10/13/2023,TAYLOR SWIFT | THE ERAS TOUR,8.388
21.39,11/19/2020,Gabriel's Inferno: Part III,8.4
69.527,7/3/1985,Back to the Future,8.318
49.507,6/2/1989,Dead Poets Society,8.312
18.452,10/28/1998,The Legend of 1900,8.266
14.67,8/22/2020,Given,8.3
173.556,12/7/2022,Puss in Boots: The Last Wish,8.227
29.933,10/26/2020,Wolfwalkers,8.22
135.05,10/7/2016,Hacksaw Ridge,8.198
46.126,12/19/1971,A Clockwork Orange,8.2
20.423,1/28/2005,Innocent Voices,8.174
27.808,11/3/1953,Tokyo Story,8.2
83.731,3/18/2021,Zack Snyder's Justice League,8.148
29.312,10/25/2019,Better Days,8.1
89.436,7/3/1991,Terminator 2: Judgment Day,8.119
14.482,7/6/1944,Double Indemnity,8.1
55.332,9/19/2013,Prisoners,8.098
14.629,3/31/1954,Sansho the Bailiff,8.098
11.283,11/24/2021,Far from the Tree,8.074
67.735,9/16/2005,Pride & Prejudice,8.075
38.208,2/26/2014,The Grand Budapest Hotel,8.1
42.511,12/3/2019,How to Train Your Dragon: Homecoming,8.048
13.881,10/27/2022,Beyond the Universe,8.027
26.21,10/18/2019,Jojo Rabbit,8.024
12.038,11/27/2020,Black Beauty,8
37.023,6/8/2009,Hachi: A Dog's Tale,8.008
5.126,6/1/2017,In a Heartbeat,7.995
33.577,12/23/2009,3 Idiots,7.995
137.914,5/3/2023,Guardians of the Galaxy Vol. 3,8
18.167,1/31/2009,Love Exposure,8
38.425,8/6/1999,The Sixth Sense,7.957
21.449,12/15/2004,Million Dollar Baby,7.957
56.873,6/13/2007,No Country for Old Men,7.944
35.17,10/18/2013,12 Years a Slave,7.942
12.423,1/21/2022,My Father's Violin,7.926
20.658,6/22/1954,On the Waterfront,7.9
10.913,8/1/1997,Children of Heaven,7.914
27.948,12/21/2016,Dangal,7.913
30.657,1/15/2021,Wish Dragon,7.902
28.602,11/4/2016,A Street Cat Named Bob,7.905
11.461,6/10/2008,La Maison en Petits Cubes,7.893
52.355,12/20/2017,The Greatest Showman,7.891
19.146,9/20/2000,Yi Yi,7.875
19.15,9/1/2000,Dancer in the Dark,7.875
14.602,12/21/2011,My Way,7.858
18.928,9/16/2004,Downfall,7.858
13.954,3/15/1940,The Grapes of Wrath,7.8
43.665,3/30/1990,Dances with Wolves,7.847
16.392,5/1/1983,Nostalgia,7.838
28.293,12/22/1960,Two Women,7.837
42.994,12/3/2022,The First Slam Dunk,7.8
92.789,6/21/2007,Ratatouille,7.824
32.022,3/20/1972,Solaris,7.8
22.13,6/16/2004,Before Sunset,7.818
33.161,10/23/2009,Fantastic Mr. Fox,7.8
141.3,7/9/2003,Pirates of the Caribbean: The Curse of the Black Pearl,7.804
10.15,8/21/1988,A Short Film About Love,7.794
45.622,9/20/2012,The Perks of Being a Wallflower,7.793
16.517,8/31/2000,Nine Queens,7.784
12.185,12/17/1993,The Wrong Trousers,7.784
20.295,12/4/1990,Awakenings,7.768
11.949,5/28/2009,Partly Cloudy,7.767
18.816,11/18/1974,A Woman Under the Influence,7.8
18.124,2/14/2008,The Chaser,7.758
86.727,2/11/2016,Zootopia,7.749
12.807,3/19/1980,The King and the Mockingbird,7.749
20.424,9/28/2019,Marriage Story,7.738
56.129,7/13/2022,The Killer,7.7
16.337,11/20/2020,Sound of Metal,7.727
14.088,9/20/1962,Vivre Sa Vie,7.727
18.88,12/22/2004,Hotel Rwanda,7.7
24.783,2/18/2017,Sword Art Online: The Movie – Ordinal Scale,7.718
62.912,6/10/2005,Batman Begins,7.709
28.113,12/9/1965,A Charlie Brown Christmas,7.707
8.592,4/7/1966,For Love and Gold,7.698
15.608,3/31/2011,The Turin Horse,7.7
61.64,9/30/2015,The Martian,7.687
13.588,1/8/2014,Boys,7.687
12.97,6/1/1998,"Black Cat, White Cat",7.68
12.521,9/25/1961,The Hustler,7.68
14.101,6/27/1951,Strangers on a Train,7.671
16.624,5/23/2019,The Traitor,7.672
29.525,3/6/1996,Primal Fear,7.661
26.983,3/31/2016,Hunt for the Wilderpeople,7.7
13.112,9/2/1949,White Heat,7.6
46.7,7/24/2020,The Kissing Booth 2,7.648
20.064,9/28/2022,Entergalactic,7.641
78.458,2/22/2024,Exhuma,7.64
56.569,1/19/2017,A Dog's Purpose,7.632
47.331,9/26/2008,Fireproof,7.632
18.036,10/19/1970,Le Cercle Rouge,7.623
65.228,2/24/2017,Get Out,7.623
38.806,4/9/2015,The Longest Ride,7.614
20.63,3/30/2005,Mysterious Skin,7.615
24.423,3/24/1989,The Killer,7.6
13.757,6/19/1969,The Wild Bunch,7.607
25.542,10/24/2008,Changeling,7.6
25.864,12/20/1991,JFK,7.6
23.749,6/19/2020,Feel the Beat,7.59
45.792,3/30/1999,10 Things I Hate About You,7.6
24.649,8/24/2018,Searching,7.583
79.872,12/15/2009,Avatar,7.583
17.303,6/19/2014,What We Do in the Shadows,7.575
49 changes: 49 additions & 0 deletions rig-core/src/document_loaders/csv.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
use async_trait::async_trait;
use csv::Reader;
use serde_json::json;
use std::error::Error as StdError;
use tokio::fs::File;
use tokio::io::AsyncReadExt;

use super::DocumentLoader;
use crate::embeddings::DocumentEmbeddings;

/// Document loader that reads a CSV file from disk.
///
/// The loader only records the file path at construction time; no I/O is
/// performed until the loader is actually asked to load its document.
#[derive(Debug, Clone)]
pub struct CsvLoader {
    /// Path of the CSV file to read, exactly as supplied to [`CsvLoader::new`].
    path: String,
}

impl CsvLoader {
    /// Creates a loader for the CSV file at `path`.
    ///
    /// The path is copied into the loader and is not validated here; a missing
    /// or unreadable file is only detected when the file is opened.
    pub fn new(path: &str) -> Self {
        Self {
            path: path.to_owned(),
        }
    }
}

#[async_trait]
impl DocumentLoader for CsvLoader {
    /// Reads the CSV file at `self.path` and flattens it into one text document.
    ///
    /// The first row is treated as the header row. Every following record is
    /// rendered as one `header: value` line per field, and records are separated
    /// by a blank line. The whole file becomes a single [`DocumentEmbeddings`]
    /// whose `id` is the file path, whose `document` is `{"text": <rendered text>}`,
    /// and whose `embeddings` list is empty.
    ///
    /// # Errors
    /// Returns an error if the file cannot be opened or read into a `String`,
    /// or if the CSV parser rejects the header row or any record.
    async fn load(&self) -> Result<Vec<DocumentEmbeddings>, Box<dyn StdError + Send + Sync>> {
        let mut file = File::open(&self.path).await?;
        let mut contents = String::new();
        file.read_to_string(&mut contents).await?;

        let mut reader = Reader::from_reader(contents.as_bytes());
        // Take an owned copy of the header record so `reader` can be mutably
        // borrowed again by `records()` below.
        let headers = reader.headers()?.clone();

        let mut csv_content = String::new();

        for result in reader.records() {
            let record = result?;
            // Pair headers with fields instead of indexing `headers[i]`, so a
            // record with more fields than headers can never panic, and append
            // the pieces directly rather than allocating a `String` per field.
            for (header, field) in headers.iter().zip(record.iter()) {
                csv_content.push_str(header);
                csv_content.push_str(": ");
                csv_content.push_str(field);
                csv_content.push('\n');
            }
            // Blank line between records.
            csv_content.push('\n');
        }

        Ok(vec![DocumentEmbeddings {
            id: self.path.clone(),
            document: json!({"text": csv_content}),
            embeddings: vec![],
        }])
    }
}
28 changes: 28 additions & 0 deletions rig-core/src/document_loaders/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
//! This module contains the implementation of document loaders for various file formats.
//! Currently, it includes loaders for CSV and PDF files.

mod csv;
// mod directory;
// mod html;
// mod json;
// mod markdown;
// mod office;
mod pdf;

use crate::embeddings::DocumentEmbeddings;
use async_trait::async_trait;
use std::error::Error as StdError;

/// Common interface for types that load a document from some source and expose it
/// as [`DocumentEmbeddings`] values.
///
/// The trait is declared through `#[async_trait]` so that `load` can be written as
/// an `async fn`. `CsvLoader` in this module is one implementor.
#[async_trait]
pub trait DocumentLoader {
/// Asynchronously loads the document and returns a vector of document embeddings.
///
/// # Errors
/// Failures are reported as a boxed error that is `Send + Sync`; the concrete
/// error type is chosen by each implementor.
async fn load(&self) -> Result<Vec<DocumentEmbeddings>, Box<dyn StdError + Send + Sync>>;
}

pub use csv::CsvLoader;
// pub use directory::DirectoryLoader;
// pub use html::HtmlLoader;
// pub use json::JsonLoader;
// pub use markdown::MarkdownLoader;
// pub use office::OfficeLoader;
pub use pdf::PdfLoader;
Loading