Skip to content

Commit

Permalink
Merge pull request #342 from MichaelClifford/pdf
Browse files Browse the repository at this point in the history
Add pdf option to rag recipe
  • Loading branch information
rhatdan authored Apr 25, 2024
2 parents df755ca + 1270b68 commit 477bda9
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 11 deletions.
39 changes: 28 additions & 11 deletions recipes/natural_language_processing/rag/app/rag_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,18 @@
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.callbacks import StreamlitCallbackHandler
from langchain.schema.document import Document
from langchain_community.vectorstores import Chroma

from langchain_community.document_loaders import PyPDFLoader
from langchain.schema.document import Document
from chromadb import HttpClient
from chromadb.config import Settings
import chromadb.utils.embedding_functions as embedding_functions

import streamlit as st

import tempfile
import uuid
import os
import argparse
import pathlib

model_service = os.getenv("MODEL_ENDPOINT","http://0.0.0.0:8001/v1")
model_service = os.getenv("MODEL_ENDPOINT","http://0.0.0.0:8001")
model_service = f"{model_service}/v1"
chunk_size = os.getenv("CHUNK_SIZE", 150)
embedding_model = os.getenv("EMBEDDING_MODEL","BAAI/bge-base-en-v1.5")
Expand All @@ -39,9 +36,28 @@ def clear_vdb():
except:
pass

def read_file(file):
    """Extract the text content of an uploaded document.

    Args:
        file: Uploaded file object exposing a ``type`` attribute holding its
            MIME type. PDF uploads must provide ``getvalue()`` returning the
            raw bytes; plain-text uploads must provide ``read()`` returning
            bytes.

    Returns:
        The document text as a single string. For PDFs, the ``page_content``
        of every loaded page is concatenated, in the order the loader
        returns them, with no separator.

    Raises:
        ValueError: If the MIME type is neither ``application/pdf`` nor
            ``text/plain``.
    """
    file_type = file.type

    if file_type == "application/pdf":
        # PyPDFLoader is handed a filesystem path, so the upload is spilled
        # to a temporary file. The ``with`` block guarantees the handle is
        # closed (and the file removed) even if loading fails.
        with tempfile.NamedTemporaryFile() as temp:
            temp.write(file.getvalue())
            # Flush so the loader sees the full content when it opens the
            # path on its own.
            temp.flush()
            pages = PyPDFLoader(temp.name).load()
        return "".join(p.page_content for p in pages)

    if file_type == "text/plain":
        return file.read().decode()

    # Previously this fell through to ``return text`` with ``text`` unbound,
    # surfacing as an opaque UnboundLocalError.
    raise ValueError(f"Unsupported file type: {file_type!r}")

st.title("📚 RAG DEMO")
with st.sidebar:
data = st.file_uploader(label="📄 Upload Document",type=['txt'], on_change=clear_vdb)
file = st.file_uploader(label="📄 Upload Document",
type=[".txt",".pdf"],
on_change=clear_vdb
)

### populate the DB ####
os.environ["TOKENIZERS_PARALLELISM"] = "false"
Expand All @@ -51,9 +67,10 @@ def clear_vdb():

collection = vectorDB_client.get_or_create_collection(vdb_name,
embedding_function=embedding_func)
if collection.count() < 1 and data != None:
if collection.count() < 1 and file != None:
print("populating db")
raw_documents = [Document(page_content=data.getvalue().decode("utf-8"),
text = read_file(file)
raw_documents = [Document(page_content=text,
metadata={"":""})]
text_splitter = CharacterTextSplitter(separator = ".",
chunk_size=int(chunk_size),
Expand All @@ -65,7 +82,7 @@ def clear_vdb():
metadatas=doc.metadata,
documents=doc.page_content
)
if data == None:
if file == None:
print("Empty VectorDB")
else:
print("DB already populated")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ langchain
chromadb
sentence-transformers
streamlit
pypdf

0 comments on commit 477bda9

Please sign in to comment.