-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcoffee.py
110 lines (91 loc) · 3.74 KB
/
coffee.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import streamlit as st
import os
import re
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFDirectoryLoader
from dotenv import load_dotenv
import time
# Run python-dotenv's loader before the key is looked up below.
load_dotenv()

# Load the NVIDIA API key and surface a visible error in the UI when it is
# missing or empty. The script keeps running either way.
api_key = os.getenv("NVIDIA_API_KEY")
if not api_key:
    st.error("NVIDIA_API_KEY not found in environment variables.")
else:
    os.environ['NVIDIA_API_KEY'] = api_key
# Matches every run of characters outside the 7-bit ASCII range (0x00-0x7F).
_NON_ASCII_RUN = re.compile(r'[^\x00-\x7F]+')


def clean_text(text):
    """Return *text* with every non-ASCII character removed.

    Characters are dropped, not transliterated: an accented letter such as
    "é" disappears entirely rather than becoming "e". ASCII characters,
    including whitespace and control characters, pass through unchanged.
    """
    return _NON_ASCII_RUN.sub('', text)
def vector_embedding() -> None:
    """Build the FAISS index for the PDFs under ``./coffee`` and cache it.

    Does nothing when ``st.session_state`` already holds a ``vectors`` entry.
    Otherwise it loads the PDFs, strips non-ASCII characters from every page
    with ``clean_text``, splits the pages into 700-character chunks with a
    50-character overlap, embeds them with ``NVIDIAEmbeddings`` and stores
    the resulting FAISS store in ``st.session_state.vectors``.

    The intermediate objects (``embeddings``, ``loader``, ``docs``,
    ``text_splitter``, ``final_documents``) are kept in ``st.session_state``
    as well. On any failure an error is shown with ``st.error`` and
    ``st.session_state.vectors`` is left unset, so callers can test for that
    key to tell whether the index is usable and a later call can retry.
    """
    if "vectors" in st.session_state:
        return

    st.session_state.embeddings = NVIDIAEmbeddings()
    st.session_state.loader = PyPDFDirectoryLoader("./coffee")  # Data Ingestion
    st.session_state.docs = st.session_state.loader.load()  # Document Loading
    if not st.session_state.docs:
        st.error("No documents loaded.")
        return

    # Clean the text of each document in place before chunking.
    for doc in st.session_state.docs:
        doc.page_content = clean_text(doc.page_content)

    st.session_state.text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=700, chunk_overlap=50
    )  # Chunk Creation
    st.session_state.final_documents = st.session_state.text_splitter.split_documents(
        st.session_state.docs
    )  # Splitting
    if not st.session_state.final_documents:
        st.error("No documents after splitting.")
        return

    # Build into a local first: publishing an unusable value under "vectors"
    # would make the guard at the top skip every later rebuild attempt.
    try:
        vectors = FAISS.from_documents(
            st.session_state.final_documents, st.session_state.embeddings
        )  # Vector embeddings
    except Exception as exc:
        # UI boundary: embedding failures (presumably network/auth/API
        # errors) are expected to arrive as exceptions, so report them here
        # instead of letting the app crash with a traceback.
        st.error(f"Vector embedding failed: {exc}")
        return
    if not vectors:
        st.error("Vector embedding failed.")
        return

    st.session_state.vectors = vectors
# Inline stylesheet defining the class that centers the page heading.
_TITLE_STYLE = """
<style>
.centered-title {
text-align: center;
}
</style>
"""
st.markdown(_TITLE_STYLE, unsafe_allow_html=True)

# The heading is emitted as raw HTML so it can carry the centering class.
_TITLE_HTML = '<h1 class="centered-title">Art of Coffee</h1>'
st.markdown(_TITLE_HTML, unsafe_allow_html=True)

# Chat model that answers the questions; created before the image is drawn.
llm = ChatNVIDIA(model="meta/llama3-70b-instruct")
st.image('coffee.jpg')

# Prompt template with two placeholders: {context} for the document text
# and {input} for the user's question.
_QA_TEMPLATE = """
Answer the questions based on the provided context only.
Please provide the most accurate response based on the question.
<context>
{context}
</context>
Questions: {input}
"""
prompt = ChatPromptTemplate.from_template(_QA_TEMPLATE)
# Free-text question from the user; falsy until something has been typed.
prompt1 = st.text_input("Iced, hot or coffee cocktail?")

# The vector index is built on demand when the user presses the button.
if st.button("Brew me first"):
    vector_embedding()
    # vector_embedding() shows its own error and leaves "vectors" unset when
    # it bails out early, so only announce success if the index exists.
    if "vectors" in st.session_state:
        st.write("Your coffee is ready")

if prompt1:
    if "vectors" not in st.session_state:
        st.error("Vectors are not initialized. Please click 'Brew me first'.")
    else:
        # Retrieval chain: the retriever fetches chunks from the vector store
        # and the stuff-documents chain passes them to the LLM via `prompt`.
        document_chain = create_stuff_documents_chain(llm, prompt)
        retriever = st.session_state.vectors.as_retriever()
        retrieval_chain = create_retrieval_chain(retriever, document_chain)

        # Wall-clock timing: time.process_time() counts only CPU time and
        # would exclude time spent waiting on the model call.
        start = time.perf_counter()
        response = retrieval_chain.invoke({'input': prompt1})
        st.write(f"Response time: {time.perf_counter() - start:.2f} seconds")
        st.write(response['answer'])

        # Show the retrieved chunks that were used as context for the answer.
        with st.expander("Similar Coffee Recipes"):
            for doc in response["context"]:
                st.write(doc.page_content)
                st.write("-------------------------------")