RAG app example #118

Merged 100 commits into main from rag-app on Dec 4, 2024

Commits
c267fb0
Create Readme.MD
init27 Nov 12, 2024
7225fc2
rag_main works for single-turn
wukaixingxp Nov 12, 2024
0032d41
multi-turn support
wukaixingxp Nov 13, 2024
c0e7b88
added persistent memory
heyjustinai Nov 13, 2024
c1bc695
removed faiss
heyjustinai Nov 13, 2024
d1d954d
included external chromadb
heyjustinai Nov 14, 2024
a3426f6
added query, implement cprint
heyjustinai Nov 14, 2024
e856916
Create ingestion_script.py
init27 Nov 15, 2024
27d34c7
Update ingestion_script.py
init27 Nov 15, 2024
529b22e
Update ingestion_script.py
init27 Nov 15, 2024
fc78f70
Update ingestion_script.py
init27 Nov 15, 2024
3140c7c
Update ingestion_script.py
init27 Nov 15, 2024
e42dac9
Update README.md
init27 Nov 15, 2024
3cc1cb8
fix doc retrieval issue, inclu requirement.txt
heyjustinai Nov 15, 2024
9b906d1
include filename in context, added debugging for query_chromadb
heyjustinai Nov 15, 2024
ccfc2db
modify embeddings to improve retrieval
heyjustinai Nov 15, 2024
cb74102
added eval dataset, change ingestion script dir and updated gitignore
heyjustinai Nov 15, 2024
8454398
change queries to reflect updated data
heyjustinai Nov 15, 2024
e5759ee
added eval and eval analysis
heyjustinai Nov 15, 2024
139a8ad
small fixes
heyjustinai Nov 15, 2024
9391044
ollama working with docker, but still need to merge
Nov 16, 2024
611278e
rebase to main
heyjustinai Dec 4, 2024
18dcfc2
Create ingest_with_image_extraction.py
init27 Nov 18, 2024
b665cc8
add script
init27 Nov 18, 2024
47e8bae
changed ollama_main to use memory_bank, simplified rag_stack_memory
heyjustinai Nov 18, 2024
537eb84
updated ollama-main
heyjustinai Nov 18, 2024
38bf2a9
add script
init27 Nov 18, 2024
f6f7f48
add how to run
wukaixingxp Nov 18, 2024
b2655e6
fix path
init27 Nov 18, 2024
9b848f2
fix path
init27 Nov 18, 2024
ea14b1b
Create caption_outputs.py
init27 Nov 18, 2024
697a45b
Update caption_outputs.py
init27 Nov 18, 2024
067cbfb
Update caption_outputs.py
init27 Nov 18, 2024
2fa2db9
Update caption_outputs.py
init27 Nov 18, 2024
808ff26
Update caption_outputs.py
init27 Nov 18, 2024
484f9d0
update
init27 Nov 18, 2024
250db1f
Update gradio_interface.py
init27 Nov 18, 2024
f5382f4
Update gradio_interface.py
init27 Nov 18, 2024
d61de72
Update gradio_interface.py
init27 Nov 18, 2024
b0c32b8
add input loop logic
wukaixingxp Nov 18, 2024
e2212e8
added bash script
heyjustinai Nov 18, 2024
31a595c
Update 02_caption_outputs.py
init27 Nov 18, 2024
36d62db
push-chat
init27 Nov 18, 2024
92bd507
Update gradio_interface copy.py
init27 Nov 18, 2024
2f080f0
push connected interface
init27 Nov 18, 2024
887be14
fixed conversation loop
heyjustinai Nov 18, 2024
f392940
add chromaDB docker
wukaixingxp Nov 19, 2024
0e7bc69
fix llama-stack version
wukaixingxp Nov 19, 2024
2a5e9db
changed docker image
wukaixingxp Nov 19, 2024
208c762
gradio UI working but slow
wukaixingxp Nov 19, 2024
36b7aaa
add streaming
init27 Nov 20, 2024
f80a150
Update README.md
init27 Nov 20, 2024
23b4760
update readme
init27 Nov 20, 2024
c323c8e
Update README.md
init27 Nov 20, 2024
4f3f174
Update README.md
init27 Nov 20, 2024
3791fe1
Update README.md
init27 Nov 20, 2024
9a55c8a
added extended chroma support
heyjustinai Nov 20, 2024
184c807
gradio UI streaming not working
wukaixingxp Nov 20, 2024
fa0cf90
add how to run readme
wukaixingxp Nov 20, 2024
1b02e07
Update README.md
init27 Nov 20, 2024
25e37a7
Update README.md
init27 Nov 20, 2024
a09a5d9
Update README.md
init27 Nov 20, 2024
a520fae
implemented streaming, logic to frontload db, and checking db
heyjustinai Nov 20, 2024
459e004
code cleanup
heyjustinai Nov 20, 2024
f940eac
update gitignore
heyjustinai Nov 20, 2024
fcc41b4
Update 01_ingest_files.py
init27 Nov 20, 2024
f0ab5cd
Update 02_caption_outputs.py
init27 Nov 20, 2024
2d8f989
abstraction for hostname, port and dir
heyjustinai Nov 20, 2024
caaf788
code clean up, deleted unused files
heyjustinai Nov 20, 2024
b57c87d
code clean up, deleted unused files
heyjustinai Nov 20, 2024
4f2ee2e
Changes req by Kai
init27 Nov 20, 2024
14052fb
Update 01_ingest_files.py
init27 Nov 20, 2024
43d4202
kind of working
wukaixingxp Nov 20, 2024
88f1d63
Update gradio_interface.py
init27 Nov 20, 2024
7322d7c
Revert "Update gradio_interface.py"
init27 Nov 20, 2024
f28d921
wip-eval: trying to get it work with current stack
heyjustinai Nov 20, 2024
545f942
stop ingest when there is output folder
wukaixingxp Nov 20, 2024
dda9b78
Update README.md
dltn Nov 21, 2024
7ae6645
modified eval for 0.0.53
heyjustinai Nov 21, 2024
2e269b5
undo changes on example.agent.rag_with_memory_bank
heyjustinai Nov 21, 2024
c6380f8
changes made before PR review, stable branch
heyjustinai Nov 23, 2024
6bc2164
removed default value for memory tool, modified gitignore and created…
heyjustinai Nov 25, 2024
7f9c8c2
changes to run locally v0.55
heyjustinai Nov 26, 2024
51fdd3d
change handling of eventlog and streaming
heyjustinai Nov 26, 2024
92bc703
move requirements to proj dir
heyjustinai Nov 26, 2024
1939b72
removed unnecesary requirement.txt from root
heyjustinai Nov 26, 2024
ed20291
updated eval
heyjustinai Nov 26, 2024
633c50c
include readme for eval
heyjustinai Nov 26, 2024
30619b6
remove external chroma, using only memorybank
heyjustinai Nov 26, 2024
0bc1257
changed to app.py and add GPU flag
wukaixingxp Nov 26, 2024
70e5678
folder restructure, update scripts, change readme
heyjustinai Dec 2, 2024
8d699f2
made changes to UI, gradio, seperating retrievecontext and inference
heyjustinai Dec 2, 2024
ceb10a8
change readme and workflow diagram to simple version
wukaixingxp Dec 2, 2024
504bf44
minor changes in app.py
heyjustinai Dec 2, 2024
54a1919
change app.py, change docker compose to use ollama 0.56, updated readme
heyjustinai Dec 2, 2024
4dde367
added model table
heyjustinai Dec 3, 2024
c95d48e
add GPU compose.yaml
wukaixingxp Dec 3, 2024
860febe
changed var naming and chromaDB docker
wukaixingxp Dec 3, 2024
8ad89ad
update requirement.txt
heyjustinai Dec 4, 2024
80a86f3
Merge branch 'main' into rag-app
heyjustinai Dec 4, 2024
14 changes: 14 additions & 0 deletions .gitignore
@@ -167,3 +167,17 @@ xcuserdata/
*.pte
*.model
Package.resolved

#embeddings
**/faiss_index.index
**/faiss_index.index.json
**/chroma


# DocQA
/examples/DocQA/data/input
/examples/DocQA/data/output
**/.gradio
**/RAG_service.json
examples/DocQA/example_data/chroma.sqlite3
examples/DocQA/example_data/**/*.bin
51 changes: 51 additions & 0 deletions examples/DocQA/README.md
@@ -0,0 +1,51 @@
## DocQA

This is an end-to-end Retrieval-Augmented Generation (RAG) app built on llama-stack. It handles the full pipeline: ingesting documents, storing them in a vector database, and serving a chat interface for inference.

We first describe how to run the app, then outline how it works:

### Prerequisites:

Install Docker: see the installation guides for [Mac](https://docs.docker.com/desktop/setup/install/mac-install/), [Windows](https://docs.docker.com/desktop/setup/install/windows-install/), and [Linux](https://docs.docker.com/engine/install/).

On Mac and Windows, you need to start the Docker Desktop app manually after installation.
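
To confirm Docker is installed and the daemon is running (a generic check, not specific to this app):

```bash
docker info  # prints server details if the daemon is up, and errors otherwise
```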

### How to run the pipeline:

![RAG_workflow](./data/assets/DocQA.png)

The diagram above shows the workflow of this RAG app. To run it, follow these steps:

1. Copy the template configuration file `docqa_env_template` to create your own `docqv_env` inside the `docker` folder:

```bash
cd docker
cp docqa_env_template docqv_env
```

2. Next, update `MODEL_NAME` and `DOC_PATH` in your `docqv_env`, for example:

```
DOC_PATH=/path/to/your/llama-stack-apps/examples/DocQA/example_data
MODEL_NAME=llama3.2:1b-instruct-fp16
HOST=localhost
LLAMA_STACK_PORT=5000
CHROMA_PORT=6000
GRADIO_SERVER_PORT=7860
USE_GPU_FOR_DOC_INGESTION=false
```

3. In the `docker` folder, run the following command:

```bash
bash run_RAG.sh
```
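
The script brings up the full stack. If you are curious what it does, the launch is roughly equivalent to the sketch below (an approximation inferred from this PR's compose-related commits, not the actual script contents):

```bash
# Hypothetical approximation of run_RAG.sh; the real script may do more setup
docker compose --env-file docqv_env up --build
```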

4. Once the service is ready, open http://localhost:7861/ (or whichever `GRADIO_SERVER_PORT` you configured) in your browser to chat with your documents.

### Overview of how the RAG app works:

1. We use the [docling](https://github.com/DS4SD/docling) framework to handle multiple input file formats (PDF, PPTX, DOCX); a minimal ingestion sketch follows this list.
2. If you are running on a GPU, there is an option to use `Llama-3.2-11B-Vision` to caption images found in the documents. On a CPU-only machine this step is skipped.
3. A llama-stack distribution running ChromaDB and `Llama-3.2-3B-Instruct` then chunks the parsed documents and inserts them into a memory bank.
4. Once the vector database is populated, we use llama-stack with `Llama-3.2-3B-Instruct` to chat over the retrieved context.
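
For reference, here is a minimal sketch of the docling ingestion step (illustrative only: it assumes docling's `DocumentConverter` API, and the directory names are placeholders):

```python
from pathlib import Path

from docling.document_converter import DocumentConverter

converter = DocumentConverter()
out_dir = Path("output")
out_dir.mkdir(exist_ok=True)

for source in Path("example_data").glob("*.pdf"):
    # Parse the source document (docling also handles PPTX and DOCX)
    result = converter.convert(source)
    # Export to Markdown for downstream chunking and memory-bank insertion
    (out_dir / f"{source.stem}.md").write_text(result.document.export_to_markdown())
```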
301 changes: 301 additions & 0 deletions examples/DocQA/app.py
@@ -0,0 +1,301 @@
import asyncio
import os
import re
import uuid
from typing import Generator, List

import gradio as gr
from dotenv import load_dotenv
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client.types.agent_create_params import AgentConfig
from llama_stack_client.types.memory_insert_params import Document


# Load environment variables
load_dotenv()

HOST = os.getenv("HOST", "localhost")
LLAMA_STACK_PORT = int(os.getenv("LLAMA_STACK_PORT", "5000"))
GRADIO_SERVER_PORT = int(os.getenv("GRADIO_SERVER_PORT", "7861"))
# os.getenv returns strings; parse explicitly so the string "false" is not truthy
USE_GPU_FOR_DOC_INGESTION = (
    os.getenv("USE_GPU_FOR_DOC_INGESTION", "false").lower() == "true"
)
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.2-1B-Instruct")
# if USE_GPU_FOR_DOC_INGESTION is set, ingested documents are written to the output folder
DOCS_DIR = "/root/rag_data/output" if USE_GPU_FOR_DOC_INGESTION else "/root/rag_data/"

CUSTOM_CSS = """
.context-block {
font-size: 0.8em;
border-left: 3px solid #e9ecef;
margin: 0.5em 0;
padding: 0.5em 1em;
opacity: 0.85;
}

.context-title {
font-size: 0.8em;
color: #9ca3af;
font-weight: 400;
display: flex;
align-items: center;
gap: 0.5em;
margin-bottom: 0.3em;
text-transform: uppercase;
letter-spacing: 0.05em;
}

.context-title::before {
content: "📄";
font-size: 1em;
opacity: 0.7;
}

.context-content {
color: #6b7280;
line-height: 1.4;
font-weight: 400;
}

.inference-response {
font-size: 1em;
color: #111827;
line-height: 1.5;
margin-top: 1em;
}
"""


class LlamaChatInterface:
def __init__(self, host: str, port: int, docs_dir: str):
self.host = host
self.port = port
self.docs_dir = docs_dir
self.client = LlamaStackClient(base_url=f"http://{host}:{port}")
self.agent = None
self.session_id = None
self.memory_bank_id = "docqa_bank"

async def initialize_system(self):
"""Initialize the entire system including memory bank and agent."""
await self.setup_memory_bank()
await self.initialize_agent()

async def setup_memory_bank(self):
"""Set up the memory bank if it doesn't exist."""
providers = self.client.providers.list()
provider_id = providers["memory"][0].provider_id
memory_banks = self.client.memory_banks.list()
print(f"Memory banks: {memory_banks}")

# Check if memory bank exists by identifier
if any(bank.identifier == self.memory_bank_id for bank in memory_banks):
print(f"Memory bank '{self.memory_bank_id}' exists.")
else:
print(f"Memory bank '{self.memory_bank_id}' does not exist. Creating...")
self.client.memory_banks.register(
memory_bank_id=self.memory_bank_id,
params={
"embedding_model": "all-MiniLM-L6-v2",
"chunk_size_in_tokens": 100,
"overlap_size_in_tokens": 10,
},
provider_id=provider_id,
)
await self.load_documents()
            print("Memory bank registered.")

async def load_documents(self):
"""Load documents from the specified directory into memory bank."""
documents = []
for filename in os.listdir(self.docs_dir):
if filename.endswith((".txt", ".md")):
file_path = os.path.join(self.docs_dir, filename)
with open(file_path, "r", encoding="utf-8") as file:
content = file.read()
document = Document(
document_id=filename,
content=content,
mime_type="text/plain",
metadata={"filename": filename},
)
documents.append(document)

if documents:
self.client.memory.insert(
bank_id=self.memory_bank_id,
documents=documents,
)
print(f"Loaded {len(documents)} documents from {self.docs_dir}")

async def initialize_agent(self):
"""Initialize the agent with model registration and configuration."""

if "1b" in MODEL_NAME:
model_name = "Llama3.2-1B-Instruct"
elif "3b" in MODEL_NAME:
model_name = "Llama3.2-3B-Instruct"
elif "8b" in MODEL_NAME:
model_name = "Llama3.1-8B-Instruct"
else:
model_name = MODEL_NAME

agent_config = AgentConfig(
model=model_name,
            instructions="You are a helpful assistant that answers questions based on the provided documents. Keep your answers short and concise, under 50 words.",
sampling_params={"strategy": "greedy", "temperature": 1.0, "top_p": 0.9},
tools=[
{
"type": "memory",
"memory_bank_configs": [
{"bank_id": self.memory_bank_id, "type": "vector"}
],
"max_tokens_in_context": 300,
"max_chunks": 5,
}
],
tool_choice="auto",
tool_prompt_format="json",
enable_session_persistence=True,
)
self.agent = Agent(self.client, agent_config)
self.session_id = self.agent.create_session(f"session-{uuid.uuid4()}")

def chat_stream(
self, message: str, history: List[List[str]]
) -> Generator[List[List[str]], None, None]:
"""Stream chat responses token by token with proper history handling."""

history = history or []
history.append([message, ""])

if self.agent is None:
asyncio.run(self.initialize_system())

response = self.agent.create_turn(
messages=[{"role": "user", "content": message}],
session_id=self.session_id,
)

current_response = ""
context_shown = False
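        # EventLogger yields incremental events from the agent turn; the memory
        # tool's retrieval shows up as a single "Retrieved context from banks"
        # entry, which is rendered once as a styled block ahead of the answer.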

for log in EventLogger().log(response):
log.print()
if hasattr(log, "content"):
# Format context blocks if present
if not context_shown and "Retrieved context from banks" in str(log):
context = self.format_context(str(log))
current_response = context + current_response
context_shown = True
else:
current_response += log.content

history[-1][1] = current_response
yield history.copy()

def format_context(self, log_str: str) -> str:
"""Format the context block with custom styling."""
# Extract context and clean up the markers
context_match = re.search(
r"Retrieved context from banks:.*?\n(.*?===.*?===.*?)(?=\n>|$)",
log_str,
re.DOTALL,
)
if context_match:
context = context_match.group(1).strip()
# Remove the marker lines
context = re.sub(
r"====\s*Here are the retrieved documents for relevant context:\s*===\s*START-RETRIEVED-CONTEXT\s*===\s*",
"",
context,
flags=re.IGNORECASE,
)
return f"""
<div class="context-block">
<div class="context-title">Retrieved Context</div>
<div class="context-content">{context}</div>
</div>
"""
return ""


def create_gradio_interface(
host: str = HOST,
port: int = LLAMA_STACK_PORT,
docs_dir: str = DOCS_DIR,
):
chat_interface = LlamaChatInterface(host, port, docs_dir)

with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as interface:
gr.Markdown("# LlamaStack Chat")

chatbot = gr.Chatbot(
bubble_full_width=False,
show_label=False,
height=400,
container=True,
render_markdown=True,
)
msg = gr.Textbox(
label="Message",
placeholder="Type your message here...",
show_label=False,
container=False,
)
with gr.Row():
submit = gr.Button("Send", variant="primary")
clear = gr.Button("Clear")

gr.Examples(
examples=[
"What topics are covered in the documents?",
"Can you summarize the main points?",
"Tell me more about specific details in the text.",
],
inputs=msg,
)

def clear_chat():
return [], ""

submit_event = msg.submit(
fn=chat_interface.chat_stream,
inputs=[msg, chatbot],
outputs=chatbot,
queue=True,
).then(
fn=lambda: "",
outputs=msg,
)

submit_click = submit.click(
fn=chat_interface.chat_stream,
inputs=[msg, chatbot],
outputs=chatbot,
queue=True,
).then(
fn=lambda: "",
outputs=msg,
)

clear.click(clear_chat, outputs=[chatbot, msg], queue=False)

msg.submit(lambda: None, None, None, api_name=False)
interface.load(fn=chat_interface.initialize_system)

return interface


if __name__ == "__main__":
# Create and launch the Gradio interface
interface = create_gradio_interface()
interface.launch(
server_name=HOST, server_port=GRADIO_SERVER_PORT, share=True, debug=True
)
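
For local debugging outside Docker, `app.py` can presumably also be launched directly, assuming the llama-stack and Chroma services are already running and the variables from `docqv_env` are exported (a sketch; the requirements filename follows this PR's commit messages):

```bash
pip install -r examples/DocQA/requirements.txt  # named requirement.txt in some commits
python examples/DocQA/app.py
```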
Binary file added examples/DocQA/data/assets/DocQA.png
14 changes: 14 additions & 0 deletions examples/DocQA/data/eval/.env.template
@@ -0,0 +1,14 @@
# Server Configuration
HOST=localhost
PORT=5000
CHROMA_PORT=8000

# Model and Memory Configuration
MODEL_NAME=meta-llama/Llama-3.2-3B-Instruct
MEMORY_BANK_ID=eval_bank

# File Paths
DOCS_DIR=../output

# Optional: Add your API keys here if needed
# OPENAI_API_KEY=your_api_key_here