Enhancements & Refactored Code (#10)
* Fix: Button loading state not reset on 'back'

* Feat: search post processing to filter by average score

* Feat: Updated api status error handling

* Feat: Updated error handling & timeout duration

* Refactored imports & moved global vars to constants.py

* Feat: Updated dockerfile to install llama-cpp-python with openblas support by default

* Add .env file and update gitignore, pipeline name, robots.txt, middleware, layout, page, sitemap, and navlink components

* Fixed Pipeline Name

* Updated Check File Size workflow
xKhronoz authored Jan 30, 2024
1 parent 73a5bd8 commit f730525
Showing 27 changed files with 539 additions and 221 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/check-file-size.yml
@@ -12,6 +12,8 @@ jobs:
  check-file-size:
    runs-on: ubuntu-latest
    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
      - name: Check large files
        uses: ppremk/[email protected]
        with:
3 changes: 2 additions & 1 deletion .github/workflows/pipeline.yml
@@ -1,11 +1,12 @@
-name: Test Build and Deploy
+name: Pipeline

on:
  push:
    branches: [main]
  # to run this workflow manually from the Actions tab
  workflow_dispatch:

+# Test, Build and Deploy the app
jobs:
  check-file-size:
    uses: ./.github/workflows/check-file-size.yml
35 changes: 23 additions & 12 deletions Dockerfile
@@ -5,15 +5,18 @@ FROM nvidia/cuda:${CUDA_IMAGE}
# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user

-# Install the dependencies
+# Install the dependencies & clean up
RUN apt-get update && apt-get upgrade -y \
    && apt-get install -y git build-essential \
    python3.11 gcc wget \
    ocl-icd-opencl-dev opencl-headers clinfo \
    cmake protobuf-compiler pkg-config \
    libclblast-dev libopenblas-dev \
    liblapack-dev liblapacke-dev libeigen3-dev libboost-all-dev \
-    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
+    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd \
+    # Cleaning cache:
+    && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
+    && apt-get clean -y && rm -rf /var/lib/apt/lists/*

# Install pip for python 3.11
RUN wget https://bootstrap.pypa.io/get-pip.py && \
@@ -23,21 +26,25 @@ RUN wget https://bootstrap.pypa.io/get-pip.py && \
# Switch to the user 'user'
USER user

-# Setting build related env vars
-ENV CUDA_DOCKER_ARCH=all
-ENV LLAMA_CUBLAS=1
-
-# Set home to the user's home directory and Poetry's environment variables
-ENV HOME=/home/user \
-    PATH=/home/user/.local/bin:$PATH \
+# Setting build / container related env vars
+ENV CUDA_DOCKER_ARCH=all \
+    LLAMA_CUBLAS=1 \
+    # Set home to the user's home directory and Poetry's environment variables
+    HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH \
    PYTHONUNBUFFERED=1 \
    POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
    POETRY_VIRTUALENVS_CREATE=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache \
-    # Build llama-cpp-python with default cuda support
-    CMAKE_ARGS="-DLLAMA_CUBLAS=on"
-    # CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
+    # Set the uvicorn env
+    ENVIRONMENT=prod \
+    ##########################################################
+    # Build llama-cpp-python with cuda support
+    # CMAKE_ARGS="-DLLAMA_CUBLAS=on"
+    # Build llama-cpp-python with openblas support on CPU
+    CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
+    ##########################################################

# Set the working directory to /app
WORKDIR $HOME/app
@@ -61,4 +68,8 @@ RUN poetry install --without dev,torch-cpu && \
# Change to the package directory
WORKDIR $HOME/app/backend

+# Make port 8000 available to the world outside this container
+EXPOSE 8000
+
+# Run the app when the container launches
CMD ["poetry", "run", "uvicorn", "main:app", "--host", "0.0.0.0"]
1 change: 1 addition & 0 deletions backend/.gitignore
@@ -1,2 +1,3 @@
__pycache__
storage
+.env
5 changes: 3 additions & 2 deletions backend/backend/app/api/routers/chat.py
@@ -1,8 +1,6 @@
import logging
from typing import List

-from app.utils.index import get_index
-from app.utils.json import json_to_model
from fastapi import APIRouter, Depends, HTTPException, Request, status
from fastapi.responses import StreamingResponse
from fastapi.websockets import WebSocketDisconnect
@@ -13,6 +11,9 @@
from llama_index.prompts import PromptTemplate
from pydantic import BaseModel

+from backend.app.utils.index import get_index
+from backend.app.utils.json import json_to_model
+
chat_router = r = APIRouter()

"""
5 changes: 3 additions & 2 deletions backend/backend/app/api/routers/query.py
@@ -1,15 +1,16 @@
import logging
from typing import List

-from app.utils.index import get_index
-from app.utils.json import json_to_model
from fastapi import APIRouter, Depends, HTTPException, Request, status
from fastapi.responses import StreamingResponse
from fastapi.websockets import WebSocketDisconnect
from llama_index import VectorStoreIndex
from llama_index.llms.types import MessageRole
from pydantic import BaseModel

+from backend.app.utils.index import get_index
+from backend.app.utils.json import json_to_model
+
query_router = r = APIRouter()

"""
18 changes: 11 additions & 7 deletions backend/backend/app/api/routers/search.py
@@ -1,12 +1,13 @@
import logging
import re

-from app.utils.index import get_index
from fastapi import APIRouter, Depends, HTTPException, Request, status
from llama_index import VectorStoreIndex
from llama_index.postprocessor import SimilarityPostprocessor
from llama_index.retrievers import VectorIndexRetriever

+from backend.app.utils.index import get_index
+
search_router = r = APIRouter()

"""
@@ -36,17 +37,22 @@ async def search(
        index=index,
        similarity_top_k=10,
    )
-    # similarity postprocessor: filter nodes below 0.45 similarity score
-    node_postprocessor = SimilarityPostprocessor(similarity_cutoff=0.45)

    # retrieve results
    query_results = retriever.retrieve(query)

    query_results_scores = [result.get_score() for result in query_results]

+    # get average score
+    average_score = sum(query_results_scores) / len(query_results_scores)
+
    logger.info(f"Search results similarity score: {query_results_scores}")
+    logger.info(f"Average similarity score: {average_score}")

+    # similarity postprocessor: filter nodes below the average similarity score
+    node_postprocessor = SimilarityPostprocessor(similarity_cutoff=average_score)
+
-    # postprocess results
+    # postprocess results based on average score
    filtered_results = node_postprocessor.postprocess_nodes(query_results)

    filtered_results_scores = [result.get_score() for result in filtered_results]
@@ -68,9 +74,7 @@ async def search(
"^_+ | _+$", "", node_dict["text"]
) # remove leading and trailing underscores
data["text"] = cleaned_text
data["similarity_score"] = round(
node.get_score(), 2
) # round to 2 decimal places
data["similarity_score"] = node.get_score()
response.append(data)
id += 1

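The search change above replaces the fixed 0.45 similarity cutoff with a dynamic one: the retriever's top-10 scores are averaged, and that mean becomes the SimilarityPostprocessor cutoff, so hits scoring below the average of their own batch are dropped. Here is a minimal standalone sketch of the same idea in plain Python (ScoredNode and filter_by_average_score are illustrative names, not the app's actual code; note the empty-list guard, which the diff itself does not have):

# Standalone sketch of the average-score filter introduced in search.py.
# ScoredNode stands in for llama_index's NodeWithScore; only the scoring logic matters here.
from dataclasses import dataclass
from typing import List


@dataclass
class ScoredNode:
    text: str
    score: float

    def get_score(self) -> float:
        return self.score


def filter_by_average_score(results: List[ScoredNode]) -> List[ScoredNode]:
    """Keep only results whose similarity score is at least the batch mean."""
    if not results:
        # Guard: the diff divides unconditionally, so an empty retrieval would raise ZeroDivisionError.
        return []
    average = sum(r.get_score() for r in results) / len(results)
    return [r for r in results if r.get_score() >= average]


hits = [ScoredNode("a", 0.82), ScoredNode("b", 0.61), ScoredNode("c", 0.44)]
print([h.text for h in filter_by_average_score(hits)])  # mean is about 0.62, so only "a" survives

One trade-off worth noting: an average cutoff always discards part of every result set, even when all hits are strong, whereas the old fixed 0.45 cutoff could keep or reject a whole batch.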
39 changes: 39 additions & 0 deletions backend/backend/app/utils/contants.py
@@ -0,0 +1,39 @@
########################################################################
# Model Constants for the backend app #
########################################################################
from pathlib import Path

from torch.cuda import is_available as is_cuda_available

# Model Constants
MAX_NEW_TOKENS = 4096
CONTEXT_SIZE = MAX_NEW_TOKENS
DEVICE_TYPE = "cuda" if is_cuda_available() else "cpu"

# Get the current directory
CUR_DIR = Path.cwd()

STORAGE_DIR = str(CUR_DIR / "storage") # directory to cache the generated index
DATA_DIR = str(CUR_DIR / "data") # directory containing the documents to index

# LLM Model Constants
LLM_MODEL_URL = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf"
# Model Kwargs
# set to at least 1 to use GPU, adjust according to your GPU memory, but must be able to fit the model
MODEL_KWARGS = {"n_gpu_layers": 100} if DEVICE_TYPE == "cuda" else {}

# Service Context Constants
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

# Embedding Model Constants
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
EMBED_POOLING = "mean"

# Prompt Helper Constants
# set maximum input size
CHUNK_SIZE_LIMIT = MAX_NEW_TOKENS
# set number of output tokens
NUM_OUTPUT = 256
# set maximum chunk overlap ratio
CHUNK_OVERLAP_RATIO = 0.2
54 changes: 25 additions & 29 deletions backend/backend/app/utils/index.py
@@ -1,6 +1,5 @@
import logging
import os
-from pathlib import Path

from llama_index import (
PromptHelper,
@@ -17,37 +16,34 @@
    completion_to_prompt,
    messages_to_prompt,
)
-from torch.cuda import is_available as is_cuda_available
-
-MAX_NEW_TOKENS = 4096
-CONTEXT_SIZE = MAX_NEW_TOKENS
-MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGUF"
-DEVICE_TYPE = "cuda" if is_cuda_available() else "cpu"
-
-# Get the current directory
-current_directory = Path.cwd()
-
-STORAGE_DIR = str(
-    current_directory / "storage"
-)  # directory to cache the generated index
-DATA_DIR = str(
-    current_directory / "data"
-)  # directory containing the documents to index
-
-# set to at least 1 to use GPU, adjust according to your GPU memory, but must be able to fit the model
-model_kwargs = {"n_gpu_layers": 100} if DEVICE_TYPE == "cuda" else {}
+from backend.app.utils.contants import (
+    CHUNK_OVERLAP,
+    CHUNK_OVERLAP_RATIO,
+    CHUNK_SIZE,
+    CHUNK_SIZE_LIMIT,
+    CONTEXT_SIZE,
+    DATA_DIR,
+    DEVICE_TYPE,
+    EMBED_MODEL_NAME,
+    EMBED_POOLING,
+    LLM_MODEL_URL,
+    MAX_NEW_TOKENS,
+    MODEL_KWARGS,
+    NUM_OUTPUT,
+    STORAGE_DIR,
+)

llm = LlamaCPP(
-    model_url="https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf",
+    model_url=LLM_MODEL_URL,
    temperature=0.1,
    max_new_tokens=MAX_NEW_TOKENS,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=CONTEXT_SIZE,
    # kwargs to pass to __call__()
    # generate_kwargs={},
    # kwargs to pass to __init__()
-    model_kwargs=model_kwargs,
+    model_kwargs=MODEL_KWARGS,
    # transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
@@ -63,22 +59,22 @@
max_chunk_overlap = 0.2

embed_model = HuggingFaceEmbedding(
-    model_name="sentence-transformers/all-MiniLM-L6-v2",
-    pooling="mean",
+    model_name=EMBED_MODEL_NAME,
+    pooling=EMBED_POOLING,
    device=DEVICE_TYPE,
)

prompt_helper = PromptHelper(
-    chunk_size_limit=4096,
-    chunk_overlap_ratio=0.2,
-    num_output=256,
+    chunk_size_limit=CHUNK_SIZE_LIMIT,
+    chunk_overlap_ratio=CHUNK_OVERLAP_RATIO,
+    num_output=NUM_OUTPUT,
)

service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
-    chunk_size=1000,
-    chunk_overlap=100,
+    chunk_size=CHUNK_SIZE,
+    chunk_overlap=CHUNK_OVERLAP,
    prompt_helper=prompt_helper,
)

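The bodies of create_index and get_index sit outside this hunk, so only their imports changed here. For context, a typical llama_index (~0.9) persistence pattern that such helpers follow, sketched under assumption rather than copied from the repo, caches the vector index in STORAGE_DIR and rebuilds it from DATA_DIR when the cache is missing:

# Hypothetical sketch of the create/load pattern behind get_index(); not the repo's verbatim code.
import logging
import os

from llama_index import (
    ServiceContext,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)

from backend.app.utils.contants import DATA_DIR, STORAGE_DIR

logger = logging.getLogger("uvicorn")


def get_index(service_context: ServiceContext) -> VectorStoreIndex:
    if not os.path.exists(STORAGE_DIR):
        # First run: embed the documents in DATA_DIR and cache the index.
        documents = SimpleDirectoryReader(DATA_DIR).load_data()
        index = VectorStoreIndex.from_documents(documents, service_context=service_context)
        index.storage_context.persist(STORAGE_DIR)
        logger.info(f"Built new index and stored it in {STORAGE_DIR}")
        return index
    # Later runs: load the cached index instead of re-embedding everything.
    storage_context = StorageContext.from_defaults(persist_dir=STORAGE_DIR)
    return load_index_from_storage(storage_context, service_context=service_context)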
26 changes: 12 additions & 14 deletions backend/backend/main.py
@@ -1,29 +1,31 @@
import logging
import os

-from app.api.routers.chat import chat_router
-from app.api.routers.healthcheck import healthcheck_router
-from app.api.routers.query import query_router
-from app.api.routers.search import search_router
-from app.utils.index import create_index
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from torch.cuda import is_available as is_cuda_available

+from backend.app.api.routers.chat import chat_router
+from backend.app.api.routers.healthcheck import healthcheck_router
+from backend.app.api.routers.query import query_router
+from backend.app.api.routers.search import search_router
+from backend.app.utils.index import create_index
+
load_dotenv()

app = FastAPI()

environment = os.getenv("ENVIRONMENT", "dev")  # Default to 'dev' if not set

-# TODO: Add reading allowed origins from environment variables
+# Add allowed origins from environment variables
+allowed_origins = os.getenv("ALLOWED_ORIGINS", "*")

if environment == "dev":
    logger = logging.getLogger("uvicorn")
    logger.warning("Running in development mode - allowing CORS for all origins")
    app.add_middleware(
-        CORSMiddleware,
+        middleware_class=CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
@@ -32,19 +34,15 @@

if environment == "prod":
# In production, specify the allowed origins
allowed_origins = [
"https://your-production-domain.com",
"https://another-production-domain.com",
# Add more allowed origins as needed
]
allowed_origins = allowed_origins.split(",") if allowed_origins != "*" else ["*"]

logger = logging.getLogger("uvicorn")
logger.info(f"Running in production mode - allowing CORS for {allowed_origins}")
app.add_middleware(
CORSMiddleware,
middleware_class=CORSMiddleware,
allow_origins=allowed_origins,
allow_credentials=True,
allow_methods=["GET", "POST", "PUT", "DELETE"],
allow_methods=["GET", "POST"],
allow_headers=["*"],
)

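With this change the production CORS whitelist comes from the ALLOWED_ORIGINS environment variable (comma-separated; see example.env below) instead of hard-coded placeholder domains. The parsing rule in isolation, written as a hypothetical helper:

# Hypothetical helper isolating main.py's ALLOWED_ORIGINS parsing rule.
import os
from typing import List


def parse_allowed_origins(raw: str) -> List[str]:
    """'*' keeps CORS open to every origin; anything else is a comma-separated whitelist."""
    return raw.split(",") if raw != "*" else ["*"]


os.environ["ALLOWED_ORIGINS"] = "http://localhost:3000,https://example.com"
print(parse_allowed_origins(os.getenv("ALLOWED_ORIGINS", "*")))
# -> ['http://localhost:3000', 'https://example.com']

Note that the split does not strip whitespace, so a value like "a.com, b.com" would yield a leading-space origin that fails the exact-match CORS check; trimming each entry would harden this.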
1 change: 1 addition & 0 deletions backend/example.env
@@ -0,0 +1 @@
ALLOWED_ORIGINS=http://localhost:3000
2 changes: 1 addition & 1 deletion frontend/app/about/page.tsx
@@ -3,7 +3,7 @@
export default function About() {

return (
<div className="rounded-xl shadow-xl p-4 mb-8 max-w-5xl w-full">
<div className="rounded-xl shadow-xl p-4 max-w-5xl w-full">
<div className="max-w-2xl mx-auto p-4">
<div className="bg-gradient-to-r from-blue-500 to-indigo-500 text-white p-8 rounded-lg shadow-lg">
<h1 className="text-2xl md:text-4xl font-bold mb-4">About Smart Retrieval</h1>