Skip to content

Commit

Permalink
V.0.3.0 (#28)
Browse files Browse the repository at this point in the history
* Up Version to 0.3.0 as 2 new features are added

* Updated profile, signout/signin with skeleton loading

* Refactored code

* Added Admin Page & Navlink to Admin Page

* Overhauled Q&A for document upload components & api routes

* Added SweetAlert2 Package

* Added API route for getting user public collections requests

* Update route return error & removed redundant API Calls

* Added handles for buttons and fetching of data

* Upgraded Next.js from 13 to 14 & packages

* Added api route methods for management & refactored code

* Update API route to retrieve user profile data from public schema

* Updated Admin page with menu and pages data

* Removed unused import & updated delete func fetch with correct API endpoint

* Updated API routes naming convention

* Added toast promise for indexing

* Bunch of fixes & added API routes handles for admin sub pages

* Created Backend API route for indexing of user uploaded documents

* Aligned requests to API parameters

* Aligned requests to API parameters

* Fixed logic error in checking valid token

* Updated Python Packages

* Updated Indexer API Router

* Alignment with updated API Routes

* Removed postprocessing scores in search

* Updated search section with back button

* Added redirect to unauthorized page for admin page and api routes

* Added API Router for deleting collections via asyncpg in pgvector 'vecs' Schema

* Update middleware to skip is-admin api route

* Added API Route and functions to delete single and multiple user collections

* Updated HF Spaces Metadata

* Updated Sync to HF Hub with new space URL
  • Loading branch information
xKhronoz authored May 19, 2024
1 parent 4892344 commit fdaf912
Show file tree
Hide file tree
Showing 67 changed files with 7,998 additions and 1,785 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/sync-to-hugging-face-hub.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@ jobs:
- name: Push to hub
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: git push https://khronoz:[email protected]/spaces/khronoz/Smart-Retrieval-API main
run: git push https://JTCSmartRetrieval:[email protected]/spaces/SmartRetrieval/Smart-Retrieval-Demo-API main
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
title: Smart Retrieval API
title: Smart Retrieval Demo API
emoji: 📝
colorFrom: blue
colorTo: indigo
Expand Down
10 changes: 5 additions & 5 deletions backend/backend/app/api/routers/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class _Message(BaseModel):

class _ChatData(BaseModel):
messages: List[_Message]
document: str
collection_id: str


# custom prompt template to be used by chat engine
Expand Down Expand Up @@ -72,11 +72,11 @@ async def chat(
data: _ChatData = Depends(json_to_model(_ChatData)),
):
logger = logging.getLogger("uvicorn")
# get the document set selected from the request body
document_set = data.document
logger.info(f"Document Set: {document_set}")
# get the collection_id from the request body
collection_id = data.collection_id
logger.info(f"Chat -> Collection ID: {collection_id}")
# get the index for the selected document set
index = get_index(collection_name=document_set)
index = get_index(collection_name=collection_id)
# check preconditions and get last message
if len(data.messages) == 0:
raise HTTPException(
Expand Down
107 changes: 107 additions & 0 deletions backend/backend/app/api/routers/collections.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import logging
import os
import uuid
from typing import List

import asyncpg
from asyncpg.exceptions import PostgresError
from fastapi import APIRouter, Body, Depends, HTTPException
from pydantic import BaseModel

from backend.app.utils import auth


class _CollectionIds(BaseModel):
    """Request body for bulk deletion: a list of collection UUID strings."""

    collection_ids: List[str]


# Every route on this router requires an authenticated user (JWT or API key).
collections_router = r = APIRouter(dependencies=[Depends(auth.validate_user)])

logger = logging.getLogger("uvicorn")

# pgvector ("vecs") schema in which each collection is stored as its own table.
schema_name = "vecs"

"""
This router is for deleting collections functionality.
"""


def is_valid_uuidv4(uuid_str: str) -> bool:
try:
val = uuid.UUID(uuid_str, version=4)
except ValueError:
return False
return str(val) == uuid_str


async def drop_table(conn, collection_id):
    """Drop the collection's backing table in the vecs schema.

    Returns True on success, False if the DROP failed; never raises.
    NOTE(review): collection_id is interpolated into the identifier — callers
    must validate it (is_valid_uuidv4) before calling, as quoting alone does
    not make arbitrary input safe.
    """
    statement = f'DROP TABLE IF EXISTS "{schema_name}"."{collection_id}" CASCADE'
    try:
        await conn.execute(statement)
    except PostgresError as e:
        logger.error(f"Failed to drop table {collection_id}: {e}")
        return False
    return True


@r.post("/delete/single")
async def delete_single(collection_id: str):
    """Delete a single collection by dropping its table in the vecs schema.

    Returns {collection_id: bool} indicating success of the drop.
    Raises 400 for a malformed id and 500 for configuration/DB failures.
    """
    # Log the received collection_id
    logger.info(f"Delete Collection: {collection_id}")

    # Validate the collection_id to ensure it's a valid UUIDv4; this also
    # guards the identifier interpolation inside drop_table().
    if not is_valid_uuidv4(collection_id):
        logger.error(f"Invalid collection_id: {collection_id}")
        raise HTTPException(status_code=400, detail="Invalid collection_id format")

    # Try to connect to the PostgreSQL database
    db_url: str = os.environ.get("POSTGRES_CONNECTION_STRING")
    if not db_url:
        logger.error("POSTGRES_CONNECTION_STRING environment variable not set")
        raise HTTPException(status_code=500, detail="Database configuration error")

    # BUGFIX: conn must pre-exist so the finally block does not raise
    # NameError (masking the real error) when connect() itself fails.
    conn = None
    try:
        conn = await asyncpg.connect(dsn=db_url)
        result = await drop_table(conn, collection_id)
    except Exception as e:
        logger.error(f"Failed to connect to the database: {e}")
        raise HTTPException(status_code=500, detail="Failed to connect to the database")
    finally:
        if conn is not None:
            await conn.close()

    logger.debug(f"Delete Collection {collection_id}: {result}")
    return {collection_id: result}


@r.post("/delete/multiple")
async def delete_multiple(collection_ids: _CollectionIds = Body(...)):
    """Delete multiple collections by dropping each backing table.

    Returns a mapping of {collection_id: bool} with per-collection success.
    Raises 400 if any id is malformed and 500 for configuration/DB failures.
    """
    # Log the received collection_ids
    logger.info(f"Delete Collections: {collection_ids.collection_ids}")

    # Validate every id up front so we fail before touching the database;
    # this also guards the identifier interpolation inside drop_table().
    for collection_id in collection_ids.collection_ids:
        if not is_valid_uuidv4(collection_id):
            logger.error(f"Invalid collection_id: {collection_id}")
            raise HTTPException(status_code=400, detail="Invalid collection_id format")

    # Try to connect to the PostgreSQL database
    db_url: str = os.environ.get("POSTGRES_CONNECTION_STRING")
    if not db_url:
        logger.error("POSTGRES_CONNECTION_STRING environment variable not set")
        raise HTTPException(status_code=500, detail="Database configuration error")

    results = {}
    # BUGFIX: conn must pre-exist so the finally block does not raise
    # NameError (masking the real error) when connect() itself fails.
    conn = None
    try:
        conn = await asyncpg.connect(dsn=db_url)
        for collection_id in collection_ids.collection_ids:
            # Each DROP runs in its own transaction so one failure does not
            # roll back the drops that already succeeded.
            async with conn.transaction():
                results[collection_id] = await drop_table(conn, collection_id)
    except Exception as e:
        logger.error(f"Failed to connect to the database: {e}")
        raise HTTPException(status_code=500, detail="Failed to connect to the database")
    finally:
        if conn is not None:
            await conn.close()

    logger.debug(f"Delete Collections: {results}")
    return results
73 changes: 73 additions & 0 deletions backend/backend/app/api/routers/indexer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import logging
import os
import tempfile
from typing import List

from fastapi import APIRouter, Depends, Form, File, HTTPException, UploadFile
from fastapi.responses import JSONResponse

from backend.app.utils import auth, index

# Initialize the logger (uvicorn's logger so records share the server format)
logger = logging.getLogger("uvicorn")

# Initialize the API Router with dependencies — every route on this router
# requires an authenticated user.
indexer_router = r = APIRouter(dependencies=[Depends(auth.validate_user)])

"""
This router is for indexing of user uploaded documents functionality.
A list of files is received by the router and stored in a temporary directory.
The uploaded documents are indexed and stored in the vecs database.
"""


@r.post("")
async def indexer(
    collection_id: str = Form(...),
    files: List[UploadFile] = File(...),
    user=Depends(auth.validate_user),
):
    """Save the uploaded files into a temporary directory and index them
    into the given collection's vector store.

    Returns 200 with the indexed filenames on success.
    Raises 400 when no files/collection id are provided, 500 when saving
    or indexing fails.
    """
    logger.info(f"Indexer -> Collection ID: {collection_id}")
    logger.info(
        f"User {user} is uploading {len(files)} files to collection {collection_id}"
    )

    # Validate preconditions BEFORE any disk I/O (the original checked after
    # writing the files, doing wasted work on bad requests).
    if not collection_id:
        raise HTTPException(
            status_code=400, detail="No collection ID provided for indexing"
        )
    if not files:
        raise HTTPException(status_code=400, detail="No files uploaded for indexing")

    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            logger.info(f"Created temporary directory at {temp_dir}")

            for file in files:
                contents = await file.read()
                # basename() strips client-supplied directory components so a
                # filename like "../../x" cannot escape the temp directory.
                safe_name = os.path.basename(file.filename)
                file_path = os.path.join(temp_dir, safe_name)
                with open(file_path, "wb") as f:
                    f.write(contents)
                logger.info(f"Saved file: {file.filename} at {file_path}")

            # Call indexing function with the directory and collection_id
            if index.index_uploaded_files(temp_dir, collection_id):
                logger.info("Files uploaded and indexed successfully.")
                return JSONResponse(
                    status_code=200,
                    content={
                        "status": "Files uploaded and indexed successfully",
                        "filenames": [file.filename for file in files],
                    },
                )
            raise HTTPException(
                status_code=500, detail="Failed to upload and index files"
            )
    except HTTPException:
        # BUGFIX: pass deliberate HTTP errors through unchanged — the blanket
        # handler below used to rewrap them, turning 400s into 500s.
        raise
    except Exception as e:
        logger.error(f"Failed to upload and index files: {str(e)}")
        raise HTTPException(status_code=500, detail="Failed to upload and index files.")
11 changes: 6 additions & 5 deletions backend/backend/app/api/routers/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
This router is for query functionality which consist of query engine.
The query engine is used to query the index.
There is no chat memory used here, every query is independent of each other.
// Currently Deprecated - Not used in the current version of the application
"""


Expand All @@ -27,7 +28,7 @@ class _Message(BaseModel):

class _ChatData(BaseModel):
messages: List[_Message]
document: str
collection_id: str


@r.post("")
Expand All @@ -38,11 +39,11 @@ async def query(
data: _ChatData = Depends(json_to_model(_ChatData)),
):
logger = logging.getLogger("uvicorn")
# get the document set selected from the request body
document_set = data.document
logger.info(f"Document Set: {document_set}")
# get the collection_id selected from the request body
collection_id = data.collection_id
logger.info(f"Collection ID: {collection_id}")
# get the index for the selected document set
index = get_index(collection_name=document_set)
index = get_index(collection_name=collection_id)
# check preconditions and get last message which is query
if len(data.messages) == 0:
raise HTTPException(
Expand Down
18 changes: 11 additions & 7 deletions backend/backend/app/api/routers/search.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
import re

from fastapi import APIRouter, Depends, HTTPException, Request, status
from fastapi import APIRouter, Depends, HTTPException, status
from llama_index.postprocessor import SimilarityPostprocessor
from llama_index.retrievers import VectorIndexRetriever

Expand All @@ -20,16 +20,15 @@

@r.get("")
async def search(
request: Request,
query: str = None,
docSelected: str = None,
collection_id: str = None,
):
# query = request.query_params.get("query")
logger = logging.getLogger("uvicorn")
logger.info(f"Document Set: {docSelected} | Search: {query}")
logger.info(f"Document Set: {collection_id} | Search: {query}")
# get the index for the selected document set
index = get_index(collection_name=docSelected)
if query is None or docSelected is None:
index = get_index(collection_name=collection_id)
if query is None or collection_id is None:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="No search info/document set provided",
Expand Down Expand Up @@ -62,6 +61,9 @@ async def search(

logger.info(f"Filtered Search results similarity score: {filtered_results_scores}")

# Skip postprocessing for now
filtered_results = query_results

response = []
id = 1
for node in filtered_results:
Expand All @@ -72,7 +74,9 @@ async def search(
data = {}
data["id"] = id
data["file_name"] = node_metadata["file_name"]
data["page_no"] = node_metadata["page_label"]
data["page_no"] = (
node_metadata["page_label"] if "page_label" in node_metadata else "N/A"
)
cleaned_text = re.sub(
"^_+ | _+$", "", node_dict["text"]
) # remove leading and trailing underscores
Expand Down
17 changes: 9 additions & 8 deletions backend/backend/app/utils/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,12 @@ def get_user_from_JWT(token: str):
if payload is not None:
user_id = payload["sub"]
# Try to get the user from the database using the user_id
response = supabase.table("users").select("*").eq("id", user_id).execute()
response = supabase.table("users").select("id").eq("id", user_id).execute()
# print(response.data)
if len(response.data) == 0:
return False
else:
return True
return response.data[0]["id"]
else:
return False

Expand Down Expand Up @@ -109,7 +109,7 @@ async def validate_user(
)
else:
logger.info("Validated API key successfully!")
return None
return "Authenticated via API Key"
else:
auth_token = (
auth_token.strip()
Expand All @@ -125,16 +125,17 @@ async def validate_user(
"Supabase JWT Secret is not set in Backend Service!"
)
if not isBearer:
return (
raise ValueError(
"Invalid token scheme. Please use the format 'Bearer [token]'"
)
# Verify the JWT token is valid
if verify_jwt(jwtoken=jwtoken):
return "Invalid token. Please provide a valid token."
if verify_jwt(jwtoken=jwtoken) is False:
raise ValueError("Invalid token. Please provide a valid token.")
# Check if the user exists in the database
if get_user_from_JWT(token=jwtoken):
user = get_user_from_JWT(token=jwtoken)
if user:
logger.info("Validated User's Auth Token successfully!")
return None
return user
else:
raise ValueError("User does not exist in the database!")
else:
Expand Down
2 changes: 1 addition & 1 deletion backend/backend/app/utils/contants.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
DEF_EMBED_MODEL_DIMENSIONS = (
1536 # Default embedding model dimensions used by OpenAI text-embedding-ada-002
)
EMBED_BATCH_SIZE = 100 # batch size for openai embeddings
EMBED_BATCH_SIZE = 10 # batch size for openai embeddings

# Prompt Helper Constants
# set maximum input size
Expand Down
Loading

0 comments on commit fdaf912

Please sign in to comment.