Skip to content

Commit

Permalink
V.0.3.0 (#28)
Browse files Browse the repository at this point in the history
* Up Version to 0.3.0 as 2 new features are added

* Updated profile, signout/signin with skeleton loading

* Refactored code

* Added Admin Page & Navlink to Admin Page

* Overhauled Q&A for document upload components & api routes

* Added SweetAlert2 Package

* Added API route for getting user public collections requests

* Update route return error & removed redundant API Calls

* Added handles for buttons and fetching of data

* Upgraded Next.js from 13 to 14 & packages

* Added api route methods for management & refactored code

* Update API route to retrieve user profile data from public schema

* Updated Admin page with menu and pages data

* Removed unused import & updated delete func fetch with correct API endpoint

* Updated API routes naming convention

* Added toast promise for indexing

* Bunch of fixes & added API routes handles for admin sub pages

* Created Backend API route for indexing of user uploaded documents

* Aligned requests to API parameters

* Aligned requests to API parameters

* Fixed logic error in checking valid token

* Updated Python Packages

* Updated Indexer API Router

* Alignment with updated API Routes

* Removed postprocessing scores in search

* Updated search section with back button

* Added redirect to unauthorized page for admin page and api routes

* Added API Router for deleting collections via asyncpg in pgvector 'vecs' Schema

* Update middleware to skip is-admin api route

* Added API Route and functions to delete single and multiple user collections

* Updated HF Spaces Metadata

* Updated Sync to HF Hub with new space URL
  • Loading branch information
xKhronoz authored May 19, 2024
1 parent 4892344 commit fdaf912
Show file tree
Hide file tree
Showing 67 changed files with 7,998 additions and 1,785 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/sync-to-hugging-face-hub.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@ jobs:
- name: Push to hub
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: git push https://khronoz:[email protected]/spaces/khronoz/Smart-Retrieval-API main
run: git push https://JTCSmartRetrieval:[email protected]/spaces/SmartRetrieval/Smart-Retrieval-Demo-API main
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
title: Smart Retrieval API
title: Smart Retrieval Demo API
emoji: 📝
colorFrom: blue
colorTo: indigo
Expand Down
10 changes: 5 additions & 5 deletions backend/backend/app/api/routers/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class _Message(BaseModel):

class _ChatData(BaseModel):
messages: List[_Message]
document: str
collection_id: str


# custom prompt template to be used by chat engine
Expand Down Expand Up @@ -72,11 +72,11 @@ async def chat(
data: _ChatData = Depends(json_to_model(_ChatData)),
):
logger = logging.getLogger("uvicorn")
# get the document set selected from the request body
document_set = data.document
logger.info(f"Document Set: {document_set}")
# get the collection_id from the request body
collection_id = data.collection_id
logger.info(f"Chat -> Collection ID: {collection_id}")
# get the index for the selected document set
index = get_index(collection_name=document_set)
index = get_index(collection_name=collection_id)
# check preconditions and get last message
if len(data.messages) == 0:
raise HTTPException(
Expand Down
107 changes: 107 additions & 0 deletions backend/backend/app/api/routers/collections.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import logging
import os
import uuid
from typing import List

import asyncpg
from asyncpg.exceptions import PostgresError
from fastapi import APIRouter, Body, Depends, HTTPException
from pydantic import BaseModel

from backend.app.utils import auth


class _CollectionIds(BaseModel):
    """Request body for bulk deletion: a list of collection UUID strings."""

    collection_ids: List[str]


# Every route on this router requires an authenticated user (JWT or API key).
collections_router = r = APIRouter(dependencies=[Depends(auth.validate_user)])

logger = logging.getLogger("uvicorn")

# pgvector ("vecs") schema in which each collection is stored as its own table.
schema_name = "vecs"

"""
This router is for deleting collections functionality.
"""


def is_valid_uuidv4(uuid_str: str) -> bool:
try:
val = uuid.UUID(uuid_str, version=4)
except ValueError:
return False
return str(val) == uuid_str


async def drop_table(conn, collection_id):
    """Drop the collection's backing table in the vecs schema.

    Returns True on success, False if the DROP failed; never raises.
    NOTE(review): collection_id is interpolated into the identifier — callers
    must validate it (is_valid_uuidv4) before calling, as quoting alone does
    not make arbitrary input safe.
    """
    statement = f'DROP TABLE IF EXISTS "{schema_name}"."{collection_id}" CASCADE'
    try:
        await conn.execute(statement)
    except PostgresError as e:
        logger.error(f"Failed to drop table {collection_id}: {e}")
        return False
    return True


@r.post("/delete/single")
async def delete_single(collection_id: str):
    """Delete a single collection by dropping its table in the vecs schema.

    Returns {collection_id: bool} indicating success of the drop.
    Raises 400 for a malformed id and 500 for configuration/DB failures.
    """
    # Log the received collection_id
    logger.info(f"Delete Collection: {collection_id}")

    # Validate the collection_id to ensure it's a valid UUIDv4; this also
    # guards the identifier interpolation inside drop_table().
    if not is_valid_uuidv4(collection_id):
        logger.error(f"Invalid collection_id: {collection_id}")
        raise HTTPException(status_code=400, detail="Invalid collection_id format")

    # Try to connect to the PostgreSQL database
    db_url: str = os.environ.get("POSTGRES_CONNECTION_STRING")
    if not db_url:
        logger.error("POSTGRES_CONNECTION_STRING environment variable not set")
        raise HTTPException(status_code=500, detail="Database configuration error")

    # BUGFIX: conn must pre-exist so the finally block does not raise
    # NameError (masking the real error) when connect() itself fails.
    conn = None
    try:
        conn = await asyncpg.connect(dsn=db_url)
        result = await drop_table(conn, collection_id)
    except Exception as e:
        logger.error(f"Failed to connect to the database: {e}")
        raise HTTPException(status_code=500, detail="Failed to connect to the database")
    finally:
        if conn is not None:
            await conn.close()

    logger.debug(f"Delete Collection {collection_id}: {result}")
    return {collection_id: result}


@r.post("/delete/multiple")
async def delete_multiple(collection_ids: _CollectionIds = Body(...)):
    """Delete multiple collections by dropping each backing table.

    Returns a mapping of {collection_id: bool} with per-collection success.
    Raises 400 if any id is malformed and 500 for configuration/DB failures.
    """
    # Log the received collection_ids
    logger.info(f"Delete Collections: {collection_ids.collection_ids}")

    # Validate every id up front so we fail before touching the database;
    # this also guards the identifier interpolation inside drop_table().
    for collection_id in collection_ids.collection_ids:
        if not is_valid_uuidv4(collection_id):
            logger.error(f"Invalid collection_id: {collection_id}")
            raise HTTPException(status_code=400, detail="Invalid collection_id format")

    # Try to connect to the PostgreSQL database
    db_url: str = os.environ.get("POSTGRES_CONNECTION_STRING")
    if not db_url:
        logger.error("POSTGRES_CONNECTION_STRING environment variable not set")
        raise HTTPException(status_code=500, detail="Database configuration error")

    results = {}
    # BUGFIX: conn must pre-exist so the finally block does not raise
    # NameError (masking the real error) when connect() itself fails.
    conn = None
    try:
        conn = await asyncpg.connect(dsn=db_url)
        for collection_id in collection_ids.collection_ids:
            # Each DROP runs in its own transaction so one failure does not
            # roll back the drops that already succeeded.
            async with conn.transaction():
                results[collection_id] = await drop_table(conn, collection_id)
    except Exception as e:
        logger.error(f"Failed to connect to the database: {e}")
        raise HTTPException(status_code=500, detail="Failed to connect to the database")
    finally:
        if conn is not None:
            await conn.close()

    logger.debug(f"Delete Collections: {results}")
    return results
73 changes: 73 additions & 0 deletions backend/backend/app/api/routers/indexer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import logging
import os
import tempfile
from typing import List

from fastapi import APIRouter, Depends, Form, File, HTTPException, UploadFile
from fastapi.responses import JSONResponse

from backend.app.utils import auth, index

# Initialize the logger (uvicorn's logger so records share the server format)
logger = logging.getLogger("uvicorn")

# Initialize the API Router with dependencies — every route on this router
# requires an authenticated user.
indexer_router = r = APIRouter(dependencies=[Depends(auth.validate_user)])

"""
This router is for indexing of user uploaded documents functionality.
A list of files is received by the router and stored in a temporary directory.
The uploaded documents are indexed and stored in the vecs database.
"""


@r.post("")
async def indexer(
    collection_id: str = Form(...),
    files: List[UploadFile] = File(...),
    user=Depends(auth.validate_user),
):
    """Save the uploaded files into a temporary directory and index them
    into the given collection's vector store.

    Returns 200 with the indexed filenames on success.
    Raises 400 when no files/collection id are provided, 500 when saving
    or indexing fails.
    """
    logger.info(f"Indexer -> Collection ID: {collection_id}")
    logger.info(
        f"User {user} is uploading {len(files)} files to collection {collection_id}"
    )

    # Validate preconditions BEFORE any disk I/O (the original checked after
    # writing the files, doing wasted work on bad requests).
    if not collection_id:
        raise HTTPException(
            status_code=400, detail="No collection ID provided for indexing"
        )
    if not files:
        raise HTTPException(status_code=400, detail="No files uploaded for indexing")

    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            logger.info(f"Created temporary directory at {temp_dir}")

            for file in files:
                contents = await file.read()
                # basename() strips client-supplied directory components so a
                # filename like "../../x" cannot escape the temp directory.
                safe_name = os.path.basename(file.filename)
                file_path = os.path.join(temp_dir, safe_name)
                with open(file_path, "wb") as f:
                    f.write(contents)
                logger.info(f"Saved file: {file.filename} at {file_path}")

            # Call indexing function with the directory and collection_id
            if index.index_uploaded_files(temp_dir, collection_id):
                logger.info("Files uploaded and indexed successfully.")
                return JSONResponse(
                    status_code=200,
                    content={
                        "status": "Files uploaded and indexed successfully",
                        "filenames": [file.filename for file in files],
                    },
                )
            raise HTTPException(
                status_code=500, detail="Failed to upload and index files"
            )
    except HTTPException:
        # BUGFIX: pass deliberate HTTP errors through unchanged — the blanket
        # handler below used to rewrap them, turning 400s into 500s.
        raise
    except Exception as e:
        logger.error(f"Failed to upload and index files: {str(e)}")
        raise HTTPException(status_code=500, detail="Failed to upload and index files.")
11 changes: 6 additions & 5 deletions backend/backend/app/api/routers/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
This router is for query functionality which consist of query engine.
The query engine is used to query the index.
There is no chat memory used here, every query is independent of each other.
// Currently Deprecated - Not used in the current version of the application
"""


Expand All @@ -27,7 +28,7 @@ class _Message(BaseModel):

class _ChatData(BaseModel):
messages: List[_Message]
document: str
collection_id: str


@r.post("")
Expand All @@ -38,11 +39,11 @@ async def query(
data: _ChatData = Depends(json_to_model(_ChatData)),
):
logger = logging.getLogger("uvicorn")
# get the document set selected from the request body
document_set = data.document
logger.info(f"Document Set: {document_set}")
# get the collection_id selected from the request body
collection_id = data.collection_id
logger.info(f"Collection ID: {collection_id}")
# get the index for the selected document set
index = get_index(collection_name=document_set)
index = get_index(collection_name=collection_id)
# check preconditions and get last message which is query
if len(data.messages) == 0:
raise HTTPException(
Expand Down
18 changes: 11 additions & 7 deletions backend/backend/app/api/routers/search.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
import re

from fastapi import APIRouter, Depends, HTTPException, Request, status
from fastapi import APIRouter, Depends, HTTPException, status
from llama_index.postprocessor import SimilarityPostprocessor
from llama_index.retrievers import VectorIndexRetriever

Expand All @@ -20,16 +20,15 @@

@r.get("")
async def search(
request: Request,
query: str = None,
docSelected: str = None,
collection_id: str = None,
):
# query = request.query_params.get("query")
logger = logging.getLogger("uvicorn")
logger.info(f"Document Set: {docSelected} | Search: {query}")
logger.info(f"Document Set: {collection_id} | Search: {query}")
# get the index for the selected document set
index = get_index(collection_name=docSelected)
if query is None or docSelected is None:
index = get_index(collection_name=collection_id)
if query is None or collection_id is None:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="No search info/document set provided",
Expand Down Expand Up @@ -62,6 +61,9 @@ async def search(

logger.info(f"Filtered Search results similarity score: {filtered_results_scores}")

# Skip postprocessing for now
filtered_results = query_results

response = []
id = 1
for node in filtered_results:
Expand All @@ -72,7 +74,9 @@ async def search(
data = {}
data["id"] = id
data["file_name"] = node_metadata["file_name"]
data["page_no"] = node_metadata["page_label"]
data["page_no"] = (
node_metadata["page_label"] if "page_label" in node_metadata else "N/A"
)
cleaned_text = re.sub(
"^_+ | _+$", "", node_dict["text"]
) # remove leading and trailing underscores
Expand Down
17 changes: 9 additions & 8 deletions backend/backend/app/utils/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,12 @@ def get_user_from_JWT(token: str):
if payload is not None:
user_id = payload["sub"]
# Try to get the user from the database using the user_id
response = supabase.table("users").select("*").eq("id", user_id).execute()
response = supabase.table("users").select("id").eq("id", user_id).execute()
# print(response.data)
if len(response.data) == 0:
return False
else:
return True
return response.data[0]["id"]
else:
return False

Expand Down Expand Up @@ -109,7 +109,7 @@ async def validate_user(
)
else:
logger.info("Validated API key successfully!")
return None
return "Authenticated via API Key"
else:
auth_token = (
auth_token.strip()
Expand All @@ -125,16 +125,17 @@ async def validate_user(
"Supabase JWT Secret is not set in Backend Service!"
)
if not isBearer:
return (
raise ValueError(
"Invalid token scheme. Please use the format 'Bearer [token]'"
)
# Verify the JWT token is valid
if verify_jwt(jwtoken=jwtoken):
return "Invalid token. Please provide a valid token."
if verify_jwt(jwtoken=jwtoken) is False:
raise ValueError("Invalid token. Please provide a valid token.")
# Check if the user exists in the database
if get_user_from_JWT(token=jwtoken):
user = get_user_from_JWT(token=jwtoken)
if user:
logger.info("Validated User's Auth Token successfully!")
return None
return user
else:
raise ValueError("User does not exist in the database!")
else:
Expand Down
2 changes: 1 addition & 1 deletion backend/backend/app/utils/contants.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
DEF_EMBED_MODEL_DIMENSIONS = (
1536 # Default embedding model dimensions used by OpenAI text-embedding-ada-002
)
EMBED_BATCH_SIZE = 100 # batch size for openai embeddings
EMBED_BATCH_SIZE = 10 # batch size for openai embeddings

# Prompt Helper Constants
# set maximum input size
Expand Down
Loading

0 comments on commit fdaf912

Please sign in to comment.