Completes features 1 & 5 in #14 #28

Merged: 32 commits, May 19, 2024

Commits
89aa080
Updated profile,signout/signin with skeleton loading
xKhronoz May 13, 2024
bc520d2
Refactored code
xKhronoz May 13, 2024
2f96ee5
Added Admin Page & Navlink to Admin Page
xKhronoz May 13, 2024
3812e02
Overhauled Q&A for document upload components & api routes
xKhronoz May 13, 2024
df727eb
Added SweetAlert2 Package
xKhronoz May 14, 2024
1be1248
Added API route for getting user public collections requests
xKhronoz May 14, 2024
506bbb9
Update route return error & removed redundant API Calls
xKhronoz May 14, 2024
8333748
Added handles for buttons and fetching of data
xKhronoz May 14, 2024
40c0f9e
Upgraded Next.js from 13 to 14 & packages
xKhronoz May 14, 2024
4b14592
Added api route methods for management & refactored code
xKhronoz May 15, 2024
e873c87
Update API route to retrieve user profile data from public schema
xKhronoz May 15, 2024
8bd14e0
Updated Admin page with menu and pages data
xKhronoz May 15, 2024
4823f08
Removed unused import & updated delete func fetch with correct API en…
xKhronoz May 15, 2024
9aaebdf
Updated API routes naming convention
xKhronoz May 18, 2024
62fa31a
Added toast promise for indexing
xKhronoz May 18, 2024
433f155
Bunch of fixes & added API routes handles for admin sub pages
xKhronoz May 18, 2024
0132933
Created Backend API route for indexing of user uploaded documents
xKhronoz May 18, 2024
79b33ba
Aligned requests to API parameters
xKhronoz May 18, 2024
46896db
Aligned requests to API parameters
xKhronoz May 18, 2024
5fbbd68
Fixed logic error in checking valid token
xKhronoz May 18, 2024
bea5462
Updated Python Packages
xKhronoz May 18, 2024
b52661f
Updated Indexer API Router
xKhronoz May 18, 2024
e0fdf9a
Alignment with updated API Routes
xKhronoz May 18, 2024
a8c614c
Merge branch 'dev' of https://github.com/digitalbuiltenvironment/Smar…
xKhronoz May 18, 2024
f75df99
Removed postprocessing scores in search
xKhronoz May 18, 2024
d0e4c94
Updated search section with back button
xKhronoz May 18, 2024
c20ddc6
Added redirect to unauthorized page for admin page and api routes
xKhronoz May 18, 2024
c3bcb3f
Added API Router for deleting collections via asyncpg in pgvector 'v…
xKhronoz May 18, 2024
618d0e8
Update middleware to skip is-admin api route
xKhronoz May 18, 2024
8f364a9
Added API Route and functions to delete single and multiple user coll…
xKhronoz May 18, 2024
9aa362d
Updated HF Spaces Metadata
xKhronoz May 19, 2024
c809d7a
Updated Sync to HF Hub with new space URL
xKhronoz May 19, 2024
2 changes: 1 addition & 1 deletion .github/workflows/sync-to-hugging-face-hub.yml
@@ -21,4 +21,4 @@ jobs:
- name: Push to hub
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: git push https://khronoz:$HF_TOKEN@huggingface.co/spaces/khronoz/Smart-Retrieval-API main
run: git push https://JTCSmartRetrieval:$HF_TOKEN@huggingface.co/spaces/SmartRetrieval/Smart-Retrieval-Demo-API main
2 changes: 1 addition & 1 deletion README.md
@@ -1,5 +1,5 @@
---
title: Smart Retrieval API
title: Smart Retrieval Demo API
emoji: 📝
colorFrom: blue
colorTo: indigo
10 changes: 5 additions & 5 deletions backend/backend/app/api/routers/chat.py
@@ -34,7 +34,7 @@ class _Message(BaseModel):

class _ChatData(BaseModel):
messages: List[_Message]
document: str
collection_id: str


# custom prompt template to be used by chat engine
@@ -72,11 +72,11 @@ async def chat(
data: _ChatData = Depends(json_to_model(_ChatData)),
):
logger = logging.getLogger("uvicorn")
# get the document set selected from the request body
document_set = data.document
logger.info(f"Document Set: {document_set}")
# get the collection_id from the request body
collection_id = data.collection_id
logger.info(f"Chat -> Collection ID: {collection_id}")
# get the index for the selected document set
index = get_index(collection_name=document_set)
index = get_index(collection_name=collection_id)
# check preconditions and get last message
if len(data.messages) == 0:
raise HTTPException(
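With this change the chat route reads collection_id from the request body instead of document. A minimal client-side sketch of the updated payload (the /api/chat mount path, host, and the role/content fields of _Message are assumptions for illustration, not taken from this diff):

import requests

BASE_URL = "http://localhost:8000"  # hypothetical deployment URL
headers = {"Authorization": "Bearer <supabase-jwt>"}  # validated by auth.validate_user

payload = {
    # "document" was renamed to "collection_id" in _ChatData
    "collection_id": "9b1deb4d-3b7d-4bad-9bdd-2b0d7b3dcb6d",  # example UUIDv4
    "messages": [
        {"role": "user", "content": "Summarise the uploaded documents."},
    ],
}

response = requests.post(f"{BASE_URL}/api/chat", json=payload, headers=headers)
print(response.status_code, response.text)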
107 changes: 107 additions & 0 deletions backend/backend/app/api/routers/collections.py
@@ -0,0 +1,107 @@
import logging
import os
import uuid
from typing import List

import asyncpg
from asyncpg.exceptions import PostgresError
from fastapi import APIRouter, Body, Depends, HTTPException
from pydantic import BaseModel

from backend.app.utils import auth


class _CollectionIds(BaseModel):
collection_ids: List[str]


collections_router = r = APIRouter(dependencies=[Depends(auth.validate_user)])

logger = logging.getLogger("uvicorn")

schema_name = "vecs"

"""
This router is for deleting collections functionality.
"""


def is_valid_uuidv4(uuid_str: str) -> bool:
try:
val = uuid.UUID(uuid_str, version=4)
except ValueError:
return False
return str(val) == uuid_str


async def drop_table(conn, collection_id):
try:
await conn.execute(
f'DROP TABLE IF EXISTS "{schema_name}"."{collection_id}" CASCADE'
)
return True
except PostgresError as e:
logger.error(f"Failed to drop table {collection_id}: {e}")
return False


@r.post("/delete/single")
async def delete_single(collection_id: str):
# Log the received collection_id
logger.info(f"Delete Collection: {collection_id}")

# Validate the collection_id to ensure it's a valid UUIDv4
if not is_valid_uuidv4(collection_id):
logger.error(f"Invalid collection_id: {collection_id}")
raise HTTPException(status_code=400, detail="Invalid collection_id format")

# Try to connect to the PostgreSQL database
db_url: str = os.environ.get("POSTGRES_CONNECTION_STRING")
if not db_url:
logger.error("POSTGRES_CONNECTION_STRING environment variable not set")
raise HTTPException(status_code=500, detail="Database configuration error")

try:
conn = await asyncpg.connect(dsn=db_url)
result = await drop_table(conn, collection_id)
except Exception as e:
logger.error(f"Failed to connect to the database: {e}")
raise HTTPException(status_code=500, detail="Failed to connect to the database")
finally:
await conn.close()

logger.debug(f"Delete Collection {collection_id}: {result}")
return {collection_id: result}


@r.post("/delete/multiple")
async def delete_multiple(collection_ids: _CollectionIds = Body(...)):
# Log the received collection_ids
logger.info(f"Delete Collections: {collection_ids.collection_ids}")

# Validate the collection_ids to ensure they are valid UUIDv4s
for collection_id in collection_ids.collection_ids:
if not is_valid_uuidv4(collection_id):
logger.error(f"Invalid collection_id: {collection_id}")
raise HTTPException(status_code=400, detail="Invalid collection_id format")

# Try to connect to the PostgreSQL database
db_url: str = os.environ.get("POSTGRES_CONNECTION_STRING")
if not db_url:
logger.error("POSTGRES_CONNECTION_STRING environment variable not set")
raise HTTPException(status_code=500, detail="Database configuration error")

results = {}
try:
conn = await asyncpg.connect(dsn=db_url)
for collection_id in collection_ids.collection_ids:
async with conn.transaction():
results[collection_id] = await drop_table(conn, collection_id)
except Exception as e:
logger.error(f"Failed to connect to the database: {e}")
raise HTTPException(status_code=500, detail="Failed to connect to the database")
finally:
await conn.close()

logger.debug(f"Delete Collections: {results}")
return results
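One caveat in the router above: if asyncpg.connect itself raises, the finally block references a conn that was never bound. A small guarded variant, offered as a suggestion rather than a change in this PR (it reuses the drop_table helper defined above):

import logging

import asyncpg
from fastapi import HTTPException

logger = logging.getLogger("uvicorn")


async def delete_collection(db_url: str, collection_id: str) -> bool:
    """Drop one collection table, closing the connection only if it was actually opened."""
    conn = None
    try:
        conn = await asyncpg.connect(dsn=db_url)
        # drop_table is the helper defined in the router above
        return await drop_table(conn, collection_id)
    except Exception as e:
        logger.error(f"Failed to delete collection {collection_id}: {e}")
        raise HTTPException(status_code=500, detail="Failed to connect to the database")
    finally:
        if conn is not None:
            await conn.close()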
73 changes: 73 additions & 0 deletions backend/backend/app/api/routers/indexer.py
@@ -0,0 +1,73 @@
import logging
import os
import tempfile
from typing import List

from fastapi import APIRouter, Depends, Form, File, HTTPException, UploadFile
from fastapi.responses import JSONResponse

from backend.app.utils import auth, index

# Initialize the logger
logger = logging.getLogger("uvicorn")

# Initialize the API Router with dependencies
indexer_router = r = APIRouter(dependencies=[Depends(auth.validate_user)])

"""
This router is for indexing of user uploaded documents functionality.
A list of files is received by the router and stored in a temporary directory.
The uploaded documents are indexed and stored in the vecs database.
"""


@r.post("")
async def indexer(
collection_id: str = Form(...),
files: List[UploadFile] = File(...),
user=Depends(auth.validate_user),
):
logger.info(f"Indexer -> Collection ID: {collection_id}")
logger.info(
f"User {user} is uploading {len(files)} files to collection {collection_id}"
)

try:
with tempfile.TemporaryDirectory() as temp_dir:
logger.info(f"Created temporary directory at {temp_dir}")

file_paths = []

for file in files:
contents = await file.read()
file_path = os.path.join(temp_dir, file.filename)
with open(file_path, "wb") as f:
f.write(contents)
file_paths.append(file_path)
logger.info(f"Saved file: {file.filename} at {file_path}")

# Call indexing function with the directory and collection_id
if len(file_paths) == 0:
raise HTTPException(
status_code=400, detail="No files uploaded for indexing"
)
if collection_id is None:
raise HTTPException(
status_code=400, detail="No collection ID provided for indexing"
)
if index.index_uploaded_files(temp_dir, collection_id):
logger.info("Files uploaded and indexed successfully.")
return JSONResponse(
status_code=200,
content={
"status": "Files uploaded and indexed successfully",
"filenames": [file.filename for file in files],
},
)
else:
raise HTTPException(
status_code=500, detail="Failed to upload and index files"
)
except Exception as e:
logger.error(f"Failed to upload and index files: {str(e)}")
raise HTTPException(status_code=500, detail="Failed to upload and index files.")
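The indexer route expects a multipart form with a collection_id field and one or more files. A minimal client sketch (the /api/indexer mount path, host, and token are assumptions for illustration):

import requests

BASE_URL = "http://localhost:8000"  # hypothetical deployment URL
headers = {"Authorization": "Bearer <supabase-jwt>"}

# One ("files", (...)) tuple per uploaded document, matching List[UploadFile]
files = [
    ("files", ("handbook.pdf", open("handbook.pdf", "rb"), "application/pdf")),
    ("files", ("notes.txt", open("notes.txt", "rb"), "text/plain")),
]
data = {"collection_id": "9b1deb4d-3b7d-4bad-9bdd-2b0d7b3dcb6d"}  # example UUIDv4

response = requests.post(f"{BASE_URL}/api/indexer", data=data, files=files, headers=headers)
print(response.json())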
11 changes: 6 additions & 5 deletions backend/backend/app/api/routers/query.py
@@ -17,6 +17,7 @@
This router is for query functionality which consist of query engine.
The query engine is used to query the index.
There is no chat memory used here, every query is independent of each other.
// Currently Deprecated - Not used in the current version of the application
"""


@@ -27,7 +28,7 @@ class _Message(BaseModel):

class _ChatData(BaseModel):
messages: List[_Message]
document: str
collection_id: str


@r.post("")
@@ -38,11 +39,11 @@ async def query(
data: _ChatData = Depends(json_to_model(_ChatData)),
):
logger = logging.getLogger("uvicorn")
# get the document set selected from the request body
document_set = data.document
logger.info(f"Document Set: {document_set}")
# get the collection_id selected from the request body
collection_id = data.collection_id
logger.info(f"Collection ID: {collection_id}")
# get the index for the selected document set
index = get_index(collection_name=document_set)
index = get_index(collection_name=collection_id)
# check preconditions and get last message which is query
if len(data.messages) == 0:
raise HTTPException(
18 changes: 11 additions & 7 deletions backend/backend/app/api/routers/search.py
@@ -1,7 +1,7 @@
import logging
import re

from fastapi import APIRouter, Depends, HTTPException, Request, status
from fastapi import APIRouter, Depends, HTTPException, status
from llama_index.postprocessor import SimilarityPostprocessor
from llama_index.retrievers import VectorIndexRetriever

@@ -20,16 +20,15 @@

@r.get("")
async def search(
request: Request,
query: str = None,
docSelected: str = None,
collection_id: str = None,
):
# query = request.query_params.get("query")
logger = logging.getLogger("uvicorn")
logger.info(f"Document Set: {docSelected} | Search: {query}")
logger.info(f"Document Set: {collection_id} | Search: {query}")
# get the index for the selected document set
index = get_index(collection_name=docSelected)
if query is None or docSelected is None:
index = get_index(collection_name=collection_id)
if query is None or collection_id is None:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="No search info/document set provided",
@@ -62,6 +61,9 @@ async def search(

logger.info(f"Filtered Search results similarity score: {filtered_results_scores}")

# Skip postprocessing for now
filtered_results = query_results

response = []
id = 1
for node in filtered_results:
@@ -72,7 +74,9 @@ async def search(
data = {}
data["id"] = id
data["file_name"] = node_metadata["file_name"]
data["page_no"] = node_metadata["page_label"]
data["page_no"] = (
node_metadata["page_label"] if "page_label" in node_metadata else "N/A"
)
cleaned_text = re.sub(
"^_+ | _+$", "", node_dict["text"]
) # remove leading and trailing underscores
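The search route now takes collection_id instead of docSelected as a query parameter, and page_no falls back to "N/A" when a node has no page_label metadata. A sketch of the updated call, assuming the route returns the list of result dicts built above (host and /api/search path are illustrative):

import requests

BASE_URL = "http://localhost:8000"  # hypothetical deployment URL
headers = {"Authorization": "Bearer <supabase-jwt>"}

params = {
    "query": "fire safety requirements",
    "collection_id": "9b1deb4d-3b7d-4bad-9bdd-2b0d7b3dcb6d",  # was docSelected before this PR
}

results = requests.get(f"{BASE_URL}/api/search", params=params, headers=headers).json()
for hit in results:
    # page_no is "N/A" when the indexed node carries no page_label metadata
    print(hit["id"], hit["file_name"], hit["page_no"])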
17 changes: 9 additions & 8 deletions backend/backend/app/utils/auth.py
@@ -75,12 +75,12 @@ def get_user_from_JWT(token: str):
if payload is not None:
user_id = payload["sub"]
# Try to get the user from the database using the user_id
response = supabase.table("users").select("*").eq("id", user_id).execute()
response = supabase.table("users").select("id").eq("id", user_id).execute()
# print(response.data)
if len(response.data) == 0:
return False
else:
return True
return response.data[0]["id"]
else:
return False

@@ -109,7 +109,7 @@ async def validate_user(
)
else:
logger.info("Validated API key successfully!")
return None
return "Authenticated via API Key"
else:
auth_token = (
auth_token.strip()
@@ -125,16 +125,17 @@
"Supabase JWT Secret is not set in Backend Service!"
)
if not isBearer:
return (
raise ValueError(
"Invalid token scheme. Please use the format 'Bearer [token]'"
)
# Verify the JWT token is valid
if verify_jwt(jwtoken=jwtoken):
return "Invalid token. Please provide a valid token."
if verify_jwt(jwtoken=jwtoken) is False:
raise ValueError("Invalid token. Please provide a valid token.")
# Check if the user exists in the database
if get_user_from_JWT(token=jwtoken):
user = get_user_from_JWT(token=jwtoken)
if user:
logger.info("Validated User's Auth Token successfully!")
return None
return user
else:
raise ValueError("User does not exist in the database!")
else:
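After this change validate_user returns the caller's user id (or a marker string when an API key is used) rather than None, so routes can bind it with Depends, as indexer.py does above. A minimal sketch of a route consuming that value (the /whoami route is illustrative only):

from fastapi import APIRouter, Depends

from backend.app.utils import auth

profile_router = r = APIRouter()


@r.get("/whoami")
async def whoami(user=Depends(auth.validate_user)):
    # user is the Supabase user id returned by get_user_from_JWT,
    # or the string "Authenticated via API Key" for API-key callers
    return {"user": user}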
2 changes: 1 addition & 1 deletion backend/backend/app/utils/contants.py
@@ -34,7 +34,7 @@
DEF_EMBED_MODEL_DIMENSIONS = (
1536 # Default embedding model dimensions used by OpenAI text-embedding-ada-002
)
EMBED_BATCH_SIZE = 100 # batch size for openai embeddings
EMBED_BATCH_SIZE = 10 # batch size for openai embeddings

# Prompt Helper Constants
# set maximum input size