Skip to content

Commit

Permalink
Merge branch 'main' into 625-featapi-implement-translations-endpoint
Browse files Browse the repository at this point in the history
  • Loading branch information
CollectiveUnicorn authored Jul 11, 2024
2 parents 58e2e20 + 13cd46f commit 0716dac
Show file tree
Hide file tree
Showing 104 changed files with 4,074 additions and 1,015 deletions.
2 changes: 1 addition & 1 deletion packages/ui/zarf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ constants:
variables:
- name: LEAPFROGAI_API_BASE_URL #LEAPFROGAI_API_BASE_URL
description: The base URL for the LeapfrogAI API
default: http://api.leapfrogai.svc.cluster.local:8080/openai/v1
default: http://api.leapfrogai.svc.cluster.local:8080
prompt: true
sensitive: true
- name: OPENAI_API_KEY
Expand Down
6 changes: 4 additions & 2 deletions src/leapfrogai_api/backend/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

import time
import uuid
from typing import BinaryIO, Iterator, AsyncGenerator, Any
import grpc
from typing import BinaryIO, Iterator, AsyncGenerator, Any
import leapfrogai_sdk as lfai
from leapfrogai_api.backend.types import (
ChatCompletionResponse,
Expand Down Expand Up @@ -108,8 +108,10 @@ def read_chunks(file: BinaryIO, chunk_size: int) -> Iterator[lfai.AudioRequest]:
yield lfai.AudioRequest(chunk_data=chunk)


# helper function used to modify objects unless certain fields are missing
def object_or_default(obj: Any | None, _default: Any) -> Any:
if obj:
"""Returns the given object unless it is a None type, otherwise a given default is returned"""
if obj is not None:
return obj
else:
return _default
5 changes: 5 additions & 0 deletions src/leapfrogai_api/backend/rag/document_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
TextLoader,
UnstructuredHTMLLoader,
UnstructuredMarkdownLoader,
UnstructuredPowerPointLoader,
UnstructuredExcelLoader,
)
from langchain_core.documents import Document
Expand All @@ -24,6 +25,8 @@
"text/markdown": UnstructuredMarkdownLoader,
"application/msword": Docx2txtLoader,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": Docx2txtLoader,
"application/vnd.openxmlformats-officedocument.presentationml.presentation": UnstructuredPowerPointLoader,
"application/vnd.ms-powerpoint": UnstructuredPowerPointLoader,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": UnstructuredExcelLoader,
"xls:application/vnd.ms-excel": UnstructuredExcelLoader,
}
Expand All @@ -40,6 +43,8 @@
".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
".xls": "xls:application/vnd.ms-excel",
".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
".ppt": "application/vnd.ms-powerpoint",
}


Expand Down
185 changes: 181 additions & 4 deletions src/leapfrogai_api/backend/rag/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,27 @@

import logging
import tempfile
import time

from fastapi import UploadFile

from fastapi import HTTPException, UploadFile, status
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from openai.types.beta.vector_store import FileCounts, VectorStore
from openai.types.beta.vector_stores import VectorStoreFile
from openai.types.beta.vector_stores.vector_store_file import LastError
from supabase_py_async import AsyncClient
from leapfrogai_api.backend.rag.document_loader import load_file, split
from leapfrogai_api.backend.rag.leapfrogai_embeddings import LeapfrogAIEmbeddings
from leapfrogai_api.backend.types import VectorStoreFileStatus
from leapfrogai_api.data.crud_file_bucket import CRUDFileBucket
from leapfrogai_api.data.crud_file_object import CRUDFileObject, FilterFileObject
from leapfrogai_api.data.crud_vector_store import CRUDVectorStore, FilterVectorStore
from leapfrogai_api.backend.types import (
VectorStoreStatus,
VectorStoreFileStatus,
CreateVectorStoreRequest,
ModifyVectorStoreRequest,
)
from leapfrogai_api.data.crud_vector_store_file import (
CRUDVectorStoreFile,
FilterVectorStoreFile,
Expand Down Expand Up @@ -117,17 +125,172 @@ async def index_file(self, vector_store_id: str, file_id: str) -> VectorStoreFil
await crud_vector_store_file.update(
id_=vector_store_file.id, object_=vector_store_file
)
except Exception as e:
except Exception as exc:
vector_store_file.status = VectorStoreFileStatus.FAILED.value
await crud_vector_store_file.update(
id_=vector_store_file.id, object_=vector_store_file
)
raise e
raise exc

return await crud_vector_store_file.get(
filters=FilterVectorStoreFile(vector_store_id=vector_store_id, id=file_id)
)

async def index_files(
    self, vector_store_id: str, file_ids: list[str]
) -> list[VectorStoreFile]:
    """Index a list of files into a vector store.

    Args:
        vector_store_id: ID of the vector store to index the files into.
        file_ids: IDs of previously uploaded files to index.

    Returns:
        The VectorStoreFile record for every file that was indexed.
        Files that were already indexed are skipped and omitted from
        the result.

    Raises:
        Exception: any error propagated from ``index_file`` other than
            ``FileAlreadyIndexedError``.
    """
    responses = []
    for file_id in file_ids:
        try:
            response = await self.index_file(
                vector_store_id=vector_store_id, file_id=file_id
            )
            responses.append(response)
        except FileAlreadyIndexedError:
            # Re-indexing an already-indexed file is expected and non-fatal.
            logging.info("File %s already exists and cannot be re-indexed", file_id)
            continue
        # Other exceptions propagate naturally; the previous
        # `except Exception as exc: raise exc` was a no-op that only
        # rewrote the traceback origin.

    return responses

async def create_new_vector_store(
    self, request: CreateVectorStoreRequest
) -> VectorStore:
    """Create a new vector store and index any requested files into it.

    The store is created with status ``in_progress``, the requested files
    (if any) are indexed, and the store is then persisted as ``completed``.
    Any failure is surfaced to the caller as an HTTP 400.
    """
    crud = CRUDVectorStore(db=self.db)

    now = int(time.time())
    expires_after, expires_at = request.get_expiry(now)

    try:
        new_store = await crud.create(
            object_=VectorStore(
                id="",  # Blank so Postgres generates a UUID
                usage_bytes=0,  # Automatically calculated by DB
                created_at=0,  # Blank so Postgres generates a timestamp
                file_counts=FileCounts(
                    cancelled=0, completed=0, failed=0, in_progress=0, total=0
                ),
                last_active_at=now,  # Set to current time
                metadata=request.metadata,
                name=request.name,
                object="vector_store",
                status=VectorStoreStatus.IN_PROGRESS.value,
                expires_after=expires_after,
                expires_at=expires_at,
            )
        )

        if request.file_ids != []:
            for file_response in await self.index_files(
                new_store.id, request.file_ids
            ):
                await self._increment_vector_store_file_status(
                    new_store, file_response
                )

        new_store.status = VectorStoreStatus.COMPLETED.value

        return await crud.update(id_=new_store.id, object_=new_store)
    except Exception as exc:
        logging.error(exc)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Unable to parse vector store request",
        ) from exc

async def modify_existing_vector_store(
    self,
    vector_store_id: str,
    request: ModifyVectorStoreRequest,
) -> VectorStore:
    """Modify an existing vector store given its id.

    Looks up the current store, marks it ``in_progress`` for the duration
    of the update, indexes any newly requested files, then persists the
    final state with refreshed activity/expiry timestamps.

    Args:
        vector_store_id: ID of the vector store to modify.
        request: The fields to change; ``file_ids``, when provided, are
            indexed into the store.

    Returns:
        The updated VectorStore record.

    Raises:
        HTTPException: 404 if the store does not exist; 400 if the update
            or indexing fails.
    """
    crud_vector_store = CRUDVectorStore(db=self.db)

    if not (
        old_vector_store := await crud_vector_store.get(
            filters=FilterVectorStore(id=vector_store_id)
        )
    ):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Vector store not found",
        )

    try:
        new_vector_store = VectorStore(
            id=vector_store_id,
            usage_bytes=old_vector_store.usage_bytes,  # Automatically calculated by DB
            created_at=old_vector_store.created_at,
            file_counts=old_vector_store.file_counts,
            last_active_at=old_vector_store.last_active_at,  # Update after indexing files
            metadata=getattr(request, "metadata", old_vector_store.metadata),
            name=getattr(request, "name", old_vector_store.name),
            object="vector_store",
            status=VectorStoreStatus.IN_PROGRESS.value,
            expires_after=old_vector_store.expires_after,
            expires_at=old_vector_store.expires_at,
        )

        await crud_vector_store.update(
            id_=vector_store_id,
            object_=new_vector_store,
        )  # Sets status to in_progress for the duration of this function

        if request.file_ids:
            responses = await self.index_files(
                new_vector_store.id, request.file_ids
            )
            for response in responses:
                await self._increment_vector_store_file_status(
                    new_vector_store, response
                )

        new_vector_store.status = VectorStoreStatus.COMPLETED.value

        last_active_at = int(time.time())
        new_vector_store.last_active_at = (
            last_active_at  # Update after indexing files
        )
        expires_after, expires_at = request.get_expiry(last_active_at)

        # Fix: the guard previously read `if expires_at and expires_at:` —
        # testing the same variable twice — so `expires_after` was never
        # checked before both expiry fields were overwritten. Both halves
        # of the pair returned by get_expiry must be present.
        if expires_after and expires_at:
            new_vector_store.expires_after = expires_after
            new_vector_store.expires_at = expires_at

        return await crud_vector_store.update(
            id_=vector_store_id,
            object_=new_vector_store,
        )
    except Exception as exc:
        logging.error(exc)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Unable to parse vector store request",
        ) from exc

async def file_ids_are_valid(self, file_ids: str | list[str]) -> bool:
    """Return True when every provided file id resolves to an existing file object.

    Accepts either a single id or a list of ids; any lookup failure marks
    the whole set as invalid.
    """
    crud = CRUDFileObject(db=self.db)

    # Normalize a lone id into a one-element list.
    ids = file_ids if isinstance(file_ids, list) else [file_ids]

    try:
        for current_id in ids:
            await crud.get(filters=FilterFileObject(id=current_id))
    except Exception:
        # Treat any lookup error as "ids are not valid".
        return False

    return True

async def adelete_file(self, vector_store_id: str, file_id: str) -> bool:
"""Delete a file from the vector store.
Expand Down Expand Up @@ -217,6 +380,20 @@ async def asimilarity_search(self, query: str, vector_store_id: str, k: int = 4)

return response

async def _increment_vector_store_file_status(
    self, vector_store: VectorStore, file_response: VectorStoreFile
):
    """Increment the file count of a given vector store based on the file response.

    The counter matching the file's status is bumped when the status is
    recognized; ``total`` is incremented unconditionally.
    """
    counts = vector_store.file_counts

    # Map each known status value to the name of its counter field.
    status_to_field = {
        VectorStoreFileStatus.COMPLETED.value: "completed",
        VectorStoreFileStatus.FAILED.value: "failed",
        VectorStoreFileStatus.IN_PROGRESS.value: "in_progress",
        VectorStoreFileStatus.CANCELLED.value: "cancelled",
    }

    field = status_to_field.get(file_response.status)
    if field is not None:
        setattr(counts, field, getattr(counts, field) + 1)

    counts.total += 1

async def _adelete_vector(
self,
vector_store_id: str,
Expand Down
74 changes: 2 additions & 72 deletions src/leapfrogai_api/backend/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,9 @@

from fastapi import UploadFile, Form, File
from openai.types import FileObject
from openai.types.beta import Assistant, AssistantTool
from openai.types.beta import Assistant
from openai.types.beta import VectorStore
from openai.types.beta.assistant import (
ToolResources as BetaAssistantToolResources,
ToolResourcesFileSearch,
)
from openai.types.beta.assistant_tool import FileSearchTool

from openai.types.beta.thread import ToolResources as BetaThreadToolResources
from openai.types.beta.thread_create_params import (
ToolResourcesFileSearchVectorStoreChunkingStrategy,
Expand Down Expand Up @@ -525,72 +521,6 @@ class ListFilesResponse(BaseModel):
#############


class CreateAssistantRequest(BaseModel):
    """Request object for creating an assistant.

    Field types for tools and tool resources come from the ``openai``
    SDK's beta assistant types; every field except ``model`` is optional
    or carries a default.
    """

    model: str = Field(
        default="llama-cpp-python",
        examples=["llama-cpp-python"],
        description="The model to be used by the assistant. Default is 'llama-cpp-python'.",
    )
    name: str | None = Field(
        default=None,
        examples=["Froggy Assistant"],
        description="The name of the assistant. Optional.",
    )
    description: str | None = Field(
        default=None,
        examples=["A helpful assistant."],
        description="A description of the assistant's purpose. Optional.",
    )
    instructions: str | None = Field(
        default=None,
        examples=["You are a helpful assistant."],
        description="Instructions that the assistant should follow. Optional.",
    )
    tools: list[AssistantTool] | None = Field(
        default=None,
        examples=[[FileSearchTool(type="file_search")]],
        description="List of tools the assistant can use. Optional.",
    )
    tool_resources: BetaAssistantToolResources | None = Field(
        default=None,
        examples=[
            BetaAssistantToolResources(
                file_search=ToolResourcesFileSearch(vector_store_ids=[])
            )
        ],
        description="Resources for the tools used by the assistant. Optional.",
    )
    metadata: dict | None = Field(
        default={},
        examples=[{}],
        description="Additional metadata for the assistant. Optional.",
    )
    temperature: float | None = Field(
        default=None,
        examples=[1.0],
        description="Sampling temperature for the model. Optional.",
    )
    top_p: float | None = Field(
        default=None,
        examples=[1.0],
        description="Nucleus sampling parameter. Optional.",
    )
    response_format: Literal["auto"] | None = Field(
        default=None,
        examples=["auto"],
        description="The format of the assistant's responses. Currently only 'auto' is supported. Optional.",
    )


class ModifyAssistantRequest(CreateAssistantRequest):
    """Request object for modifying an assistant.

    Inherits all fields from CreateAssistantRequest; all fields are
    optional for modification, so only the supplied values change.
    """

    # Inherits all fields from CreateAssistantRequest
    # All fields are optional for modification


class ListAssistantsResponse(BaseModel):
"""Response object for listing assistants."""

Expand Down
4 changes: 2 additions & 2 deletions src/leapfrogai_api/data/crud_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,6 @@ async def delete(self, filters: dict | None = None) -> bool:
result = await query.execute()

try:
return True if result.data else None
return True if result.data else False
except Exception:
return None
return False
2 changes: 1 addition & 1 deletion src/leapfrogai_api/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ dependencies = [
"supabase-py-async >= 2.5.6",
"langchain >= 0.2.1",
"langchain-community >= 0.2.1",
"unstructured[md,xlsx] >= 0.14.2", # Only specify necessary filetypes to prevent package bloat (e.g. 130MB vs 6GB)
"unstructured[md,xlsx,pptx] >= 0.14.2", # Only specify necessary filetypes to prevent package bloat (e.g. 130MB vs 6GB)
"pylibmagic >= 0.5.0", # Resolves issue with libmagic not being bundled with OS - https://github.com/ahupp/python-magic/issues/233, may not be needed after this is merged https://github.com/ahupp/python-magic/pull/294
"python-magic >= 0.4.27",
"openpyxl >= 3.1.5",
Expand Down
Loading

0 comments on commit 0716dac

Please sign in to comment.