Skip to content

Commit

Permalink
Add frontend to QtD app
Browse files Browse the repository at this point in the history
  • Loading branch information
nenb committed Aug 18, 2023
1 parent 8a78eb5 commit 25e109d
Show file tree
Hide file tree
Showing 27 changed files with 5,310 additions and 78 deletions.
13 changes: 10 additions & 3 deletions apps/question-the-docs/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
FROM python:3.10-bullseye

WORKDIR /app

COPY backend backend

RUN pip install --no-cache-dir --upgrade pip \
&& pip install --no-cache-dir -r backend/requirements.in
COPY frontend/ .

RUN apt-get update && apt-get install -y

RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && apt-get install -y nodejs

RUN npm install && npm run build

ENTRYPOINT ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "8080"]
RUN pip install --no-cache-dir --upgrade pip && pip install --no-cache-dir -r backend/requirements.in
20 changes: 17 additions & 3 deletions apps/question-the-docs/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
# Question the Docs
# Question the Docs :book:

FARM stack tutorial here: https://www.mongodb.com/developer/languages/python/farm-stack-fastapi-react-mongodb/
This app introduces the FARMS stack - FastAPI, React, MongoDB and SuperDuperDB. Full details on the FARM stack are available [here](https://www.mongodb.com/developer/languages/python/farm-stack-fastapi-react-mongodb/).

FARM stack repo here: https://github.com/mongodb-developer/FARM-Intro
## Frontend :art:

The frontend has been developed with Node.js version 18.17.1. The packages can be installed with `npm install --prefix frontend/` and the app run with `npm run dev --prefix frontend`.

## Backend :computer:

The backend has been developed with CPython 3.8. To begin, you will need to create a GitHub personal access token (PAT) and set it as an environment variable (`GITHUB_TOKEN`) in your local environment. This token is required for interacting with the GitHub API. See `backend/ai/utils/github.py` for details.

Next, you will need to set up an account with MongoDB Atlas and configure a cluster for access with the app. You should set the URI for this cluster as an environment variable (`mongo_uri`). If all goes well, you should end up with something like `mongo_uri="mongodb+srv://<USER>:<PASSWORD>@<CLUSTER>.qwekqo3.mongodb.net/<DB>?retryWrites=true&w=majority"`. Please contact Timo if there are any issues at this stage.

Finally, you will also need to create an OpenAI account, get a token and set this as an environment variable (`OPENAI_API_KEY`).

After you have set these environment variables, to run the backend, install the Python environment in `backend/requirements.in` and start the webserver (e.g. `uvicorn backend.main:app --host 0.0.0.0 --port 8000 --reload`).

Good luck! :rocket:
23 changes: 12 additions & 11 deletions apps/question-the-docs/backend/ai/artifacts.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,24 @@
from backend.ai.utils.github import get_repo_details, save_github_md_files_locally
from backend.ai.utils.github import save_github_md_files_locally
from backend.ai.utils.text import chunk_file_contents
from backend.config import settings

from superduperdb.container.document import Document
from superduperdb.db.mongodb.query import Collection


def _create_ai_text_artifacts(repo):
    """Fetch the repo's markdown docs and split them into model-sized chunks."""
    # Chunked text is more suitable input for the AI models.
    downloaded = save_github_md_files_locally(repo)
    return chunk_file_contents(downloaded)


def load_ai_artifacts(db):
    """Ingest chunked documentation for each configured repo into the DB.

    Repos that already have a vector index are skipped, so re-running on
    startup does not duplicate data.
    """
    for repo in settings.default_repos:
        # Skip if already exists in database.
        if repo in db.show('vector_index'):
            continue

        artifacts = _create_ai_text_artifacts(repo)
        # Wrap each text chunk so it is stored under the configured
        # embedding key and can be picked up by the vector index.
        documents = [Document({settings.vector_embedding_key: v}) for v in artifacts]
        db.execute(Collection(name=repo).insert_many(documents))


def _create_ai_text_artifacts(repo_details):
    """Download a repo's markdown files and chunk them for the AI models.

    NOTE(review): takes a ``repo_details`` mapping; appears superseded by a
    sibling definition that takes a plain repo key — confirm which survives.
    """
    files = save_github_md_files_locally(repo_details)
    # Chunking yields passage-sized inputs rather than whole files.
    ai_text_artifacts = chunk_file_contents(files)
    return ai_text_artifacts
12 changes: 2 additions & 10 deletions apps/question-the-docs/backend/ai/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,21 @@
INSERT SUMMARY ON THIS MODULE HERE
'''

from backend.ai.utils.github import get_repo_details
from backend.config import settings

from superduperdb.container.listener import Listener
from superduperdb.container.vector_index import VectorIndex
from superduperdb.db.mongodb.query import Collection
from superduperdb.ext.openai.model import OpenAIChatCompletion, OpenAIEmbedding

PROMPT = '''Use the following descriptions and code-snippets to answer the question.
Do NOT use any information you have learned about other python packages.
ONLY base your answer on the code-snippets retrieved:

{context}
Here's the question:
'''


def install_openai_chatbot(db):
db.add(
OpenAIChatCompletion(
takes_context=True,
prompt=PROMPT,
prompt=settings.PROMPT,
model=settings.qa_model,
)
)
Expand All @@ -38,6 +30,7 @@ def install_openai_vector_index(db, repo):
model=OpenAIEmbedding(model=settings.vector_embedding_model),
key=settings.vector_embedding_key,
select=Collection(name=repo).find(),
predict_kwargs={'chunk_size': 100},
),
)
)
Expand All @@ -46,5 +39,4 @@ def install_openai_vector_index(db, repo):
def install_ai_components(db):
install_openai_chatbot(db)
for repo in settings.default_repos:
repo = get_repo_details(repo)['repo']
install_openai_vector_index(db, repo)
51 changes: 25 additions & 26 deletions apps/question-the-docs/backend/ai/utils/github.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,32 @@
"AI helper functions for loading data from GitHub."

import base64
import json
import os
from pathlib import Path

import requests

# Registry of GitHub repositories whose documentation can be ingested.
# Each entry records the GitHub owner, repository name, default branch,
# and the directory inside the repo that holds the markdown docs.
REPOS = {
    'superduperdb': {
        'owner': 'SuperDuperDB',
        'name': 'superduperdb',
        'branch': 'main',
        'documentation_location': 'docs/',
    },
    'langchain': {
        'owner': 'langchain-ai',
        'name': 'langchain',
        'branch': 'master',
        'documentation_location': 'docs/',
    },
    'fastchat': {
        'owner': 'lm-sys',
        'name': 'FastChat',
        'branch': 'main',
        'documentation_location': 'docs/',
    },
}


# TODO: Use GraphQL API instead of REST API and convert to async
def gh_repo_contents(owner, repo, branch=None):
Expand All @@ -31,7 +51,7 @@ def get_repo(branch):
errs.append(e)
continue
raise Exception(
f"Tried `main` and `master` branches, but neither exist. :: reson {errs}"
f"Tried `main` and `master` branches, but neither exist. Reason: {errs}"
)


Expand All @@ -53,41 +73,20 @@ def download_and_decode(url):
return base64.b64decode(blob['content'])


def save_github_md_files_locally(repo):
    """Download a repo's markdown docs into ``docs/<name>/`` and return them.

    ``repo`` is a key into the module-level ``REPOS`` registry. Returns an
    iterator over the downloaded file paths.
    """
    # NOTE: relies on dict insertion order matching the REPOS entry layout.
    owner, name, branch, documentation_location = REPOS[repo].values()

    repo_contents = gh_repo_contents(owner, name, branch)
    urls = documentation_markdown_urls(repo_contents, documentation_location)

    # Tolerate re-runs for the same repo: keep whatever was downloaded before.
    Path(f"docs/{name}").mkdir(exist_ok=True, parents=True)

    for i, url in enumerate(urls):
        content = download_and_decode(url)
        with open(f"docs/{name}/file_{i}", 'wb') as f:
            f.write(content)

    return Path(f"docs/{name}").glob("*")


def get_repo_details(path):
    """Parse a GitHub URL into owner/repo/branch details.

    Expects URLs like ``https://github.com/<owner>/<repo>`` or
    ``https://github.com/<owner>/<repo>/tree/<branch>`` (seven segments
    when split on '/'); ``branch`` is None when the URL has no branch part.
    """
    parts = path.split('/')
    return {
        'owner': parts[3],
        'repo': parts[4],
        'branch': parts[-1] if len(parts) == 7 else None,
        'documentation_location': '',
    }
21 changes: 17 additions & 4 deletions apps/question-the-docs/backend/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,16 @@
from backend.config import settings
from backend.documents.routes import documents_router
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pymongo import MongoClient

from superduperdb import superduper

# TODO: Fix before deployment
origins = [
"*",
]


def init_routers(app: FastAPI) -> None:
app.include_router(documents_router)
Expand All @@ -15,6 +21,14 @@ def init_routers(app: FastAPI) -> None:
def create_app() -> FastAPI:
_app = FastAPI(title="Question the Docs")

_app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)

@_app.on_event("startup")
def startup_db_client():
_app.mongodb_client = MongoClient(settings.mongo_uri)
Expand All @@ -23,12 +37,11 @@ def startup_db_client():
# We wrap our MongoDB to make it a SuperDuperDB!
_app.superduperdb = superduper(_app.mongodb)

# EXPLAIN ARTIFACTS HERE.
# Artifacts are data that has been pre-processed for AI.
load_ai_artifacts(_app.superduperdb)

# We populate our SuperDuperDB with AI components.
# EXPLAIN COMPONENTS HERE.
# These will be used later to answer questions on our data.
# Components are AI models that have been selected based
# on the type artifacts that have been loaded (text, audio, ...).
install_ai_components(_app.superduperdb)

@_app.on_event("shutdown")
Expand Down
23 changes: 15 additions & 8 deletions apps/question-the-docs/backend/config.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,38 @@
import typing as t

from pydantic import BaseSettings


class FastAPISettings(BaseSettings):
    """Core web-app settings; fields are overridable via environment
    variables (pydantic ``BaseSettings`` behaviour), e.g. ``mongo_uri``."""

    # MongoDB connection string; defaults to a local instance.
    mongo_uri: str = 'mongodb://localhost:27017/'
    # Database / collection that hold the documentation.
    mongo_db_name: str = 'documentation'
    mongo_collection_name: str = "docs"
    # Webserver bind address.
    port: int = 8000
    host: str = "0.0.0.0"
    debug_mode: bool = False


class AISettings(FastAPISettings):
    """AI model, ingestion, and retrieval settings."""

    # Model details
    vector_index_name: str = 'documentation_index'
    vector_embedding_model: str = 'text-embedding-ada-002'
    # Document key under which chunk text is stored/embedded.
    vector_embedding_key: str = 'text'
    qa_model: str = 'gpt-3.5-turbo'
    doc_file_levels: int = 3
    doc_file_ext: str = 'md'
    # Keys into the REPOS registry in backend/ai/utils/github.py.
    default_repos: t.List[str] = [
        'superduperdb',
        'langchain',
        'fastchat',
    ]

    # Query configuration
    # Number of nearest-neighbour chunks retrieved as context per query.
    nearest_to_query: int = 5

    PROMPT: str = '''Use the following descriptions and code-snippets to answer the question.
Do NOT use any information you have learned about other python packages.
ONLY base your answer on the code-snippets retrieved:
{context}
Here's the question:
'''


settings = AISettings()
11 changes: 10 additions & 1 deletion apps/question-the-docs/backend/documents/models.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
import typing as t
from enum import Enum

from pydantic import BaseModel, Field


class Repo(str, Enum):
    """Repositories with ingested documentation.

    Values double as the MongoDB collection / vector-index names used by
    the documents routes.
    """

    superduperdb = 'superduperdb'
    langchain = 'langchain'
    fastchat = 'fastchat'


class Query(BaseModel):
    """Request payload for querying the documentation."""

    # The user's natural-language question.
    query: str = Field(...)
    # NOTE(review): both fields below appear in this revision;
    # `document_index` looks superseded by `collection_name` — confirm.
    document_index: str = Field(...)
    collection_name: Repo = Field(...)


class Answer(BaseModel):
Expand Down
5 changes: 2 additions & 3 deletions apps/question-the-docs/backend/documents/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,14 @@ async def query_docs(request: Request, query: Query) -> Answer:
# Step 1: Build your query
# Build your query here combining vector-search "like(...)"
# with classical mongodb queries "find(...)"
collection = Collection(name=query.document_index)
collection = Collection(name=query.collection_name)
context_select = collection.like(
{settings.vector_embedding_key: query.query},
n=settings.nearest_to_query,
vector_index=query.document_index,
vector_index=query.collection_name,
).find()

# Step 2: Execute your query
# INSERT INFORMATION HERE
db = request.app.superduperdb
db_response, _ = await db.apredict(
'gpt-3.5-turbo',
Expand Down
25 changes: 16 additions & 9 deletions apps/question-the-docs/fly.toml
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
# fly.toml app configuration file generated for question-the-doc on 2023-08-16T14:04:57+02:00
# fly.toml app configuration file generated for question-the-docs on 2023-08-18T15:45:29+02:00
#
# See https://fly.io/docs/reference/configuration/ for information about how to use this file.
#

app = "question-the-doc"
primary_region = "cdg"
app = "question-the-docs"
primary_region = "ams"

[build]

[http_service]
internal_port = 8080
force_https = true
auto_stop_machines = true
[processes]
worker = "uvicorn backend.main:app --host 0.0.0.0 --port 8000"

[[services]]
protocol = ""
internal_port = 8000
auto_stop_machines = false
auto_start_machines = true
min_machines_running = 0
processes = ["app"]
min_machines_running = 1
processes = ["worker"]

[[statics]]
guest_path = "/app/build"
url_prefix = "/"
Loading

0 comments on commit 25e109d

Please sign in to comment.