Skip to content

Commit

Permalink
Add frontend to QtD app
Browse files Browse the repository at this point in the history
  • Loading branch information
nenb committed Aug 18, 2023
1 parent 8a78eb5 commit 25e109d
Show file tree
Hide file tree
Showing 27 changed files with 5,310 additions and 78 deletions.
13 changes: 10 additions & 3 deletions apps/question-the-docs/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
FROM python:3.10-bullseye

WORKDIR /app

COPY backend backend

RUN pip install --no-cache-dir --upgrade pip \
&& pip install --no-cache-dir -r backend/requirements.in
COPY frontend/ .

RUN apt-get update && apt-get install -y

RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && apt-get install -y nodejs

RUN npm install && npm run build

ENTRYPOINT ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "8080"]
RUN pip install --no-cache-dir --upgrade pip && pip install --no-cache-dir -r backend/requirements.in
20 changes: 17 additions & 3 deletions apps/question-the-docs/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
# Question the Docs
# Question the Docs :book:

FARM stack tutorial here: https://www.mongodb.com/developer/languages/python/farm-stack-fastapi-react-mongodb/
This app introduces the FARMS stack - FastAPI, React, MongoDB and SuperDuperDB. Full details on the FARM stack are available [here](https://www.mongodb.com/developer/languages/python/farm-stack-fastapi-react-mongodb/).

FARM stack repo here: https://github.com/mongodb-developer/FARM-Intro
## Frontend :art:

The frontend has been developed with Node.js version 18.17.1. The packages can be installed with `npm install --prefix frontend/` and the app run with `npm run dev --prefix frontend`.

## Backend :computer:

The backend has been developed with CPython 3.8. To begin, you will need to create a GitHub personal access token (PAT) and set it as an environment variable (`GITHUB_TOKEN`) in your local environment. This token is required for interacting with the GitHub API. See `backend/ai/utils/github.py` for details.

Next, you will need to set up an account with MongoDB Atlas and configure a cluster for access with the app. You should set the URI for this cluster as an environment variable (`mongo_uri`). If all goes well, you should end up with something like `mongo_uri="mongodb+srv://<USER>:<PASSWORD>@<CLUSTER>.qwekqo3.mongodb.net/<DB>?retryWrites=true&w=majority"`. Please contact Timo if there are any issues at this stage.

Finally, you will also need to create an OpenAI account, get a token and set this as an environment variable (`OPENAI_API_KEY`).

After you have set these environment variables, to run the backend, install the Python environment in `backend/requirements.in` and start the webserver (e.g. `uvicorn backend.main:app --host 0.0.0.0 --port 8000 --reload`).

Good luck! :rocket:
23 changes: 12 additions & 11 deletions apps/question-the-docs/backend/ai/artifacts.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,24 @@
from backend.ai.utils.github import get_repo_details, save_github_md_files_locally
from backend.ai.utils.github import save_github_md_files_locally
from backend.ai.utils.text import chunk_file_contents
from backend.config import settings

from superduperdb.container.document import Document
from superduperdb.db.mongodb.query import Collection


def _create_ai_text_artifacts(repo):
    """Fetch the repo's markdown docs and split them into model-sized chunks."""
    # Chunked text is more suitable input for the AI models.
    downloaded = save_github_md_files_locally(repo)
    return chunk_file_contents(downloaded)


def load_ai_artifacts(db):
    """Ingest chunked documentation for each configured repo into the DB.

    Repos that already have a vector index are skipped, so re-running on
    startup does not duplicate data.
    """
    for repo in settings.default_repos:
        # Skip if already exists in database.
        if repo in db.show('vector_index'):
            continue

        artifacts = _create_ai_text_artifacts(repo)
        # Wrap each text chunk so it is stored under the configured
        # embedding key and can be picked up by the vector index.
        documents = [Document({settings.vector_embedding_key: v}) for v in artifacts]
        db.execute(Collection(name=repo).insert_many(documents))


def _create_ai_text_artifacts(repo_details):
    """Download a repo's markdown files and chunk them for the AI models.

    NOTE(review): takes a ``repo_details`` mapping; appears superseded by a
    sibling definition that takes a plain repo key — confirm which survives.
    """
    files = save_github_md_files_locally(repo_details)
    # Chunking yields passage-sized inputs rather than whole files.
    ai_text_artifacts = chunk_file_contents(files)
    return ai_text_artifacts
12 changes: 2 additions & 10 deletions apps/question-the-docs/backend/ai/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,21 @@
INSERT SUMMARY ON THIS MODULE HERE
'''

from backend.ai.utils.github import get_repo_details
from backend.config import settings

from superduperdb.container.listener import Listener
from superduperdb.container.vector_index import VectorIndex
from superduperdb.db.mongodb.query import Collection
from superduperdb.ext.openai.model import OpenAIChatCompletion, OpenAIEmbedding

PROMPT = '''Use the following descriptions and code-snippets to answer the question.
Do NOT use any information you have learned about other python packages.
ONLY base your answer on the code-snippets retrieved:

{context}
Here's the question:
'''


def install_openai_chatbot(db):
db.add(
OpenAIChatCompletion(
takes_context=True,
prompt=PROMPT,
prompt=settings.PROMPT,
model=settings.qa_model,
)
)
Expand All @@ -38,6 +30,7 @@ def install_openai_vector_index(db, repo):
model=OpenAIEmbedding(model=settings.vector_embedding_model),
key=settings.vector_embedding_key,
select=Collection(name=repo).find(),
predict_kwargs={'chunk_size': 100},
),
)
)
Expand All @@ -46,5 +39,4 @@ def install_openai_vector_index(db, repo):
def install_ai_components(db):
install_openai_chatbot(db)
for repo in settings.default_repos:
repo = get_repo_details(repo)['repo']
install_openai_vector_index(db, repo)
51 changes: 25 additions & 26 deletions apps/question-the-docs/backend/ai/utils/github.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,32 @@
"AI helper functions for loading data from GitHub."

import base64
import json
import os
from pathlib import Path

import requests

# Registry of GitHub repositories whose documentation can be ingested.
# Each entry records the GitHub owner, repository name, default branch,
# and the directory inside the repo that holds the markdown docs.
REPOS = {
    'superduperdb': {
        'owner': 'SuperDuperDB',
        'name': 'superduperdb',
        'branch': 'main',
        'documentation_location': 'docs/',
    },
    'langchain': {
        'owner': 'langchain-ai',
        'name': 'langchain',
        'branch': 'master',
        'documentation_location': 'docs/',
    },
    'fastchat': {
        'owner': 'lm-sys',
        'name': 'FastChat',
        'branch': 'main',
        'documentation_location': 'docs/',
    },
}


# TODO: Use GraphQL API instead of REST API and convert to async
def gh_repo_contents(owner, repo, branch=None):
Expand All @@ -31,7 +51,7 @@ def get_repo(branch):
errs.append(e)
continue
raise Exception(
f"Tried `main` and `master` branches, but neither exist. :: reson {errs}"
f"Tried `main` and `master` branches, but neither exist. Reason: {errs}"
)


Expand All @@ -53,41 +73,20 @@ def download_and_decode(url):
return base64.b64decode(blob['content'])


def save_github_md_files_locally(repo):
    """Download a repo's markdown docs into ``docs/<name>/`` and return them.

    ``repo`` is a key into the module-level ``REPOS`` registry. Returns an
    iterator over the downloaded file paths.
    """
    # NOTE: relies on dict insertion order matching the REPOS entry layout.
    owner, name, branch, documentation_location = REPOS[repo].values()

    repo_contents = gh_repo_contents(owner, name, branch)
    urls = documentation_markdown_urls(repo_contents, documentation_location)

    # Tolerate re-runs for the same repo: keep whatever was downloaded before.
    Path(f"docs/{name}").mkdir(exist_ok=True, parents=True)

    for i, url in enumerate(urls):
        content = download_and_decode(url)
        with open(f"docs/{name}/file_{i}", 'wb') as f:
            f.write(content)

    return Path(f"docs/{name}").glob("*")


def get_repo_details(path):
    """Parse a GitHub URL into owner/repo/branch details.

    Expects URLs like ``https://github.com/<owner>/<repo>`` or
    ``https://github.com/<owner>/<repo>/tree/<branch>`` (seven segments
    when split on '/'); ``branch`` is None when the URL has no branch part.
    """
    parts = path.split('/')
    return {
        'owner': parts[3],
        'repo': parts[4],
        'branch': parts[-1] if len(parts) == 7 else None,
        'documentation_location': '',
    }
21 changes: 17 additions & 4 deletions apps/question-the-docs/backend/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,16 @@
from backend.config import settings
from backend.documents.routes import documents_router
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pymongo import MongoClient

from superduperdb import superduper

# TODO: Fix before deployment
origins = [
"*",
]


def init_routers(app: FastAPI) -> None:
app.include_router(documents_router)
Expand All @@ -15,6 +21,14 @@ def init_routers(app: FastAPI) -> None:
def create_app() -> FastAPI:
_app = FastAPI(title="Question the Docs")

_app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)

@_app.on_event("startup")
def startup_db_client():
_app.mongodb_client = MongoClient(settings.mongo_uri)
Expand All @@ -23,12 +37,11 @@ def startup_db_client():
# We wrap our MongoDB to make it a SuperDuperDB!
_app.superduperdb = superduper(_app.mongodb)

# EXPLAIN ARTIFACTS HERE.
# Artifacts are data that has been pre-processed for AI.
load_ai_artifacts(_app.superduperdb)

# We populate our SuperDuperDB with AI components.
# EXPLAIN COMPONENTS HERE.
# These will be used later to answer questions on our data.
# Components are AI models that have been selected based
# on the type artifacts that have been loaded (text, audio, ...).
install_ai_components(_app.superduperdb)

@_app.on_event("shutdown")
Expand Down
23 changes: 15 additions & 8 deletions apps/question-the-docs/backend/config.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,38 @@
import typing as t

from pydantic import BaseSettings


class FastAPISettings(BaseSettings):
    """Core web-app settings; fields are overridable via environment
    variables (pydantic ``BaseSettings`` behaviour), e.g. ``mongo_uri``."""

    # MongoDB connection string; defaults to a local instance.
    mongo_uri: str = 'mongodb://localhost:27017/'
    # Database / collection that hold the documentation.
    mongo_db_name: str = 'documentation'
    mongo_collection_name: str = "docs"
    # Webserver bind address.
    port: int = 8000
    host: str = "0.0.0.0"
    debug_mode: bool = False


class AISettings(FastAPISettings):
    """AI model, ingestion, and retrieval settings."""

    # Model details
    vector_index_name: str = 'documentation_index'
    vector_embedding_model: str = 'text-embedding-ada-002'
    # Document key under which chunk text is stored/embedded.
    vector_embedding_key: str = 'text'
    qa_model: str = 'gpt-3.5-turbo'
    doc_file_levels: int = 3
    doc_file_ext: str = 'md'
    # Keys into the REPOS registry in backend/ai/utils/github.py.
    default_repos: t.List[str] = [
        'superduperdb',
        'langchain',
        'fastchat',
    ]

    # Query configuration
    # Number of nearest-neighbour chunks retrieved as context per query.
    nearest_to_query: int = 5

    PROMPT: str = '''Use the following descriptions and code-snippets to answer the question.
Do NOT use any information you have learned about other python packages.
ONLY base your answer on the code-snippets retrieved:
{context}
Here's the question:
'''


settings = AISettings()
11 changes: 10 additions & 1 deletion apps/question-the-docs/backend/documents/models.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
import typing as t
from enum import Enum

from pydantic import BaseModel, Field


class Repo(str, Enum):
    """Repositories with ingested documentation.

    Values double as the MongoDB collection / vector-index names used by
    the documents routes.
    """

    superduperdb = 'superduperdb'
    langchain = 'langchain'
    fastchat = 'fastchat'


class Query(BaseModel):
    """Request payload for querying the documentation."""

    # The user's natural-language question.
    query: str = Field(...)
    # NOTE(review): both fields below appear in this revision;
    # `document_index` looks superseded by `collection_name` — confirm.
    document_index: str = Field(...)
    collection_name: Repo = Field(...)


class Answer(BaseModel):
Expand Down
5 changes: 2 additions & 3 deletions apps/question-the-docs/backend/documents/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,14 @@ async def query_docs(request: Request, query: Query) -> Answer:
# Step 1: Build your query
# Build your query here combining vector-search "like(...)"
# with classical mongodb queries "find(...)"
collection = Collection(name=query.document_index)
collection = Collection(name=query.collection_name)
context_select = collection.like(
{settings.vector_embedding_key: query.query},
n=settings.nearest_to_query,
vector_index=query.document_index,
vector_index=query.collection_name,
).find()

# Step 2: Execute your query
# INSERT INFORMATION HERE
db = request.app.superduperdb
db_response, _ = await db.apredict(
'gpt-3.5-turbo',
Expand Down
25 changes: 16 additions & 9 deletions apps/question-the-docs/fly.toml
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
# fly.toml app configuration file generated for question-the-doc on 2023-08-16T14:04:57+02:00
# fly.toml app configuration file generated for question-the-docs on 2023-08-18T15:45:29+02:00
#
# See https://fly.io/docs/reference/configuration/ for information about how to use this file.
#

app = "question-the-doc"
primary_region = "cdg"
app = "question-the-docs"
primary_region = "ams"

[build]

[http_service]
internal_port = 8080
force_https = true
auto_stop_machines = true
[processes]
worker = "uvicorn backend.main:app --host 0.0.0.0 --port 8000"

[[services]]
protocol = ""
internal_port = 8000
auto_stop_machines = false
auto_start_machines = true
min_machines_running = 0
processes = ["app"]
min_machines_running = 1
processes = ["worker"]

[[statics]]
guest_path = "/app/build"
url_prefix = "/"
Loading

0 comments on commit 25e109d

Please sign in to comment.