Refactor/source url qtd #721

Merged · 2 commits · Aug 19, 2023
13 changes: 9 additions & 4 deletions apps/question-the-docs/backend/ai/artifacts.py
@@ -7,17 +7,22 @@
 
 
 def load_ai_artifacts(db):
-    for repo_url in settings.default_repos:
-        details = get_repo_details(repo_url)
+    for name, repo in settings.default_repos.items():
+        details = get_repo_details(repo)
         repo = details['repo']
         if repo in db.show('vector_index'):
             continue
         artifacts = _create_ai_text_artifacts(details)
-        documents = [Document({settings.vector_embedding_key: v}) for v in artifacts]
+        documents = [
+            Document(
+                {settings.vector_embedding_key: row['text'], 'src_url': row['src_url']}
+            )
+            for _, row in artifacts.iterrows()
+        ]
         db.execute(Collection(name=repo).insert_many(documents))
 
 
 def _create_ai_text_artifacts(repo_details):
     files = save_github_md_files_locally(repo_details)
-    ai_text_artifacts = chunk_file_contents(files)
+    ai_text_artifacts = chunk_file_contents(repo_details['repo'], files)
     return ai_text_artifacts
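Note: the loader now expects `_create_ai_text_artifacts` to return a DataFrame rather than a plain array of strings. A minimal sketch of the assumed row shape and how it maps to insert payloads (plain pandas, hypothetical data; the `Document`/`Collection` calls themselves are as shown in the diff):

```python
import pandas as pd

# Hypothetical chunks as produced by the refactored chunker: one row per
# text window, each carrying the docs URL it was extracted from.
artifacts = pd.DataFrame(
    {
        'text': ['## Quickstart\nInstall with pip ...', '## Usage\nConnect to a database ...'],
        'src_url': [
            'https://superduperdb.github.io/superduperdb/quickstart.html#quickstart',
            'https://superduperdb.github.io/superduperdb/usage.html#usage',
        ],
    }
)

# Mirrors the new list comprehension in load_ai_artifacts: one payload per row,
# keyed by the embedding field plus the new 'src_url' metadata.
payloads = [
    {'text': row['text'], 'src_url': row['src_url']}
    for _, row in artifacts.iterrows()
]
print(payloads[0]['src_url'])
```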
2 changes: 1 addition & 1 deletion apps/question-the-docs/backend/ai/components.py
@@ -45,6 +45,6 @@ def install_openai_vector_index(db, repo):
 
 def install_ai_components(db):
     install_openai_chatbot(db)
-    for repo in settings.default_repos:
+    for name, repo in settings.default_repos.items():
         repo = get_repo_details(repo)['repo']
         install_openai_vector_index(db, repo)
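For context, a hedged sketch of the details dict `get_repo_details` is expected to return for one of the new config entries (values are illustrative; the `owner` key is assumed from how it is read elsewhere and is not visible in this diff):

```python
# One entry from the new settings.default_repos mapping ...
repo = {
    'url': 'https://github.com/SuperDuperDB/superduperdb/tree/main',
    'documentation_url': 'https://superduperdb.github.io/superduperdb',
}

# ... and roughly what get_repo_details(repo) produces for it after this PR.
details = {
    'owner': 'SuperDuperDB',           # assumed key, not shown in the diff
    'repo': 'superduperdb',
    'branch': 'main',
    'documentation_location': 'docs',  # new default, was ''
    'documentation_base_url': 'https://superduperdb.github.io/superduperdb',
}
```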
31 changes: 27 additions & 4 deletions apps/question-the-docs/backend/ai/utils/github.py
@@ -3,10 +3,13 @@
 import base64
 import json
 import os
+import re
 from pathlib import Path
 
 import requests
 
+URL_CACHE = {}
+
 
 # TODO: Use GraphQL API instead of REST API and convert to async
 def gh_repo_contents(owner, repo, branch=None):
@@ -39,7 +42,7 @@ def documentation_markdown_urls(repo_contents, documentation_location):
     urls = []
     for val in repo_contents['tree']:
         if documentation_location in val['path'] and val['path'].endswith('.md'):
-            urls.append(val['url'])
+            urls.append(val)
         else:
             continue
     return urls
@@ -59,6 +62,7 @@ def save_github_md_files_locally(repo_details):
     name = repo_details['repo']
     branch = repo_details['branch']
    documentation_location = repo_details['documentation_location']
+    documentation_base_url = repo_details['documentation_base_url']
 
     repo_contents = gh_repo_contents(owner, name, branch)
     urls = documentation_markdown_urls(repo_contents, documentation_location)
@@ -68,18 +72,36 @@ def save_github_md_files_locally(repo_details):
     except FileExistsError:
         raise FileExistsError(f"Directory docs/{name} already exists.")
 
+    URL_CACHE[name] = urls
+    doc_base_path = (
+        lambda path, section: f"{documentation_base_url}/{path}.html#{section}"
+    )
+
     for i, url in enumerate(urls):
-        content = download_and_decode(url)
+        content = download_and_decode(url['url'])
+        sections = re.findall(r'^\s*(#+)\s*(.*)', content.decode('utf-8'), re.MULTILINE)
+        for _, s in sections:
+            relative_path = url['path']
+            relative_path = '/'.join(relative_path.split('/')[1:])
+            section_file = os.path.splitext(relative_path)[0]
+            s_encoded = s.replace(' ', '-').lower()
+            if documentation_base_url:
+                URL_CACHE[(name, s)] = doc_base_path(section_file, s_encoded)
+            else:
+                URL_CACHE[(name, s)] = relative_path
+
         with open(f"docs/{name}/file_{i}", 'wb') as f:
             f.write(content)
 
     return Path(f"docs/{name}").glob("*")
 
 
-def get_repo_details(path):
+def get_repo_details(repo):
+    path = repo['url']
+    documentation_base_url = repo['documentation_url']
     path_split = path.split('/')
     branch = None
-    documentation_location = ''
+    documentation_location = 'docs'
     owner = path_split[3]
     repo = path_split[4]
     if len(path_split) == 7:
@@ -89,5 +111,6 @@ def get_repo_details(path):
         'repo': repo,
         'branch': branch,
         'documentation_location': documentation_location,
+        'documentation_base_url': documentation_base_url,
     }
     return details
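In short, the new code maps every markdown heading to an anchor on the published docs site. A self-contained sketch of that URL scheme, assuming the heading regex and slug rules from the diff (function name and sample inputs are illustrative):

```python
import re


def section_urls(markdown, relative_path, documentation_base_url):
    """Map each heading in a markdown file to a docs URL.

    `relative_path` is the repo path with its leading component (e.g. 'docs/')
    already stripped, as in save_github_md_files_locally.
    """
    urls = {}
    page = relative_path.rsplit('.', 1)[0]
    for _, title in re.findall(r'^\s*(#+)\s*(.*)', markdown, re.MULTILINE):
        slug = title.replace(' ', '-').lower()
        if documentation_base_url:
            urls[title] = f"{documentation_base_url}/{page}.html#{slug}"
        else:
            urls[title] = relative_path  # no docs site: fall back to the repo path
    return urls


# Hypothetical example:
md = "# Getting Started\n\n## Install SuperDuperDB\n..."
print(section_urls(md, 'intro/getting_started.md',
                   'https://superduperdb.github.io/superduperdb'))
```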
17 changes: 14 additions & 3 deletions apps/question-the-docs/backend/ai/utils/text.py
@@ -1,8 +1,10 @@
 "AI helper functions for text processing."
 
 import enum
+import re
 
 import pandas as pd
+from backend.ai.utils.github import URL_CACHE
 
 
 class TextProcessing(enum.Enum):
@@ -11,6 +13,7 @@ class TextProcessing(enum.Enum):
 
 
 def chunk_text_with_sliding_window(
+    repo: str,
     df: pd.DataFrame,
     window_size: int,
     stride: int,
@@ -20,25 +23,33 @@
     context = []
     n = len(df)
 
+    curr_title = ""
+    titles = []
     for i in range(0, n, stride):
         if i + window_size <= n or n - i >= 2:
             window_text = combine.join(df[text_col].iloc[i : min(i + window_size, n)])
+            title = re.findall(r'^\s*(#+)\s*(.*)', window_text, re.MULTILINE)
+            if title:
+                curr_title = title[0][-1]
             context.append(window_text)
+            url = URL_CACHE.get((repo, curr_title), 'nan')
+            titles.append(url)
 
-    return pd.DataFrame({text_col: context})
+    return pd.DataFrame({text_col: context, 'src_url': titles})
 
 
-def chunk_file_contents(files):
+def chunk_file_contents(repo, files):
     context_dfs = []
     for file in files:
         with open(file, 'r') as f:
             content = f.readlines()
         content_df = pd.DataFrame({"text": content})
         df = chunk_text_with_sliding_window(
+            repo,
             content_df,
             window_size=TextProcessing.window_size.value,
             stride=TextProcessing.stride.value,
         )
         context_dfs.append(df)
     df = pd.concat(context_dfs)
-    return df["text"].values
+    return df
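The chunker now threads the repo name through so each text window can be tied back to a source URL via `URL_CACHE`. A hedged, standalone sketch of that logic with a local dict standing in for the shared cache (window/stride values are illustrative):

```python
import re

import pandas as pd


def chunk_with_urls(repo, lines, url_cache, window_size=10, stride=5):
    # Same sliding-window loop as chunk_text_with_sliding_window: the last
    # heading seen "sticks" until a later window introduces a new one, so
    # every chunk picks up a source URL.
    context, urls = [], []
    curr_title = ""
    n = len(lines)
    for i in range(0, n, stride):
        if i + window_size <= n or n - i >= 2:
            window_text = "\n".join(lines[i:min(i + window_size, n)])
            headings = re.findall(r'^\s*(#+)\s*(.*)', window_text, re.MULTILINE)
            if headings:
                curr_title = headings[0][-1]
            context.append(window_text)
            urls.append(url_cache.get((repo, curr_title), 'nan'))
    return pd.DataFrame({'text': context, 'src_url': urls})


cache = {('superduperdb', 'Quickstart'):
         'https://superduperdb.github.io/superduperdb/quickstart.html#quickstart'}
lines = ["# Quickstart", "Install the package.", "Run the demo.", "Check the output."]
print(chunk_with_urls('superduperdb', lines, cache, window_size=2, stride=2))
```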
20 changes: 14 additions & 6 deletions apps/question-the-docs/backend/config.py
@@ -16,13 +16,21 @@ class AISettings(FastAPISettings):
     vector_embedding_model: str = 'text-embedding-ada-002'
     vector_embedding_key: str = 'text'
     qa_model: str = 'gpt-3.5-turbo'
     doc_file_levels: int = 3
     doc_file_ext: str = 'md'
-    default_repos: list = [
-        'https://github.com/SuperDuperDB/superduperdb/tree/main',
-        'https://github.com/langchain-ai/langchain/tree/master',
-        'https://github.com/lm-sys/FastChat/tree/main'
-    ]
+    default_repos: dict = {
+        'superduperdb': {
+            'url': 'https://github.com/SuperDuperDB/superduperdb/tree/main',
+            'documentation_url': 'https://superduperdb.github.io/superduperdb',
+        },
+        'langchain': {
+            'url': 'https://github.com/langchain-ai/langchain/tree/master',
+            'documentation_url': '',
+        },
+        'fastchat': {
+            'url': 'https://github.com/lm-sys/FastChat/tree/main',
+            'documentation_url': '',
+        },
+    }
 
     # Query configuration
     nearest_to_query: int = 5
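The repo configuration is now keyed by name, with an optional `documentation_url` per repo. A small sketch of how downstream code iterates this mapping (stand-in dict; `langchain` omitted for brevity, and the owner/name split mirrors `get_repo_details`):

```python
# Stand-in for the new settings.default_repos shape.
default_repos = {
    'superduperdb': {
        'url': 'https://github.com/SuperDuperDB/superduperdb/tree/main',
        'documentation_url': 'https://superduperdb.github.io/superduperdb',
    },
    'fastchat': {
        'url': 'https://github.com/lm-sys/FastChat/tree/main',
        'documentation_url': '',  # empty -> src_url falls back to a repo-relative path
    },
}

for name, repo in default_repos.items():
    owner, repo_name = repo['url'].split('/')[3:5]
    print(name, owner, repo_name, repo['documentation_url'] or '(no docs site)')
```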
1 change: 1 addition & 0 deletions apps/question-the-docs/backend/documents/models.py
@@ -8,3 +8,4 @@ class Query(BaseModel):
 
 class Answer(BaseModel):
     answer: str = Field(...)
+    source_urls: list = Field(...)
8 changes: 5 additions & 3 deletions apps/question-the-docs/backend/documents/routes.py
@@ -21,15 +21,17 @@ async def query_docs(request: Request, query: Query) -> Answer:
         n=settings.nearest_to_query,
         vector_index=query.document_index,
     ).find()
+    db = request.app.superduperdb
+
+    contexts = list(db.execute(context_select))
+    src_urls = [context.unpack()['src_url'] for context in contexts]
 
     # Step 2: Execute your query
     # INSERT INFORMATION HERE
-    db = request.app.superduperdb
     db_response, _ = await db.apredict(
         'gpt-3.5-turbo',
         input=query.query,
         context_select=context_select,
         context_key=settings.vector_embedding_key,
     )
 
-    return Answer(answer=db_response.unpack())
+    return Answer(answer=db_response.unpack(), source_urls=src_urls)
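End to end, the endpoint now returns the retrieved chunks' `src_url` values alongside the answer. A hypothetical response body for illustration (field values invented; one URL per retrieved context document, duplicates included, in retrieval order):

```python
# Hypothetical JSON payload matching the updated Answer model.
example_answer = {
    'answer': 'Install with `pip install superduperdb`, then wrap your database with superduper(...).',
    'source_urls': [
        'https://superduperdb.github.io/superduperdb/quickstart.html#installation',
        'https://superduperdb.github.io/superduperdb/quickstart.html#installation',
        'https://superduperdb.github.io/superduperdb/usage.html#queries',
    ],
}
```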