Refactor/source url qtd #721

Merged · 2 commits · Aug 19, 2023
13 changes: 9 additions & 4 deletions apps/question-the-docs/backend/ai/artifacts.py
@@ -7,17 +7,22 @@
 
 
 def load_ai_artifacts(db):
-    for repo_url in settings.default_repos:
-        details = get_repo_details(repo_url)
+    for name, repo in settings.default_repos.items():
+        details = get_repo_details(repo)
         repo = details['repo']
         if repo in db.show('vector_index'):
             continue
         artifacts = _create_ai_text_artifacts(details)
-        documents = [Document({settings.vector_embedding_key: v}) for v in artifacts]
+        documents = [
+            Document(
+                {settings.vector_embedding_key: row['text'], 'src_url': row['src_url']}
+            )
+            for _, row in artifacts.iterrows()
+        ]
         db.execute(Collection(name=repo).insert_many(documents))
 
 
 def _create_ai_text_artifacts(repo_details):
     files = save_github_md_files_locally(repo_details)
-    ai_text_artifacts = chunk_file_contents(files)
+    ai_text_artifacts = chunk_file_contents(repo_details['repo'], files)
     return ai_text_artifacts
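Note: the loader now expects `_create_ai_text_artifacts` to return a DataFrame rather than a plain array of strings. A minimal sketch of the assumed row shape and how it maps to insert payloads (plain pandas, hypothetical data; the `Document`/`Collection` calls themselves are as shown in the diff):

```python
import pandas as pd

# Hypothetical chunks as produced by the refactored chunker: one row per
# text window, each carrying the docs URL it was extracted from.
artifacts = pd.DataFrame(
    {
        'text': ['## Quickstart\nInstall with pip ...', '## Usage\nConnect to a database ...'],
        'src_url': [
            'https://superduperdb.github.io/superduperdb/quickstart.html#quickstart',
            'https://superduperdb.github.io/superduperdb/usage.html#usage',
        ],
    }
)

# Mirrors the new list comprehension in load_ai_artifacts: one payload per row,
# keyed by the embedding field plus the new 'src_url' metadata.
payloads = [
    {'text': row['text'], 'src_url': row['src_url']}
    for _, row in artifacts.iterrows()
]
print(payloads[0]['src_url'])
```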
2 changes: 1 addition & 1 deletion apps/question-the-docs/backend/ai/components.py
@@ -45,6 +45,6 @@ def install_openai_vector_index(db, repo):
 
 def install_ai_components(db):
     install_openai_chatbot(db)
-    for repo in settings.default_repos:
+    for name, repo in settings.default_repos.items():
         repo = get_repo_details(repo)['repo']
         install_openai_vector_index(db, repo)
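For context, a hedged sketch of the details dict `get_repo_details` is expected to return for one of the new config entries (values are illustrative; the `owner` key is assumed from how it is read elsewhere and is not visible in this diff):

```python
# One entry from the new settings.default_repos mapping ...
repo = {
    'url': 'https://github.com/SuperDuperDB/superduperdb/tree/main',
    'documentation_url': 'https://superduperdb.github.io/superduperdb',
}

# ... and roughly what get_repo_details(repo) produces for it after this PR.
details = {
    'owner': 'SuperDuperDB',           # assumed key, not shown in the diff
    'repo': 'superduperdb',
    'branch': 'main',
    'documentation_location': 'docs',  # new default, was ''
    'documentation_base_url': 'https://superduperdb.github.io/superduperdb',
}
```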
31 changes: 27 additions & 4 deletions apps/question-the-docs/backend/ai/utils/github.py
@@ -3,10 +3,13 @@
 import base64
 import json
 import os
+import re
 from pathlib import Path
 
 import requests
 
+URL_CACHE = {}
+
 
 # TODO: Use GraphQL API instead of REST API and convert to async
 def gh_repo_contents(owner, repo, branch=None):
@@ -39,7 +42,7 @@ def documentation_markdown_urls(repo_contents, documentation_location):
     urls = []
     for val in repo_contents['tree']:
         if documentation_location in val['path'] and val['path'].endswith('.md'):
-            urls.append(val['url'])
+            urls.append(val)
         else:
             continue
     return urls
@@ -59,6 +62,7 @@ def save_github_md_files_locally(repo_details):
     name = repo_details['repo']
     branch = repo_details['branch']
    documentation_location = repo_details['documentation_location']
+    documentation_base_url = repo_details['documentation_base_url']
 
     repo_contents = gh_repo_contents(owner, name, branch)
     urls = documentation_markdown_urls(repo_contents, documentation_location)
@@ -68,18 +72,36 @@ def save_github_md_files_locally(repo_details):
     except FileExistsError:
         raise FileExistsError(f"Directory docs/{name} already exists.")
 
+    URL_CACHE[name] = urls
+    doc_base_path = (
+        lambda path, section: f"{documentation_base_url}/{path}.html#{section}"
+    )
+
     for i, url in enumerate(urls):
-        content = download_and_decode(url)
+        content = download_and_decode(url['url'])
+        sections = re.findall(r'^\s*(#+)\s*(.*)', content.decode('utf-8'), re.MULTILINE)
+        for _, s in sections:
+            relative_path = url['path']
+            relative_path = '/'.join(relative_path.split('/')[1:])
+            section_file = os.path.splitext(relative_path)[0]
+            s_encoded = s.replace(' ', '-').lower()
+            if documentation_base_url:
+                URL_CACHE[(name, s)] = doc_base_path(section_file, s_encoded)
+            else:
+                URL_CACHE[(name, s)] = relative_path
+
         with open(f"docs/{name}/file_{i}", 'wb') as f:
             f.write(content)
 
     return Path(f"docs/{name}").glob("*")
 
 
-def get_repo_details(path):
+def get_repo_details(repo):
+    path = repo['url']
+    documentation_base_url = repo['documentation_url']
     path_split = path.split('/')
     branch = None
-    documentation_location = ''
+    documentation_location = 'docs'
     owner = path_split[3]
     repo = path_split[4]
     if len(path_split) == 7:
@@ -89,5 +111,6 @@ def get_repo_details(path):
         'repo': repo,
         'branch': branch,
         'documentation_location': documentation_location,
+        'documentation_base_url': documentation_base_url,
     }
     return details
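In short, the new code maps every markdown heading to an anchor on the published docs site. A self-contained sketch of that URL scheme, assuming the heading regex and slug rules from the diff (function name and sample inputs are illustrative):

```python
import re


def section_urls(markdown, relative_path, documentation_base_url):
    """Map each heading in a markdown file to a docs URL.

    `relative_path` is the repo path with its leading component (e.g. 'docs/')
    already stripped, as in save_github_md_files_locally.
    """
    urls = {}
    page = relative_path.rsplit('.', 1)[0]
    for _, title in re.findall(r'^\s*(#+)\s*(.*)', markdown, re.MULTILINE):
        slug = title.replace(' ', '-').lower()
        if documentation_base_url:
            urls[title] = f"{documentation_base_url}/{page}.html#{slug}"
        else:
            urls[title] = relative_path  # no docs site: fall back to the repo path
    return urls


# Hypothetical example:
md = "# Getting Started\n\n## Install SuperDuperDB\n..."
print(section_urls(md, 'intro/getting_started.md',
                   'https://superduperdb.github.io/superduperdb'))
```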
17 changes: 14 additions & 3 deletions apps/question-the-docs/backend/ai/utils/text.py
@@ -1,8 +1,10 @@
 "AI helper functions for text processing."
 
 import enum
+import re
 
 import pandas as pd
+from backend.ai.utils.github import URL_CACHE
 
 
 class TextProcessing(enum.Enum):
@@ -11,6 +13,7 @@ class TextProcessing(enum.Enum):
 
 
 def chunk_text_with_sliding_window(
+    repo: str,
     df: pd.DataFrame,
     window_size: int,
     stride: int,
@@ -20,25 +23,33 @@
     context = []
     n = len(df)
 
+    curr_title = ""
+    titles = []
     for i in range(0, n, stride):
         if i + window_size <= n or n - i >= 2:
             window_text = combine.join(df[text_col].iloc[i : min(i + window_size, n)])
+            title = re.findall(r'^\s*(#+)\s*(.*)', window_text, re.MULTILINE)
+            if title:
+                curr_title = title[0][-1]
             context.append(window_text)
+            url = URL_CACHE.get((repo, curr_title), 'nan')
+            titles.append(url)
 
-    return pd.DataFrame({text_col: context})
+    return pd.DataFrame({text_col: context, 'src_url': titles})
 
 
-def chunk_file_contents(files):
+def chunk_file_contents(repo, files):
     context_dfs = []
     for file in files:
         with open(file, 'r') as f:
             content = f.readlines()
         content_df = pd.DataFrame({"text": content})
         df = chunk_text_with_sliding_window(
+            repo,
             content_df,
             window_size=TextProcessing.window_size.value,
             stride=TextProcessing.stride.value,
         )
         context_dfs.append(df)
     df = pd.concat(context_dfs)
-    return df["text"].values
+    return df
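The chunker now threads the repo name through so each text window can be tied back to a source URL via `URL_CACHE`. A hedged, standalone sketch of that logic with a local dict standing in for the shared cache (window/stride values are illustrative):

```python
import re

import pandas as pd


def chunk_with_urls(repo, lines, url_cache, window_size=10, stride=5):
    # Same sliding-window loop as chunk_text_with_sliding_window: the last
    # heading seen "sticks" until a later window introduces a new one, so
    # every chunk picks up a source URL.
    context, urls = [], []
    curr_title = ""
    n = len(lines)
    for i in range(0, n, stride):
        if i + window_size <= n or n - i >= 2:
            window_text = "\n".join(lines[i:min(i + window_size, n)])
            headings = re.findall(r'^\s*(#+)\s*(.*)', window_text, re.MULTILINE)
            if headings:
                curr_title = headings[0][-1]
            context.append(window_text)
            urls.append(url_cache.get((repo, curr_title), 'nan'))
    return pd.DataFrame({'text': context, 'src_url': urls})


cache = {('superduperdb', 'Quickstart'):
         'https://superduperdb.github.io/superduperdb/quickstart.html#quickstart'}
lines = ["# Quickstart", "Install the package.", "Run the demo.", "Check the output."]
print(chunk_with_urls('superduperdb', lines, cache, window_size=2, stride=2))
```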
20 changes: 14 additions & 6 deletions apps/question-the-docs/backend/config.py
@@ -16,13 +16,21 @@ class AISettings(FastAPISettings):
     vector_embedding_model: str = 'text-embedding-ada-002'
     vector_embedding_key: str = 'text'
     qa_model: str = 'gpt-3.5-turbo'
     doc_file_levels: int = 3
     doc_file_ext: str = 'md'
-    default_repos: list = [
-        'https://github.com/SuperDuperDB/superduperdb/tree/main',
-        'https://github.com/langchain-ai/langchain/tree/master',
-        'https://github.com/lm-sys/FastChat/tree/main'
-    ]
+    default_repos: dict = {
+        'superduperdb': {
+            'url': 'https://github.com/SuperDuperDB/superduperdb/tree/main',
+            'documentation_url': 'https://superduperdb.github.io/superduperdb',
+        },
+        'langchain': {
+            'url': 'https://github.com/langchain-ai/langchain/tree/master',
+            'documentation_url': '',
+        },
+        'fastchat': {
+            'url': 'https://github.com/lm-sys/FastChat/tree/main',
+            'documentation_url': '',
+        },
+    }
 
     # Query configuration
     nearest_to_query: int = 5
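The repo configuration is now keyed by name, with an optional `documentation_url` per repo. A small sketch of how downstream code iterates this mapping (stand-in dict; `langchain` omitted for brevity, and the owner/name split mirrors `get_repo_details`):

```python
# Stand-in for the new settings.default_repos shape.
default_repos = {
    'superduperdb': {
        'url': 'https://github.com/SuperDuperDB/superduperdb/tree/main',
        'documentation_url': 'https://superduperdb.github.io/superduperdb',
    },
    'fastchat': {
        'url': 'https://github.com/lm-sys/FastChat/tree/main',
        'documentation_url': '',  # empty -> src_url falls back to a repo-relative path
    },
}

for name, repo in default_repos.items():
    owner, repo_name = repo['url'].split('/')[3:5]
    print(name, owner, repo_name, repo['documentation_url'] or '(no docs site)')
```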
1 change: 1 addition & 0 deletions apps/question-the-docs/backend/documents/models.py
@@ -8,3 +8,4 @@ class Query(BaseModel):
 
 class Answer(BaseModel):
     answer: str = Field(...)
+    source_urls: list = Field(...)
8 changes: 5 additions & 3 deletions apps/question-the-docs/backend/documents/routes.py
@@ -21,15 +21,17 @@ async def query_docs(request: Request, query: Query) -> Answer:
         n=settings.nearest_to_query,
         vector_index=query.document_index,
     ).find()
+    db = request.app.superduperdb
+
+    contexts = list(db.execute(context_select))
+    src_urls = [context.unpack()['src_url'] for context in contexts]
 
     # Step 2: Execute your query
     # INSERT INFORMATION HERE
-    db = request.app.superduperdb
     db_response, _ = await db.apredict(
         'gpt-3.5-turbo',
         input=query.query,
         context_select=context_select,
         context_key=settings.vector_embedding_key,
     )
 
-    return Answer(answer=db_response.unpack())
+    return Answer(answer=db_response.unpack(), source_urls=src_urls)
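End to end, the endpoint now returns the retrieved chunks' `src_url` values alongside the answer. A hypothetical response body for illustration (field values invented; one URL per retrieved context document, duplicates included, in retrieval order):

```python
# Hypothetical JSON payload matching the updated Answer model.
example_answer = {
    'answer': 'Install with `pip install superduperdb`, then wrap your database with superduper(...).',
    'source_urls': [
        'https://superduperdb.github.io/superduperdb/quickstart.html#installation',
        'https://superduperdb.github.io/superduperdb/quickstart.html#installation',
        'https://superduperdb.github.io/superduperdb/usage.html#queries',
    ],
}
```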