Commit

messing with formatting
jonmatthis committed Oct 14, 2024
1 parent ce9c8b4 commit 9cf6a2b
Showing 8 changed files with 74 additions and 64 deletions.
38 changes: 9 additions & 29 deletions src_python/main.py
@@ -1,6 +1,4 @@
import asyncio
import logging
import os
from pathlib import Path

import discord
@@ -11,20 +9,13 @@

from src_python.src.scrape_server.save_to_disk import save_server_data_to_disk
from src_python.src.scrape_server.scrape_server import process_server
from src_python.src.utilities.load_env_variables import DISCORD_DEV_BOT_ID, OUTPUT_DIRECTORY, TARGET_SERVER_ID, \
STUDENT_IDENTIFIERS_CSV_PATH, DISCORD_DEV_BOT_TOKEN

configure_logging()
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv("../.env.analysis")
DISCORD_DEV_BOT_TOKEN = os.getenv('DISCORD_DEV_BOT_TOKEN')
TARGET_SERVER_ID = os.getenv('TARGET_SERVER_ID')
OUTPUT_DIRECTORY = os.getenv('OUTPUT_DIRECTORY')
STUDENT_IDENTIFIERS_CSV_PATH = os.getenv('STUDENT_IDENTIFIERS_CSV_PATH')

# Ensure the environment variables are set
if not DISCORD_DEV_BOT_TOKEN or not OUTPUT_DIRECTORY:
raise ValueError("Please set DISCORD_DEV_BOT_TOKEN and OUTPUT_DIRECTORY in your .env file")

# Initialize the Discord client
client = commands.Bot(command_prefix='!', intents=discord.Intents.all())
@@ -33,6 +24,8 @@
@client.event
async def on_ready():
logger.info(f'Logged in as {client.user.name} (ID: {client.user.id})')
if int(DISCORD_DEV_BOT_ID) != client.user.id:
raise ValueError("Discord bot ID does not match expected ID")
await main_server_scraper(client=client,
target_server_id=TARGET_SERVER_ID,
output_directory=str(Path(OUTPUT_DIRECTORY)),
@@ -52,26 +45,13 @@ async def main_server_scraper(client: commands.Bot,

# class_roster = ClassRosterModel.from_csv(student_identifiers_path)
# save_student_data_to_disk(output_directory=output_directory, server_data=server_data, class_roster=class_roster)
else:
logger.error(f"Could not find server with ID: {target_server_id}")


client.run(DISCORD_DEV_BOT_TOKEN)

if __name__ == "__main__":
from src_python.src.ai.analyze_directory import analyze_directory
from src_python.src.models.extract_text_data import ExtractedTextData
in_server_name = "jonmatthiss_server"
input_directory_out = rf"C:\Users\jonma\Sync\skellybot-data\markdown\{in_server_name}"
output_directory_out = rf"C:\Users\jonma\Sync\skellybot-data\markdown\{in_server_name}_AI_Processed"
classbot_prompt_file = rf"C:\Users\jonma\Sync\skellybot-data\markdown\{in_server_name}\{in_server_name}_classbot_prompt.txt"

with open(classbot_prompt_file, 'r', encoding='utf-8') as f:
classbot_prompt = f.read()

asyncio.run(analyze_directory(input_directory=input_directory_out,
output_directory=output_directory_out,
json_schema_model=ExtractedTextData,
base_prompt_text=classbot_prompt))

logger.info(f"Analysis complete for directory: {input_directory_out}")

print("Done!")
# run this script and the bot will scrape the server on startup
# run the `ai/analyze_directory.py` script to analyze the server data
pass
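
The startup guard added to on_ready fails fast when the configured token belongs to a different bot than expected. A minimal sketch of that pattern, assuming discord.py and using a placeholder ID and token rather than the project's real values:

import discord
from discord.ext import commands

EXPECTED_BOT_ID = 123456789012345678  # hypothetical placeholder, not the real dev-bot ID

client = commands.Bot(command_prefix="!", intents=discord.Intents.all())

@client.event
async def on_ready():
    # Fail fast if the configured token belongs to a different bot
    if client.user.id != EXPECTED_BOT_ID:
        raise ValueError("Discord bot ID does not match expected ID")
    print(f"Logged in as {client.user.name} (ID: {client.user.id})")

# client.run("<DISCORD_DEV_BOT_TOKEN>")  # token elided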
34 changes: 17 additions & 17 deletions src_python/src/ai/analyze_directory.py
@@ -10,15 +10,17 @@

configure_logging()
import logging

logger = logging.getLogger(__name__)

async def analyze_directory(input_directory: str,

async def analyze_directory(base_directory: str,
output_directory: str,
json_schema_model: Type[BaseModel],
base_prompt_text: str,
max_file_count: int = None,
llm_model: str = "gpt-3.5-turbo"):
input_directory_path = Path(input_directory)
input_directory_path = Path(base_directory)
output_directory_path = Path(output_directory)
output_directory_path.mkdir(parents=True, exist_ok=True)

@@ -27,7 +29,6 @@ async def analyze_directory(input_directory: str,
logger.info(f"Analyzing directory: {input_directory_path}")
tasks = []


for file_number, file in enumerate(input_directory_path.rglob('*.md')):
if max_file_count and file_number >= max_file_count:
break
@@ -53,9 +54,9 @@ async def analyze_markdown_file(base_prompt_text: str,
output_parent_path.mkdir(parents=True, exist_ok=True)
try:
constructed_pydantic_model = await analyze_text(input_text=input_file_text,
json_schema_model=json_schema_model,
base_prompt_text=base_prompt_text,
llm_model=llm_model)
json_schema_model=json_schema_model,
base_prompt_text=base_prompt_text,
llm_model=llm_model)
except Exception as e:
logger.error(f"Error analyzing file: {file_path}")
logger.error(e)
@@ -66,7 +67,7 @@ async def analyze_markdown_file(base_prompt_text: str,
logger.info(f"Constructed Pydantic model:\n\n{constructed_pydantic_model}")

output_markdown_string = str(constructed_pydantic_model)
full_output_string = output_markdown_string + "\n\nOriginal text:\n\n```\n\n" + input_file_text + "\n\n``` \n\n"
full_output_string = output_markdown_string + "\n\n___\n\n___\n\nOriginal text:\n\n" + input_file_text
output_file_name = constructed_pydantic_model.filename
save_path = output_parent_path / output_file_name

@@ -79,21 +80,20 @@


if __name__ == "__main__":
from src_python.src.utilities.load_env_variables import OUTPUT_DIRECTORY

in_server_name = "HMN_Fall24"
classbot_prompt_file_name = f"{in_server_name}-prompt.txt"
classbot_prompt_file_path = str(Path(OUTPUT_DIRECTORY) / classbot_prompt_file_name)

in_server_name = "jonmatthiss_server"
input_directory_out = rf"C:\Users\jonma\Sync\skellybot-data\markdown\{in_server_name}"
output_directory_out = rf"C:\Users\jonma\Sync\skellybot-data\markdown\{in_server_name}_AI_Processed"
classbot_prompt_file = rf"C:\Users\jonma\Sync\skellybot-data\markdown\{in_server_name}_prompt.txt"

with open(classbot_prompt_file, 'r', encoding='utf-8') as f:
with open(classbot_prompt_file_path, 'r', encoding='utf-8') as f:
classbot_prompt = f.read()

asyncio.run(analyze_directory(input_directory=input_directory_out,
output_directory=output_directory_out,
asyncio.run(analyze_directory(base_directory=OUTPUT_DIRECTORY,
output_directory=str(Path(OUTPUT_DIRECTORY) / f"{in_server_name}-ai-processed"),
json_schema_model=ExtractedTextData,
base_prompt_text=classbot_prompt))

logger.info(f"Analysis complete for directory: {input_directory_out}")
logger.info(f"Analysis complete for directory: {OUTPUT_DIRECTORY}")

print("Done!")
print("Done!")
3 changes: 2 additions & 1 deletion src_python/src/ai/analyze_text.py
@@ -2,6 +2,7 @@
import logging
import os
import pprint
from typing import Type

import tiktoken
from dotenv import load_dotenv
@@ -23,7 +24,7 @@


async def analyze_text(input_text: str,
json_schema_model: ExtractedTextData,
json_schema_model: Type[ExtractedTextData],
base_prompt_text: str = "",
max_input_tokens: int = 1.6e4,
llm_model: str = "gpt-4o-mini") -> BaseModel:
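
The annotation change from ExtractedTextData to Type[ExtractedTextData] is meaningful: the function receives the model class itself (so its JSON schema can be introspected), not an instance. A minimal sketch of the distinction:

from typing import Type
from pydantic import BaseModel

def describe(model_cls: Type[BaseModel]) -> str:
    # The class, not an instance, is passed in; Pydantic v2 can emit its schema
    return str(model_cls.model_json_schema())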
6 changes: 3 additions & 3 deletions src_python/src/ai/construct_prompt.py
@@ -11,7 +11,7 @@
SANDWICH_CAPPER = "Remember! Your instructions are to: \n\n"


def construct_analyzer_prompt(json_schema_model: ExtractedTextData,
def construct_analyzer_prompt(json_schema_model: Type[ExtractedTextData],
input_text: str,
base_prompt_text: str = "",
) -> str:
@@ -29,7 +29,7 @@ def construct_analyzer_prompt(json_schema_model: ExtractedTextData,

input_text_prompt_string = f"BEGIN INPUT TEXT: \n\n{input_text}\n\n END INPUT TEXT\n\n"

sandwich_cap_prompt = f"{SANDWICH_CAPPER} \n\n {instruction_prompt} \n\n {json_schema_prompt}"
sandwich_cap_prompt = f"{SANDWICH_CAPPER} \n\n {BASE_JSON_PROMPT} \n\n {json_schema_prompt}"


output_prompt = instruction_prompt + "\n\n" + input_text_prompt_string + "\n\n" + sandwich_cap_prompt + "\n"
@@ -54,7 +54,7 @@ def construct_json_prompt(pydantic_model: Type[BaseModel]) -> str:
json_prompt = ['{\n']

for name, field in fields.items():
field_info = pydantic_model.__fields__[name]
field_info = pydantic_model.model_fields[name]
description = field_info.description or ""
json_prompt.append(f'"{name}": ({field_info.annotation}) // {description},')

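
The __fields__ -> model_fields swap tracks the Pydantic v2 API, where field metadata lives in a class-level model_fields dict mapping each field name to a FieldInfo. A minimal sketch with a hypothetical Doc model:

from pydantic import BaseModel, Field

class Doc(BaseModel):
    title: str = Field("", description="A short title for the document")

# Pydantic v2: model_fields maps field names to FieldInfo objects
for name, field_info in Doc.model_fields.items():
    description = field_info.description or ""
    print(f'"{name}": ({field_info.annotation}) // {description},')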
2 changes: 1 addition & 1 deletion src_python/src/configure_logging.py
@@ -98,7 +98,7 @@ class ColoredConsoleHandler(logging.StreamHandler):
"INFO": "\033[96m", # Cyan
"SUCCESS": "\033[95m", # Magenta
"WARNING": "\033[33m", # Yellow
"ERROR": "\033[101m", # Background Dark Red
"ERROR": "\033[30;41m", # Black text on Red background
}

def emit(self, record):
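
The new ERROR color combines two ANSI SGR parameters (30 = black foreground, 41 = red background), where the old value used the single bright-background code 101. A quick way to compare the two styles in a terminal:

RESET = "\033[0m"
print("\033[101m" + "ERROR (old): bright red background" + RESET)
print("\033[30;41m" + "ERROR (new): black text on red background" + RESET)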
28 changes: 17 additions & 11 deletions src_python/src/models/extract_text_data.py
@@ -4,20 +4,20 @@


class ExtractedTextData(BaseModel):
detailed_summary: str = Field("",
description="An exhaustively thorough and detailed summary of the major points of this text in markdown bulleted outline format, like `* point 1\n* point 2\n* point 3` etc")
highlights: str = Field("",
description="A list of the most important points of the text, formatted as a bulleted list")
short_summary: str = Field("", description="A short (2-3 sentence) summary of the text")
very_short_summary: str = Field("", description="A very short one sentence summary of the text")
extremely_short_summary: str = Field("", description="An extremely short 6-10 word summary of the text")
title_slug: str = Field("",
description="The a descriptive title of the text, will be used as the H1 header, the filename slug, and the URL slug. It should be short (only a few words) and provide a terse preview of the basic content of the full text, it should include NO colons")
tags: str = Field("",
description="A list of tags that describe the content of the text, formatted as comma separated #lower-kabob-case. These should be like topic tags that can be used to categorize the text within a larger collection of texts")
description="A list of tags that describe the content of the text, formatted as comma separated #lower-kabob-case. These should be like topic tags that can be used to categorize the text within a larger collection of texts. Ignore conversational aspects (such as 'greetings', 'farewells', 'thanks', etc.)")
extremely_short_summary: str = Field("", description="An extremely short 6-10 word summary of the text")
very_short_summary: str = Field("", description="A very short one sentence summary of the text")
short_summary: str = Field("", description="A short (2-3 sentence) summary of the text")
highlights: str = Field("",
description="A list of the most important points of the text, formatted as a bulleted list")
detailed_summary: str = Field("",
description="An exhaustively thorough and detailed summary of the major points of this text in markdown bulleted outline format, like `* point 1\n* point 2\n* point 3` etc")
backlinks: str = Field("",
description="A list of key concepts and terms that will be used as backlinks in the text, formatted as comma separated wiki style links like `[[backlink 1]], [[backlink 2]], [[backlink 3]]` etc. These shoud be the kinds of things you would expect to find a Wikipedia article about")

description="A list of key words and phrases in the text which will highlighted as [[backlinks]] within the text, These should be the kinds of things you would expect to find a Wikipedia article about. Format this section as comma separated wiki style links like `[[backlink 1]], [[backlink 2]], [[backlink 3]]` etc. ")
pull_quotes: str = Field("", description="A list of the most important quotes from the text which capture the key points of its contentful aspects, formatted as a bulleted list")
@property
def title(self):
return self.title_slug.replace("-", " ").title()
@@ -26,14 +26,18 @@ def title(self):
def filename(self, extension="md"):
if not extension.startswith("."):
extension = "." + extension
return sanitize_name(self.title_slug) + f"{extension}"
return sanitize_name(self.title_slug.lower()) + f"{extension}"

def __str__(self):
tags = "\n".join(self.tags.split(","))
return f"""
# {self.title}\n\n
## Extremely Short Summary\n\n
{self.extremely_short_summary}\n\n
## Highlights\n
{self.highlights}\n\n
## Pull Quotes\n
{self.pull_quotes}\n\n
## Very Short Summary\n
{self.very_short_summary}\n\n
## Short Summary\n
@@ -42,6 +46,8 @@ def __str__(self):
{self.detailed_summary}\n\n
## Tags\n
{tags}\n\n
## Backlinks\n
{self.backlinks}\n\n
"""


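
As a usage sketch, constructing the reordered model with a few fields and printing it exercises the __str__ layout above; the values here are invented:

doc = ExtractedTextData(
    title_slug="terminal-color-codes",
    extremely_short_summary="Notes on ANSI terminal color escape codes",
    tags="#ansi,#terminal",
)
print(doc.title)  # "Terminal Color Codes" -- dashes replaced, title-cased
print(doc)        # the full markdown document assembled by __str__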
7 changes: 5 additions & 2 deletions src_python/src/scrape_server/save_to_disk.py
@@ -5,11 +5,13 @@

from src_python.src.models.server_data_model import ServerData, save_as_markdown_directory, save_as_json
from src_python.src.models.student_info import ClassRosterModel
from src_python.src.utilities.sanitize_filename import sanitize_name

logger = logging.getLogger(__name__)


def save_server_data_to_disk(output_directory: str, server_data: ServerData):

json_save_path = save_as_json(server_data=server_data, output_directory=output_directory)

logger.info(f"Saved server data to disk: {json_save_path}")
@@ -18,12 +20,13 @@ def save_server_data_to_disk(output_directory: str, server_data: ServerData):
pickle.dump(server_data, open(pickle_save_path, 'wb'))
logger.info(f"Saved server data to disk: {pickle_save_path}")
except Exception as e:
logger.error(f"Error saving server data as pickle: {e}")
raise ValueError(f"Error saving server data as pickle: {e}")

try:
markdown_save_path = save_as_markdown_directory(server_data=server_data, output_directory=output_directory)
logger.info(f"Saved server data to disk: {markdown_save_path}")
except Exception as e:
logger.error(f"Error saving server data as markdown: {e}")
raise ValueError(f"Error saving server data as markdown: {e}")


def save_student_data_to_disk(output_directory: str,
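
Both save paths now raise ValueError instead of logging and continuing. One possible refinement (an assumption, not what this commit does) is chaining with `from e` so the original traceback is preserved:

def risky_save() -> None:
    raise OSError("disk full")  # stand-in failure

try:
    risky_save()
except Exception as e:
    # `raise ... from e` attaches the original exception as __cause__
    raise ValueError(f"Error saving server data: {e}") from e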
20 changes: 20 additions & 0 deletions src_python/src/utilities/load_env_variables.py
@@ -0,0 +1,20 @@
import os
from pathlib import Path

from dotenv import load_dotenv
# Load environment variables
env_analysis_path = Path(__file__).parent.parent.parent.parent / ".env.analysis"
if not os.path.exists(env_analysis_path):
raise FileNotFoundError(f".env.analysis file not found at: {env_analysis_path}")
load_dotenv(str(env_analysis_path))


DISCORD_DEV_BOT_TOKEN = os.getenv('DISCORD_DEV_BOT_TOKEN')
DISCORD_DEV_BOT_ID = os.getenv('DISCORD_DEV_BOT_ID')
TARGET_SERVER_ID = os.getenv('TARGET_SERVER_ID')
OUTPUT_DIRECTORY = os.getenv('OUTPUT_DIRECTORY')
STUDENT_IDENTIFIERS_CSV_PATH = os.getenv('STUDENT_IDENTIFIERS_CSV_PATH')

# Ensure the environment variables are set
if not DISCORD_DEV_BOT_TOKEN or not OUTPUT_DIRECTORY:
raise ValueError("Please set DISCORD_DEV_BOT_TOKEN and OUTPUT_DIRECTORY in your .env file")
