import json
import logging
from pathlib import Path
import fire
import rich
from git import Repo
from phi.assistant import Assistant
from phi.knowledge import AssistantKnowledge
from phi.llm.ollama import Ollama
from phi.prompt import PromptTemplate
from rich.box import ROUNDED
from rich.table import Table


def init_cudeschin(update=True):
"""Download (and update) the Cudeschin aka the documents we want to RAG on."""
cudeschin_path = Path("./cudeschin")
if not cudeschin_path.exists():
        Repo.clone_from("https://github.com/scout-ch/cudeschin.git", cudeschin_path)
elif update:
Repo(cudeschin_path).remote("origin").pull()
return cudeschin_path


def init_knowledge_base(cudeschin_path: Path, n_documents=3) -> AssistantKnowledge:
    """
    Set up and load the knowledge base. This is (arguably) the most important part of the whole application and
    defines the R in RAG. The most common approach is to use a vector database and embed chunks of the documents for
    a semantic similarity search (filling the same role as traditional TF-IDF retrieval). Other options include a
    (fuzzy) keyword search, sketched after this function.
    """
from langchain.text_splitter import MarkdownHeaderTextSplitter
    from langchain_community.document_loaders import DirectoryLoader, TextLoader
    from langchain_community.embeddings import OllamaEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from phi.knowledge.langchain import LangChainKnowledgeBase
loader = DirectoryLoader(cudeschin_path / "content/de", glob="**/*.md",
loader_cls=TextLoader,
loader_kwargs=dict(encoding="utf-8"))
    # MarkdownHeaderTextSplitter doesn't derive from TextSplitter -> no split_documents(), so manual aggregation over
    # the loaded documents is necessary
splitter = MarkdownHeaderTextSplitter(
# cudeschin only contains these two header types
headers_to_split_on=[("###", "Ueberschrift"), ("####", "Unterkapitel")], strip_headers=False)
o_docs = loader.load()
docs = []
for o_doc in o_docs:
chunks = splitter.split_text(o_doc.page_content)
for chunk in chunks:
chunk.metadata.update(o_doc.metadata)
docs.extend(chunks)
# splits on all headers and always keeps them. Derives from TextSplitter. Doesn't set is_separator_regex even though it should?
# splitter = MarkdownTextSplitter(is_separator_regex=True)
# docs = splitter.split_documents(loader.load())
    embeddings = OllamaEmbeddings(model="jina/jina-embeddings-v2-base-de")  # make sure to use a German embedding model
store = InMemoryVectorStore.from_documents(docs, embeddings)
retriever = store.as_retriever(search_kwargs=dict(k=n_documents))
# passing num_documents here literally only changes the log message
return LangChainKnowledgeBase(retriever=retriever, num_documents=n_documents)
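

# A minimal sketch of the (fuzzy) keyword-search alternative mentioned in the docstring above: BM25 over the raw
# markdown files instead of embeddings. Untested assumption: it relies on langchain_community's BM25Retriever (which
# needs the rank_bm25 package installed) and is not wired into the CLI below.
def init_keyword_knowledge_base(cudeschin_path: Path, n_documents=3) -> AssistantKnowledge:
    from langchain_community.document_loaders import DirectoryLoader, TextLoader
    from langchain_community.retrievers import BM25Retriever
    from phi.knowledge.langchain import LangChainKnowledgeBase
    loader = DirectoryLoader(cudeschin_path / "content/de", glob="**/*.md",
                             loader_cls=TextLoader,
                             loader_kwargs=dict(encoding="utf-8"))
    retriever = BM25Retriever.from_documents(loader.load())
    retriever.k = n_documents  # how many documents to return per query
    return LangChainKnowledgeBase(retriever=retriever, num_documents=n_documents)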


def init_assistant(cudeschin: AssistantKnowledge, temperature=0.1, debug=False):
"""
Set up the actual assistant with its model, templates and prompts.
"""
    # Results are generally very sensitive to prompting, especially when working in a non-English language.
    # https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/
    # Also, something to be careful about (or it can get confusing) is the pre-configured system prompt from Ollama.
    # Both phidata and Ollama have some opinions about system prompts and prompt templates in general, so here's a
    # short breakdown:
    # - whenever Ollama is used, you are using the template in the modelfile, which defines how to pass messages
    #   or prompts to the model. In the case of llama3.1 it seems to format multiple messages plainly according to
    #   the specs above, but if you include any mention of tools, it will add an English prompt saying "you're able
    #   to use tools", which might not be what you want. Another thing that confuses me is that there is no
    #   <|begin_of_text|> tag despite the specs saying you need one AND there is an unconditional <|eot_id|> tag, even
    #   if there's no user message. This might be an error, but after some testing it seems to perform the same as
    #   formatting it according to the specs (see the custom modelfile in the file tree).
    # - when using phidata, it will use the ollama-python library, which calls the /api/chat endpoint
    #   (https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-chat-completion). This endpoint accepts
    #   messages from a conversation where each message has a role (system, assistant, user, tool). These messages
    #   are then formatted according to the prompt template in the modelfile.
    # TODO you could also build one of these from scratch without PhiData, but so far I like the level of abstraction.
    #  Also, using LangChain all the way might be interesting for more complex scenarios because it's a bit lower
    #  level, while still being more general than building directly on top of the Ollama library, for example.
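    # For illustration, a rough sketch of the kind of /api/chat call that phidata ends up making through
    # ollama-python (commented out so it is not executed; the exact return type differs between ollama-python versions):
    # import ollama
    # reply = ollama.chat(
    #     model="llama3.1:8b-instruct-q5_K_M",
    #     messages=[
    #         {"role": "system", "content": "Du bist ein Schweizer Pfadi-Coach ..."},
    #         {"role": "user", "content": "Was ist ein LS-Block?"},
    #     ],
    # )
    # print(reply["message"]["content"])  # or reply.message.content, depending on the version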
return Assistant(
llm=Ollama(
# TODO experiment with different model versions, quantizations and instruct vs chat.
# model version heavily influences prompting style
model="llama3.1:8b-instruct-q5_K_M",
# model="llama3.1-instruct-custom", # seemingly no difference in performance
options=dict(temperature=temperature), # TODO experiment with different temperature settings
),
# TODO fiddle around with the prompts, I don't think I like things like DSPy.
system_prompt=
"Du bist ein Schweizer Pfadi-Coach und unterstützst die Leitpersonen beim Planen und Durchführen "
"von Aktivitäten und Lagern indem du ihre Fragen zur Pfadi faktisch beantwortest und "
"dafür mitgelieferte Informationen zur Pfadi und Jungend und Sport (J+S) verwendetst.\n"
"<instructions>\n"
"1. Beantworte die Fragen immer auf Deutsch.\n"
"2. Beantworte die Fragen, indem du Inhalte aus der mitgelieferten knowledge_base zitierst oder umschreibst. "
"Nutze dazu das Feld `content` der knowledge_base-Einträge.\n"
"3. Gib jeweils an von welcher Datei die Informationen "
"stammen (Feld `meta_data.source`). SCHREIBE NICHTS ÜBER DIE STRUKTUR DER KNOWLEDGE_BASE, VERWENDE SIE NUR!\n"
# "3. Die mitgelieferten Informationen enthalten meistens Links zu weiterführenden Informationen, "
# "erwähne diese jeweils am Ende der Antwort.\n"
"4. Formatiere deine Antwort mithilfe von Markdown.\n"
"</instructions>",
        # NOTE: phidata formats the retrieved documents as JSON and inserts the JSON string into the prompt
        # at `references`. If you need a different format for the knowledge base, you have to pass a `reference_function`
        # to the assistant, which takes the assistant, query and num_documents and returns a string representation of
        # the retrieved documents (see the hedged sketch after this function). Also note that only the stringified
        # references are added to memory, so if you want to avoid decoding them again from JSON, you'd need extra
        # logic. Or just go with a lower-level framework like LangChain.
user_prompt_template=PromptTemplate(template="Beantworte folgende Frage zur Pfadi mithilfe der Informationen "
"aus der strukturierten knowledge_base unterhalb: {message}\n\n"
"<knowledge_base>\n{references}\n</knowledge_base>"),
knowledge_base=cudeschin,
# TODO try a version where the model has to use function calls to query the knowledge base (autonomous RAG).
# I suspect the results will be worse for small models but better for large models (that can write good queries).
add_references_to_prompt=True, # force references and always query the knowledge base
        # these are all False by default btw, I just want to be explicit about them and give reasons why.
        add_chat_history_to_prompt=False,  # sends all the messages to the chat endpoint to simulate a continued
        # conversation, BUT it also adds an extra (English) prompt with all previous messages. There doesn't seem to
        # be a way to separate those, but I'm not sure and would have to dig deeper.
        markdown=False,  # avoid the extra (English) prompt specifying to write in Markdown
create_memories=False, # no need to store history atm
debug=debug,
save_output_to_file="./output.md",
)
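

# A hedged sketch of the custom reference formatting mentioned above the user_prompt_template: a function like this
# could be passed to the Assistant to splice plain text instead of JSON into the prompt. The signature follows the
# description in that comment; the exact keyword argument name (e.g. `references_function`) as well as
# `knowledge_base.search()` should be checked against the installed phidata version. Not used below.
def plain_text_references(assistant: Assistant, query: str, num_documents=None) -> str:
    docs = assistant.knowledge_base.search(query=query, num_documents=num_documents)  # assumed phidata search API
    return "\n\n".join(doc.content for doc in docs)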


def improve_question(prompt: str) -> str:
    """
    Improve the user prompt for specific questions where you know the model (or, more likely, the retrieval) struggles.
    I would generally advise against things like this and would instead focus on other parts of the pipeline, but then
    again, trying to get an LLM to do what you want is inherently hacky af.
    """
return (prompt.replace("LS", "J+S Lagersport (LS)").
replace("LA", "J+S Lageraktivität (LA)").
replace("LP", "J+S Lagerprogramm (LP)"))


def cli(question: str, *, n_documents=3, temperature=0.1, debug=False, prompt_hacks=True):
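    """Ask the assistant a single Pfadi question and print its answer together with the retrieved references."""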
if debug:
logging.getLogger("phi").setLevel(logging.DEBUG)
cudeschin_path = init_cudeschin(update=True)
cudeschin = init_knowledge_base(cudeschin_path, n_documents)
assistant = init_assistant(cudeschin, temperature, debug)
if prompt_hacks:
question = improve_question(question)
assistant.print_response(question)
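    # Only the stringified JSON references end up in memory (see the NOTE in init_assistant), so decode them again
    # here to show the user which chunks were retrieved.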
used_references = json.loads(assistant.memory.references[-1].references)
rich.print("Folgende Referenzen wurden verwendet (wenn diese unpassend sind, versuche die Frage umzuformulieren):")
table = Table(box=ROUNDED, border_style="blue", show_header=True, show_lines=True)
table.add_column("Quelle")
table.add_column("Inhalt")
for ref in used_references:
meta = ref["meta_data"]
file = Path(meta["source"]).name
headers = ""
if "Ueberschrift" in meta:
headers += meta["Ueberschrift"]
if "Unterkapitel" in meta:
if headers:
headers += " / "
headers += meta["Unterkapitel"]
source = file + (f" ({headers})" if headers else "")
table.add_row(source, ref["content"])
rich.print(table)


if __name__ == '__main__':
fire.Fire(cli)