-
Notifications
You must be signed in to change notification settings - Fork 2.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #472 from tarrencev/main
feat: Improve knowledge embeddings
- Loading branch information
Showing
10 changed files
with
201 additions
and
137 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
import { UUID } from "crypto"; | ||
|
||
import { AgentRuntime } from "./runtime.ts"; | ||
import { embed } from "./embedding.ts"; | ||
import { Content, ModelClass, type Memory } from "./types.ts"; | ||
import { stringToUuid } from "./uuid.ts"; | ||
import { embeddingZeroVector } from "./memory.ts"; | ||
import { splitChunks } from "./generation.ts"; | ||
import { models } from "./models.ts"; | ||
import elizaLogger from "./logger.ts"; | ||
|
||
/**
 * Looks up the knowledge documents most relevant to a message.
 *
 * The message text is run through `preprocess`, embedded, and used to search
 * the knowledge manager for at most 3 fragments (match threshold 0.1), with
 * both `roomId` and `agentId` set to `message.agentId`. Each matched
 * fragment's `content.source` is then resolved through the documents manager.
 *
 * @param runtime - Runtime providing the knowledge and documents managers.
 * @param message - Message whose `content.text` is the query.
 * @returns The `content.text` of every distinct source document that was
 *          found; sources whose lookup returned `null` are skipped.
 */
async function get(runtime: AgentRuntime, message: Memory): Promise<string[]> {
    const processed = preprocess(message.content.text);
    elizaLogger.log(`Querying knowledge for: ${processed}`);
    const embedding = await embed(runtime, processed);
    const fragments = await runtime.knowledgeManager.searchMemoriesByEmbedding(
        embedding,
        {
            roomId: message.agentId,
            agentId: message.agentId,
            count: 3,
            match_threshold: 0.1,
        }
    );

    // Several fragments can come from the same document, so collapse the
    // sources through a Set before fetching the documents.
    const uniqueSources = [
        ...new Set(
            fragments.map((memory) => {
                // Log the similarity of the matched fragment itself, not of
                // the incoming query message.
                elizaLogger.log(
                    `Matched fragment: ${memory.content.text} with similarity: ${memory.similarity}`
                );
                return memory.content.source;
            })
        ),
    ];

    const knowledgeDocuments = await Promise.all(
        uniqueSources.map((source) =>
            runtime.documentsManager.getMemoryById(source as UUID)
        )
    );

    const knowledge = knowledgeDocuments
        .filter((memory) => memory !== null)
        .map((memory) => memory.content.text);
    return knowledge;
}
|
||
/**
 * A document to be added to the knowledge base.
 *
 * `set` stores `content` as a document memory under `id`, and records `id`
 * as `content.source` on every fragment memory derived from the document.
 */
export type KnowledgeItem = { | ||
id: UUID; | ||
content: Content; | ||
}; | ||
|
||
/**
 * Stores a knowledge item as one document memory plus one embedded fragment
 * memory per chunk of its preprocessed text.
 *
 * The document is written through the documents manager with the zero
 * embedding vector. Fragments are embedded and written through the knowledge
 * manager one at a time, each pointing back to the document through
 * `content.source`.
 *
 * @param runtime - Runtime providing the managers, agent id and character.
 * @param item - Document id and content to store.
 */
async function set(runtime: AgentRuntime, item: KnowledgeItem) {
    const { id: documentId, content: documentContent } = item;

    await runtime.documentsManager.createMemory({
        id: documentId,
        agentId: runtime.agentId,
        roomId: runtime.agentId,
        userId: runtime.agentId,
        createdAt: Date.now(),
        content: documentContent,
        embedding: embeddingZeroVector,
    });

    const cleanedText = preprocess(documentContent.text);
    const embeddingModel =
        models[runtime.character.modelProvider].model?.[ModelClass.EMBEDDING];
    // NOTE(review): 10 and 5 are passed straight to splitChunks; presumably
    // chunk size and overlap — confirm against its signature.
    const chunks = await splitChunks(cleanedText, 10, embeddingModel, 5);

    for (const chunk of chunks) {
        const chunkEmbedding = await embed(runtime, chunk);
        // The fragment id is derived from the document id plus the chunk
        // text so that it cannot collide with the document's own id.
        const fragmentId = stringToUuid(documentId + chunk);
        await runtime.knowledgeManager.createMemory({
            id: fragmentId,
            roomId: runtime.agentId,
            agentId: runtime.agentId,
            userId: runtime.agentId,
            createdAt: Date.now(),
            content: {
                source: documentId,
                text: chunk,
            },
            embedding: chunkEmbedding,
        });
    }
}
|
||
/**
 * Reduces markdown-flavoured text to a lowercase string containing only
 * ASCII letters, digits and single spaces.
 *
 * Steps, in order: drop fenced and inline code, unwrap headers, keep only the
 * label of images and links, drop HTML-style tags (which also removes
 * `<@123>`-style mentions), drop horizontal rules and code comments, strip
 * every remaining non-alphanumeric character, collapse whitespace, trim and
 * lowercase.
 *
 * @param content - Raw text to normalize.
 * @returns The normalized text, or an empty string when `content` is empty
 *          or is not a string at runtime.
 */
export function preprocess(content: string): string {
    // Guard against a non-string value reaching this function at runtime;
    // there is nothing to normalize in that case.
    if (typeof content !== "string" || content.length === 0) {
        return "";
    }

    return (
        content
            // Remove code blocks and their content
            .replace(/```[\s\S]*?```/g, "")
            // Remove inline code
            .replace(/`.*?`/g, "")
            // Convert headers to plain text
            .replace(/#{1,6}\s*(.*)/g, "$1")
            // Remove image links but keep alt text
            .replace(/!\[(.*?)\]\(.*?\)/g, "$1")
            // Remove links but keep text
            .replace(/\[(.*?)\]\(.*?\)/g, "$1")
            // Remove HTML tags (this also covers <@id> / <@!id> mentions)
            .replace(/<[^>]*>/g, "")
            // Remove horizontal rules
            .replace(/^\s*[-*_]{3,}\s*$/gm, "")
            // Remove block comments
            .replace(/\/\*[\s\S]*?\*\//g, "")
            // Remove line comments, but leave "//" that follows ":" alone so
            // a URL scheme does not swallow the rest of its line
            .replace(/(^|[^:])\/\/.*/g, "$1")
            // Strip all special characters
            .replace(/[^a-zA-Z0-9\s]/g, "")
            // Normalize whitespace last, so gaps left behind by the removed
            // characters collapse to a single space
            .replace(/\s+/g, " ")
            .trim()
            .toLowerCase()
    );
}
|
||
/**
 * Default export bundling the knowledge helpers defined in this module.
 *
 * Exposes this module's `preprocess`; the previous `process` entry resolved
 * to the Node.js global of that name rather than to any function here.
 */
export default {
    get,
    set,
    preprocess,
};
Oops, something went wrong.