Skip to content

Commit

Permalink
Enhance processing with exclusions and debug embedding
Browse files Browse the repository at this point in the history
  • Loading branch information
anpetroc committed Oct 13, 2024
1 parent 9dc40e7 commit 5a1201f
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 8 deletions.
48 changes: 40 additions & 8 deletions src/AssistantAI/KnowledgeCollectors/Git.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,10 @@ public override void DoWork()
if (IsShuttingDown)
return;

var embedding = _embeddingClient.GenerateEmbedding("Report outage");
var results = repoInfo.VectorDB.FindWithDistance(embedding.Value.ToFloats());
#if DEBUG // Test embedding
var embedding = _embeddingClient.GenerateEmbedding("Governance Section for copilot");
var results = repoInfo.VectorDB.FindWithDistance(embedding.Value.ToFloats(), maxResultCount: 10);
#endif
}
}

Expand Down Expand Up @@ -123,7 +125,7 @@ private void ProcessCommit(Core.Options.Git.Repository repoInfo, Repository gitR
embedding = _embeddingClient.GenerateEmbedding(commit.MessageShort);
repoInfo.VectorDB.Insert(embedding.Value.ToFloats(), new Sha1(commit.Sha), null);

ProcessChanges(repoInfo.VectorDB, gitRepo, changes, commit.Sha);
ProcessChanges(repoInfo, gitRepo, changes, commit.Sha);

if (repoInfo.VectorDB.Headers.ContainsKey(commitDirection))
repoInfo.VectorDB.Headers[commitDirection] = commit.Sha;
Expand All @@ -136,7 +138,7 @@ private void ProcessCommit(Core.Options.Git.Repository repoInfo, Repository gitR
Debug.WriteLine($"{commitDirection}: {commit.Sha}");
}

private void ProcessChanges(VectorDB<Sha1> vectorDB, Repository repo, TreeChanges changes, string commitSha)
private void ProcessChanges(Core.Options.Git.Repository repoInfo, Repository repo, TreeChanges changes, string commitSha)
{
foreach (var change in changes)
{
Expand All @@ -146,6 +148,17 @@ private void ProcessChanges(VectorDB<Sha1> vectorDB, Repository repo, TreeChange
if (change.Path.IndexOf("/locales/", StringComparison.OrdinalIgnoreCase) >= 0 &&
change.Path.IndexOf("/locales/en-US/", StringComparison.OrdinalIgnoreCase) < 0)
continue;
bool excluded = false;
foreach (var excludedFolder in repoInfo.ExcludedFolders)
{
if (change.Path.IndexOf(excludedFolder.Key, StringComparison.OrdinalIgnoreCase) >= 0)
{
excluded = true;
break;
}
}
if (excluded)
continue;

var oldBlob = repo.Lookup<Blob>(change.OldOid);
var newBlob = repo.Lookup<Blob>(change.Oid);
Expand Down Expand Up @@ -192,7 +205,7 @@ private void ProcessChanges(VectorDB<Sha1> vectorDB, Repository repo, TreeChange
var embeddings = _embeddingClient.GenerateEmbeddings(textBlocks);
foreach (var embedding in embeddings.Value)
{
vectorDB.Insert(embedding.ToFloats(), new Sha1(commitSha), null);
repoInfo.VectorDB.Insert(embedding.ToFloats(), new Sha1(commitSha), null);
}
}
else // use batch embeddings
Expand All @@ -202,7 +215,7 @@ private void ProcessChanges(VectorDB<Sha1> vectorDB, Repository repo, TreeChange
var embeddings = _embeddingClient.GenerateEmbeddings(textBlocks.Skip(i).Take(EmbeddingBatchSize));
foreach (var embedding in embeddings.Value)
{
vectorDB.Insert(embedding.ToFloats(), new Sha1(commitSha), null);
repoInfo.VectorDB.Insert(embedding.ToFloats(), new Sha1(commitSha), null);
}
if (IsShuttingDown)
return;
Expand All @@ -218,7 +231,7 @@ private void ProcessChanges(VectorDB<Sha1> vectorDB, Repository repo, TreeChange
const string DataUri = "'data:image/svg+xml;base64,";

/// <summary>
/// Cleans the line from special characters and data URIs.
/// Strips special characters and data URIs from the line to save tokens.
/// </summary>
private string CleanLine(string content)
{
Expand All @@ -232,7 +245,7 @@ private string CleanLine(string content)
else
content = content.Remove(index);
}
content = content.Trim([',', ';', ':', '\'', '"', '(', ')', '{', '}', '[', ']']);
content = content.Trim([',', ';', ':', '\'', '"', '`', '=', '+', '(', ')', '{', '}', '[', ']']);

content = content.Replace(" { ", " ");
content = content.Replace(" } ", " ");
Expand All @@ -243,6 +256,15 @@ private string CleanLine(string content)
content = content.Replace(" / ", " ");
content = content.Replace(" % ", " ");

content = content.Replace(": '", ": ");
content = content.Replace(": `${", ": ");
content = content.Replace("\": \"", ": ");
content = content.Replace("?: ", ": ");

content = content.Replace("', '", ", ");
content = content.Replace("', ", ", ");
content = content.Replace(", '", ", ");

content = content.Replace(" == ", " ");
content = content.Replace(" => ", " ");
content = content.Replace(" += ", " ");
Expand All @@ -251,9 +273,19 @@ private string CleanLine(string content)
content = content.Replace(" /= ", " ");
content = content.Replace(" %= ", " ");

content = content.Replace(" const ", " ");
content = content.Replace("const ", " ");
content = content.Replace(" return ", " ");
content = content.Replace("return ", " ");

content = CompressSpaces().Replace(content, " ");

return content + "\n";
}

/// <summary>
/// Source-generated regex that matches a string consisting entirely of spaces and the
/// punctuation/symbol characters listed in the character class. Because the quantifier
/// is <c>*</c>, the empty string matches as well.
/// </summary>
[GeneratedRegex("^[ !@#$%^&*()_+\\-=\\[\\]{};':\"\\\\|,.<>\\/?]*$")]
private static partial Regex SpecialCharactersOnly();

/// <summary>
/// Source-generated regex that matches any run of one or more whitespace characters.
/// CleanLine replaces each match with a single space.
/// </summary>
[GeneratedRegex("\\s+")]
private static partial Regex CompressSpaces();
}
1 change: 1 addition & 0 deletions src/Core/Options/Git.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ public class Repository
public required string VectorDBPath { get; set; }
[JsonIgnore]
public required VectorDB.VectorDB<Sha1> VectorDB { get; set; }
public Dictionary<string, string> ExcludedFolders { get; set; } = new Dictionary<string, string>();
}

public IList<Repository> Repositories { get; set; } = new List<Repository>();
Expand Down

0 comments on commit 5a1201f

Please sign in to comment.