From 7457c50bc752f51957306fe1e2542038d2d2504f Mon Sep 17 00:00:00 2001 From: westey <164392973+westey-m@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:30:05 +0000 Subject: [PATCH] .Net: Add more steps to getting started project. (#9522) ### Motivation and Context #7606 ### Description - Add a non string key common code step - Add a generic data model step - Add a custom mapper step ### Contribution Checklist - [x] The code builds clean without any errors or warnings - [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations - [x] All unit tests pass, and I have added new tests where possible - [x] I didn't break anyone :smile: --- .../GettingStartedWithVectorStores.csproj | 1 + .../Step3_Switch_VectorStore.cs | 2 +- .../Step4_NonStringKey_VectorStore.cs | 195 ++++++++++++++++++ .../Step5_Use_GenericDataModel.cs | 78 +++++++ .../Step6_Use_CustomMapper.cs | 149 +++++++++++++ 5 files changed, 424 insertions(+), 1 deletion(-) create mode 100644 dotnet/samples/GettingStartedWithVectorStores/Step4_NonStringKey_VectorStore.cs create mode 100644 dotnet/samples/GettingStartedWithVectorStores/Step5_Use_GenericDataModel.cs create mode 100644 dotnet/samples/GettingStartedWithVectorStores/Step6_Use_CustomMapper.cs diff --git a/dotnet/samples/GettingStartedWithVectorStores/GettingStartedWithVectorStores.csproj b/dotnet/samples/GettingStartedWithVectorStores/GettingStartedWithVectorStores.csproj index 1e95a8187551..7a33e7c2fa3b 100644 --- a/dotnet/samples/GettingStartedWithVectorStores/GettingStartedWithVectorStores.csproj +++ b/dotnet/samples/GettingStartedWithVectorStores/GettingStartedWithVectorStores.csproj @@ -42,6 +42,7 @@ + diff --git a/dotnet/samples/GettingStartedWithVectorStores/Step3_Switch_VectorStore.cs 
b/dotnet/samples/GettingStartedWithVectorStores/Step3_Switch_VectorStore.cs index 9255b51b78ea..cc6c7443968c 100644 --- a/dotnet/samples/GettingStartedWithVectorStores/Step3_Switch_VectorStore.cs +++ b/dotnet/samples/GettingStartedWithVectorStores/Step3_Switch_VectorStore.cs @@ -22,7 +22,7 @@ public class Step3_Switch_VectorStore(ITestOutputHelper output, VectorStoresFixt [Fact] public async Task UseAnAzureAISearchVectorStoreAsync() { - // Construct a Redis vector store and get the collection. + // Construct an Azure AI Search vector store and get the collection. var vectorStore = new AzureAISearchVectorStore(new SearchIndexClient( new Uri(TestConfiguration.AzureAISearch.Endpoint), new AzureKeyCredential(TestConfiguration.AzureAISearch.ApiKey))); diff --git a/dotnet/samples/GettingStartedWithVectorStores/Step4_NonStringKey_VectorStore.cs b/dotnet/samples/GettingStartedWithVectorStores/Step4_NonStringKey_VectorStore.cs new file mode 100644 index 000000000000..906df16d84a1 --- /dev/null +++ b/dotnet/samples/GettingStartedWithVectorStores/Step4_NonStringKey_VectorStore.cs @@ -0,0 +1,195 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Runtime.CompilerServices; +using Microsoft.Extensions.VectorData; +using Microsoft.SemanticKernel.Connectors.Qdrant; +using Qdrant.Client; + +namespace GettingStartedWithVectorStores; + +/// +/// Example that shows that you can switch between different vector stores with the same code, in this case +/// with a vector store that doesn't use string keys. +/// This sample demonstrates one possible approach, however it is also possible to use generics +/// in the common code to achieve code reuse. +/// +public class Step4_NonStringKey_VectorStore(ITestOutputHelper output, VectorStoresFixture fixture) : BaseTest(output), IClassFixture +{ + /// + /// Here we are going to use the same code that we used in <see cref="Step1_Ingest_Data"/> and <see cref="Step2_Vector_Search"/> + /// but now with a Qdrant vector store collection. + /// Qdrant uses Guid or ulong as the key type, but the common code works with a string key.
The string keys of the records created + /// in <see cref="Step1_Ingest_Data"/> contain numbers though, so it's possible for us to convert them to ulong. + /// In this example, we'll demonstrate how to do that. + /// + /// This example requires a Qdrant server up and running. To run a Qdrant server in a Docker container, use the following command: + /// docker run -d --name qdrant -p 6333:6333 -p 6334:6334 qdrant/qdrant:latest + /// + [Fact] + public async Task UseAQdrantVectorStoreAsync() + { + // Construct a Qdrant vector store collection. + var collection = new QdrantVectorStoreRecordCollection(new QdrantClient("localhost"), "skglossary"); + + // Wrap the collection using a decorator that allows us to expose a version that uses string keys, but internally + // we convert to and from ulong. + var stringKeyCollection = new MappingVectorStoreRecordCollection( + collection, + p => ulong.Parse(p), + i => i.ToString(), + p => new UlongGlossary { Key = ulong.Parse(p.Key), Category = p.Category, Term = p.Term, Definition = p.Definition, DefinitionEmbedding = p.DefinitionEmbedding }, + i => new Glossary { Key = i.Key.ToString("D"), Category = i.Category, Term = i.Term, Definition = i.Definition, DefinitionEmbedding = i.DefinitionEmbedding }); + + // Ingest data into the collection using the same code as we used in Step1 with the InMemory Vector Store. + await Step1_Ingest_Data.IngestDataIntoVectorStoreAsync(stringKeyCollection, fixture.TextEmbeddingGenerationService); + + // Search the vector store using the same code as we used in Step2 with the InMemory Vector Store. + var searchResultItem = await Step2_Vector_Search.SearchVectorStoreAsync( + stringKeyCollection, + "What is an Application Programming Interface?", + fixture.TextEmbeddingGenerationService); + + // Write the search result with its score to the console. + Console.WriteLine(searchResultItem.Record.Definition); + Console.WriteLine(searchResultItem.Score); + } + + /// + /// Data model that uses a ulong as the key type instead of a string.
+ /// + private sealed class UlongGlossary + { + [VectorStoreRecordKey] + public ulong Key { get; set; } + + [VectorStoreRecordData(IsFilterable = true)] + public string Category { get; set; } + + [VectorStoreRecordData] + public string Term { get; set; } + + [VectorStoreRecordData] + public string Definition { get; set; } + + [VectorStoreRecordVector(Dimensions: 1536)] + public ReadOnlyMemory DefinitionEmbedding { get; set; } + } + + /// + /// Simple decorator class that allows conversion of keys and records from one type to another. + /// + private sealed class MappingVectorStoreRecordCollection : IVectorStoreRecordCollection + where TPublicKey : notnull + where TInternalKey : notnull + { + private readonly IVectorStoreRecordCollection _collection; + private readonly Func _publicToInternalKeyMapper; + private readonly Func _internalToPublicKeyMapper; + private readonly Func _publicToInternalRecordMapper; + private readonly Func _internalToPublicRecordMapper; + + public MappingVectorStoreRecordCollection( + IVectorStoreRecordCollection collection, + Func publicToInternalKeyMapper, + Func internalToPublicKeyMapper, + Func publicToInternalRecordMapper, + Func internalToPublicRecordMapper) + { + this._collection = collection; + this._publicToInternalKeyMapper = publicToInternalKeyMapper; + this._internalToPublicKeyMapper = internalToPublicKeyMapper; + this._publicToInternalRecordMapper = publicToInternalRecordMapper; + this._internalToPublicRecordMapper = internalToPublicRecordMapper; + } + + /// + public string CollectionName => this._collection.CollectionName; + + /// + public Task CollectionExistsAsync(CancellationToken cancellationToken = default) + { + return this._collection.CollectionExistsAsync(cancellationToken); + } + + /// + public Task CreateCollectionAsync(CancellationToken cancellationToken = default) + { + return this._collection.CreateCollectionAsync(cancellationToken); + } + + /// + public Task CreateCollectionIfNotExistsAsync(CancellationToken 
cancellationToken = default) + { + return this._collection.CreateCollectionIfNotExistsAsync(cancellationToken); + } + + /// + public Task DeleteAsync(TPublicKey key, DeleteRecordOptions? options = null, CancellationToken cancellationToken = default) + { + return this._collection.DeleteAsync(this._publicToInternalKeyMapper(key), options, cancellationToken); + } + + /// + public Task DeleteBatchAsync(IEnumerable keys, DeleteRecordOptions? options = null, CancellationToken cancellationToken = default) + { + return this._collection.DeleteBatchAsync(keys.Select(this._publicToInternalKeyMapper), options, cancellationToken); + } + + /// + public Task DeleteCollectionAsync(CancellationToken cancellationToken = default) + { + return this._collection.DeleteCollectionAsync(cancellationToken); + } + + /// + public async Task GetAsync(TPublicKey key, GetRecordOptions? options = null, CancellationToken cancellationToken = default) + { + var internalRecord = await this._collection.GetAsync(this._publicToInternalKeyMapper(key), options, cancellationToken).ConfigureAwait(false); + if (internalRecord == null) + { + return default; + } + + return this._internalToPublicRecordMapper(internalRecord); + } + + /// + public IAsyncEnumerable GetBatchAsync(IEnumerable keys, GetRecordOptions? options = null, CancellationToken cancellationToken = default) + { + var internalRecords = this._collection.GetBatchAsync(keys.Select(this._publicToInternalKeyMapper), options, cancellationToken); + return internalRecords.Select(this._internalToPublicRecordMapper); + } + + /// + public async Task UpsertAsync(TPublicRecord record, UpsertRecordOptions? 
options = null, CancellationToken cancellationToken = default) + { + var internalRecord = this._publicToInternalRecordMapper(record); + var internalKey = await this._collection.UpsertAsync(internalRecord, options, cancellationToken).ConfigureAwait(false); + return this._internalToPublicKeyMapper(internalKey); + } + + /// + public async IAsyncEnumerable UpsertBatchAsync(IEnumerable records, UpsertRecordOptions? options = null, [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + var internalRecords = records.Select(this._publicToInternalRecordMapper); + var internalKeys = this._collection.UpsertBatchAsync(internalRecords, options, cancellationToken); + await foreach (var internalKey in internalKeys.ConfigureAwait(false)) + { + yield return this._internalToPublicKeyMapper(internalKey); + } + } + + /// + public async Task> VectorizedSearchAsync(TVector vector, VectorSearchOptions? options = null, CancellationToken cancellationToken = default) + { + var searchResults = await this._collection.VectorizedSearchAsync(vector, options, cancellationToken).ConfigureAwait(false); + var publicResultRecords = searchResults.Results.Select(result => new VectorSearchResult(this._internalToPublicRecordMapper(result.Record), result.Score)); + + return new VectorSearchResults(publicResultRecords) + { + TotalCount = searchResults.TotalCount, + Metadata = searchResults.Metadata, + }; + } + } +} diff --git a/dotnet/samples/GettingStartedWithVectorStores/Step5_Use_GenericDataModel.cs b/dotnet/samples/GettingStartedWithVectorStores/Step5_Use_GenericDataModel.cs new file mode 100644 index 000000000000..449daf1c19b1 --- /dev/null +++ b/dotnet/samples/GettingStartedWithVectorStores/Step5_Use_GenericDataModel.cs @@ -0,0 +1,78 @@ +// Copyright (c) Microsoft. All rights reserved. 
+ +using Microsoft.Extensions.VectorData; +using Microsoft.SemanticKernel.Connectors.Redis; +using Microsoft.SemanticKernel.Embeddings; +using StackExchange.Redis; + +namespace GettingStartedWithVectorStores; + +/// +/// Example that shows that you can use the generic data model to interact with a vector database. +/// This makes it possible to use the vector store abstractions without having to create your own data model. +/// +public class Step5_Use_GenericDataModel(ITestOutputHelper output, VectorStoresFixture fixture) : BaseTest(output), IClassFixture +{ + /// + /// Example showing how to query a vector store that uses the generic data model. + /// + /// This example requires a Redis server running on localhost:6379. To run a Redis server in a Docker container, use the following command: + /// docker run -d --name redis-stack -p 6379:6379 -p 8001:8001 redis/redis-stack:latest + /// + [Fact] + public async Task SearchAVectorStoreWithGenericDataModelAsync() + { + // Construct a redis vector store. + var vectorStore = new RedisVectorStore(ConnectionMultiplexer.Connect("localhost:6379").GetDatabase()); + + // First, let's use the code from step 1 to ingest data into the vector store + // using the custom data model, simulating a scenario where someone else ingested + // the data into the database previously. + var collection = vectorStore.GetCollection("skglossary"); + var customDataModelCollection = vectorStore.GetCollection("skglossary"); + await Step1_Ingest_Data.IngestDataIntoVectorStoreAsync(customDataModelCollection, fixture.TextEmbeddingGenerationService); + + // To use the generic data model, we still have to describe the storage schema to the vector store + // using a record definition. The benefit over a custom data model is that this definition + // does not have to be known at compile time. + // E.g. it can be read from a configuration or retrieved from a service. 
+ var recordDefinition = new VectorStoreRecordDefinition + { + Properties = new List + { + new VectorStoreRecordKeyProperty("Key", typeof(string)), + new VectorStoreRecordDataProperty("Category", typeof(string)), + new VectorStoreRecordDataProperty("Term", typeof(string)), + new VectorStoreRecordDataProperty("Definition", typeof(string)), + new VectorStoreRecordVectorProperty("DefinitionEmbedding", typeof(ReadOnlyMemory)) { Dimensions = 1536 }, + } + }; + + // Now, let's create a collection that uses the generic data model. + var genericDataModelCollection = vectorStore.GetCollection>("skglossary", recordDefinition); + + // Generate an embedding from the search string. + var searchString = "How do I provide additional context to an LLM?"; + var searchVector = await fixture.TextEmbeddingGenerationService.GenerateEmbeddingAsync(searchString); + + // Search the generic data model collection and get the single most relevant result. + var searchResult = await genericDataModelCollection.VectorizedSearchAsync( + searchVector, + new() + { + Top = 1, + }); + var searchResultItems = await searchResult.Results.ToListAsync(); + + // Write the search result with its score to the console. + // Note that here we can loop through all the data properties + // without knowing the schema, since the data properties are + // stored as a dictionary of string keys and object values + // when using the generic data model. + foreach (var dataProperty in searchResultItems.First().Record.Data) + { + Console.WriteLine($"{dataProperty.Key}: {dataProperty.Value}"); + } + Console.WriteLine(searchResultItems.First().Score); + } +} diff --git a/dotnet/samples/GettingStartedWithVectorStores/Step6_Use_CustomMapper.cs b/dotnet/samples/GettingStartedWithVectorStores/Step6_Use_CustomMapper.cs new file mode 100644 index 000000000000..cc86a773b0c0 --- /dev/null +++ b/dotnet/samples/GettingStartedWithVectorStores/Step6_Use_CustomMapper.cs @@ -0,0 +1,149 @@ +// Copyright (c) Microsoft. All rights reserved. 
+ +using System.Text.Json; +using System.Text.Json.Nodes; +using Azure; +using Azure.Search.Documents.Indexes; +using Microsoft.Extensions.VectorData; +using Microsoft.SemanticKernel.Connectors.AzureAISearch; +using Microsoft.SemanticKernel.Embeddings; + +namespace GettingStartedWithVectorStores; + +/// +/// Example that shows how you can use custom mappers if you wish the data model and storage schema to differ. +/// +public class Step6_Use_CustomMapper(ITestOutputHelper output, VectorStoresFixture fixture) : BaseTest(output), IClassFixture +{ + /// + /// Example showing how to upsert and query records when using a custom mapper if you wish + /// the data model and storage schema to differ. + /// + /// This example requires an Azure AI Search service to be available. + /// + [Fact] + public async Task UseCustomMapperAsync() + { + // When using a custom mapper, we still have to describe the storage schema to the vector store + // using a record definition. Since the storage schema does not match the data model + // it won't make sense for the vector store to infer the schema from the data model. + var recordDefinition = new VectorStoreRecordDefinition + { + Properties = new List + { + new VectorStoreRecordKeyProperty("Key", typeof(string)), + new VectorStoreRecordDataProperty("Category", typeof(string)), + new VectorStoreRecordDataProperty("Term", typeof(string)), + new VectorStoreRecordDataProperty("Definition", typeof(string)), + new VectorStoreRecordVectorProperty("DefinitionEmbedding", typeof(ReadOnlyMemory)) { Dimensions = 1536 }, + } + }; + + // Construct an Azure AI Search vector store collection and + // pass in the custom mapper and record definition. 
+ var collection = new AzureAISearchVectorStoreRecordCollection( + new SearchIndexClient( + new Uri(TestConfiguration.AzureAISearch.Endpoint), + new AzureKeyCredential(TestConfiguration.AzureAISearch.ApiKey)), + "skglossary", + new() + { + JsonObjectCustomMapper = new CustomMapper(), + VectorStoreRecordDefinition = recordDefinition + }); + + // Create the collection if it doesn't exist. + // This call will use the schema defined by the record definition + // above for creating the collection. + await collection.CreateCollectionIfNotExistsAsync(); + + // Now we can upsert a record using + // the data model, even though it doesn't match the storage schema. + var definition = "A set of rules and protocols that allows one software application to interact with another."; + await collection.UpsertAsync(new ComplexGlossary + { + Key = "1", + Metadata = new Metadata + { + Category = "API", + Term = "Application Programming Interface" + }, + Definition = definition, + DefinitionEmbedding = await fixture.TextEmbeddingGenerationService.GenerateEmbeddingAsync(definition) + }); + + // Generate an embedding from the search string. + var searchVector = await fixture.TextEmbeddingGenerationService.GenerateEmbeddingAsync("How do two software applications interact with another?"); + + // Search the vector store. + var searchResult = await collection.VectorizedSearchAsync( + searchVector, + new() + { + Top = 1 + }); + var searchResultItem = await searchResult.Results.FirstAsync(); + + // Write the search result with its score to the console. + Console.WriteLine(searchResultItem.Record.Metadata.Term); + Console.WriteLine(searchResultItem.Record.Definition); + Console.WriteLine(searchResultItem.Score); + } + + /// + /// Sample mapper class that maps between the custom data model + /// and the <see cref="JsonObject"/> that should match the storage schema.
+ /// + private sealed class CustomMapper : IVectorStoreRecordMapper + { + public JsonObject MapFromDataToStorageModel(ComplexGlossary dataModel) + { + return new JsonObject + { + ["Key"] = dataModel.Key, + ["Category"] = dataModel.Metadata.Category, + ["Term"] = dataModel.Metadata.Term, + ["Definition"] = dataModel.Definition, + ["DefinitionEmbedding"] = JsonSerializer.SerializeToNode(dataModel.DefinitionEmbedding.ToArray()) + }; + } + + public ComplexGlossary MapFromStorageToDataModel(JsonObject storageModel, StorageToDataModelMapperOptions options) + { + return new ComplexGlossary + { + Key = storageModel["Key"]!.ToString(), + Metadata = new Metadata + { + Category = storageModel["Category"]!.ToString(), + Term = storageModel["Term"]!.ToString() + }, + Definition = storageModel["Definition"]!.ToString(), + DefinitionEmbedding = JsonSerializer.Deserialize>(storageModel["DefinitionEmbedding"]) + }; + } + } + + /// + /// Sample model class that represents a glossary entry. + /// This model differs from the model used in previous steps by having a complex property + /// that contains the category and term. + /// + private sealed class ComplexGlossary + { + public string Key { get; set; } + + public Metadata Metadata { get; set; } + + public string Definition { get; set; } + + public ReadOnlyMemory DefinitionEmbedding { get; set; } + } + + private sealed class Metadata + { + public string Category { get; set; } + + public string Term { get; set; } + } +}