Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MB-62354: Support Cosine Similarity #2051

Merged
merged 5 commits into from
Aug 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ go 1.21
require (
github.com/RoaringBitmap/roaring v1.9.3
github.com/bits-and-blooms/bitset v1.12.0
github.com/blevesearch/bleve_index_api v1.1.10
github.com/blevesearch/bleve_index_api v1.1.11
github.com/blevesearch/geo v0.1.20
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475
github.com/blevesearch/go-porterstemmer v1.0.3
Expand All @@ -23,7 +23,7 @@ require (
github.com/blevesearch/zapx/v13 v13.3.10
github.com/blevesearch/zapx/v14 v14.3.10
github.com/blevesearch/zapx/v15 v15.3.13
github.com/blevesearch/zapx/v16 v16.1.5
github.com/blevesearch/zapx/v16 v16.1.6-0.20240805195258-e1776480442c
github.com/couchbase/moss v0.2.0
github.com/golang/protobuf v1.3.2
github.com/spf13/cobra v1.7.0
Expand Down
8 changes: 4 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ github.com/RoaringBitmap/roaring v1.9.3 h1:t4EbC5qQwnisr5PrP9nt0IRhRTb9gMUgQF4t4
github.com/RoaringBitmap/roaring v1.9.3/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/blevesearch/bleve_index_api v1.1.10 h1:PDLFhVjrjQWr6jCuU7TwlmByQVCSEURADHdCqVS9+g0=
github.com/blevesearch/bleve_index_api v1.1.10/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/bleve_index_api v1.1.11 h1:OTNpRnxPWFIhMSgBUBlkD7RVWYrfsojtQeACb8tGGpw=
github.com/blevesearch/bleve_index_api v1.1.11/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM=
github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w=
github.com/blevesearch/go-faiss v1.0.20 h1:AIkdTQFWuZ5LQmKQSebgMR4RynGNw8ZseJXaan5kvtI=
Expand Down Expand Up @@ -43,8 +43,8 @@ github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz7
github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns=
github.com/blevesearch/zapx/v15 v15.3.13 h1:6EkfaZiPlAxqXz0neniq35my6S48QI94W/wyhnpDHHQ=
github.com/blevesearch/zapx/v15 v15.3.13/go.mod h1:Turk/TNRKj9es7ZpKK95PS7f6D44Y7fAFy8F4LXQtGg=
github.com/blevesearch/zapx/v16 v16.1.5 h1:b0sMcarqNFxuXvjoXsF8WtwVahnxyhEvBSRJi/AUHjU=
github.com/blevesearch/zapx/v16 v16.1.5/go.mod h1:J4mSF39w1QELc11EWRSBFkPeZuO7r/NPKkHzDCoiaI8=
github.com/blevesearch/zapx/v16 v16.1.6-0.20240805195258-e1776480442c h1:j2znHQQ7LfLtwPNRekgv49D0IbrLkmIMzKAhho3X0L0=
github.com/blevesearch/zapx/v16 v16.1.6-0.20240805195258-e1776480442c/go.mod h1:Er6ZhsETdPDgHUm7EPlgaNsDz/PuX9fokakN3ZrBYsQ=
github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o=
Expand Down
27 changes: 27 additions & 0 deletions mapping/mapping_vectors.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package mapping

import (
"fmt"
"math"
"reflect"

"github.com/blevesearch/bleve/v2/document"
Expand Down Expand Up @@ -140,6 +141,10 @@ func (fm *FieldMapping) processVector(propertyMightBeVector interface{},
if !ok {
return false
}
// normalize raw vector if similarity is cosine
if fm.Similarity == index.CosineSimilarity {
vector = NormalizeVector(vector)
}

fieldName := getFieldName(pathString, path, fm)
options := fm.Options()
Expand All @@ -163,6 +168,10 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interfac
if err != nil || len(decodedVector) != fm.Dims {
return
}
// normalize raw vector if similarity is cosine
if fm.Similarity == index.CosineSimilarity {
decodedVector = NormalizeVector(decodedVector)
}

fieldName := getFieldName(pathString, path, fm)
options := fm.Options()
Expand Down Expand Up @@ -252,3 +261,21 @@ func validateVectorFieldAlias(field *FieldMapping, parentName string,

return nil
}

// NormalizeVector returns vector scaled to unit (L2) length.
//
// The input slice is never modified. Callers may hand in a slice they still
// own (for example a query's vector), and rewriting it in place would leak
// the normalization to any other user of that slice. When scaling is needed
// a newly allocated slice is returned.
//
// A nil, empty or all-zero vector cannot be normalized, and a vector whose
// squared magnitude is exactly 1 is already normalized; in both cases the
// input slice itself is returned unchanged.
func NormalizeVector(vector []float32) []float32 {
	// Accumulate the squared magnitude in float64 to limit rounding error.
	var sumSquares float64
	for _, v := range vector {
		sumSquares += float64(v) * float64(v)
	}
	// Nothing to scale: zero vector, or already unit length.
	if sumSquares == 0 || sumSquares == 1 {
		return vector
	}
	mag := math.Sqrt(sumSquares)
	// Write the scaled components into a copy so the caller's data survives.
	normalized := make([]float32, len(vector))
	for i, v := range vector {
		normalized[i] = float32(float64(v) / mag)
	}
	return normalized
}
4 changes: 4 additions & 0 deletions search/query/knn.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ func (q *KNNQuery) Searcher(ctx context.Context, i index.IndexReader,
if q.K <= 0 || len(q.Vector) == 0 {
return nil, fmt.Errorf("k must be greater than 0 and vector must be non-empty")
}
if similarityMetric == index.CosineSimilarity {
// normalize the vector
q.Vector = mapping.NormalizeVector(q.Vector)
}
return searcher.NewKNNSearcher(ctx, i, m, options, q.VectorField,
q.Vector, q.K, q.BoostVal.Value(), similarityMetric, q.Params)
}
4 changes: 2 additions & 2 deletions search/scorer/scorer_knn_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ func TestKNNScorerExplanation(t *testing.T) {
},
norm: 1.0,
scorer: NewKNNQueryScorer(queryVector, "desc", 1.0,
search.SearcherOptions{Explain: true}, index.CosineSimilarity),
search.SearcherOptions{Explain: true}, index.InnerProduct),
result: &search.DocumentMatch{
IndexInternalID: index.IndexInternalID("one"),
Score: 0.5,
Expand All @@ -127,7 +127,7 @@ func TestKNNScorerExplanation(t *testing.T) {
},
norm: 0.5,
scorer: NewKNNQueryScorer(queryVector, "desc", 1.0,
search.SearcherOptions{Explain: true}, index.CosineSimilarity),
search.SearcherOptions{Explain: true}, index.InnerProduct),
result: &search.DocumentMatch{
IndexInternalID: index.IndexInternalID("one"),
Score: 0.25,
Expand Down
105 changes: 100 additions & 5 deletions search_knn_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,20 @@ func TestSimilaritySearchPartitionedIndex(t *testing.T) {

vecFieldMappingDot := mapping.NewVectorFieldMapping()
vecFieldMappingDot.Dims = testDatasetDims
vecFieldMappingDot.Similarity = index.CosineSimilarity
vecFieldMappingDot.Similarity = index.InnerProduct

indexMappingDotProduct := NewIndexMapping()
indexMappingDotProduct.DefaultMapping.AddFieldMappingsAt("content", contentFieldMapping)
indexMappingDotProduct.DefaultMapping.AddFieldMappingsAt("vector", vecFieldMappingDot)

vecFieldMappingCosine := mapping.NewVectorFieldMapping()
vecFieldMappingCosine.Dims = testDatasetDims
vecFieldMappingCosine.Similarity = index.CosineSimilarity

indexMappingCosine := NewIndexMapping()
indexMappingCosine.DefaultMapping.AddFieldMappingsAt("content", contentFieldMapping)
indexMappingCosine.DefaultMapping.AddFieldMappingsAt("vector", vecFieldMappingCosine)

type testCase struct {
testType string
queryIndex int
Expand Down Expand Up @@ -130,6 +138,31 @@ func TestSimilaritySearchPartitionedIndex(t *testing.T) {
numIndexPartitions: 4,
mapping: indexMappingDotProduct,
},
// cosine similarity
{
testType: "multi_partition:match_none:oneKNNreq:k=3",
queryIndex: 0,
numIndexPartitions: 7,
mapping: indexMappingCosine,
},
{
testType: "multi_partition:match_none:oneKNNreq:k=2",
queryIndex: 0,
numIndexPartitions: 5,
mapping: indexMappingCosine,
},
{
testType: "multi_partition:match:oneKNNreq:k=2",
queryIndex: 1,
numIndexPartitions: 3,
mapping: indexMappingCosine,
},
{
testType: "multi_partition:disjunction:twoKNNreq:k=2,2",
queryIndex: 2,
numIndexPartitions: 9,
mapping: indexMappingCosine,
},
}

index := NewIndexAlias()
Expand Down Expand Up @@ -459,11 +492,11 @@ func TestVectorBase64Index(t *testing.T) {

vecFMDot := mapping.NewVectorFieldMapping()
vecFMDot.Dims = testDatasetDims
vecFMDot.Similarity = index.CosineSimilarity
vecFMDot.Similarity = index.InnerProduct

vecBFMDot := mapping.NewVectorBase64FieldMapping()
vecBFMDot.Dims = testDatasetDims
vecBFMDot.Similarity = index.CosineSimilarity
vecBFMDot.Similarity = index.InnerProduct

indexMappingL2 := NewIndexMapping()
indexMappingL2.DefaultMapping.AddFieldMappingsAt("content", contentFM)
Expand Down Expand Up @@ -768,7 +801,12 @@ func createMultipleSegmentsIndex(documents []map[string]interface{}, index Index
}

// truncateScore truncates score to six decimal places (toward zero) and then
// snaps the result to exactly 1.0 when it lies within 1e-4 of 1.0.
//
// NOTE(review): the snapping presumably absorbs floating-point rounding in
// scores computed from normalized vectors, so that expected and actual
// scores compare equal in tests; confirm against the callers.
func truncateScore(score float64) float64 {
	const epsilon = 1e-4
	truncated := float64(int(score*1e6)) / 1e6
	if math.Abs(truncated-1.0) <= epsilon {
		return 1.0
	}
	return truncated
}

// Function to compare two Explanation structs recursively
Expand Down Expand Up @@ -920,7 +958,11 @@ func TestSimilaritySearchMultipleSegments(t *testing.T) {

vecFieldMappingDot := mapping.NewVectorFieldMapping()
vecFieldMappingDot.Dims = testDatasetDims
vecFieldMappingDot.Similarity = index.CosineSimilarity
vecFieldMappingDot.Similarity = index.InnerProduct

vecFieldMappingCosine := mapping.NewVectorFieldMapping()
vecFieldMappingCosine.Dims = testDatasetDims
vecFieldMappingCosine.Similarity = index.CosineSimilarity

indexMappingL2Norm := NewIndexMapping()
indexMappingL2Norm.DefaultMapping.AddFieldMappingsAt("content", contentFieldMapping)
Expand All @@ -930,6 +972,10 @@ func TestSimilaritySearchMultipleSegments(t *testing.T) {
indexMappingDotProduct.DefaultMapping.AddFieldMappingsAt("content", contentFieldMapping)
indexMappingDotProduct.DefaultMapping.AddFieldMappingsAt("vector", vecFieldMappingDot)

indexMappingCosine := NewIndexMapping()
indexMappingCosine.DefaultMapping.AddFieldMappingsAt("content", contentFieldMapping)
indexMappingCosine.DefaultMapping.AddFieldMappingsAt("vector", vecFieldMappingCosine)

var reqSort = search.SortOrder{&search.SortScore{Desc: true}, &search.SortDocID{Desc: true}, &search.SortField{Desc: false, Field: "content"}}

testCases := []struct {
Expand Down Expand Up @@ -1000,6 +1046,37 @@ func TestSimilaritySearchMultipleSegments(t *testing.T) {
queryIndex: 5,
mapping: indexMappingDotProduct,
},
// cosine similarity
{
numSegments: 9,
queryIndex: 0,
mapping: indexMappingCosine,
},
{
numSegments: 5,
queryIndex: 1,
mapping: indexMappingCosine,
},
{
numSegments: 4,
queryIndex: 2,
mapping: indexMappingCosine,
},
{
numSegments: 12,
queryIndex: 3,
mapping: indexMappingCosine,
},
{
numSegments: 7,
queryIndex: 4,
mapping: indexMappingCosine,
},
{
numSegments: 11,
queryIndex: 5,
mapping: indexMappingCosine,
},
// score none test
{
numSegments: 3,
Expand Down Expand Up @@ -1037,6 +1114,24 @@ func TestSimilaritySearchMultipleSegments(t *testing.T) {
mapping: indexMappingDotProduct,
scoreValue: "none",
},
{
numSegments: 3,
queryIndex: 0,
mapping: indexMappingCosine,
scoreValue: "none",
},
{
numSegments: 7,
queryIndex: 1,
mapping: indexMappingCosine,
scoreValue: "none",
},
{
numSegments: 8,
queryIndex: 2,
mapping: indexMappingCosine,
scoreValue: "none",
},
}
for testCaseNum, testCase := range testCases {
originalRequest := searchRequests[testCase.queryIndex]
Expand Down
Loading