Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MB-62230 - Pre-Filtering Support for kNN #2063

Merged
merged 18 commits into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions docs/vectors.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,26 @@ if err != nil {
fmt.Println(searchResult.Hits)
```

## Querying with Filters (v2.4.3+)
```go
searchRequest := NewSearchRequest(query.NewMatchNoneQuery())

filterQuery := NewTermQuery("hello")

searchRequest.AddKNNWithFilter(
"vec", // vector field name
[]float32{10,11,12,13,14,15,16,17,18,19}, // query vector (same dims)
5, // k
0, // boost
filterQuery, // filter query
)
searchResult, err := index.Search(searchRequest)
if err != nil {
panic(err)
}
fmt.Println(searchResult.Hits)
```

## Setup Instructions

* Using `cmake` is a recommended approach by FAISS authors.
Expand Down
37 changes: 35 additions & 2 deletions index/scorch/optimize_knn.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ type OptimizeVR struct {
totalCost uint64
// maps field to vector readers
vrs map[string][]*IndexSnapshotVectorReader
// if at least one of the vector readers requires filtered kNN.
requiresFiltering bool
}

// This setting _MUST_ only be changed during init and not after.
Expand Down Expand Up @@ -62,6 +64,8 @@ func (o *OptimizeVR) Finish() error {
var errorsM sync.Mutex
var errors []error

snapshotGlobalDocNums := o.snapshot.globalDocNums()

defer o.invokeSearcherEndCallback()

wg := sync.WaitGroup{}
Expand All @@ -77,7 +81,8 @@ func (o *OptimizeVR) Finish() error {
wg.Done()
}()
for field, vrs := range o.vrs {
vecIndex, err := segment.InterpretVectorIndex(field, origSeg.deleted)
vecIndex, err := segment.InterpretVectorIndex(field,
o.requiresFiltering, origSeg.deleted)
if err != nil {
errorsM.Lock()
errors = append(errors, err)
Expand All @@ -89,9 +94,34 @@ func (o *OptimizeVR) Finish() error {
vectorIndexSize := vecIndex.Size()
origSeg.cachedMeta.updateMeta(field, vectorIndexSize)
for _, vr := range vrs {
eligibleVectorInternalIDs := vr.getEligibleDocIDs()
if snapshotGlobalDocNums != nil {
// Only the eligible documents belonging to this segment
// will get filtered out.
// There is no way to determine which doc belongs to which segment
eligibleVectorInternalIDs.And(snapshotGlobalDocNums[index])
}

eligibleLocalDocNums := make([]uint64,
eligibleVectorInternalIDs.Stats().Cardinality)
// get the (segment-)local document numbers
for i, docNum := range eligibleVectorInternalIDs.ToArray() {
localDocNum := o.snapshot.localDocNumFromGlobal(index,
uint64(docNum))
eligibleLocalDocNums[i] = localDocNum
}

var pl segment_api.VecPostingsList
var err error
// for each VR, populate postings list and iterators
// by passing the obtained vector index and getting similar vectors.
pl, err := vecIndex.Search(vr.vector, vr.k, vr.searchParams)
if vr.eligibleDocIDs != nil && len(vr.eligibleDocIDs) > 0 {
pl, err = vecIndex.SearchWithFilter(vr.vector, vr.k,
eligibleLocalDocNums, vr.searchParams)
} else {
pl, err = vecIndex.Search(vr.vector, vr.k, vr.searchParams)
}

if err != nil {
errorsM.Lock()
errors = append(errors, err)
Expand Down Expand Up @@ -140,6 +170,9 @@ func (s *IndexSnapshotVectorReader) VectorOptimize(ctx context.Context,
return octx, nil
}
o.ctx = ctx
if !o.requiresFiltering {
o.requiresFiltering = len(s.eligibleDocIDs) > 0
}

if o.snapshot != s.snapshot {
o.invokeSearcherEndCallback()
Expand Down
30 changes: 29 additions & 1 deletion index/scorch/snapshot_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -471,16 +471,44 @@ func (is *IndexSnapshot) Document(id string) (rv index.Document, err error) {
return rvd, nil
}

// In a multi-segment index, each document has:
//  1. a local docnum  - unique within its segment
//  2. a global docnum - unique across the entire index
//
// This function returns the segment index (the segment in which the docnum
// is present) and the local docnum of a document.
func (is *IndexSnapshot) segmentIndexAndLocalDocNumFromGlobal(docNum uint64) (int, uint64) {
	// is.offsets[x] is the global docnum at which segment x starts, so the
	// containing segment is the last one whose offset is <= docNum.
	segmentIndex := sort.Search(len(is.offsets),
		func(x int) bool {
			return is.offsets[x] > docNum
		}) - 1

	// sort.Search already returns an int, so no conversion is needed.
	localDocNum := is.localDocNumFromGlobal(segmentIndex, docNum)
	return segmentIndex, localDocNum
}

// localDocNumFromGlobal maps a global docnum back to its segment-local
// docnum, given the index of the segment that contains it.
func (is *IndexSnapshot) localDocNumFromGlobal(segmentIndex int, docNum uint64) uint64 {
	segmentStart := is.offsets[segmentIndex]
	return docNum - segmentStart
}

// Function to return a mapping of the segment index to the live global doc nums
// in the segment of the specified index snapshot.
func (is *IndexSnapshot) globalDocNums() map[int]*roaring.Bitmap {
if len(is.segment) == 0 {
return nil
}

segmentIndexGlobalDocNums := make(map[int]*roaring.Bitmap)
metonymic-smokey marked this conversation as resolved.
Show resolved Hide resolved

for i := range is.segment {
segmentIndexGlobalDocNums[i] = roaring.NewBitmap()
for _, localDocNum := range is.segment[i].DocNumbersLive().ToArray() {
segmentIndexGlobalDocNums[i].Add(localDocNum + uint32(is.offsets[i]))
}
}
return segmentIndexGlobalDocNums
}

func (is *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) {
docNum, err := docInternalToNumber(id)
if err != nil {
Expand Down
31 changes: 30 additions & 1 deletion index/scorch/snapshot_index_vr.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"fmt"
"reflect"

"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/v2/size"
index "github.com/blevesearch/bleve_index_api"
segment_api "github.com/blevesearch/scorch_segment_api/v2"
Expand Down Expand Up @@ -51,6 +52,24 @@ type IndexSnapshotVectorReader struct {
ctx context.Context

searchParams json.RawMessage

// The following fields are only applicable for vector readers which will
// process kNN queries.
eligibleDocIDs []index.IndexInternalID
}

// Function to convert the internal IDs of the eligible documents to a type suitable
// for addition to a bitmap.
// Useful to have the eligible doc IDs in a bitmap to leverage the fast intersection
// (AND) operations. Eg. finding the eligible doc IDs present in a segment.
func (i *IndexSnapshotVectorReader) getEligibleDocIDs() *roaring.Bitmap {
res := roaring.NewBitmap()
abhinavdangeti marked this conversation as resolved.
Show resolved Hide resolved
// converts the doc IDs to uint32 and returns
for _, eligibleDocInternalID := range i.eligibleDocIDs {
internalDocID, _ := docInternalToNumber(index.IndexInternalID(eligibleDocInternalID))
res.Add(uint32(internalDocID))
}
return res
}

func (i *IndexSnapshotVectorReader) Size() int {
Expand Down Expand Up @@ -108,7 +127,17 @@ func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID,
preAlloced *index.VectorDoc) (*index.VectorDoc, error) {

if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 {
i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k, i.searchParams)
var i2 index.VectorReader
var err error

if len(i.eligibleDocIDs) > 0 {
i2, err = i.snapshot.VectorReaderWithFilter(i.ctx, i.vector, i.field,
i.k, i.searchParams, i.eligibleDocIDs)
} else {
i2, err = i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k,
i.searchParams)
}

if err != nil {
return nil, err
}
Expand Down
26 changes: 26 additions & 0 deletions index/scorch/snapshot_vector_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,29 @@ func (is *IndexSnapshot) VectorReader(ctx context.Context, vector []float32,

return rv, nil
}

// VectorReaderWithFilter returns a vector reader for the given field and
// query vector which restricts the kNN search to the pre-filtered set of
// eligible doc IDs. The postings and iterators slices are only pre-sized
// here; they are populated later, within OptimizeVR's Finish().
func (is *IndexSnapshot) VectorReaderWithFilter(ctx context.Context, vector []float32,
	field string, k int64, searchParams json.RawMessage,
	filterIDs []index.IndexInternalID) (
	index.VectorReader, error) {

	rv := &IndexSnapshotVectorReader{
		vector:         vector,
		field:          field,
		k:              k,
		snapshot:       is,
		searchParams:   searchParams,
		eligibleDocIDs: filterIDs,
		// rv is freshly constructed, so postings/iterators are known to be
		// nil here — no need for the pre-assignment nil checks.
		postings:  make([]segment_api.VecPostingsList, len(is.segment)),
		iterators: make([]segment_api.VecPostingsIterator, len(is.segment)),
	}

	return rv, nil
}
156 changes: 156 additions & 0 deletions search/collector/eligible.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
// Copyright (c) 2024 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package collector
metonymic-smokey marked this conversation as resolved.
Show resolved Hide resolved

import (
"context"
"time"

"github.com/blevesearch/bleve/v2/search"
index "github.com/blevesearch/bleve_index_api"
)

// EligibleCollector gathers only the internal IDs of documents matching a
// (kNN pre-)filter query. It performs no scoring and no ordering of hits.
type EligibleCollector struct {
	size    int                            // expected number of eligible hits, used for pre-allocation
	total   uint64                         // running count of documents seen by Collect
	took    time.Duration                  // wall-clock duration of the last Collect call
	results search.DocumentMatchCollection // finalized eligible hits

	store collectorStore // backing store; its comparator is a no-op since order is irrelevant
}

// NewEligibleCollector returns a collector that simply accumulates matching
// documents — no scoring, no sort order, no skip/offset.
func NewEligibleCollector(size int) *EligibleCollector {
	return newEligibleCollector(size)
}

// newEligibleCollector constructs the collector backing NewEligibleCollector.
// Eligible hits are neither scored nor ordered, so the store's comparator is
// a constant no-op, and skip is always 0.
func newEligibleCollector(size int) *EligibleCollector {
	ec := &EligibleCollector{size: size}

	// dummy comparator: relative order of eligible hits never matters
	noOrder := func(_, _ *search.DocumentMatch) int {
		return 0
	}
	ec.store = getOptimalCollectorStore(size, 0, noOrder)

	return ec
}

// Collect drives the searcher to completion, feeding every matching document
// through a document-match handler (the eligible handler by default, or an
// application-supplied one from ctx). It honors context cancellation, records
// the elapsed time in hc.took, and finalizes hc.results.
func (hc *EligibleCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error {
	startTime := time.Now()
	var err error
	var next *search.DocumentMatch

	// Cap the pool pre-allocation so a huge requested size doesn't
	// over-allocate up front.
	backingSize := hc.size
	if backingSize > PreAllocSizeSkipCap {
		backingSize = PreAllocSizeSkipCap + 1
	}
	searchContext := &search.SearchContext{
		DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), 0),
		Collector:         hc,
		IndexReader:       reader,
	}

	dmHandlerMaker := MakeEligibleDocumentMatchHandler
	if cv := ctx.Value(search.MakeDocumentMatchHandlerKey); cv != nil {
		dmHandlerMaker = cv.(search.MakeDocumentMatchHandler)
	}
	// use the application given builder for making the custom document match
	// handler and perform callbacks/invocations on the newly made handler.
	dmHandler, _, err := dmHandlerMaker(searchContext)
	if err != nil {
		return err
	}
	// check for cancellation once before the first Next() call
	select {
	case <-ctx.Done():
		search.RecordSearchCost(ctx, search.AbortM, 0)
		return ctx.Err()
	default:
		next, err = searcher.Next(searchContext)
	}
	for err == nil && next != nil {
		// re-check cancellation only every CheckDoneEvery hits, to keep the
		// hot loop cheap
		if hc.total%CheckDoneEvery == 0 {
			select {
			case <-ctx.Done():
				search.RecordSearchCost(ctx, search.AbortM, 0)
				return ctx.Err()
			default:
			}
		}
		hc.total++

		err = dmHandler(next)
		if err != nil {
			break
		}

		next, err = searcher.Next(searchContext)
	}
	if err != nil {
		return err
	}

	// help finalize/flush the results in case
	// of custom document match handlers.
	err = dmHandler(nil)
	if err != nil {
		return err
	}

	// compute search duration
	hc.took = time.Since(startTime)

	// finalize actual results
	err = hc.finalizeResults(reader)
	if err != nil {
		return err
	}
	return nil
}

// finalizeResults drains the backing store into hc.results, with no skip and
// no per-document fixup: the external ID of a filtered hit is never needed,
// so hits are taken exactly as stored.
func (hc *EligibleCollector) finalizeResults(r index.IndexReader) error {
	// no-op fixup — eligible hits require no external-ID resolution
	noFixup := func(_ *search.DocumentMatch) error {
		return nil
	}

	var err error
	hc.results, err = hc.store.Final(0, noFixup)
	return err
}

// Results returns the eligible hits gathered by the last Collect call.
func (hc *EligibleCollector) Results() search.DocumentMatchCollection {
	return hc.results
}

// Total returns the number of documents the collector has seen so far.
func (hc *EligibleCollector) Total() uint64 {
	return hc.total
}

// MaxScore always returns 0 — there is no concept of scoring in the
// eligible collector.
func (hc *EligibleCollector) MaxScore() float64 {
	return 0
}

// Took returns the wall-clock duration of the last Collect call.
func (hc *EligibleCollector) Took() time.Duration {
	return hc.took
}

// SetFacetsBuilder is a no-op: faceting is unsupported for pre-filtering
// in kNN search.
func (hc *EligibleCollector) SetFacetsBuilder(facetsBuilder *search.FacetsBuilder) {
	// facet unsupported for pre-filtering in KNN search
}

// FacetResults always returns nil: faceting is unsupported for
// pre-filtering in kNN search.
func (hc *EligibleCollector) FacetResults() search.FacetResults {
	// facet unsupported for pre-filtering in KNN search
	return nil
}
5 changes: 5 additions & 0 deletions search/collector/heap.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ func newStoreHeap(capacity int, compare collectorCompare) *collectStoreHeap {
return rv
}

// Add pushes doc onto the heap without enforcing any size bound; it always
// returns nil since no document is ever evicted.
func (c *collectStoreHeap) Add(doc *search.DocumentMatch) *search.DocumentMatch {
	c.add(doc)
	return nil
}

func (c *collectStoreHeap) AddNotExceedingSize(doc *search.DocumentMatch,
size int) *search.DocumentMatch {
c.add(doc)
Expand Down
Loading
Loading