Skip to content

Commit

Permalink
OpenSearch top k parameter fix (#5216)
Browse files Browse the repository at this point in the history
For most queries it's the `size` parameter that determines the final number
of documents to return. Since our abstractions refer to this as `k`, set
`size` to `k` everywhere instead of expecting a separate param. It would
be great to have someone more familiar with OpenSearch validate that
this is reasonable (e.g. that having `size` and what OpenSearch calls
`k` be the same won't lead to any strange behavior). cc @naveentatikonda

Closes #5212
  • Loading branch information
dev2049 authored May 25, 2023
1 parent 88ed8e1 commit 3be9ba1
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 20 deletions.
42 changes: 34 additions & 8 deletions docs/modules/indexes/vectorstores/examples/opensearch.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
"\n",
"This notebook shows how to use functionality related to the `OpenSearch` database.\n",
"\n",
"To run, you should have the opensearch instance up and running: [here](https://opensearch.org/docs/latest/install-and-configure/install-opensearch/index/)\n",
"To run, you should have an OpenSearch instance up and running: [see here for an easy Docker installation](https://hub.docker.com/r/opensearchproject/opensearch).\n",
"\n",
"`similarity_search` by default performs the Approximate k-NN Search which uses one of the several algorithms like lucene, nmslib, faiss recommended for\n",
"large datasets. To perform brute force search we have other search methods known as Script Scoring and Painless Scripting.\n",
"Check [this](https://opensearch.org/docs/latest/search-plugins/knn/index/) for more details."
Expand All @@ -23,7 +24,8 @@
"id": "94963977-9dfc-48b7-872a-53f2947f46c6",
"metadata": {},
"source": [
"## Installation"
"## Installation\n",
"Install the Python client."
]
},
{
Expand Down Expand Up @@ -61,7 +63,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "aac9563e",
"metadata": {},
"outputs": [],
Expand All @@ -74,7 +76,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "a3c3999a",
"metadata": {},
"outputs": [],
Expand All @@ -98,6 +100,32 @@
"`similarity_search` using `Approximate k-NN` Search with Custom Parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "803fe12b",
"metadata": {},
"outputs": [],
"source": [
"docsearch = OpenSearchVectorSearch.from_documents(\n",
" docs, \n",
" embeddings, \n",
" opensearch_url=\"http://localhost:9200\"\n",
")\n",
"\n",
"# If using the default Docker installation, use this instantiation instead:\n",
"# docsearch = OpenSearchVectorSearch.from_documents(\n",
"# docs, \n",
"# embeddings, \n",
"# opensearch_url=\"https://localhost:9200\", \n",
"# http_auth=(\"admin\", \"admin\"), \n",
"# use_ssl = False,\n",
"# verify_certs = False,\n",
"# ssl_assert_hostname = False,\n",
"# ssl_show_warn = False,\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -109,10 +137,8 @@
},
"outputs": [],
"source": [
"docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\")\n",
"\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)"
"docs = docsearch.similarity_search(query, k=10)"
]
},
{
Expand Down Expand Up @@ -283,7 +309,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.11.3"
}
},
"nbformat": 4,
Expand Down
22 changes: 10 additions & 12 deletions langchain/vectorstores/opensearch_vector_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,28 +153,26 @@ def _default_text_mapping(

def _default_approximate_search_query(
query_vector: List[float],
size: int = 4,
k: int = 4,
vector_field: str = "vector_field",
) -> Dict:
"""For Approximate k-NN Search, this is the default query."""
return {
"size": size,
"size": k,
"query": {"knn": {vector_field: {"vector": query_vector, "k": k}}},
}


def _approximate_search_query_with_boolean_filter(
query_vector: List[float],
boolean_filter: Dict,
size: int = 4,
k: int = 4,
vector_field: str = "vector_field",
subquery_clause: str = "must",
) -> Dict:
"""For Approximate k-NN Search, with Boolean Filter."""
return {
"size": size,
"size": k,
"query": {
"bool": {
"filter": boolean_filter,
Expand All @@ -189,13 +187,12 @@ def _approximate_search_query_with_boolean_filter(
def _approximate_search_query_with_lucene_filter(
query_vector: List[float],
lucene_filter: Dict,
size: int = 4,
k: int = 4,
vector_field: str = "vector_field",
) -> Dict:
"""For Approximate k-NN Search, with Lucene Filter."""
search_query = _default_approximate_search_query(
query_vector, size, k, vector_field
query_vector, k=k, vector_field=vector_field
)
search_query["query"]["knn"][vector_field]["filter"] = lucene_filter
return search_query
Expand Down Expand Up @@ -382,8 +379,6 @@ def similarity_search(
Optional Args for Approximate Search:
search_type: "approximate_search"; default: "approximate_search"
size: number of results the query actually returns; default: 4
boolean_filter: A Boolean filter consists of a Boolean query that
contains a k-NN query and a filter.
Expand Down Expand Up @@ -438,7 +433,6 @@ def similarity_search_with_score(
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")

if search_type == "approximate_search":
size = _get_kwargs_value(kwargs, "size", 4)
boolean_filter = _get_kwargs_value(kwargs, "boolean_filter", {})
subquery_clause = _get_kwargs_value(kwargs, "subquery_clause", "must")
lucene_filter = _get_kwargs_value(kwargs, "lucene_filter", {})
Expand All @@ -449,15 +443,19 @@ def similarity_search_with_score(
)
if boolean_filter != {}:
search_query = _approximate_search_query_with_boolean_filter(
embedding, boolean_filter, size, k, vector_field, subquery_clause
embedding,
boolean_filter,
k=k,
vector_field=vector_field,
subquery_clause=subquery_clause,
)
elif lucene_filter != {}:
search_query = _approximate_search_query_with_lucene_filter(
embedding, lucene_filter, size, k, vector_field
embedding, lucene_filter, k=k, vector_field=vector_field
)
else:
search_query = _default_approximate_search_query(
embedding, size, k, vector_field
embedding, k=k, vector_field=vector_field
)
elif search_type == SCRIPT_SCORING_SEARCH:
space_type = _get_kwargs_value(kwargs, "space_type", "l2")
Expand Down

0 comments on commit 3be9ba1

Please sign in to comment.