NIFI-12831: Add PutOpenSearchVector and QueryOpenSearchVector processors
Signed-off-by: Pierre Villard <[email protected]>

This closes #8441.
mark-bathori authored and pvillard31 committed May 2, 2024
1 parent f87a0f4 commit b608e5a
Showing 4 changed files with 609 additions and 0 deletions.
OpenSearchVectorUtils.py
@@ -0,0 +1,142 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope, PropertyDependency
from EmbeddingUtils import OPENAI, HUGGING_FACE, EMBEDDING_MODEL

# Space types
L2 = ("L2 (Euclidean distance)", "l2")
L1 = ("L1 (Manhattan distance)", "l1")
LINF = ("L-infinity (chessboard) distance", "linf")
COSINESIMIL = ("Cosine similarity", "cosinesimil")
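# Each constant above is a (display name, internal value) pair: the display name is what the
# processors expose as an allowable value, and the internal value is what is passed on to OpenSearch.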

HUGGING_FACE_API_KEY = PropertyDescriptor(
name="HuggingFace API Key",
description="The API Key for interacting with HuggingFace",
required=True,
sensitive=True,
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)]
)
HUGGING_FACE_MODEL = PropertyDescriptor(
name="HuggingFace Model",
description="The name of the HuggingFace model to use",
default_value="sentence-transformers/all-MiniLM-L6-v2",
required=True,
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)]
)
OPENAI_API_KEY = PropertyDescriptor(
name="OpenAI API Key",
description="The API Key for OpenAI in order to create embeddings",
required=True,
sensitive=True,
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
)
OPENAI_API_MODEL = PropertyDescriptor(
name="OpenAI Model",
description="The API Key for OpenAI in order to create embeddings",
default_value="text-embedding-ada-002",
required=True,
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
)
HTTP_HOST = PropertyDescriptor(
name="HTTP Host",
description="URL where OpenSearch is hosted.",
default_value="http://localhost:9200",
required=True,
validators=[StandardValidators.URL_VALIDATOR]
)
USERNAME = PropertyDescriptor(
name="Username",
description="The username to use for authenticating to OpenSearch server",
required=False,
validators=[StandardValidators.NON_EMPTY_VALIDATOR]
)
PASSWORD = PropertyDescriptor(
name="Password",
description="The password to use for authenticating to OpenSearch server",
required=False,
sensitive=True,
validators=[StandardValidators.NON_EMPTY_VALIDATOR]
)
INDEX_NAME = PropertyDescriptor(
name="Index Name",
description="The name of the OpenSearch index.",
sensitive=False,
required=True,
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
)
VECTOR_FIELD = PropertyDescriptor(
name="Vector Field Name",
description="The name of field in the document where the embeddings are stored. This field need to be a 'knn_vector' typed field.",
default_value="vector_field",
required=True,
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
)
TEXT_FIELD = PropertyDescriptor(
name="Text Field Name",
description="The name of field in the document where the text is stored.",
default_value="text",
required=True,
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
)


def create_authentication_params(context):
username = context.getProperty(USERNAME).getValue()
password = context.getProperty(PASSWORD).getValue()

params = {"verify_certs": "true"}

if username is not None and password is not None:
params["http_auth"] = (username, password)

return params
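
# For example, with both 'Username' and 'Password' configured, create_authentication_params(context)
# returns {"verify_certs": "true", "http_auth": (username, password)}; if either credential is unset,
# only {"verify_certs": "true"} is returned and no HTTP basic authentication is used.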


def parse_documents(json_lines, id_field_name, file_name):
import json

texts = []
metadatas = []
ids = []
for i, line in enumerate(json_lines.split("\n"), start=1):
# Skip blank lines, such as a trailing newline at the end of the FlowFile content
if not line.strip():
continue
try:
doc = json.loads(line)
except Exception as e:
raise ValueError(f"Could not parse line {i} as JSON") from e

text = doc.get('text')
metadata = doc.get('metadata') or {}  # treat a missing or null 'metadata' element as an empty map
texts.append(text)

# Remove any null values, or it will cause the embedding to fail
filtered_metadata = {key: value for key, value in metadata.items() if value is not None}
metadatas.append(filtered_metadata)

doc_id = None
if id_field_name is not None:
doc_id = metadata.get(id_field_name)
if doc_id is None:
doc_id = file_name + "-" + str(i)
ids.append(doc_id)

return {"texts": texts, "metadatas": metadatas, "ids": ids}
PutOpenSearchVector.py
@@ -0,0 +1,245 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from langchain.vectorstores import OpenSearchVectorSearch
from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult
from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope, PropertyDependency
from OpenSearchVectorUtils import (L2, L1, LINF, COSINESIMIL, OPENAI_API_KEY, OPENAI_API_MODEL, HUGGING_FACE_API_KEY,
HUGGING_FACE_MODEL, HTTP_HOST, USERNAME, PASSWORD, INDEX_NAME, VECTOR_FIELD,
TEXT_FIELD, create_authentication_params, parse_documents)
from EmbeddingUtils import EMBEDDING_MODEL, create_embedding_service
from nifiapi.documentation import use_case, ProcessorConfiguration


@use_case(description="Create vectors/embeddings that represent text content and send the vectors to OpenSearch",
notes="This use case assumes that the data has already been formatted in JSONL format with the text to store in OpenSearch provided in the 'text' field.",
keywords=["opensearch", "embedding", "vector", "text", "vectorstore", "insert"],
configuration="""
Configure the 'HTTP Host' to an appropriate URL where OpenSearch is accessible.
Configure 'Embedding Model' to indicate whether OpenAI embeddings ('OpenAI Model') or a HuggingFace embedding model ('Hugging Face Model') should be used.
Configure the 'OpenAI API Key' or 'HuggingFace API Key', depending on the chosen Embedding Model.
Set 'Index Name' to the name of your OpenSearch Index.
Set 'Vector Field Name' to the name of the field in the document which will store the vector data.
Set 'Text Field Name' to the name of the field in the document which will store the text data.
If the documents to send to OpenSearch contain a unique identifier, set the 'Document ID Field Name' property to the name of the field that contains the document ID.
This property can be left blank, in which case a unique ID will be generated based on the FlowFile's filename.
If the provided index does not exist in OpenSearch, the processor is capable of creating it. The 'New Index Strategy' property defines
whether the index should be created from the default template or configured with custom values.
""")
@use_case(description="Update vectors/embeddings in OpenSearch",
notes="This use case assumes that the data has already been formatted in JSONL format with the text to store in OpenSearch provided in the 'text' field.",
keywords=["opensearch", "embedding", "vector", "text", "vectorstore", "update", "upsert"],
configuration="""
Configure the 'HTTP Host' to an appropriate URL where OpenSearch is accessible.
Configure 'Embedding Model' to indicate whether OpenAI embeddings ('OpenAI Model') or a HuggingFace embedding model ('Hugging Face Model') should be used.
Configure the 'OpenAI API Key' or 'HuggingFace API Key', depending on the chosen Embedding Model.
Set 'Index Name' to the name of your OpenSearch Index.
Set 'Vector Field Name' to the name of the field in the document which will store the vector data.
Set 'Text Field Name' to the name of the field in the document which will store the text data.
Set the 'Document ID Field Name' property to the name of the field that contains the identifier of the document in OpenSearch to update.
""")
class PutOpenSearchVector(FlowFileTransform):
class Java:
implements = ['org.apache.nifi.python.processor.FlowFileTransform']

class ProcessorDetails:
version = '@project.version@'
description = """Publishes JSON data to OpenSearch. The Incoming data must be in single JSON per Line format, each with two keys: 'text' and 'metadata'.
The text must be a string, while metadata must be a map with strings for values. Any additional fields will be ignored."""
tags = ["opensearch", "vector", "vectordb", "vectorstore", "embeddings", "ai", "artificial intelligence", "ml",
"machine learning", "text", "LLM"]

# Engine types
NMSLIB = ("nmslib (Non-Metric Space Library)", "nmslib")
FAISS = ("faiss (Facebook AI Similarity Search)", "faiss")
LUCENE = ("lucene", "lucene")

ENGINE_VALUES = dict([NMSLIB, FAISS, LUCENE])

# Space types
INNERPRODUCT = ("Inner product", "innerproduct")

NMSLIB_SPACE_TYPE_VALUES = dict([L2, L1, LINF, COSINESIMIL, INNERPRODUCT])
FAISS_SPACE_TYPE_VALUES = dict([L2, INNERPRODUCT])
LUCENE_SPACE_TYPE_VALUES = dict([L2, COSINESIMIL])

# New Index Mapping Strategy
DEFAULT_INDEX_MAPPING = "Default index mapping"
CUSTOM_INDEX_MAPPING = "Custom index mapping"

DOC_ID_FIELD_NAME = PropertyDescriptor(
name="Document ID Field Name",
description="""Specifies the name of the field in the 'metadata' element of each document where the document's ID can be found.
If not specified, an ID will be generated based on the FlowFile's filename and a one-up number.""",
required=False,
validators=[StandardValidators.NON_EMPTY_VALIDATOR],
expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
)
NEW_INDEX_STRATEGY = PropertyDescriptor(
name="New Index Strategy",
description="Specifies the Mapping strategy to use for new index creation. The default template values are the following: "
"{engine: nmslib, space_type: l2, ef_search: 512, ef_construction: 512, m: 16}",
allowable_values=[DEFAULT_INDEX_MAPPING, CUSTOM_INDEX_MAPPING],
default_value=DEFAULT_INDEX_MAPPING,
required=False,
)
ENGINE = PropertyDescriptor(
name="Engine",
description="The approximate k-NN library to use for indexing and search.",
allowable_values=ENGINE_VALUES.keys(),
default_value=NMSLIB[0],
required=False,
dependencies=[PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING)]
)
NMSLIB_SPACE_TYPE = PropertyDescriptor(
name="NMSLIB Space Type",
description="The vector space used to calculate the distance between vectors.",
allowable_values=NMSLIB_SPACE_TYPE_VALUES.keys(),
default_value=L2[0],
required=False,
dependencies=[PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING),
PropertyDependency(ENGINE, NMSLIB[0])]
)
FAISS_SPACE_TYPE = PropertyDescriptor(
name="FAISS Space Type",
description="The vector space used to calculate the distance between vectors.",
allowable_values=FAISS_SPACE_TYPE_VALUES.keys(),
default_value=L2[0],
required=False,
dependencies=[PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING),
PropertyDependency(ENGINE, FAISS[0])]
)
LUCENE_SPACE_TYPE = PropertyDescriptor(
name="Lucene Space Type",
description="The vector space used to calculate the distance between vectors.",
allowable_values=LUCENE_SPACE_TYPE_VALUES.keys(),
default_value=L2[0],
required=False,
dependencies=[PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING),
PropertyDependency(ENGINE, LUCENE[0])]
)
EF_SEARCH = PropertyDescriptor(
name="EF Search",
description="The size of the dynamic list used during k-NN searches. Higher values lead to more accurate but slower searches.",
default_value="512",
required=False,
validators=[StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR],
dependencies=[PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING)]
)
EF_CONSTRUCTION = PropertyDescriptor(
name="EF Construction",
description="The size of the dynamic list used during k-NN graph creation. Higher values lead to a more accurate graph but slower indexing speed.",
default_value="512",
required=False,
validators=[StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR],
dependencies=[PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING)]
)
M = PropertyDescriptor(
name="M",
description="The number of bidirectional links that the plugin creates for each new element. Increasing and "
"decreasing this value can have a large impact on memory consumption. Keep this value between 2 and 100.",
default_value="16",
required=False,
validators=[StandardValidators._standard_validators.createLongValidator(2, 100, True)],
dependencies=[PropertyDependency(NEW_INDEX_STRATEGY, CUSTOM_INDEX_MAPPING)]
)

properties = [EMBEDDING_MODEL,
OPENAI_API_KEY,
OPENAI_API_MODEL,
HUGGING_FACE_API_KEY,
HUGGING_FACE_MODEL,
HTTP_HOST,
USERNAME,
PASSWORD,
INDEX_NAME,
DOC_ID_FIELD_NAME,
VECTOR_FIELD,
TEXT_FIELD,
NEW_INDEX_STRATEGY,
ENGINE,
NMSLIB_SPACE_TYPE,
FAISS_SPACE_TYPE,
LUCENE_SPACE_TYPE,
EF_SEARCH,
EF_CONSTRUCTION,
M]

embeddings = None

def __init__(self, **kwargs):
pass

def getPropertyDescriptors(self):
return self.properties

def onScheduled(self, context):
self.embeddings = create_embedding_service(context)

def transform(self, context, flowfile):
file_name = flowfile.getAttribute("filename")
http_host = context.getProperty(HTTP_HOST).evaluateAttributeExpressions(flowfile).getValue()
index_name = context.getProperty(INDEX_NAME).evaluateAttributeExpressions(flowfile).getValue()
id_field_name = context.getProperty(self.DOC_ID_FIELD_NAME).evaluateAttributeExpressions(flowfile).getValue()
vector_field = context.getProperty(VECTOR_FIELD).evaluateAttributeExpressions(flowfile).getValue()
text_field = context.getProperty(TEXT_FIELD).evaluateAttributeExpressions(flowfile).getValue()
new_index_strategy = context.getProperty(self.NEW_INDEX_STRATEGY).evaluateAttributeExpressions().getValue()

params = {"vector_field": vector_field, "text_field": text_field}
params.update(create_authentication_params(context))

if new_index_strategy == self.CUSTOM_INDEX_MAPPING:
engine = context.getProperty(self.ENGINE).evaluateAttributeExpressions().getValue()
params["engine"] = self.ENGINE_VALUES.get(engine)

if engine == self.NMSLIB[0]:
space_type = context.getProperty(self.NMSLIB_SPACE_TYPE).evaluateAttributeExpressions().getValue()
params["space_type"] = self.NMSLIB_SPACE_TYPE_VALUES.get(space_type)
if engine == self.FAISS[0]:
space_type = context.getProperty(self.FAISS_SPACE_TYPE).evaluateAttributeExpressions().getValue()
params["space_type"] = self.FAISS_SPACE_TYPE_VALUES.get(space_type)
if engine == self.LUCENE[0]:
space_type = context.getProperty(self.LUCENE_SPACE_TYPE).evaluateAttributeExpressions().getValue()
params["space_type"] = self.LUCENE_SPACE_TYPE_VALUES.get(space_type)

ef_search = context.getProperty(self.EF_SEARCH).evaluateAttributeExpressions().asInteger()
params["ef_search"] = ef_search

ef_construction = context.getProperty(self.EF_CONSTRUCTION).evaluateAttributeExpressions().asInteger()
params["ef_construction"] = ef_construction

m = context.getProperty(self.M).evaluateAttributeExpressions().asInteger()
params["m"] = m

# Read the FlowFile content as "json-lines".
json_lines = flowfile.getContentsAsBytes().decode()
parsed_documents = parse_documents(json_lines, id_field_name, file_name)

vectorstore = OpenSearchVectorSearch(
opensearch_url=http_host,
index_name=index_name,
embedding_function=self.embeddings,
**params
)
vectorstore.add_texts(texts=parsed_documents["texts"],
metadatas=parsed_documents["metadatas"],
ids=parsed_documents["ids"],
**params
)

return FlowFileTransformResult(relationship="success")
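
For orientation, here is a minimal standalone sketch (outside NiFi) of roughly what transform() hands to langchain's OpenSearchVectorSearch when 'New Index Strategy' is set to 'Custom index mapping' with the nmslib engine. The host URL, credentials, index name, OpenAI API key and document values below are placeholders.

from langchain.vectorstores import OpenSearchVectorSearch
from langchain.embeddings import OpenAIEmbeddings

# Placeholder values; in the processor these come from the property descriptors above.
embeddings = OpenAIEmbeddings(openai_api_key="sk-...", model="text-embedding-ada-002")

params = {
    "vector_field": "vector_field",   # Vector Field Name
    "text_field": "text",             # Text Field Name
    "verify_certs": "true",
    "http_auth": ("admin", "admin-password"),
    # Internal values of Engine / NMSLIB Space Type / EF Search / EF Construction / M
    "engine": "nmslib",
    "space_type": "l2",
    "ef_search": 512,
    "ef_construction": 512,
    "m": 16,
}

vectorstore = OpenSearchVectorSearch(
    opensearch_url="http://localhost:9200",
    index_name="my-knn-index",
    embedding_function=embeddings,
    **params,
)
vectorstore.add_texts(
    texts=["NiFi routes and transforms data."],
    metadatas=[{"source": "docs/nifi.md"}],
    ids=["doc-1"],
    **params,
)

As in transform(), the same params map is passed to both the OpenSearchVectorSearch constructor and to add_texts.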
