Exposes Kendra result item DocumentAttributes in the document metadata (#7781)

- Description: exposes the ResultItem DocumentAttributes as document metadata under the key 'document_attributes' and refactors AmazonKendraRetriever by introducing a ResultItem base class in order to avoid duplicated code; - Tag maintainer: @3coins @hupe1980 @dev2049 @baskaryan - Twitter handle: wilsonleao ### Why? Some use cases depend on specific document attributes returned by the retriever in order to improve the quality of the overall completion and to adjust what is displayed to the user. For the sake of consistency, the DocumentAttributes need to be exposed as document metadata, so that callers can be sure they are using the values returned by the Kendra request issued by LangChain. I would appreciate your review @3coins @hupe1980 @dev2049. Thank you in advance! ### References - [Amazon Kendra DocumentAttribute](https://docs.aws.amazon.com/kendra/latest/APIReference/API_DocumentAttribute.html) - [Amazon Kendra DocumentAttributeValue](https://docs.aws.amazon.com/kendra/latest/APIReference/API_DocumentAttributeValue.html) --------- Co-authored-by: Piyush Jain <[email protected]>
langchain-ai · Jul 19, 2023 · 8bb33f2 · 8bb33f2
1 parent efa67ed
commit 8bb33f2
Showing 1 changed file with 127 additions and 58 deletions.
diff --git a/langchain/retrievers/kendra.py b/langchain/retrievers/kendra.py
@@ -1,5 +1,6 @@
 import re
-from typing import Any, Dict, List, Literal, Optional
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Literal, Optional, Union
 
 from pydantic import BaseModel, Extra, root_validator
 
@@ -47,10 +48,19 @@ def combined_text(title: str, excerpt: str) -> str:
 
 
 class Highlight(BaseModel, extra=Extra.allow):
+    """
+    Represents the information that can be
+    used to highlight key words in the excerpt.
+    """
+
     BeginOffset: int
+    """The zero-based location in the excerpt where the highlight starts."""
     EndOffset: int
+    """The zero-based location in the excerpt where the highlight ends."""
     TopAnswer: Optional[bool]
+    """Indicates whether the result is the best one."""
     Type: Optional[str]
+    """The highlight type: STANDARD or THESAURUS_SYNONYM."""
 
 
 class TextWithHighLights(BaseModel, extra=Extra.allow):
@@ -83,18 +93,116 @@ def get_value_text(self) -> str:
         return self.Value.TextWithHighlightsValue.Text
 
 
-class QueryResultItem(BaseModel, extra=Extra.allow):
-    """A query result item."""
+class DocumentAttributeValue(BaseModel, extra=Extra.allow):
+    """The value of a document attribute."""
 
-    DocumentId: str
-    DocumentTitle: TextWithHighLights
+    DateValue: Optional[str]
+    """The date value."""
+    LongValue: Optional[int]
+    """The long value."""
+    StringListValue: Optional[List[str]]
+    """The string list value."""
+    StringValue: Optional[str]
+    """The string value."""
+
+    @property
+    def value(self) -> Optional[Union[str, int, List[str]]]:
+        """The only defined document attribute value or None.
+        According to Amazon Kendra, you can only provide one
+        value for a document attribute.
+        """
+        if self.DateValue:
+            return self.DateValue
+        if self.LongValue:
+            return self.LongValue
+        if self.StringListValue:
+            return self.StringListValue
+        if self.StringValue:
+            return self.StringValue
+
+        return None
+
+
+class DocumentAttribute(BaseModel, extra=Extra.allow):
+    """A document attribute."""
+
+    Key: str
+    """The key of the attribute."""
+    Value: DocumentAttributeValue
+    """The value of the attribute."""
+
+
+class ResultItem(BaseModel, ABC, extra=Extra.allow):
+    """Abstract class that represents a result item."""
+
+    Id: Optional[str]
+    """The ID of the item."""
+    DocumentId: Optional[str]
+    """The document ID."""
     DocumentURI: Optional[str]
+    """The document URI."""
+    DocumentAttributes: Optional[List[DocumentAttribute]] = []
+    """The document attributes."""
+
+    @abstractmethod
+    def get_title(self) -> str:
+        """Document title."""
+
+    @abstractmethod
+    def get_excerpt(self) -> str:
+        """Document excerpt or passage."""
+
+    def get_additional_metadata(self) -> dict:
+        """Document additional metadata dict.
+        This returns any extra metadata except these values:
+        ['source', 'title', 'excerpt' and 'document_attributes'].
+        """
+        return {}
+
+    def get_document_attributes_dict(self) -> dict:
+        return {attr.Key: attr.Value.value for attr in (self.DocumentAttributes or [])}
+
+    def to_doc(self) -> Document:
+        title = self.get_title()
+        excerpt = self.get_excerpt()
+        page_content = combined_text(title, excerpt)
+        source = self.DocumentURI
+        document_attributes = self.get_document_attributes_dict()
+        metadata = self.get_additional_metadata()
+        metadata.update(
+            {
+                "source": source,
+                "title": title,
+                "excerpt": excerpt,
+                "document_attributes": document_attributes,
+            }
+        )
+
+        return Document(page_content=page_content, metadata=metadata)
+
+
+class QueryResultItem(ResultItem):
+    """A Query API result item."""
+
+    DocumentTitle: TextWithHighLights
+    """The document title."""
     FeedbackToken: Optional[str]
+    """Identifies a particular result from a particular query."""
     Format: Optional[str]
-    Id: Optional[str]
+    """
+    If the Type is ANSWER, then format is either:
+        * TABLE: a table excerpt is returned in TableExcerpt;
+        * TEXT: a text excerpt is returned in DocumentExcerpt.
+    """
     Type: Optional[str]
+    """Type of result: DOCUMENT or QUESTION_ANSWER or ANSWER"""
     AdditionalAttributes: Optional[List[AdditionalResultAttribute]] = []
+    """One or more additional attributes associated with the result."""
     DocumentExcerpt: Optional[TextWithHighLights]
+    """Excerpt of the document text."""
+
+    def get_title(self) -> str:
+        return self.DocumentTitle.Text
 
     def get_attribute_value(self) -> str:
         if not self.AdditionalAttributes:
@@ -117,20 +225,16 @@ def get_excerpt(self) -> str:
 
         return clean_excerpt(excerpt)
 
-    def to_doc(self) -> Document:
-        title = self.DocumentTitle.Text
-        source = self.DocumentURI
-        excerpt = self.get_excerpt()
-        type = self.Type
-        page_content = combined_text(title, excerpt)
-        metadata = {"source": source, "title": title, "excerpt": excerpt, "type": type}
-        return Document(page_content=page_content, metadata=metadata)
+    def get_additional_metadata(self) -> dict:
+        additional_metadata = {"type": self.Type}
+        return additional_metadata
 
 
 class QueryResult(BaseModel, extra=Extra.allow):
-    """A query result."""
+    """A Query API result."""
 
     ResultItems: List[QueryResultItem]
+    """The result items."""
 
     def get_top_k_docs(self, top_n: int) -> List[Document]:
         """Gets the top k documents.
@@ -148,60 +252,25 @@ def get_top_k_docs(self, top_n: int) -> List[Document]:
         return docs
 
 
-class DocumentAttributeValue(BaseModel, extra=Extra.allow):
-    """The value of a document attribute."""
-
-    DateValue: Optional[str]
-    """The date value."""
-    LongValue: Optional[int]
-    """The long value."""
-    StringListValue: Optional[List[str]]
-    """The string list value."""
-    StringValue: Optional[str]
-    """The string value."""
-
-
-class DocumentAttribute(BaseModel, extra=Extra.allow):
-    """A document attribute."""
-
-    Key: str
-    """The key of the attribute."""
-    Value: DocumentAttributeValue
-    """The value of the attribute."""
-
-
-class RetrieveResultItem(BaseModel, extra=Extra.allow):
-    """A retrieve result item."""
+class RetrieveResultItem(ResultItem):
+    """A Retrieve API result item."""
 
-    Content: Optional[str]
-    """The content of the item."""
-    DocumentAttributes: Optional[List[DocumentAttribute]] = []
-    """The document attributes."""
-    DocumentId: Optional[str]
-    """The document ID."""
     DocumentTitle: Optional[str]
     """The document title."""
-    DocumentURI: Optional[str]
-    """The document URI."""
-    Id: Optional[str]
-    """The ID of the item."""
+    Content: Optional[str]
+    """The content of the item."""
+
+    def get_title(self) -> str:
+        return self.DocumentTitle or ""
 
     def get_excerpt(self) -> str:
         if not self.Content:
             return ""
         return clean_excerpt(self.Content)
 
-    def to_doc(self) -> Document:
-        title = self.DocumentTitle if self.DocumentTitle else ""
-        source = self.DocumentURI
-        excerpt = self.get_excerpt()
-        page_content = combined_text(title, excerpt)
-        metadata = {"source": source, "title": title, "excerpt": excerpt}
-        return Document(page_content=page_content, metadata=metadata)
-
 
 class RetrieveResult(BaseModel, extra=Extra.allow):
-    """A retrieve result."""
+    """A Retrieve API result."""
 
     QueryId: str
     """The ID of the query."""