From 107bcc209c8317613b9d7848734f6470d5484762 Mon Sep 17 00:00:00 2001
From: Wilson Leao <wln1987@gmail.com>
Date: Sun, 16 Jul 2023 11:41:48 +0200
Subject: [PATCH 1/6] Exposes Kendra result item DocumentAttributes in the
 document metadata

- Refactors the retriever by introducing a ResultItem base class to
  avoid duplicated code;
- Exposes the ResultItem DocumentAttributes as document metadata under
  the key 'document_attributes'.
---
 langchain/retrievers/kendra.py | 153 +++++++++++++++++++++------------
 1 file changed, 98 insertions(+), 55 deletions(-)

diff --git a/langchain/retrievers/kendra.py b/langchain/retrievers/kendra.py
index 2ceeb1d0fcd73..7a67d8d920ccb 100644
--- a/langchain/retrievers/kendra.py
+++ b/langchain/retrievers/kendra.py
@@ -1,5 +1,6 @@
 import re
-from typing import Any, Dict, List, Literal, Optional
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Literal, Optional, Union
 
 from pydantic import BaseModel, Extra, root_validator
 
@@ -80,19 +81,101 @@ def get_value_text(self) -> str:
         return self.Value.TextWithHighlightsValue.Text
 
 
-class QueryResultItem(BaseModel, extra=Extra.allow):
-    """A query result item."""
+class DocumentAttributeValue(BaseModel, extra=Extra.allow):
+    """The value of a document attribute."""
 
-    DocumentId: str
-    DocumentTitle: TextWithHighLights
+    DateValue: Optional[str]
+    """The date value."""
+    LongValue: Optional[int]
+    """The long value."""
+    StringListValue: Optional[List[str]]
+    """The string list value."""
+    StringValue: Optional[str]
+    """The string value."""
+
+    @property
+    def value(self) -> Optional[Union[str, int, List[str]]]:
+        """The only defined document attribute value or None.
+        According to Amazon Kendra, you can only provide one value for a document attribute.
+        """
+        if self.DateValue:
+            return self.DateValue
+        # A LongValue of 0 is falsy but still a real value: test against None.
+        if self.LongValue is not None:
+            return self.LongValue
+        if self.StringListValue:
+            return self.StringListValue
+        if self.StringValue:
+            return self.StringValue
+        return None
+
+
+class DocumentAttribute(BaseModel, extra=Extra.allow):
+    """A document attribute."""
+
+    Key: str
+    """The key of the attribute."""
+    Value: DocumentAttributeValue
+    """The value of the attribute."""
+
+
+class ResultItem(BaseModel, ABC, extra=Extra.allow):
+    """Abstract class that represents a result item."""
+
+    Id: Optional[str]
+    """The ID of the item."""
+    DocumentId: Optional[str]
+    """The document ID."""
     DocumentURI: Optional[str]
+    """The document URI."""
+    DocumentAttributes: Optional[List[DocumentAttribute]] = []
+    """The document attributes."""
+
+    @abstractmethod
+    def get_title(self) -> str:
+        """Document title."""
+
+    @abstractmethod
+    def get_excerpt(self) -> str:
+        """Document excerpt or passage."""
+
+    def get_additional_metadata(self) -> dict:
+        """Document additional metadata dict.
+        The final Document metadata will always provide the keys:
+        ['source', 'title', 'excerpt' and 'document_attributes'].
+        """
+        return {}
+
+    def get_document_attributes_dict(self) -> dict:
+        return {attr.Key: attr.Value.value for attr in (self.DocumentAttributes or [])}
+
+    def to_doc(self) -> Document:
+        title = self.get_title()
+        excerpt = self.get_excerpt()
+        page_content = combined_text(title, excerpt)
+        source = self.DocumentURI
+        document_attributes = self.get_document_attributes_dict()
+        metadata = self.get_additional_metadata() | {
+            "source": source,
+            "title": title,
+            "excerpt": excerpt,
+            "document_attributes": document_attributes,
+        }
+
+        return Document(page_content=page_content, metadata=metadata)
+
+
+class QueryResultItem(ResultItem):
+    DocumentTitle: TextWithHighLights
     FeedbackToken: Optional[str]
     Format: Optional[str]
-    Id: Optional[str]
     Type: Optional[str]
     AdditionalAttributes: Optional[List[AdditionalResultAttribute]] = []
     DocumentExcerpt: Optional[TextWithHighLights]
 
+    def get_title(self) -> str:
+        return self.DocumentTitle.Text
+
     def get_attribute_value(self) -> str:
         if not self.AdditionalAttributes:
             return ""
@@ -114,14 +197,9 @@ def get_excerpt(self) -> str:
 
         return clean_excerpt(excerpt)
 
-    def to_doc(self) -> Document:
-        title = self.DocumentTitle.Text
-        source = self.DocumentURI
-        excerpt = self.get_excerpt()
-        type = self.Type
-        page_content = combined_text(title, excerpt)
-        metadata = {"source": source, "title": title, "excerpt": excerpt, "type": type}
-        return Document(page_content=page_content, metadata=metadata)
+    def get_additional_metadata(self) -> dict:
+        additional_metadata = {"type": self.Type}
+        return additional_metadata
 
 
 class QueryResult(BaseModel, extra=Extra.allow):
@@ -145,57 +223,22 @@ def get_top_k_docs(self, top_n: int) -> List[Document]:
         return docs
 
 
-class DocumentAttributeValue(BaseModel, extra=Extra.allow):
-    """The value of a document attribute."""
-
-    DateValue: Optional[str]
-    """The date value."""
-    LongValue: Optional[int]
-    """The long value."""
-    StringListValue: Optional[List[str]]
-    """The string list value."""
-    StringValue: Optional[str]
-    """The string value."""
-
-
-class DocumentAttribute(BaseModel, extra=Extra.allow):
-    """A document attribute."""
-
-    Key: str
-    """The key of the attribute."""
-    Value: DocumentAttributeValue
-    """The value of the attribute."""
-
-
-class RetrieveResultItem(BaseModel, extra=Extra.allow):
+class RetrieveResultItem(ResultItem):
     """A retrieve result item."""
 
-    Content: Optional[str]
-    """The content of the item."""
-    DocumentAttributes: Optional[List[DocumentAttribute]] = []
-    """The document attributes."""
-    DocumentId: Optional[str]
-    """The document ID."""
     DocumentTitle: Optional[str]
     """The document title."""
-    DocumentURI: Optional[str]
-    """The document URI."""
-    Id: Optional[str]
-    """The ID of the item."""
+    Content: Optional[str]
+    """The content of the item."""
+
+    def get_title(self) -> str:
+        return self.DocumentTitle or ""
 
     def get_excerpt(self) -> str:
         if not self.Content:
             return ""
         return clean_excerpt(self.Content)
 
-    def to_doc(self) -> Document:
-        title = self.DocumentTitle if self.DocumentTitle else ""
-        source = self.DocumentURI
-        excerpt = self.get_excerpt()
-        page_content = combined_text(title, excerpt)
-        metadata = {"source": source, "title": title, "excerpt": excerpt}
-        return Document(page_content=page_content, metadata=metadata)
-
 
 class RetrieveResult(BaseModel, extra=Extra.allow):
     """A retrieve result."""

From af3b0d9795eef34cb4f7a87f65b052ec21c8b55d Mon Sep 17 00:00:00 2001
From: wnleao <wln1987@gmail.com>
Date: Mon, 17 Jul 2023 08:07:04 +0200
Subject: [PATCH 2/6] adjust additional metadata documentation

Co-authored-by: Piyush Jain <piyushjain@duck.com>
---
 langchain/retrievers/kendra.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/langchain/retrievers/kendra.py b/langchain/retrievers/kendra.py
index 7a67d8d920ccb..76fda669f43c8 100644
--- a/langchain/retrievers/kendra.py
+++ b/langchain/retrievers/kendra.py
@@ -141,7 +141,7 @@ def get_excerpt(self) -> str:
 
     def get_additional_metadata(self) -> dict:
         """Document additional metadata dict.
-        The final Document metadata will always provide the keys:
+        This returns any extra metadata except these keys:
         ['source', 'title', 'excerpt' and 'document_attributes'].
         """
         return {}

From 63c44a0b745b09f26740481152f72079a96f8c1b Mon Sep 17 00:00:00 2001
From: Wilson Leao Neto <wln1987@gmail.com>
Date: Tue, 18 Jul 2023 21:34:37 +0200
Subject: [PATCH 3/6] fix: lint error kendra.py:81:89: E501 Line too long (92 >
 88 characters)

---
 langchain/retrievers/kendra.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/langchain/retrievers/kendra.py b/langchain/retrievers/kendra.py
index 76fda669f43c8..82fdeb0c1919c 100644
--- a/langchain/retrievers/kendra.py
+++ b/langchain/retrievers/kendra.py
@@ -96,7 +96,8 @@ class DocumentAttributeValue(BaseModel, extra=Extra.allow):
     @property
     def value(self) -> Optional[Union[str, int, List[str]]]:
         """The only defined document attribute value or None.
-        According to Amazon Kendra, you can only provide one value for a document attribute.
+        According to Amazon Kendra, you can only provide one
+        value for a document attribute.
         """
         if self.DateValue:
             return self.DateValue

From 5a822fc2b907c7ddd418fe91ae2040d54d1b492a Mon Sep 17 00:00:00 2001
From: Wilson Leao Neto <wln1987@gmail.com>
Date: Tue, 18 Jul 2023 22:18:46 +0200
Subject: [PATCH 4/6] docs: add docstrings to Kendra result models

---
 langchain/retrievers/kendra.py | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/langchain/retrievers/kendra.py b/langchain/retrievers/kendra.py
index 82fdeb0c1919c..053ad72d711f1 100644
--- a/langchain/retrievers/kendra.py
+++ b/langchain/retrievers/kendra.py
@@ -45,10 +45,19 @@ def combined_text(title: str, excerpt: str) -> str:
 
 
 class Highlight(BaseModel, extra=Extra.allow):
+    """
+    Represents the information that can be
+    used to highlight key words in the excerpt.
+    """
+
     BeginOffset: int
+    """The zero-based location in the excerpt where the highlight starts."""
     EndOffset: int
+    """The zero-based location in the excerpt where the highlight ends."""
     TopAnswer: Optional[bool]
+    """Indicates whether the result is the best one."""
     Type: Optional[str]
+    """The highlight type: STANDARD or THESAURUS_SYNONYM."""
 
 
 class TextWithHighLights(BaseModel, extra=Extra.allow):
@@ -167,12 +176,24 @@ def to_doc(self) -> Document:
 
 
 class QueryResultItem(ResultItem):
+    """A Query API result item."""
+
     DocumentTitle: TextWithHighLights
+    """The document title."""
     FeedbackToken: Optional[str]
+    """Identifies a particular result from a particular query."""
     Format: Optional[str]
+    """
+    If the Type is ANSWER, then format is either:
+        * TABLE: a table excerpt is returned in TableExcerpt;
+        * TEXT: a text excerpt is returned in DocumentExcerpt.
+    """
     Type: Optional[str]
+    """Type of result: DOCUMENT or QUESTION_ANSWER or ANSWER"""
     AdditionalAttributes: Optional[List[AdditionalResultAttribute]] = []
+    """One or more additional attributes associated with the result."""
     DocumentExcerpt: Optional[TextWithHighLights]
+    """Excerpt of the document text."""
 
     def get_title(self) -> str:
         return self.DocumentTitle.Text
@@ -204,9 +225,10 @@ def get_additional_metadata(self) -> dict:
 
 
 class QueryResult(BaseModel, extra=Extra.allow):
-    """A query result."""
+    """A Query API result."""
 
     ResultItems: List[QueryResultItem]
+    """The result items."""
 
     def get_top_k_docs(self, top_n: int) -> List[Document]:
         """Gets the top k documents.
@@ -225,7 +247,7 @@ def get_top_k_docs(self, top_n: int) -> List[Document]:
 
 
 class RetrieveResultItem(ResultItem):
-    """A retrieve result item."""
+    """A Retrieve API result item."""
 
     DocumentTitle: Optional[str]
     """The document title."""
@@ -242,7 +264,7 @@ def get_excerpt(self) -> str:
 
 
 class RetrieveResult(BaseModel, extra=Extra.allow):
-    """A retrieve result."""
+    """A Retrieve API result."""
 
     QueryId: str
     """The ID of the query."""

From 2ed3648e42a62747969e5c2f4896df9fbecea302 Mon Sep 17 00:00:00 2001
From: Wilson Leao Neto <wln1987@gmail.com>
Date: Tue, 18 Jul 2023 23:30:23 +0200
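Subject: [PATCH 5/6] fix: use dict update to merge metadata

The dict union operator (`|`) requires Python 3.9+; `dict.update()` also
works on older interpreters.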

---
 langchain/retrievers/kendra.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/langchain/retrievers/kendra.py b/langchain/retrievers/kendra.py
index 053ad72d711f1..0268bdc7da790 100644
--- a/langchain/retrievers/kendra.py
+++ b/langchain/retrievers/kendra.py
@@ -165,12 +165,13 @@ def to_doc(self) -> Document:
         page_content = combined_text(title, excerpt)
         source = self.DocumentURI
         document_attributes = self.get_document_attributes_dict()
-        metadata = self.get_additional_metadata() | {
+        metadata = self.get_additional_metadata()
+        metadata.update({
             "source": source,
             "title": title,
             "excerpt": excerpt,
             "document_attributes": document_attributes,
-        }
+        })
 
         return Document(page_content=page_content, metadata=metadata)
 

From 789838fda435b79f0d420a0be4bdd3b344c47152 Mon Sep 17 00:00:00 2001
From: Wilson Leao Neto <wln1987@gmail.com>
Date: Tue, 18 Jul 2023 23:32:03 +0200
Subject: [PATCH 6/6] style: reformat metadata.update call in to_doc

---
 langchain/retrievers/kendra.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/langchain/retrievers/kendra.py b/langchain/retrievers/kendra.py
index 0268bdc7da790..59247e7bcc170 100644
--- a/langchain/retrievers/kendra.py
+++ b/langchain/retrievers/kendra.py
@@ -166,12 +166,14 @@ def to_doc(self) -> Document:
         source = self.DocumentURI
         document_attributes = self.get_document_attributes_dict()
         metadata = self.get_additional_metadata()
-        metadata.update({
-            "source": source,
-            "title": title,
-            "excerpt": excerpt,
-            "document_attributes": document_attributes,
-        })
+        metadata.update(
+            {
+                "source": source,
+                "title": title,
+                "excerpt": excerpt,
+                "document_attributes": document_attributes,
+            }
+        )
 
         return Document(page_content=page_content, metadata=metadata)