From 107bcc209c8317613b9d7848734f6470d5484762 Mon Sep 17 00:00:00 2001 From: Wilson Leao Date: Sun, 16 Jul 2023 11:41:48 +0200 Subject: [PATCH 1/6] Exposes Kendra result item DocumentAttributes in the document metadata - Refactors retriever by providing a ResultItem base class in order to avoid duplicate code; - Exposes the ResultItem DocumentAttributes as document metadata with key 'document_attributes'. --- langchain/retrievers/kendra.py | 153 +++++++++++++++++++++------------ 1 file changed, 98 insertions(+), 55 deletions(-) diff --git a/langchain/retrievers/kendra.py b/langchain/retrievers/kendra.py index 2ceeb1d0fcd73..7a67d8d920ccb 100644 --- a/langchain/retrievers/kendra.py +++ b/langchain/retrievers/kendra.py @@ -1,5 +1,6 @@ import re -from typing import Any, Dict, List, Literal, Optional +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Literal, Optional, Union from pydantic import BaseModel, Extra, root_validator @@ -80,19 +81,101 @@ def get_value_text(self) -> str: return self.Value.TextWithHighlightsValue.Text -class QueryResultItem(BaseModel, extra=Extra.allow): - """A query result item.""" +class DocumentAttributeValue(BaseModel, extra=Extra.allow): + """The value of a document attribute.""" - DocumentId: str - DocumentTitle: TextWithHighLights + DateValue: Optional[str] + """The date value.""" + LongValue: Optional[int] + """The long value.""" + StringListValue: Optional[List[str]] + """The string list value.""" + StringValue: Optional[str] + """The string value.""" + + @property + def value(self) -> Optional[Union[str, int, List[str]]]: + """The only defined document attribute value or None. + According to Amazon Kendra, you can only provide one value for a document attribute. + """ + if self.DateValue: + return self.DateValue + if self.LongValue: + return self.LongValue + if self.StringListValue: + return self.StringListValue + if self.StringValue: + return self.StringValue + + return None + + +class DocumentAttribute(BaseModel, extra=Extra.allow): + """A document attribute.""" + + Key: str + """The key of the attribute.""" + Value: DocumentAttributeValue + """The value of the attribute.""" + + +class ResultItem(BaseModel, ABC, extra=Extra.allow): + """Abstract class that represents a result item.""" + + Id: Optional[str] + """The ID of the item.""" + DocumentId: Optional[str] + """The document ID.""" DocumentURI: Optional[str] + """The document URI.""" + DocumentAttributes: Optional[List[DocumentAttribute]] = [] + """The document attributes.""" + + @abstractmethod + def get_title(self) -> str: + """Document title.""" + + @abstractmethod + def get_excerpt(self) -> str: + """Document excerpt or passage.""" + + def get_additional_metadata(self) -> dict: + """Document additional metadata dict. + The final Document metadata will always provide the keys: + ['source', 'title', 'excerpt' and 'document_attributes']. + """ + return {} + + def get_document_attributes_dict(self) -> dict: + return {attr.Key: attr.Value.value for attr in (self.DocumentAttributes or [])} + + def to_doc(self) -> Document: + title = self.get_title() + excerpt = self.get_excerpt() + page_content = combined_text(title, excerpt) + source = self.DocumentURI + document_attributes = self.get_document_attributes_dict() + metadata = self.get_additional_metadata() | { + "source": source, + "title": title, + "excerpt": excerpt, + "document_attributes": document_attributes, + } + + return Document(page_content=page_content, metadata=metadata) + + +class QueryResultItem(ResultItem): + DocumentTitle: TextWithHighLights FeedbackToken: Optional[str] Format: Optional[str] - Id: Optional[str] Type: Optional[str] AdditionalAttributes: Optional[List[AdditionalResultAttribute]] = [] DocumentExcerpt: Optional[TextWithHighLights] + def get_title(self) -> str: + return self.DocumentTitle.Text + def get_attribute_value(self) -> str: if not self.AdditionalAttributes: return "" @@ -114,14 +197,9 @@ def get_excerpt(self) -> str: return clean_excerpt(excerpt) - def to_doc(self) -> Document: - title = self.DocumentTitle.Text - source = self.DocumentURI - excerpt = self.get_excerpt() - type = self.Type - page_content = combined_text(title, excerpt) - metadata = {"source": source, "title": title, "excerpt": excerpt, "type": type} - return Document(page_content=page_content, metadata=metadata) + def get_additional_metadata(self) -> dict: + additional_metadata = {"type": self.Type} + return additional_metadata class QueryResult(BaseModel, extra=Extra.allow): @@ -145,57 +223,22 @@ def get_top_k_docs(self, top_n: int) -> List[Document]: return docs -class DocumentAttributeValue(BaseModel, extra=Extra.allow): - """The value of a document attribute.""" - - DateValue: Optional[str] - """The date value.""" - LongValue: Optional[int] - """The long value.""" - StringListValue: Optional[List[str]] - """The string list value.""" - StringValue: Optional[str] - """The string value.""" - - -class DocumentAttribute(BaseModel, extra=Extra.allow): - """A document attribute.""" - - Key: str - """The key of the attribute.""" - Value: DocumentAttributeValue - """The value of the attribute.""" - - -class RetrieveResultItem(BaseModel, extra=Extra.allow): +class RetrieveResultItem(ResultItem): """A retrieve result item.""" - Content: Optional[str] - """The content of the item.""" - DocumentAttributes: Optional[List[DocumentAttribute]] = [] - """The document attributes.""" - DocumentId: Optional[str] - """The document ID.""" DocumentTitle: Optional[str] """The document title.""" - DocumentURI: Optional[str] - """The document URI.""" - Id: Optional[str] - """The ID of the item.""" + Content: Optional[str] + """The content of the item.""" + + def get_title(self) -> str: + return self.DocumentTitle or "" def get_excerpt(self) -> str: if not self.Content: return "" return clean_excerpt(self.Content) - def to_doc(self) -> Document: - title = self.DocumentTitle if self.DocumentTitle else "" - source = self.DocumentURI - excerpt = self.get_excerpt() - page_content = combined_text(title, excerpt) - metadata = {"source": source, "title": title, "excerpt": excerpt} - return Document(page_content=page_content, metadata=metadata) - class RetrieveResult(BaseModel, extra=Extra.allow): """A retrieve result.""" From af3b0d9795eef34cb4f7a87f65b052ec21c8b55d Mon Sep 17 00:00:00 2001 From: wnleao Date: Mon, 17 Jul 2023 08:07:04 +0200 Subject: [PATCH 2/6] adjust additional metadata documentation Co-authored-by: Piyush Jain --- langchain/retrievers/kendra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langchain/retrievers/kendra.py b/langchain/retrievers/kendra.py index 7a67d8d920ccb..76fda669f43c8 100644 --- a/langchain/retrievers/kendra.py +++ b/langchain/retrievers/kendra.py @@ -141,7 +141,7 @@ def get_excerpt(self) -> str: def get_additional_metadata(self) -> dict: """Document additional metadata dict. - The final Document metadata will always provide the keys: + This returns any extra metadata except these values: ['source', 'title', 'excerpt' and 'document_attributes']. """ return {} From 63c44a0b745b09f26740481152f72079a96f8c1b Mon Sep 17 00:00:00 2001 From: Wilson Leao Neto Date: Tue, 18 Jul 2023 21:34:37 +0200 Subject: [PATCH 3/6] fix: lint error kendra.py:81:89: E501 Line too long (92 > 88 characters) --- langchain/retrievers/kendra.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/langchain/retrievers/kendra.py b/langchain/retrievers/kendra.py index 76fda669f43c8..82fdeb0c1919c 100644 --- a/langchain/retrievers/kendra.py +++ b/langchain/retrievers/kendra.py @@ -96,7 +96,8 @@ class DocumentAttributeValue(BaseModel, extra=Extra.allow): @property def value(self) -> Optional[Union[str, int, List[str]]]: """The only defined document attribute value or None. - According to Amazon Kendra, you can only provide one value for a document attribute. + According to Amazon Kendra, you can only provide one + value for a document attribute. """ if self.DateValue: return self.DateValue From 5a822fc2b907c7ddd418fe91ae2040d54d1b492a Mon Sep 17 00:00:00 2001 From: Wilson Leao Neto Date: Tue, 18 Jul 2023 22:18:46 +0200 Subject: [PATCH 4/6] fix: documentation --- langchain/retrievers/kendra.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/langchain/retrievers/kendra.py b/langchain/retrievers/kendra.py index 82fdeb0c1919c..053ad72d711f1 100644 --- a/langchain/retrievers/kendra.py +++ b/langchain/retrievers/kendra.py @@ -45,10 +45,19 @@ def combined_text(title: str, excerpt: str) -> str: class Highlight(BaseModel, extra=Extra.allow): + """ + Represents the information that can be + used to highlight key words in the excerpt. + """ + BeginOffset: int + """The zero-based location in the excerpt where the highlight starts.""" EndOffset: int + """The zero-based location in the excerpt where the highlight ends.""" TopAnswer: Optional[bool] + """Indicates whether the result is the best one.""" Type: Optional[str] + """The highlight type: STANDARD or THESAURUS_SYNONYM.""" class TextWithHighLights(BaseModel, extra=Extra.allow): @@ -167,12 +176,24 @@ def to_doc(self) -> Document: class QueryResultItem(ResultItem): + """A Query API result item.""" + DocumentTitle: TextWithHighLights + """The document title.""" FeedbackToken: Optional[str] + """Identifies a particular result from a particular query.""" Format: Optional[str] + """ + If the Type is ANSWER, then format is either: + * TABLE: a table excerpt is returned in TableExcerpt; + * TEXT: a text excerpt is returned in DocumentExcerpt. + """ Type: Optional[str] + """Type of result: DOCUMENT or QUESTION_ANSWER or ANSWER""" AdditionalAttributes: Optional[List[AdditionalResultAttribute]] = [] + """One or more additional attributes associated with the result.""" DocumentExcerpt: Optional[TextWithHighLights] + """Excerpt of the document text.""" def get_title(self) -> str: return self.DocumentTitle.Text @@ -204,9 +225,10 @@ def get_additional_metadata(self) -> dict: class QueryResult(BaseModel, extra=Extra.allow): - """A query result.""" + """A Query API result.""" ResultItems: List[QueryResultItem] + """The result items.""" def get_top_k_docs(self, top_n: int) -> List[Document]: """Gets the top k documents. @@ -225,7 +247,7 @@ def get_top_k_docs(self, top_n: int) -> List[Document]: class RetrieveResultItem(ResultItem): - """A retrieve result item.""" + """A Retrieve API result item.""" DocumentTitle: Optional[str] """The document title.""" @@ -242,7 +264,7 @@ def get_excerpt(self) -> str: class RetrieveResult(BaseModel, extra=Extra.allow): - """A retrieve result.""" + """A Retrieve API result.""" QueryId: str """The ID of the query.""" From 2ed3648e42a62747969e5c2f4896df9fbecea302 Mon Sep 17 00:00:00 2001 From: Wilson Leao Neto Date: Tue, 18 Jul 2023 23:30:23 +0200 Subject: [PATCH 5/6] fix: use dict update to merge metadata --- langchain/retrievers/kendra.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/langchain/retrievers/kendra.py b/langchain/retrievers/kendra.py index 053ad72d711f1..0268bdc7da790 100644 --- a/langchain/retrievers/kendra.py +++ b/langchain/retrievers/kendra.py @@ -165,12 +165,13 @@ def to_doc(self) -> Document: page_content = combined_text(title, excerpt) source = self.DocumentURI document_attributes = self.get_document_attributes_dict() - metadata = self.get_additional_metadata() | { + metadata = self.get_additional_metadata() + metadata.update({ "source": source, "title": title, "excerpt": excerpt, "document_attributes": document_attributes, - } + }) return Document(page_content=page_content, metadata=metadata) From 789838fda435b79f0d420a0be4bdd3b344c47152 Mon Sep 17 00:00:00 2001 From: Wilson Leao Neto Date: Tue, 18 Jul 2023 23:32:03 +0200 Subject: [PATCH 6/6] fix: format --- langchain/retrievers/kendra.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/langchain/retrievers/kendra.py b/langchain/retrievers/kendra.py index 0268bdc7da790..59247e7bcc170 100644 --- a/langchain/retrievers/kendra.py +++ b/langchain/retrievers/kendra.py @@ -166,12 +166,14 @@ def to_doc(self) -> Document: source = self.DocumentURI document_attributes = self.get_document_attributes_dict() metadata = self.get_additional_metadata() - metadata.update({ - "source": source, - "title": title, - "excerpt": excerpt, - "document_attributes": document_attributes, - }) + metadata.update( + { + "source": source, + "title": title, + "excerpt": excerpt, + "document_attributes": document_attributes, + } + ) return Document(page_content=page_content, metadata=metadata)