Skip to content

Commit

Permalink
Exposes Kendra result item DocumentAttributes in the document metadata (
Browse files Browse the repository at this point in the history
#7781)

- Description: exposes the ResultItem DocumentAttributes as document
metadata under the key 'document_attributes', and refactors
AmazonKendraRetriever by introducing a ResultItem base class in order to
avoid duplicated code.
- Tag maintainer: @3coins @hupe1980 @dev2049 @baskaryan
- Twitter handle: wilsonleao

### Why?
Some use cases depend on specific document attributes returned by the
retriever, both to improve the quality of the overall completion and to
adjust what is displayed to the user. For the sake of consistency,
we need to expose the DocumentAttributes as document metadata so that we
can be sure we are using the values returned by the Kendra request issued
by LangChain.

I would appreciate your review @3coins @hupe1980 @dev2049. Thank you in
advance!

### References
- [Amazon Kendra
DocumentAttribute](https://docs.aws.amazon.com/kendra/latest/APIReference/API_DocumentAttribute.html)
- [Amazon Kendra
DocumentAttributeValue](https://docs.aws.amazon.com/kendra/latest/APIReference/API_DocumentAttributeValue.html)

---------

Co-authored-by: Piyush Jain <[email protected]>
  • Loading branch information
wnleao and 3coins authored Jul 19, 2023
1 parent efa67ed commit 8bb33f2
Showing 1 changed file with 127 additions and 58 deletions.
185 changes: 127 additions & 58 deletions langchain/retrievers/kendra.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
from typing import Any, Dict, List, Literal, Optional
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Literal, Optional, Union

from pydantic import BaseModel, Extra, root_validator

Expand Down Expand Up @@ -47,10 +48,19 @@ def combined_text(title: str, excerpt: str) -> str:


class Highlight(BaseModel, extra=Extra.allow):
    """Information used to highlight key words in an excerpt.

    Offsets are zero-based positions within the excerpt text.
    """

    BeginOffset: int
    """Zero-based position in the excerpt at which the highlight starts."""
    EndOffset: int
    """Zero-based position in the excerpt at which the highlight ends."""
    TopAnswer: Optional[bool]
    """Whether the result is the best one."""
    Type: Optional[str]
    """Highlight type: STANDARD or THESAURUS_SYNONYM."""


class TextWithHighLights(BaseModel, extra=Extra.allow):
Expand Down Expand Up @@ -83,18 +93,116 @@ def get_value_text(self) -> str:
return self.Value.TextWithHighlightsValue.Text


class QueryResultItem(BaseModel, extra=Extra.allow):
"""A query result item."""
class DocumentAttributeValue(BaseModel, extra=Extra.allow):
"""The value of a document attribute."""

DocumentId: str
DocumentTitle: TextWithHighLights
DateValue: Optional[str]
"""The date value."""
LongValue: Optional[int]
"""The long value."""
StringListValue: Optional[List[str]]
"""The string list value."""
StringValue: Optional[str]
"""The string value."""

@property
def value(self) -> Optional[Union[str, int, List[str]]]:
    """Return the only defined document attribute value, or ``None``.

    According to Amazon Kendra, only one value can be provided for a
    document attribute. The fields are inspected in a fixed order
    (``DateValue``, ``LongValue``, ``StringListValue``, ``StringValue``)
    and the first one that is set is returned.

    Returns:
        The first field that is not ``None``, or ``None`` when no field
        is set.
    """
    candidates = (
        self.DateValue,
        self.LongValue,
        self.StringListValue,
        self.StringValue,
    )
    for candidate in candidates:
        # Compare against None instead of relying on truthiness so that
        # legitimate falsy values (0, "", []) are not reported as missing.
        if candidate is not None:
            return candidate

    return None


class DocumentAttribute(BaseModel, extra=Extra.allow):
    """A single document attribute: a key paired with its value."""

    Key: str
    """Key of the attribute."""
    Value: DocumentAttributeValue
    """Value of the attribute."""


class ResultItem(BaseModel, ABC, extra=Extra.allow):
    """Abstract base class for a result item.

    Subclasses supply the title and excerpt; ``to_doc`` combines them
    with the document URI, document attributes and any additional
    metadata into a ``Document``.
    """

    Id: Optional[str]
    """ID of the item."""
    DocumentId: Optional[str]
    """ID of the document."""
    DocumentURI: Optional[str]
    """URI of the document."""
    DocumentAttributes: Optional[List[DocumentAttribute]] = []
    """Attributes of the document."""

    @abstractmethod
    def get_title(self) -> str:
        """Document title."""

    @abstractmethod
    def get_excerpt(self) -> str:
        """Document excerpt or passage."""

    def get_additional_metadata(self) -> dict:
        """Return extra metadata for the document.

        This covers any metadata other than the keys 'source', 'title',
        'excerpt' and 'document_attributes'. The base implementation
        returns an empty dict.
        """
        return {}

    def get_document_attributes_dict(self) -> dict:
        """Return the document attributes as a ``{Key: value}`` dict.

        A missing (``None``) or empty attribute list yields an empty dict.
        """
        attributes_by_key = {}
        for attribute in self.DocumentAttributes or []:
            attributes_by_key[attribute.Key] = attribute.Value.value
        return attributes_by_key

    def to_doc(self) -> Document:
        """Convert this result item into a ``Document``.

        The page content is the combined title and excerpt. The metadata
        starts from ``get_additional_metadata()`` and is then updated with
        'source', 'title', 'excerpt' and 'document_attributes'.
        """
        doc_title = self.get_title()
        doc_excerpt = self.get_excerpt()
        content = combined_text(doc_title, doc_excerpt)
        uri = self.DocumentURI
        attributes = self.get_document_attributes_dict()

        metadata = self.get_additional_metadata()
        metadata.update(
            source=uri,
            title=doc_title,
            excerpt=doc_excerpt,
            document_attributes=attributes,
        )
        return Document(page_content=content, metadata=metadata)


class QueryResultItem(ResultItem):
"""A Query API result item."""

DocumentTitle: TextWithHighLights
"""The document title."""
FeedbackToken: Optional[str]
"""Identifies a particular result from a particular query."""
Format: Optional[str]
Id: Optional[str]
"""
If the Type is ANSWER, then format is either:
* TABLE: a table excerpt is returned in TableExcerpt;
* TEXT: a text excerpt is returned in DocumentExcerpt.
"""
Type: Optional[str]
"""Type of result: DOCUMENT or QUESTION_ANSWER or ANSWER"""
AdditionalAttributes: Optional[List[AdditionalResultAttribute]] = []
"""One or more additional attributes associated with the result."""
DocumentExcerpt: Optional[TextWithHighLights]
"""Excerpt of the document text."""

def get_title(self) -> str:
    """Return the text of the document title."""
    title = self.DocumentTitle
    return title.Text

def get_attribute_value(self) -> str:
if not self.AdditionalAttributes:
Expand All @@ -117,20 +225,16 @@ def get_excerpt(self) -> str:

return clean_excerpt(excerpt)

def to_doc(self) -> Document:
title = self.DocumentTitle.Text
source = self.DocumentURI
excerpt = self.get_excerpt()
type = self.Type
page_content = combined_text(title, excerpt)
metadata = {"source": source, "title": title, "excerpt": excerpt, "type": type}
return Document(page_content=page_content, metadata=metadata)
def get_additional_metadata(self) -> dict:
    """Return extra metadata: the result type under the 'type' key."""
    return {"type": self.Type}


class QueryResult(BaseModel, extra=Extra.allow):
"""A query result."""
"""A Query API result."""

ResultItems: List[QueryResultItem]
"""The result items."""

def get_top_k_docs(self, top_n: int) -> List[Document]:
"""Gets the top k documents.
Expand All @@ -148,60 +252,25 @@ def get_top_k_docs(self, top_n: int) -> List[Document]:
return docs


class DocumentAttributeValue(BaseModel, extra=Extra.allow):
"""The value of a document attribute."""

DateValue: Optional[str]
"""The date value."""
LongValue: Optional[int]
"""The long value."""
StringListValue: Optional[List[str]]
"""The string list value."""
StringValue: Optional[str]
"""The string value."""


class DocumentAttribute(BaseModel, extra=Extra.allow):
"""A document attribute."""

Key: str
"""The key of the attribute."""
Value: DocumentAttributeValue
"""The value of the attribute."""


class RetrieveResultItem(BaseModel, extra=Extra.allow):
"""A retrieve result item."""
class RetrieveResultItem(ResultItem):
"""A Retrieve API result item."""

Content: Optional[str]
"""The content of the item."""
DocumentAttributes: Optional[List[DocumentAttribute]] = []
"""The document attributes."""
DocumentId: Optional[str]
"""The document ID."""
DocumentTitle: Optional[str]
"""The document title."""
DocumentURI: Optional[str]
"""The document URI."""
Id: Optional[str]
"""The ID of the item."""
Content: Optional[str]
"""The content of the item."""

def get_title(self) -> str:
    """Return the document title, or an empty string when it is unset."""
    title = self.DocumentTitle
    if not title:
        return ""
    return title

def get_excerpt(self) -> str:
    """Return the cleaned item content, or an empty string when there is none."""
    content = self.Content
    return clean_excerpt(content) if content else ""

def to_doc(self) -> Document:
title = self.DocumentTitle if self.DocumentTitle else ""
source = self.DocumentURI
excerpt = self.get_excerpt()
page_content = combined_text(title, excerpt)
metadata = {"source": source, "title": title, "excerpt": excerpt}
return Document(page_content=page_content, metadata=metadata)


class RetrieveResult(BaseModel, extra=Extra.allow):
"""A retrieve result."""
"""A Retrieve API result."""

QueryId: str
"""The ID of the query."""
Expand Down

0 comments on commit 8bb33f2

Please sign in to comment.