Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Exposes Kendra result item DocumentAttributes in the document metadata #7781

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 127 additions & 58 deletions langchain/retrievers/kendra.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
from typing import Any, Dict, List, Literal, Optional
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Literal, Optional, Union

from pydantic import BaseModel, Extra, root_validator

Expand Down Expand Up @@ -44,10 +45,19 @@ def combined_text(title: str, excerpt: str) -> str:


class Highlight(BaseModel, extra=Extra.allow):
"""
Represents the information that can be
used to highlight key words in the excerpt.
"""

BeginOffset: int
"""The zero-based location in the excerpt where the highlight starts."""
EndOffset: int
"""The zero-based location in the excerpt where the highlight ends."""
TopAnswer: Optional[bool]
"""Indicates whether the result is the best one."""
Type: Optional[str]
"""The highlight type: STANDARD or THESAURUS_SYNONYM."""


class TextWithHighLights(BaseModel, extra=Extra.allow):
Expand Down Expand Up @@ -80,18 +90,116 @@ def get_value_text(self) -> str:
return self.Value.TextWithHighlightsValue.Text


class QueryResultItem(BaseModel, extra=Extra.allow):
"""A query result item."""
class DocumentAttributeValue(BaseModel, extra=Extra.allow):
"""The value of a document attribute."""

DocumentId: str
DocumentTitle: TextWithHighLights
DateValue: Optional[str]
"""The date value."""
LongValue: Optional[int]
"""The long value."""
StringListValue: Optional[List[str]]
"""The string list value."""
StringValue: Optional[str]
"""The string value."""

@property
def value(self) -> Optional[Union[str, int, List[str]]]:
"""The only defined document attribute value or None.
According to Amazon Kendra, you can only provide one
value for a document attribute.
"""
if self.DateValue:
return self.DateValue
if self.LongValue:
return self.LongValue
if self.StringListValue:
return self.StringListValue
if self.StringValue:
return self.StringValue

return None


class DocumentAttribute(BaseModel, extra=Extra.allow):
"""A document attribute."""

Key: str
"""The key of the attribute."""
Value: DocumentAttributeValue
"""The value of the attribute."""


class ResultItem(BaseModel, ABC, extra=Extra.allow):
"""Abstract class that represents a result item."""

Id: Optional[str]
"""The ID of the item."""
DocumentId: Optional[str]
"""The document ID."""
DocumentURI: Optional[str]
"""The document URI."""
DocumentAttributes: Optional[List[DocumentAttribute]] = []
"""The document attributes."""

@abstractmethod
def get_title(self) -> str:
"""Document title."""

@abstractmethod
def get_excerpt(self) -> str:
"""Document excerpt or passage."""

def get_additional_metadata(self) -> dict:
"""Document additional metadata dict.
This returns any extra metadata except these values:
['source', 'title', 'excerpt' and 'document_attributes'].
"""
return {}

def get_document_attributes_dict(self) -> dict:
return {attr.Key: attr.Value.value for attr in (self.DocumentAttributes or [])}

def to_doc(self) -> Document:
title = self.get_title()
excerpt = self.get_excerpt()
page_content = combined_text(title, excerpt)
source = self.DocumentURI
document_attributes = self.get_document_attributes_dict()
metadata = self.get_additional_metadata()
metadata.update(
{
"source": source,
"title": title,
"excerpt": excerpt,
"document_attributes": document_attributes,
}
)

return Document(page_content=page_content, metadata=metadata)


class QueryResultItem(ResultItem):
"""A Query API result item."""

DocumentTitle: TextWithHighLights
"""The document title."""
FeedbackToken: Optional[str]
"""Identifies a particular result from a particular query."""
Format: Optional[str]
Id: Optional[str]
"""
If the Type is ANSWER, then format is either:
* TABLE: a table excerpt is returned in TableExcerpt;
* TEXT: a text excerpt is returned in DocumentExcerpt.
"""
Type: Optional[str]
"""Type of result: DOCUMENT or QUESTION_ANSWER or ANSWER"""
AdditionalAttributes: Optional[List[AdditionalResultAttribute]] = []
"""One or more additional attributes associated with the result."""
DocumentExcerpt: Optional[TextWithHighLights]
"""Excerpt of the document text."""

def get_title(self) -> str:
return self.DocumentTitle.Text

def get_attribute_value(self) -> str:
if not self.AdditionalAttributes:
Expand All @@ -114,20 +222,16 @@ def get_excerpt(self) -> str:

return clean_excerpt(excerpt)

def to_doc(self) -> Document:
title = self.DocumentTitle.Text
source = self.DocumentURI
excerpt = self.get_excerpt()
type = self.Type
page_content = combined_text(title, excerpt)
metadata = {"source": source, "title": title, "excerpt": excerpt, "type": type}
return Document(page_content=page_content, metadata=metadata)
def get_additional_metadata(self) -> dict:
additional_metadata = {"type": self.Type}
return additional_metadata


class QueryResult(BaseModel, extra=Extra.allow):
"""A query result."""
"""A Query API result."""

ResultItems: List[QueryResultItem]
"""The result items."""

def get_top_k_docs(self, top_n: int) -> List[Document]:
"""Gets the top k documents.
Expand All @@ -145,60 +249,25 @@ def get_top_k_docs(self, top_n: int) -> List[Document]:
return docs


class DocumentAttributeValue(BaseModel, extra=Extra.allow):
"""The value of a document attribute."""

DateValue: Optional[str]
"""The date value."""
LongValue: Optional[int]
"""The long value."""
StringListValue: Optional[List[str]]
"""The string list value."""
StringValue: Optional[str]
"""The string value."""


class DocumentAttribute(BaseModel, extra=Extra.allow):
"""A document attribute."""

Key: str
"""The key of the attribute."""
Value: DocumentAttributeValue
"""The value of the attribute."""


class RetrieveResultItem(BaseModel, extra=Extra.allow):
"""A retrieve result item."""
class RetrieveResultItem(ResultItem):
"""A Retrieve API result item."""

Content: Optional[str]
"""The content of the item."""
DocumentAttributes: Optional[List[DocumentAttribute]] = []
"""The document attributes."""
DocumentId: Optional[str]
"""The document ID."""
DocumentTitle: Optional[str]
"""The document title."""
DocumentURI: Optional[str]
"""The document URI."""
Id: Optional[str]
"""The ID of the item."""
Content: Optional[str]
"""The content of the item."""

def get_title(self) -> str:
return self.DocumentTitle or ""

def get_excerpt(self) -> str:
if not self.Content:
return ""
return clean_excerpt(self.Content)

def to_doc(self) -> Document:
title = self.DocumentTitle if self.DocumentTitle else ""
source = self.DocumentURI
excerpt = self.get_excerpt()
page_content = combined_text(title, excerpt)
metadata = {"source": source, "title": title, "excerpt": excerpt}
return Document(page_content=page_content, metadata=metadata)


class RetrieveResult(BaseModel, extra=Extra.allow):
"""A retrieve result."""
"""A Retrieve API result."""

QueryId: str
"""The ID of the query."""
Expand Down