Skip to content

Commit

Permalink
Include meta data when computing embeddings in EmbeddingRetriever (#2559
Browse files Browse the repository at this point in the history
)

* include meta data when calculating embeddings in EmbeddingRetriever

* Update Documentation & Code Style

* fix None meta field

* remove default values

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
MichelBartels and github-actions[bot] authored May 17, 2022
1 parent ff4303c commit a952ba2
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 7 deletions.
9 changes: 7 additions & 2 deletions docs/_src/api/api/retriever.md
Original file line number Diff line number Diff line change
Expand Up @@ -909,7 +909,7 @@ one used by hugging-face transformers' modelhub models.
- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available.
- `batch_size`: Number of questions or passages to encode at once. When multiple GPUs are used, this is the total batch size across all of them.
- `embed_meta_fields`: Concatenate the provided meta fields and text passage / table to a text pair that is
then used to create the embedding.
then used to create the embedding.
This is the approach used in the original paper and is likely to improve
performance if your titles contain meaningful information for retrieval
(topic, entities etc.).
Expand Down Expand Up @@ -1163,7 +1163,7 @@ class EmbeddingRetriever(BaseRetriever)
#### EmbeddingRetriever.\_\_init\_\_

```python
def __init__(document_store: BaseDocumentStore, embedding_model: str, model_version: Optional[str] = None, use_gpu: bool = True, batch_size: int = 32, max_seq_len: int = 512, model_format: str = "farm", pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True)
def __init__(document_store: BaseDocumentStore, embedding_model: str, model_version: Optional[str] = None, use_gpu: bool = True, batch_size: int = 32, max_seq_len: int = 512, model_format: str = "farm", pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, embed_meta_fields: List[str] = [])
```

**Arguments**:
Expand Down Expand Up @@ -1200,6 +1200,11 @@ Additional information can be found here https://huggingface.co/transformers/mai
- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
If true (default), similarity scores (e.g. cosine or dot_product), which naturally have a different value range, will be scaled to a range of [0,1], where 1 means extremely relevant.
Otherwise, raw similarity scores (e.g. cosine or dot_product) will be used.
- `embed_meta_fields`: Concatenate the values of the provided meta fields and the text passage / table
into a single newline-separated text that is then used to create the embedding.
This approach is also used in the TableTextRetriever paper and is likely to improve
performance if your meta fields contain meaningful information for retrieval
(topic, entities etc.).

<a id="dense.EmbeddingRetriever.retrieve"></a>

Expand Down
8 changes: 8 additions & 0 deletions haystack/json-schemas/haystack-pipeline-master.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -2265,6 +2265,14 @@
"title": "Scale Score",
"default": true,
"type": "boolean"
},
"embed_meta_fields": {
"title": "Embed Meta Fields",
"default": [],
"type": "array",
"items": {
"type": "string"
}
}
},
"required": [
Expand Down
20 changes: 15 additions & 5 deletions haystack/nodes/retriever/dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,7 +799,7 @@ def __init__(
:param use_gpu: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available.
:param batch_size: Number of questions or passages to encode at once. When multiple GPUs are used, this is the total batch size across all of them.
:param embed_meta_fields: Concatenate the provided meta fields and text passage / table to a text pair that is
then used to create the embedding.
then used to create the embedding.
This is the approach used in the original paper and is likely to improve
performance if your titles contain meaningful information for retrieval
(topic, entities etc.).
Expand Down Expand Up @@ -1468,6 +1468,7 @@ def __init__(
devices: Optional[List[Union[str, torch.device]]] = None,
use_auth_token: Optional[Union[str, bool]] = None,
scale_score: bool = True,
embed_meta_fields: List[str] = [],
):
"""
:param document_store: An instance of DocumentStore from which to retrieve documents.
Expand Down Expand Up @@ -1503,6 +1504,11 @@ def __init__(
:param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]).
If true (default), similarity scores (e.g. cosine or dot_product), which naturally have a different value range, will be scaled to a range of [0,1], where 1 means extremely relevant.
Otherwise, raw similarity scores (e.g. cosine or dot_product) will be used.
:param embed_meta_fields: Concatenate the values of the provided meta fields and the text passage / table
into a single newline-separated text that is then used to create the embedding.
This approach is also used in the TableTextRetriever paper and is likely to improve
performance if your meta fields contain meaningful information for retrieval
(topic, entities etc.).
"""
super().__init__()

Expand Down Expand Up @@ -1540,6 +1546,7 @@ def __init__(
)

self.embedding_encoder = _EMBEDDING_ENCODERS[model_format](self)
self.embed_meta_fields = embed_meta_fields

def retrieve(
self,
Expand Down Expand Up @@ -1806,24 +1813,27 @@ def embed_documents(self, docs: List[Document]) -> List[np.ndarray]:
:param docs: List of documents to embed
:return: Embeddings, one per input document
"""
docs = self._linearize_tables(docs)
docs = self._preprocess_documents(docs)
return self.embedding_encoder.embed_documents(docs)

def _preprocess_documents(self, docs: "List[Document]") -> "List[Document]":
    """
    Prepare documents for embedding without modifying the documents passed in.

    Turns table documents into text documents by representing the table in csv format.
    This allows us to use text embedding models for table retrieval.
    It also prepends the values of the meta data fields listed in `self.embed_meta_fields`
    to the text representation, one value per line. Meta fields that are missing or
    have a falsy value (e.g. None or an empty string) are skipped; all other values are
    converted to strings so that non-string meta data (e.g. numbers) can be embedded too.

    :param docs: List of documents to preprocess. Tables are linearized, all other documents keep their content.
    :return: List of copied documents whose content is meta data + linearized table or meta data + original content.
    :raises HaystackError: If a document of type 'table' does not have a pd.DataFrame as content.
    """
    preprocessed_docs = []
    for doc in docs:
        # Always work on a copy: the content is rewritten below and the caller's documents must stay untouched.
        doc = deepcopy(doc)
        if doc.content_type == "table":
            if isinstance(doc.content, pd.DataFrame):
                doc.content = doc.content.to_csv(index=False)
            else:
                raise HaystackError("Documents of type 'table' need to have a pd.DataFrame as content field")
        # Tolerate documents without any meta data.
        meta = doc.meta or {}
        # str() is required because str.join raises a TypeError for non-string items.
        meta_data_fields = [str(meta[key]) for key in self.embed_meta_fields if meta.get(key)]
        doc.content = "\n".join(meta_data_fields + [doc.content])
        preprocessed_docs.append(doc)
    return preprocessed_docs

0 comments on commit a952ba2

Please sign in to comment.