diff --git a/VERSION.txt b/VERSION.txt
index 32d9eef225..bc80560fad 100644
--- a/VERSION.txt
+++ b/VERSION.txt
@@ -1 +1 @@
-1.4.1rc0
+1.5.0
diff --git a/docs/_src/api/openapi/openapi-1.5.0.json b/docs/_src/api/openapi/openapi-1.5.0.json
new file mode 100644
index 0000000000..fd07ddd0d5
--- /dev/null
+++ b/docs/_src/api/openapi/openapi-1.5.0.json
@@ -0,0 +1,892 @@
+{
+ "openapi": "3.0.2",
+ "info": {
+ "title": "Haystack REST API",
+ "version": "1.5.0"
+ },
+ "paths": {
+ "/initialized": {
+ "get": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Check Status",
+ "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.",
+ "operationId": "check_status",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/hs_version": {
+ "get": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Haystack Version",
+ "description": "Get the running Haystack version.",
+ "operationId": "haystack_version",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/query": {
+ "post": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Query",
+ "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.",
+ "operationId": "query",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/QueryRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/QueryResponse"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/feedback": {
+ "get": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Get Feedback",
+ "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.",
+ "operationId": "get_feedback",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Get Feedback Feedback Get",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Label"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "post": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Post Feedback",
+ "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.",
+ "operationId": "post_feedback",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Feedback",
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/Label"
+ },
+ {
+ "$ref": "#/components/schemas/CreateLabelSerialized"
+ }
+ ]
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ },
+ "delete": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Delete Feedback",
+        "description": "This endpoint allows the API user to delete all the\nfeedback that has been submitted through the\n`POST /feedback` endpoint",
+ "operationId": "delete_feedback",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/eval-feedback": {
+ "post": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Get Feedback Metrics",
+        "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`",
+ "operationId": "get_feedback_metrics",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ }
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/export-feedback": {
+ "get": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Export Feedback",
+ "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.",
+ "operationId": "export_feedback",
+ "parameters": [
+ {
+ "required": false,
+ "schema": {
+ "title": "Context Size",
+ "type": "integer",
+ "default": 100000
+ },
+ "name": "context_size",
+ "in": "query"
+ },
+ {
+ "required": false,
+ "schema": {
+ "title": "Full Document Context",
+ "type": "boolean",
+ "default": true
+ },
+ "name": "full_document_context",
+ "in": "query"
+ },
+ {
+ "required": false,
+ "schema": {
+ "title": "Only Positive Labels",
+ "type": "boolean",
+ "default": false
+ },
+ "name": "only_positive_labels",
+ "in": "query"
+ }
+ ],
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/file-upload": {
+ "post": {
+ "tags": [
+ "file-upload"
+ ],
+ "summary": "Upload File",
+ "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).",
+ "operationId": "upload_file",
+ "requestBody": {
+ "content": {
+ "multipart/form-data": {
+ "schema": {
+ "$ref": "#/components/schemas/Body_upload_file_file_upload_post"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/documents/get_by_filters": {
+ "post": {
+ "tags": [
+ "document"
+ ],
+ "summary": "Get Documents",
+        "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to retrieve by metadata (like the document's name),\nor provide an empty JSON object to retrieve all documents.\n\nExample of filters:\n`'{\"filters\": {\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`",
+ "operationId": "get_documents",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Get Documents Documents Get By Filters Post",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Document"
+ }
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/documents/delete_by_filters": {
+ "post": {
+ "tags": [
+ "document"
+ ],
+ "summary": "Delete Documents",
+        "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo delete all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`",
+ "operationId": "delete_documents",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Delete Documents Documents Delete By Filters Post",
+ "type": "boolean"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "components": {
+ "schemas": {
+ "Answer": {
+ "title": "Answer",
+ "required": [
+ "answer"
+ ],
+ "type": "object",
+ "properties": {
+ "answer": {
+ "title": "Answer",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "enum": [
+ "generative",
+ "extractive",
+ "other"
+ ],
+ "type": "string",
+ "default": "extractive"
+ },
+ "score": {
+ "title": "Score",
+ "type": "number"
+ },
+ "context": {
+ "title": "Context",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "offsets_in_document": {
+ "title": "Offsets In Document",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Span"
+ }
+ },
+ "offsets_in_context": {
+ "title": "Offsets In Context",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Span"
+ }
+ },
+ "document_id": {
+ "title": "Document Id",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ }
+ }
+ },
+ "Body_upload_file_file_upload_post": {
+ "title": "Body_upload_file_file_upload_post",
+ "required": [
+ "files"
+ ],
+ "type": "object",
+ "properties": {
+ "files": {
+ "title": "Files",
+ "type": "array",
+ "items": {
+ "type": "string",
+ "format": "binary"
+ }
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "string",
+ "default": "null"
+ },
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables"
+ },
+ "valid_languages": {
+ "title": "Valid Languages"
+ },
+ "clean_whitespace": {
+ "title": "Clean Whitespace"
+ },
+ "clean_empty_lines": {
+ "title": "Clean Empty Lines"
+ },
+ "clean_header_footer": {
+ "title": "Clean Header Footer"
+ },
+ "split_by": {
+ "title": "Split By"
+ },
+ "split_length": {
+ "title": "Split Length"
+ },
+ "split_overlap": {
+ "title": "Split Overlap"
+ },
+ "split_respect_sentence_boundary": {
+ "title": "Split Respect Sentence Boundary"
+ }
+ }
+ },
+ "CreateLabelSerialized": {
+ "title": "CreateLabelSerialized",
+ "required": [
+ "query",
+ "document",
+ "is_correct_answer",
+ "is_correct_document",
+ "origin"
+ ],
+ "type": "object",
+ "properties": {
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "document": {
+ "$ref": "#/components/schemas/Document"
+ },
+ "is_correct_answer": {
+ "title": "Is Correct Answer",
+ "type": "boolean"
+ },
+ "is_correct_document": {
+ "title": "Is Correct Document",
+ "type": "boolean"
+ },
+ "origin": {
+ "title": "Origin",
+ "enum": [
+ "user-feedback",
+ "gold-label"
+ ],
+ "type": "string"
+ },
+ "answer": {
+ "$ref": "#/components/schemas/Answer"
+ },
+ "no_answer": {
+ "title": "No Answer",
+ "type": "boolean"
+ },
+ "pipeline_id": {
+ "title": "Pipeline Id",
+ "type": "string"
+ },
+ "created_at": {
+ "title": "Created At",
+ "type": "string"
+ },
+ "updated_at": {
+ "title": "Updated At",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "filters": {
+ "title": "Filters",
+ "type": "object"
+ }
+ },
+ "additionalProperties": false
+ },
+ "Document": {
+ "title": "Document",
+ "required": [
+ "content",
+ "content_type",
+ "id",
+ "meta"
+ ],
+ "type": "object",
+ "properties": {
+ "content": {
+ "title": "Content",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "content_type": {
+ "title": "Content Type",
+ "enum": [
+ "text",
+ "table",
+ "image"
+ ],
+ "type": "string"
+ },
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "score": {
+ "title": "Score",
+ "type": "number"
+ },
+ "embedding": {
+ "title": "Embedding",
+ "type": "string"
+ }
+ }
+ },
+ "FilterRequest": {
+ "title": "FilterRequest",
+ "type": "object",
+ "properties": {
+ "filters": {
+ "title": "Filters",
+ "type": "object",
+ "additionalProperties": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ }
+ ]
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ }
+ ]
+ }
+ }
+ ]
+ }
+ }
+ }
+ },
+ "HTTPValidationError": {
+ "title": "HTTPValidationError",
+ "type": "object",
+ "properties": {
+ "detail": {
+ "title": "Detail",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ValidationError"
+ }
+ }
+ }
+ },
+ "Label": {
+ "title": "Label",
+ "required": [
+ "id",
+ "query",
+ "document",
+ "is_correct_answer",
+ "is_correct_document",
+ "origin"
+ ],
+ "type": "object",
+ "properties": {
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "document": {
+ "$ref": "#/components/schemas/Document"
+ },
+ "is_correct_answer": {
+ "title": "Is Correct Answer",
+ "type": "boolean"
+ },
+ "is_correct_document": {
+ "title": "Is Correct Document",
+ "type": "boolean"
+ },
+ "origin": {
+ "title": "Origin",
+ "enum": [
+ "user-feedback",
+ "gold-label"
+ ],
+ "type": "string"
+ },
+ "answer": {
+ "$ref": "#/components/schemas/Answer"
+ },
+ "no_answer": {
+ "title": "No Answer",
+ "type": "boolean"
+ },
+ "pipeline_id": {
+ "title": "Pipeline Id",
+ "type": "string"
+ },
+ "created_at": {
+ "title": "Created At",
+ "type": "string"
+ },
+ "updated_at": {
+ "title": "Updated At",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "filters": {
+ "title": "Filters",
+ "type": "object"
+ }
+ }
+ },
+ "QueryRequest": {
+ "title": "QueryRequest",
+ "required": [
+ "query"
+ ],
+ "type": "object",
+ "properties": {
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "params": {
+ "title": "Params",
+ "type": "object"
+ },
+ "debug": {
+ "title": "Debug",
+ "type": "boolean",
+ "default": false
+ }
+ },
+ "additionalProperties": false
+ },
+ "QueryResponse": {
+ "title": "QueryResponse",
+ "required": [
+ "query"
+ ],
+ "type": "object",
+ "properties": {
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "answers": {
+ "title": "Answers",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Answer"
+ },
+ "default": []
+ },
+ "documents": {
+ "title": "Documents",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Document"
+ },
+ "default": []
+ },
+ "_debug": {
+ "title": " Debug",
+ "type": "object"
+ }
+ }
+ },
+ "Span": {
+ "title": "Span",
+ "required": [
+ "start",
+ "end"
+ ],
+ "type": "object",
+ "properties": {
+ "start": {
+ "title": "Start",
+ "type": "integer"
+ },
+ "end": {
+ "title": "End",
+ "type": "integer"
+ }
+ }
+ },
+ "ValidationError": {
+ "title": "ValidationError",
+ "required": [
+ "loc",
+ "msg",
+ "type"
+ ],
+ "type": "object",
+ "properties": {
+ "loc": {
+ "title": "Location",
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ }
+ ]
+ }
+ },
+ "msg": {
+ "title": "Message",
+ "type": "string"
+ },
+ "type": {
+ "title": "Error Type",
+ "type": "string"
+ }
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/docs/_src/api/openapi/openapi.json b/docs/_src/api/openapi/openapi.json
index 5d70c32a2f..fd07ddd0d5 100644
--- a/docs/_src/api/openapi/openapi.json
+++ b/docs/_src/api/openapi/openapi.json
@@ -2,7 +2,7 @@
"openapi": "3.0.2",
"info": {
"title": "Haystack REST API",
- "version": "1.4.1rc0"
+ "version": "1.5.0"
},
"paths": {
"/initialized": {
diff --git a/docs/v1.5.0/Makefile b/docs/v1.5.0/Makefile
new file mode 100644
index 0000000000..8634435d76
--- /dev/null
+++ b/docs/v1.5.0/Makefile
@@ -0,0 +1,25 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+
+SPHINXBUILD := sphinx-build
+MAKEINFO := makeinfo
+
+BUILDDIR := build
+SOURCE := _src/
+# SPHINXFLAGS := -a -W -n -A local=1 -d $(BUILDDIR)/doctree
+SPHINXFLAGS := -A local=1 -d $(BUILDDIR)/doctree
+SPHINXOPTS := $(SPHINXFLAGS) $(SOURCE)
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ $(SPHINXBUILD) -M $@ $(SPHINXOPTS) $(BUILDDIR)/$@
diff --git a/docs/v1.5.0/_src/api/Makefile b/docs/v1.5.0/_src/api/Makefile
new file mode 100644
index 0000000000..d4bb2cbb9e
--- /dev/null
+++ b/docs/v1.5.0/_src/api/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = .
+BUILDDIR = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/v1.5.0/_src/api/_static/floating_sidebar.css b/docs/v1.5.0/_src/api/_static/floating_sidebar.css
new file mode 100644
index 0000000000..e59adc6722
--- /dev/null
+++ b/docs/v1.5.0/_src/api/_static/floating_sidebar.css
@@ -0,0 +1,29 @@
+div.sphinxsidebarwrapper {
+ position: relative;
+ top: 0px;
+ padding: 0;
+}
+
+div.sphinxsidebar {
+ margin: 0;
+ padding: 0 15px 0 15px;
+ width: 210px;
+ float: left;
+ font-size: 1em;
+ text-align: left;
+}
+
+div.sphinxsidebar .logo {
+ font-size: 1.8em;
+ color: #0A507A;
+ font-weight: 300;
+ text-align: center;
+}
+
+div.sphinxsidebar .logo img {
+ vertical-align: middle;
+}
+
+div.sphinxsidebar .download a img {
+ vertical-align: middle;
+}
\ No newline at end of file
diff --git a/docs/v1.5.0/_src/api/_templates/xxlayout.html b/docs/v1.5.0/_src/api/_templates/xxlayout.html
new file mode 100644
index 0000000000..de71588332
--- /dev/null
+++ b/docs/v1.5.0/_src/api/_templates/xxlayout.html
@@ -0,0 +1,46 @@
+{# put the sidebar before the body #}
+{% block sidebar1 %}{{ sidebar() }}{% endblock %}
+{% block sidebar2 %}{% endblock %}
+
+{% block extrahead %}
+
+{{ super() }}
+{#- if not embedded #}
+
+
+{#- endif #}
+{% endblock %}
\ No newline at end of file
diff --git a/docs/v1.5.0/_src/api/api/crawler.md b/docs/v1.5.0/_src/api/api/crawler.md
new file mode 100644
index 0000000000..05e518aa3d
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/crawler.md
@@ -0,0 +1,113 @@
+
+
+# Module crawler
+
+
+
+## Crawler
+
+```python
+class Crawler(BaseComponent)
+```
+
+Crawl texts from a website so that we can use them later in Haystack as a corpus for search / question answering etc.
+
+**Example:**
+```python
+| from haystack.nodes.connector import Crawler
+|
+| crawler = Crawler(output_dir="crawled_files")
+| # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/
+| docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"],
+| filter_urls= ["haystack\.deepset\.ai\/overview\/"])
+```
+
+
+
+#### Crawler.\_\_init\_\_
+
+```python
+def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None)
+```
+
+Init object with basic params for crawling (can be overwritten later).
+
+**Arguments**:
+
+- `output_dir`: Path for the directory to store files
+- `urls`: List of http(s) address(es) (can also be supplied later when calling crawl())
+- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options:
+0: Only initial list of urls
+1: Follow links found on the initial URLs (but no further)
+- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
+All URLs not matching at least one of the regular expressions will be dropped.
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+
+
+#### Crawler.crawl
+
+```python
+def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None) -> List[Path]
+```
+
+Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
+
+file per URL, including text and basic meta data).
+You can optionally specify via `filter_urls` to only crawl URLs that match a certain pattern.
+All parameters are optional here and only meant to overwrite instance attributes at runtime.
+If no parameters are provided to this method, the instance attributes that were passed during __init__ will be used.
+
+**Arguments**:
+
+- `output_dir`: Path for the directory to store files
+- `urls`: List of http addresses or single http address
+- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options:
+0: Only initial list of urls
+1: Follow links found on the initial URLs (but no further)
+- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
+All URLs not matching at least one of the regular expressions will be dropped.
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+**Returns**:
+
+List of paths where the crawled webpages got stored
+
+
+
+#### Crawler.run
+
+```python
+def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None) -> Tuple[Dict, str]
+```
+
+Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
+
+**Arguments**:
+
+- `output_dir`: Path for the directory to store files
+- `urls`: List of http addresses or single http address
+- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options:
+0: Only initial list of urls
+1: Follow links found on the initial URLs (but no further)
+- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
+All URLs not matching at least one of the regular expressions will be dropped.
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
+- `return_documents`: Return json files content
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+**Returns**:
+
+Tuple({"paths": List of filepaths, ...}, Name of output edge)
+
diff --git a/docs/v1.5.0/_src/api/api/document_classifier.md b/docs/v1.5.0/_src/api/api/document_classifier.md
new file mode 100644
index 0000000000..f59bd0904f
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/document_classifier.md
@@ -0,0 +1,161 @@
+
+
+# Module base
+
+
+
+## BaseDocumentClassifier
+
+```python
+class BaseDocumentClassifier(BaseComponent)
+```
+
+
+
+#### BaseDocumentClassifier.timing
+
+```python
+def timing(fn, attr_name)
+```
+
+Wrapper method used to time functions.
+
+
+
+# Module transformers
+
+
+
+## TransformersDocumentClassifier
+
+```python
+class TransformersDocumentClassifier(BaseDocumentClassifier)
+```
+
+Transformer based model for document classification using the HuggingFace's transformers framework
+(https://github.com/huggingface/transformers).
+While the underlying model can vary (BERT, Roberta, DistilBERT ...), the interface remains the same.
+This node classifies documents and adds the output from the classification step to the document's meta data.
+The meta field of the document is a dictionary with the following format:
+``'meta': {'name': '450_Baelor.txt', 'classification': {'label': 'neutral', 'probability': 0.9997646, ...} }``
+
+Classification is run on document's content field by default. If you want it to run on another field,
+set the `classification_field` to one of document's meta fields.
+
+With this document_classifier, you can directly get predictions via predict()
+
+ **Usage example at query time:**
+ ```python
+| ...
+| retriever = BM25Retriever(document_store=document_store)
+| document_classifier = TransformersDocumentClassifier(model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion")
+| p = Pipeline()
+| p.add_node(component=retriever, name="Retriever", inputs=["Query"])
+| p.add_node(component=document_classifier, name="Classifier", inputs=["Retriever"])
+| res = p.run(
+| query="Who is the father of Arya Stark?",
+| params={"Retriever": {"top_k": 10}}
+| )
+|
+| # print the classification results
+| print_documents(res, max_text_len=100, print_meta=True)
+| # or access the predicted class label directly
+| res["documents"][0].to_dict()["meta"]["classification"]["label"]
+ ```
+
+**Usage example at index time:**
+ ```python
+| ...
+| converter = TextConverter()
+| preprocessor = Preprocessor()
+| document_store = ElasticsearchDocumentStore()
+| document_classifier = TransformersDocumentClassifier(model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion",
+| batch_size=16)
+| p = Pipeline()
+| p.add_node(component=converter, name="TextConverter", inputs=["File"])
+| p.add_node(component=preprocessor, name="Preprocessor", inputs=["TextConverter"])
+| p.add_node(component=document_classifier, name="DocumentClassifier", inputs=["Preprocessor"])
+| p.add_node(component=document_store, name="DocumentStore", inputs=["DocumentClassifier"])
+| p.run(file_paths=file_paths)
+ ```
+
+
+
+#### TransformersDocumentClassifier.\_\_init\_\_
+
+```python
+def __init__(model_name_or_path: str = "bhadresh-savani/distilbert-base-uncased-emotion", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, return_all_scores: bool = False, task: str = "text-classification", labels: Optional[List[str]] = None, batch_size: Optional[int] = None, classification_field: str = None)
+```
+
+Load a text classification model from Transformers.
+
+Available models for the task of text-classification include:
+- ``'bhadresh-savani/distilbert-base-uncased-emotion'``
+- ``'Hate-speech-CNERG/dehatebert-mono-english'``
+
+Available models for the task of zero-shot-classification include:
+- ``'valhalla/distilbart-mnli-12-3'``
+- ``'cross-encoder/nli-distilroberta-base'``
+
+See https://huggingface.co/models for full list of available models.
+Filter for text classification models: https://huggingface.co/models?pipeline_tag=text-classification&sort=downloads
+Filter for zero-shot classification models (NLI): https://huggingface.co/models?pipeline_tag=zero-shot-classification&sort=downloads&search=nli
+
+**Arguments**:
+
+- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. 'bhadresh-savani/distilbert-base-uncased-emotion'.
+See https://huggingface.co/models for full list of available models.
+- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
+- `tokenizer`: Name of the tokenizer (usually the same as model)
+- `use_gpu`: Whether to use GPU (if available).
+- `return_all_scores`: Whether to return all prediction scores or just the one of the predicted class. Only used for task 'text-classification'.
+- `task`: 'text-classification' or 'zero-shot-classification'
+- `labels`: Only used for task 'zero-shot-classification'. List of string defining class labels, e.g.,
+["positive", "negative"] otherwise None. Given a LABEL, the sequence fed to the model is " sequence to
+classify This example is LABEL . " and the model predicts whether that sequence is a contradiction
+or an entailment.
+- `batch_size`: Number of Documents to be processed at a time.
+- `classification_field`: Name of Document's meta field to be used for classification. If left unset, Document.content is used by default.
+
+
+
+#### TransformersDocumentClassifier.predict
+
+```python
+def predict(documents: List[Document], batch_size: Optional[int] = None) -> List[Document]
+```
+
+Returns documents containing classification result in a meta field.
+
+Documents are updated in place.
+
+**Arguments**:
+
+- `documents`: A list of Documents to classify.
+- `batch_size`: The number of Documents to classify at a time.
+
+**Returns**:
+
+A list of Documents enriched with meta information.
+
+
+
+#### TransformersDocumentClassifier.predict\_batch
+
+```python
+def predict_batch(documents: Union[List[Document], List[List[Document]]], batch_size: Optional[int] = None) -> Union[List[Document], List[List[Document]]]
+```
+
+Returns documents containing classification result in meta field.
+
+Documents are updated in place.
+
+**Arguments**:
+
+- `documents`: List of Documents or list of lists of Documents to classify.
+- `batch_size`: Number of Documents to classify at a time.
+
+**Returns**:
+
+List of Documents or list of lists of Documents enriched with meta information.
+
diff --git a/docs/v1.5.0/_src/api/api/document_store.md b/docs/v1.5.0/_src/api/api/document_store.md
new file mode 100644
index 0000000000..d99c1be688
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/document_store.md
@@ -0,0 +1,5053 @@
+
+
+# Module base
+
+
+
+## BaseKnowledgeGraph
+
+```python
+class BaseKnowledgeGraph(BaseComponent)
+```
+
+Base class for implementing Knowledge Graphs.
+
+
+
+## BaseDocumentStore
+
+```python
+class BaseDocumentStore(BaseComponent)
+```
+
+Base class for implementing Document Stores.
+
+
+
+#### BaseDocumentStore.write\_documents
+
+```python
+@abstractmethod
+def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Indexes documents for later queries.
+
+**Arguments**:
+
+- `documents`: a list of Python dictionaries or a list of Haystack Document objects.
+For documents as dictionaries, the format is {"text": ""}.
+Optionally: Include meta data via {"text": "",
+"meta":{"name": ", "author": "somebody", ...}}
+It can be used for filtering and is accessible in the responses of the Finder.
+- `index`: Optional name of index where the documents shall be written to.
+If None, the DocumentStore's default index (self.index) will be used.
+- `batch_size`: Number of documents that are passed to bulk function at a time.
+- `duplicate_documents`: Handle duplicates document based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicates documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
+
+**Returns**:
+
+None
+
+
+
+#### BaseDocumentStore.get\_all\_documents
+
+```python
+@abstractmethod
+def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+```
+
+Get documents from the document store.
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `return_embedding`: Whether to return the document embeddings.
+- `batch_size`: Number of documents that are passed to bulk function at a time.
+- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
+
+
+
+#### BaseDocumentStore.get\_all\_documents\_generator
+
+```python
+@abstractmethod
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+```
+
+Get documents from the document store. Under-the-hood, documents are fetched in batches from the
+
+document store and yielded as individual documents. This method can be used to iteratively process
+a large number of documents without having to load all documents in memory.
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+__Example__:
+```python
+filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+}
+```
+- `return_embedding`: Whether to return the document embeddings.
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
+
+
+
+#### BaseDocumentStore.get\_all\_labels\_aggregated
+
+```python
+def get_all_labels_aggregated(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, open_domain: bool = True, drop_negative_labels: bool = False, drop_no_answers: bool = False, aggregate_by_meta: Optional[Union[str, list]] = None, headers: Optional[Dict[str, str]] = None) -> List[MultiLabel]
+```
+
+Return all labels in the DocumentStore, aggregated into MultiLabel objects.
+
+This aggregation step helps, for example, if you collected multiple possible answers for one question and you
+want now all answers bundled together in one place for evaluation.
+How they are aggregated is defined by the open_domain and aggregate_by_meta parameters.
+If the questions are being asked to a single document (i.e. SQuAD style), you should set open_domain=False to aggregate by question and document.
+If the questions are being asked to your full collection of documents, you should set open_domain=True to aggregate just by question.
+If the questions are being asked to a subslice of your document set (e.g. product review use cases),
+you should set open_domain=True and populate aggregate_by_meta with the names of Label meta fields to aggregate by question and your custom meta fields.
+For example, in a product review use case, you might set aggregate_by_meta=["product_id"] so that Labels
+with the same question but different answers from different documents are aggregated into the one MultiLabel
+object, provided that they have the same product_id (to be found in Label.meta["product_id"])
+
+**Arguments**:
+
+- `index`: Name of the index to get the labels from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `open_domain`: When True, labels are aggregated purely based on the question text alone.
+When False, labels are aggregated in a closed domain fashion based on the question text
+and also the id of the document that the label is tied to. In this setting, this function
+might return multiple MultiLabel objects with the same question string.
+- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
+- `aggregate_by_meta`: The names of the Label meta fields by which to aggregate. For example: ["product_id"]
+TODO drop params
+
+
+
+#### BaseDocumentStore.normalize\_embedding
+
+```python
+def normalize_embedding(emb: np.ndarray) -> None
+```
+
+Performs L2 normalization of embeddings vector inplace. Input can be a single vector (1D array) or a matrix
+(2D array).
+
+
+
+#### BaseDocumentStore.add\_eval\_data
+
+```python
+def add_eval_data(filename: str, doc_index: str = "eval_document", label_index: str = "label", batch_size: Optional[int] = None, preprocessor: Optional[PreProcessor] = None, max_docs: Union[int, bool] = None, open_domain: bool = False, headers: Optional[Dict[str, str]] = None)
+```
+
+Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
+
+If a jsonl file and a batch_size is passed to the function, documents are loaded batchwise
+from disk and also indexed batchwise to the DocumentStore in order to prevent out of memory errors.
+
+**Arguments**:
+
+- `filename`: Name of the file containing evaluation data (json or jsonl)
+- `doc_index`: Elasticsearch index where evaluation documents should be stored
+- `label_index`: Elasticsearch index where labeled questions should be stored
+- `batch_size`: Optional number of documents that are loaded and processed at a time.
+When set to None (default) all documents are processed at once.
+- `preprocessor`: Optional PreProcessor to preprocess evaluation documents.
+It can be used for splitting documents into passages (and assigning labels to corresponding passages).
+Currently the PreProcessor does not support split_by sentence, cleaning nor split_overlap != 0.
+When set to None (default) preprocessing is disabled.
+- `max_docs`: Optional number of documents that will be loaded.
+When set to None (default) all available eval documents are used.
+- `open_domain`: Set this to True if your file is an open domain dataset where two different answers to the
+same question might be found in different contexts.
+- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
+
+
+
+#### BaseDocumentStore.delete\_index
+
+```python
+@abstractmethod
+def delete_index(index: str)
+```
+
+Delete an existing index. The index including all data will be removed.
+
+**Arguments**:
+
+- `index`: The name of the index to delete.
+
+**Returns**:
+
+None
+
+
+
+#### BaseDocumentStore.run
+
+```python
+def run(documents: List[Union[dict, Document]], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, id_hash_keys: Optional[List[str]] = None)
+```
+
+Run requests of document stores
+
+Comment: We will gradually introduce the primitives. The document stores also accept dicts and parse them to documents.
+In the future, however, only documents themselves will be accepted. Parsing the dictionaries in the run function
+is therefore only an interim solution until the run function also accepts documents.
+
+**Arguments**:
+
+- `documents`: A list of dicts that are documents.
+- `headers`: A list of headers.
+- `index`: Optional name of index where the documents shall be written to.
+If None, the DocumentStore's default index (self.index) will be used.
+- `id_hash_keys`: List of the fields that the hashes of the ids are generated from.
+
+
+
+#### BaseDocumentStore.describe\_documents
+
+```python
+def describe_documents(index=None)
+```
+
+Return a summary of the documents in the document store
+
+
+
+## KeywordDocumentStore
+
+```python
+class KeywordDocumentStore(BaseDocumentStore)
+```
+
+Base class for implementing Document Stores that support keyword searches.
+
+
+
+#### KeywordDocumentStore.query
+
+```python
+@abstractmethod
+def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False, scale_score: bool = True) -> List[Document]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the query as defined by keyword matching algorithms like BM25.
+
+**Arguments**:
+
+- `query`: The query
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return per query.
+- `custom_query`: Custom query to be executed.
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
+- `all_terms_must_match`: Whether all terms of the query must match the document.
+If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
+Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
+Defaults to False.
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### KeywordDocumentStore.query\_batch
+
+```python
+@abstractmethod
+def query_batch(queries: List[str], filters: Optional[
+ Union[
+ Dict[str, Union[Dict, List, str, int, float, bool]],
+ List[Dict[str, Union[Dict, List, str, int, float, bool]]],
+ ]
+ ] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False, scale_score: bool = True) -> List[List[Document]]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the provided queries as defined by keyword matching algorithms like BM25.
+
+This method lets you find relevant documents for a single query string (output: List of Documents), or a
+list of query strings (output: List of Lists of Documents).
+
+**Arguments**:
+
+- `queries`: Single query or list of queries.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return per query.
+- `custom_query`: Custom query to be executed.
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
+- `all_terms_must_match`: Whether all terms of the query must match the document.
+If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
+Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
+Defaults to False.
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### get\_batches\_from\_generator
+
+```python
+def get_batches_from_generator(iterable, n)
+```
+
+Batch elements of an iterable into fixed-length chunks or blocks.
+
+
+
+# Module elasticsearch
+
+
+
+## ElasticsearchDocumentStore
+
+```python
+class ElasticsearchDocumentStore(KeywordDocumentStore)
+```
+
+
+
+#### ElasticsearchDocumentStore.\_\_init\_\_
+
+```python
+def __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity: str = "dot_product", timeout: int = 30, return_embedding: bool = False, duplicate_documents: str = "overwrite", index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym", use_system_proxy: bool = False)
+```
+
+A DocumentStore using Elasticsearch to store and query the documents for our search.
+
+* Keeps all the logic to store and query documents from Elastic, incl. mapping of fields, adding filters or boosts to your queries, and storing embeddings
+ * You can either use an existing Elasticsearch index or create a new one via haystack
+ * Retrievers operate on top of this DocumentStore to find the relevant documents for a query
+
+**Arguments**:
+
+- `host`: url(s) of elasticsearch nodes
+- `port`: port(s) of elasticsearch nodes
+- `username`: username (standard authentication via http_auth)
+- `password`: password (standard authentication via http_auth)
+- `api_key_id`: ID of the API key (alternative authentication mode to the above http_auth)
+- `api_key`: Secret value of the API key (alternative authentication mode to the above http_auth)
+- `aws4auth`: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package)
+- `index`: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one.
+- `label_index`: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one.
+- `search_fields`: Name of fields used by BM25Retriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"]
+- `content_field`: Name of field that might contain the answer and will therefore be passed to the Reader Model (e.g. "full_text").
+If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will just be returned.
+- `name_field`: Name of field that contains the title of the doc
+- `embedding_field`: Name of field containing an embedding vector (Only needed when using a dense retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top)
+- `embedding_dim`: Dimensionality of embedding vector (Only needed when using a dense retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top)
+- `custom_mapping`: If you want to use your own custom mapping for creating a new index in Elasticsearch, you can supply it here as a dictionary.
+- `analyzer`: Specify the default analyzer from one of the built-ins when creating a new Elasticsearch Index.
+Elasticsearch also has built-in analyzers for different languages (e.g. impacting tokenization). More info at:
+https://www.elastic.co/guide/en/elasticsearch/reference/7.9/analysis-analyzers.html
+- `excluded_meta_data`: Name of fields in Elasticsearch that should not be returned (e.g. [field_one, field_two]).
+Helpful if you have fields with long, irrelevant content that you don't want to display in results (e.g. embedding vectors).
+- `scheme`: 'https' or 'http', protocol used to connect to your elasticsearch instance
+- `ca_certs`: Root certificates for SSL: it is a path to certificate authority (CA) certs on disk. You can use certifi package with certifi.where() to find where the CA certs file is located in your machine.
+- `verify_certs`: Whether to be strict about ca certificates
+- `recreate_index`: If set to True, an existing elasticsearch index will be deleted and a new one will be
+created using the config you are using for initialization. Be aware that all data in the old index will be
+lost if you choose to recreate the index. Be aware that both the document_index and the label_index will
+be recreated.
+- `create_index`: Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case)
+..deprecated:: 2.0
+This param is deprecated. In the next major version we will always try to create an index if there is no
+existing index (the current behaviour when create_index=True). If you are looking to recreate an
+existing index by deleting it first if it already exist use param recreate_index.
+- `refresh_type`: Type of ES refresh used to control when changes made by a request (e.g. bulk) are made visible to search.
+If set to 'wait_for', continue only after changes are visible (slow, but safe).
+If set to 'false', continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion).
+More info at https://www.elastic.co/guide/en/elasticsearch/reference/6.8/docs-refresh.html
+- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is
+more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model.
+- `timeout`: Number of seconds after which an ElasticSearch request times out.
+- `return_embedding`: To return document embedding
+- `duplicate_documents`: Handle duplicates document based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicates documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+- `index_type`: The type of index to be created. Choose from 'flat' and 'hnsw'. Currently the
+ElasticsearchDocumentStore does not support HNSW but OpenDistroElasticsearchDocumentStore does.
+- `scroll`: Determines how long the current index is fixed, e.g. during updating all documents with embeddings.
+Defaults to "1d" and should not be larger than this. Can also be in minutes "5m" or hours "15h"
+For details, see https://www.elastic.co/guide/en/elasticsearch/reference/current/scroll-api.html
+- `skip_missing_embeddings`: Parameter to control queries based on vector similarity when indexed documents miss embeddings.
+Parameter options: (True, False)
+False: Raises exception if one or more documents do not have embeddings at query time
+True: Query will ignore all documents without embeddings (recommended if you concurrently index and query)
+- `synonyms`: List of synonyms can be passed while elasticsearch initialization.
+For example: [ "foo, bar => baz",
+ "foozball , foosball" ]
+More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
+- `synonym_type`: Synonym filter type can be passed.
+Synonym or Synonym_graph to handle synonyms, including multi-word synonyms correctly during the analysis process.
+More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html
+- `use_system_proxy`: Whether to use system proxy.
+
+
+
+#### ElasticsearchDocumentStore.get\_document\_by\_id
+
+```python
+def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document]
+```
+
+Fetch a document by specifying its text id string
+
+
+
+#### ElasticsearchDocumentStore.get\_documents\_by\_id
+
+```python
+def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+```
+
+Fetch documents by specifying a list of text id strings. Be aware that passing a large number of ids might lead
+to performance issues. Note that Elasticsearch limits the number of results to 10,000 documents by default.
+
+
+
+#### ElasticsearchDocumentStore.get\_metadata\_values\_by\_key
+
+```python
+def get_metadata_values_by_key(key: str, query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[dict]
+```
+
+Get values associated with a metadata key. The output is in the format:
+
+[{"value": "my-value-1", "count": 23}, {"value": "my-value-2", "count": 12}, ... ]
+
+**Arguments**:
+
+- `key`: the meta key name to get the values for.
+- `query`: narrow down the scope to documents matching the query string.
+- `filters`: Narrow down the scope to documents that match the given filters.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `index`: Elasticsearch index where the meta values should be searched. If not supplied,
+self.index will be used.
+- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+
+
+
+#### ElasticsearchDocumentStore.write\_documents
+
+```python
+def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Indexes documents for later queries in Elasticsearch.
+
+Behaviour if a document with the same ID already exists in ElasticSearch:
+a) (Default) Throw Elastic's standard error message for duplicate IDs.
+b) If `self.update_existing_documents=True` for DocumentStore: Overwrite existing documents.
+(This is only relevant if you pass your own ID when initializing a `Document`.
+If you don't set custom IDs for your Documents or just pass a list of dictionaries here,
+they will automatically get UUIDs assigned. See the `Document` class for details)
+
+**Arguments**:
+
+- `documents`: a list of Python dictionaries or a list of Haystack Document objects.
+For documents as dictionaries, the format is {"content": ""}.
+Optionally: Include meta data via {"content": "",
+"meta":{"name": "some-document-name", "author": "somebody", ...}}
+It can be used for filtering and is accessible in the responses of the Finder.
+Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
+should be changed to what you have set for self.content_field and self.name_field.
+- `index`: Elasticsearch index where the documents should be indexed. If not supplied, self.index will be used.
+- `batch_size`: Number of documents that are passed to Elasticsearch's bulk function at a time.
+- `duplicate_documents`: Handle duplicate documents based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicate documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+
+**Raises**:
+
+- `DuplicateDocumentError`: Exception triggered on duplicate document
+
+**Returns**:
+
+None
+
+
+
+#### ElasticsearchDocumentStore.write\_labels
+
+```python
+def write_labels(labels: Union[List[Label], List[dict]], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000)
+```
+
+Write annotation labels into document store.
+
+**Arguments**:
+
+- `labels`: A list of Python dictionaries or a list of Haystack Label objects.
+- `index`: Elasticsearch index where the labels should be stored. If not supplied, self.label_index will be used.
+- `batch_size`: Number of labels that are passed to Elasticsearch's bulk function at a time.
+- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+
+
+
+#### ElasticsearchDocumentStore.update\_document\_meta
+
+```python
+def update_document_meta(id: str, meta: Dict[str, str], headers: Optional[Dict[str, str]] = None, index: str = None)
+```
+
+Update the metadata dictionary of a document by specifying its string id
+
+
+
+#### ElasticsearchDocumentStore.get\_document\_count
+
+```python
+def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
+```
+
+Return the number of documents in the document store.
+
+
+
+#### ElasticsearchDocumentStore.get\_label\_count
+
+```python
+def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> int
+```
+
+Return the number of labels in the document store
+
+
+
+#### ElasticsearchDocumentStore.get\_embedding\_count
+
+```python
+def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) -> int
+```
+
+Return the count of embeddings in the document store.
+
+
+
+#### ElasticsearchDocumentStore.get\_all\_documents
+
+```python
+def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+```
+
+Get documents from the document store.
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Optional filters to narrow down the documents to return.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `return_embedding`: Whether to return the document embeddings.
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+
+
+
+#### ElasticsearchDocumentStore.get\_all\_documents\_generator
+
+```python
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+```
+
+Get documents from the document store. Under-the-hood, documents are fetched in batches from the
+
+document store and yielded as individual documents. This method can be used to iteratively process
+a large number of documents without having to load all documents in memory.
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Optional filters to narrow down the documents to return.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `return_embedding`: Whether to return the document embeddings.
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+
+
+
+#### ElasticsearchDocumentStore.get\_all\_labels
+
+```python
+def get_all_labels(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000) -> List[Label]
+```
+
+Return all labels in the document store
+
+
+
+#### ElasticsearchDocumentStore.query
+
+```python
+def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False, scale_score: bool = True) -> List[Document]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the query as defined by the BM25 algorithm.
+
+**Arguments**:
+
+- `query`: The query
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return per query.
+- `custom_query`: query string as per Elasticsearch DSL with a mandatory query placeholder(query).
+Optionally, ES `filter` clause can be added where the values of `terms` are placeholders
+that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..)
+names must match with the filters dict supplied in self.retrieve().
+::
+
+ **An example custom_query:**
+ ```python
+ | {
+ | "size": 10,
+ | "query": {
+ | "bool": {
+ | "should": [{"multi_match": {
+ | "query": ${query}, // mandatory query placeholder
+ | "type": "most_fields",
+ | "fields": ["content", "title"]}}],
+ | "filter": [ // optional custom filters
+ | {"terms": {"year": ${years}}},
+ | {"terms": {"quarter": ${quarters}}},
+ | {"range": {"date": {"gte": ${date}}}}
+ | ],
+ | }
+ | },
+ | }
+ ```
+
+ **For this custom_query, a sample retrieve() could be:**
+ ```python
+ | self.retrieve(query="Why did the revenue increase?",
+ | filters={"years": ["2019"], "quarters": ["Q1", "Q2"]})
+ ```
+
+Optionally, highlighting can be defined by specifying Elasticsearch's highlight settings.
+See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html.
+You will find the highlighted output in the returned Document's meta field by key "highlighted".
+::
+
+ **Example custom_query with highlighting:**
+ ```python
+ | {
+ | "size": 10,
+ | "query": {
+ | "bool": {
+ | "should": [{"multi_match": {
+ | "query": ${query}, // mandatory query placeholder
+ | "type": "most_fields",
+ | "fields": ["content", "title"]}}],
+ | }
+ | },
+ | "highlight": { // enable highlighting
+ | "fields": { // for fields content and title
+ | "content": {},
+ | "title": {}
+ | }
+ | },
+ | }
+ ```
+
+ **For this custom_query, highlighting info can be accessed by:**
+ ```python
+ | docs = self.retrieve(query="Why did the revenue increase?")
+ | highlighted_content = docs[0].meta["highlighted"]["content"]
+ | highlighted_title = docs[0].meta["highlighted"]["title"]
+ ```
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+- `all_terms_must_match`: Whether all terms of the query must match the document.
+If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
+Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
+Defaults to false.
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### ElasticsearchDocumentStore.query\_batch
+
+```python
+def query_batch(queries: List[str], filters: Optional[
+ Union[
+ Dict[str, Union[Dict, List, str, int, float, bool]],
+ List[Dict[str, Union[Dict, List, str, int, float, bool]]],
+ ]
+ ] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False, scale_score: bool = True) -> List[List[Document]]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the provided queries as defined by keyword matching algorithms like BM25.
+
+This method lets you find relevant documents for list of query strings (output: List of Lists of Documents).
+
+**Arguments**:
+
+- `queries`: List of query strings.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions. Can be a single filter that will be applied to each query or a list of filters
+(one filter per query).
+
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return per query.
+- `custom_query`: Custom query to be executed.
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
+- `all_terms_must_match`: Whether all terms of the query must match the document.
+If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
+Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
+Defaults to False.
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### ElasticsearchDocumentStore.query\_by\_embedding
+
+```python
+def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+```
+
+Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
+
+**Arguments**:
+
+- `query_emb`: Embedding of the query (e.g. gathered from DPR)
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return
+- `index`: Index name for storing the docs and metadata
+- `return_embedding`: To return document embedding
+- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### ElasticsearchDocumentStore.update\_embeddings
+
+```python
+def update_embeddings(retriever, index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None)
+```
+
+Updates the embeddings in the document store using the encoding model specified in the retriever.
+
+This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).
+
+**Arguments**:
+
+- `retriever`: Retriever to use to update the embeddings.
+- `index`: Index name to update
+- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to False,
+only documents without embeddings are processed. This mode can be used for
+incremental updating of embeddings, wherein, only newly indexed documents
+get processed.
+- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+
+**Returns**:
+
+None
+
+
+
+#### ElasticsearchDocumentStore.delete\_all\_documents
+
+```python
+def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Delete documents in an index. All documents are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the document from.
+- `filters`: Optional filters to narrow down the documents to be deleted.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+
+**Returns**:
+
+None
+
+
+
+#### ElasticsearchDocumentStore.delete\_documents
+
+```python
+def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Delete documents in an index. All documents are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the documents from. If None, the
+DocumentStore's default index (self.index) will be used
+- `ids`: Optional list of IDs to narrow down the documents to be deleted.
+- `filters`: Optional filters to narrow down the documents to be deleted.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+
+ If filters are provided along with a list of IDs, this method deletes the
+ intersection of the two query results (documents that match the filters and
+ have their ID in the list).
+- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+
+**Returns**:
+
+None
+
+
+
+#### ElasticsearchDocumentStore.delete\_labels
+
+```python
+def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Delete labels in an index. All labels are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the labels from. If None, the
+DocumentStore's default label index (self.label_index) will be used
+- `ids`: Optional list of IDs to narrow down the labels to be deleted.
+- `filters`: Optional filters to narrow down the labels to be deleted.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+
+**Returns**:
+
+None
+
+
+
+#### ElasticsearchDocumentStore.delete\_index
+
+```python
+def delete_index(index: str)
+```
+
+Delete an existing elasticsearch index. The index including all data will be removed.
+
+**Arguments**:
+
+- `index`: The name of the index to delete.
+
+**Returns**:
+
+None
+
+
+
+## OpenSearchDocumentStore
+
+```python
+class OpenSearchDocumentStore(ElasticsearchDocumentStore)
+```
+
+
+
+#### OpenSearchDocumentStore.\_\_init\_\_
+
+```python
+def __init__(scheme: str = "https", username: str = "admin", password: str = "admin", host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", ca_certs: Optional[str] = None, verify_certs: bool = False, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity: str = "dot_product", timeout: int = 30, return_embedding: bool = False, duplicate_documents: str = "overwrite", index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym", use_system_proxy: bool = False)
+```
+
+Document Store using OpenSearch (https://opensearch.org/). It is compatible with the AWS Elasticsearch Service.
+
+In addition to native Elasticsearch query & filtering, it provides efficient vector similarity search using
+the KNN plugin that can scale to a large number of documents.
+
+**Arguments**:
+
+- `host`: url(s) of elasticsearch nodes
+- `port`: port(s) of elasticsearch nodes
+- `username`: username (standard authentication via http_auth)
+- `password`: password (standard authentication via http_auth)
+- `api_key_id`: ID of the API key (alternative authentication mode to the above http_auth)
+- `api_key`: Secret value of the API key (alternative authentication mode to the above http_auth)
+- `aws4auth`: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package)
+- `index`: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one.
+- `label_index`: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one.
+- `search_fields`: Name of fields used by BM25Retriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"]
+- `content_field`: Name of field that might contain the answer and will therefore be passed to the Reader Model (e.g. "full_text").
+If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will just be returned.
+- `name_field`: Name of field that contains the title of the doc
+- `embedding_field`: Name of field containing an embedding vector (Only needed when using a dense retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top)
+Note, that in OpenSearch the similarity type for efficient approximate vector similarity calculations is tied to the embedding field's data type which cannot be changed after creation.
+- `embedding_dim`: Dimensionality of embedding vector (Only needed when using a dense retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top)
+- `custom_mapping`: If you want to use your own custom mapping for creating a new index in Elasticsearch, you can supply it here as a dictionary.
+- `analyzer`: Specify the default analyzer from one of the built-ins when creating a new Elasticsearch Index.
+Elasticsearch also has built-in analyzers for different languages (e.g. impacting tokenization). More info at:
+https://www.elastic.co/guide/en/elasticsearch/reference/7.9/analysis-analyzers.html
+- `excluded_meta_data`: Name of fields in Elasticsearch that should not be returned (e.g. [field_one, field_two]).
+Helpful if you have fields with long, irrelevant content that you don't want to display in results (e.g. embedding vectors).
+- `scheme`: 'https' or 'http', protocol used to connect to your elasticsearch instance
+- `ca_certs`: Root certificates for SSL: it is a path to certificate authority (CA) certs on disk. You can use certifi package with certifi.where() to find where the CA certs file is located in your machine.
+- `verify_certs`: Whether to be strict about ca certificates
+- `create_index`: Whether to try creating a new index (if an index with that name already exists, we will just continue in any case)
+- `refresh_type`: Type of ES refresh used to control when changes made by a request (e.g. bulk) are made visible to search.
+If set to 'wait_for', continue only after changes are visible (slow, but safe).
+If set to 'false', continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion).
+More info at https://www.elastic.co/guide/en/elasticsearch/reference/6.8/docs-refresh.html
+- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is
+more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model.
+Note, that the use of efficient approximate vector calculations in OpenSearch is tied to embedding_field's data type which cannot be changed after creation.
+You won't be able to use approximate vector calculations on an embedding_field which was created with a different similarity value.
+In such cases a fallback to exact but slow vector calculations will happen and a warning will be displayed.
+- `timeout`: Number of seconds after which an ElasticSearch request times out.
+- `return_embedding`: To return document embedding
+- `duplicate_documents`: Handle duplicate documents based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicate documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+- `index_type`: The type of index to be created. Choose from 'flat' and 'hnsw'.
+As OpenSearch currently does not support all similarity functions (e.g. dot_product) in exact vector similarity calculations,
+we don't make use of exact vector similarity when index_type='flat'. Instead we use the same approximate vector similarity calculations like in 'hnsw', but further optimized for accuracy.
+Exact vector similarity is only used as fallback when there's a mismatch between certain requested and indexed similarity types.
+In these cases however, a warning will be displayed. See similarity param for more information.
+- `scroll`: Determines how long the current index is fixed, e.g. during updating all documents with embeddings.
+Defaults to "1d" and should not be larger than this. Can also be in minutes "5m" or hours "15h"
+For details, see https://www.elastic.co/guide/en/elasticsearch/reference/current/scroll-api.html
+- `skip_missing_embeddings`: Parameter to control queries based on vector similarity when indexed documents miss embeddings.
+Parameter options: (True, False)
+False: Raises exception if one or more documents do not have embeddings at query time
+True: Query will ignore all documents without embeddings (recommended if you concurrently index and query)
+- `synonyms`: List of synonyms can be passed while elasticsearch initialization.
+For example: [ "foo, bar => baz",
+ "foozball , foosball" ]
+More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
+- `synonym_type`: Synonym filter type can be passed.
+Synonym or Synonym_graph to handle synonyms, including multi-word synonyms correctly during the analysis process.
+More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html
+
+
+
+#### OpenSearchDocumentStore.query\_by\_embedding
+
+```python
+def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+```
+
+Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
+
+**Arguments**:
+
+- `query_emb`: Embedding of the query (e.g. gathered from DPR)
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return
+- `index`: Index name for storing the docs and metadata
+- `return_embedding`: To return document embedding
+- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+## OpenDistroElasticsearchDocumentStore
+
+```python
+class OpenDistroElasticsearchDocumentStore(OpenSearchDocumentStore)
+```
+
+A DocumentStore which has an Open Distro for Elasticsearch service behind it.
+
+
+
+# Module memory
+
+
+
+## InMemoryDocumentStore
+
+```python
+class InMemoryDocumentStore(BaseDocumentStore)
+```
+
+In-memory document store
+
+
+
+#### InMemoryDocumentStore.\_\_init\_\_
+
+```python
+def __init__(index: str = "document", label_index: str = "label", embedding_field: Optional[str] = "embedding", embedding_dim: int = 768, return_embedding: bool = False, similarity: str = "dot_product", progress_bar: bool = True, duplicate_documents: str = "overwrite", use_gpu: bool = True, scoring_batch_size: int = 500000)
+```
+
+**Arguments**:
+
+- `index`: The documents are scoped to an index attribute that can be used when writing, querying,
+or deleting documents. This parameter sets the default value for document index.
+- `label_index`: The default value of index attribute for the labels.
+- `embedding_field`: Name of field containing an embedding vector (Only needed when using a dense retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top)
+- `embedding_dim`: The size of the embedding vector.
+- `return_embedding`: To return document embedding
+- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is
+more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model.
+- `progress_bar`: Whether to show a tqdm progress bar or not.
+Can be helpful to disable in production deployments to keep the logs clean.
+- `duplicate_documents`: Handle duplicate documents based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicate documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+- `use_gpu`: Whether to use a GPU or the CPU for calculating embedding similarity.
+Falls back to CPU if no GPU is available.
+- `scoring_batch_size`: Batch size of documents to calculate similarity for. Very small batch sizes are inefficient.
+Very large batch sizes can overrun GPU memory. In general you want to make sure
+you have at least `embedding_dim`*`scoring_batch_size`*4 bytes available in GPU memory.
+Since the data is originally stored in CPU memory there is little risk of overrunning memory
+when running on CPU.
+
+
+
+#### InMemoryDocumentStore.write\_documents
+
+```python
+def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Indexes documents for later queries.
+
+**Arguments**:
+
+- `documents`: a list of Python dictionaries or a list of Haystack Document objects.
+For documents as dictionaries, the format is {"text": ""}.
+ Optionally: Include meta data via {"text": "",
+ "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
+ It can be used for filtering and is accessible in the responses of the Finder.
+- `index`: write documents to a custom namespace. For instance, documents for evaluation can be indexed in a
+separate index than the documents for search.
+- `duplicate_documents`: Handle duplicate documents based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicate documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+
+**Raises**:
+
+- `DuplicateDocumentError`: Exception triggered on duplicate document
+
+**Returns**:
+
+None
+
+
+
+#### InMemoryDocumentStore.write\_labels
+
+```python
+def write_labels(labels: Union[List[dict], List[Label]], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Write annotation labels into document store.
+
+
+
+#### InMemoryDocumentStore.get\_document\_by\_id
+
+```python
+def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document]
+```
+
+Fetch a document by specifying its text id string.
+
+
+
+#### InMemoryDocumentStore.get\_documents\_by\_id
+
+```python
+def get_documents_by_id(ids: List[str], index: Optional[str] = None) -> List[Document]
+```
+
+Fetch documents by specifying a list of text id strings.
+
+
+
+#### InMemoryDocumentStore.get\_scores\_torch
+
+```python
+def get_scores_torch(query_emb: np.ndarray, document_to_search: List[Document]) -> List[float]
+```
+
+Calculate similarity scores between query embedding and a list of documents using torch.
+
+**Arguments**:
+
+- `query_emb`: Embedding of the query (e.g. gathered from DPR)
+- `document_to_search`: List of documents to compare `query_emb` against.
+
+
+
+#### InMemoryDocumentStore.get\_scores\_numpy
+
+```python
+def get_scores_numpy(query_emb: np.ndarray, document_to_search: List[Document]) -> List[float]
+```
+
+Calculate similarity scores between query embedding and a list of documents using numpy.
+
+**Arguments**:
+
+- `query_emb`: Embedding of the query (e.g. gathered from DPR)
+- `document_to_search`: List of documents to compare `query_emb` against.
+
+
+
+#### InMemoryDocumentStore.query\_by\_embedding
+
+```python
+def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+```
+
+Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
+
+**Arguments**:
+
+- `query_emb`: Embedding of the query (e.g. gathered from DPR)
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+Example:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+To use the same logical operator multiple times on the same level, logical operators take
+optionally a list of dictionaries as value.
+Example:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return
+- `index`: Index name for storing the docs and metadata
+- `return_embedding`: To return document embedding
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### InMemoryDocumentStore.update\_embeddings
+
+```python
+def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000)
+```
+
+Updates the embeddings in the document store using the encoding model specified in the retriever.
+
+This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).
+
+**Arguments**:
+
+- `retriever`: Retriever to use to get embeddings for text
+- `index`: Index name for which embeddings are to be updated. If set to None, the default self.index is used.
+- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to False,
+only documents without embeddings are processed. This mode can be used for
+incremental updating of embeddings, wherein, only newly indexed documents
+get processed.
+- `filters`: Narrow down the scope to documents that match the given filters.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+Example:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+
+**Returns**:
+
+None
+
+
+
+#### InMemoryDocumentStore.get\_document\_count
+
+```python
+def get_document_count(filters: Optional[Dict[str, Any]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
+```
+
+Return the number of documents in the document store.
+
+
+
+#### InMemoryDocumentStore.get\_embedding\_count
+
+```python
+def get_embedding_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int
+```
+
+Return the count of embeddings in the document store.
+
+
+
+#### InMemoryDocumentStore.get\_label\_count
+
+```python
+def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> int
+```
+
+Return the number of labels in the document store.
+
+
+
+#### InMemoryDocumentStore.get\_all\_documents
+
+```python
+def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+```
+
+Get all documents from the document store as a list.
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Narrow down the scope to documents that match the given filters.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+Example:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `return_embedding`: Whether to return the document embeddings.
+
+
+
+#### InMemoryDocumentStore.get\_all\_documents\_generator
+
+```python
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+```
+
+Get all documents from the document store. The method returns a Python Generator that yields individual
+
+documents.
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Narrow down the scope to documents that match the given filters.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+Example:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `return_embedding`: Whether to return the document embeddings.
+
+
+
+#### InMemoryDocumentStore.get\_all\_labels
+
+```python
+def get_all_labels(index: str = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) -> List[Label]
+```
+
+Return all labels in the document store.
+
+
+
+#### InMemoryDocumentStore.delete\_all\_documents
+
+```python
+def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Delete documents in an index. All documents are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the document from.
+- `filters`: Narrow down the scope to documents that match the given filters.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+Example:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+
+**Returns**:
+
+None
+
+
+
+#### InMemoryDocumentStore.delete\_documents
+
+```python
+def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Delete documents in an index. All documents are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `ids`: Optional list of IDs to narrow down the documents to be deleted.
+- `filters`: Narrow down the scope to documents that match the given filters.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+Example:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+
+**Returns**:
+
+None
+
+
+
+#### InMemoryDocumentStore.delete\_index
+
+```python
+def delete_index(index: str)
+```
+
+Delete an existing index. The index including all data will be removed.
+
+**Arguments**:
+
+- `index`: The name of the index to delete.
+
+**Returns**:
+
+None
+
+
+
+#### InMemoryDocumentStore.delete\_labels
+
+```python
+def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Delete labels in an index. All labels are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the labels from. If None, the
+DocumentStore's default label index (self.label_index) will be used.
+- `ids`: Optional list of IDs to narrow down the labels to be deleted.
+- `filters`: Narrow down the scope to documents that match the given filters.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+Example:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+
+**Returns**:
+
+None
+
+
+
+# Module sql
+
+
+
+## SQLDocumentStore
+
+```python
+class SQLDocumentStore(BaseDocumentStore)
+```
+
+
+
+#### SQLDocumentStore.\_\_init\_\_
+
+```python
+def __init__(url: str = "sqlite://", index: str = "document", label_index: str = "label", duplicate_documents: str = "overwrite", check_same_thread: bool = False, isolation_level: str = None)
+```
+
+An SQL backed DocumentStore. Currently supports SQLite, PostgreSQL and MySQL backends.
+
+**Arguments**:
+
+- `url`: URL for SQL database as expected by SQLAlchemy. More info here: https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls
+- `index`: The documents are scoped to an index attribute that can be used when writing, querying, or deleting documents.
+This parameter sets the default value for document index.
+- `label_index`: The default value of index attribute for the labels.
+- `duplicate_documents`: Handle duplicate documents based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicate documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+- `check_same_thread`: Set to False to mitigate multithreading issues in older SQLite versions (see https://docs.sqlalchemy.org/en/14/dialects/sqlite.html?highlight=check_same_thread#threading-pooling-behavior)
+- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level)
+
+
+
+#### SQLDocumentStore.get\_document\_by\_id
+
+```python
+def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document]
+```
+
+Fetch a document by specifying its text id string
+
+
+
+#### SQLDocumentStore.get\_documents\_by\_id
+
+```python
+def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+```
+
+Fetch documents by specifying a list of text id strings
+
+
+
+#### SQLDocumentStore.get\_documents\_by\_vector\_ids
+
+```python
+def get_documents_by_vector_ids(vector_ids: List[str], index: Optional[str] = None, batch_size: int = 10_000)
+```
+
+Fetch documents by specifying a list of text vector id strings
+
+
+
+#### SQLDocumentStore.get\_all\_documents\_generator
+
+```python
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+```
+
+Get documents from the document store. Under-the-hood, documents are fetched in batches from the
+
+document store and yielded as individual documents. This method can be used to iteratively process
+a large number of documents without having to load all documents in memory.
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Optional filters to narrow down the documents to return.
+Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `return_embedding`: Whether to return the document embeddings.
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+
+
+
+#### SQLDocumentStore.get\_all\_labels
+
+```python
+def get_all_labels(index=None, filters: Optional[dict] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Return all labels in the document store
+
+
+
+#### SQLDocumentStore.write\_documents
+
+```python
+def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> None
+```
+
+Indexes documents for later queries.
+
+**Arguments**:
+
+- `documents`: a list of Python dictionaries or a list of Haystack Document objects.
+For documents as dictionaries, the format is {"text": ""}.
+Optionally: Include meta data via {"text": "",
+"meta":{"name": "<some-document-name>", "author": "somebody", ...}}
+It can be used for filtering and is accessible in the responses of the Finder.
+- `index`: add an optional index attribute to documents. It can be later used for filtering. For instance,
+documents for evaluation can be indexed in a separate index than the documents for search.
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `duplicate_documents`: Handle duplicate documents based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicate documents
+overwrite: Update any existing documents with the same ID when adding documents,
+but this is considerably slower (default).
+fail: an error is raised if the document ID of the document being added already
+exists.
+
+**Returns**:
+
+None
+
+
+
+#### SQLDocumentStore.write\_labels
+
+```python
+def write_labels(labels, index=None, headers: Optional[Dict[str, str]] = None)
+```
+
+Write annotation labels into document store.
+
+
+
+#### SQLDocumentStore.update\_vector\_ids
+
+```python
+def update_vector_ids(vector_id_map: Dict[str, str], index: Optional[str] = None, batch_size: int = 10_000)
+```
+
+Update vector_ids for given document_ids.
+
+**Arguments**:
+
+- `vector_id_map`: dict containing mapping of document_id -> vector_id.
+- `index`: filter documents by the optional index attribute for documents in database.
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+
+
+
+#### SQLDocumentStore.reset\_vector\_ids
+
+```python
+def reset_vector_ids(index: Optional[str] = None)
+```
+
+Set vector IDs for all documents as None
+
+
+
+#### SQLDocumentStore.update\_document\_meta
+
+```python
+def update_document_meta(id: str, meta: Dict[str, str], index: str = None)
+```
+
+Update the metadata dictionary of a document by specifying its string id
+
+
+
+#### SQLDocumentStore.get\_document\_count
+
+```python
+def get_document_count(filters: Optional[Dict[str, Any]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
+```
+
+Return the number of documents in the document store.
+
+
+
+#### SQLDocumentStore.get\_label\_count
+
+```python
+def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> int
+```
+
+Return the number of labels in the document store
+
+
+
+#### SQLDocumentStore.delete\_all\_documents
+
+```python
+def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Delete documents in an index. All documents are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the document from.
+- `filters`: Optional filters to narrow down the documents to be deleted.
+
+**Returns**:
+
+None
+
+
+
+#### SQLDocumentStore.delete\_documents
+
+```python
+def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Delete documents in an index. All documents are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the document from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `ids`: Optional list of IDs to narrow down the documents to be deleted.
+- `filters`: Optional filters to narrow down the documents to be deleted.
+Example filters: {"name": ["some", "more"], "category": ["only_one"]}.
+If filters are provided along with a list of IDs, this method deletes the
+intersection of the two query results (documents that match the filters and
+have their ID in the list).
+
+**Returns**:
+
+None
+
+
+
+#### SQLDocumentStore.delete\_index
+
+```python
+def delete_index(index: str)
+```
+
+Delete an existing index. The index including all data will be removed.
+
+**Arguments**:
+
+- `index`: The name of the index to delete.
+
+**Returns**:
+
+None
+
+
+
+#### SQLDocumentStore.delete\_labels
+
+```python
+def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Delete labels from the document store. All labels are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the labels from. If None, the
+DocumentStore's default label index (self.label_index) will be used.
+- `ids`: Optional list of IDs to narrow down the labels to be deleted.
+- `filters`: Optional filters to narrow down the labels to be deleted.
+Example filters: {"id": ["9a196e41-f7b5-45b4-bd19-5feb7501c159", "9a196e41-f7b5-45b4-bd19-5feb7501c160"]} or {"query": ["question2"]}
+
+**Returns**:
+
+None
+
+
+
+# Module faiss
+
+
+
+## FAISSDocumentStore
+
+```python
+class FAISSDocumentStore(SQLDocumentStore)
+```
+
+Document store for very large scale embedding based dense retrievers like the DPR.
+
+It implements the FAISS library(https://github.com/facebookresearch/faiss)
+to perform similarity search on vectors.
+
+The document text and meta-data (for filtering) are stored using the SQLDocumentStore, while
+the vector embeddings are indexed in a FAISS Index.
+
+
+
+#### FAISSDocumentStore.\_\_init\_\_
+
+```python
+def __init__(sql_url: str = "sqlite:///faiss_document_store.db", vector_dim: int = None, embedding_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, isolation_level: str = None, n_links: int = 64, ef_search: int = 20, ef_construction: int = 80)
+```
+
+**Arguments**:
+
+- `sql_url`: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale
+deployment, Postgres is recommended.
+- `vector_dim`: Deprecated. Use embedding_dim instead.
+- `embedding_dim`: The embedding vector size. Default: 768.
+- `faiss_index_factory_str`: Create a new FAISS index of the specified type.
+The type is determined from the given string following the conventions
+of the original FAISS index factory.
+Recommended options:
+- "Flat" (default): Best accuracy (= exact). Becomes slow and RAM intense for > 1 Mio docs.
+- "HNSW": Graph-based heuristic. If not further specified,
+ we use the following config:
+ HNSW64, efConstruction=80 and efSearch=20
+- "IVFx,Flat": Inverted Index. Replace x with the number of centroids aka nlist.
+ Rule of thumb: nlist = 10 * sqrt (num_docs) is a good starting point.
+For more details see:
+- Overview of indices https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
+- Guideline for choosing an index https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
+- FAISS Index factory https://github.com/facebookresearch/faiss/wiki/The-index-factory
+Benchmarks: XXX
+- `faiss_index`: Pass an existing FAISS Index, i.e. an empty one that you configured manually
+or one with docs that you used in Haystack before and want to load again.
+- `return_embedding`: To return document embedding. Unlike other document stores, FAISS will return normalized embeddings
+- `index`: Name of index in document store to use.
+- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is
+more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence-Transformer model.
+In both cases, the returned values in Document.score are normalized to be in range [0,1]:
+For `dot_product`: expit(np.asarray(raw_score / 100))
+For `cosine`: (raw_score + 1) / 2
+- `embedding_field`: Name of field containing an embedding vector.
+- `progress_bar`: Whether to show a tqdm progress bar or not.
+Can be helpful to disable in production deployments to keep the logs clean.
+- `duplicate_documents`: How to handle duplicate documents.
+Parameter options: ('skip', 'overwrite', 'fail')
+skip: Ignore the duplicate documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+- `faiss_index_path`: Stored FAISS index file. Can be created via calling `save()`.
+If specified no other params besides faiss_config_path must be specified.
+- `faiss_config_path`: Stored FAISS initial configuration parameters.
+Can be created via calling `save()`
+- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level)
+- `n_links`: used only if index_factory == "HNSW"
+- `ef_search`: used only if index_factory == "HNSW"
+- `ef_construction`: used only if index_factory == "HNSW"
+
+
+
+#### FAISSDocumentStore.write\_documents
+
+```python
+def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> None
+```
+
+Add new documents to the DocumentStore.
+
+**Arguments**:
+
+- `documents`: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index
+them right away in FAISS. If not, you can later call update_embeddings() to create & index them.
+- `index`: (SQL) index name for storing the docs and metadata
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `duplicate_documents`: How to handle duplicate documents.
+Parameter options: ('skip', 'overwrite', 'fail')
+skip: Ignore the duplicate documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+
+**Raises**:
+
+- `DuplicateDocumentError`: Exception trigger on duplicate document
+
+**Returns**:
+
+None
+
+
+
+#### FAISSDocumentStore.update\_embeddings
+
+```python
+def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Any]] = None, batch_size: int = 10_000)
+```
+
+Updates the embeddings in the document store using the encoding model specified in the retriever.
+
+This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).
+
+**Arguments**:
+
+- `retriever`: Retriever to use to get embeddings for text
+- `index`: Index name for which embeddings are to be updated. If set to None, the default self.index is used.
+- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to False,
+only documents without embeddings are processed. This mode can be used for
+incremental updating of embeddings, wherein, only newly indexed documents
+get processed.
+- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated.
+Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+
+**Returns**:
+
+None
+
+
+
+#### FAISSDocumentStore.get\_all\_documents\_generator
+
+```python
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+```
+
+Get all documents from the document store. Under-the-hood, documents are fetched in batches from the
+
+document store and yielded as individual documents. This method can be used to iteratively process
+a large number of documents without having to load all documents in memory.
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Optional filters to narrow down the documents to return.
+Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `return_embedding`: Whether to return the document embeddings. Unlike other document stores, FAISS will return normalized embeddings
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+
+
+
+#### FAISSDocumentStore.get\_embedding\_count
+
+```python
+def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None) -> int
+```
+
+Return the count of embeddings in the document store.
+
+
+
+#### FAISSDocumentStore.train\_index
+
+```python
+def train_index(documents: Optional[Union[List[dict], List[Document]]], embeddings: Optional[np.ndarray] = None, index: Optional[str] = None)
+```
+
+Some FAISS indices (e.g. IVF) require initial "training" on a sample of vectors before you can add your final vectors.
+
+The train vectors should come from the same distribution as your final ones.
+You can pass either documents (incl. embeddings) or just the plain embeddings that the index shall be trained on.
+
+**Arguments**:
+
+- `documents`: Documents (incl. the embeddings)
+- `embeddings`: Plain embeddings
+- `index`: Name of the index to train. If None, the DocumentStore's default index (self.index) will be used.
+
+**Returns**:
+
+None
+
+
+
+#### FAISSDocumentStore.delete\_all\_documents
+
+```python
+def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Delete all documents from the document store.
+
+
+
+#### FAISSDocumentStore.delete\_documents
+
+```python
+def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Delete documents from the document store. All documents are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `ids`: Optional list of IDs to narrow down the documents to be deleted.
+- `filters`: Optional filters to narrow down the documents to be deleted.
+Example filters: {"name": ["some", "more"], "category": ["only_one"]}.
+If filters are provided along with a list of IDs, this method deletes the
+intersection of the two query results (documents that match the filters and
+have their ID in the list).
+
+**Returns**:
+
+None
+
+
+
+#### FAISSDocumentStore.delete\_index
+
+```python
+def delete_index(index: str)
+```
+
+Delete an existing index. The index including all data will be removed.
+
+**Arguments**:
+
+- `index`: The name of the index to delete.
+
+**Returns**:
+
+None
+
+
+
+#### FAISSDocumentStore.query\_by\_embedding
+
+```python
+def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+```
+
+Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
+
+**Arguments**:
+
+- `query_emb`: Embedding of the query (e.g. gathered from DPR)
+- `filters`: Optional filters to narrow down the search space.
+Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `top_k`: How many documents to return
+- `index`: Index name to query the document from.
+- `return_embedding`: To return document embedding. Unlike other document stores, FAISS will return normalized embeddings
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### FAISSDocumentStore.save
+
+```python
+def save(index_path: Union[str, Path], config_path: Optional[Union[str, Path]] = None)
+```
+
+Save FAISS Index to the specified file.
+
+**Arguments**:
+
+- `index_path`: Path to save the FAISS index to.
+- `config_path`: Path to save the initial configuration parameters to.
+Defaults to the same as the file path, save the extension (.json).
+This file contains all the parameters passed to FAISSDocumentStore()
+at creation time (for example the SQL path, embedding_dim, etc), and will be
+used by the `load` method to restore the index with the appropriate configuration.
+
+**Returns**:
+
+None
+
+
+
+#### FAISSDocumentStore.load
+
+```python
+@classmethod
+def load(cls, index_path: Union[str, Path], config_path: Optional[Union[str, Path]] = None)
+```
+
+Load a saved FAISS index from a file and connect to the SQL database.
+
+Note: In order to have a correct mapping from FAISS to SQL,
+ make sure to use the same SQL DB that you used when calling `save()`.
+
+**Arguments**:
+
+- `index_path`: Stored FAISS index file. Can be created via calling `save()`
+- `config_path`: Stored FAISS initial configuration parameters.
+Can be created via calling `save()`
+
+
+
+# Module milvus1
+
+
+
+## Milvus1DocumentStore
+
+```python
+class Milvus1DocumentStore(SQLDocumentStore)
+```
+
+Milvus (https://milvus.io/) is a highly reliable, scalable Document Store specialized on storing and processing vectors.
+Therefore, it is particularly suited for Haystack users that work with dense retrieval methods (like DPR).
+In contrast to FAISS, Milvus ...
+ - runs as a separate service (e.g. a Docker container) and can scale easily in a distributed environment
+ - allows dynamic data management (i.e. you can insert/delete vectors without recreating the whole index)
+ - encapsulates multiple ANN libraries (FAISS, ANNOY ...)
+
+This class uses Milvus for all vector related storage, processing and querying.
+The meta-data (e.g. for filtering) and the document text are however stored in a separate SQL Database as Milvus
+does not allow these data types (yet).
+
+Usage:
+1. Start a Milvus server (see https://milvus.io/docs/v1.0.0/install_milvus.md)
+2. Run pip install farm-haystack[milvus1]
+3. Init a MilvusDocumentStore in Haystack
+
+
+
+#### Milvus1DocumentStore.\_\_init\_\_
+
+```python
+def __init__(sql_url: str = "sqlite:///", milvus_url: str = "tcp://localhost:19530", connection_pool: str = "SingletonThread", index: str = "document", vector_dim: int = None, embedding_dim: int = 768, index_file_size: int = 1024, similarity: str = "dot_product", index_type: IndexType = IndexType.FLAT, index_param: Optional[Dict[str, Any]] = None, search_param: Optional[Dict[str, Any]] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", isolation_level: str = None)
+```
+
+**WARNING:** Milvus1DocumentStore is deprecated and will be removed in a future version. Please switch to Milvus2
+
+or consider using another DocumentStore.
+
+**Arguments**:
+
+- `sql_url`: SQL connection URL for storing document texts and metadata. It defaults to a local, file based SQLite DB. For large scale
+deployment, Postgres is recommended. If using MySQL then same server can also be used for
+Milvus metadata. For more details see https://milvus.io/docs/v1.0.0/data_manage.md.
+- `milvus_url`: Milvus server connection URL for storing and processing vectors.
+Protocol, host and port will automatically be inferred from the URL.
+See https://milvus.io/docs/v1.0.0/install_milvus.md for instructions to start a Milvus instance.
+- `connection_pool`: Connection pool type to connect with Milvus server. Default: "SingletonThread".
+- `index`: Index name for text, embedding and metadata (in Milvus terms, this is the "collection name").
+- `vector_dim`: Deprecated. Use embedding_dim instead.
+- `embedding_dim`: The embedding vector size. Default: 768.
+- `index_file_size`: Specifies the size of each segment file that is stored by Milvus and its default value is 1024 MB.
+When the size of newly inserted vectors reaches the specified volume, Milvus packs these vectors into a new segment.
+Milvus creates one index file for each segment. When conducting a vector search, Milvus searches all index files one by one.
+As a rule of thumb, we would see a 30% ~ 50% increase in the search performance after changing the value of index_file_size from 1024 to 2048.
+Note that an overly large index_file_size value may cause failure to load a segment into the memory or graphics memory.
+(From https://milvus.io/docs/v1.0.0/performance_faq.md#How-can-I-get-the-best-performance-from-Milvus-through-setting-index_file_size)
+- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default and recommended for DPR embeddings.
+'cosine' is recommended for Sentence Transformers.
+- `index_type`: Type of approximate nearest neighbour (ANN) index used. The choice here determines your tradeoff between speed and accuracy.
+Some popular options:
+- FLAT (default): Exact method, slow
+- IVF_FLAT, inverted file based heuristic, fast
+- HNSW: Graph-based, fast
+- ANNOY: Tree based, fast
+See: https://milvus.io/docs/v1.0.0/index.md
+- `index_param`: Configuration parameters for the chosen index_type needed at indexing time.
+For example: {"nlist": 16384} as the number of cluster units to create for index_type IVF_FLAT.
+See https://milvus.io/docs/v1.0.0/index.md
+- `search_param`: Configuration parameters for the chosen index_type needed at query time.
+For example: {"nprobe": 10} as the number of cluster units to query for index_type IVF_FLAT.
+See https://milvus.io/docs/v1.0.0/index.md
+- `return_embedding`: To return document embedding.
+- `embedding_field`: Name of field containing an embedding vector.
+- `progress_bar`: Whether to show a tqdm progress bar or not.
+Can be helpful to disable in production deployments to keep the logs clean.
+- `duplicate_documents`: How to handle duplicate documents.
+Parameter options: ('skip', 'overwrite', 'fail')
+skip: Ignore the duplicate documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level)
+
+
+
+#### Milvus1DocumentStore.write\_documents
+
+```python
+def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None, index_param: Optional[Dict[str, Any]] = None)
+```
+
+Add new documents to the DocumentStore.
+
+**Arguments**:
+
+- `documents`: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index
+them right away in Milvus. If not, you can later call update_embeddings() to create & index them.
+- `index`: (SQL) index name for storing the docs and metadata
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `duplicate_documents`: How to handle duplicate documents.
+Parameter options: ('skip', 'overwrite', 'fail')
+skip: Ignore the duplicate documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+
+**Raises**:
+
+- `DuplicateDocumentError`: Exception trigger on duplicate document
+
+**Returns**:
+
+None
+
+
+
+#### Milvus1DocumentStore.update\_embeddings
+
+```python
+def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, batch_size: int = 10_000, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Any]] = None)
+```
+
+Updates the embeddings in the document store using the encoding model specified in the retriever.
+
+This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).
+
+**Arguments**:
+
+- `retriever`: Retriever to use to get embeddings for text
+- `index`: (SQL) index name for storing the docs and metadata
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to False,
+only documents without embeddings are processed. This mode can be used for
+incremental updating of embeddings, wherein, only newly indexed documents
+get processed.
+- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated.
+Example: {"name": ["some", "more"], "category": ["only_one"]}
+
+**Returns**:
+
+None
+
+
+
+#### Milvus1DocumentStore.query\_by\_embedding
+
+```python
+def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+```
+
+Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
+
+**Arguments**:
+
+- `query_emb`: Embedding of the query (e.g. gathered from DPR)
+- `filters`: Optional filters to narrow down the search space.
+Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `top_k`: How many documents to return
+- `index`: (SQL) index name for storing the docs and metadata
+- `return_embedding`: To return document embedding
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+**Returns**:
+
+list of Documents that are the most similar to `query_emb`
+
+
+
+#### Milvus1DocumentStore.delete\_all\_documents
+
+```python
+def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Delete all documents (from SQL AND Milvus).
+
+**Arguments**:
+
+- `index`: (SQL) index name for storing the docs and metadata
+- `filters`: Optional filters to narrow down the search space.
+Example: {"name": ["some", "more"], "category": ["only_one"]}
+
+**Returns**:
+
+None
+
+
+
+#### Milvus1DocumentStore.delete\_documents
+
+```python
+def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Delete documents in an index. All documents are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the document from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `ids`: Optional list of IDs to narrow down the documents to be deleted.
+- `filters`: Optional filters to narrow down the documents to be deleted.
+Example filters: {"name": ["some", "more"], "category": ["only_one"]}.
+If filters are provided along with a list of IDs, this method deletes the
+intersection of the two query results (documents that match the filters and
+have their ID in the list).
+
+**Returns**:
+
+None
+
+
+
+#### Milvus1DocumentStore.delete\_index
+
+```python
+def delete_index(index: str)
+```
+
+Delete an existing index. The index including all data will be removed.
+
+**Arguments**:
+
+- `index`: The name of the index to delete.
+
+**Returns**:
+
+None
+
+
+
+#### Milvus1DocumentStore.get\_all\_documents\_generator
+
+```python
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+```
+
+Get all documents from the document store. Under-the-hood, documents are fetched in batches from the
+
+document store and yielded as individual documents. This method can be used to iteratively process
+a large number of documents without having to load all documents in memory.
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Optional filters to narrow down the documents to return.
+Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `return_embedding`: Whether to return the document embeddings.
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+
+
+
+#### Milvus1DocumentStore.get\_all\_documents
+
+```python
+def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+```
+
+Get documents from the document store (optionally using filter criteria).
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Optional filters to narrow down the documents to return.
+Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `return_embedding`: Whether to return the document embeddings.
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+
+
+
+#### Milvus1DocumentStore.get\_document\_by\_id
+
+```python
+def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document]
+```
+
+Fetch a document by specifying its text id string
+
+**Arguments**:
+
+- `id`: ID of the document
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+
+
+
+#### Milvus1DocumentStore.get\_documents\_by\_id
+
+```python
+def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+```
+
+Fetch multiple documents by specifying their IDs (strings)
+
+**Arguments**:
+
+- `ids`: List of IDs of the documents
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `batch_size`: is currently not used
+
+
+
+#### Milvus1DocumentStore.get\_all\_vectors
+
+```python
+def get_all_vectors(index: Optional[str] = None) -> List[np.ndarray]
+```
+
+Helper function to dump all vectors stored in Milvus server.
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+
+**Returns**:
+
+List[np.array]: List of vectors.
+
+
+
+#### Milvus1DocumentStore.get\_embedding\_count
+
+```python
+def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None) -> int
+```
+
+Return the count of embeddings in the document store.
+
+
+
+# Module milvus2
+
+
+
+## Milvus2DocumentStore
+
+```python
+class Milvus2DocumentStore(SQLDocumentStore)
+```
+
+Limitations:
+Milvus 2.0 so far doesn't support the deletion of documents (https://github.com/milvus-io/milvus/issues/7130).
+Therefore, delete_documents() and update_embeddings() won't work yet.
+
+Differences to 1.x:
+Besides big architectural changes that impact performance and reliability 2.0 supports the filtering by scalar data types.
+For Haystack users this means you can now run a query using vector similarity and filter for some meta data at the same time!
+(See https://milvus.io/docs/v2.0.x/comparison.md for more details)
+
+Usage:
+1. Start a Milvus service via docker (see https://milvus.io/docs/v2.0.x/install_standalone-docker.md)
+2. Run pip install farm-haystack[milvus]
+3. Init a MilvusDocumentStore() in Haystack
+
+Overview:
+Milvus (https://milvus.io/) is a highly reliable, scalable Document Store specialized on storing and processing vectors.
+Therefore, it is particularly suited for Haystack users that work with dense retrieval methods (like DPR).
+
+In contrast to FAISS, Milvus ...
+ - runs as a separate service (e.g. a Docker container) and can scale easily in a distributed environment
+ - allows dynamic data management (i.e. you can insert/delete vectors without recreating the whole index)
+ - encapsulates multiple ANN libraries (FAISS, ANNOY ...)
+
+This class uses Milvus for all vector related storage, processing and querying.
+The meta-data (e.g. for filtering) and the document text are however stored in a separate SQL Database as Milvus
+does not allow these data types (yet).
+
+
+
+#### Milvus2DocumentStore.\_\_init\_\_
+
+```python
+def __init__(sql_url: str = "sqlite:///", host: str = "localhost", port: str = "19530", connection_pool: str = "SingletonThread", index: str = "document", vector_dim: int = None, embedding_dim: int = 768, index_file_size: int = 1024, similarity: str = "dot_product", index_type: str = "IVF_FLAT", index_param: Optional[Dict[str, Any]] = None, search_param: Optional[Dict[str, Any]] = None, return_embedding: bool = False, embedding_field: str = "embedding", id_field: str = "id", custom_fields: Optional[List[Any]] = None, progress_bar: bool = True, duplicate_documents: str = "overwrite", isolation_level: str = None, consistency_level: int = 0, recreate_index: bool = False)
+```
+
+**Arguments**:
+
+- `sql_url`: SQL connection URL for storing document texts and metadata. It defaults to a local, file based SQLite DB. For large scale
+deployment, Postgres is recommended. If using MySQL then same server can also be used for
+Milvus metadata. For more details see https://milvus.io/docs/v1.1.0/data_manage.md.
+- `host`: Host address on which the Milvus server for storing and processing vectors is running. Default: "localhost".
+- `port`: Port that the Milvus server is listening on. Default: "19530".
+See https://milvus.io/docs/v2.0.x/install_standalone-docker.md for instructions to start a Milvus instance.
+- `connection_pool`: Connection pool type to connect with Milvus server. Default: "SingletonThread".
+- `index`: Index name for text, embedding and metadata (in Milvus terms, this is the "collection name").
+- `vector_dim`: Deprecated. Use embedding_dim instead.
+- `embedding_dim`: The embedding vector size. Default: 768.
+- `index_file_size`: Specifies the size of each segment file that is stored by Milvus and its default value is 1024 MB.
+When the size of newly inserted vectors reaches the specified volume, Milvus packs these vectors into a new segment.
+Milvus creates one index file for each segment. When conducting a vector search, Milvus searches all index files one by one.
+As a rule of thumb, we would see a 30% ~ 50% increase in the search performance after changing the value of index_file_size from 1024 to 2048.
+Note that an overly large index_file_size value may cause failure to load a segment into the memory or graphics memory.
+(From https://milvus.io/docs/v2.0.x/performance_faq.md)
+- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default and recommended for DPR embeddings.
+'cosine' is recommended for Sentence Transformers, but is not directly supported by Milvus.
+However, you can normalize your embeddings and use `dot_product` to get the same results.
+See https://milvus.io/docs/v2.0.x/metric.md.
+- `index_type`: Type of approximate nearest neighbour (ANN) index used. The choice here determines your tradeoff between speed and accuracy.
+Some popular options:
+- FLAT: Exact method, slow
+- IVF_FLAT (default): Inverted file based heuristic, fast
+- HNSW: Graph based, fast
+- ANNOY: Tree based, fast
+See: https://milvus.io/docs/v2.0.x/index.md
+- `index_param`: Configuration parameters for the chosen index_type needed at indexing time.
+For example: {"nlist": 16384} as the number of cluster units to create for index_type IVF_FLAT.
+See https://milvus.io/docs/v2.0.x/index.md
+- `search_param`: Configuration parameters for the chosen index_type needed at query time.
+For example: {"nprobe": 10} as the number of cluster units to query for index_type IVF_FLAT.
+See https://milvus.io/docs/v2.0.x/index.md
+- `return_embedding`: To return document embedding.
+- `embedding_field`: Name of field containing an embedding vector.
+- `progress_bar`: Whether to show a tqdm progress bar or not.
+Can be helpful to disable in production deployments to keep the logs clean.
+- `duplicate_documents`: Handle duplicates document based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicates documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level)
+- `recreate_index`: If set to True, an existing Milvus index will be deleted and a new one will be
+created using the config you are using for initialization. Be aware that all data in the old index will be
+lost if you choose to recreate the index. Be aware that both the document_index and the label_index will
+be recreated.
+
+
+
+#### Milvus2DocumentStore.write\_documents
+
+```python
+def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None, index_param: Optional[Dict[str, Any]] = None)
+```
+
+Add new documents to the DocumentStore.
+
+**Arguments**:
+
+- `documents`: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index
+them right away in Milvus. If not, you can later call `update_embeddings()` to create & index them.
+- `index`: (SQL) index name for storing the docs and metadata
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `duplicate_documents`: Handle duplicates document based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicates documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+
+**Raises**:
+
+- `DuplicateDocumentError`: Exception trigger on duplicate document
+
+
+
+#### Milvus2DocumentStore.update\_embeddings
+
+```python
+def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, batch_size: int = 10_000, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Any]] = None)
+```
+
+Updates the embeddings in the document store using the encoding model specified in the retriever.
+
+This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).
+
+**Arguments**:
+
+- `retriever`: Retriever to use to get embeddings for text
+- `index`: (SQL) index name for storing the docs and metadata
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to False,
+only documents without embeddings are processed. This mode can be used for
+incremental updating of embeddings, wherein, only newly indexed documents
+get processed.
+- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated.
+Example: {"name": ["some", "more"], "category": ["only_one"]}
+
+**Returns**:
+
+None
+
+
+
+#### Milvus2DocumentStore.query\_by\_embedding
+
+```python
+def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+```
+
+Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
+
+**Arguments**:
+
+- `query_emb`: Embedding of the query (e.g. gathered from DPR)
+- `filters`: Optional filters to narrow down the search space.
+Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `top_k`: How many documents to return
+- `index`: (SQL) index name for storing the docs and metadata
+- `return_embedding`: To return document embedding
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### Milvus2DocumentStore.delete\_documents
+
+```python
+def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000)
+```
+
+Delete documents (from SQL AND Milvus). All documents are deleted if no `ids` or `filters` are passed.
+
+**Arguments**:
+
+- `index`: (SQL) index name for storing the docs and metadata
+- `filters`: Optional filters to narrow down the search space.
+Example: {"name": ["some", "more"], "category": ["only_one"]}
+
+**Returns**:
+
+None
+
+
+
+#### Milvus2DocumentStore.delete\_index
+
+```python
+def delete_index(index: str)
+```
+
+Delete an existing index. The index including all data will be removed.
+
+**Arguments**:
+
+- `index`: The name of the index to delete.
+
+**Returns**:
+
+None
+
+
+
+#### Milvus2DocumentStore.get\_all\_documents\_generator
+
+```python
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+```
+
+Get all documents from the document store. Under-the-hood, documents are fetched in batches from the
+
+document store and yielded as individual documents. This method can be used to iteratively process
+a large number of documents without having to load all documents in memory.
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Optional filters to narrow down the documents to return.
+Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `return_embedding`: Whether to return the document embeddings.
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+
+
+
+#### Milvus2DocumentStore.get\_all\_documents
+
+```python
+def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+```
+
+Get documents from the document store (optionally using filter criteria).
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Optional filters to narrow down the documents to return.
+Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `return_embedding`: Whether to return the document embeddings.
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+
+
+
+#### Milvus2DocumentStore.get\_document\_by\_id
+
+```python
+def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document]
+```
+
+Fetch a document by specifying its text id string
+
+**Arguments**:
+
+- `id`: ID of the document
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+
+
+
+#### Milvus2DocumentStore.get\_documents\_by\_id
+
+```python
+def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+```
+
+Fetch multiple documents by specifying their IDs (strings)
+
+**Arguments**:
+
+- `ids`: List of IDs of the documents
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+
+
+
+#### Milvus2DocumentStore.get\_embedding\_count
+
+```python
+def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int
+```
+
+Return the count of embeddings in the document store.
+
+
+
+# Module weaviate
+
+
+
+## WeaviateDocumentStore
+
+```python
+class WeaviateDocumentStore(BaseDocumentStore)
+```
+
+Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models.
+(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate)
+
+Some of the key differences in contrast to FAISS & Milvus:
+1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up
+2. Allows combination of vector search and scalar filtering, i.e. you can filter for a certain tag and do dense retrieval on that subset
+3. Has less variety of ANN algorithms, as of now only HNSW.
+4. Requires document ids to be in uuid-format. If wrongly formatted ids are provided at indexing time they will be replaced with uuids automatically.
+5. Only supports cosine similarity.
+
+Weaviate python client is used to connect to the server, more details are here
+https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html
+
+Usage:
+1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html)
+2. Init a WeaviateDocumentStore in Haystack
+
+Limitations:
+The current implementation is not supporting the storage of labels, so you cannot run any evaluation workflows.
+
+
+
+#### WeaviateDocumentStore.\_\_init\_\_
+
+```python
+def __init__(host: Union[str, List[str]] = "http://localhost", port: Union[int, List[int]] = 8080, timeout_config: tuple = (5, 15), username: str = None, password: str = None, index: str = "Document", embedding_dim: int = 768, content_field: str = "content", name_field: str = "name", similarity: str = "cosine", index_type: str = "hnsw", custom_schema: Optional[dict] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", recreate_index: bool = False)
+```
+
+**Arguments**:
+
+- `host`: Weaviate server connection URL for storing and processing documents and vectors.
+For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html"
+- `port`: port of Weaviate instance
+- `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds).
+- `username`: username (standard authentication via http_auth)
+- `password`: password (standard authentication via http_auth)
+- `index`: Index name for document text, embedding and metadata (in Weaviate terminology, this is a "Class" in Weaviate schema).
+- `embedding_dim`: The embedding vector size. Default: 768.
+- `content_field`: Name of field that might contain the answer and will therefore be passed to the Reader Model (e.g. "full_text").
+If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will just be returned.
+- `name_field`: Name of field that contains the title of the doc
+- `similarity`: The similarity function used to compare document vectors. 'cosine' is the only currently supported option and default.
+'cosine' is recommended for Sentence Transformers.
+- `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable.
+Currently, only HNSW is supported.
+See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html
+- `custom_schema`: Allows to create custom schema in Weaviate, for more details
+See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html
+- `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-transformers".
+For more details, See https://weaviate.io/developers/weaviate/current/modules/
+- `return_embedding`: To return document embedding.
+- `embedding_field`: Name of field containing an embedding vector.
+- `progress_bar`: Whether to show a tqdm progress bar or not.
+Can be helpful to disable in production deployments to keep the logs clean.
+- `duplicate_documents`: Handle duplicates document based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicates documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already exists.
+- `recreate_index`: If set to True, an existing Weaviate index will be deleted and a new one will be
+created using the config you are using for initialization. Be aware that all data in the old index will be
+lost if you choose to recreate the index.
+
+
+
+#### WeaviateDocumentStore.get\_document\_by\_id
+
+```python
+def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document]
+```
+
+Fetch a document by specifying its uuid string
+
+
+
+#### WeaviateDocumentStore.get\_documents\_by\_id
+
+```python
+def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+```
+
+Fetch documents by specifying a list of uuid strings.
+
+
+
+#### WeaviateDocumentStore.write\_documents
+
+```python
+def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Add new documents to the DocumentStore.
+
+**Arguments**:
+
+- `documents`: List of `Dicts` or List of `Documents`. A dummy embedding vector for each document is automatically generated if it is not provided. The document id needs to be in uuid format. Otherwise a correctly formatted uuid will be automatically generated based on the provided id.
+- `index`: index name for storing the docs and metadata
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `duplicate_documents`: Handle duplicates document based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicates documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+
+**Raises**:
+
+- `DuplicateDocumentError`: Exception trigger on duplicate document
+
+**Returns**:
+
+None
+
+
+
+#### WeaviateDocumentStore.update\_document\_meta
+
+```python
+def update_document_meta(id: str, meta: Dict[str, Union[List, str, int, float, bool]], index: str = None)
+```
+
+Update the metadata dictionary of a document by specifying its string id.
+Overwrites only the specified fields, the unspecified ones remain unchanged.
+
+
+
+#### WeaviateDocumentStore.get\_embedding\_count
+
+```python
+def get_embedding_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None) -> int
+```
+
+Return the number of embeddings in the document store, which is the same as the number of documents since
+every document has a default embedding.
+
+
+
+#### WeaviateDocumentStore.get\_document\_count
+
+```python
+def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
+```
+
+Return the number of documents in the document store.
+
+
+
+#### WeaviateDocumentStore.get\_all\_documents
+
+```python
+def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+```
+
+Get documents from the document store.
+
+Note this limitation from the changelog of Weaviate 1.8.0:
+
+.. quote::
+ Due to the increasing cost of each page outlined above, there is a limit to
+ how many objects can be retrieved using pagination. By default setting the sum
+ of offset and limit to higher than 10,000 objects, will lead to an error.
+ If you must retrieve more than 10,000 objects, you can increase this limit by
+ setting the environment variable `QUERY_MAXIMUM_RESULTS=`.
+
+ Warning: Setting this to arbitrarily high values can make the memory consumption
+ of a single query explode and single queries can slow down the entire cluster.
+ We recommend setting this value to the lowest possible value that does not
+ interfere with your users' expectations.
+
+(https://github.com/semi-technologies/weaviate/releases/tag/v1.8.0)
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `return_embedding`: Whether to return the document embeddings.
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+
+
+
+#### WeaviateDocumentStore.get\_all\_documents\_generator
+
+```python
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+```
+
+Get documents from the document store. Under-the-hood, documents are fetched in batches from the
+
+document store and yielded as individual documents. This method can be used to iteratively process
+a large number of documents without having to load all documents in memory.
+
+Note this limitation from the changelog of Weaviate 1.8.0:
+
+.. quote::
+ Due to the increasing cost of each page outlined above, there is a limit to
+ how many objects can be retrieved using pagination. By default setting the sum
+ of offset and limit to higher than 10,000 objects, will lead to an error.
+ If you must retrieve more than 10,000 objects, you can increase this limit by
+ setting the environment variable `QUERY_MAXIMUM_RESULTS=`.
+
+ Warning: Setting this to arbitrarily high values can make the memory consumption
+ of a single query explode and single queries can slow down the entire cluster.
+ We recommend setting this value to the lowest possible value that does not
+ interfere with your users' expectations.
+
+(https://github.com/semi-technologies/weaviate/releases/tag/v1.8.0)
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `return_embedding`: Whether to return the document embeddings.
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+
+
+
+#### WeaviateDocumentStore.query
+
+```python
+def query(query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, scale_score: bool = True) -> List[Document]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the query as defined by Weaviate semantic search.
+
+**Arguments**:
+
+- `query`: The query
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return per query.
+- `custom_query`: Custom query that will be executed using the query.raw method; for more details refer to
+https://weaviate.io/developers/weaviate/current/graphql-references/filters.html
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### WeaviateDocumentStore.query\_by\_embedding
+
+```python
+def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+```
+
+Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
+
+**Arguments**:
+
+- `query_emb`: Embedding of the query (e.g. gathered from DPR)
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return
+- `index`: index name for storing the docs and metadata
+- `return_embedding`: To return document embedding
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### WeaviateDocumentStore.update\_embeddings
+
+```python
+def update_embeddings(retriever, index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000)
+```
+
+Updates the embeddings in the document store using the encoding model specified in the retriever.
+
+This can be useful if you want to change the embeddings for your documents (e.g. after changing the retriever config).
+
+**Arguments**:
+
+- `retriever`: Retriever to use to update the embeddings.
+- `index`: Index name to update
+- `update_existing_embeddings`: Weaviate mandates an embedding while creating the document itself.
+This option must always be true for Weaviate and it will update the embeddings for all the documents.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+
+**Returns**:
+
+None
+
+
+
+#### WeaviateDocumentStore.delete\_all\_documents
+
+```python
+def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Delete documents in an index. All documents are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the document from.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+
+**Returns**:
+
+None
+
+
+
+#### WeaviateDocumentStore.delete\_documents
+
+```python
+def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Delete documents in an index. All documents are deleted if no filters are passed.
+
+**Arguments**:
+
+- `index`: Index name to delete the document from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `ids`: Optional list of IDs to narrow down the documents to be deleted.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+ If filters are provided along with a list of IDs, this method deletes the
+ intersection of the two query results (documents that match the filters and
+ have their ID in the list).
+
+**Returns**:
+
+None
+
+
+
+#### WeaviateDocumentStore.delete\_index
+
+```python
+def delete_index(index: str)
+```
+
+Delete an existing index. The index including all data will be removed.
+
+**Arguments**:
+
+- `index`: The name of the index to delete.
+
+**Returns**:
+
+None
+
+
+
+#### WeaviateDocumentStore.delete\_labels
+
+```python
+def delete_labels()
+```
+
+Implemented to respect BaseDocumentStore's contract.
+
+Weaviate does not support labels (yet).
+
+
+
+#### WeaviateDocumentStore.get\_all\_labels
+
+```python
+def get_all_labels()
+```
+
+Implemented to respect BaseDocumentStore's contract.
+
+Weaviate does not support labels (yet).
+
+
+
+#### WeaviateDocumentStore.get\_label\_count
+
+```python
+def get_label_count()
+```
+
+Implemented to respect BaseDocumentStore's contract.
+
+Weaviate does not support labels (yet).
+
+
+
+#### WeaviateDocumentStore.write\_labels
+
+```python
+def write_labels()
+```
+
+Implemented to respect BaseDocumentStore's contract.
+
+Weaviate does not support labels (yet).
+
+
+
+# Module graphdb
+
+
+
+## GraphDBKnowledgeGraph
+
+```python
+class GraphDBKnowledgeGraph(BaseKnowledgeGraph)
+```
+
+Knowledge graph store that runs on a GraphDB instance.
+
+
+
+#### GraphDBKnowledgeGraph.\_\_init\_\_
+
+```python
+def __init__(host: str = "localhost", port: int = 7200, username: str = "", password: str = "", index: Optional[str] = None, prefixes: str = "")
+```
+
+Init the knowledge graph by defining the settings to connect with a GraphDB instance
+
+**Arguments**:
+
+- `host`: address of server where the GraphDB instance is running
+- `port`: port where the GraphDB instance is running
+- `username`: username to login to the GraphDB instance (if any)
+- `password`: password to login to the GraphDB instance (if any)
+- `index`: name of the index (also called repository) stored in the GraphDB instance
+- `prefixes`: definitions of namespaces with a new line after each namespace, e.g., PREFIX hp: `<https://deepset.ai/harry_potter/>`
+
+
+
+#### GraphDBKnowledgeGraph.create\_index
+
+```python
+def create_index(config_path: Path, headers: Optional[Dict[str, str]] = None)
+```
+
+Create a new index (also called repository) stored in the GraphDB instance
+
+**Arguments**:
+
+- `config_path`: path to a .ttl file with configuration settings. For details, see
+https://graphdb.ontotext.com/documentation/free/configuring-a-repository.html#configure-a-repository-programmatically
+- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+
+
+
+#### GraphDBKnowledgeGraph.delete\_index
+
+```python
+def delete_index(headers: Optional[Dict[str, str]] = None)
+```
+
+Delete the index that GraphDBKnowledgeGraph is connected to. This method deletes all data stored in the index.
+
+**Arguments**:
+
+- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+
+
+
+#### GraphDBKnowledgeGraph.import\_from\_ttl\_file
+
+```python
+def import_from_ttl_file(index: str, path: Path, headers: Optional[Dict[str, str]] = None)
+```
+
+Load an existing knowledge graph represented in the form of triples of subject, predicate, and object from a .ttl file into an index of GraphDB
+
+**Arguments**:
+
+- `index`: name of the index (also called repository) in the GraphDB instance where the imported triples shall be stored
+- `path`: path to a .ttl containing a knowledge graph
+- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+
+
+
+#### GraphDBKnowledgeGraph.get\_all\_triples
+
+```python
+def get_all_triples(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Query the given index in the GraphDB instance for all its stored triples. Duplicates are not filtered.
+
+**Arguments**:
+
+- `index`: name of the index (also called repository) in the GraphDB instance
+- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+
+**Returns**:
+
+all triples stored in the index
+
+
+
+#### GraphDBKnowledgeGraph.get\_all\_subjects
+
+```python
+def get_all_subjects(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Query the given index in the GraphDB instance for all its stored subjects. Duplicates are not filtered.
+
+**Arguments**:
+
+- `index`: name of the index (also called repository) in the GraphDB instance
+- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+
+**Returns**:
+
+all subjects stored in the index
+
+
+
+#### GraphDBKnowledgeGraph.get\_all\_predicates
+
+```python
+def get_all_predicates(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Query the given index in the GraphDB instance for all its stored predicates. Duplicates are not filtered.
+
+**Arguments**:
+
+- `index`: name of the index (also called repository) in the GraphDB instance
+- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+
+**Returns**:
+
+all predicates stored in the index
+
+
+
+#### GraphDBKnowledgeGraph.get\_all\_objects
+
+```python
+def get_all_objects(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Query the given index in the GraphDB instance for all its stored objects. Duplicates are not filtered.
+
+**Arguments**:
+
+- `index`: name of the index (also called repository) in the GraphDB instance
+- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+
+**Returns**:
+
+all objects stored in the index
+
+
+
+#### GraphDBKnowledgeGraph.query
+
+```python
+def query(sparql_query: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Execute a SPARQL query on the given index in the GraphDB instance
+
+**Arguments**:
+
+- `sparql_query`: SPARQL query that shall be executed
+- `index`: name of the index (also called repository) in the GraphDB instance
+- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+
+**Returns**:
+
+query result
+
+
+
+# Module deepsetcloud
+
+
+
+#### disable\_and\_log
+
+```python
+def disable_and_log(func)
+```
+
+Decorator to disable a write operation, showing a warning and the passed inputs instead.
+
+
+
+## DeepsetCloudDocumentStore
+
+```python
+class DeepsetCloudDocumentStore(KeywordDocumentStore)
+```
+
+
+
+#### DeepsetCloudDocumentStore.\_\_init\_\_
+
+```python
+def __init__(api_key: str = None, workspace: str = "default", index: Optional[str] = None, duplicate_documents: str = "overwrite", api_endpoint: Optional[str] = None, similarity: str = "dot_product", return_embedding: bool = False, label_index: str = "default")
+```
+
+A DocumentStore facade enabling you to interact with the documents stored in deepset Cloud.
+
+Thus you can run experiments like trying new nodes, pipelines, etc. without having to index your data again.
+
+You can also use this DocumentStore to create new pipelines on deepset Cloud. To do that, take the following
+steps:
+
+- create a new DeepsetCloudDocumentStore without an index (e.g. `DeepsetCloudDocumentStore()`)
+- create query and indexing pipelines using this DocumentStore
+- call `Pipeline.save_to_deepset_cloud()` passing the pipelines and a `pipeline_config_name`
+- call `Pipeline.deploy_on_deepset_cloud()` passing the `pipeline_config_name`
+
+DeepsetCloudDocumentStore is not intended for use in production-like scenarios.
+See [https://haystack.deepset.ai/components/v1.5.0/document-store](https://haystack.deepset.ai/components/v1.5.0/document-store)
+for more information.
+
+**Arguments**:
+
+- `api_key`: Secret value of the API key.
+If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable.
+See docs on how to generate an API key for your workspace: https://docs.cloud.deepset.ai/docs/connect-deepset-cloud-to-your-application
+- `workspace`: workspace name in deepset Cloud
+- `index`: name of the index to access within the deepset Cloud workspace. This equals typically the name of
+your pipeline. You can run Pipeline.list_pipelines_on_deepset_cloud() to see all available ones.
+If you set index to `None`, this DocumentStore will always return empty results.
+This is especially useful if you want to create a new Pipeline within deepset Cloud
+(see `Pipeline.save_to_deepset_cloud()` and `Pipeline.deploy_on_deepset_cloud()`).
+- `duplicate_documents`: Handle duplicate documents based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicate documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+- `api_endpoint`: The URL of the deepset Cloud API.
+If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable.
+If DEEPSET_CLOUD_API_ENDPOINT environment variable is not specified either, defaults to "https://api.cloud.deepset.ai/api/v1".
+- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is
+more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence Transformer model.
+- `label_index`: index for the evaluation set interface
+- `return_embedding`: To return document embedding.
+
+
+
+#### DeepsetCloudDocumentStore.get\_all\_documents
+
+```python
+def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+```
+
+Get documents from the document store.
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `return_embedding`: Whether to return the document embeddings.
+- `batch_size`: Number of documents that are passed to bulk function at a time.
+- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
+
+
+
+#### DeepsetCloudDocumentStore.get\_all\_documents\_generator
+
+```python
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+```
+
+Get documents from the document store. Under-the-hood, documents are fetched in batches from the
+
+document store and yielded as individual documents. This method can be used to iteratively process
+a large number of documents without having to load all documents in memory.
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `return_embedding`: Whether to return the document embeddings.
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
+
+
+
+#### DeepsetCloudDocumentStore.query\_by\_embedding
+
+```python
+def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+```
+
+Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
+
+**Arguments**:
+
+- `query_emb`: Embedding of the query (e.g. gathered from DPR)
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return
+- `index`: Index name for storing the docs and metadata
+- `return_embedding`: To return document embedding
+- `headers`: Custom HTTP headers to pass to requests
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### DeepsetCloudDocumentStore.query
+
+```python
+def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False, scale_score: bool = True) -> List[Document]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the query as defined by the BM25 algorithm.
+
+**Arguments**:
+
+- `query`: The query
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return per query.
+- `custom_query`: Custom query to be executed.
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `headers`: Custom HTTP headers to pass to requests
+- `all_terms_must_match`: Whether all terms of the query must match the document.
+If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
+Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
+Defaults to False.
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### DeepsetCloudDocumentStore.write\_documents
+
+```python
+@disable_and_log
+def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Indexes documents for later queries.
+
+**Arguments**:
+
+- `documents`: a list of Python dictionaries or a list of Haystack Document objects.
+For documents as dictionaries, the format is {"text": "`<the-actual-text>`"}.
+Optionally: Include meta data via {"text": "`<the-actual-text>`",
+"meta":{"name": "`<some-document-name>`", "author": "somebody", ...}}
+It can be used for filtering and is accessible in the responses of the Finder.
+- `index`: Optional name of index where the documents shall be written to.
+If None, the DocumentStore's default index (self.index) will be used.
+- `batch_size`: Number of documents that are passed to bulk function at a time.
+- `duplicate_documents`: Handle duplicate documents based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicate documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
+
+**Returns**:
+
+None
+
+
+
+#### DeepsetCloudDocumentStore.get\_evaluation\_sets
+
+```python
+def get_evaluation_sets() -> List[dict]
+```
+
+Returns a list of uploaded evaluation sets to deepset cloud.
+
+**Returns**:
+
+list of evaluation sets as dicts
+These contain ("name", "evaluation_set_id", "created_at", "matched_labels", "total_labels") as fields.
+
+
+
+#### DeepsetCloudDocumentStore.get\_all\_labels
+
+```python
+def get_all_labels(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) -> List[Label]
+```
+
+Returns a list of labels for the given index name.
+
+**Arguments**:
+
+- `index`: Optional name of evaluation set for which labels should be searched.
+If None, the DocumentStore's default label_index (self.label_index) will be used.
+- `headers`: Not supported.
+
+**Returns**:
+
+list of Labels.
+
+
+
+#### DeepsetCloudDocumentStore.get\_label\_count
+
+```python
+def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> int
+```
+
+Counts the number of labels for the given index and returns the value.
+
+**Arguments**:
+
+- `index`: Optional evaluation set name for which the labels should be counted.
+If None, the DocumentStore's default label_index (self.label_index) will be used.
+- `headers`: Not supported.
+
+**Returns**:
+
+number of labels for the given index
+
+
+
+# Module pinecone
+
+
+
+## PineconeDocumentStore
+
+```python
+class PineconeDocumentStore(SQLDocumentStore)
+```
+
+Document store for very large scale embedding based dense retrievers like the DPR. This is a hosted document store,
+this means that your vectors will not be stored locally but in the cloud. This means that the similarity
+search will be run on the cloud as well.
+
+It implements the Pinecone vector database ([https://www.pinecone.io](https://www.pinecone.io))
+to perform similarity search on vectors. In order to use this document store, you need an API key that you can
+obtain by creating an account on the [Pinecone website](https://www.pinecone.io).
+
+The document text is stored using the SQLDocumentStore, while
+the vector embeddings and metadata (for filtering) are indexed in a Pinecone Index.
+
+
+
+#### PineconeDocumentStore.\_\_init\_\_
+
+```python
+def __init__(api_key: str, environment: str = "us-west1-gcp", sql_url: str = "sqlite:///pinecone_document_store.db", pinecone_index: Optional[pinecone.Index] = None, embedding_dim: int = 768, return_embedding: bool = False, index: str = "document", similarity: str = "cosine", replicas: int = 1, shards: int = 1, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", recreate_index: bool = False)
+```
+
+**Arguments**:
+
+- `api_key`: Pinecone vector database API key ([https://app.pinecone.io](https://app.pinecone.io)).
+- `environment`: Pinecone cloud environment uses `"us-west1-gcp"` by default. Other GCP and AWS regions are
+supported, contact Pinecone [here](https://www.pinecone.io/contact/) if required.
+- `sql_url`: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale
+deployment, Postgres is recommended.
+- `pinecone_index`: pinecone-client Index object, an index will be initialized or loaded if not specified.
+- `embedding_dim`: The embedding vector size.
+- `return_embedding`: Whether to return document embeddings.
+- `index`: Name of index in document store to use.
+- `similarity`: The similarity function used to compare document vectors. `"cosine"` is the default
+and is recommended if you are using a Sentence-Transformer model. `"dot_product"` is more performant
+with DPR embeddings.
+In both cases, the returned values in Document.score are normalized to be in range [0,1]:
+ - For `"dot_product"`: `expit(np.asarray(raw_score / 100))`
+ - For `"cosine"`: `(raw_score + 1) / 2`
+- `replicas`: The number of replicas. Replicas duplicate the index. They provide higher availability and
+throughput.
+- `shards`: The number of shards to be used in the index. We recommend to use 1 shard per 1GB of data.
+- `embedding_field`: Name of field containing an embedding vector.
+- `progress_bar`: Whether to show a tqdm progress bar or not.
+Can be helpful to disable in production deployments to keep the logs clean.
+- `duplicate_documents`: Handle duplicate documents based on parameter options.\
+Parameter options:
+ - `"skip"`: Ignore the duplicate documents.
+ - `"overwrite"`: Update any existing documents with the same ID when adding documents.
+ - `"fail"`: An error is raised if the document ID of the document being added already exists.
+- `recreate_index`: If set to True, an existing Pinecone index will be deleted and a new one will be
+created using the config you are using for initialization. Be aware that all data in the old index will be
+lost if you choose to recreate the index. Be aware that both the document_index and the label_index will
+be recreated.
+
+
+
+#### PineconeDocumentStore.write\_documents
+
+```python
+def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 32, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Add new documents to the DocumentStore.
+
+**Arguments**:
+
+- `documents`: List of `Dicts` or list of `Documents`. If they already contain embeddings, we'll index them
+right away in Pinecone. If not, you can later call `update_embeddings()` to create & index them.
+- `index`: Index name for storing the docs and metadata.
+- `batch_size`: Number of documents to process at a time. When working with large number of documents,
+batching can help to reduce the memory footprint.
+- `duplicate_documents`: handle duplicate documents based on parameter options.
+Parameter options:
+ - `"skip"`: Ignore the duplicate documents.
+ - `"overwrite"`: Update any existing documents with the same ID when adding documents.
+ - `"fail"`: An error is raised if the document ID of the document being added already exists.
+- `headers`: PineconeDocumentStore does not support headers.
+
+**Raises**:
+
+- `DuplicateDocumentError`: Exception trigger on duplicate document.
+
+
+
+#### PineconeDocumentStore.update\_embeddings
+
+```python
+def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, batch_size: int = 32)
+```
+
+Updates the embeddings in the document store using the encoding model specified in the retriever.
+
+This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the
+retriever config).
+
+**Arguments**:
+
+- `retriever`: Retriever to use to get embeddings for text.
+- `index`: Index name for which embeddings are to be updated. If set to `None`, the default `self.index` is
+used.
+- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to `False`,
+only documents without embeddings are processed. This mode can be used for incremental updating of
+embeddings, wherein, only newly indexed documents get processed.
+- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `batch_size`: Number of documents to process at a time. When working with large number of documents,
+batching can help reduce memory footprint.
+
+
+
+#### PineconeDocumentStore.get\_all\_documents\_generator
+
+```python
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+```
+
+Get all documents from the document store. Under-the-hood, documents are fetched in batches from the
+
+document store and yielded as individual documents. This method can be used to iteratively process
+a large number of documents without having to load all documents in memory.
+
+**Arguments**:
+
+- `index`: Name of the index to get the documents from. If None, the
+DocumentStore's default index (self.index) will be used.
+- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `return_embedding`: Whether to return the document embeddings.
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `headers`: PineconeDocumentStore does not support headers.
+
+
+
+#### PineconeDocumentStore.get\_embedding\_count
+
+```python
+def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None) -> int
+```
+
+Return the count of embeddings in the document store.
+
+
+
+#### PineconeDocumentStore.update\_document\_meta
+
+```python
+def update_document_meta(id: str, meta: Dict[str, str], index: str = None)
+```
+
+Update the metadata dictionary of a document by specifying its string id
+
+
+
+#### PineconeDocumentStore.delete\_documents
+
+```python
+def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Delete documents from the document store.
+
+**Arguments**:
+
+- `index`: Index name to delete the documents from. If `None`, the DocumentStore's default index
+(`self.index`) will be used.
+- `ids`: Optional list of IDs to narrow down the documents to be deleted.
+- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+- `headers`: PineconeDocumentStore does not support headers.
+
+
+
+#### PineconeDocumentStore.delete\_index
+
+```python
+def delete_index(index: str)
+```
+
+Delete an existing index. The index including all data will be removed.
+
+**Arguments**:
+
+- `index`: The name of the index to delete.
+
+**Returns**:
+
+None
+
+
+
+#### PineconeDocumentStore.query\_by\_embedding
+
+```python
+def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
+```
+
+Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
+
+**Arguments**:
+
+- `query_emb`: Embedding of the query (e.g. gathered from DPR).
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return.
+- `index`: The name of the index from which to retrieve documents.
+- `return_embedding`: Whether to return document embedding.
+- `headers`: PineconeDocumentStore does not support headers.
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### PineconeDocumentStore.load
+
+```python
+@classmethod
+def load(cls)
+```
+
+Default class method used for loading indexes. Not applicable to the PineconeDocumentStore.
+
+
+
+# Module utils
+
+
+
+#### eval\_data\_from\_json
+
+```python
+def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None, preprocessor: PreProcessor = None, open_domain: bool = False) -> Tuple[List[Document], List[Label]]
+```
+
+Read Documents + Labels from a SQuAD-style file.
+
+Document and Labels can then be indexed to the DocumentStore and be used for evaluation.
+
+**Arguments**:
+
+- `filename`: Path to file in SQuAD format
+- `max_docs`: This sets the number of documents that will be loaded. By default, this is set to None, thus reading in all available eval documents.
+- `open_domain`: Set this to True if your file is an open domain dataset where two different answers to the same question might be found in different contexts.
+
+
+
+#### eval\_data\_from\_jsonl
+
+```python
+def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None, max_docs: Union[int, bool] = None, preprocessor: PreProcessor = None, open_domain: bool = False) -> Generator[Tuple[List[Document], List[Label]], None, None]
+```
+
+Read Documents + Labels from a SQuAD-style file in jsonl format, i.e. one document per line.
+
+Document and Labels can then be indexed to the DocumentStore and be used for evaluation.
+
+This is a generator which will yield one tuple per iteration containing a list
+of batch_size documents and a list with the documents' labels.
+If batch_size is set to None, this method will yield all documents and labels.
+
+**Arguments**:
+
+- `filename`: Path to file in SQuAD format
+- `max_docs`: This sets the number of documents that will be loaded. By default, this is set to None, thus reading in all available eval documents.
+- `open_domain`: Set this to True if your file is an open domain dataset where two different answers to the same question might be found in different contexts.
+
+
+
+#### squad\_json\_to\_jsonl
+
+```python
+def squad_json_to_jsonl(squad_file: str, output_file: str)
+```
+
+Converts a SQuAD-json-file into jsonl format with one document per line.
+
+**Arguments**:
+
+- `squad_file`: SQuAD-file in json format.
+- `output_file`: Name of output file (SQuAD in jsonl format)
+
+
+
+#### convert\_date\_to\_rfc3339
+
+```python
+def convert_date_to_rfc3339(date: str) -> str
+```
+
+Converts a date to RFC3339 format, as Weaviate requires dates to be in RFC3339 format including the time and
+timezone.
+
+If the provided date string does not contain a time and/or timezone, we use 00:00 as default time
+and UTC as default time zone.
+
+This method cannot be part of WeaviateDocumentStore, as this would result in a circular import between weaviate.py
+and filter_utils.py.
+
diff --git a/docs/v1.5.0/_src/api/api/evaluation.md b/docs/v1.5.0/_src/api/api/evaluation.md
new file mode 100644
index 0000000000..d4cb736e3a
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/evaluation.md
@@ -0,0 +1,148 @@
+
+
+# Module evaluator
+
+
+
+## EvalDocuments
+
+```python
+class EvalDocuments(BaseComponent)
+```
+
+This is a pipeline node that should be placed after a node that returns a List of Document, e.g., Retriever or
+Ranker, in order to assess its performance. Performance metrics are stored in this class and updated as each
+sample passes through it. To view the results of the evaluation, call EvalDocuments.print(). Note that results
+from this Node may differ from that when calling Retriever.eval() since that is a closed domain evaluation. Have
+a look at our evaluation tutorial for more info about open vs closed domain eval (
+https://haystack.deepset.ai/tutorials/v1.5.0/evaluation).
+
+EvalDocuments node is deprecated and will be removed in a future version.
+Please use pipeline.eval() instead.
+
+
+
+#### EvalDocuments.\_\_init\_\_
+
+```python
+def __init__(debug: bool = False, open_domain: bool = True, top_k: int = 10)
+```
+
+**Arguments**:
+
+- `open_domain`: When True, a document is considered correctly retrieved so long as the answer string can be found within it.
+When False, correct retrieval is evaluated based on document_id.
+- `debug`: When True, a record of each sample and its evaluation will be stored in EvalDocuments.log
+- `top_k`: calculate eval metrics for top k results, e.g., recall@k
+
+
+
+#### EvalDocuments.run
+
+```python
+def run(documents: List[Document], labels: List[Label], top_k: Optional[int] = None)
+```
+
+Run this node on one sample and its labels
+
+
+
+#### EvalDocuments.print
+
+```python
+def print()
+```
+
+Print the evaluation results
+
+
+
+## EvalAnswers
+
+```python
+class EvalAnswers(BaseComponent)
+```
+
+This is a pipeline node that should be placed after a Reader in order to assess the performance of the Reader
+individually or to assess the extractive QA performance of the whole pipeline. Performance metrics are stored in
+this class and updated as each sample passes through it. To view the results of the evaluation, call EvalAnswers.print().
+Note that results from this Node may differ from that when calling Reader.eval()
+since that is a closed domain evaluation. Have a look at our evaluation tutorial for more info about
+open vs closed domain eval (https://haystack.deepset.ai/tutorials/v1.5.0/evaluation).
+
+EvalAnswers node is deprecated and will be removed in a future version.
+Please use pipeline.eval() instead.
+
+
+
+#### EvalAnswers.\_\_init\_\_
+
+```python
+def __init__(skip_incorrect_retrieval: bool = True, open_domain: bool = True, sas_model: str = None, debug: bool = False)
+```
+
+**Arguments**:
+
+- `skip_incorrect_retrieval`: When set to True, this eval will ignore the cases where the retriever returned no correct documents
+- `open_domain`: When True, extracted answers are evaluated purely on string similarity rather than the position of the extracted answer
+- `sas_model`: Name or path of "Semantic Answer Similarity (SAS) model". When set, the model will be used to calculate similarity between predictions and labels and generate the SAS metric.
+The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps.
+Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture.
+More info in the paper: https://arxiv.org/abs/2108.06130
+Models:
+- You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data.
+ Not all cross encoders can be used because of different return types.
+ If you use custom cross encoders please make sure they work with sentence_transformers.CrossEncoder class
+- Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
+- Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large"
+- Large model for German only: "deepset/gbert-large-sts"
+- `debug`: When True, a record of each sample and its evaluation will be stored in EvalAnswers.log
+
+
+
+#### EvalAnswers.run
+
+```python
+def run(labels: List[Label], answers: List[Answer], correct_retrieval: bool)
+```
+
+Run this node on one sample and its labels
+
+
+
+#### EvalAnswers.print
+
+```python
+def print(mode)
+```
+
+Print the evaluation results
+
+
+
+#### semantic\_answer\_similarity
+
+```python
+def semantic_answer_similarity(predictions: List[List[str]], gold_labels: List[List[str]], sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", batch_size: int = 32, use_gpu: bool = True) -> Tuple[List[float], List[float], List[List[float]]]
+```
+
+Computes Transformer-based similarity of predicted answer to gold labels to derive a more meaningful metric than EM or F1.
+
+Returns per QA pair a) the similarity of the most likely prediction (top 1) to all available gold labels
+ b) the highest similarity of all predictions to gold labels
+ c) a matrix consisting of the similarities of all the predictions compared to all gold labels
+
+**Arguments**:
+
+- `predictions`: Predicted answers as list of multiple preds per question
+- `gold_labels`: Labels as list of multiple possible answers per question
+- `sas_model_name_or_path`: SentenceTransformers semantic textual similarity model, should be path or string
+pointing to downloadable models.
+- `batch_size`: Number of prediction label pairs to encode at once.
+- `use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity.
+Falls back to CPU if no GPU is available.
+
+**Returns**:
+
+top_1_sas, top_k_sas, pred_label_matrix
+
diff --git a/docs/v1.5.0/_src/api/api/extractor.md b/docs/v1.5.0/_src/api/api/extractor.md
new file mode 100644
index 0000000000..40f4652ce2
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/extractor.md
@@ -0,0 +1,73 @@
+
+
+# Module entity
+
+
+
+## EntityExtractor
+
+```python
+class EntityExtractor(BaseComponent)
+```
+
+This node is used to extract entities out of documents.
+The most common use case for this would be as a named entity extractor.
+The default model used is dslim/bert-base-NER.
+This node can be placed in a querying pipeline to perform entity extraction on retrieved documents only,
+or it can be placed in an indexing pipeline so that all documents in the document store have extracted entities.
+The entities extracted by this Node will populate Document.entities
+
+
+
+#### EntityExtractor.run
+
+```python
+def run(documents: Optional[Union[List[Document], List[dict]]] = None) -> Tuple[Dict, str]
+```
+
+This is the method called when this node is used in a pipeline
+
+
+
+#### EntityExtractor.extract
+
+```python
+def extract(text)
+```
+
+This function can be called to perform entity extraction when using the node in isolation.
+
+
+
+#### EntityExtractor.extract\_batch
+
+```python
+def extract_batch(texts: Union[List[str], List[List[str]]], batch_size: Optional[int] = None)
+```
+
+This function allows extracting entities out of a list of strings or a list of lists of strings.
+
+**Arguments**:
+
+- `texts`: List of str or list of lists of str to extract entities from.
+- `batch_size`: Number of texts to make predictions on at a time.
+
+
+
+#### simplify\_ner\_for\_qa
+
+```python
+def simplify_ner_for_qa(output)
+```
+
+Returns a simplified version of the output dictionary
+with the following structure:
+[
+ {
+ answer: { ... }
+ entities: [ { ... }, {} ]
+ }
+]
+The entities included are only the ones that overlap with
+the answer itself.
+
diff --git a/docs/v1.5.0/_src/api/api/file_classifier.md b/docs/v1.5.0/_src/api/api/file_classifier.md
new file mode 100644
index 0000000000..32fbdf28dd
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/file_classifier.md
@@ -0,0 +1,47 @@
+
+
+# Module file\_type
+
+
+
+## FileTypeClassifier
+
+```python
+class FileTypeClassifier(BaseComponent)
+```
+
+Route files in an Indexing Pipeline to corresponding file converters.
+
+
+
+#### FileTypeClassifier.\_\_init\_\_
+
+```python
+def __init__(supported_types: List[str] = DEFAULT_TYPES)
+```
+
+Node that sends out files on a different output edge depending on their extension.
+
+**Arguments**:
+
+- `supported_types`: the file types that this node can distinguish.
+Note that it's limited to a maximum of 10 outgoing edges, which
+each correspond to a file extension. Such extensions are, by default
+`txt`, `pdf`, `md`, `docx`, `html`. Lists containing more than 10
+elements will not be allowed. Lists with duplicate elements will
+also be rejected.
+
+
+
+#### FileTypeClassifier.run
+
+```python
+def run(file_paths: Union[Path, List[Path], str, List[str], List[Union[Path, str]]])
+```
+
+Sends out files on a different output edge depending on their extension.
+
+**Arguments**:
+
+- `file_paths`: paths to route on different edges.
+
diff --git a/docs/v1.5.0/_src/api/api/file_converter.md b/docs/v1.5.0/_src/api/api/file_converter.md
new file mode 100644
index 0000000000..f6ef56fc5b
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/file_converter.md
@@ -0,0 +1,533 @@
+
+
+# Module base
+
+
+
+## BaseConverter
+
+```python
+class BaseConverter(BaseComponent)
+```
+
+Base class for implementing file converts to transform input documents to text format for ingestion in DocumentStore.
+
+
+
+#### BaseConverter.\_\_init\_\_
+
+```python
+def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None)
+```
+
+**Arguments**:
+
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could be possible candidates for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add a test for encoding errors. If the extracted text is
+not one of the valid languages, then it might likely be an encoding error resulting
+in garbled text.
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+
+
+#### BaseConverter.convert
+
+```python
+@abstractmethod
+def convert(file_path: Path, meta: Optional[Dict[str, str]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+Convert a file to a dictionary containing the text and any associated meta data.
+
+File converters may extract file meta like name or size. In addition to it, user
+supplied meta data like author, url, external IDs can be supplied as a dictionary.
+
+**Arguments**:
+
+- `file_path`: path of the file to convert
+- `meta`: dictionary of meta data key-value pairs to append in the returned document.
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could be possible candidates for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add a test for encoding errors. If the extracted text is
+not one of the valid languages, then it might likely be an encoding error resulting
+in garbled text.
+- `encoding`: Select the file encoding (default is `UTF-8`)
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+
+
+#### BaseConverter.validate\_language
+
+```python
+def validate_language(text: str, valid_languages: Optional[List[str]] = None) -> bool
+```
+
+Validate if the language of the text is one of valid languages.
+
+
+
+#### BaseConverter.run
+
+```python
+def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8")
+```
+
+Extract text from a file.
+
+**Arguments**:
+
+- `file_paths`: Path to the files you want to convert
+- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents.
+Can be any custom keys and values.
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could be possible candidates for searching answers.
+The rows containing strings are thus retained in this option.
+- `known_ligatures`: Some converters tend to recognize clusters of letters as ligatures, such as "ff" (double f).
+Such ligatures however make text hard to compare with the content of other files,
+which are generally ligature free. Therefore we automatically find and replace the most
+common ligatures with their split counterparts. The default mapping is in
+`haystack.nodes.file_converter.base.KNOWN_LIGATURES`: it is rather biased towards Latin alphabets
+but excludes all ligatures that are known to be used in IPA.
+You can use this parameter to provide your own set of ligatures to clean up from the documents.
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add a test for encoding errors. If the extracted text is
+not one of the valid languages, then it might likely be an encoding error resulting
+in garbled text.
+- `encoding`: Select the file encoding (default is `UTF-8`)
+
+
+
+# Module docx
+
+
+
+## DocxToTextConverter
+
+```python
+class DocxToTextConverter(BaseConverter)
+```
+
+
+
+#### DocxToTextConverter.convert
+
+```python
+def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+Extract text from a .docx file.
+
+Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
+For compliance with other converters we nevertheless opted for keeping the method's name.
+
+**Arguments**:
+
+- `file_path`: Path to the .docx file you want to convert
+- `meta`: dictionary of meta data key-value pairs to append in the returned document.
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could be possible candidates for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add a test for encoding errors. If the extracted text is
+not one of the valid languages, then it might likely be an encoding error resulting
+in garbled text.
+- `encoding`: Not applicable
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+
+
+# Module image
+
+
+
+## ImageToTextConverter
+
+```python
+class ImageToTextConverter(BaseConverter)
+```
+
+
+
+#### ImageToTextConverter.\_\_init\_\_
+
+```python
+def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"], id_hash_keys: Optional[List[str]] = None)
+```
+
+**Arguments**:
+
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could be possible candidates for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages specified here
+(https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html)
+This option can be used to add a test for encoding errors. If the extracted text is
+not one of the valid languages, then it might likely be an encoding error resulting
+in garbled text. Run the following line of code to check available language packs:
+# List of available languages
+print(pytesseract.get_languages(config=''))
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+
+
+#### ImageToTextConverter.convert
+
+```python
+def convert(file_path: Union[Path, str], meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)
+
+**Arguments**:
+
+- `file_path`: path to image file
+- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents.
+Can be any custom keys and values.
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could be possible candidates for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages supported by tesseract
+(https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
+This option can be used to add a test for encoding errors. If the extracted text is
+not one of the valid languages, then it might likely be an encoding error resulting
+in garbled text.
+- `encoding`: Not applicable
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+
+
+# Module markdown
+
+
+
+## MarkdownConverter
+
+```python
+class MarkdownConverter(BaseConverter)
+```
+
+
+
+#### MarkdownConverter.convert
+
+```python
+def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+Reads text from a txt file and executes optional preprocessing steps.
+
+**Arguments**:
+
+- `file_path`: path of the file to convert
+- `meta`: dictionary of meta data key-value pairs to append in the returned document.
+- `encoding`: Select the file encoding (default is `utf-8`)
+- `remove_numeric_tables`: Not applicable
+- `valid_languages`: Not applicable
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+
+
+#### MarkdownConverter.markdown\_to\_text
+
+```python
+@staticmethod
+def markdown_to_text(markdown_string: str) -> str
+```
+
+Converts a markdown string to plaintext
+
+**Arguments**:
+
+- `markdown_string`: String in markdown format
+
+
+
+# Module pdf
+
+
+
+## PDFToTextConverter
+
+```python
+class PDFToTextConverter(BaseConverter)
+```
+
+
+
+#### PDFToTextConverter.\_\_init\_\_
+
+```python
+def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8")
+```
+
+**Arguments**:
+
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could be possible candidates for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add tests for encoding errors. If the extracted text is
+not one of the valid languages, then it is likely an encoding error resulting
+in garbled text.
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+- `encoding`: Encoding that will be passed as `-enc` parameter to `pdftotext`.
+Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
+(See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal)
+
+
+
+#### PDFToTextConverter.convert
+
+```python
+def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
+
+**Arguments**:
+
+- `file_path`: Path to the .pdf file you want to convert
+- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents.
+Can be any custom keys and values.
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could be possible candidates for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add tests for encoding errors. If the extracted text is
+not one of the valid languages, then it is likely an encoding error resulting
+in garbled text.
+- `encoding`: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
+(See list of available encodings by running `pdftotext -listenc` in the terminal)
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+
+
+## PDFToTextOCRConverter
+
+```python
+class PDFToTextOCRConverter(BaseConverter)
+```
+
+
+
+#### PDFToTextOCRConverter.\_\_init\_\_
+
+```python
+def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"], id_hash_keys: Optional[List[str]] = None)
+```
+
+Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)
+
+**Arguments**:
+
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could be possible candidates for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages supported by tesseract
+(https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
+This option can be used to add tests for encoding errors. If the extracted text is
+not one of the valid languages, then it is likely an encoding error resulting
+in garbled text.
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+
+
+#### PDFToTextOCRConverter.convert
+
+```python
+def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+Convert a file to a dictionary containing the text and any associated meta data.
+
+File converters may extract file meta like name or size. In addition to it, user
+supplied meta data like author, url, external IDs can be supplied as a dictionary.
+
+**Arguments**:
+
+- `file_path`: path of the file to convert
+- `meta`: dictionary of meta data key-value pairs to append in the returned document.
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could be possible candidates for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add tests for encoding errors. If the extracted text is
+not one of the valid languages, then it is likely an encoding error resulting
+in garbled text.
+- `encoding`: Not applicable
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+
+
+# Module tika
+
+
+
+## TikaConverter
+
+```python
+class TikaConverter(BaseConverter)
+```
+
+
+
+#### TikaConverter.\_\_init\_\_
+
+```python
+def __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None)
+```
+
+**Arguments**:
+
+- `tika_url`: URL of the Tika server
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could be possible candidates for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add tests for encoding errors. If the extracted text is
+not one of the valid languages, then it is likely an encoding error resulting
+in garbled text.
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+
+
+#### TikaConverter.convert
+
+```python
+def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+**Arguments**:
+
+- `file_path`: path of the file to convert
+- `meta`: dictionary of meta data key-value pairs to append in the returned document.
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could be possible candidates for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add tests for encoding errors. If the extracted text is
+not one of the valid languages, then it is likely an encoding error resulting
+in garbled text.
+- `encoding`: Not applicable
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+**Returns**:
+
+A list of pages and the extracted meta data of the file.
+
+
+
+# Module txt
+
+
+
+## TextConverter
+
+```python
+class TextConverter(BaseConverter)
+```
+
+
+
+#### TextConverter.convert
+
+```python
+def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+Reads text from a txt file and executes optional preprocessing steps.
+
+**Arguments**:
+
+- `file_path`: path of the file to convert
+- `meta`: dictionary of meta data key-value pairs to append in the returned document.
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could be possible candidates for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add tests for encoding errors. If the extracted text is
+not one of the valid languages, then it is likely an encoding error resulting
+in garbled text.
+- `encoding`: Select the file encoding (default is `utf-8`)
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
diff --git a/docs/v1.5.0/_src/api/api/generator.md b/docs/v1.5.0/_src/api/api/generator.md
new file mode 100644
index 0000000000..db0d316f0a
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/generator.md
@@ -0,0 +1,299 @@
+
+
+# Module base
+
+
+
+## BaseGenerator
+
+```python
+class BaseGenerator(BaseComponent)
+```
+
+Abstract class for Generators
+
+
+
+#### BaseGenerator.predict
+
+```python
+@abstractmethod
+def predict(query: str, documents: List[Document], top_k: Optional[int]) -> Dict
+```
+
+Abstract method to generate answers.
+
+**Arguments**:
+
+- `query`: Query
+- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on.
+- `top_k`: Number of returned answers
+
+**Returns**:
+
+Generated answers plus additional infos in a dict
+
+
+
+#### BaseGenerator.predict\_batch
+
+```python
+def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None)
+```
+
+Generate the answer to the input queries. The generation will be conditioned on the supplied documents.
+
+These documents can for example be retrieved via the Retriever.
+
+- If you provide a list containing a single query...
+
+ - ... and a single list of Documents, the query will be applied to each Document individually.
+ - ... and a list of lists of Documents, the query will be applied to each list of Documents and the Answers
+ will be aggregated per Document list.
+
+- If you provide a list of multiple queries...
+
+ - ... and a single list of Documents, each query will be applied to each Document individually.
+ - ... and a list of lists of Documents, each query will be applied to its corresponding list of Documents
+ and the Answers will be aggregated per query-Document pair.
+
+**Arguments**:
+
+- `queries`: List of queries.
+- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on.
+Can be a single list of Documents or a list of lists of Documents.
+- `top_k`: Number of returned answers per query.
+- `batch_size`: Not applicable.
+
+**Returns**:
+
+Generated answers plus additional infos in a dict like this:
+```python
+| {'queries': 'who got the first nobel prize in physics',
+| 'answers':
+| [{'query': 'who got the first nobel prize in physics',
+| 'answer': ' albert einstein',
+| 'meta': { 'doc_ids': [...],
+| 'doc_scores': [80.42758 ...],
+| 'doc_probabilities': [40.71379089355469, ...
+| 'content': ['Albert Einstein was a ...]
+| 'titles': ['"Albert Einstein"', ...]
+| }}]}
+```
+
+
+
+# Module transformers
+
+
+
+## RAGenerator
+
+```python
+class RAGenerator(BaseGenerator)
+```
+
+Implementation of Facebook's Retrieval-Augmented Generator (https://arxiv.org/abs/2005.11401) based on
+HuggingFace's transformers (https://huggingface.co/transformers/model_doc/rag.html).
+
+Instead of "finding" the answer within a document, these models **generate** the answer.
+In that sense, RAG follows a similar approach as GPT-3 but it comes with two huge advantages
+for real-world applications:
+a) it has a manageable model size
+b) the answer generation is conditioned on retrieved documents,
+i.e. the model can easily adjust to domain documents even after training has finished
+(in contrast: GPT-3 relies on the web data seen during training)
+
+**Example**
+
+```python
+| query = "who got the first nobel prize in physics?"
+|
+| # Retrieve related documents from retriever
+| retrieved_docs = retriever.retrieve(query=query)
+|
+| # Now generate answer from query and retrieved documents
+| generator.predict(
+| query=query,
+| documents=retrieved_docs,
+| top_k=1
+| )
+|
+| # Answer
+|
+| {'query': 'who got the first nobel prize in physics',
+| 'answers':
+| [{'query': 'who got the first nobel prize in physics',
+| 'answer': ' albert einstein',
+| 'meta': { 'doc_ids': [...],
+| 'doc_scores': [80.42758 ...],
+| 'doc_probabilities': [40.71379089355469, ...
+| 'content': ['Albert Einstein was a ...]
+| 'titles': ['"Albert Einstein"', ...]
+| }}]}
+```
+
+
+
+#### RAGenerator.\_\_init\_\_
+
+```python
+def __init__(model_name_or_path: str = "facebook/rag-token-nq", model_version: Optional[str] = None, retriever: Optional[DensePassageRetriever] = None, generator_type: str = "token", top_k: int = 2, max_length: int = 200, min_length: int = 2, num_beams: int = 2, embed_title: bool = True, prefix: Optional[str] = None, use_gpu: bool = True)
+```
+
+Load a RAG model from Transformers along with passage_embedding_model.
+
+See https://huggingface.co/transformers/model_doc/rag.html for more details
+
+**Arguments**:
+
+- `model_name_or_path`: Directory of a saved model or the name of a public model e.g.
+'facebook/rag-token-nq', 'facebook/rag-sequence-nq'.
+See https://huggingface.co/models for full list of available models.
+- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
+- `retriever`: `DensePassageRetriever` used to embedded passages for the docs passed to `predict()`. This is optional and is only needed if the docs you pass don't already contain embeddings in `Document.embedding`.
+- `generator_type`: Which RAG generator implementation to use ("token" or "sequence")
+- `top_k`: Number of independently generated text to return
+- `max_length`: Maximum length of generated text
+- `min_length`: Minimum length of generated text
+- `num_beams`: Number of beams for beam search. 1 means no beam search.
+- `embed_title`: Embedded the title of passage while generating embedding
+- `prefix`: The prefix used by the generator's tokenizer.
+- `use_gpu`: Whether to use GPU. Falls back on CPU if no GPU is available.
+
+
+
+#### RAGenerator.predict
+
+```python
+def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict
+```
+
+Generate the answer to the input query. The generation will be conditioned on the supplied documents.
+
+These documents can for example be retrieved via the Retriever.
+
+**Arguments**:
+
+- `query`: Query
+- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on.
+- `top_k`: Number of returned answers
+
+**Returns**:
+
+Generated answers plus additional infos in a dict like this:
+```python
+| {'query': 'who got the first nobel prize in physics',
+| 'answers':
+| [{'query': 'who got the first nobel prize in physics',
+| 'answer': ' albert einstein',
+| 'meta': { 'doc_ids': [...],
+| 'doc_scores': [80.42758 ...],
+| 'doc_probabilities': [40.71379089355469, ...
+| 'content': ['Albert Einstein was a ...]
+| 'titles': ['"Albert Einstein"', ...]
+| }}]}
+```
+
+
+
+## Seq2SeqGenerator
+
+```python
+class Seq2SeqGenerator(BaseGenerator)
+```
+
+A generic sequence-to-sequence generator based on HuggingFace's transformers.
+
+Text generation is supported by so called auto-regressive language models like GPT2,
+XLNet, XLM, Bart, T5 and others. In fact, any HuggingFace language model that extends
+GenerationMixin can be used by Seq2SeqGenerator.
+
+Moreover, as language models prepare model input in their specific encoding, each model
+specified with model_name_or_path parameter in this Seq2SeqGenerator should have an
+accompanying model input converter that takes care of prefixes, separator tokens etc.
+By default, we provide model input converters for a few well-known seq2seq language models (e.g. ELI5).
+It is the responsibility of Seq2SeqGenerator user to ensure an appropriate model input converter
+is either already registered or specified on a per-model basis in the Seq2SeqGenerator constructor.
+
+For more details on custom model input converters refer to _BartEli5Converter
+
+
+See https://huggingface.co/transformers/main_classes/model.html?transformers.generation_utils.GenerationMixin#transformers.generation_utils.GenerationMixin
+as well as https://huggingface.co/blog/how-to-generate
+
+For a list of all text-generation models see https://huggingface.co/models?pipeline_tag=text-generation
+
+**Example**
+
+```python
+| query = "Why is Dothraki language important?"
+|
+| # Retrieve related documents from retriever
+| retrieved_docs = retriever.retrieve(query=query)
+|
+| # Now generate answer from query and retrieved documents
+| generator.predict(
+| query=query,
+| documents=retrieved_docs,
+| top_k=1
+| )
+|
+| # Answer
+|
+| {'query': 'who got the first nobel prize in physics',
+| 'answers':
+| [{'query': 'who got the first nobel prize in physics',
+| 'answer': ' albert einstein',
+| 'meta': { 'doc_ids': [...],
+| 'doc_scores': [80.42758 ...],
+| 'doc_probabilities': [40.71379089355469, ...
+| 'content': ['Albert Einstein was a ...]
+| 'titles': ['"Albert Einstein"', ...]
+| }}]}
+```
+
+
+
+#### Seq2SeqGenerator.\_\_init\_\_
+
+```python
+def __init__(model_name_or_path: str, input_converter: Optional[Callable] = None, top_k: int = 1, max_length: int = 200, min_length: int = 2, num_beams: int = 8, use_gpu: bool = True)
+```
+
+**Arguments**:
+
+- `model_name_or_path`: a HF model name for auto-regressive language model like GPT2, XLNet, XLM, Bart, T5 etc
+- `input_converter`: an optional Callable to prepare model input for the underlying language model
+specified in model_name_or_path parameter. The required __call__ method signature for
+the Callable is:
+__call__(tokenizer: PreTrainedTokenizer, query: str, documents: List[Document],
+top_k: Optional[int] = None) -> BatchEncoding:
+- `top_k`: Number of independently generated text to return
+- `max_length`: Maximum length of generated text
+- `min_length`: Minimum length of generated text
+- `num_beams`: Number of beams for beam search. 1 means no beam search.
+- `use_gpu`: Whether to use GPU or the CPU. Falls back on CPU if no GPU is available.
+
+
+
+#### Seq2SeqGenerator.predict
+
+```python
+def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict
+```
+
+Generate the answer to the input query. The generation will be conditioned on the supplied documents.
+
+These documents can be retrieved via the Retriever or supplied directly via the predict method.
+
+**Arguments**:
+
+- `query`: Query
+- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on.
+- `top_k`: Number of returned answers
+
+**Returns**:
+
+Generated answers
+
diff --git a/docs/v1.5.0/_src/api/api/other.md b/docs/v1.5.0/_src/api/api/other.md
new file mode 100644
index 0000000000..67995fbd4b
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/other.md
@@ -0,0 +1,124 @@
+
+
+# Module docs2answers
+
+
+
+## Docs2Answers
+
+```python
+class Docs2Answers(BaseComponent)
+```
+
+This Node is used to convert retrieved documents into predicted answers format.
+It is useful for situations where you are calling a Retriever only pipeline via REST API.
+This ensures that your output is in a compatible format.
+
+
+
+# Module join\_docs
+
+
+
+## JoinDocuments
+
+```python
+class JoinDocuments(BaseComponent)
+```
+
+A node to join documents outputted by multiple retriever nodes.
+
+The node allows multiple join modes:
+* concatenate: combine the documents from multiple nodes. Any duplicate documents are discarded.
+ The score is only determined by the last node that outputs the document.
+* merge: merge scores of documents from multiple nodes. Optionally, each input score can be given a different
+ `weight` & a `top_k` limit can be set. This mode can also be used for "reranking" retrieved documents.
+* reciprocal_rank_fusion: combines the documents based on their rank in multiple nodes.
+
+
+
+#### JoinDocuments.\_\_init\_\_
+
+```python
+def __init__(join_mode: str = "concatenate", weights: Optional[List[float]] = None, top_k_join: Optional[int] = None)
+```
+
+**Arguments**:
+
+- `join_mode`: `concatenate` to combine documents from multiple retrievers, `merge` to aggregate scores of
+individual documents, `reciprocal_rank_fusion` to apply rank-based scoring.
+- `weights`: A node-wise list(length of list must be equal to the number of input nodes) of weights for
+adjusting document scores when using the `merge` join_mode. By default, equal weight is given
+to each retriever score. This param is not compatible with the `concatenate` join_mode.
+- `top_k_join`: Limit documents to top_k based on the resulting scores of the join.
+
+
+
+# Module join\_answers
+
+
+
+## JoinAnswers
+
+```python
+class JoinAnswers(BaseComponent)
+```
+
+A node to join `Answer`s produced by multiple `Reader` nodes.
+
+
+
+#### JoinAnswers.\_\_init\_\_
+
+```python
+def __init__(join_mode: str = "concatenate", weights: Optional[List[float]] = None, top_k_join: Optional[int] = None, sort_by_score: bool = True)
+```
+
+**Arguments**:
+
+- `join_mode`: `"concatenate"` to combine documents from multiple `Reader`s. `"merge"` to aggregate scores
+of individual `Answer`s.
+- `weights`: A node-wise list (length of list must be equal to the number of input nodes) of weights for
+adjusting `Answer` scores when using the `"merge"` join_mode. By default, equal weight is assigned to each
+`Reader` score. This parameter is not compatible with the `"concatenate"` join_mode.
+- `top_k_join`: Limit `Answer`s to top_k based on the resulting scores of the join.
+- `sort_by_score`: Whether to sort the incoming answers by their score. Set this to True if your Answers
+are coming from a Reader or TableReader. Set to False if any Answers come from a Generator since this assigns
+None as a score to each.
+
+
+
+# Module route\_documents
+
+
+
+## RouteDocuments
+
+```python
+class RouteDocuments(BaseComponent)
+```
+
+A node to split a list of `Document`s by `content_type` or by the values of a metadata field and route them to
+different nodes.
+
+
+
+#### RouteDocuments.\_\_init\_\_
+
+```python
+def __init__(split_by: str = "content_type", metadata_values: Optional[List[str]] = None)
+```
+
+**Arguments**:
+
+- `split_by`: Field to split the documents by, either `"content_type"` or a metadata field name.
+If this parameter is set to `"content_type"`, the list of `Document`s will be split into a list containing
+only `Document`s of type `"text"` (will be routed to `"output_1"`) and a list containing only `Document`s of
+type `"table"` (will be routed to `"output_2"`).
+If this parameter is set to a metadata field name, you need to specify the parameter `metadata_values` as
+well.
+- `metadata_values`: If the parameter `split_by` is set to a metadata field name, you need to provide a list
+of values to group the `Document`s to. `Document`s whose metadata field is equal to the first value of the
+provided list will be routed to `"output_1"`, `Document`s whose metadata field is equal to the second
+value of the provided list will be routed to `"output_2"`, etc.
+
diff --git a/docs/v1.5.0/_src/api/api/pipelines.md b/docs/v1.5.0/_src/api/api/pipelines.md
new file mode 100644
index 0000000000..da0b500bcb
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/pipelines.md
@@ -0,0 +1,1649 @@
+
+
+# Module base
+
+
+
+## Pipeline
+
+```python
+class Pipeline()
+```
+
+Pipeline brings together building blocks to build a complex search pipeline with Haystack and user-defined components.
+
+Under the hood, a Pipeline is represented as a directed acyclic graph of component nodes. You can use it for custom query flows with the option to branch queries (for example, extractive question answering and keyword match query), merge candidate documents for a Reader from multiple Retrievers, or re-ranking of candidate documents.
+
+
+
+#### Pipeline.root\_node
+
+```python
+@property
+def root_node() -> Optional[str]
+```
+
+Returns the root node of the pipeline's graph.
+
+
+
+#### Pipeline.components
+
+```python
+@property
+def components() -> Dict[str, BaseComponent]
+```
+
+Returns all components used by this pipeline.
+Note that this also includes such components that are being utilized by other components only and are not being used as a pipeline node directly.
+
+
+
+#### Pipeline.to\_code
+
+```python
+def to_code(pipeline_variable_name: str = "pipeline", generate_imports: bool = True, add_comment: bool = False) -> str
+```
+
+Returns the code to create this pipeline as string.
+
+**Arguments**:
+
+- `pipeline_variable_name`: The variable name of the generated pipeline.
+Default value is 'pipeline'.
+- `generate_imports`: Whether to include the required import statements into the code.
+Default value is True.
+- `add_comment`: Whether to add a preceding comment that this code has been generated.
+Default value is False.
+
+
+
+#### Pipeline.to\_notebook\_cell
+
+```python
+def to_notebook_cell(pipeline_variable_name: str = "pipeline", generate_imports: bool = True, add_comment: bool = True)
+```
+
+Creates a new notebook cell with the code to create this pipeline.
+
+**Arguments**:
+
+- `pipeline_variable_name`: The variable name of the generated pipeline.
+Default value is 'pipeline'.
+- `generate_imports`: Whether to include the required import statements into the code.
+Default value is True.
+- `add_comment`: Whether to add a preceding comment that this code has been generated.
+Default value is True.
+
+
+
+#### Pipeline.load\_from\_deepset\_cloud
+
+```python
+@classmethod
+def load_from_deepset_cloud(cls, pipeline_config_name: str, pipeline_name: str = "query", workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, overwrite_with_env_variables: bool = False)
+```
+
+Load Pipeline from Deepset Cloud defining the individual components and how they're tied together to form
+
+a Pipeline. A single config can declare multiple Pipelines, in which case an explicit `pipeline_name` must
+be passed.
+
+In order to get a list of all available pipeline_config_names, call `list_pipelines_on_deepset_cloud()`.
+Use the returned `name` as `pipeline_config_name`.
+
+**Arguments**:
+
+- `pipeline_config_name`: name of the config file inside the Deepset Cloud workspace.
+To get a list of all available pipeline_config_names, call `list_pipelines_on_deepset_cloud()`.
+- `pipeline_name`: specifies which pipeline to load from config.
+Deepset Cloud typically provides a 'query' and an 'index' pipeline per config.
+- `workspace`: workspace in Deepset Cloud
+- `api_key`: Secret value of the API key.
+If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable.
+- `api_endpoint`: The URL of the Deepset Cloud API.
+If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable.
+- `overwrite_with_env_variables`: Overwrite the config with environment variables. For example,
+to change return_no_answer param for a FARMReader, an env
+variable 'READER_PARAMS_RETURN_NO_ANSWER=False' can be set. Note that an
+`_` sign must be used to specify nested hierarchical properties.
+
+
+
+#### Pipeline.list\_pipelines\_on\_deepset\_cloud
+
+```python
+@classmethod
+def list_pipelines_on_deepset_cloud(cls, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None) -> List[dict]
+```
+
+Lists all pipeline configs available on Deepset Cloud.
+
+**Arguments**:
+
+- `workspace`: workspace in Deepset Cloud
+- `api_key`: Secret value of the API key.
+If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable.
+- `api_endpoint`: The URL of the Deepset Cloud API.
+If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable.
+
+Returns:
+ list of dictionaries: List[dict]
+ each dictionary: {
+ "name": str -> `pipeline_config_name` to be used in `load_from_deepset_cloud()`,
+ "..." -> additional pipeline meta information
+ }
+ example:
+ [{'name': 'my_super_nice_pipeline_config',
+ 'pipeline_id': '2184e0c1-c6ec-40a1-9b28-5d2768e5efa2',
+ 'status': 'DEPLOYED',
+ 'created_at': '2022-02-01T09:57:03.803991+00:00',
+ 'deleted': False,
+ 'is_default': False,
+ 'indexing': {'status': 'IN_PROGRESS',
+ 'pending_file_count': 3,
+ 'total_file_count': 31}}]
+
+
+
+#### Pipeline.save\_to\_deepset\_cloud
+
+```python
+@classmethod
+def save_to_deepset_cloud(cls, query_pipeline: Pipeline, index_pipeline: Pipeline, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, overwrite: bool = False)
+```
+
+Saves a Pipeline config to Deepset Cloud defining the individual components and how they're tied together to form
+
+a Pipeline. A single config must declare a query pipeline and an index pipeline.
+
+**Arguments**:
+
+- `query_pipeline`: the query pipeline to save.
+- `index_pipeline`: the index pipeline to save.
+- `pipeline_config_name`: name of the config file inside the Deepset Cloud workspace.
+- `workspace`: workspace in Deepset Cloud
+- `api_key`: Secret value of the API key.
+If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable.
+- `api_endpoint`: The URL of the Deepset Cloud API.
+If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable.
+- `overwrite`: Whether to overwrite the config if it already exists. Otherwise an error is raised.
+
+
+
+#### Pipeline.deploy\_on\_deepset\_cloud
+
+```python
+@classmethod
+def deploy_on_deepset_cloud(cls, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, timeout: int = 60, show_curl_message: bool = True)
+```
+
+Deploys the pipelines of a pipeline config on Deepset Cloud.
+
+Blocks until pipelines are successfully deployed, deployment failed or timeout exceeds.
+If pipelines are already deployed no action will be taken and an info will be logged.
+If timeout exceeds a TimeoutError will be raised.
+If deployment fails a DeepsetCloudError will be raised.
+
+Pipeline config must be present on Deepset Cloud. See save_to_deepset_cloud() for more information.
+
+**Arguments**:
+
+- `pipeline_config_name`: name of the config file inside the Deepset Cloud workspace.
+- `workspace`: workspace in Deepset Cloud
+- `api_key`: Secret value of the API key.
+If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable.
+- `api_endpoint`: The URL of the Deepset Cloud API.
+If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable.
+- `timeout`: The time in seconds to wait until deployment completes.
+If the timeout is exceeded an error will be raised.
+- `show_curl_message`: Whether to print an additional message after successful deployment showing how to query the pipeline using curl.
+
+
+
+#### Pipeline.undeploy\_on\_deepset\_cloud
+
+```python
+@classmethod
+def undeploy_on_deepset_cloud(cls, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, timeout: int = 60)
+```
+
+Undeploys the pipelines of a pipeline config on Deepset Cloud.
+
+Blocks until pipelines are successfully undeployed, undeployment failed or timeout exceeds.
+If pipelines are already undeployed no action will be taken and an info will be logged.
+If timeout exceeds a TimeoutError will be raised.
+If deployment fails a DeepsetCloudError will be raised.
+
+Pipeline config must be present on Deepset Cloud. See save_to_deepset_cloud() for more information.
+
+**Arguments**:
+
+- `pipeline_config_name`: name of the config file inside the Deepset Cloud workspace.
+- `workspace`: workspace in Deepset Cloud
+- `api_key`: Secret value of the API key.
+If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable.
+- `api_endpoint`: The URL of the Deepset Cloud API.
+If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable.
+- `timeout`: The time in seconds to wait until undeployment completes.
+If the timeout is exceeded an error will be raised.
+
+
+
+#### Pipeline.add\_node
+
+```python
+def add_node(component: BaseComponent, name: str, inputs: List[str])
+```
+
+Add a new node to the pipeline.
+
+**Arguments**:
+
+- `component`: The object to be called when the data is passed to the node. It can be a Haystack component
+(like Retriever, Reader, or Generator) or a user-defined object that implements a run()
+method to process incoming data from predecessor node.
+- `name`: The name for the node. It must not contain any dots.
+- `inputs`: A list of inputs to the node. If the predecessor node has a single outgoing edge, just the name
+of node is sufficient. For instance, a 'BM25Retriever' node would always output a single
+edge with a list of documents. It can be represented as ["BM25Retriever"].
+
+In cases when the predecessor node has multiple outputs, e.g., a "QueryClassifier", the output
+must be specified explicitly as "QueryClassifier.output_2".
+
+
+
+#### Pipeline.get\_node
+
+```python
+def get_node(name: str) -> Optional[BaseComponent]
+```
+
+Get a node from the Pipeline.
+
+**Arguments**:
+
+- `name`: The name of the node.
+
+
+
+#### Pipeline.set\_node
+
+```python
+def set_node(name: str, component)
+```
+
+Set the component for a node in the Pipeline.
+
+**Arguments**:
+
+- `name`: The name of the node.
+- `component`: The component object to be set at the node.
+
+
+
+#### Pipeline.run
+
+```python
+def run(query: Optional[str] = None, file_paths: Optional[List[str]] = None, labels: Optional[MultiLabel] = None, documents: Optional[List[Document]] = None, meta: Optional[Union[dict, List[dict]]] = None, params: Optional[dict] = None, debug: Optional[bool] = None)
+```
+
+Runs the Pipeline, one node at a time.
+
+**Arguments**:
+
+- `query`: The search query (for query pipelines only).
+- `file_paths`: The files to index (for indexing pipelines only).
+- `labels`: Ground-truth labels that you can use to perform an isolated evaluation of pipelines. These labels are input to nodes in the pipeline.
+- `documents`: A list of Document objects to be processed by the Pipeline Nodes.
+- `meta`: Files' metadata. Used in indexing pipelines in combination with `file_paths`.
+- `params`: Dictionary of parameters to be dispatched to the nodes.
+To pass a parameter to all Nodes, use: `{"top_k": 10}`.
+To pass a parameter to targeted Nodes, run:
+ `{"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}}`
+- `debug`: Specifies whether the Pipeline should instruct Nodes to collect debug information
+about their execution. By default, this information includes the input parameters
+the Nodes received and the output they generated. You can then find all debug information in the dictionary returned by this method under the key `_debug`.
+
+
+
+#### Pipeline.run\_batch
+
+```python
+def run_batch(queries: List[str] = None, file_paths: Optional[List[str]] = None, labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None, documents: Optional[Union[List[Document], List[List[Document]]]] = None, meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, params: Optional[dict] = None, debug: Optional[bool] = None)
+```
+
+Runs the Pipeline in a batch mode, one node at a time. The batch mode means that the Pipeline can take more than one query as input. You can use this method for query pipelines only. When used with an indexing pipeline, it calls the pipeline `run()` method.
+
+Here's what this method returns for Retriever-Reader pipelines:
+- Single query: Retrieves top-k relevant Documents and returns a list of answers for each retrieved Document.
+- A list of queries: Retrieves top-k relevant Documents for each query and returns a list of answers for each query.
+
+Here's what this method returns for Reader-only pipelines:
+- Single query + a list of Documents: Applies the query to each Document individually and returns answers for each single Document.
+- Single query + a list of lists of Documents: Applies the query to each list of Documents and returns aggregated answers for each list of Documents.
+- A list of queries + a list of Documents: Applies each query to each Document individually and returns answers for each query-document pair.
+- A list of queries + a list of lists of Documents: Applies each query to its corresponding Document list and aggregates answers for each list of Documents.
+
+**Arguments**:
+
+- `queries`: List of search queries (for query pipelines only).
+- `file_paths`: The files to index (for indexing pipelines only). If you provide `file_paths` the Pipeline's `run` method instead of `run_batch` is called.
+- `labels`: Ground-truth labels that you can use to perform an isolated evaluation of pipelines. These labels are input to nodes in the pipeline.
+- `documents`: A list of Document objects or a list of lists of Document objects to be processed by the Pipeline Nodes.
+- `meta`: Files' metadata. Used in indexing pipelines in combination with `file_paths`.
+- `params`: Dictionary of parameters to be dispatched to the nodes.
+To pass a parameter to all Nodes, use: `{"top_k": 10}`.
+To pass a parameter to targeted Nodes, run:
+ `{"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}}`
+- `debug`: Specifies whether the Pipeline should instruct Nodes to collect debug information
+about their execution. By default, this information includes the input parameters
+the Nodes received and the output they generated. You can then find all debug information in the dictionary returned by this method under the key `_debug`.
+
+
+
+#### Pipeline.eval\_beir
+
+```python
+@classmethod
+def eval_beir(cls, index_pipeline: Pipeline, query_pipeline: Pipeline, index_params: dict = {}, query_params: dict = {}, dataset: str = "scifact", dataset_dir: Path = Path("."), top_k_values: List[int] = [1, 3, 5, 10, 100, 1000], keep_index: bool = False) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]]
+```
+
+Runs information retrieval evaluation of a pipeline using BEIR on a specified BEIR dataset.
+
+See https://github.com/beir-cellar/beir for more information.
+
+**Arguments**:
+
+- `index_pipeline`: The indexing pipeline to use.
+- `query_pipeline`: The query pipeline to evaluate.
+- `index_params`: The params to use during indexing (see pipeline.run's params).
+- `query_params`: The params to use during querying (see pipeline.run's params).
+- `dataset`: The BEIR dataset to use.
+- `dataset_dir`: The directory to store the dataset to.
+- `top_k_values`: The top_k values each metric will be calculated for.
+- `keep_index`: Whether to keep the index after evaluation.
+If True the index will be kept after beir evaluation. Otherwise it will be deleted immediately afterwards.
+ Defaults to False.
+
+Returns a tuple containing the ndcg, map, recall and precision scores.
+Each metric is represented by a dictionary containing the scores for each top_k value.
+
+
+
+#### Pipeline.execute\_eval\_run
+
+```python
+@classmethod
+def execute_eval_run(cls, index_pipeline: Pipeline, query_pipeline: Pipeline, evaluation_set_labels: List[MultiLabel], corpus_file_paths: List[str], experiment_name: str, experiment_run_name: str, experiment_tracking_tool: Literal["mlflow", None] = None, experiment_tracking_uri: Optional[str] = None, corpus_file_metas: List[Dict[str, Any]] = None, corpus_meta: Dict[str, Any] = {}, evaluation_set_meta: Dict[str, Any] = {}, pipeline_meta: Dict[str, Any] = {}, index_params: dict = {}, query_params: dict = {}, sas_model_name_or_path: str = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, reuse_index: bool = False, custom_document_id_field: Optional[str] = None, document_scope: Literal[
+ "document_id",
+ "context",
+ "document_id_and_context",
+ "document_id_or_context",
+ "answer",
+ "document_id_or_answer",
+ ] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0) -> EvaluationResult
+```
+
+Starts an experiment run that first indexes the specified files (forming a corpus) using the index pipeline
+
+and subsequently evaluates the query pipeline on the provided labels (forming an evaluation set) using pipeline.eval().
+Parameters and results (metrics and predictions) of the run are tracked by an experiment tracking tool for further analysis.
+You can specify the experiment tracking tool by setting the params `experiment_tracking_tool` and `experiment_tracking_uri`
+or by passing a (custom) tracking head to Tracker.set_tracking_head().
+Note, that `experiment_tracking_tool` only supports `mlflow` currently.
+
+For easier comparison you can pass additional metadata regarding corpus (corpus_meta), evaluation set (evaluation_set_meta) and pipelines (pipeline_meta).
+E.g. you can give them names or ids to identify them across experiment runs.
+
+This method executes an experiment run. Each experiment run is part of at least one experiment.
+An experiment typically consists of multiple runs to be compared (e.g. using different retrievers in query pipeline).
+Experiment tracking tools usually share the same concepts of experiments and provide additional functionality to easily compare runs across experiments.
+
+E.g. you can call execute_eval_run() multiple times with different retrievers in your query pipeline and compare the runs in mlflow:
+
+```python
+ | for retriever_type, query_pipeline in zip(["sparse", "dpr", "embedding"], [sparse_pipe, dpr_pipe, embedding_pipe]):
+ | eval_result = Pipeline.execute_eval_run(
+ | index_pipeline=index_pipeline,
+ | query_pipeline=query_pipeline,
+ | evaluation_set_labels=labels,
+ | corpus_file_paths=file_paths,
+ | corpus_file_metas=file_metas,
+ | experiment_tracking_tool="mlflow",
+ | experiment_tracking_uri="http://localhost:5000",
+ | experiment_name="my-retriever-experiment",
+ | experiment_run_name=f"run_{retriever_type}",
+ | pipeline_meta={"name": f"my-pipeline-{retriever_type}"},
+ | evaluation_set_meta={"name": "my-evalset"},
+ | corpus_meta={"name": "my-corpus"},
+ | reuse_index=False
+ | )
+```
+
+**Arguments**:
+
+- `index_pipeline`: The indexing pipeline to use.
+- `query_pipeline`: The query pipeline to evaluate.
+- `evaluation_set_labels`: The labels to evaluate on forming an evaluation set.
+- `corpus_file_paths`: The files to be indexed and searched during evaluation forming a corpus.
+- `experiment_name`: The name of the experiment
+- `experiment_run_name`: The name of the experiment run
+- `experiment_tracking_tool`: The experiment tracking tool to be used. Currently we only support "mlflow".
+If left unset the current TrackingHead specified by Tracker.set_tracking_head() will be used.
+- `experiment_tracking_uri`: The uri of the experiment tracking server to be used. Must be specified if experiment_tracking_tool is set.
+You can use deepset's public mlflow server via https://public-mlflow.deepset.ai/.
+Note, that artifact logging (e.g. Pipeline YAML or evaluation result CSVs) are currently not allowed on deepset's public mlflow server as this might expose sensitive data.
+- `corpus_file_metas`: The optional metadata to be stored for each corpus file (e.g. title).
+- `corpus_meta`: Metadata about the corpus to track (e.g. name, date, author, version).
+- `evaluation_set_meta`: Metadata about the evalset to track (e.g. name, date, author, version).
+- `pipeline_meta`: Metadata about the pipelines to track (e.g. name, author, version).
+- `index_params`: The params to use during indexing (see pipeline.run's params).
+- `query_params`: The params to use during querying (see pipeline.run's params).
+- `sas_model_name_or_path`: Name or path of "Semantic Answer Similarity (SAS) model". When set, the model will be used to calculate similarity between predictions and labels and generate the SAS metric.
+The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps.
+Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture.
+More info in the paper: https://arxiv.org/abs/2108.06130
+Models:
+- You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data.
+Not all cross encoders can be used because of different return types.
+If you use custom cross encoders please make sure they work with sentence_transformers.CrossEncoder class
+- Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
+- Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large"
+- Large model for German only: "deepset/gbert-large-sts"
+- `sas_batch_size`: Number of prediction label pairs to encode at once by CrossEncoder or SentenceTransformer while calculating SAS.
+- `sas_use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity.
+Falls back to CPU if no GPU is available.
+- `add_isolated_node_eval`: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode.
+This mode helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node.
+If a node performs much better in the isolated evaluation than in the integrated evaluation, the previous node needs to be optimized to improve the pipeline's performance.
+If a node's performance is similar in both modes, this node itself needs to be optimized to improve the pipeline's performance.
+The isolated evaluation calculates the upper bound of each node's evaluation metrics under the assumption that it received perfect inputs from the previous node.
+To this end, labels are used as input to the node instead of the output of the previous node in the pipeline.
+The generated dataframes in the EvaluationResult then contain additional rows, which can be distinguished from the integrated evaluation results based on the
+values "integrated" or "isolated" in the column "eval_mode" and the evaluation report then additionally lists the upper bound of each node's evaluation metrics.
+- `reuse_index`: Whether to reuse existing non-empty index and to keep the index after evaluation.
+If True the index will be kept after evaluation and no indexing will take place if index has already documents. Otherwise it will be deleted immediately afterwards.
+Defaults to False.
+- `custom_document_id_field`: Custom field name within `Document`'s `meta` which identifies the document and is being used as criterion for matching documents to labels during evaluation.
+This is especially useful if you want to match documents on other criteria (e.g. file names) than the default document ids as these could be heavily influenced by preprocessing.
+If not set (default) the `Document`'s `id` is being used as criterion for matching documents to labels.
+- `document_scope`: A criterion for deciding whether documents are relevant or not.
+You can select between:
+- 'document_id': Specifies that the document ID must match. You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
+ A typical use case is Document Retrieval.
+- 'context': Specifies that the content of the document must match. Uses fuzzy matching (see `context_matching_...` params).
+ A typical use case is Document-Independent Passage Retrieval.
+- 'document_id_and_context': A Boolean operation specifying that both `'document_id' AND 'context'` must match.
+ A typical use case is Document-Specific Passage Retrieval.
+- 'document_id_or_context': A Boolean operation specifying that either `'document_id' OR 'context'` must match.
+ A typical use case is Document Retrieval having sparse context labels.
+- 'answer': Specifies that the document contents must include the answer. The selected `answer_scope` is enforced automatically.
+ A typical use case is Question Answering.
+- 'document_id_or_answer' (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match.
+ This is intended to be a proper default value in order to support both main use cases:
+ - Document Retrieval
+ - Question Answering
+The default value is 'document_id_or_answer'.
+- `answer_scope`: Specifies the scope in which a matching answer is considered correct.
+You can select between:
+- 'any' (default): Any matching answer is considered correct.
+- 'context': The answer is only considered correct if its context matches as well.
+ Uses fuzzy matching (see `context_matching_...` params).
+- 'document_id': The answer is only considered correct if its document ID matches as well.
+ You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
+- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
+The default value is 'any'.
+In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
+- `context_matching_min_length`: The minimum string length context and candidate need to have in order to be scored.
+Returns 0.0 otherwise.
+- `context_matching_boost_split_overlaps`: Whether to boost split overlaps (e.g. [AB] <-> [BC]) that result from different preprocessing params.
+If we detect that the score is near a half match and the matching part of the candidate is at its boundaries
+we cut the context on the same side, recalculate the score and take the mean of both.
+Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scoring ~75 in total.
+- `context_matching_threshold`: Score threshold that candidates must surpass to be included into the result list. Range: [0,100]
+
+
+
+#### Pipeline.eval
+
+```python
+@send_event
+def eval(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: str = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0) -> EvaluationResult
+```
+
+Evaluates the pipeline by running the pipeline once per query in debug mode
+
+and putting together all data that is needed for evaluation, e.g. calculating metrics.
+
+If you want to calculate SAS (Semantic Answer Similarity) metrics, you have to specify `sas_model_name_or_path`.
+
+You will be able to control the scope within which an answer or a document is considered correct afterwards (See `document_scope` and `answer_scope` params in `EvaluationResult.calculate_metrics()`).
+Some of these scopes require additional information that already needs to be specified during `eval()`:
+- `custom_document_id_field` param to select a custom document ID from document's meta data for ID matching (only affects 'document_id' scopes)
+- `context_matching_...` param to fine-tune the fuzzy matching mechanism that determines whether some text contexts match each other (only affects 'context' scopes, default values should work most of the time)
+
+**Arguments**:
+
+- `labels`: The labels to evaluate on
+- `documents`: List of List of Document that the first node in the pipeline should get as input per multilabel. Can be used to evaluate a pipeline that consists of a reader without a retriever.
+- `params`: Dictionary of parameters to be dispatched to the nodes.
+If you want to pass a param to all nodes, you can just use: {"top_k":10}
+If you want to pass it to targeted nodes, you can do:
+{"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}}
+- `sas_model_name_or_path`: Name or path of "Semantic Answer Similarity (SAS) model". When set, the model will be used to calculate similarity between predictions and labels and generate the SAS metric.
+The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps.
+Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture.
+More info in the paper: https://arxiv.org/abs/2108.06130
+Models:
+- You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data.
+Not all cross encoders can be used because of different return types.
+If you use custom cross encoders please make sure they work with sentence_transformers.CrossEncoder class
+- Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
+- Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large"
+- Large model for German only: "deepset/gbert-large-sts"
+- `sas_batch_size`: Number of prediction label pairs to encode at once by CrossEncoder or SentenceTransformer while calculating SAS.
+- `sas_use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity.
+Falls back to CPU if no GPU is available.
+- `add_isolated_node_eval`: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode.
+This mode helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node.
+If a node performs much better in the isolated evaluation than in the integrated evaluation, the previous node needs to be optimized to improve the pipeline's performance.
+If a node's performance is similar in both modes, this node itself needs to be optimized to improve the pipeline's performance.
+The isolated evaluation calculates the upper bound of each node's evaluation metrics under the assumption that it received perfect inputs from the previous node.
+To this end, labels are used as input to the node instead of the output of the previous node in the pipeline.
+The generated dataframes in the EvaluationResult then contain additional rows, which can be distinguished from the integrated evaluation results based on the
+values "integrated" or "isolated" in the column "eval_mode" and the evaluation report then additionally lists the upper bound of each node's evaluation metrics.
+- `custom_document_id_field`: Custom field name within `Document`'s `meta` which identifies the document and is being used as criterion for matching documents to labels during evaluation.
+This is especially useful if you want to match documents on other criteria (e.g. file names) than the default document ids as these could be heavily influenced by preprocessing.
+If not set (default) the `Document`'s `id` is being used as criterion for matching documents to labels.
+- `context_matching_min_length`: The minimum string length context and candidate need to have in order to be scored.
+Returns 0.0 otherwise.
+- `context_matching_boost_split_overlaps`: Whether to boost split overlaps (e.g. [AB] <-> [BC]) that result from different preprocessing params.
+If we detect that the score is near a half match and the matching part of the candidate is at its boundaries
+we cut the context on the same side, recalculate the score and take the mean of both.
+Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scoring ~75 in total.
+- `context_matching_threshold`: Score threshold that candidates must surpass to be included into the result list. Range: [0,100]
+
+
+
+#### Pipeline.get\_nodes\_by\_class
+
+```python
+def get_nodes_by_class(class_type) -> List[Any]
+```
+
+Gets all nodes in the pipeline that are an instance of a certain class (incl. subclasses).
+
+This is for example helpful if you loaded a pipeline and then want to interact directly with the document store.
+Example:
+```python
+| from haystack.document_stores.base import BaseDocumentStore
+| INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME)
+| res = INDEXING_PIPELINE.get_nodes_by_class(class_type=BaseDocumentStore)
+```
+
+**Returns**:
+
+List of components that are an instance of the requested class
+
+
+
+#### Pipeline.get\_document\_store
+
+```python
+def get_document_store() -> Optional[BaseDocumentStore]
+```
+
+Return the document store object used in the current pipeline.
+
+**Returns**:
+
+Instance of DocumentStore or None
+
+
+
+#### Pipeline.draw
+
+```python
+def draw(path: Path = Path("pipeline.png"))
+```
+
+Create a Graphviz visualization of the pipeline.
+
+**Arguments**:
+
+- `path`: the path to save the image.
+
+
+
+#### Pipeline.load\_from\_yaml
+
+```python
+@classmethod
+def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, strict_version_check: bool = False)
+```
+
+Load Pipeline from a YAML file defining the individual components and how they're tied together to form
+
+a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must
+be passed.
+
+Here's a sample configuration:
+
+ ```yaml
+ | version: '1.0.0'
+ |
+ | components: # define all the building-blocks for Pipeline
+ | - name: MyReader # custom-name for the component; helpful for visualization & debugging
+ | type: FARMReader # Haystack Class name for the component
+ | params:
+ | no_ans_boost: -10
+ | model_name_or_path: deepset/roberta-base-squad2
+ | - name: MyESRetriever
+ | type: BM25Retriever
+ | params:
+ | document_store: MyDocumentStore # params can reference other components defined in the YAML
+ | custom_query: null
+ | - name: MyDocumentStore
+ | type: ElasticsearchDocumentStore
+ | params:
+ | index: haystack_test
+ |
+ | pipelines: # multiple Pipelines can be defined using the components from above
+ | - name: my_query_pipeline # a simple extractive-qa Pipeline
+ | nodes:
+ | - name: MyESRetriever
+ | inputs: [Query]
+ | - name: MyReader
+ | inputs: [MyESRetriever]
+ ```
+
+Note that, in case of a mismatch in version between Haystack and the YAML, a warning will be printed.
+If the pipeline loads correctly regardless, save the pipeline again using `Pipeline.save_to_yaml()` to remove the warning.
+
+**Arguments**:
+
+- `path`: path of the YAML file.
+- `pipeline_name`: if the YAML contains multiple pipelines, the pipeline_name to load must be set.
+- `overwrite_with_env_variables`: Overwrite the YAML configuration with environment variables. For example,
+to change index name param for an ElasticsearchDocumentStore, an env
+variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
+`_` sign must be used to specify nested hierarchical properties.
+- `strict_version_check`: whether to fail in case of a version mismatch (throws a warning otherwise)
+
+
+
+#### Pipeline.load\_from\_config
+
+```python
+@classmethod
+def load_from_config(cls, pipeline_config: Dict, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, strict_version_check: bool = False)
+```
+
+Load Pipeline from a config dict defining the individual components and how they're tied together to form
+
+a Pipeline. A single config can declare multiple Pipelines, in which case an explicit `pipeline_name` must
+be passed.
+
+Here's a sample configuration:
+
+ ```python
+ | {
+ | "version": "ignore",
+ | "components": [
+ | { # define all the building-blocks for Pipeline
+ | "name": "MyReader", # custom-name for the component; helpful for visualization & debugging
+ | "type": "FARMReader", # Haystack Class name for the component
+ | "params": {"no_ans_boost": -10, "model_name_or_path": "deepset/roberta-base-squad2"},
+ | },
+ | {
+ | "name": "MyESRetriever",
+ | "type": "BM25Retriever",
+ | "params": {
+ | "document_store": "MyDocumentStore", # params can reference other components defined in the YAML
+ | "custom_query": None,
+ | },
+ | },
+ | {"name": "MyDocumentStore", "type": "ElasticsearchDocumentStore", "params": {"index": "haystack_test"}},
+ | ],
+ | "pipelines": [
+ | { # multiple Pipelines can be defined using the components from above
+ | "name": "my_query_pipeline", # a simple extractive-qa Pipeline
+ | "nodes": [
+ | {"name": "MyESRetriever", "inputs": ["Query"]},
+ | {"name": "MyReader", "inputs": ["MyESRetriever"]},
+ | ],
+ | }
+ | ],
+ | }
+ ```
+
+**Arguments**:
+
+- `pipeline_config`: the pipeline config as dict
+- `pipeline_name`: if the config contains multiple pipelines, the pipeline_name to load must be set.
+- `overwrite_with_env_variables`: Overwrite the configuration with environment variables. For example,
+to change index name param for an ElasticsearchDocumentStore, an env
+variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
+`_` sign must be used to specify nested hierarchical properties.
+- `strict_version_check`: whether to fail in case of a version mismatch (throws a warning otherwise).
+
+
+
+#### Pipeline.save\_to\_yaml
+
+```python
+def save_to_yaml(path: Path, return_defaults: bool = False)
+```
+
+Save a YAML configuration for the Pipeline that can be used with `Pipeline.load_from_yaml()`.
+
+**Arguments**:
+
+- `path`: path of the output YAML file.
+- `return_defaults`: whether to output parameters that have the default values.
+
+
+
+#### Pipeline.get\_config
+
+```python
+def get_config(return_defaults: bool = False) -> dict
+```
+
+Returns a configuration for the Pipeline that can be used with `Pipeline.load_from_config()`.
+
+**Arguments**:
+
+- `return_defaults`: whether to output parameters that have the default values.
+
+
+
+#### Pipeline.print\_eval\_report
+
+```python
+def print_eval_report(eval_result: EvaluationResult, n_wrong_examples: int = 3, metrics_filter: Optional[Dict[str, List[str]]] = None, document_scope: Literal[
+ "document_id",
+ "context",
+ "document_id_and_context",
+ "document_id_or_context",
+ "answer",
+ "document_id_or_answer",
+ ] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any")
+```
+
+Prints evaluation report containing a metrics funnel and worst queries for further analysis.
+
+**Arguments**:
+
+- `eval_result`: The evaluation result, can be obtained by running eval().
+- `n_wrong_examples`: The number of worst queries to show.
+- `metrics_filter`: The metrics to show per node. If None all metrics will be shown.
+- `document_scope`: A criterion for deciding whether documents are relevant or not.
+You can select between:
+- 'document_id': Specifies that the document ID must match. You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
+ A typical use case is Document Retrieval.
+- 'context': Specifies that the content of the document must match. Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
+ A typical use case is Document-Independent Passage Retrieval.
+- 'document_id_and_context': A Boolean operation specifying that both `'document_id' AND 'context'` must match.
+ A typical use case is Document-Specific Passage Retrieval.
+- 'document_id_or_context': A Boolean operation specifying that either `'document_id' OR 'context'` must match.
+ A typical use case is Document Retrieval having sparse context labels.
+- 'answer': Specifies that the document contents must include the answer. The selected `answer_scope` is enforced automatically.
+ A typical use case is Question Answering.
+- 'document_id_or_answer' (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match.
+ This is intended to be a proper default value in order to support both main use cases:
+ - Document Retrieval
+ - Question Answering
+The default value is 'document_id_or_answer'.
+- `answer_scope`: Specifies the scope in which a matching answer is considered correct.
+You can select between:
+- 'any' (default): Any matching answer is considered correct.
+- 'context': The answer is only considered correct if its context matches as well.
+ Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
+- 'document_id': The answer is only considered correct if its document ID matches as well.
+ You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
+- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
+The default value is 'any'.
+In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
+
+
+
+## \_HaystackBeirRetrieverAdapter
+
+```python
+class _HaystackBeirRetrieverAdapter()
+```
+
+
+
+#### \_HaystackBeirRetrieverAdapter.\_\_init\_\_
+
+```python
+def __init__(index_pipeline: Pipeline, query_pipeline: Pipeline, index_params: dict, query_params: dict)
+```
+
+Adapter mimicking a BEIR retriever used by BEIR's EvaluateRetrieval class to run BEIR evaluations on Haystack Pipelines.
+
+This has nothing to do with Haystack's retriever classes.
+See https://github.com/beir-cellar/beir/blob/main/beir/retrieval/evaluation.py.
+
+**Arguments**:
+
+- `index_pipeline`: The indexing pipeline to use.
+- `query_pipeline`: The query pipeline to evaluate.
+- `index_params`: The params to use during indexing (see pipeline.run's params).
+- `query_params`: The params to use during querying (see pipeline.run's params).
+
+
+
+# Module ray
+
+
+
+## RayPipeline
+
+```python
+class RayPipeline(Pipeline)
+```
+
+[Ray](https://ray.io) is a framework for distributed computing.
+
+With Ray, you can distribute a Pipeline's components across a cluster of machines. The individual components of a
+Pipeline can be independently scaled. For instance, an extractive QA Pipeline deployment can have three replicas
+of the Reader and a single replica for the Retriever. This way, you can use your resources more efficiently by horizontally scaling Components.
+
+To set the number of replicas, add `replicas` in the YAML configuration for the node in a pipeline:
+
+ ```yaml
+ | components:
+ | ...
+ |
+ | pipelines:
+ | - name: ray_query_pipeline
+ | type: RayPipeline
+ | nodes:
+ | - name: ESRetriever
+ | replicas: 2 # number of replicas to create on the Ray cluster
+ | inputs: [ Query ]
+ ```
+
+A Ray Pipeline can only be created with a YAML Pipeline configuration.
+
+```python
+from haystack.pipeline import RayPipeline
+pipeline = RayPipeline.load_from_yaml(path="my_pipelines.yaml", pipeline_name="my_query_pipeline")
+pipeline.run(query="What is the capital of Germany?")
+```
+
+By default, RayPipelines create an instance of RayServe locally. To connect to an existing Ray instance,
+set the `address` parameter when creating the RayPipeline instance.
+
+YAML definitions of Ray pipelines are validated at load. For more information, see [YAML File Definitions](https://haystack-website-git-fork-fstau-dev-287-search-deepset-overnice.vercel.app/components/pipelines#yaml-file-definitions).
+
+
+
+#### RayPipeline.\_\_init\_\_
+
+```python
+def __init__(address: str = None, ray_args: Optional[Dict[str, Any]] = None)
+```
+
+**Arguments**:
+
+- `address`: The IP address for the Ray cluster. If set to `None`, a local Ray instance is started.
+- `ray_args`: Optional parameters for initializing Ray.
+
+
+
+#### RayPipeline.load\_from\_yaml
+
+```python
+@classmethod
+def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, address: Optional[str] = None, strict_version_check: bool = False, ray_args: Optional[Dict[str, Any]] = None)
+```
+
+Load Pipeline from a YAML file defining the individual components and how they're tied together to form
+
+a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must
+be passed.
+
+Here's a sample configuration:
+
+ ```yaml
+ | version: '1.0.0'
+ |
+ | components: # define all the building-blocks for Pipeline
+ | - name: MyReader # custom-name for the component; helpful for visualization & debugging
+ | type: FARMReader # Haystack Class name for the component
+ | params:
+ | no_ans_boost: -10
+ | model_name_or_path: deepset/roberta-base-squad2
+ | - name: MyESRetriever
+ | type: ElasticsearchRetriever
+ | params:
+ | document_store: MyDocumentStore # params can reference other components defined in the YAML
+ | custom_query: null
+ | - name: MyDocumentStore
+ | type: ElasticsearchDocumentStore
+ | params:
+ | index: haystack_test
+ |
+ | pipelines: # multiple Pipelines can be defined using the components from above
+ | - name: my_query_pipeline # a simple extractive-qa Pipeline
+ | type: RayPipeline
+ | nodes:
+ | - name: MyESRetriever
+ | inputs: [Query]
+ | replicas: 2 # number of replicas to create on the Ray cluster
+ | - name: MyReader
+ | inputs: [MyESRetriever]
+ ```
+
+
+Note that, in case of a mismatch in version between Haystack and the YAML, a warning will be printed.
+If the pipeline loads correctly regardless, save the pipeline again using `RayPipeline.save_to_yaml()` to remove the warning.
+
+**Arguments**:
+
+- `path`: path of the YAML file.
+- `pipeline_name`: if the YAML contains multiple pipelines, the pipeline_name to load must be set.
+- `overwrite_with_env_variables`: Overwrite the YAML configuration with environment variables. For example,
+to change index name param for an ElasticsearchDocumentStore, an env
+variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
+`_` sign must be used to specify nested hierarchical properties.
+- `address`: The IP address for the Ray cluster. If set to None, a local Ray instance is started.
+- `strict_version_check`: whether to fail in case of a version mismatch (throws a warning otherwise).
+- `ray_args`: Optional parameters for initializing Ray.
+
+
+
+## \_RayDeploymentWrapper
+
+```python
+class _RayDeploymentWrapper()
+```
+
+Ray Serve supports calling `__init__` methods on classes to create "deployment" instances.
+
+In the case of Haystack, some components like Retrievers have complex `__init__` methods that need objects
+like Document Stores.
+
+This wrapper class encapsulates the initialization of Components. Given a Component Class
+name, it creates an instance using the YAML Pipeline config.
+
+
+
+#### \_RayDeploymentWrapper.\_\_init\_\_
+
+```python
+def __init__(pipeline_config: dict, component_name: str)
+```
+
+Create an instance of Component.
+
+**Arguments**:
+
+- `pipeline_config`: Pipeline YAML parsed as a dict.
+- `component_name`: Component Class name.
+
+
+
+#### \_RayDeploymentWrapper.\_\_call\_\_
+
+```python
+def __call__(*args, **kwargs)
+```
+
+Ray calls this method which is then re-directed to the corresponding component's run().
+
+
+
+#### \_RayDeploymentWrapper.load\_from\_pipeline\_config
+
+```python
+@staticmethod
+def load_from_pipeline_config(pipeline_config: dict, component_name: str)
+```
+
+Load an individual component from a YAML config for Pipelines.
+
+**Arguments**:
+
+- `pipeline_config`: the Pipelines YAML config parsed as a dict.
+- `component_name`: the name of the component to load.
+
+
+
+# Module standard\_pipelines
+
+
+
+## BaseStandardPipeline
+
+```python
+class BaseStandardPipeline(ABC)
+```
+
+Base class for pre-made standard Haystack pipelines.
+This class does not inherit from Pipeline.
+
+
+
+#### BaseStandardPipeline.add\_node
+
+```python
+def add_node(component, name: str, inputs: List[str])
+```
+
+Add a new node to the pipeline.
+
+**Arguments**:
+
+- `component`: The object to be called when the data is passed to the node. It can be a Haystack component
+(like Retriever, Reader, or Generator) or a user-defined object that implements a run()
+method to process incoming data from predecessor node.
+- `name`: The name for the node. It must not contain any dots.
+- `inputs`: A list of inputs to the node. If the predecessor node has a single outgoing edge, just the name
+of node is sufficient. For instance, a 'BM25Retriever' node would always output a single
+edge with a list of documents. It can be represented as ["BM25Retriever"].
+
+In cases when the predecessor node has multiple outputs, e.g., a "QueryClassifier", the output
+must be specified explicitly as "QueryClassifier.output_2".
+
+
+
+#### BaseStandardPipeline.get\_node
+
+```python
+def get_node(name: str)
+```
+
+Get a node from the Pipeline.
+
+**Arguments**:
+
+- `name`: The name of the node.
+
+
+
+#### BaseStandardPipeline.set\_node
+
+```python
+def set_node(name: str, component)
+```
+
+Set the component for a node in the Pipeline.
+
+**Arguments**:
+
+- `name`: The name of the node.
+- `component`: The component object to be set at the node.
+
+
+
+#### BaseStandardPipeline.draw
+
+```python
+def draw(path: Path = Path("pipeline.png"))
+```
+
+Create a Graphviz visualization of the pipeline.
+
+**Arguments**:
+
+- `path`: the path to save the image.
+
+
+
+#### BaseStandardPipeline.save\_to\_yaml
+
+```python
+def save_to_yaml(path: Path, return_defaults: bool = False)
+```
+
+Save a YAML configuration for the Pipeline that can be used with `Pipeline.load_from_yaml()`.
+
+**Arguments**:
+
+- `path`: path of the output YAML file.
+- `return_defaults`: whether to output parameters that have the default values.
+
+
+
+#### BaseStandardPipeline.load\_from\_yaml
+
+```python
+@classmethod
+def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True)
+```
+
+Load Pipeline from a YAML file defining the individual components and how they're tied together to form
+
+a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must
+be passed.
+
+Here's a sample configuration:
+
+ ```yaml
+ | version: '1.0.0'
+ |
+ | components: # define all the building-blocks for Pipeline
+ | - name: MyReader # custom-name for the component; helpful for visualization & debugging
+ | type: FARMReader # Haystack Class name for the component
+ | params:
+ | no_ans_boost: -10
+ | model_name_or_path: deepset/roberta-base-squad2
+ | - name: MyESRetriever
+ | type: BM25Retriever
+ | params:
+ | document_store: MyDocumentStore # params can reference other components defined in the YAML
+ | custom_query: null
+ | - name: MyDocumentStore
+ | type: ElasticsearchDocumentStore
+ | params:
+ | index: haystack_test
+ |
+ | pipelines: # multiple Pipelines can be defined using the components from above
+ | - name: my_query_pipeline # a simple extractive-qa Pipeline
+ | nodes:
+ | - name: MyESRetriever
+ | inputs: [Query]
+ | - name: MyReader
+ | inputs: [MyESRetriever]
+ ```
+
+**Arguments**:
+
+- `path`: path of the YAML file.
+- `pipeline_name`: if the YAML contains multiple pipelines, the pipeline_name to load must be set.
+- `overwrite_with_env_variables`: Overwrite the YAML configuration with environment variables. For example,
+to change index name param for an ElasticsearchDocumentStore, an env
+variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
+`_` sign must be used to specify nested hierarchical properties.
+
+
+
+#### BaseStandardPipeline.get\_nodes\_by\_class
+
+```python
+def get_nodes_by_class(class_type) -> List[Any]
+```
+
+Gets all nodes in the pipeline that are an instance of a certain class (incl. subclasses).
+
+This is for example helpful if you loaded a pipeline and then want to interact directly with the document store.
+Example:
+```python
+| from haystack.document_stores.base import BaseDocumentStore
+| INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME)
+| res = INDEXING_PIPELINE.get_nodes_by_class(class_type=BaseDocumentStore)
+```
+
+**Returns**:
+
+List of components that are an instance of the requested class
+
+
+
+#### BaseStandardPipeline.get\_document\_store
+
+```python
+def get_document_store() -> Optional[BaseDocumentStore]
+```
+
+Return the document store object used in the current pipeline.
+
+**Returns**:
+
+Instance of DocumentStore or None
+
+
+
+#### BaseStandardPipeline.eval
+
+```python
+def eval(labels: List[MultiLabel], params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0) -> EvaluationResult
+```
+
+Evaluates the pipeline by running the pipeline once per query in debug mode
+
+and putting together all data that is needed for evaluation, e.g. calculating metrics.
+
+If you want to calculate SAS (Semantic Answer Similarity) metrics, you have to specify `sas_model_name_or_path`.
+
+You will be able to control the scope within which an answer or a document is considered correct afterwards (See `document_scope` and `answer_scope` params in `EvaluationResult.calculate_metrics()`).
+Some of these scopes require additional information that already needs to be specified during `eval()`:
+- `custom_document_id_field` param to select a custom document ID from document's meta data for ID matching (only affects 'document_id' scopes)
+- `context_matching_...` param to fine-tune the fuzzy matching mechanism that determines whether some text contexts match each other (only affects 'context' scopes, default values should work most of the time)
+
+**Arguments**:
+
+- `labels`: The labels to evaluate on
+- `params`: Params for the `retriever` and `reader`. For instance,
+params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
+- `sas_model_name_or_path`: SentenceTransformers semantic textual similarity model to be used for sas value calculation,
+should be path or string pointing to downloadable models.
+- `sas_batch_size`: Number of prediction label pairs to encode at once by CrossEncoder or SentenceTransformer while calculating SAS.
+- `sas_use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity.
+Falls back to CPU if no GPU is available.
+- `add_isolated_node_eval`: Whether to additionally evaluate the reader based on labels as input instead of output of previous node in pipeline
+- `custom_document_id_field`: Custom field name within `Document`'s `meta` which identifies the document and is being used as criterion for matching documents to labels during evaluation.
+This is especially useful if you want to match documents on other criteria (e.g. file names) than the default document ids as these could be heavily influenced by preprocessing.
+If not set (default) the `Document`'s `id` is being used as criterion for matching documents to labels.
+- `context_matching_min_length`: The minimum string length context and candidate need to have in order to be scored.
+Returns 0.0 otherwise.
+- `context_matching_boost_split_overlaps`: Whether to boost split overlaps (e.g. [AB] <-> [BC]) that result from different preprocessing params.
+If we detect that the score is near a half match and the matching part of the candidate is at its boundaries
+we cut the context on the same side, recalculate the score and take the mean of both.
+Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scoring ~75 in total.
+- `context_matching_threshold`: Score threshold that candidates must surpass to be included into the result list. Range: [0,100]
+
+
+
+#### BaseStandardPipeline.print\_eval\_report
+
+```python
+def print_eval_report(eval_result: EvaluationResult, n_wrong_examples: int = 3, metrics_filter: Optional[Dict[str, List[str]]] = None, document_scope: Literal[
+ "document_id",
+ "context",
+ "document_id_and_context",
+ "document_id_or_context",
+ "answer",
+ "document_id_or_answer",
+ ] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any")
+```
+
+Prints evaluation report containing a metrics funnel and worst queries for further analysis.
+
+**Arguments**:
+
+- `eval_result`: The evaluation result, can be obtained by running eval().
+- `n_wrong_examples`: The number of worst queries to show.
+- `metrics_filter`: The metrics to show per node. If None all metrics will be shown.
+- `document_scope`: A criterion for deciding whether documents are relevant or not.
+You can select between:
+- 'document_id': Specifies that the document ID must match. You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
+ A typical use case is Document Retrieval.
+- 'context': Specifies that the content of the document must match. Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
+ A typical use case is Document-Independent Passage Retrieval.
+- 'document_id_and_context': A Boolean operation specifying that both `'document_id' AND 'context'` must match.
+ A typical use case is Document-Specific Passage Retrieval.
+- 'document_id_or_context': A Boolean operation specifying that either `'document_id' OR 'context'` must match.
+ A typical use case is Document Retrieval having sparse context labels.
+- 'answer': Specifies that the document contents must include the answer. The selected `answer_scope` is enforced automatically.
+ A typical use case is Question Answering.
+- 'document_id_or_answer' (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match.
+ This is intended to be a proper default value in order to support both main use cases:
+ - Document Retrieval
+ - Question Answering
+The default value is 'document_id_or_answer'.
+- `answer_scope`: Specifies the scope in which a matching answer is considered correct.
+You can select between:
+- 'any' (default): Any matching answer is considered correct.
+- 'context': The answer is only considered correct if its context matches as well.
+ Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
+- 'document_id': The answer is only considered correct if its document ID matches as well.
+ You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
+- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
+The default value is 'any'.
+In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
+
+
+
+#### BaseStandardPipeline.run\_batch
+
+```python
+def run_batch(queries: List[str], params: Optional[dict] = None, debug: Optional[bool] = None)
+```
+
+Run a batch of queries through the pipeline.
+
+**Arguments**:
+
+- `queries`: List of query strings.
+- `params`: Parameters for the individual nodes of the pipeline. For instance,
+`params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}`
+- `debug`: Whether the pipeline should instruct nodes to collect debug information
+about their execution. By default these include the input parameters
+they received and the output they generated.
+All debug information can then be found in the dict returned
+by this method under the key "_debug"
+
+
+
+## ExtractiveQAPipeline
+
+```python
+class ExtractiveQAPipeline(BaseStandardPipeline)
+```
+
+Pipeline for Extractive Question Answering.
+
+
+
+#### ExtractiveQAPipeline.\_\_init\_\_
+
+```python
+def __init__(reader: BaseReader, retriever: BaseRetriever)
+```
+
+**Arguments**:
+
+- `reader`: Reader instance
+- `retriever`: Retriever instance
+
+
+
+#### ExtractiveQAPipeline.run
+
+```python
+def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None)
+```
+
+**Arguments**:
+
+- `query`: The search query string.
+- `params`: Params for the `retriever` and `reader`. For instance,
+params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
+- `debug`: Whether the pipeline should instruct nodes to collect debug information
+about their execution. By default these include the input parameters
+they received and the output they generated.
+All debug information can then be found in the dict returned
+by this method under the key "_debug"
+
+
+
+## DocumentSearchPipeline
+
+```python
+class DocumentSearchPipeline(BaseStandardPipeline)
+```
+
+Pipeline for semantic document search.
+
+
+
+#### DocumentSearchPipeline.\_\_init\_\_
+
+```python
+def __init__(retriever: BaseRetriever)
+```
+
+**Arguments**:
+
+- `retriever`: Retriever instance
+
+
+
+#### DocumentSearchPipeline.run
+
+```python
+def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None)
+```
+
+**Arguments**:
+
+- `query`: the query string.
+- `params`: params for the `retriever` and `reader`. For instance, params={"Retriever": {"top_k": 10}}
+- `debug`: Whether the pipeline should instruct nodes to collect debug information
+about their execution. By default these include the input parameters
+they received and the output they generated.
+All debug information can then be found in the dict returned
+by this method under the key "_debug"
+
+
+
+## GenerativeQAPipeline
+
+```python
+class GenerativeQAPipeline(BaseStandardPipeline)
+```
+
+Pipeline for Generative Question Answering.
+
+
+
+#### GenerativeQAPipeline.\_\_init\_\_
+
+```python
+def __init__(generator: BaseGenerator, retriever: BaseRetriever)
+```
+
+**Arguments**:
+
+- `generator`: Generator instance
+- `retriever`: Retriever instance
+
+
+
+#### GenerativeQAPipeline.run
+
+```python
+def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None)
+```
+
+**Arguments**:
+
+- `query`: the query string.
+- `params`: params for the `retriever` and `generator`. For instance,
+params={"Retriever": {"top_k": 10}, "Generator": {"top_k": 5}}
+- `debug`: Whether the pipeline should instruct nodes to collect debug information
+about their execution. By default these include the input parameters
+they received and the output they generated.
+All debug information can then be found in the dict returned
+by this method under the key "_debug"
+
+
+
+## SearchSummarizationPipeline
+
+```python
+class SearchSummarizationPipeline(BaseStandardPipeline)
+```
+
+Pipeline that retrieves documents for a query and then summarizes those documents.
+
+
+
+#### SearchSummarizationPipeline.\_\_init\_\_
+
+```python
+def __init__(summarizer: BaseSummarizer, retriever: BaseRetriever, return_in_answer_format: bool = False)
+```
+
+**Arguments**:
+
+- `summarizer`: Summarizer instance
+- `retriever`: Retriever instance
+- `return_in_answer_format`: Whether the results should be returned as documents (False) or in the answer
+format used in other QA pipelines (True). With the latter, you can use this
+pipeline as a "drop-in replacement" for other QA pipelines.
+
+
+
+#### SearchSummarizationPipeline.run
+
+```python
+def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None)
+```
+
+**Arguments**:
+
+- `query`: the query string.
+- `params`: params for the `retriever` and `summarizer`. For instance,
+params={"Retriever": {"top_k": 10}, "Summarizer": {"generate_single_summary": True}}
+- `debug`: Whether the pipeline should instruct nodes to collect debug information
+about their execution. By default these include the input parameters
+they received and the output they generated.
+All debug information can then be found in the dict returned
+by this method under the key "_debug"
+
+
+
+#### SearchSummarizationPipeline.run\_batch
+
+```python
+def run_batch(queries: List[str], params: Optional[dict] = None, debug: Optional[bool] = None)
+```
+
+Run a batch of queries through the pipeline.
+
+**Arguments**:
+
+- `queries`: List of query strings.
+- `params`: Parameters for the individual nodes of the pipeline. For instance,
+`params={"Retriever": {"top_k": 10}, "Summarizer": {"generate_single_summary": True}}`
+- `debug`: Whether the pipeline should instruct nodes to collect debug information
+about their execution. By default these include the input parameters
+they received and the output they generated.
+All debug information can then be found in the dict returned
+by this method under the key "_debug"
+
+
+
+## FAQPipeline
+
+```python
+class FAQPipeline(BaseStandardPipeline)
+```
+
+Pipeline for finding similar FAQs using semantic document search.
+
+
+
+#### FAQPipeline.\_\_init\_\_
+
+```python
+def __init__(retriever: BaseRetriever)
+```
+
+**Arguments**:
+
+- `retriever`: Retriever instance
+
+
+
+#### FAQPipeline.run
+
+```python
+def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None)
+```
+
+**Arguments**:
+
+- `query`: the query string.
+- `params`: params for the `retriever`. For instance, params={"Retriever": {"top_k": 10}}
+- `debug`: Whether the pipeline should instruct nodes to collect debug information
+about their execution. By default these include the input parameters
+they received and the output they generated.
+All debug information can then be found in the dict returned
+by this method under the key "_debug"
+
+
+
+## TranslationWrapperPipeline
+
+```python
+class TranslationWrapperPipeline(BaseStandardPipeline)
+```
+
+Takes an existing search pipeline and adds one "input translation node" after the Query and one
+"output translation" node just before returning the results
+
+
+
+#### TranslationWrapperPipeline.\_\_init\_\_
+
+```python
+def __init__(input_translator: BaseTranslator, output_translator: BaseTranslator, pipeline: BaseStandardPipeline)
+```
+
+Wrap a given `pipeline` with the `input_translator` and `output_translator`.
+
+**Arguments**:
+
+- `input_translator`: A Translator node that shall translate the input query from language A to B
+- `output_translator`: A Translator node that shall translate the pipeline results from language B to A
+- `pipeline`: The pipeline object (e.g. ExtractiveQAPipeline) you want to "wrap".
+Note that pipelines with split or merge nodes are currently not supported.
+
+
+
+## QuestionGenerationPipeline
+
+```python
+class QuestionGenerationPipeline(BaseStandardPipeline)
+```
+
+A simple pipeline that takes documents as input and generates
+questions that it thinks can be answered by the documents.
+
+
+
+## RetrieverQuestionGenerationPipeline
+
+```python
+class RetrieverQuestionGenerationPipeline(BaseStandardPipeline)
+```
+
+A simple pipeline that takes a query as input, performs retrieval, and then generates
+questions that it thinks can be answered by the retrieved documents.
+
+
+
+## QuestionAnswerGenerationPipeline
+
+```python
+class QuestionAnswerGenerationPipeline(BaseStandardPipeline)
+```
+
+This is a pipeline which takes a document as input, generates questions that the model thinks can be answered by
+this document, and then performs question answering of these questions using that single document.
+
+
+
+## MostSimilarDocumentsPipeline
+
+```python
+class MostSimilarDocumentsPipeline(BaseStandardPipeline)
+```
+
+
+
+#### MostSimilarDocumentsPipeline.\_\_init\_\_
+
+```python
+def __init__(document_store: BaseDocumentStore)
+```
+
+Initialize a Pipeline for finding the most similar documents to a given document.
+
+This pipeline can be helpful if you already show a relevant document to your end users and they want to search for just similar ones.
+
+**Arguments**:
+
+- `document_store`: Document Store instance with already stored embeddings.
+
+
+
+#### MostSimilarDocumentsPipeline.run
+
+```python
+def run(document_ids: List[str], top_k: int = 5)
+```
+
+**Arguments**:
+
+- `document_ids`: document ids
+- `top_k`: How many document IDs to return for each input document
+
+
+
+#### MostSimilarDocumentsPipeline.run\_batch
+
+```python
+def run_batch(document_ids: List[str], top_k: int = 5)
+```
+
+**Arguments**:
+
+- `document_ids`: document ids
+- `top_k`: How many document IDs to return for each input document
+
diff --git a/docs/v1.5.0/_src/api/api/preprocessor.md b/docs/v1.5.0/_src/api/api/preprocessor.md
new file mode 100644
index 0000000000..852099be92
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/preprocessor.md
@@ -0,0 +1,104 @@
+
+
+# Module base
+
+
+
+## BasePreProcessor
+
+```python
+class BasePreProcessor(BaseComponent)
+```
+
+
+
+#### BasePreProcessor.process
+
+```python
+@abstractmethod
+def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[Document]
+```
+
+Perform document cleaning and splitting. Takes a single Document or a List of Documents as input and returns a
+list of Documents.
+
+
+
+# Module preprocessor
+
+
+
+## PreProcessor
+
+```python
+class PreProcessor(BasePreProcessor)
+```
+
+
+
+#### PreProcessor.\_\_init\_\_
+
+```python
+def __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, language: str = "en", id_hash_keys: Optional[List[str]] = None)
+```
+
+**Arguments**:
+
+- `clean_header_footer`: Use heuristic to remove footers and headers across different pages by searching
+for the longest common string. This heuristic uses exact matches and therefore
+works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
+or similar.
+- `clean_whitespace`: Strip whitespaces before or after each line in the text.
+- `clean_empty_lines`: Remove more than two empty lines in the text.
+- `remove_substrings`: Remove specified substrings from the text.
+- `split_by`: Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
+- `split_length`: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if n -> 10 & split_by ->
+"sentence", then each output document will have 10 sentences.
+- `split_overlap`: Word overlap between two adjacent documents after a split.
+Setting this to a positive number essentially enables the sliding window approach.
+For example, if split_by -> `word`,
+split_length -> 5 & split_overlap -> 2, then the splits would be like:
+[w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w9 w10 w11].
+Set the value to 0 to ensure there is no overlap among the documents after splitting.
+- `split_respect_sentence_boundary`: Whether to split in partial sentences if split_by -> `word`. If set
+to True, the individual split will always have complete sentences &
+the number of words will be <= split_length.
+- `language`: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more.
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+
+
+#### PreProcessor.process
+
+```python
+def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, remove_substrings: List[str] = [], split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents.
+
+
+
+#### PreProcessor.clean
+
+```python
+def clean(document: Union[dict, Document], clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool, remove_substrings: List[str], id_hash_keys: Optional[List[str]] = None) -> Document
+```
+
+Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
+and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__().
+
+
+
+#### PreProcessor.split
+
+```python
+def split(document: Union[dict, Document], split_by: str, split_length: int, split_overlap: int, split_respect_sentence_boundary: bool, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+Perform document splitting on a single document. This method can split on different units, at different lengths,
+with different strides. It can also respect sentence boundaries. Its exact functionality is defined by
+the parameters passed into PreProcessor.__init__(). Takes a single document as input and returns a list of documents.
+
diff --git a/docs/v1.5.0/_src/api/api/primitives.md b/docs/v1.5.0/_src/api/api/primitives.md
new file mode 100644
index 0000000000..65d6b60369
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/primitives.md
@@ -0,0 +1,517 @@
+
+
+# Module schema
+
+
+
+## Document
+
+```python
+@dataclass
+class Document()
+```
+
+
+
+#### Document.\_\_init\_\_
+
+```python
+def __init__(content: Union[str, pd.DataFrame], content_type: Literal["text", "table", "image"] = "text", id: Optional[str] = None, score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, id_hash_keys: Optional[List[str]] = None)
+```
+
+One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack.
+
+Documents are stored in DocumentStores, are returned by Retrievers, are the input for Readers and are used in
+many other places that manipulate or interact with document-level data.
+
+Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text
+into smaller passages. We'll have one Document per passage in this case.
+
+Each document has a unique ID. This can be supplied by the user or generated automatically.
+It's particularly helpful for handling of duplicates and referencing documents in other objects (e.g. Labels)
+
+There's an easy option to convert from/to dicts via `from_dict()` and `to_dict`.
+
+**Arguments**:
+
+- `content`: Content of the document. For most cases, this will be text, but it can be a table or image.
+- `content_type`: One of "text", "table" or "image". Haystack components can use this to adjust their
+handling of Documents and check compatibility.
+- `id`: Unique ID for the document. If not supplied by the user, we'll generate one automatically by
+creating a hash from the supplied text. This behaviour can be further adjusted by `id_hash_keys`.
+- `score`: The relevance score of the Document determined by a model (e.g. Retriever or Re-Ranker).
+If model's `scale_score` was set to True (default) score is in the unit interval (range of [0,1]), where 1 means extremely relevant.
+- `meta`: Meta fields for a document like name, url, or author in the form of a custom dict (any keys and values allowed).
+- `embedding`: Vector encoding of the text
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's attributes.
+If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. "meta" to this field (e.g. ["content", "meta"]).
+In this case the id will be generated by using the content and the defined metadata.
+
+
+
+#### Document.to\_dict
+
+```python
+def to_dict(field_map={}) -> Dict
+```
+
+Convert Document to dict. An optional field_map can be supplied to change the names of the keys in the
+
+resulting dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
+they are serialized / stored in other places (e.g. elasticsearch)
+Example:
+| doc = Document(content="some text", content_type="text")
+| doc.to_dict(field_map={"custom_content_field": "content"})
+| >>> {"custom_content_field": "some text", "content_type": "text"}
+
+**Arguments**:
+
+- `field_map`: Dict with keys being the custom target keys and values being the standard Document attributes
+
+**Returns**:
+
+dict with content of the Document
+
+
+
+#### Document.from\_dict
+
+```python
+@classmethod
+def from_dict(cls, dict, field_map={}, id_hash_keys=None)
+```
+
+Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the
+
+input dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
+they are serialized / stored in other places (e.g. elasticsearch)
+Example:
+| my_dict = {"custom_content_field": "some text", "content_type": "text"}
+| Document.from_dict(my_dict, field_map={"custom_content_field": "content"})
+
+**Arguments**:
+
+- `field_map`: Dict with keys being the custom target keys and values being the standard Document attributes
+
+**Returns**:
+
+dict with content of the Document
+
+
+
+#### Document.\_\_lt\_\_
+
+```python
+def __lt__(other)
+```
+
+Enable sorting of Documents by score
+
+
+
+## Span
+
+```python
+@dataclass
+class Span()
+```
+
+
+
+#### end
+
+Defining a sequence of characters (Text span) or cells (Table span) via start and end index.
+
+For extractive QA: Character where answer starts/ends
+For TableQA: Cell where the answer starts/ends (counted from top left to bottom right of table)
+
+**Arguments**:
+
+- `start`: Position where the span starts
+- `end`: Position where the span ends
+
+
+
+## Answer
+
+```python
+@dataclass
+class Answer()
+```
+
+
+
+#### meta
+
+The fundamental object in Haystack to represent any type of Answers (e.g. extractive QA, generative QA or TableQA).
+
+For example, it's used within some Nodes like the Reader, but also in the REST API.
+
+**Arguments**:
+
+- `answer`: The answer string. If there's no possible answer (aka "no_answer" or "is_impossible") this will be an empty string.
+- `type`: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model
+(i.e. we can locate an exact answer string in one of the documents) or from a generative model
+(i.e. no pointer to a specific document, no offsets ...).
+- `score`: The relevance score of the Answer determined by a model (e.g. Reader or Generator).
+In the range of [0,1], where 1 means extremely relevant.
+- `context`: The related content that was used to create the answer (i.e. a text passage, part of a table, image ...)
+- `offsets_in_document`: List of `Span` objects with start and end positions of the answer **in the
+document** (as stored in the document store).
+For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start`
+For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start`
+(Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
+- `offsets_in_context`: List of `Span` objects with start and end positions of the answer **in the
+context** (i.e. the surrounding text/table of a certain window size).
+For extractive QA: Character where answer starts => `Answer.offsets_in_context[0].start`
+For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_context[0].start`
+(Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
+- `document_id`: ID of the document that the answer was located it (if any)
+- `meta`: Dict that can be used to associate any kind of custom meta data with the answer.
+In extractive QA, this will carry the meta data of the document where the answer was found.
+
+
+
+#### Answer.\_\_lt\_\_
+
+```python
+def __lt__(other)
+```
+
+Enable sorting of Answers by score
+
+
+
+## Label
+
+```python
+@dataclass
+class Label()
+```
+
+
+
+#### Label.\_\_init\_\_
+
+```python
+def __init__(query: str, document: Document, is_correct_answer: bool, is_correct_document: bool, origin: Literal["user-feedback", "gold-label"], answer: Optional[Answer], id: Optional[str] = None, no_answer: Optional[bool] = None, pipeline_id: Optional[str] = None, created_at: Optional[str] = None, updated_at: Optional[str] = None, meta: Optional[dict] = None, filters: Optional[dict] = None)
+```
+
+Object used to represent label/feedback in a standardized way within Haystack.
+
+This includes labels from dataset like SQuAD, annotations from labeling tools,
+or, user-feedback from the Haystack REST API.
+
+**Arguments**:
+
+- `query`: the question (or query) for finding answers.
+- `document`:
+- `answer`: the answer object.
+- `is_correct_answer`: whether the sample is positive or negative.
+- `is_correct_document`: in case of negative sample(is_correct_answer is False), there could be two cases;
+incorrect answer but correct document & incorrect document. This flag denotes if
+the returned document was correct.
+- `origin`: the source for the labels. It can be used later for filtering.
+- `id`: Unique ID used within the DocumentStore. If not supplied, a uuid will be generated automatically.
+- `no_answer`: whether the question is unanswerable.
+- `pipeline_id`: pipeline identifier (any str) that was involved for generating this label (in-case of user feedback).
+- `created_at`: Timestamp of creation with format yyyy-MM-dd HH:mm:ss.
+Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S").
+- `updated_at`: Timestamp of the last update with format yyyy-MM-dd HH:mm:ss.
+Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S")
+- `meta`: Meta fields like "annotator_name" in the form of a custom dict (any keys and values allowed).
+- `filters`: filters that should be applied to the query to rule out non-relevant documents. For example, if there are different correct answers
+in a DocumentStore depending on the retrieved document and the answer in this label is correct only on condition of the filters.
+
+
+
+## MultiLabel
+
+```python
+@dataclass
+class MultiLabel()
+```
+
+
+
+#### MultiLabel.\_\_init\_\_
+
+```python
+def __init__(labels: List[Label], drop_negative_labels=False, drop_no_answers=False)
+```
+
+There are often multiple `Labels` associated with a single query. For example, there can be multiple annotated
+
+answers for one question or multiple documents contain the information you want for a query.
+This class is "syntactic sugar" that simplifies the work with such a list of related Labels.
+It stores the original labels in MultiLabel.labels and provides additional aggregated attributes that are
+automatically created at init time. For example, MultiLabel.no_answer allows you to easily access if any of the
+underlying Labels provided a text answer and therefore demonstrates that there is indeed a possible answer.
+
+**Arguments**:
+
+- `labels`: A list of labels that belong to a similar query and shall be "grouped" together
+- `drop_negative_labels`: Whether to drop negative labels from that group (e.g. thumbs down feedback from UI)
+- `drop_no_answers`: Whether to drop labels that specify the answer is impossible
+
+
+
+## EvaluationResult
+
+```python
+class EvaluationResult()
+```
+
+
+
+#### EvaluationResult.\_\_init\_\_
+
+```python
+def __init__(node_results: Dict[str, pd.DataFrame] = None) -> None
+```
+
+A convenience class to store, pass, and interact with results of a pipeline evaluation run (for example `pipeline.eval()`).
+
+Detailed results are stored as one dataframe per node. This class makes them more accessible and provides
+convenience methods to work with them.
+For example, you can calculate eval metrics, get detailed reports, or simulate different top_k settings:
+
+```python
+| eval_results = pipeline.eval(...)
+|
+| # derive detailed metrics
+| eval_results.calculate_metrics()
+|
+| # show summary of incorrect queries
+| eval_results.wrong_examples()
+```
+
+Each row of the underlying DataFrames contains either an answer or a document that has been retrieved during evaluation.
+Rows are enriched with basic information like rank, query, type, or node.
+Additional answer or document-specific evaluation information, like gold labels
+and metrics showing whether the row matches the gold labels, are included, too.
+The DataFrames have the following schema:
+- multilabel_id: The ID of the multilabel, which is unique for the pair of query and filters.
+- query: The actual query string.
+- filters: The filters used with the query.
+- gold_answers (answers only): The expected answers.
+- answer (answers only): The actual answer.
+- context: The content of the document (the surrounding context of the answer for QA).
+- exact_match (answers only): A metric showing if the answer exactly matches the gold label.
+- f1 (answers only): A metric showing how well the answer overlaps with the gold label on a token basis.
+- sas (answers only, optional): A metric showing how well the answer matches the gold label on a semantic basis.
+- exact_match_context_scope (answers only): exact_match with enforced context match.
+- f1_context_scope (answers only): f1 with enforced context scope match.
+- sas_context_scope (answers only): sas with enforced context scope match.
+- exact_match_document_scope (answers only): exact_match with enforced document scope match.
+- f1_document_scope (answers only): f1 with enforced document scope match.
+- sas_document_scope (answers only): sas with enforced document scope match.
+- exact_match_document_id_and_context_scope: (answers only): exact_match with enforced document and context scope match.
+- f1_document_id_and_context_scope (answers only): f1 with enforced document and context scope match.
+- sas_document_id_and_context_scope (answers only): sas with enforced document and context scope match.
+- gold_contexts: The contents of the gold documents.
+- gold_id_match (documents only): A metric showing whether one of the gold document IDs matches the document.
+- context_match (documents only): A metric showing whether one of the gold contexts matches the document content.
+- answer_match (documents only): A metric showing whether the document contains the answer.
+- gold_id_or_answer_match (documents only): A Boolean operation specifying that there should be either `'gold_id_match' OR 'answer_match'`.
+- gold_id_and_answer_match (documents only): A Boolean operation specifying that there should be both `'gold_id_match' AND 'answer_match'`.
+- gold_id_or_context_match (documents only): A Boolean operation specifying that there should be either `'gold_id_match' OR 'context_match'`.
+- gold_id_and_context_match (documents only): A Boolean operation specifying that there should be both `'gold_id_match' AND 'context_match'`.
+- gold_id_and_context_and_answer_match (documents only): A Boolean operation specifying that there should be `'gold_id_match' AND 'context_match' AND 'answer_match'`.
+- context_and_answer_match (documents only): A Boolean operation specifying that there should be both `'context_match' AND 'answer_match'`.
+- rank: A rank or 1-based-position in the result list.
+- document_id: The ID of the document that has been retrieved or that contained the answer.
+- gold_document_ids: The IDs of the documents to be retrieved.
+- custom_document_id: The custom ID of the document (specified by `custom_document_id_field`) that has been retrieved or that contained the answer.
+- gold_custom_document_ids: The custom documents IDs (specified by `custom_document_id_field`) to be retrieved.
+- offsets_in_document (answers only): The position or offsets within the document where the answer was found.
+- gold_offsets_in_documents (answers only): The position or offsets of the gold answer within the document.
+- gold_answers_exact_match (answers only): exact_match values per gold_answer.
+- gold_answers_f1 (answers only): f1 values per gold_answer.
+- gold_answers_sas (answers only): sas values per gold answer.
+- gold_documents_id_match: The document ID match per gold label (if `custom_document_id_field` has been specified, custom IDs are used).
+- gold_contexts_similarity: Context similarity per gold label.
+- gold_answers_match (documents only): Specifies whether the document contains an answer per gold label.
+- type: Possible values: 'answer' or 'document'.
+- node: The node name
+- eval_mode: Specifies whether the evaluation was executed in integrated or isolated mode.
+ Check pipeline.eval()'s add_isolated_node_eval parameter for more information.
+
+**Arguments**:
+
+- `node_results`: The evaluation Dataframes per pipeline node.
+
+
+
+#### EvaluationResult.calculate\_metrics
+
+```python
+def calculate_metrics(simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, document_scope: Literal[
+ "document_id",
+ "context",
+ "document_id_and_context",
+ "document_id_or_context",
+ "answer",
+ "document_id_or_answer",
+ ] = "document_id_or_answer", eval_mode: Literal["integrated", "isolated"] = "integrated", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any") -> Dict[str, Dict[str, float]]
+```
+
+Calculates proper metrics for each node.
+
+For Nodes that return Documents, the default metrics are:
+- mrr (Mean Reciprocal Rank)
+- map (Mean Average Precision)
+- ndcg (Normalized Discounted Cumulative Gain)
+- precision (Precision: How many of the returned documents were relevant?)
+- recall_multi_hit (Recall according to Information Retrieval definition: How many of the relevant documents were retrieved per query?)
+- recall_single_hit (Recall for Question Answering: How many of the queries returned at least one relevant document?)
+
+For Nodes that return answers, the default metrics are:
+- exact_match (How many of the queries returned the exact answer?)
+- f1 (How well do the returned results overlap with any gold answer on a token basis?)
+- sas, if a SAS model has been provided when calling `pipeline.eval()` (How semantically similar is the prediction to the gold answers?)
+
+During the eval run, you can simulate lower top_k values for Reader and Retriever than the actual values.
+For example, you can calculate `top_1_f1` for Reader nodes by setting `simulated_top_k_reader=1`.
+
+If you applied `simulated_top_k_retriever` to a Reader node, you should treat the results with caution as they can differ from an actual eval run with a corresponding `top_k_retriever` heavily.
+
+**Arguments**:
+
+- `simulated_top_k_reader`: Simulates the `top_k` parameter of the Reader.
+- `simulated_top_k_retriever`: Simulates the `top_k` parameter of the Retriever.
+Note: There might be a discrepancy between simulated Reader metrics and an actual Pipeline run with Retriever `top_k`.
+- `eval_mode`: The input the Node was evaluated on.
+Usually a Node gets evaluated on the prediction provided by its predecessor Nodes in the Pipeline (`value='integrated'`).
+However, as the quality of the Node can heavily depend on the Node's input and thus the predecessor's quality,
+you might want to simulate a perfect predecessor in order to get an independent upper bound of the quality of your Node.
+For example, when evaluating the Reader, use `value='isolated'` to simulate a perfect Retriever in an ExtractiveQAPipeline.
+Possible values are: `integrated`, `isolated`.
+The default value is `integrated`.
+- `document_scope`: A criterion for deciding whether documents are relevant or not.
+You can select between:
+- 'document_id': Specifies that the document ID must match. You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
+ A typical use case is Document Retrieval.
+- 'context': Specifies that the content of the document must match. Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
+ A typical use case is Document-Independent Passage Retrieval.
+- 'document_id_and_context': A Boolean operation specifying that both `'document_id' AND 'context'` must match.
+ A typical use case is Document-Specific Passage Retrieval.
+- 'document_id_or_context': A Boolean operation specifying that either `'document_id' OR 'context'` must match.
+ A typical use case is Document Retrieval having sparse context labels.
+- 'answer': Specifies that the document contents must include the answer. The selected `answer_scope` is enforced automatically.
+ A typical use case is Question Answering.
+- 'document_id_or_answer' (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match.
+ This is intended to be a proper default value in order to support both main use cases:
+ - Document Retrieval
+ - Question Answering
+The default value is 'document_id_or_answer'.
+- `answer_scope`: Specifies the scope in which a matching answer is considered correct.
+You can select between:
+- 'any' (default): Any matching answer is considered correct.
+- 'context': The answer is only considered correct if its context matches as well.
+ Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
+- 'document_id': The answer is only considered correct if its document ID matches as well.
+ You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
+- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
+The default value is 'any'.
+In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
+
+
+
+#### EvaluationResult.wrong\_examples
+
+```python
+def wrong_examples(node: str, n: int = 3, simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, document_scope: Literal[
+ "document_id",
+ "context",
+ "document_id_and_context",
+ "document_id_or_context",
+ "answer",
+ "document_id_or_answer",
+ ] = "document_id_or_answer", document_metric: str = "recall_single_hit", answer_metric: str = "f1", eval_mode: Literal["integrated", "isolated"] = "integrated", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any") -> List[Dict]
+```
+
+Returns the worst performing queries.
+
+Worst performing queries are calculated based on the metric
+that is either a document metric or an answer metric according to the node type.
+
+Lower top_k values for reader and retriever than the actual values during the eval run can be simulated.
+See calculate_metrics() for more information.
+
+**Arguments**:
+
+- `simulated_top_k_reader`: simulates top_k param of reader
+- `simulated_top_k_retriever`: simulates top_k param of retriever.
+remarks: there might be a discrepancy between simulated reader metrics and an actual pipeline run with retriever top_k
+- `document_metric`: the document metric worst queries are calculated with.
+values can be: 'recall_single_hit', 'recall_multi_hit', 'mrr', 'map', 'precision'
+- `answer_metric`: the answer metric worst queries are calculated with.
+values can be: 'f1', 'exact_match' and 'sas' if the evaluation was made using a SAS model.
+- `eval_mode`: the input on which the node was evaluated on.
+Usually nodes get evaluated on the prediction provided by its predecessor nodes in the pipeline (value='integrated').
+However, as the quality of the node itself can heavily depend on the node's input and thus the predecessor's quality,
+you might want to simulate a perfect predecessor in order to get an independent upper bound of the quality of your node.
+For example when evaluating the reader use value='isolated' to simulate a perfect retriever in an ExtractiveQAPipeline.
+Values can be 'integrated', 'isolated'.
+Default value is 'integrated'.
+- `document_scope`: A criterion for deciding whether documents are relevant or not.
+You can select between:
+- 'document_id': Specifies that the document ID must match. You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
+ A typical use case is Document Retrieval.
+- 'context': Specifies that the content of the document must match. Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
+ A typical use case is Document-Independent Passage Retrieval.
+- 'document_id_and_context': A Boolean operation specifying that both `'document_id' AND 'context'` must match.
+ A typical use case is Document-Specific Passage Retrieval.
+- 'document_id_or_context': A Boolean operation specifying that either `'document_id' OR 'context'` must match.
+ A typical use case is Document Retrieval having sparse context labels.
+- 'answer': Specifies that the document contents must include the answer. The selected `answer_scope` is enforced automatically.
+ A typical use case is Question Answering.
+- 'document_id_or_answer' (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match.
+ This is intended to be a proper default value in order to support both main use cases:
+ - Document Retrieval
+ - Question Answering
+The default value is 'document_id_or_answer'.
+- `answer_scope`: Specifies the scope in which a matching answer is considered correct.
+You can select between:
+- 'any' (default): Any matching answer is considered correct.
+- 'context': The answer is only considered correct if its context matches as well.
+ Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
+- 'document_id': The answer is only considered correct if its document ID matches as well.
+ You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
+- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
+The default value is 'any'.
+In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
+
+
+
+#### EvaluationResult.save
+
+```python
+def save(out_dir: Union[str, Path])
+```
+
+Saves the evaluation result.
+
+The result of each node is saved in a separate csv with file name {node_name}.csv to the out_dir folder.
+
+**Arguments**:
+
+- `out_dir`: Path to the target folder the csvs will be saved.
+
+
+
+#### EvaluationResult.load
+
+```python
+@classmethod
+def load(cls, load_dir: Union[str, Path])
+```
+
+Loads the evaluation result from disk. Expects one csv file per node. See save() for further information.
+
+**Arguments**:
+
+- `load_dir`: The directory containing the csv files.
+
diff --git a/docs/v1.5.0/_src/api/api/query_classifier.md b/docs/v1.5.0/_src/api/api/query_classifier.md
new file mode 100644
index 0000000000..34b9fade19
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/query_classifier.md
@@ -0,0 +1,152 @@
+
+
+# Module base
+
+
+
+## BaseQueryClassifier
+
+```python
+class BaseQueryClassifier(BaseComponent)
+```
+
+Abstract class for Query Classifiers
+
+
+
+# Module sklearn
+
+
+
+## SklearnQueryClassifier
+
+```python
+class SklearnQueryClassifier(BaseQueryClassifier)
+```
+
+A node to classify an incoming query into one of two categories using a lightweight sklearn model. Depending on the result, the query flows to a different branch in your pipeline
+and the further processing can be customized. You can define this by connecting the further pipeline to either `output_1` or `output_2` from this node.
+
+**Example**:
+
+ ```python
+ |{
+ |pipe = Pipeline()
+ |pipe.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"])
+ |pipe.add_node(component=elastic_retriever, name="ElasticRetriever", inputs=["QueryClassifier.output_2"])
+ |pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
+
+ |# Keyword queries will use the ElasticRetriever
+ |pipe.run("kubernetes aws")
+
+ |# Semantic queries (questions, statements, sentences ...) will leverage the DPR retriever
+ |pipe.run("How to manage kubernetes on aws")
+
+ ```
+
+ Models:
+
+ Pass your own `Sklearn` binary classification model or use one of the following pretrained ones:
+ 1) Keywords vs. Questions/Statements (Default)
+ query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle)
+ query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle)
+ output_1 => question/statement
+ output_2 => keyword query
+ [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt)
+
+
+ 2) Questions vs. Statements
+ query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/model.pickle)
+ query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/vectorizer.pickle)
+ output_1 => question
+ output_2 => statement
+ [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt)
+
+ See also the [tutorial](https://haystack.deepset.ai/tutorials/v1.5.0/pipelines) on pipelines.
+
+
+
+#### SklearnQueryClassifier.\_\_init\_\_
+
+```python
+def __init__(model_name_or_path: Union[
+ str, Any
+ ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle", vectorizer_name_or_path: Union[
+ str, Any
+ ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle", batch_size: Optional[int] = None)
+```
+
+**Arguments**:
+
+- `model_name_or_path`: Gradient boosting based binary classifier to classify between keyword vs statement/question
+queries or statement vs question queries.
+- `vectorizer_name_or_path`: An n-gram based Tfidf vectorizer for extracting features from the query.
+- `batch_size`: Number of queries to process at a time.
+
+
+
+# Module transformers
+
+
+
+## TransformersQueryClassifier
+
+```python
+class TransformersQueryClassifier(BaseQueryClassifier)
+```
+
+A node to classify an incoming query into one of two categories using a (small) BERT transformer model.
+Depending on the result, the query flows to a different branch in your pipeline and the further processing
+can be customized. You can define this by connecting the further pipeline to either `output_1` or `output_2`
+from this node.
+
+**Example**:
+
+ ```python
+ |{
+ |pipe = Pipeline()
+ |pipe.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"])
+ |pipe.add_node(component=elastic_retriever, name="ElasticRetriever", inputs=["QueryClassifier.output_2"])
+ |pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
+
+ |# Keyword queries will use the ElasticRetriever
+ |pipe.run("kubernetes aws")
+
+ |# Semantic queries (questions, statements, sentences ...) will leverage the DPR retriever
+ |pipe.run("How to manage kubernetes on aws")
+
+ ```
+
+ Models:
+
+ Pass your own `Transformer` binary classification model from file/huggingface or use one of the following
+ pretrained ones hosted on Huggingface:
+ 1) Keywords vs. Questions/Statements (Default)
+ model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection"
+ output_1 => question/statement
+ output_2 => keyword query
+ [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt)
+
+
+ 2) Questions vs. Statements
+ `model_name_or_path`="shahrukhx01/question-vs-statement-classifier"
+ output_1 => question
+ output_2 => statement
+ [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt)
+
+
+ See also the [tutorial](https://haystack.deepset.ai/tutorials/v1.5.0/pipelines) on pipelines.
+
+
+
+#### TransformersQueryClassifier.\_\_init\_\_
+
+```python
+def __init__(model_name_or_path: Union[Path, str] = "shahrukhx01/bert-mini-finetune-question-detection", use_gpu: bool = True, batch_size: Optional[int] = None)
+```
+
+**Arguments**:
+
+- `model_name_or_path`: Transformer based fine tuned mini bert model for query classification
+- `use_gpu`: Whether to use GPU (if available).
+- `batch_size`: Number of queries to process at a time.
+
diff --git a/docs/v1.5.0/_src/api/api/question_generator.md b/docs/v1.5.0/_src/api/api/question_generator.md
new file mode 100644
index 0000000000..378333f8f6
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/question_generator.md
@@ -0,0 +1,57 @@
+
+
+# Module question\_generator
+
+
+
+## QuestionGenerator
+
+```python
+class QuestionGenerator(BaseComponent)
+```
+
+The Question Generator takes only a document as input and outputs questions that it thinks can be
+answered by this document. In our current implementation, input texts are split into chunks of 50 words
+with a 10 word overlap. This is because the default model `valhalla/t5-base-e2e-qg` seems to generate only
+about 3 questions per passage regardless of length. Our approach prioritizes the creation of more questions
+over processing efficiency (T5 is able to digest much more than 50 words at once). The returned questions
+generally come in an order dictated by the order of their answers i.e. early questions in the list generally
+come from earlier in the document.
+
+
+
+#### QuestionGenerator.\_\_init\_\_
+
+```python
+def __init__(model_name_or_path="valhalla/t5-base-e2e-qg", model_version=None, num_beams=4, max_length=256, no_repeat_ngram_size=3, length_penalty=1.5, early_stopping=True, split_length=50, split_overlap=10, use_gpu=True, prompt="generate questions:", num_queries_per_doc=1, batch_size: Optional[int] = None)
+```
+
+Uses the valhalla/t5-base-e2e-qg model by default. This class supports any question generation model that is
+
+implemented as a Seq2SeqLM in HuggingFace Transformers. Note that this style of question generation (where the only input
+is a document) is sometimes referred to as end-to-end question generation. Answer-supervised question
+generation is not currently supported.
+
+**Arguments**:
+
+- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. "valhalla/t5-base-e2e-qg".
+See https://huggingface.co/models for full list of available models.
+- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
+- `use_gpu`: Whether to use GPU or the CPU. Falls back on CPU if no GPU is available.
+- `batch_size`: Number of documents to process at a time.
+
+
+
+#### QuestionGenerator.generate\_batch
+
+```python
+def generate_batch(texts: Union[List[str], List[List[str]]], batch_size: Optional[int] = None) -> Union[List[List[str]], List[List[List[str]]]]
+```
+
+Generates questions for a list of strings or a list of lists of strings.
+
+**Arguments**:
+
+- `texts`: List of str or list of list of str.
+- `batch_size`: Number of texts to process at a time.
+
diff --git a/docs/v1.5.0/_src/api/api/ranker.md b/docs/v1.5.0/_src/api/api/ranker.md
new file mode 100644
index 0000000000..39253dbb9f
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/ranker.md
@@ -0,0 +1,166 @@
+
+
+# Module base
+
+
+
+## BaseRanker
+
+```python
+class BaseRanker(BaseComponent)
+```
+
+
+
+#### BaseRanker.timing
+
+```python
+def timing(fn, attr_name)
+```
+
+Wrapper method used to time functions.
+
+
+
+#### BaseRanker.eval
+
+```python
+def eval(label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label", top_k: int = 10, open_domain: bool = False, return_preds: bool = False) -> dict
+```
+
+Performs evaluation of the Ranker.
+
+Ranker is evaluated in the same way as a Retriever based on whether it finds the correct document given the query string and at which
+position in the ranking of documents the correct document is.
+
+| Returns a dict containing the following metrics:
+
+ - "recall": Proportion of questions for which correct document is among retrieved documents
+ - "mrr": Mean of reciprocal rank. Rewards retrievers that give relevant documents a higher rank.
+ Only considers the highest ranked relevant document.
+ - "map": Mean of average precision for each question. Rewards retrievers that give relevant
+ documents a higher rank. Considers all retrieved relevant documents. If ``open_domain=True``,
+ average precision is normalized by the number of retrieved relevant documents per query.
+ If ``open_domain=False``, average precision is normalized by the number of all relevant documents
+ per query.
+
+**Arguments**:
+
+- `label_index`: Index/Table in DocumentStore where labeled questions are stored
+- `doc_index`: Index/Table in DocumentStore where documents that are used for evaluation are stored
+- `top_k`: How many documents to return per query
+- `open_domain`: If ``True``, retrieval will be evaluated by checking if the answer string to a question is
+contained in the retrieved docs (common approach in open-domain QA).
+If ``False``, retrieval uses a stricter evaluation that checks if the retrieved document ids
+are within ids explicitly stated in the labels.
+- `return_preds`: Whether to add predictions in the returned dictionary. If True, the returned dictionary
+contains the keys "predictions" and "metrics".
+
+
+
+# Module sentence\_transformers
+
+
+
+## SentenceTransformersRanker
+
+```python
+class SentenceTransformersRanker(BaseRanker)
+```
+
+Sentence Transformer based pre-trained Cross-Encoder model for Document Re-ranking (https://huggingface.co/cross-encoder).
+Re-Ranking can be used on top of a retriever to boost the performance for document search. This is particularly useful if the retriever has a high recall but is bad in sorting the documents by relevance.
+
+SentenceTransformersRanker handles Cross-Encoder models
+ - use a single logit as similarity score e.g. cross-encoder/ms-marco-MiniLM-L-12-v2
+ - use two output logits (no_answer, has_answer) e.g. deepset/gbert-base-germandpr-reranking
+https://www.sbert.net/docs/pretrained-models/ce-msmarco.html#usage-with-transformers
+
+| With a SentenceTransformersRanker, you can:
+ - directly get predictions via predict()
+
+Usage example:
+...
+retriever = BM25Retriever(document_store=document_store)
+ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2")
+p = Pipeline()
+p.add_node(component=retriever, name="ESRetriever", inputs=["Query"])
+p.add_node(component=ranker, name="Ranker", inputs=["ESRetriever"])
+
+
+
+#### SentenceTransformersRanker.\_\_init\_\_
+
+```python
+def __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, top_k: int = 10, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, batch_size: Optional[int] = None)
+```
+
+**Arguments**:
+
+- `model_name_or_path`: Directory of a saved model or the name of a public model e.g.
+'cross-encoder/ms-marco-MiniLM-L-12-v2'.
+See https://huggingface.co/cross-encoder for full list of available models
+- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
+- `top_k`: The maximum number of documents to return
+- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available.
+- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones
+The strings will be converted into pytorch devices, so use the string notation described here:
+https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device
+(e.g. ["cuda:0"]).
+- `batch_size`: Number of documents to process at a time.
+
+
+
+#### SentenceTransformersRanker.predict
+
+```python
+def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> List[Document]
+```
+
+Use loaded ranker model to re-rank the supplied list of Document.
+
+Returns list of Document sorted by (desc.) similarity with the query.
+
+**Arguments**:
+
+- `query`: Query string
+- `documents`: List of Document to be re-ranked
+- `top_k`: The maximum number of documents to return
+
+**Returns**:
+
+List of Document
+
+
+
+#### SentenceTransformersRanker.predict\_batch
+
+```python
+def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None) -> Union[List[Document], List[List[Document]]]
+```
+
+Use loaded ranker model to re-rank the supplied lists of Documents.
+
+Returns lists of Documents sorted by (desc.) similarity with the corresponding queries.
+
+
+- If you provide a list containing a single query...
+
+ - ... and a single list of Documents, the single list of Documents will be re-ranked based on the
+ supplied query.
+ - ... and a list of lists of Documents, each list of Documents will be re-ranked individually based on the
+ supplied query.
+
+
+- If you provide a list of multiple queries...
+
+ - ... you need to provide a list of lists of Documents. Each list of Documents will be re-ranked based on
+ its corresponding query.
+
+**Arguments**:
+
+- `queries`: Single query string or list of queries
+- `documents`: Single list of Documents or list of lists of Documents to be reranked.
+- `top_k`: The maximum number of documents to return per Document list.
+- `batch_size`: Number of Documents to process at a time.
+
diff --git a/docs/v1.5.0/_src/api/api/reader.md b/docs/v1.5.0/_src/api/api/reader.md
new file mode 100644
index 0000000000..36acfe5dee
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/reader.md
@@ -0,0 +1,891 @@
+
+
+# Module base
+
+
+
+## BaseReader
+
+```python
+class BaseReader(BaseComponent)
+```
+
+
+
+#### BaseReader.timing
+
+```python
+def timing(fn, attr_name)
+```
+
+Wrapper method used to time functions.
+
+
+
+# Module farm
+
+
+
+## FARMReader
+
+```python
+class FARMReader(BaseReader)
+```
+
+Transformer based model for extractive Question Answering using the FARM framework (https://github.com/deepset-ai/FARM).
+While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interface remains the same.
+
+| With a FARMReader, you can:
+
+ - directly get predictions via predict()
+ - fine-tune the model on QA data via train()
+
+
+
+#### FARMReader.\_\_init\_\_
+
+```python
+def __init__(model_name_or_path: str, model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, devices: List[torch.device] = [], no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True, confidence_threshold: Optional[float] = None, proxies: Optional[Dict[str, str]] = None, local_files_only=False, force_download=False, use_auth_token: Optional[Union[str, bool]] = None)
+```
+
+**Arguments**:
+
+- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. 'bert-base-cased',
+'deepset/bert-base-cased-squad2', 'deepset/bert-base-cased-squad2', 'distilbert-base-uncased-distilled-squad'.
+See https://huggingface.co/models for full list of available models.
+- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
+- `context_window_size`: The size, in characters, of the window around the answer span that is used when
+displaying the context around the answer.
+- `batch_size`: Number of samples the model receives in one batch for inference.
+Memory consumption is much lower in inference mode. Recommendation: Increase the batch size
+to a value so only a single batch is used.
+- `use_gpu`: Whether to use GPUs or the CPU. Falls back on CPU if no GPU is available.
+- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]).
+Unused if `use_gpu` is False.
+- `no_ans_boost`: How much the no_answer logit is boosted/increased.
+If set to 0 (default), the no_answer logit is not changed.
+If a negative number, there is a lower chance of "no_answer" being predicted.
+If a positive number, there is an increased chance of "no_answer"
+- `return_no_answer`: Whether to include no_answer predictions in the results.
+- `top_k`: The maximum number of answers to return
+- `top_k_per_candidate`: How many answers to extract for each candidate doc that is coming from the retriever (might be a long text).
+Note that this is not the number of "final answers" you will receive
+(see `top_k` in FARMReader.predict() or Finder.get_answers() for that)
+and that FARM includes no_answer in the sorted list of predictions.
+- `top_k_per_sample`: How many answers to extract from each small text passage that the model can process at once
+(one "candidate doc" is usually split into many smaller "passages").
+You usually want a very small value here, as it slows down inference
+and you don't gain much of quality by having multiple answers from one passage.
+Note that this is not the number of "final answers" you will receive
+(see `top_k` in FARMReader.predict() or Finder.get_answers() for that)
+and that FARM includes no_answer in the sorted list of predictions.
+- `num_processes`: The number of processes for `multiprocessing.Pool`. Set to value of 0 to disable
+multiprocessing. Set to None to let Inferencer determine optimum number. If you
+want to debug the Language Model, you might need to disable multiprocessing!
+- `max_seq_len`: Max sequence length of one input text for the model
+- `doc_stride`: Length of striding window for splitting long texts (used if ``len(text) > max_seq_len``)
+- `progress_bar`: Whether to show a tqdm progress bar or not.
+Can be helpful to disable in production deployments to keep the logs clean.
+- `duplicate_filtering`: Answers are filtered based on their position. Both start and end position of the answers are considered.
+The higher the value, the farther apart two answers can be while still being filtered out as duplicates. 0 corresponds to exact duplicates. -1 turns off duplicate removal.
+- `use_confidence_scores`: Sets the type of score that is returned with every predicted answer.
+`True` => a scaled confidence / relevance score between [0, 1].
+This score can also be further calibrated on your dataset via self.eval()
+(see https://haystack.deepset.ai/components/v1.5.0/reader#confidence-scores) .
+`False` => an unscaled, raw score [-inf, +inf] which is the sum of start and end logit
+from the model for the predicted span.
+- `confidence_threshold`: Filters out predictions below confidence_threshold. Value should be between 0 and 1. Disabled by default.
+- `proxies`: Dict of proxy servers to use for downloading external models. Example: {'http': 'some.proxy:1234', 'http://hostname': 'my.proxy:3111'}
+- `local_files_only`: Whether to force checking for local files only (and forbid downloads)
+- `force_download`: Whether to force a (re-)download even if the model exists locally in the cache.
+- `use_auth_token`: API token used to download private models from Huggingface. If this parameter is set to `True`,
+the local token will be used, which must be previously created via `transformer-cli login`.
+Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+
+
+
+#### FARMReader.train
+
+```python
+def train(data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], batch_size: int = 10, n_epochs: int = 2, learning_rate: float = 1e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"))
+```
+
+Fine-tune a model on a QA dataset. Options:
+
+- Take a plain language model (e.g. `bert-base-cased`) and train it for QA (e.g. on SQuAD data)
+- Take a QA model (e.g. `deepset/bert-base-cased-squad2`) and fine-tune it for your domain (e.g. using your labels collected via the haystack annotation tool)
+
+Checkpoints can be stored via setting `checkpoint_every` to a custom number of steps.
+If any checkpoints are stored, a subsequent run of train() will resume training from the latest available checkpoint.
+
+**Arguments**:
+
+- `data_dir`: Path to directory containing your training data in SQuAD style
+- `train_filename`: Filename of training data
+- `dev_filename`: Filename of dev / eval data
+- `test_filename`: Filename of test data
+- `dev_split`: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here
+that gets split off from training data for eval.
+- `use_gpu`: Whether to use GPU (if available)
+- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]).
+Unused if `use_gpu` is False.
+- `batch_size`: Number of samples the model receives in one batch for training
+- `n_epochs`: Number of iterations on the whole training data set
+- `learning_rate`: Learning rate of the optimizer
+- `max_seq_len`: Maximum text length (in tokens). Everything longer gets cut down.
+- `warmup_proportion`: Proportion of training steps until maximum learning rate is reached.
+Until that point LR is increasing linearly. After that it's decreasing again linearly.
+Options for different schedules are available in FARM.
+- `evaluate_every`: Evaluate the model every X steps on the hold-out eval dataset
+- `save_dir`: Path to store the final model
+- `num_processes`: The number of processes for `multiprocessing.Pool` during preprocessing.
+Set to value of 1 to disable multiprocessing. When set to 1, you cannot split away a dev set from train set.
+Set to None to use all CPU cores minus one.
+- `use_amp`: Optimization level of NVIDIA's automatic mixed precision (AMP). The higher the level, the faster the model.
+Available options:
+None (Don't use AMP)
+"O0" (Normal FP32 training)
+"O1" (Mixed Precision => Recommended)
+"O2" (Almost FP16)
+"O3" (Pure FP16).
+See details on: https://nvidia.github.io/apex/amp.html
+- `checkpoint_root_dir`: the Path of directory where all train checkpoints are saved. For each individual
+checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is created.
+- `checkpoint_every`: save a train checkpoint after this many steps of training.
+- `checkpoints_to_keep`: maximum number of train checkpoints to save.
+- `caching`: whether or not to use caching for preprocessed dataset
+- `cache_path`: Path to cache the preprocessed dataset
+- `processor`: The processor to use for preprocessing. If None, the default SquadProcessor is used.
+
+**Returns**:
+
+None
+
+
+
+#### FARMReader.distil\_prediction\_layer\_from
+
+```python
+def distil_prediction_layer_from(teacher_model: "FARMReader", data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], student_batch_size: int = 10, teacher_batch_size: Optional[int] = None, n_epochs: int = 2, learning_rate: float = 3e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), distillation_loss_weight: float = 0.5, distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "kl_div", temperature: float = 1.0)
+```
+
+Fine-tune a model on a QA dataset using logit-based distillation. You need to provide a teacher model that is already finetuned on the dataset
+
+and a student model that will be trained using the teacher's logits. The idea of this is to increase the accuracy of a lightweight student model
+by using a more complex teacher.
+Originally proposed in: https://arxiv.org/pdf/1503.02531.pdf
+This can also be considered as the second stage of distillation finetuning as described in the TinyBERT paper:
+https://arxiv.org/pdf/1909.10351.pdf
+**Example**
+```python
+student = FARMReader(model_name_or_path="prajjwal1/bert-medium")
+teacher = FARMReader(model_name_or_path="deepset/bert-large-uncased-whole-word-masking-squad2")
+student.distil_prediction_layer_from(teacher, data_dir="squad2", train_filename="train.json", test_filename="dev.json",
+ learning_rate=3e-5, distillation_loss_weight=1.0, temperature=5)
+```
+
+Checkpoints can be stored via setting `checkpoint_every` to a custom number of steps.
+If any checkpoints are stored, a subsequent run of train() will resume training from the latest available checkpoint.
+
+**Arguments**:
+
+- `teacher_model`: Model whose logits will be used to improve accuracy
+- `data_dir`: Path to directory containing your training data in SQuAD style
+- `train_filename`: Filename of training data
+- `dev_filename`: Filename of dev / eval data
+- `test_filename`: Filename of test data
+- `dev_split`: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here
+that gets split off from training data for eval.
+- `use_gpu`: Whether to use GPU (if available)
+- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]).
+Unused if `use_gpu` is False.
+- `student_batch_size`: Number of samples the student model receives in one batch for training
+- `teacher_batch_size`: Number of samples the teacher model receives in one batch for distillation
+- `n_epochs`: Number of iterations on the whole training data set
+- `learning_rate`: Learning rate of the optimizer
+- `max_seq_len`: Maximum text length (in tokens). Everything longer gets cut down.
+- `warmup_proportion`: Proportion of training steps until maximum learning rate is reached.
+Until that point LR is increasing linearly. After that it's decreasing again linearly.
+Options for different schedules are available in FARM.
+- `evaluate_every`: Evaluate the model every X steps on the hold-out eval dataset
+- `save_dir`: Path to store the final model
+- `num_processes`: The number of processes for `multiprocessing.Pool` during preprocessing.
+Set to value of 1 to disable multiprocessing. When set to 1, you cannot split away a dev set from train set.
+Set to None to use all CPU cores minus one.
+- `use_amp`: Optimization level of NVIDIA's automatic mixed precision (AMP). The higher the level, the faster the model.
+Available options:
+None (Don't use AMP)
+"O0" (Normal FP32 training)
+"O1" (Mixed Precision => Recommended)
+"O2" (Almost FP16)
+"O3" (Pure FP16).
+See details on: https://nvidia.github.io/apex/amp.html
+- `checkpoint_root_dir`: the Path of directory where all train checkpoints are saved. For each individual
+checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is created.
+- `checkpoint_every`: save a train checkpoint after this many steps of training.
+- `checkpoints_to_keep`: maximum number of train checkpoints to save.
+- `caching`: whether or not to use caching for preprocessed dataset and teacher logits
+- `cache_path`: Path to cache the preprocessed dataset and teacher logits
+- `distillation_loss_weight`: The weight of the distillation loss. A higher weight means the teacher outputs are more important.
+- `distillation_loss`: Specifies how teacher and model logits should be compared. Can either be a string ("mse" for mean squared error or "kl_div" for kl divergence loss) or a callable loss function (needs to have named parameters student_logits and teacher_logits)
+- `temperature`: The temperature for distillation. A higher temperature will result in less certainty of teacher outputs. A lower temperature means more certainty. A temperature of 1.0 does not change the certainty of the model.
+- `tinybert_loss`: Whether to use the TinyBERT loss function for distillation. This requires the student to be a TinyBERT model and the teacher to be a finetuned version of bert-base-uncased.
+- `tinybert_epochs`: Number of epochs to train the student model with the TinyBERT loss function. After this many epochs, the student model is trained with the regular distillation loss function.
+- `tinybert_learning_rate`: Learning rate to use when training the student model with the TinyBERT loss function.
+- `tinybert_train_filename`: Filename of training data to use when training the student model with the TinyBERT loss function. To best follow the original paper, this should be an augmented version of the training data created using the augment_squad.py script. If not specified, the training data from the original training is used.
+- `processor`: The processor to use for preprocessing. If None, the default SquadProcessor is used.
+
+**Returns**:
+
+None
+
+
+
+#### FARMReader.distil\_intermediate\_layers\_from
+
+```python
+def distil_intermediate_layers_from(teacher_model: "FARMReader", data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], batch_size: int = 10, n_epochs: int = 5, learning_rate: float = 5e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "mse", temperature: float = 1.0, processor: Optional[Processor] = None)
+```
+
+The first stage of distillation finetuning as described in the TinyBERT paper:
+
+https://arxiv.org/pdf/1909.10351.pdf
+**Example**
+```python
+student = FARMReader(model_name_or_path="prajjwal1/bert-medium")
+teacher = FARMReader(model_name_or_path="huawei-noah/TinyBERT_General_6L_768D")
+student.distil_intermediate_layers_from(teacher, data_dir="squad2", train_filename="train.json", test_filename="dev.json",
+ learning_rate=3e-5, distillation_loss_weight=1.0, temperature=5)
+```
+
+Checkpoints can be stored via setting `checkpoint_every` to a custom number of steps.
+If any checkpoints are stored, a subsequent run of train() will resume training from the latest available checkpoint.
+
+**Arguments**:
+
+- `teacher_model`: Model whose logits will be used to improve accuracy
+- `data_dir`: Path to directory containing your training data in SQuAD style
+- `train_filename`: Filename of training data. To best follow the original paper, this should be an augmented version of the training data created using the augment_squad.py script
+- `dev_filename`: Filename of dev / eval data
+- `test_filename`: Filename of test data
+- `dev_split`: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here
+that gets split off from training data for eval.
+- `use_gpu`: Whether to use GPU (if available)
+- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]).
+Unused if `use_gpu` is False.
+- `batch_size`: Number of samples the student model receives in one batch for training
+- `n_epochs`: Number of iterations on the whole training data set
+- `learning_rate`: Learning rate of the optimizer
+- `max_seq_len`: Maximum text length (in tokens). Everything longer gets cut down.
+- `warmup_proportion`: Proportion of training steps until maximum learning rate is reached.
+Until that point LR is increasing linearly. After that it's decreasing again linearly.
+Options for different schedules are available in FARM.
+- `evaluate_every`: Evaluate the model every X steps on the hold-out eval dataset
+- `save_dir`: Path to store the final model
+- `num_processes`: The number of processes for `multiprocessing.Pool` during preprocessing.
+Set to value of 1 to disable multiprocessing. When set to 1, you cannot split away a dev set from train set.
+Set to None to use all CPU cores minus one.
+- `use_amp`: Optimization level of NVIDIA's automatic mixed precision (AMP). The higher the level, the faster the model.
+Available options:
+None (Don't use AMP)
+"O0" (Normal FP32 training)
+"O1" (Mixed Precision => Recommended)
+"O2" (Almost FP16)
+"O3" (Pure FP16).
+See details on: https://nvidia.github.io/apex/amp.html
+- `checkpoint_root_dir`: the Path of directory where all train checkpoints are saved. For each individual
+checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is created.
+- `checkpoint_every`: save a train checkpoint after this many steps of training.
+- `checkpoints_to_keep`: maximum number of train checkpoints to save.
+- `caching`: whether or not to use caching for preprocessed dataset and teacher logits
+- `cache_path`: Path to cache the preprocessed dataset and teacher logits
+- `distillation_loss_weight`: The weight of the distillation loss. A higher weight means the teacher outputs are more important.
+- `distillation_loss`: Specifies how teacher and model logits should be compared. Can either be a string ("mse" for mean squared error or "kl_div" for kl divergence loss) or a callable loss function (needs to have named parameters student_logits and teacher_logits)
+- `temperature`: The temperature for distillation. A higher temperature will result in less certainty of teacher outputs. A lower temperature means more certainty. A temperature of 1.0 does not change the certainty of the model.
+- `processor`: The processor to use for preprocessing. If None, the default SquadProcessor is used.
+
+**Returns**:
+
+None
+
+
+
+#### FARMReader.update\_parameters
+
+```python
+def update_parameters(context_window_size: Optional[int] = None, no_ans_boost: Optional[float] = None, return_no_answer: Optional[bool] = None, max_seq_len: Optional[int] = None, doc_stride: Optional[int] = None)
+```
+
+Hot update parameters of a loaded Reader. It may not be safe when processing concurrent requests.
+
+
+
+#### FARMReader.save
+
+```python
+def save(directory: Path)
+```
+
+Saves the Reader model so that it can be reused at a later point in time.
+
+**Arguments**:
+
+- `directory`: Directory where the Reader model should be saved
+
+
+
+#### FARMReader.predict\_batch
+
+```python
+def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None)
+```
+
+Use loaded QA model to find answers for the queries in the Documents.
+
+- If you provide a list containing a single query...
+
+ - ... and a single list of Documents, the query will be applied to each Document individually.
+ - ... and a list of lists of Documents, the query will be applied to each list of Documents and the Answers
+ will be aggregated per Document list.
+
+- If you provide a list of multiple queries...
+
+ - ... and a single list of Documents, each query will be applied to each Document individually.
+ - ... and a list of lists of Documents, each query will be applied to its corresponding list of Documents
+ and the Answers will be aggregated per query-Document pair.
+
+**Arguments**:
+
+- `queries`: Single query or list of queries.
+- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on.
+Can be a single list of Documents or a list of lists of Documents.
+- `top_k`: Number of returned answers per query.
+- `batch_size`: Number of query-document pairs to be processed at a time.
+
+
+
+#### FARMReader.predict
+
+```python
+def predict(query: str, documents: List[Document], top_k: Optional[int] = None)
+```
+
+Use loaded QA model to find answers for a query in the supplied list of Document.
+
+Returns dictionaries containing answers sorted by (desc.) score.
+Example:
+ ```python
+ |{
+ | 'query': 'Who is the father of Arya Stark?',
+ | 'answers':[Answer(
+ | 'answer': 'Eddard,',
+ | 'context': "She travels with her father, Eddard, to King's Landing when he is",
+ | 'score': 0.9787139466668613,
+    |        'offsets_in_context': [Span(start=29, end=35)],
+    |        'offsets_in_document': [Span(start=347, end=353)],
+ | 'document_id': '88d1ed769d003939d3a0d28034464ab2'
+ | ),...
+ | ]
+ |}
+ ```
+
+**Arguments**:
+
+- `query`: Query string
+- `documents`: List of Document in which to search for the answer
+- `top_k`: The maximum number of answers to return
+
+**Returns**:
+
+Dict containing query and answers
+
+
+
+#### FARMReader.eval\_on\_file
+
+```python
+def eval_on_file(data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None)
+```
+
+Performs evaluation on a SQuAD-formatted file.
+
+Returns a dict containing the following metrics:
+ - "EM": exact match score
+ - "f1": F1-Score
+ - "top_n_accuracy": Proportion of predicted answers that overlap with correct answer
+
+**Arguments**:
+
+- `data_dir`: The directory in which the test set can be found
+- `test_filename`: The name of the file containing the test data in SQuAD format.
+- `device`: The device on which the tensors should be processed.
+Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda")
+or use the Reader's device by default.
+
+
+
+#### FARMReader.eval
+
+```python
+def eval(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False, use_no_answer_legacy_confidence=False)
+```
+
+Performs evaluation on evaluation documents in the DocumentStore.
+
+Returns a dict containing the following metrics:
+ - "EM": Proportion of exact matches of predicted answers with their corresponding correct answers
+ - "f1": Average overlap between predicted answers and their corresponding correct answers
+ - "top_n_accuracy": Proportion of predicted answers that overlap with correct answer
+
+**Arguments**:
+
+- `document_store`: DocumentStore containing the evaluation documents
+- `device`: The device on which the tensors should be processed.
+Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda")
+or use the Reader's device by default.
+- `label_index`: Index/Table name where labeled questions are stored
+- `doc_index`: Index/Table name where documents that are used for evaluation are stored
+- `label_origin`: Field name where the gold labels are stored
+- `calibrate_conf_scores`: Whether to calibrate the temperature for temperature scaling of the confidence scores
+- `use_no_answer_legacy_confidence`: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence.
+Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default).
+
+
+
+#### FARMReader.calibrate\_confidence\_scores
+
+```python
+def calibrate_confidence_scores(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label")
+```
+
+Calibrates confidence scores on evaluation documents in the DocumentStore.
+
+**Arguments**:
+
+- `document_store`: DocumentStore containing the evaluation documents
+- `device`: The device on which the tensors should be processed.
+Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda")
+or use the Reader's device by default.
+- `label_index`: Index/Table name where labeled questions are stored
+- `doc_index`: Index/Table name where documents that are used for evaluation are stored
+- `label_origin`: Field name where the gold labels are stored
+
+
+
+#### FARMReader.predict\_on\_texts
+
+```python
+def predict_on_texts(question: str, texts: List[str], top_k: Optional[int] = None)
+```
+
+Use loaded QA model to find answers for a question in the supplied list of Document.
+
+Returns dictionaries containing answers sorted by (desc.) score.
+Example:
+ ```python
+ |{
+ | 'question': 'Who is the father of Arya Stark?',
+ | 'answers':[
+ | {'answer': 'Eddard,',
+ | 'context': " She travels with her father, Eddard, to King's Landing when he is ",
+ | 'offset_answer_start': 147,
+ | 'offset_answer_end': 154,
+ | 'score': 0.9787139466668613,
+ | 'document_id': '1337'
+ | },...
+ | ]
+ |}
+ ```
+
+**Arguments**:
+
+- `question`: Question string
+- `texts`: List of texts in which to search for the answer, each passed as a string
+- `top_k`: The maximum number of answers to return
+
+**Returns**:
+
+Dict containing question and answers
+
+
+
+#### FARMReader.convert\_to\_onnx
+
+```python
+@classmethod
+def convert_to_onnx(cls, model_name: str, output_path: Path, convert_to_float16: bool = False, quantize: bool = False, task_type: str = "question_answering", opset_version: int = 11)
+```
+
+Convert a PyTorch BERT model to ONNX format and write to ./onnx-export dir. The converted ONNX model
+
+can be loaded in the `FARMReader` using the export path as `model_name_or_path` param.
+
+Usage:
+
+ `from haystack.reader.farm import FARMReader
+ from pathlib import Path
+ onnx_model_path = Path("roberta-onnx-model")
+ FARMReader.convert_to_onnx(model_name="deepset/bert-base-cased-squad2", output_path=onnx_model_path)
+ reader = FARMReader(onnx_model_path)`
+
+**Arguments**:
+
+- `model_name`: transformers model name
+- `output_path`: Path to output the converted model
+- `convert_to_float16`: Many models use float32 precision by default. With the half precision of float16,
+inference is faster on Nvidia GPUs with Tensor core like T4 or V100. On older GPUs,
+float32 could still be more performant.
+- `quantize`: convert floating point number to integers
+- `task_type`: Type of task for the model. Available options: "question_answering" or "embeddings".
+- `opset_version`: ONNX opset version
+
+
+
+# Module transformers
+
+
+
+## TransformersReader
+
+```python
+class TransformersReader(BaseReader)
+```
+
+Transformer based model for extractive Question Answering using the HuggingFace's transformers framework
+(https://github.com/huggingface/transformers).
+While the underlying model can vary (BERT, Roberta, DistilBERT ...), the interface remains the same.
+With this reader, you can directly get predictions via predict()
+
+
+
+#### TransformersReader.\_\_init\_\_
+
+```python
+def __init__(model_name_or_path: str = "distilbert-base-uncased-distilled-squad", model_version: Optional[str] = None, tokenizer: Optional[str] = None, context_window_size: int = 70, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answers: bool = False, max_seq_len: int = 256, doc_stride: int = 128, batch_size: Optional[int] = None)
+```
+
+Load a QA model from Transformers.
+
+Available models include:
+
+- ``'distilbert-base-uncased-distilled-squad'``
+- ``'bert-large-cased-whole-word-masking-finetuned-squad'``
+- ``'bert-large-uncased-whole-word-masking-finetuned-squad'``
+
+See https://huggingface.co/models for full list of available QA models
+
+**Arguments**:
+
+- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. 'bert-base-cased',
+'deepset/bert-base-cased-squad2', 'distilbert-base-uncased-distilled-squad'.
+See https://huggingface.co/models for full list of available models.
+- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
+- `tokenizer`: Name of the tokenizer (usually the same as model)
+- `context_window_size`: Num of chars (before and after the answer) to return as "context" for each answer.
+The context usually helps users to understand if the answer really makes sense.
+- `use_gpu`: Whether to use GPU (if available).
+- `top_k`: The maximum number of answers to return
+- `top_k_per_candidate`: How many answers to extract for each candidate doc that is coming from the retriever (might be a long text).
+Note that this is not the number of "final answers" you will receive
+(see `top_k` in TransformersReader.predict() or Finder.get_answers() for that)
+and that no_answer can be included in the sorted list of predictions.
+- `return_no_answers`: If True, the HuggingFace Transformers model could return a "no_answer" (i.e. when there is an unanswerable question)
+If False, it cannot return a "no_answer". Note that `no_answer_boost` is unfortunately not available with TransformersReader.
+If you would like to set no_answer_boost, use a `FARMReader`.
+- `max_seq_len`: max sequence length of one input text for the model
+- `doc_stride`: length of striding window for splitting long texts (used if len(text) > max_seq_len)
+- `batch_size`: Number of documents to process at a time.
+
+
+
+#### TransformersReader.predict
+
+```python
+def predict(query: str, documents: List[Document], top_k: Optional[int] = None)
+```
+
+Use loaded QA model to find answers for a query in the supplied list of Document.
+
+Returns dictionaries containing answers sorted by (desc.) score.
+Example:
+
+ ```python
+ |{
+ | 'query': 'Who is the father of Arya Stark?',
+ | 'answers':[
+ | {'answer': 'Eddard,',
+ | 'context': " She travels with her father, Eddard, to King's Landing when he is ",
+ | 'offset_answer_start': 147,
+ | 'offset_answer_end': 154,
+ | 'score': 0.9787139466668613,
+ | 'document_id': '1337'
+ | },...
+ | ]
+ |}
+ ```
+
+**Arguments**:
+
+- `query`: Query string
+- `documents`: List of Document in which to search for the answer
+- `top_k`: The maximum number of answers to return
+
+**Returns**:
+
+Dict containing query and answers
+
+
+
+#### TransformersReader.predict\_batch
+
+```python
+def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None)
+```
+
+Use loaded QA model to find answers for the queries in the Documents.
+
+- If you provide a list containing a single query...
+
+ - ... and a single list of Documents, the query will be applied to each Document individually.
+ - ... and a list of lists of Documents, the query will be applied to each list of Documents and the Answers
+ will be aggregated per Document list.
+
+- If you provide a list of multiple queries...
+
+ - ... and a single list of Documents, each query will be applied to each Document individually.
+ - ... and a list of lists of Documents, each query will be applied to its corresponding list of Documents
+ and the Answers will be aggregated per query-Document pair.
+
+**Arguments**:
+
+- `queries`: Single query or list of queries.
+- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on.
+Can be a single list of Documents or a list of lists of Documents.
+- `top_k`: Number of returned answers per query.
+- `batch_size`: Number of query-document pairs to be processed at a time.
+
+
+
+# Module table
+
+
+
+## TableReader
+
+```python
+class TableReader(BaseReader)
+```
+
+Transformer-based model for extractive Question Answering on Tables with TaPas
+using the HuggingFace's transformers framework (https://github.com/huggingface/transformers).
+With this reader, you can directly get predictions via predict()
+
+**Example**:
+
+```python
+from haystack import Document
+from haystack.reader import TableReader
+import pandas as pd
+
+table_reader = TableReader(model_name_or_path="google/tapas-base-finetuned-wtq")
+data = {
+ "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
+ "age": ["57", "46", "60"],
+ "number of movies": ["87", "53", "69"],
+ "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
+}
+table = pd.DataFrame(data)
+document = Document(content=table, content_type="table")
+query = "When was DiCaprio born?"
+prediction = table_reader.predict(query=query, documents=[document])
+answer = prediction["answers"][0].answer # "10 june 1996"
+```
+
+
+
+#### TableReader.\_\_init\_\_
+
+```python
+def __init__(model_name_or_path: str = "google/tapas-base-finetuned-wtq", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answer: bool = False, max_seq_len: int = 256)
+```
+
+Load a TableQA model from Transformers.
+
+Available models include:
+
+- ``'google/tapas-base-finetuned-wtq'``
+- ``'google/tapas-base-finetuned-wikisql-supervised'``
+- ``'deepset/tapas-large-nq-hn-reader'``
+- ``'deepset/tapas-large-nq-reader'``
+
+See https://huggingface.co/models?pipeline_tag=table-question-answering
+for full list of available TableQA models.
+
+The nq-reader models are able to provide confidence scores, but cannot handle questions that need aggregation
+over multiple cells. The returned answers are sorted first by a general table score and then by answer span
+scores.
+All the other models can handle aggregation questions, but don't provide reasonable confidence scores.
+
+**Arguments**:
+
+- `model_name_or_path`: Directory of a saved model or the name of a public model e.g.
+See https://huggingface.co/models?pipeline_tag=table-question-answering for full list of available models.
+- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name,
+or commit hash.
+- `tokenizer`: Name of the tokenizer (usually the same as model)
+- `use_gpu`: Whether to use GPU or CPU. Falls back on CPU if no GPU is available.
+- `top_k`: The maximum number of answers to return
+- `top_k_per_candidate`: How many answers to extract for each candidate table that is coming from
+the retriever.
+- `return_no_answer`: Whether to include no_answer predictions in the results.
+(Only applicable with nq-reader models.)
+- `max_seq_len`: Max sequence length of one input table for the model. If the number of tokens of
+query + table exceed max_seq_len, the table will be truncated by removing rows until the
+input size fits the model.
+
+
+
+#### TableReader.predict
+
+```python
+def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict
+```
+
+Use loaded TableQA model to find answers for a query in the supplied list of Documents
+
+of content_type ``'table'``.
+
+Returns dictionary containing query and list of Answer objects sorted by (desc.) score.
+WARNING: The answer scores are not reliable, as they are always extremely high, even if
+ a question cannot be answered by a given table.
+
+**Arguments**:
+
+- `query`: Query string
+- `documents`: List of Document in which to search for the answer. Documents should be
+of content_type ``'table'``.
+- `top_k`: The maximum number of answers to return
+
+**Returns**:
+
+Dict containing query and answers
+
+
+
+#### TableReader.predict\_batch
+
+```python
+def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None)
+```
+
+Use loaded TableQA model to find answers for the supplied queries in the supplied Documents
+
+of content_type ``'table'``.
+
+Returns dictionary containing query and list of Answer objects sorted by (desc.) score.
+
+WARNING: The answer scores are not reliable, as they are always extremely high, even if
+a question cannot be answered by a given table.
+
+- If you provide a list containing a single query...
+
+ - ... and a single list of Documents, the query will be applied to each Document individually.
+ - ... and a list of lists of Documents, the query will be applied to each list of Documents and the Answers
+ will be aggregated per Document list.
+
+- If you provide a list of multiple queries...
+
+ - ... and a single list of Documents, each query will be applied to each Document individually.
+ - ... and a list of lists of Documents, each query will be applied to its corresponding list of Documents
+ and the Answers will be aggregated per query-Document pair.
+
+**Arguments**:
+
+- `queries`: Single query string or list of queries.
+- `documents`: Single list of Documents or list of lists of Documents in which to search for the answers.
+Documents should be of content_type ``'table'``.
+- `top_k`: The maximum number of answers to return per query.
+- `batch_size`: Not applicable.
+
+
+
+## RCIReader
+
+```python
+class RCIReader(BaseReader)
+```
+
+Table Reader model based on Glass et al. (2021)'s Row-Column-Intersection model.
+See the original paper for more details:
+Glass, Michael, et al. (2021): "Capturing Row and Column Semantics in Transformer Based Question Answering over Tables"
+(https://aclanthology.org/2021.naacl-main.96/)
+
+Each row and each column is given a score with regard to the query by two separate models. The score of each cell
+is then calculated as the sum of the corresponding row score and column score. Accordingly, the predicted answer is
+the cell with the highest score.
+
+Pros and Cons of RCIReader compared to TableReader:
++ Provides meaningful confidence scores
++ Allows larger tables as input
+- Does not support aggregation over table cells
+- Slower
+
+
+
+#### RCIReader.\_\_init\_\_
+
+```python
+def __init__(row_model_name_or_path: str = "michaelrglass/albert-base-rci-wikisql-row", column_model_name_or_path: str = "michaelrglass/albert-base-rci-wikisql-col", row_model_version: Optional[str] = None, column_model_version: Optional[str] = None, row_tokenizer: Optional[str] = None, column_tokenizer: Optional[str] = None, use_gpu: bool = True, top_k: int = 10, max_seq_len: int = 256)
+```
+
+Load an RCI model from Transformers.
+
+Available models include:
+
+- ``'michaelrglass/albert-base-rci-wikisql-row'`` + ``'michaelrglass/albert-base-rci-wikisql-col'``
+- ``'michaelrglass/albert-base-rci-wtq-row'`` + ``'michaelrglass/albert-base-rci-wtq-col'``
+
+**Arguments**:
+
+- `row_model_name_or_path`: Directory of a saved row scoring model or the name of a public model
+- `column_model_name_or_path`: Directory of a saved column scoring model or the name of a public model
+- `row_model_version`: The version of row model to use from the HuggingFace model hub.
+Can be tag name, branch name, or commit hash.
+- `column_model_version`: The version of column model to use from the HuggingFace model hub.
+Can be tag name, branch name, or commit hash.
+- `row_tokenizer`: Name of the tokenizer for the row model (usually the same as model)
+- `column_tokenizer`: Name of the tokenizer for the column model (usually the same as model)
+- `use_gpu`: Whether to use GPU or CPU. Falls back on CPU if no GPU is available.
+- `top_k`: The maximum number of answers to return
+- `max_seq_len`: Max sequence length of one input table for the model. If the number of tokens of
+query + table exceed max_seq_len, the table will be truncated by removing rows until the
+input size fits the model.
+
+
+
+#### RCIReader.predict
+
+```python
+def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict
+```
+
+Use loaded RCI models to find answers for a query in the supplied list of Documents
+
+of content_type ``'table'``.
+
+Returns dictionary containing query and list of Answer objects sorted by (desc.) score.
+The existing RCI models on the HF model hub don't allow aggregation; therefore, the answer will always be
+composed of a single cell.
+
+**Arguments**:
+
+- `query`: Query string
+- `documents`: List of Document in which to search for the answer. Documents should be
+of content_type ``'table'``.
+- `top_k`: The maximum number of answers to return
+
+**Returns**:
+
+Dict containing query and answers
+
diff --git a/docs/v1.5.0/_src/api/api/retriever.md b/docs/v1.5.0/_src/api/api/retriever.md
new file mode 100644
index 0000000000..5e9f90f71b
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/retriever.md
@@ -0,0 +1,1553 @@
+
+
+# Module base
+
+
+
+## BaseGraphRetriever
+
+```python
+class BaseGraphRetriever(BaseComponent)
+```
+
+Base class for knowledge graph retrievers.
+
+
+
+## BaseRetriever
+
+```python
+class BaseRetriever(BaseComponent)
+```
+
+Base class for regular retrievers.
+
+
+
+#### BaseRetriever.retrieve
+
+```python
+@abstractmethod
+def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the query.
+
+**Arguments**:
+
+- `query`: The query
+- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
+- `top_k`: How many documents to return per query.
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### BaseRetriever.timing
+
+```python
+def timing(fn, attr_name)
+```
+
+Wrapper method used to time functions.
+
+
+
+#### BaseRetriever.eval
+
+```python
+def eval(label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", top_k: int = 10, open_domain: bool = False, return_preds: bool = False, headers: Optional[Dict[str, str]] = None) -> dict
+```
+
+Performs evaluation on the Retriever.
+
+Retriever is evaluated based on whether it finds the correct document given the query string and at which
+position in the ranking of documents the correct document is.
+
+| Returns a dict containing the following metrics:
+
+ - "recall": Proportion of questions for which correct document is among retrieved documents
+ - "mrr": Mean of reciprocal rank. Rewards retrievers that give relevant documents a higher rank.
+ Only considers the highest ranked relevant document.
+ - "map": Mean of average precision for each question. Rewards retrievers that give relevant
+ documents a higher rank. Considers all retrieved relevant documents. If ``open_domain=True``,
+ average precision is normalized by the number of retrieved relevant documents per query.
+ If ``open_domain=False``, average precision is normalized by the number of all relevant documents
+ per query.
+
+**Arguments**:
+
+- `label_index`: Index/Table in DocumentStore where labeled questions are stored
+- `doc_index`: Index/Table in DocumentStore where documents that are used for evaluation are stored
+- `top_k`: How many documents to return per query
+- `open_domain`: If ``True``, retrieval will be evaluated by checking if the answer string to a question is
+contained in the retrieved docs (common approach in open-domain QA).
+If ``False``, retrieval uses a stricter evaluation that checks if the retrieved document ids
+are within ids explicitly stated in the labels.
+- `return_preds`: Whether to add predictions in the returned dictionary. If True, the returned dictionary
+contains the keys "predictions" and "metrics".
+- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
+
+
+
+# Module sparse
+
+
+
+## BM25Retriever
+
+```python
+class BM25Retriever(BaseRetriever)
+```
+
+
+
+#### BM25Retriever.\_\_init\_\_
+
+```python
+def __init__(document_store: KeywordDocumentStore, top_k: int = 10, all_terms_must_match: bool = False, custom_query: Optional[str] = None, scale_score: bool = True)
+```
+
+**Arguments**:
+
+- `document_store`: an instance of one of the following DocumentStores to retrieve from: ElasticsearchDocumentStore, OpenSearchDocumentStore and OpenDistroElasticsearchDocumentStore
+- `all_terms_must_match`: Whether all terms of the query must match the document.
+If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
+Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
+Defaults to False.
+- `custom_query`: query string as per Elasticsearch DSL with a mandatory query placeholder(query).
+ Optionally, ES `filter` clause can be added where the values of `terms` are placeholders
+ that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..)
+ names must match with the filters dict supplied in self.retrieve().
+ ::
+
+ **An example custom_query:**
+ ```python
+ | {
+ | "size": 10,
+ | "query": {
+ | "bool": {
+ | "should": [{"multi_match": {
+ | "query": ${query}, // mandatory query placeholder
+ | "type": "most_fields",
+ | "fields": ["content", "title"]}}],
+ | "filter": [ // optional custom filters
+ | {"terms": {"year": ${years}}},
+ | {"terms": {"quarter": ${quarters}}},
+ | {"range": {"date": {"gte": ${date}}}}
+ | ],
+ | }
+ | },
+ | }
+ ```
+
+ **For this custom_query, a sample retrieve() could be:**
+ ```python
+| self.retrieve(query="Why did the revenue increase?",
+| filters={"years": ["2019"], "quarters": ["Q1", "Q2"]})
+```
+
+ Optionally, highlighting can be defined by specifying Elasticsearch's highlight settings.
+ See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html.
+ You will find the highlighted output in the returned Document's meta field by key "highlighted".
+ ::
+
+ **Example custom_query with highlighting:**
+ ```python
+ | {
+ | "size": 10,
+ | "query": {
+ | "bool": {
+ | "should": [{"multi_match": {
+ | "query": ${query}, // mandatory query placeholder
+ | "type": "most_fields",
+ | "fields": ["content", "title"]}}],
+ | }
+ | },
+ | "highlight": { // enable highlighting
+ | "fields": { // for fields content and title
+ | "content": {},
+ | "title": {}
+ | }
+ | },
+ | }
+ ```
+
+ **For this custom_query, highlighting info can be accessed by:**
+ ```python
+ | docs = self.retrieve(query="Why did the revenue increase?")
+ | highlighted_content = docs[0].meta["highlighted"]["content"]
+ | highlighted_title = docs[0].meta["highlighted"]["title"]
+ ```
+- `top_k`: How many documents to return per query.
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### BM25Retriever.retrieve
+
+```python
+def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the query.
+
+**Arguments**:
+
+- `query`: The query
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return per query.
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### BM25Retriever.retrieve\_batch
+
+```python
+def retrieve_batch(queries: List[str], filters: Optional[
+ Union[
+ Dict[str, Union[Dict, List, str, int, float, bool]],
+ List[Dict[str, Union[Dict, List, str, int, float, bool]]],
+ ]
+ ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the supplied queries.
+
+Returns a list of lists of Documents (one per query).
+
+**Arguments**:
+
+- `queries`: List of query strings.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return per query.
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+- `batch_size`: Not applicable.
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true similarity scores (e.g. cosine or dot_product) which naturally have a different
+value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+## FilterRetriever
+
+```python
+class FilterRetriever(BM25Retriever)
+```
+
+Naive "Retriever" that returns all documents that match the given filters. No impact of query at all.
+Helpful for benchmarking, testing and if you want to do QA on small documents without an "active" retriever.
+
+
+
+#### FilterRetriever.retrieve
+
+```python
+def retrieve(query: str, filters: dict = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the query.
+
+**Arguments**:
+
+- `query`: Has no effect, can pass in empty string
+- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
+- `top_k`: Has no effect, pass in any int or None
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
+Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+## TfidfRetriever
+
+```python
+class TfidfRetriever(BaseRetriever)
+```
+
+Read all documents from a SQL backend.
+
+Split documents into smaller units (eg, paragraphs or pages) to reduce the
+computations when text is passed on to a Reader for QA.
+
+It uses sklearn's TfidfVectorizer to compute a tf-idf matrix.
+
+
+
+#### TfidfRetriever.\_\_init\_\_
+
+```python
+def __init__(document_store: BaseDocumentStore, top_k: int = 10, auto_fit=True)
+```
+
+**Arguments**:
+
+- `document_store`: an instance of a DocumentStore to retrieve documents from.
+- `top_k`: How many documents to return per query.
+- `auto_fit`: Whether to automatically update tf-idf matrix by calling fit() after new documents have been added
+
+
+
+#### TfidfRetriever.retrieve
+
+```python
+def retrieve(query: str, filters: Optional[
+ Union[
+ Dict[str, Union[Dict, List, str, int, float, bool]],
+ List[Dict[str, Union[Dict, List, str, int, float, bool]]],
+ ]
+ ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the query.
+
+**Arguments**:
+
+- `query`: The query
+- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
+- `top_k`: How many documents to return per query.
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### TfidfRetriever.retrieve\_batch
+
+```python
+def retrieve_batch(queries: List[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the supplied queries.
+
+Returns a list of lists of Documents (one per query).
+
+**Arguments**:
+
+- `queries`: Single query string or list of queries.
+- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
+- `top_k`: How many documents to return per query.
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `batch_size`: Not applicable.
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true similarity scores (e.g. cosine or dot_product) which naturally have a different
+value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### TfidfRetriever.fit
+
+```python
+def fit()
+```
+
+Performing training on this class according to the TF-IDF algorithm.
+
+
+
+# Module dense
+
+
+
+## DensePassageRetriever
+
+```python
+class DensePassageRetriever(BaseRetriever)
+```
+
+Retriever that uses a bi-encoder (one transformer for query, one transformer for passage).
+See the original paper for more details:
+Karpukhin, Vladimir, et al. (2020): "Dense Passage Retrieval for Open-Domain Question Answering."
+(https://arxiv.org/abs/2004.04906).
+
+
+
+#### DensePassageRetriever.\_\_init\_\_
+
+```python
+def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True)
+```
+
+Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
+
+The checkpoint format matches huggingface transformers' model format
+
+**Example:**
+
+ ```python
+ | # remote model from FAIR
+ | DensePassageRetriever(document_store=your_doc_store,
+ | query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
+ | passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base")
+ | # or from local path
+ | DensePassageRetriever(document_store=your_doc_store,
+ | query_embedding_model="model_directory/question-encoder",
+ | passage_embedding_model="model_directory/context-encoder")
+ ```
+
+**Arguments**:
+
+- `document_store`: An instance of DocumentStore from which to retrieve documents.
+- `query_embedding_model`: Local path or remote name of question encoder checkpoint. The format equals the
+one used by hugging-face transformers' modelhub models
+Currently available remote names: ``"facebook/dpr-question_encoder-single-nq-base"``
+- `passage_embedding_model`: Local path or remote name of passage encoder checkpoint. The format equals the
+one used by hugging-face transformers' modelhub models
+Currently available remote names: ``"facebook/dpr-ctx_encoder-single-nq-base"``
+- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
+- `max_seq_len_query`: Longest length of each query sequence. Maximum number of tokens for the query text. Longer ones will be cut down.
+- `max_seq_len_passage`: Longest length of each passage/context sequence. Maximum number of tokens for the passage text. Longer ones will be cut down.
+- `top_k`: How many documents to return per query.
+- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available.
+- `batch_size`: Number of questions or passages to encode at once. In case of multiple gpus, this will be the total batch size.
+- `embed_title`: Whether to concatenate title and passage to a text pair that is then used to create the embedding.
+This is the approach used in the original paper and is likely to improve performance if your
+titles contain meaningful information for retrieval (topic, entities etc.) .
+The title is expected to be present in doc.meta["name"] and can be supplied in the documents
+before writing them to the DocumentStore like this:
+{"text": "my text", "meta": {"name": "my title"}}.
+- `use_fast_tokenizers`: Whether to use fast Rust tokenizers
+- `infer_tokenizer_classes`: Whether to infer tokenizer class from the model config / name.
+If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`.
+- `similarity_function`: Which function to apply for calculating the similarity of query and passage embeddings during training.
+Options: `dot_product` (Default) or `cosine`
+- `global_loss_buffer_size`: Buffer size for all_gather() in DDP.
+Increase if errors like "encoded data exceeds max_size ..." come up
+- `progress_bar`: Whether to show a tqdm progress bar or not.
+Can be helpful to disable in production deployments to keep the logs clean.
+- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones
+These strings will be converted into pytorch devices, so use the string notation described here:
+https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device
+(e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for DPR, training
+will only use the first device provided in this list.
+- `use_auth_token`: API token used to download private models from Huggingface. If this parameter is set to `True`,
+the local token will be used, which must be previously created via `transformer-cli login`.
+Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### DensePassageRetriever.retrieve
+
+```python
+def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the query.
+
+**Arguments**:
+
+- `query`: The query
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return per query.
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### DensePassageRetriever.retrieve\_batch
+
+```python
+def retrieve_batch(queries: List[str], filters: Optional[
+ Union[
+ Dict[str, Union[Dict, List, str, int, float, bool]],
+ List[Dict[str, Union[Dict, List, str, int, float, bool]]],
+ ]
+ ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the supplied queries.
+
+Returns a list of lists of Documents (one per query).
+
+**Arguments**:
+
+- `queries`: List of query strings.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions. Can be a single filter that will be applied to each query or a list of filters
+(one filter per query).
+
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return per query.
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `batch_size`: Number of queries to embed at a time.
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true similarity scores (e.g. cosine or dot_product) which naturally have a different
+value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### DensePassageRetriever.embed\_queries
+
+```python
+def embed_queries(texts: List[str]) -> List[np.ndarray]
+```
+
+Create embeddings for a list of queries using the query encoder
+
+**Arguments**:
+
+- `texts`: Queries to embed
+
+**Returns**:
+
+Embeddings, one per input queries
+
+
+
+#### DensePassageRetriever.embed\_documents
+
+```python
+def embed_documents(docs: List[Document]) -> List[np.ndarray]
+```
+
+Create embeddings for a list of documents using the passage encoder
+
+**Arguments**:
+
+- `docs`: List of Document objects used to represent documents / passages in a standardized way within Haystack.
+
+**Returns**:
+
+Embeddings of documents / passages shape (batch_size, embedding_dim)
+
+
+
+#### DensePassageRetriever.train
+
+```python
+def train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_samples: int = None, max_processes: int = 128, multiprocessing_strategy: Optional[str] = None, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "AdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder", checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3)
+```
+
+Train a DensePassageRetriever model
+
+**Arguments**:
+
+- `data_dir`: Directory where training file, dev file and test file are present
+- `train_filename`: training filename
+- `dev_filename`: development set filename, file to be used by model in eval step of training
+- `test_filename`: test set filename, file to be used by model in test step after training
+- `max_samples`: maximum number of input samples to convert. Can be used for debugging a smaller dataset.
+- `max_processes`: the maximum number of processes to spawn in the multiprocessing.Pool used in DataSilo.
+It can be set to 1 to disable the use of multiprocessing or make debugging easier.
+- `multiprocessing_strategy`: Set the multiprocessing sharing strategy, this can be one of file_descriptor/file_system depending on your OS.
+If your system has low limits for the number of open file descriptors, and you can’t raise them,
+you should use the file_system strategy.
+- `dev_split`: The proportion of the train set that will be sliced. Only works if dev_filename is set to None
+- `batch_size`: total number of samples in 1 batch of data
+- `embed_title`: whether to concatenate passage title with each passage. The default setting in official DPR embeds passage title with the corresponding passage
+- `num_hard_negatives`: number of hard negative passages (passages which are very similar (high score by BM25) to query but do not contain the answer)
+- `num_positives`: number of positive passages
+- `n_epochs`: number of epochs to train the model on
+- `evaluate_every`: number of training steps after evaluation is run
+- `n_gpu`: number of gpus to train on
+- `learning_rate`: learning rate of optimizer
+- `epsilon`: epsilon parameter of optimizer
+- `weight_decay`: weight decay parameter of optimizer
+- `grad_acc_steps`: number of steps to accumulate gradient over before back-propagation is done
+- `use_amp`: Whether to use automatic mixed precision (AMP) or not. The options are:
+"O0" (FP32)
+"O1" (Mixed Precision)
+"O2" (Almost FP16)
+"O3" (Pure FP16).
+For more information, refer to: https://nvidia.github.io/apex/amp.html
+- `optimizer_name`: what optimizer to use (default: AdamW)
+- `num_warmup_steps`: number of warmup steps
+- `optimizer_correct_bias`: Whether to correct bias in optimizer
+- `save_dir`: directory where models are saved
+- `query_encoder_save_dir`: directory inside save_dir where query_encoder model files are saved
+- `passage_encoder_save_dir`: directory inside save_dir where passage_encoder model files are saved
+Checkpoints can be stored via setting `checkpoint_every` to a custom number of steps.
+If any checkpoints are stored, a subsequent run of train() will resume training from the latest available checkpoint.
+
+
+
+#### DensePassageRetriever.save
+
+```python
+def save(save_dir: Union[Path, str], query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder")
+```
+
+Save DensePassageRetriever to the specified directory.
+
+**Arguments**:
+
+- `save_dir`: Directory to save to.
+- `query_encoder_dir`: Directory in save_dir that contains query encoder model.
+- `passage_encoder_dir`: Directory in save_dir that contains passage encoder model.
+
+**Returns**:
+
+None
+
+
+
+#### DensePassageRetriever.load
+
+```python
+@classmethod
+def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", infer_tokenizer_classes: bool = False)
+```
+
+Load DensePassageRetriever from the specified directory.
+
+
+
+## TableTextRetriever
+
+```python
+class TableTextRetriever(BaseRetriever)
+```
+
+Retriever that uses a tri-encoder to jointly retrieve among a database consisting of text passages and tables
+(one transformer for query, one transformer for text passages, one transformer for tables).
+See the original paper for more details:
+Kostić, Bogdan, et al. (2021): "Multi-modal Retrieval of Tables and Texts Using Tri-encoder Models"
+(https://arxiv.org/abs/2108.04049).
+
+
+
+#### TableTextRetriever.\_\_init\_\_
+
+```python
+def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True)
+```
+
+Init the Retriever incl. the three encoder models from a local or remote model checkpoint.
+
+The checkpoint format matches huggingface transformers' model format
+
+**Arguments**:
+
+- `document_store`: An instance of DocumentStore from which to retrieve documents.
+- `query_embedding_model`: Local path or remote name of question encoder checkpoint. The format equals the
+one used by hugging-face transformers' modelhub models.
+- `passage_embedding_model`: Local path or remote name of passage encoder checkpoint. The format equals the
+one used by hugging-face transformers' modelhub models.
+- `table_embedding_model`: Local path or remote name of table encoder checkpoint. The format equals the
+one used by hugging-face transformers' modelhub models.
+- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
+- `max_seq_len_query`: Longest length of each query sequence. Maximum number of tokens for the query text. Longer ones will be cut down.
+- `max_seq_len_passage`: Longest length of each passage/context sequence. Maximum number of tokens for the passage text. Longer ones will be cut down.
+- `top_k`: How many documents to return per query.
+- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available.
+- `batch_size`: Number of questions or passages to encode at once. In case of multiple gpus, this will be the total batch size.
+- `embed_meta_fields`: Concatenate the provided meta fields and text passage / table to a text pair that is
+then used to create the embedding.
+This is the approach used in the original paper and is likely to improve
+performance if your titles contain meaningful information for retrieval
+(topic, entities etc.).
+- `use_fast_tokenizers`: Whether to use fast Rust tokenizers
+- `infer_tokenizer_classes`: Whether to infer tokenizer class from the model config / name.
+If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`.
+- `similarity_function`: Which function to apply for calculating the similarity of query and passage embeddings during training.
+Options: `dot_product` (Default) or `cosine`
+- `global_loss_buffer_size`: Buffer size for all_gather() in DDP.
+Increase if errors like "encoded data exceeds max_size ..." come up
+- `progress_bar`: Whether to show a tqdm progress bar or not.
+Can be helpful to disable in production deployments to keep the logs clean.
+- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones
+These strings will be converted into pytorch devices, so use the string notation described here:
+https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device
+(e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for TableTextRetriever,
+training will only use the first device provided in this list.
+- `use_auth_token`: API token used to download private models from Huggingface. If this parameter is set to `True`,
+the local token will be used, which must be previously created via `transformers-cli login`.
+Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### TableTextRetriever.retrieve\_batch
+
+```python
+def retrieve_batch(queries: List[str], filters: Optional[
+ Union[
+ Dict[str, Union[Dict, List, str, int, float, bool]],
+ List[Dict[str, Union[Dict, List, str, int, float, bool]]],
+ ]
+ ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the supplied queries.
+
+Returns a list of lists of Documents (one per query).
+
+**Arguments**:
+
+- `queries`: List of query strings.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions. Can be a single filter that will be applied to each query or a list of filters
+(one filter per query).
+
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return per query.
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `batch_size`: Number of queries to embed at a time.
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true similarity scores (e.g. cosine or dot_product) which naturally have a different
+value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### TableTextRetriever.embed\_queries
+
+```python
+def embed_queries(texts: List[str]) -> List[np.ndarray]
+```
+
+Create embeddings for a list of queries using the query encoder
+
+**Arguments**:
+
+- `texts`: Queries to embed
+
+**Returns**:
+
+Embeddings, one per input query
+
+
+
+#### TableTextRetriever.embed\_documents
+
+```python
+def embed_documents(docs: List[Document]) -> List[np.ndarray]
+```
+
+Create embeddings for a list of text documents and / or tables using the text passage encoder and
+
+the table encoder.
+
+**Arguments**:
+
+- `docs`: List of Document objects used to represent documents / passages in
+a standardized way within Haystack.
+
+**Returns**:
+
+Embeddings of documents / passages. Shape: (batch_size, embedding_dim)
+
+
+
+#### TableTextRetriever.train
+
+```python
+def train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_samples: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_meta_fields: List[str] = ["page_title", "section_title", "caption"], num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "AdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/mm_retrieval", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder", table_encoder_save_dir: str = "table_encoder", checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3)
+```
+
+Train a TableTextRetrieval model.
+
+**Arguments**:
+
+- `data_dir`: Directory where training file, dev file and test file are present.
+- `train_filename`: Training filename.
+- `dev_filename`: Development set filename, file to be used by model in eval step of training.
+- `test_filename`: Test set filename, file to be used by model in test step after training.
+- `max_samples`: Maximum number of input samples to convert. Can be used for debugging a smaller dataset.
+- `max_processes`: The maximum number of processes to spawn in the multiprocessing.Pool used in DataSilo.
+It can be set to 1 to disable the use of multiprocessing or make debugging easier.
+- `dev_split`: The proportion of the train set that will be sliced. Only works if dev_filename is set to None.
+- `batch_size`: Total number of samples in 1 batch of data.
+- `embed_meta_fields`: Concatenate meta fields with each passage and table.
+The default setting in official MMRetrieval embeds page title,
+section title and caption with the corresponding table and title with
+corresponding text passage.
+- `num_hard_negatives`: Number of hard negative passages (passages which are
+very similar (high score by BM25) to query but do not contain the answer).
+- `num_positives`: Number of positive passages.
+- `n_epochs`: Number of epochs to train the model on.
+- `evaluate_every`: Number of training steps after evaluation is run.
+- `n_gpu`: Number of gpus to train on.
+- `learning_rate`: Learning rate of optimizer.
+- `epsilon`: Epsilon parameter of optimizer.
+- `weight_decay`: Weight decay parameter of optimizer.
+- `grad_acc_steps`: Number of steps to accumulate gradient over before back-propagation is done.
+- `use_amp`: Whether to use automatic mixed precision (AMP) or not. The options are:
+"O0" (FP32)
+"O1" (Mixed Precision)
+"O2" (Almost FP16)
+"O3" (Pure FP16).
+For more information, refer to: https://nvidia.github.io/apex/amp.html
+- `optimizer_name`: What optimizer to use (default: AdamW).
+- `num_warmup_steps`: Number of warmup steps.
+- `optimizer_correct_bias`: Whether to correct bias in optimizer.
+- `save_dir`: Directory where models are saved.
+- `query_encoder_save_dir`: Directory inside save_dir where query_encoder model files are saved.
+- `passage_encoder_save_dir`: Directory inside save_dir where passage_encoder model files are saved.
+- `table_encoder_save_dir`: Directory inside save_dir where table_encoder model files are saved.
+
+
+
+#### TableTextRetriever.save
+
+```python
+def save(save_dir: Union[Path, str], query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder")
+```
+
+Save TableTextRetriever to the specified directory.
+
+**Arguments**:
+
+- `save_dir`: Directory to save to.
+- `query_encoder_dir`: Directory in save_dir that contains query encoder model.
+- `passage_encoder_dir`: Directory in save_dir that contains passage encoder model.
+- `table_encoder_dir`: Directory in save_dir that contains table encoder model.
+
+**Returns**:
+
+None
+
+
+
+#### TableTextRetriever.load
+
+```python
+@classmethod
+def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder", infer_tokenizer_classes: bool = False)
+```
+
+Load TableTextRetriever from the specified directory.
+
+
+
+## EmbeddingRetriever
+
+```python
+class EmbeddingRetriever(BaseRetriever)
+```
+
+
+
+#### EmbeddingRetriever.\_\_init\_\_
+
+```python
+def __init__(document_store: BaseDocumentStore, embedding_model: str, model_version: Optional[str] = None, use_gpu: bool = True, batch_size: int = 32, max_seq_len: int = 512, model_format: Optional[str] = None, pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, embed_meta_fields: List[str] = [])
+```
+
+**Arguments**:
+
+- `document_store`: An instance of DocumentStore from which to retrieve documents.
+- `embedding_model`: Local path or name of model in Hugging Face's model hub such as ``'sentence-transformers/all-MiniLM-L6-v2'``
+- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
+- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available.
+- `batch_size`: Number of documents to encode at once.
+- `max_seq_len`: Longest length of each document sequence. Maximum number of tokens for the document text. Longer ones will be cut down.
+- `model_format`: Name of framework that was used for saving the model or model type. If no model_format is
+provided, it will be inferred automatically from the model configuration files.
+Options:
+
+- ``'farm'`` (will use `_DefaultEmbeddingEncoder` as embedding encoder)
+- ``'transformers'`` (will use `_DefaultEmbeddingEncoder` as embedding encoder)
+- ``'sentence_transformers'`` (will use `_SentenceTransformersEmbeddingEncoder` as embedding encoder)
+- ``'retribert'`` (will use `_RetribertEmbeddingEncoder` as embedding encoder)
+- `pooling_strategy`: Strategy for combining the embeddings from the model (for farm / transformers models only).
+Options:
+
+- ``'cls_token'`` (sentence vector)
+- ``'reduce_mean'`` (sentence vector)
+- ``'reduce_max'`` (sentence vector)
+- ``'per_token'`` (individual token vectors)
+- `emb_extraction_layer`: Number of layer from which the embeddings shall be extracted (for farm / transformers models only).
+Default: -1 (very last layer).
+- `top_k`: How many documents to return per query.
+- `progress_bar`: If true displays progress bar during embedding.
+- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones
+These strings will be converted into pytorch devices, so use the string notation described here:
+https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device
+(e.g. ["cuda:0"]). Note: As multi-GPU training is currently not implemented for EmbeddingRetriever,
+training will only use the first device provided in this list.
+- `use_auth_token`: API token used to download private models from Huggingface. If this parameter is set to `True`,
+the local token will be used, which must be previously created via `transformers-cli login`.
+Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+- `embed_meta_fields`: Concatenate the provided meta fields and text passage / table to a text pair that is
+then used to create the embedding.
+This approach is also used in the TableTextRetriever paper and is likely to improve
+performance if your titles contain meaningful information for retrieval
+(topic, entities etc.).
+
+
+
+#### EmbeddingRetriever.retrieve
+
+```python
+def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the query.
+
+**Arguments**:
+
+- `query`: The query
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return per query.
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### EmbeddingRetriever.retrieve\_batch
+
+```python
+def retrieve_batch(queries: List[str], filters: Optional[
+ Union[
+ Dict[str, Union[Dict, List, str, int, float, bool]],
+ List[Dict[str, Union[Dict, List, str, int, float, bool]]],
+ ]
+ ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
+```
+
+Scan through documents in DocumentStore and return a small number of documents
+
+that are most relevant to the supplied queries.
+
+Returns a list of lists of Documents (one per query).
+
+**Arguments**:
+
+- `queries`: List of query strings.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions. Can be a single filter that will be applied to each query or a list of filters
+(one filter per query).
+
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+- `top_k`: How many documents to return per query.
+- `index`: The name of the index in the DocumentStore from which to retrieve documents
+- `batch_size`: Number of queries to embed at a time.
+- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
+If true similarity scores (e.g. cosine or dot_product) which naturally have a different
+value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+
+
+
+#### EmbeddingRetriever.embed\_queries
+
+```python
+def embed_queries(texts: List[str]) -> List[np.ndarray]
+```
+
+Create embeddings for a list of queries.
+
+**Arguments**:
+
+- `texts`: Queries to embed
+
+**Returns**:
+
+Embeddings, one per input query
+
+
+
+#### EmbeddingRetriever.embed\_documents
+
+```python
+def embed_documents(docs: List[Document]) -> List[np.ndarray]
+```
+
+Create embeddings for a list of documents.
+
+**Arguments**:
+
+- `docs`: List of documents to embed
+
+**Returns**:
+
+Embeddings, one per input document
+
+
+
+#### EmbeddingRetriever.train
+
+```python
+def train(training_data: List[Dict[str, Any]], learning_rate: float = 2e-5, n_epochs: int = 1, num_warmup_steps: int = None, batch_size: int = 16) -> None
+```
+
+Trains/adapts the underlying embedding model.
+
+Each training data example is a dictionary with the following keys:
+
+* question: the question string
+* pos_doc: the positive document string
+* neg_doc: the negative document string
+* score: the score margin
+
+**Arguments**:
+
+- `training_data` (`List[Dict[str, Any]]`): The training data
+- `learning_rate` (`float`): The learning rate
+- `n_epochs` (`int`): The number of epochs
+- `num_warmup_steps` (`int`): The number of warmup steps
+- `batch_size` (`int (optional)`): The batch size to use for the training, defaults to 16
+
+
+
+#### EmbeddingRetriever.save
+
+```python
+def save(save_dir: Union[Path, str]) -> None
+```
+
+Save the model to the given directory
+
+**Arguments**:
+
+- `save_dir` (`Union[Path, str]`): The directory where the model will be saved
+
+
+
+# Module text2sparql
+
+
+
+## Text2SparqlRetriever
+
+```python
+class Text2SparqlRetriever(BaseGraphRetriever)
+```
+
+Graph retriever that uses a pre-trained Bart model to translate natural language questions
+given in text form to queries in SPARQL format.
+The generated SPARQL query is executed on a knowledge graph.
+
+
+
+#### Text2SparqlRetriever.\_\_init\_\_
+
+```python
+def __init__(knowledge_graph, model_name_or_path, top_k: int = 1)
+```
+
+Init the Retriever by providing a knowledge graph and a pre-trained BART model
+
+**Arguments**:
+
+- `knowledge_graph`: An instance of BaseKnowledgeGraph on which to execute SPARQL queries.
+- `model_name_or_path`: Name of or path to a pre-trained BartForConditionalGeneration model.
+- `top_k`: How many SPARQL queries to generate per text query.
+
+
+
+#### Text2SparqlRetriever.retrieve
+
+```python
+def retrieve(query: str, top_k: Optional[int] = None)
+```
+
+Translate a text query to SPARQL and execute it on the knowledge graph to retrieve a list of answers
+
+**Arguments**:
+
+- `query`: Text query that shall be translated to SPARQL and then executed on the knowledge graph
+- `top_k`: How many SPARQL queries to generate per text query.
+
+
+
+#### Text2SparqlRetriever.retrieve\_batch
+
+```python
+def retrieve_batch(queries: List[str], top_k: Optional[int] = None)
+```
+
+Translate a list of queries to SPARQL and execute it on the knowledge graph to retrieve
+
+a list of lists of answers (one per query).
+
+**Arguments**:
+
+- `queries`: List of queries that shall be translated to SPARQL and then executed on the
+knowledge graph.
+- `top_k`: How many SPARQL queries to generate per text query.
+
+
+
+#### Text2SparqlRetriever.format\_result
+
+```python
+def format_result(result)
+```
+
+Generate formatted dictionary output with text answer and additional info
+
+**Arguments**:
+
+- `result`: The result of a SPARQL query as retrieved from the knowledge graph
+
diff --git a/docs/v1.5.0/_src/api/api/summarizer.md b/docs/v1.5.0/_src/api/api/summarizer.md
new file mode 100644
index 0000000000..b000003f09
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/summarizer.md
@@ -0,0 +1,164 @@
+
+
+# Module base
+
+
+
+## BaseSummarizer
+
+```python
+class BaseSummarizer(BaseComponent)
+```
+
+Abstract class for Summarizer
+
+
+
+#### BaseSummarizer.predict
+
+```python
+@abstractmethod
+def predict(documents: List[Document], generate_single_summary: Optional[bool] = None) -> List[Document]
+```
+
+Abstract method for creating a summary.
+
+**Arguments**:
+
+- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on.
+- `generate_single_summary`: Whether to generate a single summary for all documents or one summary per document.
+If set to "True", all docs will be joined to a single string that will then
+be summarized.
+Important: The summary will depend on the order of the supplied documents!
+
+**Returns**:
+
+List of Documents, where Document.text contains the summarization and Document.meta["context"]
+the original, not summarized text
+
+
+
+# Module transformers
+
+
+
+## TransformersSummarizer
+
+```python
+class TransformersSummarizer(BaseSummarizer)
+```
+
+Transformer based model to summarize the documents using the HuggingFace's transformers framework
+
+You can use any model that has been fine-tuned on a summarization task. For example:
+'`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'.
+See the up-to-date list of available models on
+https://huggingface.co/models?filter=summarization
+
+**Example**
+
+```python
+| docs = [Document(text="PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions."
+| "The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by"
+| "the shutoffs which were expected to last through at least midday tomorrow.")]
+|
+| # Summarize
+| summary = summarizer.predict(
+| documents=docs,
+| generate_single_summary=True
+| )
+|
+| # Show results (List of Documents, containing summary and original text)
+| print(summary)
+|
+| [
+| {
+| "text": "California's largest electricity provider has turned off power to hundreds of thousands of customers.",
+| ...
+| "meta": {
+| "context": "PGE stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. ..."
+| },
+| ...
+| },
+```
+
+
+
+#### TransformersSummarizer.\_\_init\_\_
+
+```python
+def __init__(model_name_or_path: str = "google/pegasus-xsum", model_version: Optional[str] = None, tokenizer: Optional[str] = None, max_length: int = 200, min_length: int = 5, use_gpu: bool = True, clean_up_tokenization_spaces: bool = True, separator_for_single_summary: str = " ", generate_single_summary: bool = False, batch_size: Optional[int] = None)
+```
+
+Load a Summarization model from Transformers.
+
+See the up-to-date list of available models at
+https://huggingface.co/models?filter=summarization
+
+**Arguments**:
+
+- `model_name_or_path`: Directory of a saved model or the name of a public model e.g.
+'facebook/rag-token-nq', 'facebook/rag-sequence-nq'.
+See https://huggingface.co/models?filter=summarization for full list of available models.
+- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
+- `tokenizer`: Name of the tokenizer (usually the same as model)
+- `max_length`: Maximum length of summarized text
+- `min_length`: Minimum length of summarized text
+- `use_gpu`: Whether to use GPU (if available).
+- `clean_up_tokenization_spaces`: Whether or not to clean up the potential extra spaces in the text output
+- `separator_for_single_summary`: If `generate_single_summary=True` in `predict()`, we need to join all docs
+into a single text. This separator appears between those subsequent docs.
+- `generate_single_summary`: Whether to generate a single summary for all documents or one summary per document.
+If set to "True", all docs will be joined to a single string that will then
+be summarized.
+Important: The summary will depend on the order of the supplied documents!
+- `batch_size`: Number of documents to process at a time.
+
+
+
+#### TransformersSummarizer.predict
+
+```python
+def predict(documents: List[Document], generate_single_summary: Optional[bool] = None) -> List[Document]
+```
+
+Produce the summarization from the supplied documents.
+
+These documents can for example be retrieved via the Retriever.
+
+**Arguments**:
+
+- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on.
+- `generate_single_summary`: Whether to generate a single summary for all documents or one summary per document.
+If set to "True", all docs will be joined to a single string that will then
+be summarized.
+Important: The summary will depend on the order of the supplied documents!
+
+**Returns**:
+
+List of Documents, where Document.text contains the summarization and Document.meta["context"]
+the original, not summarized text
+
+
+
+#### TransformersSummarizer.predict\_batch
+
+```python
+def predict_batch(documents: Union[List[Document], List[List[Document]]], generate_single_summary: Optional[bool] = None, batch_size: Optional[int] = None) -> Union[List[Document], List[List[Document]]]
+```
+
+Produce the summarization from the supplied documents.
+
+These documents can for example be retrieved via the Retriever.
+
+**Arguments**:
+
+- `documents`: Single list of related documents or list of lists of related documents
+(e.g. coming from a retriever) that the answer shall be conditioned on.
+- `generate_single_summary`: Whether to generate a single summary for each provided document list or
+one summary per document.
+If set to "True", all docs of a document list will be joined to a single string
+that will then be summarized.
+Important: The summary will depend on the order of the supplied documents!
+- `batch_size`: Number of Documents to process at a time.
+
diff --git a/docs/v1.5.0/_src/api/api/translator.md b/docs/v1.5.0/_src/api/api/translator.md
new file mode 100644
index 0000000000..2a1d1759b8
--- /dev/null
+++ b/docs/v1.5.0/_src/api/api/translator.md
@@ -0,0 +1,129 @@
+
+
+# Module base
+
+
+
+## BaseTranslator
+
+```python
+class BaseTranslator(BaseComponent)
+```
+
+Abstract class for a Translator component that translates either a query or a doc from language A to language B.
+
+
+
+#### BaseTranslator.translate
+
+```python
+@abstractmethod
+def translate(results: List[Dict[str, Any]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]]
+```
+
+Translate the passed query or a list of documents from language A to B.
+
+
+
+#### BaseTranslator.run
+
+```python
+def run(results: List[Dict[str, Any]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, answers: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None)
+```
+
+Method that gets executed when this class is used as a Node in a Haystack Pipeline
+
+
+
+# Module transformers
+
+
+
+## TransformersTranslator
+
+```python
+class TransformersTranslator(BaseTranslator)
+```
+
+Translator component based on Seq2Seq models from Huggingface's transformers library.
+Exemplary use cases:
+- Translate a query from Language A to B (e.g. if you only have good models + documents in language B)
+- Translate a document from Language A to B (e.g. if you want to return results in the native language of the user)
+
+We currently recommend using OPUS models (see __init__() for details)
+
+**Example:**
+
+```python
+| DOCS = [
+| Document(content="Heinz von Foerster was an Austrian American scientist combining physics and philosophy,
+| and widely attributed as the originator of Second-order cybernetics.")
+| ]
+| translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de")
+| res = translator.translate(documents=DOCS, query=None)
+```
+
+
+
+#### TransformersTranslator.\_\_init\_\_
+
+```python
+def __init__(model_name_or_path: str, tokenizer_name: Optional[str] = None, max_seq_len: Optional[int] = None, clean_up_tokenization_spaces: Optional[bool] = True, use_gpu: bool = True)
+```
+
+Initialize the translator with a model that fits your targeted languages. While we support all seq2seq
+
+models from Hugging Face's model hub, we recommend using the OPUS models from Helsinki NLP. They provide plenty
+of different models, usually one model per language pair and translation direction.
+They have a pretty standardized naming that should help you find the right model:
+- "Helsinki-NLP/opus-mt-en-de" => translating from English to German
+- "Helsinki-NLP/opus-mt-de-en" => translating from German to English
+- "Helsinki-NLP/opus-mt-fr-en" => translating from French to English
+- "Helsinki-NLP/opus-mt-hi-en" => translating from Hindi to English
+...
+
+They also have a few multilingual models that support multiple languages at once.
+
+**Arguments**:
+
+- `model_name_or_path`: Name of the seq2seq model that shall be used for translation.
+Can be a remote name from Huggingface's modelhub or a local path.
+- `tokenizer_name`: Optional tokenizer name. If not supplied, `model_name_or_path` will also be used for the
+tokenizer.
+- `max_seq_len`: The maximum sentence length the model accepts. (Optional)
+- `clean_up_tokenization_spaces`: Whether or not to clean up the tokenization spaces. (default True)
+- `use_gpu`: Whether to use GPU or the CPU. Falls back on CPU if no GPU is available.
+
+
+
+#### TransformersTranslator.translate
+
+```python
+def translate(results: List[Dict[str, Any]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]]
+```
+
+Run the actual translation. You can supply a query or a list of documents. Whatever is supplied will be translated.
+
+**Arguments**:
+
+- `results`: Generated QA pairs to translate
+- `query`: The query string to translate
+- `documents`: The documents to translate
+- `dict_key`: If you pass a dictionary in `documents`, you can specify here the field which shall be translated.
+
+
+
+#### TransformersTranslator.translate\_batch
+
+```python
+def translate_batch(queries: Optional[List[str]] = None, documents: Optional[Union[List[Document], List[Answer], List[List[Document]], List[List[Answer]]]] = None, batch_size: Optional[int] = None) -> Union[str, List[str], List[Document], List[Answer], List[List[Document]], List[List[Answer]]]
+```
+
+Run the actual translation. You can supply a single query, a list of queries or a list (of lists) of documents.
+
+**Arguments**:
+
+- `queries`: Single query or list of queries.
+- `documents`: List of documents or list of lists of documents.
+- `batch_size`: Not applicable.
+
diff --git a/docs/v1.5.0/_src/api/conf.py b/docs/v1.5.0/_src/api/conf.py
new file mode 100644
index 0000000000..46046eccc0
--- /dev/null
+++ b/docs/v1.5.0/_src/api/conf.py
@@ -0,0 +1,52 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import sphinx_rtd_theme
+import os
+import sys
+
+sys.path.append("/Users/deepset/deepset/haystack")
+
+
+# -- Project information -----------------------------------------------------
+
+project = "Haystack"
+copyright = "2020, deepset"
+author = "deepset"
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ["sphinx.ext.autodoc", "sphinx_rtd_theme"]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = "sphinx_rtd_theme"
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
diff --git a/docs/v1.5.0/_src/api/img/annotation_tool.png b/docs/v1.5.0/_src/api/img/annotation_tool.png
new file mode 100644
index 0000000000..eb2c601d9e
Binary files /dev/null and b/docs/v1.5.0/_src/api/img/annotation_tool.png differ
diff --git a/docs/v1.5.0/_src/api/img/code_snippet_usage.png b/docs/v1.5.0/_src/api/img/code_snippet_usage.png
new file mode 100644
index 0000000000..e7d836bd9c
Binary files /dev/null and b/docs/v1.5.0/_src/api/img/code_snippet_usage.png differ
diff --git a/docs/v1.5.0/_src/api/img/colab_gpu_runtime.jpg b/docs/v1.5.0/_src/api/img/colab_gpu_runtime.jpg
new file mode 100644
index 0000000000..883180b97e
Binary files /dev/null and b/docs/v1.5.0/_src/api/img/colab_gpu_runtime.jpg differ
diff --git a/docs/v1.5.0/_src/api/img/sketched_concepts_white.png b/docs/v1.5.0/_src/api/img/sketched_concepts_white.png
new file mode 100644
index 0000000000..9fe5fd5c94
Binary files /dev/null and b/docs/v1.5.0/_src/api/img/sketched_concepts_white.png differ
diff --git a/docs/v1.5.0/_src/api/index.rst b/docs/v1.5.0/_src/api/index.rst
new file mode 100644
index 0000000000..42ff660913
--- /dev/null
+++ b/docs/v1.5.0/_src/api/index.rst
@@ -0,0 +1,16 @@
+.. Haystack documentation master file, created by
+ sphinx-quickstart on Tue Jul 28 14:14:55 2020.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Contents:
+
+ api/database
+ api/retriever
+ api/reader
+ api/indexing
+ api/rest_api
+ api/file_converters
+ api/finder
diff --git a/docs/v1.5.0/_src/api/make.bat b/docs/v1.5.0/_src/api/make.bat
new file mode 100644
index 0000000000..2119f51099
--- /dev/null
+++ b/docs/v1.5.0/_src/api/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/v1.5.0/_src/api/openapi/openapi-1.2.0.json b/docs/v1.5.0/_src/api/openapi/openapi-1.2.0.json
new file mode 100644
index 0000000000..36971bd89f
--- /dev/null
+++ b/docs/v1.5.0/_src/api/openapi/openapi-1.2.0.json
@@ -0,0 +1,834 @@
+{
+ "openapi": "3.0.2",
+ "info": {
+ "title": "Haystack REST API",
+ "version": "1.2.0"
+ },
+ "paths": {
+ "/initialized": {
+ "get": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Check Status",
+ "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.",
+ "operationId": "check_status_initialized_get",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/hs_version": {
+ "get": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Haystack Version",
+ "description": "Get the running Haystack version.",
+ "operationId": "haystack_version_hs_version_get",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/query": {
+ "post": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Query",
+ "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.",
+ "operationId": "query_query_post",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/QueryRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/QueryResponse"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/feedback": {
+ "get": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Get Feedback",
+ "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.",
+ "operationId": "get_feedback_feedback_get",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ },
+ "post": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Post Feedback",
+ "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.",
+ "operationId": "post_feedback_feedback_post",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Feedback",
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/LabelSerialized"
+ },
+ {
+ "$ref": "#/components/schemas/CreateLabelSerialized"
+ }
+ ]
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ },
+ "delete": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Delete Feedback",
+        "description": "This endpoint allows the API user to delete all the\nfeedback that has been submitted through the\n`POST /feedback` endpoint",
+ "operationId": "delete_feedback_feedback_delete",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/eval-feedback": {
+ "post": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Get Feedback Metrics",
+ "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`",
+ "operationId": "get_feedback_metrics_eval_feedback_post",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ }
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/export-feedback": {
+ "get": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Export Feedback",
+ "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.",
+ "operationId": "export_feedback_export_feedback_get",
+ "parameters": [
+ {
+ "required": false,
+ "schema": {
+ "title": "Context Size",
+ "type": "integer",
+ "default": 100000
+ },
+ "name": "context_size",
+ "in": "query"
+ },
+ {
+ "required": false,
+ "schema": {
+ "title": "Full Document Context",
+ "type": "boolean",
+ "default": true
+ },
+ "name": "full_document_context",
+ "in": "query"
+ },
+ {
+ "required": false,
+ "schema": {
+ "title": "Only Positive Labels",
+ "type": "boolean",
+ "default": false
+ },
+ "name": "only_positive_labels",
+ "in": "query"
+ }
+ ],
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/file-upload": {
+ "post": {
+ "tags": [
+ "file-upload"
+ ],
+ "summary": "Upload File",
+ "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).",
+ "operationId": "upload_file_file_upload_post",
+ "requestBody": {
+ "content": {
+ "multipart/form-data": {
+ "schema": {
+ "$ref": "#/components/schemas/Body_upload_file_file_upload_post"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/documents/get_by_filters": {
+ "post": {
+ "tags": [
+ "document"
+ ],
+ "summary": "Get Documents",
+ "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`",
+ "operationId": "get_documents_documents_get_by_filters_post",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Get Documents Documents Get By Filters Post",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/DocumentSerialized"
+ }
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/documents/delete_by_filters": {
+ "post": {
+ "tags": [
+ "document"
+ ],
+ "summary": "Delete Documents",
+ "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`",
+ "operationId": "delete_documents_documents_delete_by_filters_post",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Delete Documents Documents Delete By Filters Post",
+ "type": "boolean"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "components": {
+ "schemas": {
+ "AnswerSerialized": {
+ "title": "AnswerSerialized",
+ "required": [
+ "answer"
+ ],
+ "type": "object",
+ "properties": {
+ "answer": {
+ "title": "Answer",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "enum": [
+ "generative",
+ "extractive",
+ "other"
+ ],
+ "type": "string",
+ "default": "extractive"
+ },
+ "score": {
+ "title": "Score",
+ "type": "number"
+ },
+ "context": {
+ "title": "Context",
+ "type": "string"
+ },
+ "offsets_in_document": {
+ "title": "Offsets In Document",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Span"
+ }
+ },
+ "offsets_in_context": {
+ "title": "Offsets In Context",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Span"
+ }
+ },
+ "document_id": {
+ "title": "Document Id",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ }
+ }
+ },
+ "Body_upload_file_file_upload_post": {
+ "title": "Body_upload_file_file_upload_post",
+ "required": [
+ "files"
+ ],
+ "type": "object",
+ "properties": {
+ "files": {
+ "title": "Files",
+ "type": "array",
+ "items": {
+ "type": "string",
+ "format": "binary"
+ }
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "string",
+ "default": "null"
+ },
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables"
+ },
+ "valid_languages": {
+ "title": "Valid Languages"
+ },
+ "clean_whitespace": {
+ "title": "Clean Whitespace"
+ },
+ "clean_empty_lines": {
+ "title": "Clean Empty Lines"
+ },
+ "clean_header_footer": {
+ "title": "Clean Header Footer"
+ },
+ "split_by": {
+ "title": "Split By"
+ },
+ "split_length": {
+ "title": "Split Length"
+ },
+ "split_overlap": {
+ "title": "Split Overlap"
+ },
+ "split_respect_sentence_boundary": {
+ "title": "Split Respect Sentence Boundary"
+ }
+ }
+ },
+ "CreateLabelSerialized": {
+ "title": "CreateLabelSerialized",
+ "required": [
+ "query",
+ "document",
+ "is_correct_answer",
+ "is_correct_document",
+ "origin"
+ ],
+ "type": "object",
+ "properties": {
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "document": {
+ "$ref": "#/components/schemas/DocumentSerialized"
+ },
+ "is_correct_answer": {
+ "title": "Is Correct Answer",
+ "type": "boolean"
+ },
+ "is_correct_document": {
+ "title": "Is Correct Document",
+ "type": "boolean"
+ },
+ "origin": {
+ "title": "Origin",
+ "enum": [
+ "user-feedback",
+ "gold-label"
+ ],
+ "type": "string"
+ },
+ "answer": {
+ "$ref": "#/components/schemas/AnswerSerialized"
+ },
+ "no_answer": {
+ "title": "No Answer",
+ "type": "boolean"
+ },
+ "pipeline_id": {
+ "title": "Pipeline Id",
+ "type": "string"
+ },
+ "created_at": {
+ "title": "Created At",
+ "type": "string"
+ },
+ "updated_at": {
+ "title": "Updated At",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "filters": {
+ "title": "Filters",
+ "type": "object"
+ }
+ },
+ "additionalProperties": false
+ },
+ "DocumentSerialized": {
+ "title": "DocumentSerialized",
+ "required": [
+ "content",
+ "content_type",
+ "id",
+ "meta"
+ ],
+ "type": "object",
+ "properties": {
+ "content": {
+ "title": "Content",
+ "type": "string"
+ },
+ "content_type": {
+ "title": "Content Type",
+ "enum": [
+ "text",
+ "table",
+ "image"
+ ],
+ "type": "string"
+ },
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "score": {
+ "title": "Score",
+ "type": "number"
+ },
+ "embedding": {
+ "title": "Embedding",
+ "type": "array",
+ "items": {
+ "type": "number"
+ }
+ },
+ "id_hash_keys": {
+ "title": "Id Hash Keys",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ }
+ },
+ "FilterRequest": {
+ "title": "FilterRequest",
+ "type": "object",
+ "properties": {
+ "filters": {
+ "title": "Filters",
+ "type": "object",
+ "additionalProperties": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ ]
+ }
+ }
+ }
+ },
+ "HTTPValidationError": {
+ "title": "HTTPValidationError",
+ "type": "object",
+ "properties": {
+ "detail": {
+ "title": "Detail",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ValidationError"
+ }
+ }
+ }
+ },
+ "LabelSerialized": {
+ "title": "LabelSerialized",
+ "required": [
+ "id",
+ "query",
+ "document",
+ "is_correct_answer",
+ "is_correct_document",
+ "origin"
+ ],
+ "type": "object",
+ "properties": {
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "document": {
+ "$ref": "#/components/schemas/DocumentSerialized"
+ },
+ "is_correct_answer": {
+ "title": "Is Correct Answer",
+ "type": "boolean"
+ },
+ "is_correct_document": {
+ "title": "Is Correct Document",
+ "type": "boolean"
+ },
+ "origin": {
+ "title": "Origin",
+ "enum": [
+ "user-feedback",
+ "gold-label"
+ ],
+ "type": "string"
+ },
+ "answer": {
+ "$ref": "#/components/schemas/AnswerSerialized"
+ },
+ "no_answer": {
+ "title": "No Answer",
+ "type": "boolean"
+ },
+ "pipeline_id": {
+ "title": "Pipeline Id",
+ "type": "string"
+ },
+ "created_at": {
+ "title": "Created At",
+ "type": "string"
+ },
+ "updated_at": {
+ "title": "Updated At",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "filters": {
+ "title": "Filters",
+ "type": "object"
+ }
+ }
+ },
+ "QueryRequest": {
+ "title": "QueryRequest",
+ "required": [
+ "query"
+ ],
+ "type": "object",
+ "properties": {
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "params": {
+ "title": "Params",
+ "type": "object"
+ },
+ "debug": {
+ "title": "Debug",
+ "type": "boolean",
+ "default": false
+ }
+ },
+ "additionalProperties": false
+ },
+ "QueryResponse": {
+ "title": "QueryResponse",
+ "required": [
+ "query"
+ ],
+ "type": "object",
+ "properties": {
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "answers": {
+ "title": "Answers",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/AnswerSerialized"
+ },
+ "default": []
+ },
+ "documents": {
+ "title": "Documents",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/DocumentSerialized"
+ },
+ "default": []
+ },
+ "_debug": {
+ "title": " Debug",
+ "type": "object"
+ }
+ }
+ },
+ "Span": {
+ "title": "Span",
+ "required": [
+ "start",
+ "end"
+ ],
+ "type": "object",
+ "properties": {
+ "start": {
+ "title": "Start",
+ "type": "integer"
+ },
+ "end": {
+ "title": "End",
+ "type": "integer"
+ }
+ }
+ },
+ "ValidationError": {
+ "title": "ValidationError",
+ "required": [
+ "loc",
+ "msg",
+ "type"
+ ],
+ "type": "object",
+ "properties": {
+ "loc": {
+ "title": "Location",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "msg": {
+ "title": "Message",
+ "type": "string"
+ },
+ "type": {
+ "title": "Error Type",
+ "type": "string"
+ }
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/docs/v1.5.0/_src/api/openapi/openapi-1.2.1rc0.json b/docs/v1.5.0/_src/api/openapi/openapi-1.2.1rc0.json
new file mode 100644
index 0000000000..5958d6a11f
--- /dev/null
+++ b/docs/v1.5.0/_src/api/openapi/openapi-1.2.1rc0.json
@@ -0,0 +1,827 @@
+{
+ "openapi": "3.0.2",
+ "info": {
+ "title": "Haystack REST API",
+ "version": "1.2.1rc0"
+ },
+ "paths": {
+ "/initialized": {
+ "get": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Check Status",
+ "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.",
+ "operationId": "check_status_initialized_get",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/hs_version": {
+ "get": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Haystack Version",
+ "description": "Get the running Haystack version.",
+ "operationId": "haystack_version_hs_version_get",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/query": {
+ "post": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Query",
+ "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.",
+ "operationId": "query_query_post",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/QueryRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/QueryResponse"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/feedback": {
+ "get": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Get Feedback",
+ "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.",
+ "operationId": "get_feedback_feedback_get",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ },
+ "post": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Post Feedback",
+ "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.",
+ "operationId": "post_feedback_feedback_post",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Feedback",
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/LabelSerialized"
+ },
+ {
+ "$ref": "#/components/schemas/CreateLabelSerialized"
+ }
+ ]
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ },
+ "delete": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Delete Feedback",
+        "description": "This endpoint allows the API user to delete all the\nfeedback that has been submitted through the\n`POST /feedback` endpoint",
+ "operationId": "delete_feedback_feedback_delete",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/eval-feedback": {
+ "post": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Get Feedback Metrics",
+ "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`",
+ "operationId": "get_feedback_metrics_eval_feedback_post",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ }
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/export-feedback": {
+ "get": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Export Feedback",
+ "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.",
+ "operationId": "export_feedback_export_feedback_get",
+ "parameters": [
+ {
+ "required": false,
+ "schema": {
+ "title": "Context Size",
+ "type": "integer",
+ "default": 100000
+ },
+ "name": "context_size",
+ "in": "query"
+ },
+ {
+ "required": false,
+ "schema": {
+ "title": "Full Document Context",
+ "type": "boolean",
+ "default": true
+ },
+ "name": "full_document_context",
+ "in": "query"
+ },
+ {
+ "required": false,
+ "schema": {
+ "title": "Only Positive Labels",
+ "type": "boolean",
+ "default": false
+ },
+ "name": "only_positive_labels",
+ "in": "query"
+ }
+ ],
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/file-upload": {
+ "post": {
+ "tags": [
+ "file-upload"
+ ],
+ "summary": "Upload File",
+ "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).",
+ "operationId": "upload_file_file_upload_post",
+ "requestBody": {
+ "content": {
+ "multipart/form-data": {
+ "schema": {
+ "$ref": "#/components/schemas/Body_upload_file_file_upload_post"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/documents/get_by_filters": {
+ "post": {
+ "tags": [
+ "document"
+ ],
+ "summary": "Get Documents",
+ "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to retrieve by metadata (like the document's name),\nor provide an empty JSON object to get all documents.\n\nExample of filters:\n`'{\"filters\": {\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`",
+ "operationId": "get_documents_documents_get_by_filters_post",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Get Documents Documents Get By Filters Post",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/DocumentSerialized"
+ }
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/documents/delete_by_filters": {
+ "post": {
+ "tags": [
+ "document"
+ ],
+ "summary": "Delete Documents",
+ "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo delete all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`",
+ "operationId": "delete_documents_documents_delete_by_filters_post",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Delete Documents Documents Delete By Filters Post",
+ "type": "boolean"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "components": {
+ "schemas": {
+ "AnswerSerialized": {
+ "title": "AnswerSerialized",
+ "required": [
+ "answer"
+ ],
+ "type": "object",
+ "properties": {
+ "answer": {
+ "title": "Answer",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "enum": [
+ "generative",
+ "extractive",
+ "other"
+ ],
+ "type": "string",
+ "default": "extractive"
+ },
+ "score": {
+ "title": "Score",
+ "type": "number"
+ },
+ "context": {
+ "title": "Context",
+ "type": "string"
+ },
+ "offsets_in_document": {
+ "title": "Offsets In Document",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Span"
+ }
+ },
+ "offsets_in_context": {
+ "title": "Offsets In Context",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Span"
+ }
+ },
+ "document_id": {
+ "title": "Document Id",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ }
+ }
+ },
+ "Body_upload_file_file_upload_post": {
+ "title": "Body_upload_file_file_upload_post",
+ "required": [
+ "files"
+ ],
+ "type": "object",
+ "properties": {
+ "files": {
+ "title": "Files",
+ "type": "array",
+ "items": {
+ "type": "string",
+ "format": "binary"
+ }
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "string",
+ "default": "null"
+ },
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables"
+ },
+ "valid_languages": {
+ "title": "Valid Languages"
+ },
+ "clean_whitespace": {
+ "title": "Clean Whitespace"
+ },
+ "clean_empty_lines": {
+ "title": "Clean Empty Lines"
+ },
+ "clean_header_footer": {
+ "title": "Clean Header Footer"
+ },
+ "split_by": {
+ "title": "Split By"
+ },
+ "split_length": {
+ "title": "Split Length"
+ },
+ "split_overlap": {
+ "title": "Split Overlap"
+ },
+ "split_respect_sentence_boundary": {
+ "title": "Split Respect Sentence Boundary"
+ }
+ }
+ },
+ "CreateLabelSerialized": {
+ "title": "CreateLabelSerialized",
+ "required": [
+ "query",
+ "document",
+ "is_correct_answer",
+ "is_correct_document",
+ "origin"
+ ],
+ "type": "object",
+ "properties": {
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "document": {
+ "$ref": "#/components/schemas/DocumentSerialized"
+ },
+ "is_correct_answer": {
+ "title": "Is Correct Answer",
+ "type": "boolean"
+ },
+ "is_correct_document": {
+ "title": "Is Correct Document",
+ "type": "boolean"
+ },
+ "origin": {
+ "title": "Origin",
+ "enum": [
+ "user-feedback",
+ "gold-label"
+ ],
+ "type": "string"
+ },
+ "answer": {
+ "$ref": "#/components/schemas/AnswerSerialized"
+ },
+ "no_answer": {
+ "title": "No Answer",
+ "type": "boolean"
+ },
+ "pipeline_id": {
+ "title": "Pipeline Id",
+ "type": "string"
+ },
+ "created_at": {
+ "title": "Created At",
+ "type": "string"
+ },
+ "updated_at": {
+ "title": "Updated At",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "filters": {
+ "title": "Filters",
+ "type": "object"
+ }
+ },
+ "additionalProperties": false
+ },
+ "DocumentSerialized": {
+ "title": "DocumentSerialized",
+ "required": [
+ "content",
+ "content_type",
+ "id",
+ "meta"
+ ],
+ "type": "object",
+ "properties": {
+ "content": {
+ "title": "Content",
+ "type": "string"
+ },
+ "content_type": {
+ "title": "Content Type",
+ "enum": [
+ "text",
+ "table",
+ "image"
+ ],
+ "type": "string"
+ },
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "score": {
+ "title": "Score",
+ "type": "number"
+ },
+ "embedding": {
+ "title": "Embedding",
+ "type": "array",
+ "items": {
+ "type": "number"
+ }
+ }
+ }
+ },
+ "FilterRequest": {
+ "title": "FilterRequest",
+ "type": "object",
+ "properties": {
+ "filters": {
+ "title": "Filters",
+ "type": "object",
+ "additionalProperties": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ ]
+ }
+ }
+ }
+ },
+ "HTTPValidationError": {
+ "title": "HTTPValidationError",
+ "type": "object",
+ "properties": {
+ "detail": {
+ "title": "Detail",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ValidationError"
+ }
+ }
+ }
+ },
+ "LabelSerialized": {
+ "title": "LabelSerialized",
+ "required": [
+ "id",
+ "query",
+ "document",
+ "is_correct_answer",
+ "is_correct_document",
+ "origin"
+ ],
+ "type": "object",
+ "properties": {
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "document": {
+ "$ref": "#/components/schemas/DocumentSerialized"
+ },
+ "is_correct_answer": {
+ "title": "Is Correct Answer",
+ "type": "boolean"
+ },
+ "is_correct_document": {
+ "title": "Is Correct Document",
+ "type": "boolean"
+ },
+ "origin": {
+ "title": "Origin",
+ "enum": [
+ "user-feedback",
+ "gold-label"
+ ],
+ "type": "string"
+ },
+ "answer": {
+ "$ref": "#/components/schemas/AnswerSerialized"
+ },
+ "no_answer": {
+ "title": "No Answer",
+ "type": "boolean"
+ },
+ "pipeline_id": {
+ "title": "Pipeline Id",
+ "type": "string"
+ },
+ "created_at": {
+ "title": "Created At",
+ "type": "string"
+ },
+ "updated_at": {
+ "title": "Updated At",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "filters": {
+ "title": "Filters",
+ "type": "object"
+ }
+ }
+ },
+ "QueryRequest": {
+ "title": "QueryRequest",
+ "required": [
+ "query"
+ ],
+ "type": "object",
+ "properties": {
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "params": {
+ "title": "Params",
+ "type": "object"
+ },
+ "debug": {
+ "title": "Debug",
+ "type": "boolean",
+ "default": false
+ }
+ },
+ "additionalProperties": false
+ },
+ "QueryResponse": {
+ "title": "QueryResponse",
+ "required": [
+ "query"
+ ],
+ "type": "object",
+ "properties": {
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "answers": {
+ "title": "Answers",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/AnswerSerialized"
+ },
+ "default": []
+ },
+ "documents": {
+ "title": "Documents",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/DocumentSerialized"
+ },
+ "default": []
+ },
+ "_debug": {
+ "title": " Debug",
+ "type": "object"
+ }
+ }
+ },
+ "Span": {
+ "title": "Span",
+ "required": [
+ "start",
+ "end"
+ ],
+ "type": "object",
+ "properties": {
+ "start": {
+ "title": "Start",
+ "type": "integer"
+ },
+ "end": {
+ "title": "End",
+ "type": "integer"
+ }
+ }
+ },
+ "ValidationError": {
+ "title": "ValidationError",
+ "required": [
+ "loc",
+ "msg",
+ "type"
+ ],
+ "type": "object",
+ "properties": {
+ "loc": {
+ "title": "Location",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "msg": {
+ "title": "Message",
+ "type": "string"
+ },
+ "type": {
+ "title": "Error Type",
+ "type": "string"
+ }
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/docs/v1.5.0/_src/api/openapi/openapi-1.3.0.json b/docs/v1.5.0/_src/api/openapi/openapi-1.3.0.json
new file mode 100644
index 0000000000..cd388129fb
--- /dev/null
+++ b/docs/v1.5.0/_src/api/openapi/openapi-1.3.0.json
@@ -0,0 +1,834 @@
+{
+ "openapi": "3.0.2",
+ "info": {
+ "title": "Haystack REST API",
+ "version": "1.3.0"
+ },
+ "paths": {
+ "/initialized": {
+ "get": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Check Status",
+ "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.",
+ "operationId": "check_status_initialized_get",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/hs_version": {
+ "get": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Haystack Version",
+ "description": "Get the running Haystack version.",
+ "operationId": "haystack_version_hs_version_get",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/query": {
+ "post": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Query",
+ "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.",
+ "operationId": "query_query_post",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/QueryRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/QueryResponse"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/feedback": {
+ "get": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Get Feedback",
+ "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.",
+ "operationId": "get_feedback_feedback_get",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ },
+ "post": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Post Feedback",
+ "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.",
+ "operationId": "post_feedback_feedback_post",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Feedback",
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/LabelSerialized"
+ },
+ {
+ "$ref": "#/components/schemas/CreateLabelSerialized"
+ }
+ ]
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ },
+ "delete": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Delete Feedback",
+ "description": "This endpoint allows the API user to delete all the\nfeedback that has been submitted through the\n`POST /feedback` endpoint",
+ "operationId": "delete_feedback_feedback_delete",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/eval-feedback": {
+ "post": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Get Feedback Metrics",
+ "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`",
+ "operationId": "get_feedback_metrics_eval_feedback_post",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ }
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/export-feedback": {
+ "get": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Export Feedback",
+ "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.",
+ "operationId": "export_feedback_export_feedback_get",
+ "parameters": [
+ {
+ "required": false,
+ "schema": {
+ "title": "Context Size",
+ "type": "integer",
+ "default": 100000
+ },
+ "name": "context_size",
+ "in": "query"
+ },
+ {
+ "required": false,
+ "schema": {
+ "title": "Full Document Context",
+ "type": "boolean",
+ "default": true
+ },
+ "name": "full_document_context",
+ "in": "query"
+ },
+ {
+ "required": false,
+ "schema": {
+ "title": "Only Positive Labels",
+ "type": "boolean",
+ "default": false
+ },
+ "name": "only_positive_labels",
+ "in": "query"
+ }
+ ],
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/file-upload": {
+ "post": {
+ "tags": [
+ "file-upload"
+ ],
+ "summary": "Upload File",
+ "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).",
+ "operationId": "upload_file_file_upload_post",
+ "requestBody": {
+ "content": {
+ "multipart/form-data": {
+ "schema": {
+ "$ref": "#/components/schemas/Body_upload_file_file_upload_post"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/documents/get_by_filters": {
+ "post": {
+ "tags": [
+ "document"
+ ],
+ "summary": "Get Documents",
+ "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`",
+ "operationId": "get_documents_documents_get_by_filters_post",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Get Documents Documents Get By Filters Post",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/DocumentSerialized"
+ }
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/documents/delete_by_filters": {
+ "post": {
+ "tags": [
+ "document"
+ ],
+ "summary": "Delete Documents",
+ "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`",
+ "operationId": "delete_documents_documents_delete_by_filters_post",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Delete Documents Documents Delete By Filters Post",
+ "type": "boolean"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "components": {
+ "schemas": {
+ "AnswerSerialized": {
+ "title": "AnswerSerialized",
+ "required": [
+ "answer"
+ ],
+ "type": "object",
+ "properties": {
+ "answer": {
+ "title": "Answer",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "enum": [
+ "generative",
+ "extractive",
+ "other"
+ ],
+ "type": "string",
+ "default": "extractive"
+ },
+ "score": {
+ "title": "Score",
+ "type": "number"
+ },
+ "context": {
+ "title": "Context",
+ "type": "string"
+ },
+ "offsets_in_document": {
+ "title": "Offsets In Document",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Span"
+ }
+ },
+ "offsets_in_context": {
+ "title": "Offsets In Context",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Span"
+ }
+ },
+ "document_id": {
+ "title": "Document Id",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ }
+ }
+ },
+ "Body_upload_file_file_upload_post": {
+ "title": "Body_upload_file_file_upload_post",
+ "required": [
+ "files"
+ ],
+ "type": "object",
+ "properties": {
+ "files": {
+ "title": "Files",
+ "type": "array",
+ "items": {
+ "type": "string",
+ "format": "binary"
+ }
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "string",
+ "default": "null"
+ },
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables"
+ },
+ "valid_languages": {
+ "title": "Valid Languages"
+ },
+ "clean_whitespace": {
+ "title": "Clean Whitespace"
+ },
+ "clean_empty_lines": {
+ "title": "Clean Empty Lines"
+ },
+ "clean_header_footer": {
+ "title": "Clean Header Footer"
+ },
+ "split_by": {
+ "title": "Split By"
+ },
+ "split_length": {
+ "title": "Split Length"
+ },
+ "split_overlap": {
+ "title": "Split Overlap"
+ },
+ "split_respect_sentence_boundary": {
+ "title": "Split Respect Sentence Boundary"
+ }
+ }
+ },
+ "CreateLabelSerialized": {
+ "title": "CreateLabelSerialized",
+ "required": [
+ "query",
+ "document",
+ "is_correct_answer",
+ "is_correct_document",
+ "origin"
+ ],
+ "type": "object",
+ "properties": {
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "document": {
+ "$ref": "#/components/schemas/DocumentSerialized"
+ },
+ "is_correct_answer": {
+ "title": "Is Correct Answer",
+ "type": "boolean"
+ },
+ "is_correct_document": {
+ "title": "Is Correct Document",
+ "type": "boolean"
+ },
+ "origin": {
+ "title": "Origin",
+ "enum": [
+ "user-feedback",
+ "gold-label"
+ ],
+ "type": "string"
+ },
+ "answer": {
+ "$ref": "#/components/schemas/AnswerSerialized"
+ },
+ "no_answer": {
+ "title": "No Answer",
+ "type": "boolean"
+ },
+ "pipeline_id": {
+ "title": "Pipeline Id",
+ "type": "string"
+ },
+ "created_at": {
+ "title": "Created At",
+ "type": "string"
+ },
+ "updated_at": {
+ "title": "Updated At",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "filters": {
+ "title": "Filters",
+ "type": "object"
+ }
+ },
+ "additionalProperties": false
+ },
+ "DocumentSerialized": {
+ "title": "DocumentSerialized",
+ "required": [
+ "content",
+ "content_type",
+ "id",
+ "meta"
+ ],
+ "type": "object",
+ "properties": {
+ "content": {
+ "title": "Content",
+ "type": "string"
+ },
+ "content_type": {
+ "title": "Content Type",
+ "enum": [
+ "text",
+ "table",
+ "image"
+ ],
+ "type": "string"
+ },
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "score": {
+ "title": "Score",
+ "type": "number"
+ },
+ "embedding": {
+ "title": "Embedding",
+ "type": "array",
+ "items": {
+ "type": "number"
+ }
+ },
+ "id_hash_keys": {
+ "title": "Id Hash Keys",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ }
+ },
+ "FilterRequest": {
+ "title": "FilterRequest",
+ "type": "object",
+ "properties": {
+ "filters": {
+ "title": "Filters",
+ "type": "object",
+ "additionalProperties": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ ]
+ }
+ }
+ }
+ },
+ "HTTPValidationError": {
+ "title": "HTTPValidationError",
+ "type": "object",
+ "properties": {
+ "detail": {
+ "title": "Detail",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ValidationError"
+ }
+ }
+ }
+ },
+ "LabelSerialized": {
+ "title": "LabelSerialized",
+ "required": [
+ "id",
+ "query",
+ "document",
+ "is_correct_answer",
+ "is_correct_document",
+ "origin"
+ ],
+ "type": "object",
+ "properties": {
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "document": {
+ "$ref": "#/components/schemas/DocumentSerialized"
+ },
+ "is_correct_answer": {
+ "title": "Is Correct Answer",
+ "type": "boolean"
+ },
+ "is_correct_document": {
+ "title": "Is Correct Document",
+ "type": "boolean"
+ },
+ "origin": {
+ "title": "Origin",
+ "enum": [
+ "user-feedback",
+ "gold-label"
+ ],
+ "type": "string"
+ },
+ "answer": {
+ "$ref": "#/components/schemas/AnswerSerialized"
+ },
+ "no_answer": {
+ "title": "No Answer",
+ "type": "boolean"
+ },
+ "pipeline_id": {
+ "title": "Pipeline Id",
+ "type": "string"
+ },
+ "created_at": {
+ "title": "Created At",
+ "type": "string"
+ },
+ "updated_at": {
+ "title": "Updated At",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "filters": {
+ "title": "Filters",
+ "type": "object"
+ }
+ }
+ },
+ "QueryRequest": {
+ "title": "QueryRequest",
+ "required": [
+ "query"
+ ],
+ "type": "object",
+ "properties": {
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "params": {
+ "title": "Params",
+ "type": "object"
+ },
+ "debug": {
+ "title": "Debug",
+ "type": "boolean",
+ "default": false
+ }
+ },
+ "additionalProperties": false
+ },
+ "QueryResponse": {
+ "title": "QueryResponse",
+ "required": [
+ "query"
+ ],
+ "type": "object",
+ "properties": {
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "answers": {
+ "title": "Answers",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/AnswerSerialized"
+ },
+ "default": []
+ },
+ "documents": {
+ "title": "Documents",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/DocumentSerialized"
+ },
+ "default": []
+ },
+ "_debug": {
+ "title": " Debug",
+ "type": "object"
+ }
+ }
+ },
+ "Span": {
+ "title": "Span",
+ "required": [
+ "start",
+ "end"
+ ],
+ "type": "object",
+ "properties": {
+ "start": {
+ "title": "Start",
+ "type": "integer"
+ },
+ "end": {
+ "title": "End",
+ "type": "integer"
+ }
+ }
+ },
+ "ValidationError": {
+ "title": "ValidationError",
+ "required": [
+ "loc",
+ "msg",
+ "type"
+ ],
+ "type": "object",
+ "properties": {
+ "loc": {
+ "title": "Location",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "msg": {
+ "title": "Message",
+ "type": "string"
+ },
+ "type": {
+ "title": "Error Type",
+ "type": "string"
+ }
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/docs/v1.5.0/_src/api/openapi/openapi-1.3.1rc0.json b/docs/v1.5.0/_src/api/openapi/openapi-1.3.1rc0.json
new file mode 100644
index 0000000000..8fff7c9626
--- /dev/null
+++ b/docs/v1.5.0/_src/api/openapi/openapi-1.3.1rc0.json
@@ -0,0 +1,892 @@
+{
+ "openapi": "3.0.2",
+ "info": {
+ "title": "Haystack REST API",
+ "version": "1.3.1rc0"
+ },
+ "paths": {
+ "/initialized": {
+ "get": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Check Status",
+ "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.",
+ "operationId": "check_status",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/hs_version": {
+ "get": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Haystack Version",
+ "description": "Get the running Haystack version.",
+ "operationId": "haystack_version",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/query": {
+ "post": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Query",
+ "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.",
+ "operationId": "query",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/QueryRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/QueryResponse"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/feedback": {
+ "get": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Get Feedback",
+ "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.",
+ "operationId": "get_feedback",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Get Feedback Feedback Get",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Label"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "post": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Post Feedback",
+ "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.",
+ "operationId": "post_feedback",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Feedback",
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/Label"
+ },
+ {
+ "$ref": "#/components/schemas/CreateLabelSerialized"
+ }
+ ]
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ },
+ "delete": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Delete Feedback",
+ "description": "This endpoint allows the API user to delete all the\nfeedback that has been submitted through the\n`POST /feedback` endpoint",
+ "operationId": "delete_feedback",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/eval-feedback": {
+ "post": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Get Feedback Metrics",
+ "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`",
+ "operationId": "get_feedback_metrics",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ }
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/export-feedback": {
+ "get": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Export Feedback",
+ "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.",
+ "operationId": "export_feedback",
+ "parameters": [
+ {
+ "required": false,
+ "schema": {
+ "title": "Context Size",
+ "type": "integer",
+ "default": 100000
+ },
+ "name": "context_size",
+ "in": "query"
+ },
+ {
+ "required": false,
+ "schema": {
+ "title": "Full Document Context",
+ "type": "boolean",
+ "default": true
+ },
+ "name": "full_document_context",
+ "in": "query"
+ },
+ {
+ "required": false,
+ "schema": {
+ "title": "Only Positive Labels",
+ "type": "boolean",
+ "default": false
+ },
+ "name": "only_positive_labels",
+ "in": "query"
+ }
+ ],
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/file-upload": {
+ "post": {
+ "tags": [
+ "file-upload"
+ ],
+ "summary": "Upload File",
+ "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).",
+ "operationId": "upload_file",
+ "requestBody": {
+ "content": {
+ "multipart/form-data": {
+ "schema": {
+ "$ref": "#/components/schemas/Body_upload_file_file_upload_post"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/documents/get_by_filters": {
+ "post": {
+ "tags": [
+ "document"
+ ],
+ "summary": "Get Documents",
+ "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to retrieve by metadata (like the document's name),\nor provide an empty JSON object to retrieve all documents.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`",
+ "operationId": "get_documents",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Get Documents Documents Get By Filters Post",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Document"
+ }
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/documents/delete_by_filters": {
+ "post": {
+ "tags": [
+ "document"
+ ],
+ "summary": "Delete Documents",
+ "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`",
+ "operationId": "delete_documents",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Delete Documents Documents Delete By Filters Post",
+ "type": "boolean"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "components": {
+ "schemas": {
+ "Answer": {
+ "title": "Answer",
+ "required": [
+ "answer"
+ ],
+ "type": "object",
+ "properties": {
+ "answer": {
+ "title": "Answer",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "enum": [
+ "generative",
+ "extractive",
+ "other"
+ ],
+ "type": "string",
+ "default": "extractive"
+ },
+ "score": {
+ "title": "Score",
+ "type": "number"
+ },
+ "context": {
+ "title": "Context",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "offsets_in_document": {
+ "title": "Offsets In Document",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Span"
+ }
+ },
+ "offsets_in_context": {
+ "title": "Offsets In Context",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Span"
+ }
+ },
+ "document_id": {
+ "title": "Document Id",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ }
+ }
+ },
+ "Body_upload_file_file_upload_post": {
+ "title": "Body_upload_file_file_upload_post",
+ "required": [
+ "files"
+ ],
+ "type": "object",
+ "properties": {
+ "files": {
+ "title": "Files",
+ "type": "array",
+ "items": {
+ "type": "string",
+ "format": "binary"
+ }
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "string",
+ "default": "null"
+ },
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables"
+ },
+ "valid_languages": {
+ "title": "Valid Languages"
+ },
+ "clean_whitespace": {
+ "title": "Clean Whitespace"
+ },
+ "clean_empty_lines": {
+ "title": "Clean Empty Lines"
+ },
+ "clean_header_footer": {
+ "title": "Clean Header Footer"
+ },
+ "split_by": {
+ "title": "Split By"
+ },
+ "split_length": {
+ "title": "Split Length"
+ },
+ "split_overlap": {
+ "title": "Split Overlap"
+ },
+ "split_respect_sentence_boundary": {
+ "title": "Split Respect Sentence Boundary"
+ }
+ }
+ },
+ "CreateLabelSerialized": {
+ "title": "CreateLabelSerialized",
+ "required": [
+ "query",
+ "document",
+ "is_correct_answer",
+ "is_correct_document",
+ "origin"
+ ],
+ "type": "object",
+ "properties": {
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "document": {
+ "$ref": "#/components/schemas/Document"
+ },
+ "is_correct_answer": {
+ "title": "Is Correct Answer",
+ "type": "boolean"
+ },
+ "is_correct_document": {
+ "title": "Is Correct Document",
+ "type": "boolean"
+ },
+ "origin": {
+ "title": "Origin",
+ "enum": [
+ "user-feedback",
+ "gold-label"
+ ],
+ "type": "string"
+ },
+ "answer": {
+ "$ref": "#/components/schemas/Answer"
+ },
+ "no_answer": {
+ "title": "No Answer",
+ "type": "boolean"
+ },
+ "pipeline_id": {
+ "title": "Pipeline Id",
+ "type": "string"
+ },
+ "created_at": {
+ "title": "Created At",
+ "type": "string"
+ },
+ "updated_at": {
+ "title": "Updated At",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "filters": {
+ "title": "Filters",
+ "type": "object"
+ }
+ },
+ "additionalProperties": false
+ },
+ "Document": {
+ "title": "Document",
+ "required": [
+ "content",
+ "content_type",
+ "id",
+ "meta"
+ ],
+ "type": "object",
+ "properties": {
+ "content": {
+ "title": "Content",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "content_type": {
+ "title": "Content Type",
+ "enum": [
+ "text",
+ "table",
+ "image"
+ ],
+ "type": "string"
+ },
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "score": {
+ "title": "Score",
+ "type": "number"
+ },
+ "embedding": {
+ "title": "Embedding",
+ "type": "string"
+ }
+ }
+ },
+ "FilterRequest": {
+ "title": "FilterRequest",
+ "type": "object",
+ "properties": {
+ "filters": {
+ "title": "Filters",
+ "type": "object",
+ "additionalProperties": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ }
+ ]
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ }
+ ]
+ }
+ }
+ ]
+ }
+ }
+ }
+ },
+ "HTTPValidationError": {
+ "title": "HTTPValidationError",
+ "type": "object",
+ "properties": {
+ "detail": {
+ "title": "Detail",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ValidationError"
+ }
+ }
+ }
+ },
+ "Label": {
+ "title": "Label",
+ "required": [
+ "id",
+ "query",
+ "document",
+ "is_correct_answer",
+ "is_correct_document",
+ "origin"
+ ],
+ "type": "object",
+ "properties": {
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "document": {
+ "$ref": "#/components/schemas/Document"
+ },
+ "is_correct_answer": {
+ "title": "Is Correct Answer",
+ "type": "boolean"
+ },
+ "is_correct_document": {
+ "title": "Is Correct Document",
+ "type": "boolean"
+ },
+ "origin": {
+ "title": "Origin",
+ "enum": [
+ "user-feedback",
+ "gold-label"
+ ],
+ "type": "string"
+ },
+ "answer": {
+ "$ref": "#/components/schemas/Answer"
+ },
+ "no_answer": {
+ "title": "No Answer",
+ "type": "boolean"
+ },
+ "pipeline_id": {
+ "title": "Pipeline Id",
+ "type": "string"
+ },
+ "created_at": {
+ "title": "Created At",
+ "type": "string"
+ },
+ "updated_at": {
+ "title": "Updated At",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "filters": {
+ "title": "Filters",
+ "type": "object"
+ }
+ }
+ },
+ "QueryRequest": {
+ "title": "QueryRequest",
+ "required": [
+ "query"
+ ],
+ "type": "object",
+ "properties": {
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "params": {
+ "title": "Params",
+ "type": "object"
+ },
+ "debug": {
+ "title": "Debug",
+ "type": "boolean",
+ "default": false
+ }
+ },
+ "additionalProperties": false
+ },
+ "QueryResponse": {
+ "title": "QueryResponse",
+ "required": [
+ "query"
+ ],
+ "type": "object",
+ "properties": {
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "answers": {
+ "title": "Answers",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Answer"
+ },
+ "default": []
+ },
+ "documents": {
+ "title": "Documents",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Document"
+ },
+ "default": []
+ },
+ "_debug": {
+ "title": " Debug",
+ "type": "object"
+ }
+ }
+ },
+ "Span": {
+ "title": "Span",
+ "required": [
+ "start",
+ "end"
+ ],
+ "type": "object",
+ "properties": {
+ "start": {
+ "title": "Start",
+ "type": "integer"
+ },
+ "end": {
+ "title": "End",
+ "type": "integer"
+ }
+ }
+ },
+ "ValidationError": {
+ "title": "ValidationError",
+ "required": [
+ "loc",
+ "msg",
+ "type"
+ ],
+ "type": "object",
+ "properties": {
+ "loc": {
+ "title": "Location",
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ }
+ ]
+ }
+ },
+ "msg": {
+ "title": "Message",
+ "type": "string"
+ },
+ "type": {
+ "title": "Error Type",
+ "type": "string"
+ }
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/docs/v1.5.0/_src/api/openapi/openapi-1.4.0.json b/docs/v1.5.0/_src/api/openapi/openapi-1.4.0.json
new file mode 100644
index 0000000000..9dd0363856
--- /dev/null
+++ b/docs/v1.5.0/_src/api/openapi/openapi-1.4.0.json
@@ -0,0 +1,892 @@
+{
+ "openapi": "3.0.2",
+ "info": {
+ "title": "Haystack REST API",
+ "version": "1.4.0"
+ },
+ "paths": {
+ "/initialized": {
+ "get": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Check Status",
+ "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.",
+ "operationId": "check_status",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/hs_version": {
+ "get": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Haystack Version",
+ "description": "Get the running Haystack version.",
+ "operationId": "haystack_version",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/query": {
+ "post": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Query",
+ "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.",
+ "operationId": "query",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/QueryRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/QueryResponse"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/feedback": {
+ "get": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Get Feedback",
+ "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.",
+ "operationId": "get_feedback",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Get Feedback Feedback Get",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Label"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "post": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Post Feedback",
+ "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.",
+ "operationId": "post_feedback",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Feedback",
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/Label"
+ },
+ {
+ "$ref": "#/components/schemas/CreateLabelSerialized"
+ }
+ ]
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ },
+ "delete": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Delete Feedback",
+ "description": "This endpoint allows the API user to delete all the\nfeedback that has been submitted through the\n`POST /feedback` endpoint",
+ "operationId": "delete_feedback",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/eval-feedback": {
+ "post": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Get Feedback Metrics",
+ "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`",
+ "operationId": "get_feedback_metrics",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ }
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/export-feedback": {
+ "get": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Export Feedback",
+ "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.",
+ "operationId": "export_feedback",
+ "parameters": [
+ {
+ "required": false,
+ "schema": {
+ "title": "Context Size",
+ "type": "integer",
+ "default": 100000
+ },
+ "name": "context_size",
+ "in": "query"
+ },
+ {
+ "required": false,
+ "schema": {
+ "title": "Full Document Context",
+ "type": "boolean",
+ "default": true
+ },
+ "name": "full_document_context",
+ "in": "query"
+ },
+ {
+ "required": false,
+ "schema": {
+ "title": "Only Positive Labels",
+ "type": "boolean",
+ "default": false
+ },
+ "name": "only_positive_labels",
+ "in": "query"
+ }
+ ],
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/file-upload": {
+ "post": {
+ "tags": [
+ "file-upload"
+ ],
+ "summary": "Upload File",
+ "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).",
+ "operationId": "upload_file",
+ "requestBody": {
+ "content": {
+ "multipart/form-data": {
+ "schema": {
+ "$ref": "#/components/schemas/Body_upload_file_file_upload_post"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/documents/get_by_filters": {
+ "post": {
+ "tags": [
+ "document"
+ ],
+ "summary": "Get Documents",
+ "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to retrieve by metadata (like the document's name),\nor provide an empty JSON object to retrieve all documents.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`",
+ "operationId": "get_documents",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Get Documents Documents Get By Filters Post",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Document"
+ }
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/documents/delete_by_filters": {
+ "post": {
+ "tags": [
+ "document"
+ ],
+ "summary": "Delete Documents",
+ "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`",
+ "operationId": "delete_documents",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Delete Documents Documents Delete By Filters Post",
+ "type": "boolean"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "components": {
+ "schemas": {
+ "Answer": {
+ "title": "Answer",
+ "required": [
+ "answer"
+ ],
+ "type": "object",
+ "properties": {
+ "answer": {
+ "title": "Answer",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "enum": [
+ "generative",
+ "extractive",
+ "other"
+ ],
+ "type": "string",
+ "default": "extractive"
+ },
+ "score": {
+ "title": "Score",
+ "type": "number"
+ },
+ "context": {
+ "title": "Context",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "offsets_in_document": {
+ "title": "Offsets In Document",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Span"
+ }
+ },
+ "offsets_in_context": {
+ "title": "Offsets In Context",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Span"
+ }
+ },
+ "document_id": {
+ "title": "Document Id",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ }
+ }
+ },
+ "Body_upload_file_file_upload_post": {
+ "title": "Body_upload_file_file_upload_post",
+ "required": [
+ "files"
+ ],
+ "type": "object",
+ "properties": {
+ "files": {
+ "title": "Files",
+ "type": "array",
+ "items": {
+ "type": "string",
+ "format": "binary"
+ }
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "string",
+ "default": "null"
+ },
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables"
+ },
+ "valid_languages": {
+ "title": "Valid Languages"
+ },
+ "clean_whitespace": {
+ "title": "Clean Whitespace"
+ },
+ "clean_empty_lines": {
+ "title": "Clean Empty Lines"
+ },
+ "clean_header_footer": {
+ "title": "Clean Header Footer"
+ },
+ "split_by": {
+ "title": "Split By"
+ },
+ "split_length": {
+ "title": "Split Length"
+ },
+ "split_overlap": {
+ "title": "Split Overlap"
+ },
+ "split_respect_sentence_boundary": {
+ "title": "Split Respect Sentence Boundary"
+ }
+ }
+ },
+ "CreateLabelSerialized": {
+ "title": "CreateLabelSerialized",
+ "required": [
+ "query",
+ "document",
+ "is_correct_answer",
+ "is_correct_document",
+ "origin"
+ ],
+ "type": "object",
+ "properties": {
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "document": {
+ "$ref": "#/components/schemas/Document"
+ },
+ "is_correct_answer": {
+ "title": "Is Correct Answer",
+ "type": "boolean"
+ },
+ "is_correct_document": {
+ "title": "Is Correct Document",
+ "type": "boolean"
+ },
+ "origin": {
+ "title": "Origin",
+ "enum": [
+ "user-feedback",
+ "gold-label"
+ ],
+ "type": "string"
+ },
+ "answer": {
+ "$ref": "#/components/schemas/Answer"
+ },
+ "no_answer": {
+ "title": "No Answer",
+ "type": "boolean"
+ },
+ "pipeline_id": {
+ "title": "Pipeline Id",
+ "type": "string"
+ },
+ "created_at": {
+ "title": "Created At",
+ "type": "string"
+ },
+ "updated_at": {
+ "title": "Updated At",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "filters": {
+ "title": "Filters",
+ "type": "object"
+ }
+ },
+ "additionalProperties": false
+ },
+ "Document": {
+ "title": "Document",
+ "required": [
+ "content",
+ "content_type",
+ "id",
+ "meta"
+ ],
+ "type": "object",
+ "properties": {
+ "content": {
+ "title": "Content",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "content_type": {
+ "title": "Content Type",
+ "enum": [
+ "text",
+ "table",
+ "image"
+ ],
+ "type": "string"
+ },
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "score": {
+ "title": "Score",
+ "type": "number"
+ },
+ "embedding": {
+ "title": "Embedding",
+ "type": "string"
+ }
+ }
+ },
+ "FilterRequest": {
+ "title": "FilterRequest",
+ "type": "object",
+ "properties": {
+ "filters": {
+ "title": "Filters",
+ "type": "object",
+ "additionalProperties": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ }
+ ]
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ }
+ ]
+ }
+ }
+ ]
+ }
+ }
+ }
+ },
+ "HTTPValidationError": {
+ "title": "HTTPValidationError",
+ "type": "object",
+ "properties": {
+ "detail": {
+ "title": "Detail",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ValidationError"
+ }
+ }
+ }
+ },
+ "Label": {
+ "title": "Label",
+ "required": [
+ "id",
+ "query",
+ "document",
+ "is_correct_answer",
+ "is_correct_document",
+ "origin"
+ ],
+ "type": "object",
+ "properties": {
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "document": {
+ "$ref": "#/components/schemas/Document"
+ },
+ "is_correct_answer": {
+ "title": "Is Correct Answer",
+ "type": "boolean"
+ },
+ "is_correct_document": {
+ "title": "Is Correct Document",
+ "type": "boolean"
+ },
+ "origin": {
+ "title": "Origin",
+ "enum": [
+ "user-feedback",
+ "gold-label"
+ ],
+ "type": "string"
+ },
+ "answer": {
+ "$ref": "#/components/schemas/Answer"
+ },
+ "no_answer": {
+ "title": "No Answer",
+ "type": "boolean"
+ },
+ "pipeline_id": {
+ "title": "Pipeline Id",
+ "type": "string"
+ },
+ "created_at": {
+ "title": "Created At",
+ "type": "string"
+ },
+ "updated_at": {
+ "title": "Updated At",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "filters": {
+ "title": "Filters",
+ "type": "object"
+ }
+ }
+ },
+ "QueryRequest": {
+ "title": "QueryRequest",
+ "required": [
+ "query"
+ ],
+ "type": "object",
+ "properties": {
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "params": {
+ "title": "Params",
+ "type": "object"
+ },
+ "debug": {
+ "title": "Debug",
+ "type": "boolean",
+ "default": false
+ }
+ },
+ "additionalProperties": false
+ },
+ "QueryResponse": {
+ "title": "QueryResponse",
+ "required": [
+ "query"
+ ],
+ "type": "object",
+ "properties": {
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "answers": {
+ "title": "Answers",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Answer"
+ },
+ "default": []
+ },
+ "documents": {
+ "title": "Documents",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Document"
+ },
+ "default": []
+ },
+ "_debug": {
+ "title": " Debug",
+ "type": "object"
+ }
+ }
+ },
+ "Span": {
+ "title": "Span",
+ "required": [
+ "start",
+ "end"
+ ],
+ "type": "object",
+ "properties": {
+ "start": {
+ "title": "Start",
+ "type": "integer"
+ },
+ "end": {
+ "title": "End",
+ "type": "integer"
+ }
+ }
+ },
+ "ValidationError": {
+ "title": "ValidationError",
+ "required": [
+ "loc",
+ "msg",
+ "type"
+ ],
+ "type": "object",
+ "properties": {
+ "loc": {
+ "title": "Location",
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ }
+ ]
+ }
+ },
+ "msg": {
+ "title": "Message",
+ "type": "string"
+ },
+ "type": {
+ "title": "Error Type",
+ "type": "string"
+ }
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/docs/v1.5.0/_src/api/openapi/openapi-1.4.1rc0.json b/docs/v1.5.0/_src/api/openapi/openapi-1.4.1rc0.json
new file mode 100644
index 0000000000..5d70c32a2f
--- /dev/null
+++ b/docs/v1.5.0/_src/api/openapi/openapi-1.4.1rc0.json
@@ -0,0 +1,892 @@
+{
+ "openapi": "3.0.2",
+ "info": {
+ "title": "Haystack REST API",
+ "version": "1.4.1rc0"
+ },
+ "paths": {
+ "/initialized": {
+ "get": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Check Status",
+ "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.",
+ "operationId": "check_status",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/hs_version": {
+ "get": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Haystack Version",
+ "description": "Get the running Haystack version.",
+ "operationId": "haystack_version",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/query": {
+ "post": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Query",
+ "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.",
+ "operationId": "query",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/QueryRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/QueryResponse"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/feedback": {
+ "get": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Get Feedback",
+ "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.",
+ "operationId": "get_feedback",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Get Feedback Feedback Get",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Label"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "post": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Post Feedback",
+ "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.",
+ "operationId": "post_feedback",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Feedback",
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/Label"
+ },
+ {
+ "$ref": "#/components/schemas/CreateLabelSerialized"
+ }
+ ]
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ },
+ "delete": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Delete Feedback",
+ "description": "This endpoint allows the API user to delete all the\nfeedback that has been submitted through the\n`POST /feedback` endpoint",
+ "operationId": "delete_feedback",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/eval-feedback": {
+ "post": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Get Feedback Metrics",
+ "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`",
+ "operationId": "get_feedback_metrics",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ }
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/export-feedback": {
+ "get": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Export Feedback",
+ "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.",
+ "operationId": "export_feedback",
+ "parameters": [
+ {
+ "required": false,
+ "schema": {
+ "title": "Context Size",
+ "type": "integer",
+ "default": 100000
+ },
+ "name": "context_size",
+ "in": "query"
+ },
+ {
+ "required": false,
+ "schema": {
+ "title": "Full Document Context",
+ "type": "boolean",
+ "default": true
+ },
+ "name": "full_document_context",
+ "in": "query"
+ },
+ {
+ "required": false,
+ "schema": {
+ "title": "Only Positive Labels",
+ "type": "boolean",
+ "default": false
+ },
+ "name": "only_positive_labels",
+ "in": "query"
+ }
+ ],
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/file-upload": {
+ "post": {
+ "tags": [
+ "file-upload"
+ ],
+ "summary": "Upload File",
+ "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).",
+ "operationId": "upload_file",
+ "requestBody": {
+ "content": {
+ "multipart/form-data": {
+ "schema": {
+ "$ref": "#/components/schemas/Body_upload_file_file_upload_post"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/documents/get_by_filters": {
+ "post": {
+ "tags": [
+ "document"
+ ],
+ "summary": "Get Documents",
+ "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to retrieve by metadata (like the document's name),\nor provide an empty JSON object to get all documents.\n\nExample of filters:\n`'{\"filters\": {\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`",
+ "operationId": "get_documents",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Get Documents Documents Get By Filters Post",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Document"
+ }
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/documents/delete_by_filters": {
+ "post": {
+ "tags": [
+ "document"
+ ],
+ "summary": "Delete Documents",
+ "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo delete all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`",
+ "operationId": "delete_documents",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Delete Documents Documents Delete By Filters Post",
+ "type": "boolean"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "components": {
+ "schemas": {
+ "Answer": {
+ "title": "Answer",
+ "required": [
+ "answer"
+ ],
+ "type": "object",
+ "properties": {
+ "answer": {
+ "title": "Answer",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "enum": [
+ "generative",
+ "extractive",
+ "other"
+ ],
+ "type": "string",
+ "default": "extractive"
+ },
+ "score": {
+ "title": "Score",
+ "type": "number"
+ },
+ "context": {
+ "title": "Context",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "offsets_in_document": {
+ "title": "Offsets In Document",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Span"
+ }
+ },
+ "offsets_in_context": {
+ "title": "Offsets In Context",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Span"
+ }
+ },
+ "document_id": {
+ "title": "Document Id",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ }
+ }
+ },
+ "Body_upload_file_file_upload_post": {
+ "title": "Body_upload_file_file_upload_post",
+ "required": [
+ "files"
+ ],
+ "type": "object",
+ "properties": {
+ "files": {
+ "title": "Files",
+ "type": "array",
+ "items": {
+ "type": "string",
+ "format": "binary"
+ }
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "string",
+ "default": "null"
+ },
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables"
+ },
+ "valid_languages": {
+ "title": "Valid Languages"
+ },
+ "clean_whitespace": {
+ "title": "Clean Whitespace"
+ },
+ "clean_empty_lines": {
+ "title": "Clean Empty Lines"
+ },
+ "clean_header_footer": {
+ "title": "Clean Header Footer"
+ },
+ "split_by": {
+ "title": "Split By"
+ },
+ "split_length": {
+ "title": "Split Length"
+ },
+ "split_overlap": {
+ "title": "Split Overlap"
+ },
+ "split_respect_sentence_boundary": {
+ "title": "Split Respect Sentence Boundary"
+ }
+ }
+ },
+ "CreateLabelSerialized": {
+ "title": "CreateLabelSerialized",
+ "required": [
+ "query",
+ "document",
+ "is_correct_answer",
+ "is_correct_document",
+ "origin"
+ ],
+ "type": "object",
+ "properties": {
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "document": {
+ "$ref": "#/components/schemas/Document"
+ },
+ "is_correct_answer": {
+ "title": "Is Correct Answer",
+ "type": "boolean"
+ },
+ "is_correct_document": {
+ "title": "Is Correct Document",
+ "type": "boolean"
+ },
+ "origin": {
+ "title": "Origin",
+ "enum": [
+ "user-feedback",
+ "gold-label"
+ ],
+ "type": "string"
+ },
+ "answer": {
+ "$ref": "#/components/schemas/Answer"
+ },
+ "no_answer": {
+ "title": "No Answer",
+ "type": "boolean"
+ },
+ "pipeline_id": {
+ "title": "Pipeline Id",
+ "type": "string"
+ },
+ "created_at": {
+ "title": "Created At",
+ "type": "string"
+ },
+ "updated_at": {
+ "title": "Updated At",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "filters": {
+ "title": "Filters",
+ "type": "object"
+ }
+ },
+ "additionalProperties": false
+ },
+ "Document": {
+ "title": "Document",
+ "required": [
+ "content",
+ "content_type",
+ "id",
+ "meta"
+ ],
+ "type": "object",
+ "properties": {
+ "content": {
+ "title": "Content",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "content_type": {
+ "title": "Content Type",
+ "enum": [
+ "text",
+ "table",
+ "image"
+ ],
+ "type": "string"
+ },
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "score": {
+ "title": "Score",
+ "type": "number"
+ },
+ "embedding": {
+ "title": "Embedding",
+ "type": "string"
+ }
+ }
+ },
+ "FilterRequest": {
+ "title": "FilterRequest",
+ "type": "object",
+ "properties": {
+ "filters": {
+ "title": "Filters",
+ "type": "object",
+ "additionalProperties": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ }
+ ]
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ }
+ ]
+ }
+ }
+ ]
+ }
+ }
+ }
+ },
+ "HTTPValidationError": {
+ "title": "HTTPValidationError",
+ "type": "object",
+ "properties": {
+ "detail": {
+ "title": "Detail",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ValidationError"
+ }
+ }
+ }
+ },
+ "Label": {
+ "title": "Label",
+ "required": [
+ "id",
+ "query",
+ "document",
+ "is_correct_answer",
+ "is_correct_document",
+ "origin"
+ ],
+ "type": "object",
+ "properties": {
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "document": {
+ "$ref": "#/components/schemas/Document"
+ },
+ "is_correct_answer": {
+ "title": "Is Correct Answer",
+ "type": "boolean"
+ },
+ "is_correct_document": {
+ "title": "Is Correct Document",
+ "type": "boolean"
+ },
+ "origin": {
+ "title": "Origin",
+ "enum": [
+ "user-feedback",
+ "gold-label"
+ ],
+ "type": "string"
+ },
+ "answer": {
+ "$ref": "#/components/schemas/Answer"
+ },
+ "no_answer": {
+ "title": "No Answer",
+ "type": "boolean"
+ },
+ "pipeline_id": {
+ "title": "Pipeline Id",
+ "type": "string"
+ },
+ "created_at": {
+ "title": "Created At",
+ "type": "string"
+ },
+ "updated_at": {
+ "title": "Updated At",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "filters": {
+ "title": "Filters",
+ "type": "object"
+ }
+ }
+ },
+ "QueryRequest": {
+ "title": "QueryRequest",
+ "required": [
+ "query"
+ ],
+ "type": "object",
+ "properties": {
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "params": {
+ "title": "Params",
+ "type": "object"
+ },
+ "debug": {
+ "title": "Debug",
+ "type": "boolean",
+ "default": false
+ }
+ },
+ "additionalProperties": false
+ },
+ "QueryResponse": {
+ "title": "QueryResponse",
+ "required": [
+ "query"
+ ],
+ "type": "object",
+ "properties": {
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "answers": {
+ "title": "Answers",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Answer"
+ },
+ "default": []
+ },
+ "documents": {
+ "title": "Documents",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Document"
+ },
+ "default": []
+ },
+ "_debug": {
+ "title": " Debug",
+ "type": "object"
+ }
+ }
+ },
+ "Span": {
+ "title": "Span",
+ "required": [
+ "start",
+ "end"
+ ],
+ "type": "object",
+ "properties": {
+ "start": {
+ "title": "Start",
+ "type": "integer"
+ },
+ "end": {
+ "title": "End",
+ "type": "integer"
+ }
+ }
+ },
+ "ValidationError": {
+ "title": "ValidationError",
+ "required": [
+ "loc",
+ "msg",
+ "type"
+ ],
+ "type": "object",
+ "properties": {
+ "loc": {
+ "title": "Location",
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ }
+ ]
+ }
+ },
+ "msg": {
+ "title": "Message",
+ "type": "string"
+ },
+ "type": {
+ "title": "Error Type",
+ "type": "string"
+ }
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/docs/v1.5.0/_src/api/openapi/openapi.json b/docs/v1.5.0/_src/api/openapi/openapi.json
new file mode 100644
index 0000000000..5d70c32a2f
--- /dev/null
+++ b/docs/v1.5.0/_src/api/openapi/openapi.json
@@ -0,0 +1,892 @@
+{
+ "openapi": "3.0.2",
+ "info": {
+ "title": "Haystack REST API",
+ "version": "1.4.1rc0"
+ },
+ "paths": {
+ "/initialized": {
+ "get": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Check Status",
+ "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.",
+ "operationId": "check_status",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/hs_version": {
+ "get": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Haystack Version",
+ "description": "Get the running Haystack version.",
+ "operationId": "haystack_version",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/query": {
+ "post": {
+ "tags": [
+ "search"
+ ],
+ "summary": "Query",
+ "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.",
+ "operationId": "query",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/QueryRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/QueryResponse"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/feedback": {
+ "get": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Get Feedback",
+ "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.",
+ "operationId": "get_feedback",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Get Feedback Feedback Get",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Label"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "post": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Post Feedback",
+ "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.",
+ "operationId": "post_feedback",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Feedback",
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/Label"
+ },
+ {
+ "$ref": "#/components/schemas/CreateLabelSerialized"
+ }
+ ]
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ },
+ "delete": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Delete Feedback",
+ "description": "This endpoint allows the API user to delete all the\nfeedback that has been submitted through the\n`POST /feedback` endpoint",
+ "operationId": "delete_feedback",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ }
+ }
+ }
+ },
+ "/eval-feedback": {
+ "post": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Get Feedback Metrics",
+ "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`",
+ "operationId": "get_feedback_metrics",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ }
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/export-feedback": {
+ "get": {
+ "tags": [
+ "feedback"
+ ],
+ "summary": "Export Feedback",
+ "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.",
+ "operationId": "export_feedback",
+ "parameters": [
+ {
+ "required": false,
+ "schema": {
+ "title": "Context Size",
+ "type": "integer",
+ "default": 100000
+ },
+ "name": "context_size",
+ "in": "query"
+ },
+ {
+ "required": false,
+ "schema": {
+ "title": "Full Document Context",
+ "type": "boolean",
+ "default": true
+ },
+ "name": "full_document_context",
+ "in": "query"
+ },
+ {
+ "required": false,
+ "schema": {
+ "title": "Only Positive Labels",
+ "type": "boolean",
+ "default": false
+ },
+ "name": "only_positive_labels",
+ "in": "query"
+ }
+ ],
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/file-upload": {
+ "post": {
+ "tags": [
+ "file-upload"
+ ],
+ "summary": "Upload File",
+ "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).",
+ "operationId": "upload_file",
+ "requestBody": {
+ "content": {
+ "multipart/form-data": {
+ "schema": {
+ "$ref": "#/components/schemas/Body_upload_file_file_upload_post"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {}
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/documents/get_by_filters": {
+ "post": {
+ "tags": [
+ "document"
+ ],
+ "summary": "Get Documents",
+ "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to retrieve by metadata (like the document's name),\nor provide an empty JSON object to get all documents.\n\nExample of filters:\n`'{\"filters\": {\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`",
+ "operationId": "get_documents",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Get Documents Documents Get By Filters Post",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Document"
+ }
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/documents/delete_by_filters": {
+ "post": {
+ "tags": [
+ "document"
+ ],
+ "summary": "Delete Documents",
+ "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo delete all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`",
+ "operationId": "delete_documents",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/FilterRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "title": "Response Delete Documents Documents Delete By Filters Post",
+ "type": "boolean"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "components": {
+ "schemas": {
+ "Answer": {
+ "title": "Answer",
+ "required": [
+ "answer"
+ ],
+ "type": "object",
+ "properties": {
+ "answer": {
+ "title": "Answer",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "enum": [
+ "generative",
+ "extractive",
+ "other"
+ ],
+ "type": "string",
+ "default": "extractive"
+ },
+ "score": {
+ "title": "Score",
+ "type": "number"
+ },
+ "context": {
+ "title": "Context",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "offsets_in_document": {
+ "title": "Offsets In Document",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Span"
+ }
+ },
+ "offsets_in_context": {
+ "title": "Offsets In Context",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Span"
+ }
+ },
+ "document_id": {
+ "title": "Document Id",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ }
+ }
+ },
+ "Body_upload_file_file_upload_post": {
+ "title": "Body_upload_file_file_upload_post",
+ "required": [
+ "files"
+ ],
+ "type": "object",
+ "properties": {
+ "files": {
+ "title": "Files",
+ "type": "array",
+ "items": {
+ "type": "string",
+ "format": "binary"
+ }
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "string",
+ "default": "null"
+ },
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables"
+ },
+ "valid_languages": {
+ "title": "Valid Languages"
+ },
+ "clean_whitespace": {
+ "title": "Clean Whitespace"
+ },
+ "clean_empty_lines": {
+ "title": "Clean Empty Lines"
+ },
+ "clean_header_footer": {
+ "title": "Clean Header Footer"
+ },
+ "split_by": {
+ "title": "Split By"
+ },
+ "split_length": {
+ "title": "Split Length"
+ },
+ "split_overlap": {
+ "title": "Split Overlap"
+ },
+ "split_respect_sentence_boundary": {
+ "title": "Split Respect Sentence Boundary"
+ }
+ }
+ },
+ "CreateLabelSerialized": {
+ "title": "CreateLabelSerialized",
+ "required": [
+ "query",
+ "document",
+ "is_correct_answer",
+ "is_correct_document",
+ "origin"
+ ],
+ "type": "object",
+ "properties": {
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "document": {
+ "$ref": "#/components/schemas/Document"
+ },
+ "is_correct_answer": {
+ "title": "Is Correct Answer",
+ "type": "boolean"
+ },
+ "is_correct_document": {
+ "title": "Is Correct Document",
+ "type": "boolean"
+ },
+ "origin": {
+ "title": "Origin",
+ "enum": [
+ "user-feedback",
+ "gold-label"
+ ],
+ "type": "string"
+ },
+ "answer": {
+ "$ref": "#/components/schemas/Answer"
+ },
+ "no_answer": {
+ "title": "No Answer",
+ "type": "boolean"
+ },
+ "pipeline_id": {
+ "title": "Pipeline Id",
+ "type": "string"
+ },
+ "created_at": {
+ "title": "Created At",
+ "type": "string"
+ },
+ "updated_at": {
+ "title": "Updated At",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "filters": {
+ "title": "Filters",
+ "type": "object"
+ }
+ },
+ "additionalProperties": false
+ },
+ "Document": {
+ "title": "Document",
+ "required": [
+ "content",
+ "content_type",
+ "id",
+ "meta"
+ ],
+ "type": "object",
+ "properties": {
+ "content": {
+ "title": "Content",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "content_type": {
+ "title": "Content Type",
+ "enum": [
+ "text",
+ "table",
+ "image"
+ ],
+ "type": "string"
+ },
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "score": {
+ "title": "Score",
+ "type": "number"
+ },
+ "embedding": {
+ "title": "Embedding",
+ "type": "string"
+ }
+ }
+ },
+ "FilterRequest": {
+ "title": "FilterRequest",
+ "type": "object",
+ "properties": {
+ "filters": {
+ "title": "Filters",
+ "type": "object",
+ "additionalProperties": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ }
+ ]
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ }
+ ]
+ }
+ }
+ ]
+ }
+ }
+ }
+ },
+ "HTTPValidationError": {
+ "title": "HTTPValidationError",
+ "type": "object",
+ "properties": {
+ "detail": {
+ "title": "Detail",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ValidationError"
+ }
+ }
+ }
+ },
+ "Label": {
+ "title": "Label",
+ "required": [
+ "id",
+ "query",
+ "document",
+ "is_correct_answer",
+ "is_correct_document",
+ "origin"
+ ],
+ "type": "object",
+ "properties": {
+ "id": {
+ "title": "Id",
+ "type": "string"
+ },
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "document": {
+ "$ref": "#/components/schemas/Document"
+ },
+ "is_correct_answer": {
+ "title": "Is Correct Answer",
+ "type": "boolean"
+ },
+ "is_correct_document": {
+ "title": "Is Correct Document",
+ "type": "boolean"
+ },
+ "origin": {
+ "title": "Origin",
+ "enum": [
+ "user-feedback",
+ "gold-label"
+ ],
+ "type": "string"
+ },
+ "answer": {
+ "$ref": "#/components/schemas/Answer"
+ },
+ "no_answer": {
+ "title": "No Answer",
+ "type": "boolean"
+ },
+ "pipeline_id": {
+ "title": "Pipeline Id",
+ "type": "string"
+ },
+ "created_at": {
+ "title": "Created At",
+ "type": "string"
+ },
+ "updated_at": {
+ "title": "Updated At",
+ "type": "string"
+ },
+ "meta": {
+ "title": "Meta",
+ "type": "object"
+ },
+ "filters": {
+ "title": "Filters",
+ "type": "object"
+ }
+ }
+ },
+ "QueryRequest": {
+ "title": "QueryRequest",
+ "required": [
+ "query"
+ ],
+ "type": "object",
+ "properties": {
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "params": {
+ "title": "Params",
+ "type": "object"
+ },
+ "debug": {
+ "title": "Debug",
+ "type": "boolean",
+ "default": false
+ }
+ },
+ "additionalProperties": false
+ },
+ "QueryResponse": {
+ "title": "QueryResponse",
+ "required": [
+ "query"
+ ],
+ "type": "object",
+ "properties": {
+ "query": {
+ "title": "Query",
+ "type": "string"
+ },
+ "answers": {
+ "title": "Answers",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Answer"
+ },
+ "default": []
+ },
+ "documents": {
+ "title": "Documents",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Document"
+ },
+ "default": []
+ },
+ "_debug": {
+ "title": " Debug",
+ "type": "object"
+ }
+ }
+ },
+ "Span": {
+ "title": "Span",
+ "required": [
+ "start",
+ "end"
+ ],
+ "type": "object",
+ "properties": {
+ "start": {
+ "title": "Start",
+ "type": "integer"
+ },
+ "end": {
+ "title": "End",
+ "type": "integer"
+ }
+ }
+ },
+ "ValidationError": {
+ "title": "ValidationError",
+ "required": [
+ "loc",
+ "msg",
+ "type"
+ ],
+ "type": "object",
+ "properties": {
+ "loc": {
+ "title": "Location",
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ }
+ ]
+ }
+ },
+ "msg": {
+ "title": "Message",
+ "type": "string"
+ },
+ "type": {
+ "title": "Error Type",
+ "type": "string"
+ }
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/docs/v1.5.0/_src/api/pydoc/answer-generator.yml b/docs/v1.5.0/_src/api/pydoc/answer-generator.yml
new file mode 100644
index 0000000000..c4a4ca5b9b
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/answer-generator.yml
@@ -0,0 +1,21 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/answer_generator]
+ modules: ['base', 'transformers']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: generator.md
+
diff --git a/docs/v1.5.0/_src/api/pydoc/crawler.yml b/docs/v1.5.0/_src/api/pydoc/crawler.yml
new file mode 100644
index 0000000000..b9489d589f
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/crawler.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/connector]
+ modules: ['crawler']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: crawler.md
diff --git a/docs/v1.5.0/_src/api/pydoc/document-classifier.yml b/docs/v1.5.0/_src/api/pydoc/document-classifier.yml
new file mode 100644
index 0000000000..4070d36773
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/document-classifier.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/document_classifier]
+ modules: ['base', 'transformers']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: document_classifier.md
diff --git a/docs/v1.5.0/_src/api/pydoc/document-store.yml b/docs/v1.5.0/_src/api/pydoc/document-store.yml
new file mode 100644
index 0000000000..6327fe2344
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/document-store.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/document_stores]
+ modules: ['base', 'elasticsearch', 'memory', 'sql', 'faiss', 'milvus1', 'milvus2', 'weaviate', 'graphdb', 'deepsetcloud', 'pinecone', 'utils']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: document_store.md
diff --git a/docs/v1.5.0/_src/api/pydoc/evaluation.yml b/docs/v1.5.0/_src/api/pydoc/evaluation.yml
new file mode 100644
index 0000000000..cfff806d3f
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/evaluation.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/evaluator]
+ modules: ['evaluator']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: evaluation.md
diff --git a/docs/v1.5.0/_src/api/pydoc/extractor.yml b/docs/v1.5.0/_src/api/pydoc/extractor.yml
new file mode 100644
index 0000000000..5dd3add5eb
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/extractor.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/extractor]
+ modules: ['entity']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: extractor.md
diff --git a/docs/v1.5.0/_src/api/pydoc/file-classifier.yml b/docs/v1.5.0/_src/api/pydoc/file-classifier.yml
new file mode 100644
index 0000000000..b6fa4c94a2
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/file-classifier.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/file_classifier]
+ modules: ['file_type']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: file_classifier.md
diff --git a/docs/v1.5.0/_src/api/pydoc/file-converters.yml b/docs/v1.5.0/_src/api/pydoc/file-converters.yml
new file mode 100644
index 0000000000..49abec35c7
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/file-converters.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/file_converter]
+ modules: ['base', 'docx', 'image', 'markdown', 'pdf', 'tika', 'txt']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: file_converter.md
diff --git a/docs/v1.5.0/_src/api/pydoc/other.yml b/docs/v1.5.0/_src/api/pydoc/other.yml
new file mode 100644
index 0000000000..46355f6160
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/other.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/other]
+ modules: ['docs2answers', 'join_docs', 'join_answers', 'route_documents']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: other.md
diff --git a/docs/v1.5.0/_src/api/pydoc/pipelines.yml b/docs/v1.5.0/_src/api/pydoc/pipelines.yml
new file mode 100644
index 0000000000..705643ccac
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/pipelines.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/pipelines]
+ modules: ['base', 'ray', 'standard_pipelines']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: pipelines.md
diff --git a/docs/v1.5.0/_src/api/pydoc/preprocessor.yml b/docs/v1.5.0/_src/api/pydoc/preprocessor.yml
new file mode 100644
index 0000000000..5481f76c15
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/preprocessor.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/preprocessor]
+ modules: ['base', 'preprocessor']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: preprocessor.md
diff --git a/docs/v1.5.0/_src/api/pydoc/primitives.yml b/docs/v1.5.0/_src/api/pydoc/primitives.yml
new file mode 100644
index 0000000000..6a3e8f9d41
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/primitives.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/]
+ modules: ['schema']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: primitives.md
diff --git a/docs/v1.5.0/_src/api/pydoc/query-classifier.yml b/docs/v1.5.0/_src/api/pydoc/query-classifier.yml
new file mode 100644
index 0000000000..5be82ee872
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/query-classifier.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/query_classifier]
+ modules: ['base', 'sklearn', 'transformers']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: query_classifier.md
diff --git a/docs/v1.5.0/_src/api/pydoc/question-generator.yml b/docs/v1.5.0/_src/api/pydoc/question-generator.yml
new file mode 100644
index 0000000000..4d52568635
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/question-generator.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/question_generator]
+ modules: ['question_generator']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: question_generator.md
diff --git a/docs/v1.5.0/_src/api/pydoc/ranker.yml b/docs/v1.5.0/_src/api/pydoc/ranker.yml
new file mode 100644
index 0000000000..dfbce80af6
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/ranker.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/ranker]
+ modules: ['base', 'sentence_transformers']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: ranker.md
diff --git a/docs/v1.5.0/_src/api/pydoc/reader.yml b/docs/v1.5.0/_src/api/pydoc/reader.yml
new file mode 100644
index 0000000000..1910d36e7e
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/reader.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/reader]
+ modules: ['base', 'farm', 'transformers', 'table']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: reader.md
diff --git a/docs/v1.5.0/_src/api/pydoc/retriever.yml b/docs/v1.5.0/_src/api/pydoc/retriever.yml
new file mode 100644
index 0000000000..ee64cdfa04
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/retriever.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/retriever]
+ modules: ['base', 'sparse', 'dense', 'text2sparql']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: retriever.md
diff --git a/docs/v1.5.0/_src/api/pydoc/summarizer.yml b/docs/v1.5.0/_src/api/pydoc/summarizer.yml
new file mode 100644
index 0000000000..d6c53bc25f
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/summarizer.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/summarizer]
+ modules: ['base', 'transformers']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: summarizer.md
diff --git a/docs/v1.5.0/_src/api/pydoc/translator.yml b/docs/v1.5.0/_src/api/pydoc/translator.yml
new file mode 100644
index 0000000000..36038321e8
--- /dev/null
+++ b/docs/v1.5.0/_src/api/pydoc/translator.yml
@@ -0,0 +1,20 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/translator]
+ modules: ['base', 'transformers']
+ ignore_when_discovered: ['__init__']
+processors:
+ - type: filter
+ expression:
+ documented_only: true
+ do_not_filter_modules: false
+ skip_empty_modules: true
+ - type: smart
+ - type: crossref
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: true
+ add_member_class_prefix: false
+ filename: translator.md
diff --git a/docs/v1.5.0/_src/benchmarks/farm_per_component.html b/docs/v1.5.0/_src/benchmarks/farm_per_component.html
new file mode 100644
index 0000000000..6a9d3d5cea
--- /dev/null
+++ b/docs/v1.5.0/_src/benchmarks/farm_per_component.html
@@ -0,0 +1,48 @@
+
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/v1.5.0/_src/benchmarks/reader_performance.json b/docs/v1.5.0/_src/benchmarks/reader_performance.json
new file mode 100644
index 0000000000..be935fe271
--- /dev/null
+++ b/docs/v1.5.0/_src/benchmarks/reader_performance.json
@@ -0,0 +1,44 @@
+{
+ "chart_type": "BarChart",
+ "title": "Reader Performance",
+ "subtitle": "Time and Accuracy Benchmarks",
+ "description": "Performance benchmarks of different Readers that can be used off-the-shelf in Haystack. Some models are geared towards speed, while others are more performance-focused. Accuracy is measured as F1 score and speed as passages/sec (with passages of 384 tokens). Each Reader is benchmarked using the SQuAD v2.0 development set, which contains 11866 question answer pairs. When tokenized using the BERT tokenizer and split using a sliding window approach, these become 12350 passages that are passed into the model. We set max_seq_len=384 and doc_stride=128. These benchmarking tests are run using an AWS p3.2xlarge instance with a Nvidia V100 GPU with this script. Please note that we are using the FARMReader class rather than the TransformersReader class. Also, the F1 measure that is reported here is in fact calculated on token level, rather than word level as is done in the official SQuAD script.",
+ "bars": "horizontal",
+ "columns": [
+ "Model",
+ "F1",
+ "Speed (passages/sec)"
+ ],
+ "data": [
+ {
+ "F1": 82.58860575299658,
+ "Speed": 125.81040525892848,
+ "Model": "RoBERTa"
+ },
+ {
+ "F1": 78.87858491007042,
+ "Speed": 260.6443097981493,
+ "Model": "MiniLM"
+ },
+ {
+ "F1": 74.31182400443286,
+ "Speed": 121.08066567525722,
+ "Model": "BERT base"
+ },
+ {
+ "F1": 83.26306774734308,
+ "Speed": 42.21949937744112,
+ "Model": "BERT large"
+ },
+ {
+ "F1": 84.50422699207468,
+ "Speed": 42.07400844838985,
+ "Model": "XLM-RoBERTa"
+ },
+ {
+ "F1": 42.31925844723574,
+ "Speed": 222.91207128366702,
+ "Model": "DistilBERT"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/v1.5.0/_src/benchmarks/retriever_map.json b/docs/v1.5.0/_src/benchmarks/retriever_map.json
new file mode 100644
index 0000000000..51e0687cf3
--- /dev/null
+++ b/docs/v1.5.0/_src/benchmarks/retriever_map.json
@@ -0,0 +1,204 @@
+{
+ "chart_type": "LineChart",
+ "title": "Retriever Accuracy",
+ "subtitle": "mAP at different number of docs",
+ "description": "Here you can see how the mean avg. precision (mAP) of the retriever decays as the number of documents increases. The set up is the same as the above querying benchmark except that a varying number of negative documents are used to fill the document store.",
+ "columns": [
+ "n_docs",
+ "BM25 / Elasticsearch",
+ "DPR / Elasticsearch",
+ "DPR / FAISS (flat)",
+ "DPR / FAISS (HNSW)",
+ "DPR / Milvus (flat)",
+ "DPR / Milvus (HNSW)",
+ "Sentence Transformers / Elasticsearch"
+ ],
+ "axis": [
+ {
+ "x": "Number of docs",
+ "y": "mAP"
+ }
+ ],
+ "data": [
+ {
+ "model": "DPR / Elasticsearch",
+ "n_docs": 1000,
+ "map": 92.95105322830891
+ },
+ {
+ "model": "DPR / Elasticsearch",
+ "n_docs": 10000,
+ "map": 89.87097014904354
+ },
+ {
+ "model": "BM25 / Elasticsearch",
+ "n_docs": 10000,
+ "map": 66.26543444531747
+ },
+ {
+ "model": "Sentence Transformers / Elasticsearch",
+ "n_docs": 1000,
+ "map": 90.06638620360428
+ },
+ {
+ "model": "Sentence Transformers / Elasticsearch",
+ "n_docs": 10000,
+ "map": 87.11255142468549
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 1000,
+ "map": 92.95105322830891
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 10000,
+ "map": 89.87097014904354
+ },
+ {
+ "model": "DPR / FAISS (HNSW)",
+ "n_docs": 1000,
+ "map": 92.95105322830891
+ },
+ {
+ "model": "DPR / FAISS (HNSW)",
+ "n_docs": 10000,
+ "map": 89.51337675393017
+ },
+ {
+ "model": "DPR / Milvus (flat)",
+ "n_docs": 1000,
+ "map": 92.95105322830891
+ },
+ {
+ "model": "DPR / Milvus (flat)",
+ "n_docs": 10000,
+ "map": 89.87097014904354
+ },
+ {
+ "model": "DPR / Milvus (HNSW)",
+ "n_docs": 1000,
+ "map": 92.95105322830891
+ },
+ {
+ "model": "DPR / Milvus (HNSW)",
+ "n_docs": 10000,
+ "map": 88.24421129104469
+ },
+ {
+ "model": "DPR / Elasticsearch",
+ "n_docs": 100000,
+ "map": 86.54606328368976
+ },
+ {
+ "model": "DPR / Elasticsearch",
+ "n_docs": 500000,
+ "map": 80.86137228234091
+ },
+ {
+ "model": "BM25 / Elasticsearch",
+ "n_docs": 100000,
+ "map": 56.25299537353825
+ },
+ {
+ "model": "BM25 / Elasticsearch",
+ "n_docs": 500000,
+ "map": 45.595090262466535
+ },
+ {
+ "model": "Sentence Transformers / Elasticsearch",
+ "n_docs": 100000,
+ "map": 82.74686664920836
+ },
+ {
+ "model": "Sentence Transformers / Elasticsearch",
+ "n_docs": 500000,
+ "map": 76.49564526892904
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 100000,
+ "map": 86.54606328368973
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 500000,
+ "map": 80.86137228234091
+ },
+ {
+ "model": "DPR / FAISS (HNSW)",
+ "n_docs": 100000,
+ "map": 84.33419639513305
+ },
+ {
+ "model": "DPR / FAISS (HNSW)",
+ "n_docs": 500000,
+ "map": 75.73062475537202
+ },
+ {
+ "model": "DPR / Milvus (flat)",
+ "n_docs": 100000,
+ "map": 86.54606328368973
+ },
+ {
+ "model": "DPR / Milvus (flat)",
+ "n_docs": 500000,
+ "map": 80.86137228234091
+ },
+ {
+ "model": "DPR / Milvus (HNSW)",
+ "n_docs": 100000,
+ "map": 81.63864883662649
+ },
+ {
+ "model": "DPR / Milvus (HNSW)",
+ "n_docs": 500000,
+ "map": 73.57986207906387
+ },
+ {
+ "model": "BM25 / Elasticsearch",
+ "n_docs": 1000,
+ "map": 74.20444712972909
+ },
+ {
+ "model": "DPR / OpenSearch (flat)",
+ "n_docs": 1000,
+ "map": 92.95105322830891
+ },
+ {
+ "model": "DPR / OpenSearch (flat)",
+ "n_docs": 10000,
+ "map": 89.8709701490436
+ },
+ {
+ "model": "DPR / OpenSearch (flat)",
+ "n_docs": 100000,
+ "map": 86.54014997282701
+ },
+ {
+ "model": "DPR / OpenSearch (HNSW)",
+ "n_docs": 1000,
+ "map": 92.76308330349686
+ },
+ {
+ "model": "DPR / OpenSearch (HNSW)",
+ "n_docs": 10000,
+ "map": 89.00403653862938
+ },
+ {
+ "model": "DPR / OpenSearch (HNSW)",
+ "n_docs": 100000,
+ "map": 85.7342431384476
+ },
+ {
+ "model": "DPR / OpenSearch (flat)",
+ "n_docs": 500000,
+ "map": 80.85588135082547
+ },
+ {
+ "model": "DPR / OpenSearch (HNSW)",
+ "n_docs": 500000,
+ "map": 77.5426462347698
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/v1.5.0/_src/benchmarks/retriever_performance.json b/docs/v1.5.0/_src/benchmarks/retriever_performance.json
new file mode 100644
index 0000000000..dbb9340481
--- /dev/null
+++ b/docs/v1.5.0/_src/benchmarks/retriever_performance.json
@@ -0,0 +1,88 @@
+{
+ "chart_type": "BarChart",
+ "title": "Retriever Performance",
+ "subtitle": "Time and Accuracy Benchmarks",
+ "description": "Comparison of the speed and accuracy of different DocumentStore / Retriever combinations on 100k documents. Indexing speed (in docs/sec) refers to how quickly Documents can be inserted into a DocumentStore. Querying speed (in queries/sec) refers to the speed at which the system returns relevant Documents when presented with a query.\n\nThe dataset used is Wikipedia, split into 100 word passages (from here)). \n\nFor querying, we use the Natural Questions development set in combination with the wiki passages. The Document Store is populated with the 100 word passages in which the answer spans occur (i.e. gold passages) as well as a random selection of 100 word passages in which the answer spans do not occur (i.e. negative passages). We take a total of 100k gold and negative passages. Query and document embedding are generated by the \"facebook/dpr-question_encoder-single-nq-base\" and \"facebook/dpr-ctx_encoder-single-nq-base\" models. The retriever returns 10 candidates and both the recall and mAP scores are calculated on these 10.\n\nFor FAISS HNSW, we use n_links=128, efSearch=20 and efConstruction=80. We use a cosine similarity function with BM25 retrievers, and dot product with DPR. Both index and query benchmarks are performed on an AWS P3.2xlarge instance which is accelerated by an Nvidia V100 GPU.",
+ "bars": "horizontal",
+ "columns": [
+ "Model",
+ "mAP",
+ "Index Speed (docs/sec)",
+ "Query Speed (queries/sec)"
+ ],
+ "series": {
+ "s0": "map",
+ "s1": "time",
+ "s2": "time"
+ },
+ "axes": {
+ "label": "map",
+ "time_side": "top",
+ "time_label": "seconds"
+ },
+ "data": [
+ {
+ "model": "DPR / Elasticsearch",
+ "n_docs": 100000,
+ "index_speed": 71.36964873196698,
+ "query_speed": 5.192368815242574,
+ "map": 86.54606328368976
+ },
+ {
+ "model": "BM25 / Elasticsearch",
+ "n_docs": 100000,
+ "index_speed": 485.5602670200369,
+ "query_speed": 103.0884393334727,
+ "map": 56.25299537353825
+ },
+ {
+ "model": "Sentence Transformers / Elasticsearch",
+ "n_docs": 100000,
+ "index_speed": 119.52937722555107,
+ "query_speed": 6.385621466857457,
+ "map": 82.74686664920836
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 100000,
+ "index_speed": 100.01184910084558,
+ "query_speed": 6.6270933964840415,
+ "map": 86.54606328368973
+ },
+ {
+ "model": "DPR / FAISS (HNSW)",
+ "n_docs": 100000,
+ "index_speed": 89.90389306648805,
+ "query_speed": 39.7839528511866,
+ "map": 84.33419639513305
+ },
+ {
+ "model": "DPR / Milvus (flat)",
+ "n_docs": 100000,
+ "index_speed": 116.00982709720004,
+ "query_speed": 28.57264344960955,
+ "map": 86.54606328368973
+ },
+ {
+ "model": "DPR / Milvus (HNSW)",
+ "n_docs": 100000,
+ "index_speed": 115.61076852516383,
+ "query_speed": 38.80526238789059,
+ "map": 81.63864883662649
+ },
+ {
+ "model": "DPR / OpenSearch (flat)",
+ "n_docs": 100000,
+ "index_speed": 70.05381128388427,
+ "query_speed": 15.306895223372484,
+ "map": 86.54014997282701
+ },
+ {
+ "model": "DPR / OpenSearch (HNSW)",
+ "n_docs": 100000,
+ "index_speed": 70.31004397719536,
+ "query_speed": 24.95733865947408,
+ "map": 85.7342431384476
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/v1.5.0/_src/benchmarks/retriever_speed.json b/docs/v1.5.0/_src/benchmarks/retriever_speed.json
new file mode 100644
index 0000000000..7877d2a358
--- /dev/null
+++ b/docs/v1.5.0/_src/benchmarks/retriever_speed.json
@@ -0,0 +1,204 @@
+{
+ "chart_type": "LineChart",
+ "title": "Retriever Speed",
+ "subtitle": "Query Speed at different number of docs",
+ "description": "Here you can see how the query speed of different Retriever / DocumentStore combinations scale as the number of documents increases. The set up is the same as the above querying benchmark except that a varying number of negative documents are used to fill the document store.",
+ "columns": [
+ "n_docs",
+ "BM25 / Elasticsearch",
+ "DPR / Elasticsearch",
+ "DPR / FAISS (flat)",
+ "DPR / FAISS (HNSW)",
+ "DPR / Milvus (flat)",
+ "DPR / Milvus (HNSW)",
+ "Sentence Transformers / Elasticsearch"
+ ],
+ "axis": [
+ {
+ "x": "Number of docs",
+ "y": "Queries/sec"
+ }
+ ],
+ "data": [
+ {
+ "model": "DPR / Elasticsearch",
+ "n_docs": 1000,
+ "query_speed": 34.22768858415144
+ },
+ {
+ "model": "DPR / Elasticsearch",
+ "n_docs": 10000,
+ "query_speed": 22.197089725786853
+ },
+ {
+ "model": "BM25 / Elasticsearch",
+ "n_docs": 10000,
+ "query_speed": 127.11481826852273
+ },
+ {
+ "model": "Sentence Transformers / Elasticsearch",
+ "n_docs": 1000,
+ "query_speed": 47.51341215808855
+ },
+ {
+ "model": "Sentence Transformers / Elasticsearch",
+ "n_docs": 10000,
+ "query_speed": 29.74515869340777
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 1000,
+ "query_speed": 42.49634272581313
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 10000,
+ "query_speed": 27.684040507849826
+ },
+ {
+ "model": "DPR / FAISS (HNSW)",
+ "n_docs": 1000,
+ "query_speed": 43.36685860983961
+ },
+ {
+ "model": "DPR / FAISS (HNSW)",
+ "n_docs": 10000,
+ "query_speed": 41.819147130090286
+ },
+ {
+ "model": "DPR / Milvus (flat)",
+ "n_docs": 1000,
+ "query_speed": 41.12204778755844
+ },
+ {
+ "model": "DPR / Milvus (flat)",
+ "n_docs": 10000,
+ "query_speed": 37.86882443918513
+ },
+ {
+ "model": "DPR / Milvus (HNSW)",
+ "n_docs": 1000,
+ "query_speed": 41.14803671045185
+ },
+ {
+ "model": "DPR / Milvus (HNSW)",
+ "n_docs": 10000,
+ "query_speed": 40.072871546542935
+ },
+ {
+ "model": "DPR / Elasticsearch",
+ "n_docs": 100000,
+ "query_speed": 5.192368815242574
+ },
+ {
+ "model": "DPR / Elasticsearch",
+ "n_docs": 500000,
+ "query_speed": 1.0337466563959614
+ },
+ {
+ "model": "BM25 / Elasticsearch",
+ "n_docs": 100000,
+ "query_speed": 103.0884393334727
+ },
+ {
+ "model": "BM25 / Elasticsearch",
+ "n_docs": 500000,
+ "query_speed": 78.95037031647355
+ },
+ {
+ "model": "Sentence Transformers / Elasticsearch",
+ "n_docs": 100000,
+ "query_speed": 6.385621466857457
+ },
+ {
+ "model": "Sentence Transformers / Elasticsearch",
+ "n_docs": 500000,
+ "query_speed": 1.4175454254854258
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 100000,
+ "query_speed": 6.6270933964840415
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 500000,
+ "query_speed": 1.5394964631878052
+ },
+ {
+ "model": "DPR / FAISS (HNSW)",
+ "n_docs": 100000,
+ "query_speed": 39.7839528511866
+ },
+ {
+ "model": "DPR / FAISS (HNSW)",
+ "n_docs": 500000,
+ "query_speed": 39.84177061191119
+ },
+ {
+ "model": "DPR / Milvus (flat)",
+ "n_docs": 100000,
+ "query_speed": 28.57264344960955
+ },
+ {
+ "model": "DPR / Milvus (flat)",
+ "n_docs": 500000,
+ "query_speed": 15.645867393099733
+ },
+ {
+ "model": "DPR / Milvus (HNSW)",
+ "n_docs": 100000,
+ "query_speed": 38.80526238789059
+ },
+ {
+ "model": "DPR / Milvus (HNSW)",
+ "n_docs": 500000,
+ "query_speed": 37.15717318924075
+ },
+ {
+ "model": "BM25 / Elasticsearch",
+ "n_docs": 1000,
+ "query_speed": 282.95914917837337
+ },
+ {
+ "model": "DPR / OpenSearch (flat)",
+ "n_docs": 1000,
+ "query_speed": 29.061163356184426
+ },
+ {
+ "model": "DPR / OpenSearch (flat)",
+ "n_docs": 10000,
+ "query_speed": 24.834414667596725
+ },
+ {
+ "model": "DPR / OpenSearch (flat)",
+ "n_docs": 100000,
+ "query_speed": 15.306895223372484
+ },
+ {
+ "model": "DPR / OpenSearch (HNSW)",
+ "n_docs": 1000,
+ "query_speed": 29.10621389658101
+ },
+ {
+ "model": "DPR / OpenSearch (HNSW)",
+ "n_docs": 10000,
+ "query_speed": 26.92417300437131
+ },
+ {
+ "model": "DPR / OpenSearch (HNSW)",
+ "n_docs": 100000,
+ "query_speed": 24.95733865947408
+ },
+ {
+ "model": "DPR / OpenSearch (flat)",
+ "n_docs": 500000,
+ "query_speed": 11.33271222977541
+ },
+ {
+ "model": "DPR / OpenSearch (HNSW)",
+ "n_docs": 500000,
+ "query_speed": 24.13921492357397
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/v1.5.0/_src/tutorials/Makefile b/docs/v1.5.0/_src/tutorials/Makefile
new file mode 100644
index 0000000000..d4bb2cbb9e
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = .
+BUILDDIR = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/v1.5.0/_src/tutorials/conf.py b/docs/v1.5.0/_src/tutorials/conf.py
new file mode 100644
index 0000000000..4511b84159
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/conf.py
@@ -0,0 +1,51 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = "Tutorials"
+copyright = "2020, deepset"
+author = "deepset"
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ["IPython.sphinxext.ipython_console_highlighting"]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = "alabaster"
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
diff --git a/docs/v1.5.0/_src/tutorials/index.rst b/docs/v1.5.0/_src/tutorials/index.rst
new file mode 100644
index 0000000000..4351a5f784
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/index.rst
@@ -0,0 +1,13 @@
+Tutorials
+====================================
+
+.. toctree::
+ :maxdepth: 4
+ :caption: Contents:
+
+ 1) Using Haystack to search through your own documents
+ 2) Make Haystack understand your jargon
+ 3) Connect Haystack to your Datastore of choice
+ 4) Answer incoming questions using FAQ pages
+ 5) Benchmark the different components of Haystack
+ 6) SoTA: Powerup Haystack with DPR
diff --git a/docs/v1.5.0/_src/tutorials/make.bat b/docs/v1.5.0/_src/tutorials/make.bat
new file mode 100644
index 0000000000..2119f51099
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/v1.5.0/_src/tutorials/tutorials/1.md b/docs/v1.5.0/_src/tutorials/tutorials/1.md
new file mode 100644
index 0000000000..c64b5c70df
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/tutorials/1.md
@@ -0,0 +1,286 @@
+
+
+# Build Your First QA System
+
+
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb)
+
+Question Answering can be used in a variety of use cases. A very common one: Using it to navigate through complex knowledge bases or long documents ("search setting").
+
+A "knowledge base" could for example be your website, an internal wiki or a collection of financial reports.
+In this tutorial we will work on a slightly different domain: "Game of Thrones".
+
+Let's see how we can use a bunch of Wikipedia articles to answer a variety of questions about the
+marvellous seven kingdoms.
+
+
+### Prepare environment
+
+#### Colab: Enable the GPU runtime
+Make sure you enable the GPU runtime to experience decent speed in this tutorial.
+**Runtime -> Change Runtime type -> Hardware accelerator -> GPU**
+
+
+
+
+```python
+# Make sure you have a GPU running
+!nvidia-smi
+```
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#! pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]
+```
+
+
+```python
+from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers
+from haystack.nodes import FARMReader, TransformersReader
+```
+
+## Document Store
+
+Haystack finds answers to queries within the documents stored in a `DocumentStore`. The current implementations of `DocumentStore` include `ElasticsearchDocumentStore`, `FAISSDocumentStore`, `SQLDocumentStore`, and `InMemoryDocumentStore`.
+
+**Here:** We recommended Elasticsearch as it comes preloaded with features like [full-text queries](https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html), [BM25 retrieval](https://www.elastic.co/elasticon/conf/2016/sf/improved-text-scoring-with-bm25), and [vector storage for text embeddings](https://www.elastic.co/guide/en/elasticsearch/reference/7.6/dense-vector.html).
+
+**Alternatives:** If you are unable to setup an Elasticsearch instance, then follow the [Tutorial 3](https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb) for using SQL/InMemory document stores.
+
+**Hint**: This tutorial creates a new document store instance with Wikipedia articles on Game of Thrones. However, you can configure Haystack to work with your existing document stores.
+
+### Start an Elasticsearch server
+You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in your environment (e.g. in Colab notebooks), then you can manually download and execute Elasticsearch from source.
+
+
+```python
+# Recommended: Start Elasticsearch using Docker via the Haystack utility function
+from haystack.utils import launch_es
+
+launch_es()
+```
+
+
+```python
+# In Colab / No Docker environments: Start Elasticsearch from source
+! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
+! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
+! chown -R daemon:daemon elasticsearch-7.9.2
+
+import os
+from subprocess import Popen, PIPE, STDOUT
+
+es_server = Popen(
+ ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon
+)
+# wait until ES has started
+! sleep 30
+```
+
+
+```python
+# Connect to Elasticsearch
+
+from haystack.document_stores import ElasticsearchDocumentStore
+
+document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
+```
+
+## Preprocessing of documents
+
+Haystack provides a customizable pipeline for:
+ - converting files into texts
+ - cleaning texts
+ - splitting texts
+ - writing them to a Document Store
+
+In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning function, and index them in Elasticsearch.
+
+
+```python
+# Let's first fetch some documents that we want to query
+# Here: 517 Wikipedia articles for Game of Thrones
+doc_dir = "data/tutorial1"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
+
+# Convert files to dicts
+# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
+# It must take a str as input, and return a str.
+docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
+
+# We now have a list of dictionaries that we can write to our document store.
+# If your texts come from a different source (e.g. a DB), you can of course skip convert_files_to_docs() and create the dictionaries yourself.
+# The default format here is:
+# {
+# 'content': "",
+# 'meta': {'name': "", ...}
+# }
+# (Optionally: you can also add more key-value-pairs here, that will be indexed as fields in Elasticsearch and
+# can be accessed later for filtering or shown in the responses of the Pipeline)
+
+# Let's have a look at the first 3 entries:
+print(docs[:3])
+
+# Now, let's write the dicts containing documents to our DB.
+document_store.write_documents(docs)
+```
+
+## Initialize Retriever, Reader, & Pipeline
+
+### Retriever
+
+Retrievers help narrow down the scope for the Reader to smaller units of text where a given question could be answered.
+They use simple but fast algorithms.
+
+**Here:** We use Elasticsearch's default BM25 algorithm
+
+**Alternatives:**
+
+- Customize the `BM25Retriever` with custom queries (e.g. boosting) and filters
+- Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging
+- Use `EmbeddingRetriever` to find candidate documents based on the similarity of embeddings (e.g. created via Sentence-BERT)
+- Use `DensePassageRetriever` to use different embedding models for passage and query (see Tutorial 6)
+
+
+```python
+from haystack.nodes import BM25Retriever
+
+retriever = BM25Retriever(document_store=document_store)
+```
+
+
+```python
+# Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick-prototypes with SQLite document store.
+
+# from haystack.nodes import TfidfRetriever
+# retriever = TfidfRetriever(document_store=document_store)
+```
+
+### Reader
+
+A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based
+on powerful, but slower deep learning models.
+
+Haystack currently supports Readers based on the frameworks FARM and Transformers.
+With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).
+
+**Here:** a medium sized RoBERTa QA model using a Reader based on FARM (https://huggingface.co/deepset/roberta-base-squad2)
+
+**Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package)
+
+**Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
+
+**Hint:** You can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean the model prefers "no answer possible"
+
+#### FARMReader
+
+
+```python
+# Load a local model or any of the QA models on
+# Hugging Face's model hub (https://huggingface.co/models)
+
+reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
+```
+
+#### TransformersReader
+
+
+```python
+# Alternative:
+# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)
+```
+
+### Pipeline
+
+With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.
+Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
+To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
+You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/components/v1.5.0/pipelines).
+
+
+```python
+from haystack.pipelines import ExtractiveQAPipeline
+
+pipe = ExtractiveQAPipeline(reader, retriever)
+```
+
+## Voilà ! Ask a question!
+
+
+```python
+# You can configure how many candidates the reader and retriever shall return
+# The higher top_k_retriever, the better (but also the slower) your answers.
+prediction = pipe.run(
+ query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
+)
+```
+
+
+```python
+# prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}})
+# prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}})
+```
+
+
+```python
+# Now you can either print the object directly...
+from pprint import pprint
+
+pprint(prediction)
+
+# Sample output:
+# {
+# 'answers': [ ,
+# ,
+# ...
+# ]
+# 'documents': [ ,
+# ,
+# ...
+# ],
+# 'no_ans_gap': 11.688868522644043,
+# 'node_id': 'Reader',
+# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
+# 'query': 'Who is the father of Arya Stark?',
+# 'root_node': 'Query'
+# }
+```
+
+
+```python
+# ...or use a util to simplify the output
+# Change `minimum` to `medium` or `all` to raise the level of detail
+print_answers(prediction, details="minimum")
+```
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
+
diff --git a/docs/v1.5.0/_src/tutorials/tutorials/10.md b/docs/v1.5.0/_src/tutorials/tutorials/10.md
new file mode 100644
index 0000000000..6b71c70893
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/tutorials/10.md
@@ -0,0 +1,158 @@
+
+
+# Question Answering on a Knowledge Graph
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial10_Knowledge_Graph.ipynb)
+
+Haystack allows storing and querying knowledge graphs with the help of pre-trained models that translate text queries to SPARQL queries.
+This tutorial demonstrates how to load an existing knowledge graph into haystack, load a pre-trained retriever, and execute text queries on the knowledge graph.
+The training of models that translate text queries into SPARQL queries is currently not supported.
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#! pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,graphdb]
+```
+
+
+```python
+# Here are some imports that we'll need
+
+import subprocess
+import time
+from pathlib import Path
+
+from haystack.nodes import Text2SparqlRetriever
+from haystack.document_stores import GraphDBKnowledgeGraph
+from haystack.utils import fetch_archive_from_http
+```
+
+## Downloading Knowledge Graph and Model
+
+
+```python
+# Let's first fetch some triples that we want to store in our knowledge graph
+# Here: exemplary triples from the wizarding world
+graph_dir = "data/tutorial10"
+s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip"
+fetch_archive_from_http(url=s3_url, output_dir=graph_dir)
+
+# Fetch a pre-trained BART model that translates text queries to SPARQL queries
+model_dir = "../saved_models/tutorial10_knowledge_graph/"
+s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip"
+fetch_archive_from_http(url=s3_url, output_dir=model_dir)
+```
+
+## Launching a GraphDB instance
+
+
+```python
+# Unfortunately, there seems to be no good way to run GraphDB in colab environments
+# In your local environment, you could start a GraphDB server with docker
+# Feel free to check GraphDB's website for the free version https://www.ontotext.com/products/graphdb/graphdb-free/
+print("Starting GraphDB ...")
+status = subprocess.run(
+ [
+ "docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11"
+ ],
+ shell=True,
+)
+if status.returncode:
+ raise Exception(
+ "Failed to launch GraphDB. Maybe it is already running or you already have a container with that name that you could start?"
+ )
+time.sleep(5)
+```
+
+## Creating a new GraphDB repository (also known as index in haystack's document stores)
+
+
+```python
+# Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index
+kg = GraphDBKnowledgeGraph(index="tutorial_10_index")
+
+# Delete the index as it might have been already created in previous runs
+kg.delete_index()
+
+# Create the index based on a configuration file
+kg.create_index(config_path=Path(graph_dir + "repo-config.ttl"))
+
+# Import triples of subject, predicate, and object statements from a ttl file
+kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir + "triples.ttl"))
+print(f"The last triple stored in the knowledge graph is: {kg.get_all_triples()[-1]}")
+print(f"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.")
+```
+
+
+```python
+# Define prefixes for names of resources so that we can use shorter resource names in queries
+prefixes = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+PREFIX hp: <https://deepset.ai/harry_potter/>
+"""
+kg.prefixes = prefixes
+
+# Load a pre-trained model that translates text queries to SPARQL queries
+kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=model_dir + "hp_v3.4")
+```
+
+## Query Execution
+
+We can now ask questions that will be answered by our knowledge graph!
+One limitation though: our pre-trained model can only generate questions about resources it has seen during training.
+Otherwise, it cannot translate the name of the resource to the identifier used in the knowledge graph.
+E.g. "Harry" -> "hp:Harry_potter"
+
+
+```python
+query = "In which house is Harry Potter?"
+print(f'Translating the text query "{query}" to a SPARQL query and executing it on the knowledge graph...')
+result = kgqa_retriever.retrieve(query=query)
+print(result)
+# Correct SPARQL query: select ?a { hp:Harry_potter hp:house ?a . }
+# Correct answer: Gryffindor
+
+print("Executing a SPARQL query with prefixed names of resources...")
+result = kgqa_retriever._query_kg(
+ sparql_query="select distinct ?sbj where { ?sbj hp:job hp:Keeper_of_keys_and_grounds . }"
+)
+print(result)
+# Paraphrased question: Who is the keeper of keys and grounds?
+# Correct answer: Rubeus Hagrid
+
+print("Executing a SPARQL query with full names of resources...")
+result = kgqa_retriever._query_kg(
+    sparql_query="select distinct ?obj where { <https://deepset.ai/harry_potter/Hermione_granger> <https://deepset.ai/harry_potter/patronus> ?obj . }"
+)
+print(result)
+# Paraphrased question: What is the patronus of Hermione?
+# Correct answer: Otter
+```
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
diff --git a/docs/v1.5.0/_src/tutorials/tutorials/11.md b/docs/v1.5.0/_src/tutorials/tutorials/11.md
new file mode 100644
index 0000000000..d584d88975
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/tutorials/11.md
@@ -0,0 +1,433 @@
+
+
+# Pipelines Tutorial
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial11_Pipelines.ipynb)
+
+In this tutorial, you will learn how the `Pipeline` class acts as a connector between all the different
+building blocks that are found in Haystack. Whether you are using a Reader, Generator, Summarizer
+or Retriever (or 2), the `Pipeline` class will help you build a Directed Acyclic Graph (DAG) that
+determines how to route the output of one component into the input of another.
+
+
+
+
+## Setting Up the Environment
+
+Let's start by ensuring we have a GPU running to ensure decent speed in this tutorial.
+In Google Colab, you can change to a GPU runtime in the menu:
+- **Runtime -> Change Runtime type -> Hardware accelerator -> GPU**
+
+
+```python
+# Make sure you have a GPU running
+!nvidia-smi
+```
+
+These lines are to install Haystack through pip
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#! pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]
+
+# Install pygraphviz
+!apt install libgraphviz-dev
+!pip install pygraphviz
+```
+
+If running from Colab or a no Docker environment, you will want to start Elasticsearch from source
+
+
+```python
+# In Colab / No Docker environments: Start Elasticsearch from source
+! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
+! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
+! chown -R daemon:daemon elasticsearch-7.9.2
+
+import os
+from subprocess import Popen, PIPE, STDOUT
+
+es_server = Popen(
+ ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon
+)
+# wait until ES has started
+! sleep 30
+```
+
+## Initialization
+
+Then let's fetch some data (in this case, pages from the Game of Thrones wiki) and prepare it so that it can
+be indexed into our `DocumentStore`
+
+
+```python
+from haystack.utils import (
+ print_answers,
+ print_documents,
+ fetch_archive_from_http,
+ convert_files_to_docs,
+ clean_wiki_text,
+)
+
+# Download and prepare data - 517 Wikipedia articles for Game of Thrones
+doc_dir = "data/tutorial11"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt11.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
+
+# convert files to dicts containing documents that can be indexed to our datastore
+got_docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
+```
+
+Here we initialize the core components that we will be gluing together using the `Pipeline` class.
+We have a `DocumentStore`, a `BM25Retriever` and a `FARMReader`.
+These can be combined to create a classic Retriever-Reader pipeline that is designed
+to perform Open Domain Question Answering.
+
+
+```python
+from haystack import Pipeline
+from haystack.utils import launch_es
+from haystack.document_stores import ElasticsearchDocumentStore
+from haystack.nodes import BM25Retriever, EmbeddingRetriever, FARMReader
+
+
+# Initialize DocumentStore and index documents
+launch_es()
+document_store = ElasticsearchDocumentStore()
+document_store.delete_documents()
+document_store.write_documents(got_docs)
+
+# Initialize Sparse retriever
+bm25_retriever = BM25Retriever(document_store=document_store)
+
+# Initialize dense retriever
+embedding_retriever = EmbeddingRetriever(
+ document_store,
+ model_format="sentence_transformers",
+ embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
+)
+document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)
+
+# Initialize reader
+reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
+```
+
+## Prebuilt Pipelines
+
+Haystack features many prebuilt pipelines that cover common tasks.
+Here we have an `ExtractiveQAPipeline` (the successor to the now deprecated `Finder` class).
+
+
+```python
+from haystack.pipelines import ExtractiveQAPipeline
+
+# Prebuilt pipeline
+p_extractive_premade = ExtractiveQAPipeline(reader=reader, retriever=bm25_retriever)
+res = p_extractive_premade.run(
+ query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
+)
+print_answers(res, details="minimum")
+```
+
+If you want to just do the retrieval step, you can use a `DocumentSearchPipeline`
+
+
+```python
+from haystack.pipelines import DocumentSearchPipeline
+
+p_retrieval = DocumentSearchPipeline(bm25_retriever)
+res = p_retrieval.run(query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}})
+print_documents(res, max_text_len=200)
+```
+
+Or if you want to use a `Generator` instead of a `Reader`,
+you can initialize a `GenerativeQAPipeline` like this:
+
+
+```python
+from haystack.pipelines import GenerativeQAPipeline, FAQPipeline
+from haystack.nodes import RAGenerator
+
+# We set this to True so that the document store returns document embeddings with each document
+# This is needed by the Generator
+document_store.return_embedding = True
+
+# Initialize generator
+rag_generator = RAGenerator()
+
+# Generative QA
+p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=embedding_retriever)
+res = p_generator.run(query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}})
+print_answers(res, details="minimum")
+
+# We are setting this to False so that in later pipelines,
+# we get a cleaner printout
+document_store.return_embedding = False
+```
+
+Haystack features prebuilt pipelines to do:
+- just document search (DocumentSearchPipeline),
+- document search with summarization (SearchSummarizationPipeline)
+- generative QA (GenerativeQAPipeline)
+- FAQ style QA (FAQPipeline)
+- translated search (TranslationWrapperPipeline)
+To find out more about these pipelines, have a look at our [documentation](https://haystack.deepset.ai/components/v1.5.0/pipelines)
+
+
+With any Pipeline, whether prebuilt or custom constructed,
+you can save a diagram showing how all the components are connected.
+
+![image](https://github.com/deepset-ai/haystack/blob/master/docs/img/retriever-reader-pipeline.png)
+
+
+```python
+p_extractive_premade.draw("pipeline_extractive_premade.png")
+p_retrieval.draw("pipeline_retrieval.png")
+p_generator.draw("pipeline_generator.png")
+```
+
+## Custom Pipelines
+
+Now we are going to rebuild the `ExtractiveQAPipelines` using the generic Pipeline class.
+We do this by adding the building blocks that we initialized as nodes in the graph.
+
+
+```python
+# Custom built extractive QA pipeline
+p_extractive = Pipeline()
+p_extractive.add_node(component=bm25_retriever, name="Retriever", inputs=["Query"])
+p_extractive.add_node(component=reader, name="Reader", inputs=["Retriever"])
+
+# Now we can run it
+res = p_extractive.run(
+ query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
+)
+print_answers(res, details="minimum")
+p_extractive.draw("pipeline_extractive.png")
+```
+
+Pipelines offer a very simple way to ensemble together different components.
+In this example, we are going to combine the power of an `EmbeddingRetriever`
+with the keyword based `BM25Retriever`.
+See our [documentation](https://haystack.deepset.ai/components/v1.5.0/retriever) to understand why
+we might want to combine a dense and sparse retriever.
+
+![image](https://github.com/deepset-ai/haystack/blob/master/docs/img/tutorial11_custompipelines_pipeline_ensemble.png?raw=true)
+
+Here we use a `JoinDocuments` node so that the predictions from each retriever can be merged together.
+
+
+```python
+from haystack.nodes import JoinDocuments
+
+# Create ensembled pipeline
+p_ensemble = Pipeline()
+p_ensemble.add_node(component=bm25_retriever, name="ESRetriever", inputs=["Query"])
+p_ensemble.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
+p_ensemble.add_node(
+ component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "EmbeddingRetriever"]
+)
+p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"])
+p_ensemble.draw("pipeline_ensemble.png")
+
+# Run pipeline
+res = p_ensemble.run(
+ query="Who is the father of Arya Stark?", params={"EmbeddingRetriever": {"top_k": 5}, "ESRetriever": {"top_k": 5}}
+)
+print_answers(res, details="minimum")
+```
+
+## Custom Nodes
+
+Nodes are relatively simple objects
+and we encourage our users to design their own if they don't see one that fits their use case
+
+The only requirements are:
+- Create a class that inherits `BaseComponent`.
+- Add a method run() to your class. Add the mandatory and optional arguments it needs to process. These arguments must be passed as input to the pipeline, inside `params`, or output by preceding nodes.
+- Add processing logic inside the run() (e.g. reformatting the query).
+- Return a tuple that contains your output data (for the next node)
+and the name of the outgoing edge (by default "output_1" for nodes that have one output)
+- Add a class attribute outgoing_edges = 1 that defines the number of output options from your node. You only need a higher number here if you have a decision node (see below).
+
+Here we have a template for a Node:
+
+
+```python
+from haystack import BaseComponent
+from typing import Optional, List
+
+
+class CustomNode(BaseComponent):
+ outgoing_edges = 1
+
+ def run(self, query: str, my_optional_param: Optional[int]):
+ # process the inputs
+ output = {"my_output": ...}
+ return output, "output_1"
+
+ def run_batch(self, queries: List[str], my_optional_param: Optional[int]):
+ # process the inputs
+ output = {"my_output": ...}
+ return output, "output_1"
+```
+
+## Decision Nodes
+
+Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run.
+One popular use case for such query classifiers is routing keyword queries to Elasticsearch and questions to EmbeddingRetriever + Reader.
+With this approach you keep optimal speed and simplicity for keywords while going deep with transformers when it's most helpful.
+
+![image](https://github.com/deepset-ai/haystack/blob/master/docs/img/tutorial11_decision_nodes_pipeline_classifier.png?raw=true)
+
+Though this looks very similar to the ensembled pipeline shown above,
+the key difference is that only one of the retrievers is run for each request.
+By contrast both retrievers are always run in the ensembled approach.
+
+Below, we define a very naive `QueryClassifier` and show how to use it:
+
+
+```python
+class CustomQueryClassifier(BaseComponent):
+ outgoing_edges = 2
+
+ def run(self, query: str):
+ if "?" in query:
+ return {}, "output_2"
+ else:
+ return {}, "output_1"
+
+ def run_batch(self, queries: List[str]):
+ split = {"output_1": {"queries": []}, "output_2": {"queries": []}}
+ for query in queries:
+ if "?" in query:
+ split["output_2"]["queries"].append(query)
+ else:
+ split["output_1"]["queries"].append(query)
+
+ return split, "split"
+
+
+# Here we build the pipeline
+p_classifier = Pipeline()
+p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"])
+p_classifier.add_node(component=bm25_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"])
+p_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_2"])
+p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"])
+p_classifier.draw("pipeline_classifier.png")
+
+# Run only the dense retriever on the full sentence query
+res_1 = p_classifier.run(query="Who is the father of Arya Stark?")
+print("Embedding Retriever Results" + "\n" + "=" * 15)
+print_answers(res_1)
+
+# Run only the sparse retriever on a keyword based query
+res_2 = p_classifier.run(query="Arya Stark father")
+print("ES Results" + "\n" + "=" * 15)
+print_answers(res_2)
+```
+
+## Evaluation Nodes
+
+We have also designed a set of nodes that can be used to evaluate the performance of a system.
+Have a look at our [tutorial](https://haystack.deepset.ai/tutorials/v1.5.0/evaluation) to get hands on with the code and learn more about Evaluation Nodes!
+
+## Debugging Pipelines
+
+You can print out debug information from nodes in your pipelines in a few different ways.
+
+
+```python
+# 1) You can set the `debug` attribute of a given node.
+bm25_retriever.debug = True
+
+# 2) You can provide `debug` as a parameter when running your pipeline
+result = p_classifier.run(query="Who is the father of Arya Stark?", params={"ESRetriever": {"debug": True}})
+
+# 3) You can provide the `debug` parameter to all nodes in your pipeline
+result = p_classifier.run(query="Who is the father of Arya Stark?", params={"debug": True})
+
+result["_debug"]
+```
+
+## YAML Configs
+
+A full `Pipeline` can be defined in a YAML file and simply loaded.
+Having your pipeline available in a YAML is particularly useful
+when you move between experimentation and production environments.
+Just export the YAML from your notebook / IDE and import it into your production environment.
+It also helps with version control of pipelines,
+allows you to share your pipeline easily with colleagues,
+and simplifies the configuration of pipeline parameters in production.
+
+It consists of two main sections: you define all objects (e.g. a reader) in components
+and then stick them together to a pipeline in pipelines.
+You can also set one component to be multiple nodes of a pipeline or to be a node across multiple pipelines.
+It will be loaded just once in memory and therefore doesn't hurt your resources more than actually needed.
+
+The contents of a YAML file should look something like this:
+
+```yaml
+version: '0.7'
+components: # define all the building-blocks for Pipeline
+- name: MyReader # custom-name for the component; helpful for visualization & debugging
+ type: FARMReader # Haystack Class name for the component
+ params:
+ no_ans_boost: -10
+ model_name_or_path: deepset/roberta-base-squad2
+- name: MyESRetriever
+ type: BM25Retriever
+ params:
+ document_store: MyDocumentStore # params can reference other components defined in the YAML
+ custom_query: null
+- name: MyDocumentStore
+ type: ElasticsearchDocumentStore
+ params:
+ index: haystack_test
+pipelines: # multiple Pipelines can be defined using the components from above
+- name: my_query_pipeline # a simple extractive-qa Pipeline
+ nodes:
+ - name: MyESRetriever
+ inputs: [Query]
+ - name: MyReader
+ inputs: [MyESRetriever]
+```
+
+To load, simply call:
+``` python
+pipeline.load_from_yaml(Path("sample.yaml"))
+```
+
+## Conclusion
+
+The possibilities are endless with the `Pipeline` class and we hope that this tutorial will inspire you
+to build custom pipelines that really work for your use case!
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
diff --git a/docs/v1.5.0/_src/tutorials/tutorials/12.md b/docs/v1.5.0/_src/tutorials/tutorials/12.md
new file mode 100644
index 0000000000..ea2d227c48
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/tutorials/12.md
@@ -0,0 +1,168 @@
+
+
+# Long-Form Question Answering
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial12_LFQA.ipynb)
+
+### Prepare environment
+
+#### Colab: Enable the GPU runtime
+Make sure you enable the GPU runtime to experience decent speed in this tutorial.
+**Runtime -> Change Runtime type -> Hardware accelerator -> GPU**
+
+
+
+
+```python
+# Make sure you have a GPU running
+!nvidia-smi
+```
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#! pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install -q git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss]
+```
+
+
+```python
+from haystack.utils import convert_files_to_docs, fetch_archive_from_http, clean_wiki_text
+from haystack.nodes import Seq2SeqGenerator
+```
+
+### Document Store
+
+FAISS is a library for efficient similarity search on a cluster of dense vectors.
+The `FAISSDocumentStore` uses a SQL (SQLite in-memory by default) database under the hood
+to store the document text and other metadata. The vector embeddings of the text are
+indexed on a FAISS Index that later is queried for searching answers.
+The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for
+faster search at the expense of some accuracy. Just set the faiss_index_factory_str argument in the constructor.
+For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
+
+
+```python
+from haystack.document_stores import FAISSDocumentStore
+
+document_store = FAISSDocumentStore(embedding_dim=128, faiss_index_factory_str="Flat")
+```
+
+### Cleaning & indexing documents
+
+Similarly to the previous tutorials, we download, convert and index some Game of Thrones articles to our DocumentStore
+
+
+```python
+# Let's first get some files that we want to use
+doc_dir = "data/tutorial12"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
+
+# Convert files to dicts
+docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
+
+# Now, let's write the dicts containing documents to our DB.
+document_store.write_documents(docs)
+```
+
+### Initialize Retriever and Reader/Generator
+
+#### Retriever
+
+We use a `DensePassageRetriever` and we invoke `update_embeddings` to index the embeddings of documents in the `FAISSDocumentStore`
+
+
+
+
+```python
+from haystack.nodes import DensePassageRetriever
+
+retriever = DensePassageRetriever(
+ document_store=document_store,
+ query_embedding_model="vblagoje/dpr-question_encoder-single-lfqa-wiki",
+ passage_embedding_model="vblagoje/dpr-ctx_encoder-single-lfqa-wiki",
+)
+
+document_store.update_embeddings(retriever)
+```
+
+Before we blindly use the `DensePassageRetriever` let's empirically test it to make sure a simple search indeed finds the relevant documents.
+
+
+```python
+from haystack.utils import print_documents
+from haystack.pipelines import DocumentSearchPipeline
+
+p_retrieval = DocumentSearchPipeline(retriever)
+res = p_retrieval.run(query="Tell me something about Arya Stark?", params={"Retriever": {"top_k": 10}})
+print_documents(res, max_text_len=512)
+```
+
+#### Reader/Generator
+
+Similar to the previous tutorials, we now initialize our reader/generator.
+
+Here we use a `Seq2SeqGenerator` with the *vblagoje/bart_lfqa* model (see: https://huggingface.co/vblagoje/bart_lfqa)
+
+
+
+
+```python
+generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa")
+```
+
+### Pipeline
+
+With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.
+Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
+To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `GenerativeQAPipeline` that combines a retriever and a reader/generator to answer our questions.
+You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/components/v1.5.0/pipelines).
+
+
+```python
+from haystack.pipelines import GenerativeQAPipeline
+
+pipe = GenerativeQAPipeline(generator, retriever)
+```
+
+## Voilà! Ask a question!
+
+
+```python
+pipe.run(
+ query="How did Arya Stark's character get portrayed in a television adaptation?", params={"Retriever": {"top_k": 3}}
+)
+```
+
+
+```python
+pipe.run(query="Why is Arya Stark an unusual character?", params={"Retriever": {"top_k": 3}})
+```
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
diff --git a/docs/v1.5.0/_src/tutorials/tutorials/13.md b/docs/v1.5.0/_src/tutorials/tutorials/13.md
new file mode 100644
index 0000000000..0c66434e78
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/tutorials/13.md
@@ -0,0 +1,187 @@
+
+
+# Question Generation
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial13_Question_generation.ipynb)
+
+This is a bare bones tutorial showing what is possible with the QuestionGenerator Nodes and Pipelines which automatically
+generate questions which the question generation model thinks can be answered by a given document.
+
+### Prepare environment
+
+#### Colab: Enable the GPU runtime
+Make sure you enable the GPU runtime to experience decent speed in this tutorial.
+**Runtime -> Change Runtime type -> Hardware accelerator -> GPU**
+
+
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#! pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]
+```
+
+
+```python
+# Imports needed to run this notebook
+
+from pprint import pprint
+from tqdm import tqdm
+from haystack.nodes import QuestionGenerator, BM25Retriever, FARMReader
+from haystack.document_stores import ElasticsearchDocumentStore
+from haystack.pipelines import (
+ QuestionGenerationPipeline,
+ RetrieverQuestionGenerationPipeline,
+ QuestionAnswerGenerationPipeline,
+)
+from haystack.utils import launch_es, print_questions
+```
+
+Let's start an Elasticsearch instance with one of the options below:
+
+
+```python
+# Option 1: Start Elasticsearch service via Docker
+launch_es()
+```
+
+
+```python
+# Option 2: In Colab / No Docker environments: Start Elasticsearch from source
+! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
+! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
+! chown -R daemon:daemon elasticsearch-7.9.2
+
+import os
+from subprocess import Popen, PIPE, STDOUT
+
+es_server = Popen(
+ ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon
+)
+# wait until ES has started
+! sleep 30
+```
+
+Let's initialize some core components
+
+
+```python
+text1 = "Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace."
+text2 = "Princess Arya Stark is the third child and second daughter of Lord Eddard Stark and his wife, Lady Catelyn Stark. She is the sister of the incumbent Westerosi monarchs, Sansa, Queen in the North, and Brandon, King of the Andals and the First Men. After narrowly escaping the persecution of House Stark by House Lannister, Arya is trained as a Faceless Man at the House of Black and White in Braavos, using her abilities to avenge her family. Upon her return to Westeros, she exacts retribution for the Red Wedding by exterminating the Frey male line."
+text3 = "Dry Cleaning are an English post-punk band who formed in South London in 2018.[3] The band is composed of vocalist Florence Shaw, guitarist Tom Dowse, bassist Lewis Maynard and drummer Nick Buxton. They are noted for their use of spoken word primarily in lieu of sung vocals, as well as their unconventional lyrics. Their musical stylings have been compared to Wire, Magazine and Joy Division.[4] The band released their debut single, 'Magic of Meghan' in 2019. Shaw wrote the song after going through a break-up and moving out of her former partner's apartment the same day that Meghan Markle and Prince Harry announced they were engaged.[5] This was followed by the release of two EPs that year: Sweet Princess in August and Boundary Road Snacks and Drinks in October. The band were included as part of the NME 100 of 2020,[6] as well as DIY magazine's Class of 2020.[7] The band signed to 4AD in late 2020 and shared a new single, 'Scratchcard Lanyard'.[8] In February 2021, the band shared details of their debut studio album, New Long Leg. They also shared the single 'Strong Feelings'.[9] The album, which was produced by John Parish, was released on 2 April 2021.[10]"
+
+docs = [{"content": text1}, {"content": text2}, {"content": text3}]
+
+# Initialize document store and write in the documents
+document_store = ElasticsearchDocumentStore()
+document_store.write_documents(docs)
+
+# Initialize Question Generator
+question_generator = QuestionGenerator()
+```
+
+## Question Generation Pipeline
+
+The most basic version of a question generator pipeline takes a document as input and outputs generated questions
+which the document can answer.
+
+
+```python
+question_generation_pipeline = QuestionGenerationPipeline(question_generator)
+for idx, document in enumerate(document_store):
+
+ print(f"\n * Generating questions for document {idx}: {document.content[:100]}...\n")
+ result = question_generation_pipeline.run(documents=[document])
+ print_questions(result)
+```
+
+## Retriever Question Generation Pipeline
+
+This pipeline takes a query as input. It retrieves relevant documents and then generates questions based on these.
+
+
+```python
+retriever = BM25Retriever(document_store=document_store)
+rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator)
+
+print(f"\n * Generating questions for documents matching the query 'Arya Stark'\n")
+result = rqg_pipeline.run(query="Arya Stark")
+print_questions(result)
+```
+
+## Question Answer Generation Pipeline
+
+This pipeline takes a document as input, generates questions on it, and attempts to answer these questions using
+a Reader model
+
+
+```python
+reader = FARMReader("deepset/roberta-base-squad2")
+qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)
+for idx, document in enumerate(tqdm(document_store)):
+
+ print(f"\n * Generating questions and answers for document {idx}: {document.content[:100]}...\n")
+ result = qag_pipeline.run(documents=[document])
+ print_questions(result)
+```
+
+## Translated Question Answer Generation Pipeline
+Trained models for Question Answer Generation are not available in many languages other than English. Haystack
+provides a workaround for that issue by machine-translating a pipeline's inputs and outputs with the
+TranslationWrapperPipeline. The following example generates German questions and answers on a German text
+document - by using an English model for Question Answer Generation.
+
+
+```python
+# Fill the document store with a German document.
+text1 = "Python ist eine interpretierte Hochsprachenprogrammiersprache für allgemeine Zwecke. Sie wurde von Guido van Rossum entwickelt und 1991 erstmals veröffentlicht. Die Design-Philosophie von Python legt den Schwerpunkt auf die Lesbarkeit des Codes und die Verwendung von viel Leerraum (Whitespace)."
+docs = [{"content": text1}]
+document_store.delete_documents()
+document_store.write_documents(docs)
+
+# Load machine translation models
+from haystack.nodes import TransformersTranslator
+
+in_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-de-en")
+out_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de")
+
+# Wrap the previously defined QuestionAnswerGenerationPipeline
+from haystack.pipelines import TranslationWrapperPipeline
+
+pipeline_with_translation = TranslationWrapperPipeline(
+ input_translator=in_translator, output_translator=out_translator, pipeline=qag_pipeline
+)
+
+for idx, document in enumerate(tqdm(document_store)):
+ print(f"\n * Generating questions and answers for document {idx}: {document.content[:100]}...\n")
+ result = pipeline_with_translation.run(documents=[document])
+ print_questions(result)
+```
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
diff --git a/docs/v1.5.0/_src/tutorials/tutorials/14.md b/docs/v1.5.0/_src/tutorials/tutorials/14.md
new file mode 100644
index 0000000000..4c5c5e7cc8
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/tutorials/14.md
@@ -0,0 +1,376 @@
+
+
+# Query Classifier Tutorial
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial14_Query_Classifier.ipynb)
+
+In this tutorial we introduce the query classifier. The goal of introducing this feature was to optimize the overall flow of the Haystack pipeline by detecting the nature of user queries. Haystack can now detect primarily three types of queries, using either a lightweight SKLearn Gradient Boosted classifier or a more robust Transformer-based classifier. The three categories of queries are as follows:
+
+
+### 1. Keyword Queries:
+Such queries don't have semantic meaning and merely consist of keywords. For instance, these three are examples of keyword queries.
+
+* arya stark father
+* jon snow country
+* arya stark younger brothers
+
+### 2. Interrogative Queries:
+In such queries users usually ask a question, regardless of the presence of "?" in the query. The goal here is to detect the intent of the user, i.e. whether a question is being asked in the query or not. For example:
+
+* who is the father of arya stark ?
+* which country was jon snow filmed ?
+* who are the younger brothers of arya stark ?
+
+### 3. Declarative Queries:
+Such queries are a variation of keyword queries; however, there is a semantic relationship between the words. For example:
+
+* Arya stark was a daughter of a lord.
+* Jon snow was filmed in a country in UK.
+* Bran was brother of a princess.
+
+In this tutorial, you will learn how the `TransformersQueryClassifier` and `SklearnQueryClassifier` classes can be used to intelligently route your queries, based on the nature of the user query. Also, you can choose between a lightweight Gradient Boosted classifier and a transformer-based classifier.
+
+Furthermore, there are two types of classifiers you can use out of the box from Haystack.
+1. Keyword vs Statement/Question Query Classifier
+2. Statement vs Question Query Classifier
+
+As evident from the names, the first classifier detects keyword search queries and distinguishes them from semantic statements like sentences/questions. The second classifier differentiates between question-based queries and declarative sentences.
+
+### Prepare environment
+
+#### Colab: Enable the GPU runtime
+Make sure you enable the GPU runtime to experience decent speed in this tutorial.
+**Runtime -> Change Runtime type -> Hardware accelerator -> GPU**
+
+
+
+These lines are to install Haystack through pip
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#! pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]
+
+# Install pygraphviz
+!apt install libgraphviz-dev
+!pip install pygraphviz
+
+# In Colab / No Docker environments: Start Elasticsearch from source
+! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
+! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
+! chown -R daemon:daemon elasticsearch-7.9.2
+
+import os
+from subprocess import Popen, PIPE, STDOUT
+
+es_server = Popen(
+ ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon
+)
+# wait until ES has started
+! sleep 30
+```
+
+If running from Colab or a no-Docker environment, you will want to start Elasticsearch from source.
+
+## Initialization
+
+Here are some core imports
+
+Then let's fetch some data (in this case, pages from the Game of Thrones wiki) and prepare it so that it can
+be indexed into our `DocumentStore`
+
+
+```python
+from haystack.utils import (
+ print_answers,
+ print_documents,
+ fetch_archive_from_http,
+ convert_files_to_docs,
+ clean_wiki_text,
+ launch_es,
+)
+from haystack.pipelines import Pipeline
+from haystack.document_stores import ElasticsearchDocumentStore
+from haystack.nodes import (
+ BM25Retriever,
+ EmbeddingRetriever,
+ FARMReader,
+ TransformersQueryClassifier,
+ SklearnQueryClassifier,
+)
+
+# Download and prepare data - 517 Wikipedia articles for Game of Thrones
+doc_dir = "data/tutorial14"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt14.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
+
+# convert files to dicts containing documents that can be indexed to our datastore
+got_docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
+
+# Initialize DocumentStore and index documents
+launch_es()
+document_store = ElasticsearchDocumentStore()
+document_store.delete_documents()
+document_store.write_documents(got_docs)
+
+# Initialize Sparse retriever
+bm25_retriever = BM25Retriever(document_store=document_store)
+
+# Initialize dense retriever
+embedding_retriever = EmbeddingRetriever(
+ document_store=document_store,
+ model_format="sentence_transformers",
+ embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
+)
+document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)
+
+reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
+```
+
+## Keyword vs Question/Statement Classifier
+
+The keyword vs question/statement query classifier essentially distinguishes between the keyword queries and statements/questions. So you can intelligently route to different retrieval nodes based on the nature of the query. Using this classifier can potentially yield the following benefits:
+
+* Getting better search results (e.g. by routing only proper questions to DPR / QA branches and not keyword queries)
+* Less GPU costs (e.g. if 50% of your traffic is only keyword queries you could just use elastic here and save the GPU resources for the other 50% of traffic with semantic queries)
+
+![image]()
+
+
+Below, we define a `SklearnQueryClassifier` and show how to use it:
+
+Read more about the trained model and dataset used [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt)
+
+
+```python
+# Here we build the pipeline
+sklearn_keyword_classifier = Pipeline()
+sklearn_keyword_classifier.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"])
+sklearn_keyword_classifier.add_node(
+ component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_1"]
+)
+sklearn_keyword_classifier.add_node(component=bm25_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"])
+sklearn_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"])
+sklearn_keyword_classifier.draw("pipeline_classifier.png")
+```
+
+
+```python
+# Run only the dense retriever on the full sentence query
+res_1 = sklearn_keyword_classifier.run(query="Who is the father of Arya Stark?")
+print("Embedding Retriever Results" + "\n" + "=" * 15)
+print_answers(res_1, details="minimum")
+
+# Run only the sparse retriever on a keyword based query
+res_2 = sklearn_keyword_classifier.run(query="arya stark father")
+print("ES Results" + "\n" + "=" * 15)
+print_answers(res_2, details="minimum")
+```
+
+
+```python
+# Run only the dense retriever on the full sentence query
+res_3 = sklearn_keyword_classifier.run(query="which country was jon snow filmed ?")
+print("Embedding Retriever Results" + "\n" + "=" * 15)
+print_answers(res_3, details="minimum")
+
+# Run only the sparse retriever on a keyword based query
+res_4 = sklearn_keyword_classifier.run(query="jon snow country")
+print("ES Results" + "\n" + "=" * 15)
+print_answers(res_4, details="minimum")
+```
+
+
+```python
+# Run only the dense retriever on the full sentence query
+res_5 = sklearn_keyword_classifier.run(query="who are the younger brothers of arya stark ?")
+print("Embedding Retriever Results" + "\n" + "=" * 15)
+print_answers(res_5, details="minimum")
+
+# Run only the sparse retriever on a keyword based query
+res_6 = sklearn_keyword_classifier.run(query="arya stark younger brothers")
+print("ES Results" + "\n" + "=" * 15)
+print_answers(res_6, details="minimum")
+```
+
+## Transformer Keyword vs Question/Statement Classifier
+
+Firstly, it's essential to understand the trade-offs between the SkLearn and Transformer query classifiers. The transformer classifier is more accurate than the SkLearn classifier; however, it requires more memory and most probably a GPU for faster inference, although the transformer size is only roughly `50 MBs`. The SkLearn classifier, on the other hand, is less accurate but much faster and doesn't require a GPU for inference.
+
+Below, we define a `TransformersQueryClassifier` and show how to use it:
+
+Read more about the trained model and dataset used [here](https://huggingface.co/shahrukhx01/bert-mini-finetune-question-detection)
+
+
+```python
+# Here we build the pipeline
+transformer_keyword_classifier = Pipeline()
+transformer_keyword_classifier.add_node(
+ component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"]
+)
+transformer_keyword_classifier.add_node(
+ component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_1"]
+)
+transformer_keyword_classifier.add_node(
+ component=bm25_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"]
+)
+transformer_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"])
+transformer_keyword_classifier.draw("pipeline_classifier.png")
+```
+
+
+```python
+# Run only the dense retriever on the full sentence query
+res_1 = transformer_keyword_classifier.run(query="Who is the father of Arya Stark?")
+print("Embedding Retriever Results" + "\n" + "=" * 15)
+print_answers(res_1, details="minimum")
+
+# Run only the sparse retriever on a keyword based query
+res_2 = transformer_keyword_classifier.run(query="arya stark father")
+print("ES Results" + "\n" + "=" * 15)
+print_answers(res_2, details="minimum")
+```
+
+
+```python
+# Run only the dense retriever on the full sentence query
+res_3 = transformer_keyword_classifier.run(query="which country was jon snow filmed ?")
+print("Embedding Retriever Results" + "\n" + "=" * 15)
+print_answers(res_3, details="minimum")
+
+# Run only the sparse retriever on a keyword based query
+res_4 = transformer_keyword_classifier.run(query="jon snow country")
+print("ES Results" + "\n" + "=" * 15)
+print_answers(res_4, details="minimum")
+```
+
+
+```python
+# Run only the dense retriever on the full sentence query
+res_5 = transformer_keyword_classifier.run(query="who are the younger brothers of arya stark ?")
+print("Embedding Retriever Results" + "\n" + "=" * 15)
+print_answers(res_5, details="minimum")
+
+# Run only the sparse retriever on a keyword based query
+res_6 = transformer_keyword_classifier.run(query="arya stark younger brothers")
+print("ES Results" + "\n" + "=" * 15)
+print_answers(res_6, details="minimum")
+```
+
+## Question vs Statement Classifier
+
+One possible use case of this classifier could be to route queries after document retrieval: only send questions to the QA reader and, in the case of a declarative sentence, just return the DPR/ES results back to the user to enhance the user experience, only showing answers when the user explicitly asks for them.
+
+![image]()
+
+
+Below, we define a `TransformersQueryClassifier` and show how to use it:
+
+Read more about the trained model and dataset used [here](https://huggingface.co/shahrukhx01/question-vs-statement-classifier)
+
+
+```python
+# Here we build the pipeline
+transformer_question_classifier = Pipeline()
+transformer_question_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
+transformer_question_classifier.add_node(
+ component=TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier"),
+ name="QueryClassifier",
+ inputs=["EmbeddingRetriever"],
+)
+transformer_question_classifier.add_node(component=reader, name="QAReader", inputs=["QueryClassifier.output_1"])
+transformer_question_classifier.draw("question_classifier.png")
+
+# Run only the QA reader on the question query
+res_1 = transformer_question_classifier.run(query="Who is the father of Arya Stark?")
+print("Embedding Retriever Results" + "\n" + "=" * 15)
+print_answers(res_1, details="minimum")
+
+res_2 = transformer_question_classifier.run(query="Arya Stark was the daughter of a Lord.")
+print("ES Results" + "\n" + "=" * 15)
+print_documents(res_2)
+```
+
+## Standalone Query Classifier
+Below we run the query classifiers standalone to better understand their outputs on each of the three types of queries
+
+
+```python
+# Here we create the keyword vs question/statement query classifier
+from haystack.nodes import TransformersQueryClassifier
+
+queries = [
+ "arya stark father",
+ "jon snow country",
+ "who is the father of arya stark",
+ "which country was jon snow filmed?",
+]
+
+keyword_classifier = TransformersQueryClassifier()
+
+for query in queries:
+ result = keyword_classifier.run(query=query)
+ if result[1] == "output_1":
+ category = "question/statement"
+ else:
+ category = "keyword"
+
+ print(f"Query: {query}, raw_output: {result}, class: {category}")
+```
+
+
+```python
+# Here we create the question vs statement query classifier
+from haystack.nodes import TransformersQueryClassifier
+
+queries = [
+ "Lord Eddard was the father of Arya Stark.",
+ "Jon Snow was filmed in United Kingdom.",
+ "who is the father of arya stark?",
+ "Which country was jon snow filmed in?",
+]
+
+question_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier")
+
+for query in queries:
+ result = question_classifier.run(query=query)
+ if result[1] == "output_1":
+ category = "question"
+ else:
+ category = "statement"
+
+ print(f"Query: {query}, raw_output: {result}, class: {category}")
+```
+
+## Conclusion
+
+The query classifier gives you more possibility to be more creative with the pipelines and use different retrieval nodes in a flexible fashion. Moreover, as in the case of Question vs Statement classifier you can also choose the queries which you want to send to the reader.
+
+Finally, you also have the possibility of bringing your own classifier and plugging it into either `TransformersQueryClassifier(model_name_or_path="")` or using the `SklearnQueryClassifier(model_name_or_path="url_to_classifier_or_file_path_as_pickle", vectorizer_name_or_path="url_to_vectorizer_or_file_path_as_pickle")`
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
diff --git a/docs/v1.5.0/_src/tutorials/tutorials/15.md b/docs/v1.5.0/_src/tutorials/tutorials/15.md
new file mode 100644
index 0000000000..136ca41ae2
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/tutorials/15.md
@@ -0,0 +1,417 @@
+
+
+# Open-Domain QA on Tables
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial15_TableQA.ipynb)
+
+This tutorial shows you how to perform question-answering on tables using the `EmbeddingRetriever` or `BM25Retriever` as retriever node and the `TableReader` as reader node.
+
+### Prepare environment
+
+#### Colab: Enable the GPU runtime
+Make sure you enable the GPU runtime to experience decent speed in this tutorial.
+**Runtime -> Change Runtime type -> Hardware accelerator -> GPU**
+
+
+
+
+```python
+# Make sure you have a GPU running
+!nvidia-smi
+```
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#! pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]
+
+# The TaPAs-based TableReader requires the torch-scatter library
+import torch
+
+version = torch.__version__
+!pip install torch-scatter -f https://data.pyg.org/whl/torch-{version}.html
+
+# Install pygraphviz for visualization of Pipelines
+!apt install libgraphviz-dev
+!pip install pygraphviz
+```
+
+### Start an Elasticsearch server
+You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in your environment (e.g. in Colab notebooks), then you can manually download and execute Elasticsearch from source.
+
+
+```python
+# Recommended: Start Elasticsearch using Docker via the Haystack utility function
+from haystack.utils import launch_es
+
+launch_es()
+```
+
+
+```python
+# In Colab / No Docker environments: Start Elasticsearch from source
+! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
+! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
+! chown -R daemon:daemon elasticsearch-7.9.2
+
+import os
+from subprocess import Popen, PIPE, STDOUT
+
+es_server = Popen(
+ ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon
+)
+# wait until ES has started
+! sleep 30
+```
+
+
+```python
+# Connect to Elasticsearch
+from haystack.document_stores import ElasticsearchDocumentStore
+
+document_index = "document"
+document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index=document_index)
+```
+
+## Add Tables to DocumentStore
+To quickly demonstrate the capabilities of the `EmbeddingRetriever` and the `TableReader` we use a subset of 1000 tables and text documents from a dataset we have published in [this paper](https://arxiv.org/abs/2108.04049).
+
+Just as text passages, tables are represented as `Document` objects in Haystack. The content field, though, is a pandas DataFrame instead of a string.
+
+
+```python
+# Let's first fetch some tables that we want to query
+# Here: 1000 tables from OTT-QA
+from haystack.utils import fetch_archive_from_http
+
+doc_dir = "data/tutorial15"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/table_text_dataset.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
+```
+
+
+```python
+# Add the tables to the DocumentStore
+
+import json
+from haystack import Document
+import pandas as pd
+
+
+def read_tables(filename):
+ processed_tables = []
+ with open(filename) as tables:
+ tables = json.load(tables)
+ for key, table in tables.items():
+ current_columns = table["header"]
+ current_rows = table["data"]
+ current_df = pd.DataFrame(columns=current_columns, data=current_rows)
+ document = Document(content=current_df, content_type="table", id=key)
+ processed_tables.append(document)
+
+ return processed_tables
+
+
+tables = read_tables(f"{doc_dir}/tables.json")
+document_store.write_documents(tables, index=document_index)
+
+# Showing content field and meta field of one of the Documents of content_type 'table'
+print(tables[0].content)
+print(tables[0].meta)
+```
+
+## Initialize Retriever, Reader, & Pipeline
+
+### Retriever
+
+Retrievers help to narrow down the scope for the Reader to a subset of tables where a given question could be answered.
+They use simple but fast algorithms.
+
+**Here:** We specify an embedding model that is finetuned so it can also generate embeddings for tables (instead of just text).
+
+**Alternatives:**
+
+- `BM25Retriever` that uses BM25 algorithm
+
+
+
+```python
+from haystack.nodes.retriever import EmbeddingRetriever
+
+retriever = EmbeddingRetriever(
+ document_store=document_store,
+ embedding_model="deepset/all-mpnet-base-v2-table",
+ model_format="sentence_transformers",
+)
+```
+
+
+```python
+# Add table embeddings to the tables in DocumentStore
+document_store.update_embeddings(retriever=retriever)
+```
+
+
+```python
+## Alternative: BM25Retriever
+# from haystack.nodes.retriever import BM25Retriever
+# retriever = BM25Retriever(document_store=document_store)
+```
+
+
+```python
+# Try the Retriever
+from haystack.utils import print_documents
+
+retrieved_tables = retriever.retrieve("Who won the Super Bowl?", top_k=5)
+# Get highest scored table
+print(retrieved_tables[0].content)
+```
+
+### Reader
+The `TableReader` is based on TaPas, a transformer-based language model capable of grasping the two-dimensional structure of a table. It scans the tables returned by the retriever and extracts the answer. The available TableReader models can be found [here](https://huggingface.co/models?pipeline_tag=table-question-answering&sort=downloads).
+
+**Notice**: The `TableReader` will return an answer for each table, even if the query cannot be answered by the table. Furthermore, the confidence scores are not useful as of now, given that they will *always* be very high (i.e. 1 or close to 1).
+
+
+```python
+from haystack.nodes import TableReader
+
+reader = TableReader(model_name_or_path="google/tapas-base-finetuned-wtq", max_seq_len=512)
+```
+
+
+```python
+# Try the TableReader on one Table
+
+table_doc = document_store.get_document_by_id("36964e90-3735-4ba1-8e6a-bec236e88bb2")
+print(table_doc.content)
+```
+
+
+```python
+from haystack.utils import print_answers
+
+prediction = reader.predict(query="Who played Gregory House in the series House?", documents=[table_doc])
+print_answers(prediction, details="all")
+```
+
+The offsets in the `offsets_in_document` and `offsets_in_context` field indicate the table cells that the model predicts to be part of the answer. They need to be interpreted on the linearized table, i.e., a flat list containing all of the table cells.
+
+
+```python
+print(f"Predicted answer: {prediction['answers'][0].answer}")
+print(f"Meta field: {prediction['answers'][0].meta}")
+```
+
+### Pipeline
+The Retriever and the Reader can be combined into a pipeline in order to first retrieve relevant tables and then extract the answer.
+
+**Notice**: Given that the `TableReader` does not provide useful confidence scores and returns an answer for each of the tables, the sorting of the answers might not be helpful.
+
+
+```python
+# Initialize pipeline
+from haystack import Pipeline
+
+table_qa_pipeline = Pipeline()
+table_qa_pipeline.add_node(component=retriever, name="EmbeddingRetriever", inputs=["Query"])
+table_qa_pipeline.add_node(component=reader, name="TableReader", inputs=["EmbeddingRetriever"])
+```
+
+
+```python
+prediction = table_qa_pipeline.run("When was Guilty Gear Xrd : Sign released?", params={"top_k": 30})
+print_answers(prediction, details="minimum")
+```
+
+
+```python
+# Add 500 text passages to our document store.
+
+
+def read_texts(filename):
+ processed_passages = []
+ with open(filename) as passages:
+ passages = json.load(passages)
+ for key, content in passages.items():
+ document = Document(content=content, content_type="text", id=key)
+ processed_passages.append(document)
+
+ return processed_passages
+
+
+passages = read_texts(f"{doc_dir}/texts.json")
+document_store.write_documents(passages, index=document_index)
+```
+
+
+```python
+document_store.update_embeddings(retriever=retriever, update_existing_embeddings=False)
+```
+
+## Pipeline for QA on Combination of Text and Tables
+We are using one node for retrieving both texts and tables, the `EmbeddingRetriever`. In order to do question-answering on the Documents coming from the `EmbeddingRetriever`, we need to route Documents of type `"text"` to a `FARMReader` (or alternatively `TransformersReader`) and Documents of type `"table"` to a `TableReader`.
+
+To achieve this, we make use of two additional nodes:
+- `RouteDocuments`: Splits the List of Documents retrieved by the `EmbeddingRetriever` into two lists containing only Documents of type `"text"` or `"table"`, respectively.
+- `JoinAnswers`: Takes Answers coming from two different Readers (in this case `FARMReader` and `TableReader`) and joins them to a single list of Answers.
+
+
+```python
+from haystack.nodes import FARMReader, RouteDocuments, JoinAnswers
+
+text_reader = FARMReader("deepset/roberta-base-squad2")
+# In order to get meaningful scores from the TableReader, use "deepset/tapas-large-nq-hn-reader" or
+# "deepset/tapas-large-nq-reader" as TableReader models. The disadvantage of these models is, however,
+# that they are not capable of doing aggregations over multiple table cells.
+table_reader = TableReader("deepset/tapas-large-nq-hn-reader")
+route_documents = RouteDocuments()
+join_answers = JoinAnswers()
+```
+
+
+```python
+text_table_qa_pipeline = Pipeline()
+text_table_qa_pipeline.add_node(component=retriever, name="EmbeddingRetriever", inputs=["Query"])
+text_table_qa_pipeline.add_node(component=route_documents, name="RouteDocuments", inputs=["EmbeddingRetriever"])
+text_table_qa_pipeline.add_node(component=text_reader, name="TextReader", inputs=["RouteDocuments.output_1"])
+text_table_qa_pipeline.add_node(component=table_reader, name="TableReader", inputs=["RouteDocuments.output_2"])
+text_table_qa_pipeline.add_node(component=join_answers, name="JoinAnswers", inputs=["TextReader", "TableReader"])
+```
+
+
+```python
+# Let's have a look at the structure of the combined Table and Text QA pipeline.
+from IPython import display
+
+text_table_qa_pipeline.draw()
+display.Image("pipeline.png")
+```
+
+
+```python
+# Example query whose answer resides in a text passage
+predictions = text_table_qa_pipeline.run(query="Who was Thomas Alva Edison?")
+```
+
+
+```python
+# We can see both text passages and tables as contexts of the predicted answers.
+print_answers(predictions, details="minimum")
+```
+
+
+```python
+# Example query whose answer resides in a table
+predictions = text_table_qa_pipeline.run(query="Which country does the film Macaroni come from?")
+```
+
+
+```python
+# We can see both text passages and tables as contexts of the predicted answers.
+print_answers(predictions, details="minimum")
+```
+
+## Evaluation
+To evaluate our pipeline, we can use haystack's evaluation feature. We just need to convert our labels into `MultiLabel` objects and the `eval` method will do the rest.
+
+
+```python
+from haystack import Label, MultiLabel, Answer
+
+
+def read_labels(filename, tables):
+ processed_labels = []
+ with open(filename) as labels:
+ labels = json.load(labels)
+ for table in tables:
+ if table.id not in labels:
+ continue
+ label = labels[table.id]
+ label = Label(
+ query=label["query"],
+ document=table,
+ is_correct_answer=True,
+ is_correct_document=True,
+ answer=Answer(answer=label["answer"]),
+ origin="gold-label",
+ )
+ processed_labels.append(MultiLabel(labels=[label]))
+ return processed_labels
+
+
+table_labels = read_labels(f"{doc_dir}/labels.json", tables)
+passage_labels = read_labels(f"{doc_dir}/labels.json", passages)
+```
+
+
+```python
+eval_results = text_table_qa_pipeline.eval(table_labels + passage_labels, params={"top_k": 10})
+```
+
+
+```python
+# Calculating and printing the evaluation metrics
+print(eval_results.calculate_metrics())
+```
+
+## Adding tables from PDFs
+It can sometimes be hard to provide your data in the form of a pandas DataFrame. For this case, we provide the `ParsrConverter` wrapper that can help you to convert, for example, a PDF file into a document that you can index.
+
+
+```python
+import time
+
+!docker run -d -p 3001:3001 axarev/parsr
+time.sleep(30)
+```
+
+
+```python
+!wget https://www.w3.org/WAI/WCAG21/working-examples/pdf-table/table.pdf
+```
+
+
+```python
+from haystack.nodes import ParsrConverter
+
+converter = ParsrConverter()
+
+docs = converter.convert("table.pdf")
+
+tables = [doc for doc in docs if doc["content_type"] == "table"]
+```
+
+
+```python
+print(tables)
+```
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
+
diff --git a/docs/v1.5.0/_src/tutorials/tutorials/16.md b/docs/v1.5.0/_src/tutorials/tutorials/16.md
new file mode 100644
index 0000000000..9dde92d509
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/tutorials/16.md
@@ -0,0 +1,260 @@
+
+
+# Extending your Metadata using DocumentClassifiers at Index Time
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial16_Document_Classifier_at_Index_Time.ipynb)
+
+With DocumentClassifier it's possible to automatically enrich your documents with categories, sentiments, topics or whatever metadata you like. This metadata could be used for efficient filtering or further processing. Say you have some categories your users typically filter on. If the documents are tagged manually with these categories, you could automate this process by training a model. Or you can leverage the full power and flexibility of zero shot classification. All you need to do is pass your categories to the classifier, no labels required. This tutorial shows how to integrate it in your indexing pipeline.
+
+DocumentClassifier adds the classification result (label and score) to Document's meta property.
+Hence, we can use it to classify documents at index time. \
+The result can be accessed at query time: for example by applying a filter for "classification.label".
+
+This tutorial will show you how to integrate a classification model into your preprocessing steps and how you can filter for this additional metadata at query time. In the last section we show how to put it all together and create an indexing pipeline.
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#! pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr]
+
+!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz
+!tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin
+
+# Install pygraphviz
+!apt install libgraphviz-dev
+!pip install pygraphviz
+```
+
+
+```python
+# Here are the imports we need
+from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
+from haystack.nodes import PreProcessor, TransformersDocumentClassifier, FARMReader, BM25Retriever
+from haystack.schema import Document
+from haystack.utils import convert_files_to_docs, fetch_archive_from_http, print_answers
+```
+
+
+```python
+# This fetches some sample files to work with
+
+doc_dir = "data/tutorial16"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial16.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
+```
+
+## Read and preprocess documents
+
+
+
+```python
+# note that you can also use the document classifier before applying the PreProcessor, e.g. before splitting your documents
+
+all_docs = convert_files_to_docs(dir_path=doc_dir)
+preprocessor_sliding_window = PreProcessor(split_overlap=3, split_length=10, split_respect_sentence_boundary=False)
+docs_sliding_window = preprocessor_sliding_window.process(all_docs)
+```
+
+## Apply DocumentClassifier
+
+We can enrich the document metadata at index time using any transformers document classifier model. While traditional classification models are trained to predict one of a few "hard-coded" classes and require a dedicated training dataset, zero-shot classification is super flexible and you can easily switch the classes the model should predict on the fly. Just supply them via the labels param.
+Here we use a zero shot model that is supposed to classify our documents in 'music', 'natural language processing' and 'history'. Feel free to change them for whatever you like to classify. \
+These classes can later on be accessed at query time.
+
+
+```python
+doc_classifier = TransformersDocumentClassifier(
+ model_name_or_path="cross-encoder/nli-distilroberta-base",
+ task="zero-shot-classification",
+ labels=["music", "natural language processing", "history"],
+ batch_size=16,
+)
+```
+
+
+```python
+# we can also use any other transformers model besides zero shot classification
+
+# doc_classifier_model = 'bhadresh-savani/distilbert-base-uncased-emotion'
+# doc_classifier = TransformersDocumentClassifier(model_name_or_path=doc_classifier_model, batch_size=16, use_gpu=-1)
+```
+
+
+```python
+# we could also specify a different field we want to run the classification on
+
+# doc_classifier = TransformersDocumentClassifier(model_name_or_path="cross-encoder/nli-distilroberta-base",
+# task="zero-shot-classification",
+# labels=["music", "natural language processing", "history"],
+# batch_size=16, use_gpu=-1,
+# classification_field="description")
+```
+
+
+```python
+# classify using gpu, batch_size makes sure we do not run out of memory
+classified_docs = doc_classifier.predict(docs_sliding_window)
+```
+
+
+```python
+# let's see how it looks: there should be a classification result in the meta entry containing labels and scores.
+print(classified_docs[0].to_dict())
+```
+
+## Indexing
+
+
+```python
+# In Colab / No Docker environments: Start Elasticsearch from source
+! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
+! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
+! chown -R daemon:daemon elasticsearch-7.9.2
+
+import os
+from subprocess import Popen, PIPE, STDOUT
+
+es_server = Popen(
+ ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon
+)
+# wait until ES has started
+! sleep 30
+```
+
+
+```python
+# Connect to Elasticsearch
+document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
+```
+
+
+```python
+# Now, let's write the docs to our DB.
+document_store.delete_all_documents()
+document_store.write_documents(classified_docs)
+```
+
+
+```python
+# check if indexed docs contain classification results
+test_doc = document_store.get_all_documents()[0]
+print(
+ f'document {test_doc.id} with content \n\n{test_doc.content}\n\nhas label {test_doc.meta["classification"]["label"]}'
+)
+```
+
+## Querying the data
+
+All we have to do to filter for one of our classes is to set a filter on "classification.label".
+
+
+```python
+# Initialize QA-Pipeline
+from haystack.pipelines import ExtractiveQAPipeline
+
+retriever = BM25Retriever(document_store=document_store)
+reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
+pipe = ExtractiveQAPipeline(reader, retriever)
+```
+
+
+```python
+## Voilà ! Ask a question while filtering for "music"-only documents
+prediction = pipe.run(
+ query="What is heavy metal?",
+ params={"Retriever": {"top_k": 10, "filters": {"classification.label": ["music"]}}, "Reader": {"top_k": 5}},
+)
+```
+
+
+```python
+print_answers(prediction, details="high")
+```
+
+## Wrapping it up in an indexing pipeline
+
+
+```python
+from pathlib import Path
+from haystack.pipelines import Pipeline
+from haystack.nodes import TextConverter, PreProcessor, FileTypeClassifier, PDFToTextConverter, DocxToTextConverter
+```
+
+
+```python
+file_type_classifier = FileTypeClassifier()
+text_converter = TextConverter()
+pdf_converter = PDFToTextConverter()
+docx_converter = DocxToTextConverter()
+
+indexing_pipeline_with_classification = Pipeline()
+indexing_pipeline_with_classification.add_node(
+ component=file_type_classifier, name="FileTypeClassifier", inputs=["File"]
+)
+indexing_pipeline_with_classification.add_node(
+ component=text_converter, name="TextConverter", inputs=["FileTypeClassifier.output_1"]
+)
+indexing_pipeline_with_classification.add_node(
+ component=pdf_converter, name="PdfConverter", inputs=["FileTypeClassifier.output_2"]
+)
+indexing_pipeline_with_classification.add_node(
+ component=docx_converter, name="DocxConverter", inputs=["FileTypeClassifier.output_4"]
+)
+indexing_pipeline_with_classification.add_node(
+ component=preprocessor_sliding_window,
+ name="Preprocessor",
+ inputs=["TextConverter", "PdfConverter", "DocxConverter"],
+)
+indexing_pipeline_with_classification.add_node(
+ component=doc_classifier, name="DocumentClassifier", inputs=["Preprocessor"]
+)
+indexing_pipeline_with_classification.add_node(
+ component=document_store, name="DocumentStore", inputs=["DocumentClassifier"]
+)
+indexing_pipeline_with_classification.draw("index_time_document_classifier.png")
+
+document_store.delete_documents()
+txt_files = [f for f in Path(doc_dir).iterdir() if f.suffix == ".txt"]
+pdf_files = [f for f in Path(doc_dir).iterdir() if f.suffix == ".pdf"]
+docx_files = [f for f in Path(doc_dir).iterdir() if f.suffix == ".docx"]
+indexing_pipeline_with_classification.run(file_paths=txt_files)
+indexing_pipeline_with_classification.run(file_paths=pdf_files)
+indexing_pipeline_with_classification.run(file_paths=docx_files)
+
+document_store.get_all_documents()[0]
+```
+
+
+```python
+# we can store this pipeline and use it from the REST-API
+indexing_pipeline_with_classification.save_to_yaml("indexing_pipeline_with_classification.yaml")
+```
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
+
diff --git a/docs/v1.5.0/_src/tutorials/tutorials/2.md b/docs/v1.5.0/_src/tutorials/tutorials/2.md
new file mode 100644
index 0000000000..81559ec5fb
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/tutorials/2.md
@@ -0,0 +1,157 @@
+
+
+# Fine-tuning a Model on Your Own Data
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb)
+
+For many use cases it is sufficient to just use one of the existing public models that were trained on SQuAD or other public QA datasets (e.g. Natural Questions).
+However, if you have domain-specific questions, fine-tuning your model on custom examples will very likely boost your performance.
+While this varies by domain, we saw that ~ 2000 examples can easily increase performance by +5-20%.
+
+This tutorial shows you how to fine-tune a pretrained model on your own dataset.
+
+### Prepare environment
+
+#### Colab: Enable the GPU runtime
+Make sure you enable the GPU runtime to experience decent speed in this tutorial.
+**Runtime -> Change Runtime type -> Hardware accelerator -> GPU**
+
+
+
+
+```python
+# Make sure you have a GPU running
+!nvidia-smi
+```
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#! pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]
+```
+
+
+```python
+from haystack.nodes import FARMReader
+from haystack.utils import fetch_archive_from_http
+```
+
+
+## Create Training Data
+
+There are two ways to generate training data
+
+1. **Annotation**: You can use the [annotation tool](https://haystack.deepset.ai/guides/v1.5.0/annotation) to label your data, i.e. highlighting answers to your questions in a document. The tool supports structuring your workflow with organizations, projects, and users. The labels can be exported in SQuAD format that is compatible for training with Haystack.
+
+![Snapshot of the annotation tool](https://raw.githubusercontent.com/deepset-ai/haystack/master/docs/img/annotation_tool.png)
+
+2. **Feedback**: For production systems, you can collect training data from direct user feedback via Haystack's [REST API interface](https://github.com/deepset-ai/haystack#rest-api). This includes a customizable user feedback API for providing feedback on the answer returned by the API. The API provides a feedback export endpoint to obtain the feedback data for fine-tuning your model further.
+
+
+## Fine-tune your model
+
+Once you have collected training data, you can fine-tune your base models.
+We initialize a reader as a base model and fine-tune it on our own custom dataset (should be in SQuAD-like format).
+We recommend using a base model that was trained on SQuAD or a similar QA dataset before to benefit from Transfer Learning effects.
+
+**Recommendation**: Run training on a GPU.
+If you are using Colab: Enable this in the menu "Runtime" > "Change Runtime type" > Select "GPU" in dropdown.
+Then change the `use_gpu` arguments below to `True`
+
+
+```python
+reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=True)
+data_dir = "data/squad20"
+# data_dir = "PATH/TO_YOUR/TRAIN_DATA"
+reader.train(data_dir=data_dir, train_filename="dev-v2.0.json", use_gpu=True, n_epochs=1, save_dir="my_model")
+```
+
+
+```python
+# Saving the model happens automatically at the end of training into the `save_dir` you specified
+# However, you could also save a reader manually again via:
+reader.save(directory="my_model")
+```
+
+
+```python
+# If you want to load it at a later point, just do:
+new_reader = FARMReader(model_name_or_path="my_model")
+```
+
+## Distill your model
+In this case, we have used "distilbert-base-uncased" as our base model. This model was trained using a process called distillation. In this process, a bigger model is trained first and is used to train a smaller model which increases its accuracy. This is why "distilbert-base-uncased" can achieve quite competitive performance while being very small.
+
+Sometimes, however, you can't use an already distilled model and have to distil it yourself. For this case, haystack has implemented [distillation features](https://haystack.deepset.ai/guides/model-distillation).
+
+### Augmenting your training data
+To get the most out of model distillation, we recommend increasing the size of your training data by using data augmentation. You can do this by running the [`augment_squad.py` script](https://github.com/deepset-ai/haystack/blob/master/haystack/utils/augment_squad.py):
+
+
+```python
+# Downloading script
+!wget https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/utils/augment_squad.py
+
+doc_dir = "data/tutorial2"
+
+# Downloading smaller glove vector file (only for demonstration purposes)
+glove_url = "https://nlp.stanford.edu/data/glove.6B.zip"
+fetch_archive_from_http(url=glove_url, output_dir=doc_dir)
+
+# Downloading very small dataset to make tutorial faster (please use a bigger dataset for real use cases)
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/squad_small.json.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
+
+# Just replace the path with your dataset and adjust the output (also please remove glove path to use bigger glove vector file)
+!python augment_squad.py --squad_path squad_small.json --output_path augmented_dataset.json --multiplication_factor 2 --glove_path glove.6B.300d.txt
+```
+
+In this case, we use a multiplication factor of 2 to keep this example lightweight. Usually you would use a factor like 20 depending on the size of your training data. Augmenting this small dataset with a multiplication factor of 2, should take about 5 to 10 minutes to run on one V100 GPU.
+
+### Running distillation
+Distillation in haystack is done in two steps: First, you run intermediate layer distillation on the augmented dataset to ensure the two models behave similarly. After that, you run the prediction layer distillation on the non-augmented dataset to optimize the model for your specific task.
+
+If you want, you can leave out the intermediate layer distillation step and only run the prediction layer distillation. This way you also do not need to perform data augmentation. However, this will make the model significantly less accurate.
+
+
+```python
+# Loading a fine-tuned model as teacher e.g. "deepset/​bert-​base-​uncased-​squad2"
+teacher = FARMReader(model_name_or_path="my_model", use_gpu=True)
+
+# You can use any pre-trained language model as student, provided it uses the same tokenizer as the teacher model.
+# The number of the layers in the teacher model also needs to be a multiple of the number of the layers in the student.
+student = FARMReader(model_name_or_path="huawei-noah/TinyBERT_General_6L_768D", use_gpu=True)
+
+student.distil_intermediate_layers_from(teacher, data_dir=".", train_filename="augmented_dataset.json", use_gpu=True)
+student.distil_prediction_layer_from(teacher, data_dir="data/squad20", train_filename="dev-v2.0.json", use_gpu=True)
+
+student.save(directory="my_distilled_model")
+```
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
diff --git a/docs/v1.5.0/_src/tutorials/tutorials/3.md b/docs/v1.5.0/_src/tutorials/tutorials/3.md
new file mode 100644
index 0000000000..f6c82ca84f
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/tutorials/3.md
@@ -0,0 +1,230 @@
+
+
+# Build a QA System Without Elasticsearch
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb)
+
+Haystack provides alternatives to Elasticsearch for developing quick prototypes.
+
+You can use an `InMemoryDocumentStore` or a `SQLDocumentStore`(with SQLite) as the document store.
+
+If you are interested in the more feature-rich Elasticsearch, please refer to Tutorial 1.
+
+### Prepare environment
+
+#### Colab: Enable the GPU runtime
+Make sure you enable the GPU runtime to experience decent speed in this tutorial.
+**Runtime -> Change Runtime type -> Hardware accelerator -> GPU**
+
+
+
+
+```python
+# Make sure you have a GPU running
+!nvidia-smi
+```
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#! pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]
+```
+
+
+```python
+from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers
+from haystack.nodes import FARMReader, TransformersReader
+```
+
+## Document Store
+
+
+
+```python
+# In-Memory Document Store
+from haystack.document_stores import InMemoryDocumentStore
+
+document_store = InMemoryDocumentStore()
+```
+
+
+```python
+# SQLite Document Store
+# from haystack.document_stores import SQLDocumentStore
+# document_store = SQLDocumentStore(url="sqlite:///qa.db")
+```
+
+## Preprocessing of documents
+
+Haystack provides a customizable pipeline for:
+ - converting files into texts
+ - cleaning texts
+ - splitting texts
+ - writing them to a Document Store
+
+In this tutorial, we download Wikipedia articles on Game of Thrones, apply a basic cleaning function, and index them in our document store.
+
+
+```python
+# Let's first get some documents that we want to query
+# Here: 517 Wikipedia articles for Game of Thrones
+doc_dir = "data/tutorial3"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt3.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
+
+# convert files to dicts containing documents that can be indexed to our datastore
+# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
+# It must take a str as input, and return a str.
+docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
+
+# We now have a list of dictionaries that we can write to our document store.
+# If your texts come from a different source (e.g. a DB), you can of course skip convert_files_to_dicts() and create the dictionaries yourself.
+# The default format here is: {"name": "<some-document-name>", "content": "<the-actual-text>"}
+
+# Let's have a look at the first 3 entries:
+print(docs[:3])
+# Now, let's write the docs to our DB.
+document_store.write_documents(docs)
+```
+
+## Initialize Retriever, Reader & Pipeline
+
+### Retriever
+
+Retrievers help narrow down the scope for the Reader to smaller units of text where a given question could be answered.
+
+With InMemoryDocumentStore or SQLDocumentStore, you can use the TfidfRetriever. For more retrievers, please refer to the tutorial-1.
+
+
+```python
+# An in-memory TfidfRetriever based on Pandas dataframes
+
+from haystack.nodes import TfidfRetriever
+
+retriever = TfidfRetriever(document_store=document_store)
+```
+
+### Reader
+
+A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based
+on powerful, but slower deep learning models.
+
+Haystack currently supports Readers based on the frameworks FARM and Transformers.
+With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).
+
+**Here:** a medium sized RoBERTa QA model using a Reader based on FARM (https://huggingface.co/deepset/roberta-base-squad2)
+
+**Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package)
+
+**Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
+
+**Hint:** You can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean the model prefers "no answer possible"
+
+#### FARMReader
+
+
+```python
+# Load a local model or any of the QA models on
+# Hugging Face's model hub (https://huggingface.co/models)
+
+reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
+```
+
+#### TransformersReader
+
+
+```python
+# Alternative:
+# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)
+```
+
+### Pipeline
+
+With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.
+Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
+To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
+You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/components/v1.5.0/pipelines).
+
+
+```python
+from haystack.pipelines import ExtractiveQAPipeline
+
+pipe = ExtractiveQAPipeline(reader, retriever)
+```
+
+## Voilà! Ask a question!
+
+
+```python
+# You can configure how many candidates the reader and retriever shall return
+# The higher top_k for retriever, the better (but also the slower) your answers.
+prediction = pipe.run(
+ query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
+)
+```
+
+
+```python
+# prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}})
+# prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}})
+```
+
+
+```python
+# Now you can either print the object directly...
+from pprint import pprint
+
+pprint(prediction)
+
+# Sample output:
+# {
+# 'answers': [ ,
+# ,
+# ...
+# ]
+# 'documents': [ ,
+# ,
+# ...
+# ],
+# 'no_ans_gap': 11.688868522644043,
+# 'node_id': 'Reader',
+# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
+# 'query': 'Who is the father of Arya Stark?',
+# 'root_node': 'Query'
+# }
+```
+
+
+```python
+# ...or use a util to simplify the output
+# Change `minimum` to `medium` or `all` to raise the level of detail
+print_answers(prediction, details="minimum")
+```
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
diff --git a/docs/v1.5.0/_src/tutorials/tutorials/4.md b/docs/v1.5.0/_src/tutorials/tutorials/4.md
new file mode 100644
index 0000000000..5740b05b1a
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/tutorials/4.md
@@ -0,0 +1,185 @@
+
+
+# Utilizing existing FAQs for Question Answering
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial4_FAQ_style_QA.ipynb)
+
+While *extractive Question Answering* works on pure texts and is therefore more generalizable, there's also a common alternative that utilizes existing FAQ data.
+
+**Pros**:
+
+- Very fast at inference time
+- Utilize existing FAQ data
+- Quite good control over answers
+
+**Cons**:
+
+- Generalizability: We can only answer questions that are similar to existing ones in FAQ
+
+In some use cases, a combination of extractive QA and FAQ-style can also be an interesting option.
+
+### Prepare environment
+
+#### Colab: Enable the GPU runtime
+Make sure you enable the GPU runtime to experience decent speed in this tutorial.
+**Runtime -> Change Runtime type -> Hardware accelerator -> GPU**
+
+
+
+
+```python
+# Make sure you have a GPU running
+!nvidia-smi
+```
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#! pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]
+```
+
+
+```python
+from haystack.document_stores import ElasticsearchDocumentStore
+
+from haystack.nodes import EmbeddingRetriever
+import pandas as pd
+import requests
+```
+
+### Start an Elasticsearch server
+You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in your environment (eg., in Colab notebooks), then you can manually download and execute Elasticsearch from source.
+
+
+```python
+# Recommended: Start Elasticsearch using Docker via the Haystack utility function
+from haystack.utils import launch_es
+
+launch_es()
+```
+
+
+```python
+# In Colab / No Docker environments: Start Elasticsearch from source
+! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
+! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
+! chown -R daemon:daemon elasticsearch-7.9.2
+
+import os
+from subprocess import Popen, PIPE, STDOUT
+
+es_server = Popen(
+ ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon
+)
+# wait until ES has started
+! sleep 30
+```
+
+### Init the DocumentStore
+In contrast to Tutorial 1 (extractive QA), we:
+
+* specify the name of our `text_field` in Elasticsearch that we want to return as an answer
+* specify the name of our `embedding_field` in Elasticsearch where we'll store the embedding of our question and that is used later for calculating our similarity to the incoming user question
+* set `excluded_meta_data=["question_emb"]` so that we don't return the huge embedding vectors in our search results
+
+
+```python
+from haystack.document_stores import ElasticsearchDocumentStore
+
+document_store = ElasticsearchDocumentStore(
+ host="localhost",
+ username="",
+ password="",
+ index="document",
+ embedding_field="question_emb",
+ embedding_dim=384,
+ excluded_meta_data=["question_emb"],
+)
+```
+
+### Create a Retriever using embeddings
+Instead of retrieving via Elasticsearch's plain BM25, we want to use vector similarity of the questions (user question vs. FAQ ones).
+We can use the `EmbeddingRetriever` for this purpose and specify a model that we use for the embeddings.
+
+
+```python
+retriever = EmbeddingRetriever(
+ document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2", use_gpu=True
+)
+```
+
+### Prepare & Index FAQ data
+We create a pandas dataframe containing some FAQ data (i.e. curated pairs of questions + answers) and index those in Elasticsearch.
+Here: We download some question-answer pairs related to COVID-19
+
+
+```python
+from haystack.utils import fetch_archive_from_http
+
+# Download
+doc_dir = "data/tutorial4"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/small_faq_covid.csv.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
+
+# Get dataframe with columns "question", "answer" and some custom metadata
+df = pd.read_csv(f"{doc_dir}/small_faq_covid.csv")
+# Minimal cleaning
+df.fillna(value="", inplace=True)
+df["question"] = df["question"].apply(lambda x: x.strip())
+print(df.head())
+
+# Get embeddings for our questions from the FAQs
+questions = list(df["question"].values)
+df["question_emb"] = retriever.embed_queries(texts=questions)
+df = df.rename(columns={"question": "content"})
+
+# Convert Dataframe to list of dicts and index them in our DocumentStore
+docs_to_index = df.to_dict(orient="records")
+document_store.write_documents(docs_to_index)
+```
+
+### Ask questions
+Initialize a Pipeline (this time without a reader) and ask questions
+
+
+```python
+from haystack.pipelines import FAQPipeline
+
+pipe = FAQPipeline(retriever=retriever)
+```
+
+
+```python
+from haystack.utils import print_answers
+
+prediction = pipe.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}})
+print_answers(prediction, details="medium")
+```
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
diff --git a/docs/v1.5.0/_src/tutorials/tutorials/5.md b/docs/v1.5.0/_src/tutorials/tutorials/5.md
new file mode 100644
index 0000000000..41a215ff34
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/tutorials/5.md
@@ -0,0 +1,393 @@
+
+
+# Evaluation of a Pipeline and its Components
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial5_Evaluation.ipynb)
+
+To be able to make a statement about the quality of results a question-answering pipeline or any other pipeline in haystack produces, it is important to evaluate it. Furthermore, evaluation allows determining which components of the pipeline can be improved.
+The results of the evaluation can be saved as CSV files, which contain all the information to calculate additional metrics later on or inspect individual predictions.
+
+### Prepare environment
+
+#### Colab: Enable the GPU runtime
+Make sure you enable the GPU runtime to experience decent speed in this tutorial.
+**Runtime -> Change Runtime type -> Hardware accelerator -> GPU**
+
+
+
+
+```python
+# Make sure you have a GPU running
+!nvidia-smi
+```
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#! pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]
+```
+
+## Start an Elasticsearch server
+You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in your environment (eg., in Colab notebooks), then you can manually download and execute Elasticsearch from source.
+
+
+```python
+# If Docker is available: Start Elasticsearch as docker container
+# from haystack.utils import launch_es
+# launch_es()
+
+# Alternative in Colab / No Docker environments: Start Elasticsearch from source
+! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
+! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
+! chown -R daemon:daemon elasticsearch-7.9.2
+
+import os
+from subprocess import Popen, PIPE, STDOUT
+
+es_server = Popen(
+ ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon
+)
+# wait until ES has started
+! sleep 30
+```
+
+## Fetch, Store And Preprocess the Evaluation Dataset
+
+
+```python
+from haystack.utils import fetch_archive_from_http
+
+# Download evaluation data, which is a subset of Natural Questions development set containing 50 documents with one question per document and multiple annotated answers
+doc_dir = "data/tutorial5"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
+```
+
+
+```python
+# make sure these indices do not collide with existing ones, the indices will be wiped clean before data is inserted
+doc_index = "tutorial5_docs"
+label_index = "tutorial5_labels"
+```
+
+
+```python
+# Connect to Elasticsearch
+from haystack.document_stores import ElasticsearchDocumentStore
+
+# Connect to Elasticsearch
+document_store = ElasticsearchDocumentStore(
+ host="localhost",
+ username="",
+ password="",
+ index=doc_index,
+ label_index=label_index,
+ embedding_field="emb",
+ embedding_dim=768,
+ excluded_meta_data=["emb"],
+)
+```
+
+
+```python
+from haystack.nodes import PreProcessor
+
+# Add evaluation data to Elasticsearch Document Store
+# We first delete the custom tutorial indices to not have duplicate elements
+# and also split our documents into shorter passages using the PreProcessor
+preprocessor = PreProcessor(
+ split_length=200,
+ split_overlap=0,
+ split_respect_sentence_boundary=False,
+ clean_empty_lines=False,
+ clean_whitespace=False,
+)
+document_store.delete_documents(index=doc_index)
+document_store.delete_documents(index=label_index)
+
+# The add_eval_data() method converts the given dataset in json format into Haystack document and label objects. Those objects are then indexed in their respective document and label index in the document store. The method can be used with any dataset in SQuAD format.
+document_store.add_eval_data(
+ filename="data/tutorial5/nq_dev_subset_v2.json",
+ doc_index=doc_index,
+ label_index=label_index,
+ preprocessor=preprocessor,
+)
+```
+
+## Initialize the Two Components of an ExtractiveQAPipeline: Retriever and Reader
+
+
+```python
+# Initialize Retriever
+from haystack.nodes import BM25Retriever
+
+retriever = BM25Retriever(document_store=document_store)
+
+# Alternative: Evaluate dense retrievers (EmbeddingRetriever or DensePassageRetriever)
+# The EmbeddingRetriever uses a single transformer based encoder model for query and document.
+# In contrast, DensePassageRetriever uses two separate encoders for both.
+
+# Please make sure the "embedding_dim" parameter in the DocumentStore above matches the output dimension of your models!
+# Please also take care that the PreProcessor splits your files into chunks that can be completely converted with
+# the max_seq_len limitations of Transformers
+# The SentenceTransformer model "sentence-transformers/multi-qa-mpnet-base-dot-v1" generally works well with the EmbeddingRetriever on any kind of English text.
+# For more information and suggestions on different models check out the documentation at: https://www.sbert.net/docs/pretrained_models.html
+
+# from haystack.retriever import EmbeddingRetriever, DensePassageRetriever
+# retriever = EmbeddingRetriever(document_store=document_store, model_format="sentence_transformers",
+# embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1")
+# retriever = DensePassageRetriever(document_store=document_store,
+# query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
+# passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
+# use_gpu=True,
+# max_seq_len_passage=256,
+# embed_title=True)
+# document_store.update_embeddings(retriever, index=doc_index)
+```
+
+
+```python
+# Initialize Reader
+from haystack.nodes import FARMReader
+
+reader = FARMReader("deepset/roberta-base-squad2", top_k=4, return_no_answer=True)
+
+# Define a pipeline consisting of the initialized retriever and reader
+from haystack.pipelines import ExtractiveQAPipeline
+
+pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)
+
+# The evaluation also works with any other pipeline.
+# For example you could use a DocumentSearchPipeline as an alternative:
+
+# from haystack.pipelines import DocumentSearchPipeline
+# pipeline = DocumentSearchPipeline(retriever=retriever)
+```
+
+## Evaluation of an ExtractiveQAPipeline
+Here we evaluate retriever and reader in open domain fashion on the full corpus of documents i.e. a document is considered
+correctly retrieved if it contains the gold answer string within it. The reader is evaluated based purely on the
+predicted answer string, regardless of which document this came from and the position of the extracted span.
+
+The generation of predictions is separated from the calculation of metrics. This allows you to run the computation-heavy model predictions only once and then iterate flexibly on the metrics or reports you want to generate.
+
+
+
+```python
+from haystack.schema import EvaluationResult, MultiLabel
+
+# We can load evaluation labels from the document store
+# We are also opting to filter out no_answer samples
+eval_labels = document_store.get_all_labels_aggregated(drop_negative_labels=True, drop_no_answers=True)
+
+## Alternative: Define queries and labels directly
+
+# eval_labels = [
+# MultiLabel(
+# labels=[
+# Label(
+# query="who is written in the book of life",
+# answer=Answer(
+# answer="every person who is destined for Heaven or the World to Come",
+# offsets_in_context=[Span(374, 434)]
+# ),
+# document=Document(
+# id='1b090aec7dbd1af6739c4c80f8995877-0',
+# content_type="text",
+# content='Book of Life - wikipedia Book of Life Jump to: navigation, search This article is
+# about the book mentioned in Christian and Jewish religious teachings...'
+# ),
+# is_correct_answer=True,
+# is_correct_document=True,
+# origin="gold-label"
+# )
+# ]
+# )
+# ]
+
+# Similar to pipeline.run() we can execute pipeline.eval()
+eval_result = pipeline.eval(labels=eval_labels, params={"Retriever": {"top_k": 5}})
+```
+
+
+```python
+# The EvaluationResult contains a pandas dataframe for each pipeline node.
+# That's why there are two dataframes in the EvaluationResult of an ExtractiveQAPipeline.
+
+retriever_result = eval_result["Retriever"]
+retriever_result.head()
+```
+
+
+```python
+reader_result = eval_result["Reader"]
+reader_result.head()
+```
+
+
+```python
+# We can filter for all documents retrieved for a given query
+query = "who is written in the book of life"
+retriever_book_of_life = retriever_result[retriever_result["query"] == query]
+```
+
+
+```python
+# We can also filter for all answers predicted for a given query
+reader_book_of_life = reader_result[reader_result["query"] == query]
+```
+
+
+```python
+# Save the evaluation result so that we can reload it later and calculate evaluation metrics without running the pipeline again.
+eval_result.save("../")
+```
+
+## Calculating Evaluation Metrics
+Load an EvaluationResult to quickly calculate standard evaluation metrics for all predictions,
+such as F1-score of each individual prediction of the Reader node or recall of the retriever.
+To learn more about the metrics, see [Evaluation Metrics](https://haystack.deepset.ai/guides/v1.5.0/evaluation#metrics-retrieval)
+
+
+```python
+saved_eval_result = EvaluationResult.load("../")
+metrics = saved_eval_result.calculate_metrics()
+print(f'Retriever - Recall (single relevant document): {metrics["Retriever"]["recall_single_hit"]}')
+print(f'Retriever - Recall (multiple relevant documents): {metrics["Retriever"]["recall_multi_hit"]}')
+print(f'Retriever - Mean Reciprocal Rank: {metrics["Retriever"]["mrr"]}')
+print(f'Retriever - Precision: {metrics["Retriever"]["precision"]}')
+print(f'Retriever - Mean Average Precision: {metrics["Retriever"]["map"]}')
+
+print(f'Reader - F1-Score: {metrics["Reader"]["f1"]}')
+print(f'Reader - Exact Match: {metrics["Reader"]["exact_match"]}')
+```
+
+## Generating an Evaluation Report
+A summary of the evaluation results can be printed to get a quick overview. It includes some aggregated metrics and also shows a few wrongly predicted examples.
+
+
+```python
+pipeline.print_eval_report(saved_eval_result)
+```
+
+## Advanced Evaluation Metrics
+As an advanced evaluation metric, semantic answer similarity (SAS) can be calculated. This metric takes into account whether the meaning of a predicted answer is similar to the annotated gold answer rather than just doing string comparison.
+To this end SAS relies on pre-trained models. For English, we recommend "cross-encoder/stsb-roberta-large", whereas for German we recommend "deepset/gbert-large-sts". A good multilingual model is "sentence-transformers/paraphrase-multilingual-mpnet-base-v2".
+More info on this metric can be found in our [paper](https://arxiv.org/abs/2108.06130) or in our [blog post](https://www.deepset.ai/blog/semantic-answer-similarity-to-evaluate-qa).
+
+
+```python
+advanced_eval_result = pipeline.eval(
+ labels=eval_labels, params={"Retriever": {"top_k": 5}}, sas_model_name_or_path="cross-encoder/stsb-roberta-large"
+)
+
+metrics = advanced_eval_result.calculate_metrics()
+print(metrics["Reader"]["sas"])
+```
+
+## Isolated Evaluation Mode
+The isolated node evaluation uses labels as input to the Reader node instead of the output of the preceding Retriever node.
+Thereby, we can additionally calculate the upper bounds of the evaluation metrics of the Reader. Note that even with isolated evaluation enabled, integrated evaluation will still be running.
+
+
+
+```python
+eval_result_with_upper_bounds = pipeline.eval(
+ labels=eval_labels, params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 5}}, add_isolated_node_eval=True
+)
+```
+
+
+```python
+pipeline.print_eval_report(eval_result_with_upper_bounds)
+```
+
+## Evaluation of Individual Components: Retriever
+Sometimes you might want to evaluate individual components, for example, if you don't have a pipeline but only a retriever or a reader with a model that you trained yourself.
+Here we evaluate only the retriever, based on whether the gold_label document is retrieved.
+
+
+```python
+## Evaluate Retriever on its own
+# Note that no_answer samples are omitted when evaluation is performed with this method
+retriever_eval_results = retriever.eval(top_k=5, label_index=label_index, doc_index=doc_index)
+# Retriever Recall is the proportion of questions for which the correct document containing the answer is
+# among the correct documents
+print("Retriever Recall:", retriever_eval_results["recall"])
+# Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank
+print("Retriever Mean Avg Precision:", retriever_eval_results["map"])
+```
+
+Just as a sanity check, we can compare the recall from `retriever.eval()` with the multi hit recall from `pipeline.eval(add_isolated_node_eval=True)`.
+These two recall metrics are only comparable since we chose to filter out no_answer samples when generating eval_labels and setting doc_relevance_col to `"gold_id_match"`. Per default `calculate_metrics()` has doc_relevance_col set to `"gold_id_or_answer_match"` which interprets documents as relevant if they either match the gold_id or contain the answer.
+
+
+```python
+metrics = eval_result_with_upper_bounds.calculate_metrics(doc_relevance_col="gold_id_match")
+print(metrics["Retriever"]["recall_multi_hit"])
+```
+
+## Evaluation of Individual Components: Reader
+Here we evaluate only the reader in a closed domain fashion i.e. the reader is given one query
+and its corresponding relevant document and metrics are calculated on whether the right position in this text is selected by
+the model as the answer span (i.e. SQuAD style)
+
+
+```python
+# Evaluate Reader on its own
+reader_eval_results = reader.eval(document_store=document_store, label_index=label_index, doc_index=doc_index)
+top_n = reader_eval_results["top_n"]
+# Evaluation of Reader can also be done directly on a SQuAD-formatted file without passing the data to Elasticsearch
+# reader_eval_results = reader.eval_on_file("../data/nq", "nq_dev_subset_v2.json", device=device)
+
+# Reader Top-N-Accuracy is the proportion of predicted answers that match with their corresponding correct answer including no_answers
+print(f"Reader Top-{top_n}-Accuracy:", reader_eval_results["top_n_accuracy"])
+# Reader Top-1-Exact Match is the proportion of questions where the first predicted answer is exactly the same as the correct answer including no_answers
+print("Reader Top-1-Exact Match:", reader_eval_results["EM"])
+# Reader Top-1-F1-Score is the average overlap between the first predicted answers and the correct answers including no_answers
+print("Reader Top-1-F1-Score:", reader_eval_results["f1"])
+# Reader Top-N-Accuracy is the proportion of predicted answers that match with their corresponding correct answer excluding no_answers
+print(f"Reader Top-{top_n}-Accuracy (without no_answers):", reader_eval_results["top_n_accuracy_text_answer"])
+# Reader Top-N-Exact Match is the proportion of questions where the predicted answer within the first n results is exactly the same as the correct answer excluding no_answers (no_answers are always present within top n).
+print(f"Reader Top-{top_n}-Exact Match (without no_answers):", reader_eval_results["top_n_EM_text_answer"])
+# Reader Top-N-F1-Score is the average overlap between the top n predicted answers and the correct answers excluding no_answers (no_answers are always present within top n).
+print(f"Reader Top-{top_n}-F1-Score (without no_answers):", reader_eval_results["top_n_f1_text_answer"])
+```
+
+Just as a sanity check, we can compare the top-n exact_match and f1 metrics from `reader.eval()` with the exact_match and f1 from `pipeline.eval(add_isolated_node_eval=True)`.
+These two approaches return the same values because pipeline.eval() calculates top-n metrics per default. Small discrepancies might occur due to string normalization in pipeline.eval()'s answer-to-label comparison. reader.eval() does not use string normalization.
+
+
+```python
+metrics = eval_result_with_upper_bounds.calculate_metrics(eval_mode="isolated")
+print(metrics["Reader"]["exact_match"])
+print(metrics["Reader"]["f1"])
+```
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
diff --git a/docs/v1.5.0/_src/tutorials/tutorials/6.md b/docs/v1.5.0/_src/tutorials/tutorials/6.md
new file mode 100644
index 0000000000..6bbef9388d
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/tutorials/6.md
@@ -0,0 +1,248 @@
+
+
+# Better Retrieval via "Dense Passage Retrieval"
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb)
+
+### Importance of Retrievers
+
+The Retriever has a huge impact on the performance of our overall search pipeline.
+
+
+### Different types of Retrievers
+#### Sparse
+Family of algorithms based on counting the occurrences of words (bag-of-words) resulting in very sparse vectors with length = vocab size.
+
+**Examples**: BM25, TF-IDF
+
+**Pros**: Simple, fast, well explainable
+
+**Cons**: Relies on exact keyword matches between query and text
+
+
+#### Dense
+These retrievers use neural network models to create "dense" embedding vectors. Within this family there are two different approaches:
+
+a) Single encoder: Use a **single model** to embed both query and passage.
+b) Dual-encoder: Use **two models**, one to embed the query and one to embed the passage
+
+Recent work suggests that dual encoders work better, likely because they can deal better with the different nature of query and passage (length, style, syntax ...).
+
+**Examples**: REALM, DPR, Sentence-Transformers
+
+**Pros**: Captures semantic similarity instead of "word matches" (e.g. synonyms, related topics ...)
+
+**Cons**: Computationally more heavy, initial training of model
+
+
+### "Dense Passage Retrieval"
+
+In this Tutorial, we want to highlight one "Dense Dual-Encoder" called Dense Passage Retriever.
+It was introduced by Karpukhin et al. (2020, https://arxiv.org/abs/2004.04906).
+
+Original Abstract:
+
+_"Open-domain question answering relies on efficient passage retrieval to select candidate contexts, where traditional sparse vector space models, such as TF-IDF or BM25, are the de facto method. In this work, we show that retrieval can be practically implemented using dense representations alone, where embeddings are learned from a small number of questions and passages by a simple dual-encoder framework. When evaluated on a wide range of open-domain QA datasets, our dense retriever outperforms a strong Lucene-BM25 system largely by 9%-19% absolute in terms of top-20 passage retrieval accuracy, and helps our end-to-end QA system establish new state-of-the-art on multiple open-domain QA benchmarks."_
+
+Paper: https://arxiv.org/abs/2004.04906
+Original Code: https://fburl.com/qa-dpr
+
+
+*Use this* [link](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb) *to open the notebook in Google Colab.*
+
+
+### Prepare environment
+
+#### Colab: Enable the GPU runtime
+Make sure you enable the GPU runtime to experience decent speed in this tutorial.
+**Runtime -> Change Runtime type -> Hardware accelerator -> GPU**
+
+
+
+
+```python
+# Make sure you have a GPU running
+!nvidia-smi
+```
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#! pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss]
+```
+
+
+```python
+from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers
+from haystack.nodes import FARMReader, TransformersReader
+```
+
+### Document Store
+
+#### Option 1: FAISS
+
+FAISS is a library for efficient similarity search on a cluster of dense vectors.
+The `FAISSDocumentStore` uses a SQL (SQLite in-memory by default) database under the hood
+to store the document text and other meta data. The vector embeddings of the text are
+indexed on a FAISS Index that later is queried for searching answers.
+The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for
+faster search at the expense of some accuracy. Just set the faiss_index_factor_str argument in the constructor.
+For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
+
+
+```python
+from haystack.document_stores import FAISSDocumentStore
+
+document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
+```
+
+#### Option 2: Milvus
+
+Milvus is an open source database library that is also optimized for vector similarity searches like FAISS.
+Like FAISS it has both a "Flat" and "HNSW" mode but it outperforms FAISS when it comes to dynamic data management.
+It does require a little more setup, however, as it is run through Docker and requires the setup of some config files.
+See [their docs](https://milvus.io/docs/v1.0.0/milvus_docker-cpu.md) for more details.
+
+
+```python
+# Milvus cannot be run on Colab, so this cell is commented out.
+# To run Milvus you need Docker (versions below 2.0.0) or a docker-compose (versions >= 2.0.0), neither of which is available on Colab.
+# See Milvus' documentation for more details: https://milvus.io/docs/install_standalone-docker.md
+
+# !pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[milvus]
+
+# from haystack.utils import launch_milvus
+# from haystack.document_stores import MilvusDocumentStore
+
+# launch_milvus()
+# document_store = MilvusDocumentStore()
+```
+
+### Cleaning & indexing documents
+
+Similarly to the previous tutorials, we download, convert and index some Game of Thrones articles to our DocumentStore
+
+
+```python
+# Let's first get some files that we want to use
+doc_dir = "data/tutorial6"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt6.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
+
+# Convert files to dicts
+docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
+
+# Now, let's write the dicts containing documents to our DB.
+document_store.write_documents(docs)
+```
+
+### Initialize Retriever, Reader & Pipeline
+
+#### Retriever
+
+**Here:** We use a `DensePassageRetriever`
+
+**Alternatives:**
+
+- The `BM25Retriever` with custom queries (e.g. boosting) and filters
+- Use `EmbeddingRetriever` to find candidate documents based on the similarity of embeddings (e.g. created via Sentence-BERT)
+- Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging
+
+
+```python
+from haystack.nodes import DensePassageRetriever
+
+retriever = DensePassageRetriever(
+ document_store=document_store,
+ query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
+ passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
+ max_seq_len_query=64,
+ max_seq_len_passage=256,
+ batch_size=16,
+ use_gpu=True,
+ embed_title=True,
+ use_fast_tokenizers=True,
+)
+# Important:
+# Now that we have the DPR initialized, we need to call update_embeddings() to iterate over all
+# previously indexed documents and update their embedding representation.
+# While this can be a time consuming operation (depending on corpus size), it only needs to be done once.
+# At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
+document_store.update_embeddings(retriever)
+```
+
+#### Reader
+
+Similar to previous tutorials, we now initialize our reader.
+
+Here we use a FARMReader with the *deepset/roberta-base-squad2* model (see: https://huggingface.co/deepset/roberta-base-squad2)
+
+
+
+##### FARMReader
+
+
+```python
+# Load a local model or any of the QA models on
+# Hugging Face's model hub (https://huggingface.co/models)
+
+reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
+```
+
+### Pipeline
+
+With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.
+Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
+To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
+You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/components/v1.5.0/pipelines).
+
+
+```python
+from haystack.pipelines import ExtractiveQAPipeline
+
+pipe = ExtractiveQAPipeline(reader, retriever)
+```
+
+## Voilà ! Ask a question!
+
+
+```python
+# You can configure how many candidates the reader and retriever shall return
+# The higher top_k for retriever, the better (but also the slower) your answers.
+prediction = pipe.run(
+ query="Who created the Dothraki vocabulary?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
+)
+```
+
+
+```python
+print_answers(prediction, details="minimum")
+```
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
diff --git a/docs/v1.5.0/_src/tutorials/tutorials/7.md b/docs/v1.5.0/_src/tutorials/tutorials/7.md
new file mode 100644
index 0000000000..cbb4d541ee
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/tutorials/7.md
@@ -0,0 +1,189 @@
+
+
+# Generative QA with "Retrieval-Augmented Generation"
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial7_RAG_Generator.ipynb)
+
+While extractive QA highlights the span of text that answers a query,
+generative QA can return a novel text answer that it has composed.
+In this tutorial, you will learn how to set up a generative system using the
+[RAG model](https://arxiv.org/abs/2005.11401) which conditions the
+answer generator on a set of retrieved documents.
+
+### Prepare environment
+
+#### Colab: Enable the GPU runtime
+Make sure you enable the GPU runtime to experience decent speed in this tutorial.
+**Runtime -> Change Runtime type -> Hardware accelerator -> GPU**
+
+
+
+
+```python
+# Make sure you have a GPU running
+!nvidia-smi
+```
+
+Here are the packages and imports that we'll need:
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#! pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss]
+```
+
+
+```python
+from typing import List
+import requests
+import pandas as pd
+from haystack import Document
+from haystack.document_stores import FAISSDocumentStore
+from haystack.nodes import RAGenerator, DensePassageRetriever
+from haystack.utils import fetch_archive_from_http
+```
+
+Let's download a csv containing some sample text and preprocess the data.
+
+
+
+```python
+# Download sample
+doc_dir = "data/tutorial7/"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/small_generator_dataset.csv.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
+
+# Create dataframe with columns "title" and "text"
+df = pd.read_csv(f"{doc_dir}/small_generator_dataset.csv", sep=",")
+# Minimal cleaning
+df.fillna(value="", inplace=True)
+
+print(df.head())
+```
+
+We can cast our data into Haystack Document objects.
+Alternatively, we can also just use dictionaries with "text" and "meta" fields
+
+
+```python
+# Use data to initialize Document objects
+titles = list(df["title"].values)
+texts = list(df["text"].values)
+documents: List[Document] = []
+for title, text in zip(titles, texts):
+ documents.append(Document(content=text, meta={"name": title or ""}))
+```
+
+Here we initialize the FAISSDocumentStore, DensePassageRetriever and RAGenerator.
+FAISS is chosen here since it is optimized for vector storage.
+
+
+```python
+# Initialize FAISS document store.
+# Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding
+document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", return_embedding=True)
+
+# Initialize DPR Retriever to encode documents, encode question and query documents
+retriever = DensePassageRetriever(
+ document_store=document_store,
+ query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
+ passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
+ use_gpu=True,
+ embed_title=True,
+)
+
+# Initialize RAG Generator
+generator = RAGenerator(
+ model_name_or_path="facebook/rag-token-nq",
+ use_gpu=True,
+ top_k=1,
+ max_length=200,
+ min_length=2,
+ embed_title=True,
+ num_beams=2,
+)
+```
+
+We write documents to the DocumentStore, first by deleting any remaining documents then calling `write_documents()`.
+The `update_embeddings()` method uses the retriever to create an embedding for each document.
+
+
+
+```python
+# Delete existing documents in documents store
+document_store.delete_documents()
+
+# Write documents to document store
+document_store.write_documents(documents)
+
+# Add documents embeddings to index
+document_store.update_embeddings(retriever=retriever)
+```
+
+Here are our questions:
+
+
+```python
+QUESTIONS = [
+ "who got the first nobel prize in physics",
+ "when is the next deadpool movie being released",
+ "which mode is used for short wave broadcast service",
+ "who is the owner of reading football club",
+ "when is the next scandal episode coming out",
+ "when is the last time the philadelphia won the superbowl",
+ "what is the most current adobe flash player version",
+ "how many episodes are there in dragon ball z",
+ "what is the first step in the evolution of the eye",
+ "where is gall bladder situated in human body",
+ "what is the main mineral in lithium batteries",
+ "who is the president of usa right now",
+ "where do the greasers live in the outsiders",
+ "panda is a national animal of which country",
+ "what is the name of manchester united stadium",
+]
+```
+
+Now let's run our system!
+The retriever will pick out a small subset of documents that it finds relevant.
+These are used to condition the generator as it generates the answer.
+What it should return then are novel text spans that form an answer to your question!
+
+
+```python
+# Or alternatively use the Pipeline class
+from haystack.pipelines import GenerativeQAPipeline
+from haystack.utils import print_answers
+
+pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)
+for question in QUESTIONS:
+ res = pipe.run(query=question, params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}})
+ print_answers(res, details="minimum")
+```
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
diff --git a/docs/v1.5.0/_src/tutorials/tutorials/8.md b/docs/v1.5.0/_src/tutorials/tutorials/8.md
new file mode 100644
index 0000000000..3ebfd98bc5
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/tutorials/8.md
@@ -0,0 +1,209 @@
+
+
+# Preprocessing
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial8_Preprocessing.ipynb)
+
+Haystack includes a suite of tools to extract text from different file types, normalize white space
+and split text into smaller pieces to optimize retrieval.
+These data preprocessing steps can have a big impact on the system's performance, and effective handling of data is key to getting the most out of Haystack.
+
+Ultimately, Haystack expects data to be provided as a list of documents in the following dictionary format:
+``` python
+docs = [
+ {
+ 'content': DOCUMENT_TEXT_HERE,
+ 'meta': {'name': DOCUMENT_NAME, ...}
+ }, ...
+]
+```
+
+This tutorial will show you all the tools that Haystack provides to help you cast your data into this format.
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#! pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr]
+
+# For Colab/linux based machines
+!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz
+!tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin
+
+# For Macos machines
+# !wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-mac-4.03.tar.gz
+# !tar -xvf xpdf-tools-mac-4.03.tar.gz && sudo cp xpdf-tools-mac-4.03/bin64/pdftotext /usr/local/bin
+```
+
+
+```python
+# Here are the imports we need
+from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor
+from haystack.utils import convert_files_to_docs, fetch_archive_from_http
+```
+
+
+```python
+# This fetches some sample files to work with
+
+doc_dir = "data/tutorial8"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial8.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
+```
+
+## Converters
+
+Haystack's converter classes are designed to help you turn files on your computer into the documents
+that can be processed by the Haystack pipeline.
+There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika.
+The parameter `valid_languages` does not convert files to the target language, but checks if the conversion worked as expected.
+
+
+```python
+# Here are some examples of how you would use file converters
+
+converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
+doc_txt = converter.convert(file_path="data/tutorial8/classics.txt", meta=None)[0]
+
+converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
+doc_pdf = converter.convert(file_path="data/tutorial8/bert.pdf", meta=None)[0]
+
+converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en"])
+doc_docx = converter.convert(file_path="data/tutorial8/heavy_metal.docx", meta=None)[0]
+```
+
+
+```python
+# Haystack also has a convenience function that will automatically apply the right converter to each file in a directory.
+
+all_docs = convert_files_to_docs(dir_path=doc_dir)
+```
+
+## PreProcessor
+
+The PreProcessor class is designed to help you clean text and split text into sensible units.
+File splitting can have a very significant impact on the system's performance and is absolutely mandatory for Dense Passage Retrieval models.
+In general, we recommend you split the text from your files into small documents of around 100 words for dense retrieval methods
+and no more than 10,000 words for sparse methods.
+Have a look at the [Preprocessing](https://haystack.deepset.ai/components/v1.5.0/preprocessing)
+and [Optimization](https://haystack.deepset.ai/guides/v1.5.0/optimization) pages on our website for more details.
+
+
+```python
+# This is a default usage of the PreProcessor.
+# Here, it performs cleaning of consecutive whitespaces
+# and splits a single large document into smaller documents.
+# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
+# Note how the single document passed into the document gets split into 5 smaller documents
+
+preprocessor = PreProcessor(
+ clean_empty_lines=True,
+ clean_whitespace=True,
+ clean_header_footer=False,
+ split_by="word",
+ split_length=100,
+ split_respect_sentence_boundary=True,
+)
+docs_default = preprocessor.process([doc_txt])
+print(f"n_docs_input: 1\nn_docs_output: {len(docs_default)}")
+```
+
+## Cleaning
+
+- `clean_empty_lines` will normalize 3 or more consecutive empty lines to just two empty lines
+- `clean_whitespace` will remove any whitespace at the beginning or end of each line in the text
+- `clean_header_footer` will remove any long header or footer texts that are repeated on each page
+
+## Splitting
+By default, the PreProcessor will respect sentence boundaries, meaning that documents will not start or end
+midway through a sentence.
+This will help reduce the possibility of answer phrases being split between two documents.
+This feature can be turned off by setting `split_respect_sentence_boundary=False`.
+
+
+```python
+# Not respecting sentence boundary vs respecting sentence boundary
+
+preprocessor_nrsb = PreProcessor(split_respect_sentence_boundary=False)
+docs_nrsb = preprocessor_nrsb.process([doc_txt])
+
+print("RESPECTING SENTENCE BOUNDARY")
+end_text = docs_default[0].content[-50:]
+print('End of document: "...' + end_text + '"')
+print()
+print("NOT RESPECTING SENTENCE BOUNDARY")
+end_text_nrsb = docs_nrsb[0].content[-50:]
+print('End of document: "...' + end_text_nrsb + '"')
+```
+
+A commonly used strategy to split long documents, especially in the field of Question Answering,
+is the sliding window approach. If `split_length=10` and `split_overlap=3`, your documents will look like this:
+
+- doc1 = words[0:10]
+- doc2 = words[7:17]
+- doc3 = words[14:24]
+- ...
+
+You can use this strategy by following the code below.
+
+
+```python
+# Sliding window approach
+
+preprocessor_sliding_window = PreProcessor(split_overlap=3, split_length=10, split_respect_sentence_boundary=False)
+docs_sliding_window = preprocessor_sliding_window.process([doc_txt])
+
+doc1 = docs_sliding_window[0].content[:200]
+doc2 = docs_sliding_window[1].content[:100]
+doc3 = docs_sliding_window[2].content[:100]
+
+print('Document 1: "' + doc1 + '..."')
+print('Document 2: "' + doc2 + '..."')
+print('Document 3: "' + doc3 + '..."')
+```
+
+## Bringing it all together
+
+
+```python
+all_docs = convert_files_to_docs(dir_path=doc_dir)
+preprocessor = PreProcessor(
+ clean_empty_lines=True,
+ clean_whitespace=True,
+ clean_header_footer=False,
+ split_by="word",
+ split_length=100,
+ split_respect_sentence_boundary=True,
+)
+docs = preprocessor.process(all_docs)
+
+print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")
+```
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
+
diff --git a/docs/v1.5.0/_src/tutorials/tutorials/9.md b/docs/v1.5.0/_src/tutorials/tutorials/9.md
new file mode 100644
index 0000000000..9490af9dc9
--- /dev/null
+++ b/docs/v1.5.0/_src/tutorials/tutorials/9.md
@@ -0,0 +1,248 @@
+
+
+# Training Your Own "Dense Passage Retrieval" Model
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial9_DPR_training.ipynb)
+
+Haystack contains all the tools needed to train your own Dense Passage Retrieval model.
+This tutorial will guide you through the steps required to create a retriever that is specifically tailored to your domain.
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#! pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]
+```
+
+
+```python
+# Here are some imports that we'll need
+
+from haystack.nodes import DensePassageRetriever
+from haystack.utils import fetch_archive_from_http
+from haystack.document_stores import InMemoryDocumentStore
+```
+
+## Training Data
+
+DPR training is performed using Information Retrieval data.
+More specifically, you want to feed in pairs of queries and relevant documents.
+
+To train a model, we will need a dataset that has the same format as the original DPR training data.
+Each data point in the dataset should have the following dictionary structure.
+
+``` python
+ {
+ "dataset": str,
+ "question": str,
+ "answers": list of str
+ "positive_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
+ "negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
+ "hard_negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
+ }
+```
+
+`positive_ctxs` are context passages which are relevant to the query.
+In some datasets, queries might have more than one positive context
+in which case you can set the `num_positives` parameter to be higher than the default 1.
+Note that `num_positives` needs to be lower or equal to the minimum number of `positive_ctxs` for queries in your data.
+If you have an unequal number of positive contexts per example,
+you might want to generate some soft labels by retrieving similar contexts which contain the answer.
+
+DPR is standardly trained using a method known as in-batch negatives.
+This means that positive contexts for a given query are treated as negative contexts for the other queries in the batch.
+Doing so allows for a high degree of computational efficiency, thus allowing the model to be trained on large amounts of data.
+
+`negative_ctxs` is not actually used in Haystack's DPR training so we recommend you set it to an empty list.
+They were used by the original DPR authors in an experiment to compare it against the in-batch negatives method.
+
+`hard_negative_ctxs` are passages that are not relevant to the query.
+In the original DPR paper, these are fetched using a retriever to find the most relevant passages to the query.
+Passages which contain the answer text are filtered out.
+
+If you'd like to convert your SQuAD format data into something that can train a DPR model,
+check out the utility script at [`haystack/utils/squad_to_dpr.py`](https://github.com/deepset-ai/haystack/blob/master/haystack/utils/squad_to_dpr.py)
+
+## Using Question Answering Data
+
+Question Answering datasets can sometimes be used as training data.
+Google's Natural Questions dataset is sufficiently large
+and contains enough unique passages that it can be converted into a DPR training set.
+This is done simply by considering answer containing passages as relevant documents to the query.
+
+The SQuAD dataset, however, is not as suited to this use case since its question and answer pairs
+are created on only a very small slice of Wikipedia documents.
+
+## Download Original DPR Training Data
+
+WARNING: These files are large! The train set is 7.4GB and the dev set is 800MB
+
+We can download the original DPR training data with the following cell.
+Note that this data is probably only useful if you are trying to train from scratch.
+
+
+```python
+# Download original DPR data
+# WARNING: the train set is 7.4GB and the dev set is 800MB
+
+doc_dir = "data/tutorial9"
+
+s3_url_train = "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-train.json.gz"
+s3_url_dev = "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-dev.json.gz"
+
+fetch_archive_from_http(s3_url_train, output_dir=doc_dir + "/train")
+fetch_archive_from_http(s3_url_dev, output_dir=doc_dir + "/dev")
+```
+
+## Option 1: Training DPR from Scratch
+
+The default variables that we provide below are chosen to train a DPR model from scratch.
+Here, both passage and query embedding models are initialized using BERT base
+and the model is trained using Google's Natural Questions dataset (in a format specialised for DPR).
+
+If you are working in a language other than English,
+you will want to initialize the passage and query embedding models with a language model that supports your language
+and also provide a dataset in your language.
+
+
+```python
+# Here are the variables to specify our training data, the models that we use to initialize DPR
+# and the directory where we'll be saving the model
+
+train_filename = "train/biencoder-nq-train.json"
+dev_filename = "dev/biencoder-nq-dev.json"
+
+query_model = "bert-base-uncased"
+passage_model = "bert-base-uncased"
+
+save_dir = "../saved_models/dpr"
+```
+
+## Option 2: Finetuning DPR
+
+If you have your own domain specific question answering or information retrieval dataset,
+you might instead be interested in finetuning a pretrained DPR model.
+In this case, you would initialize both query and passage models using the original pretrained model.
+You will want to load something like this set of variables instead of the ones above
+
+
+```python
+# Here are the variables you might want to use instead of the set above
+# in order to perform pretraining
+
+doc_dir = "PATH_TO_YOUR_DATA_DIR"
+train_filename = "TRAIN_FILENAME"
+dev_filename = "DEV_FILENAME"
+
+query_model = "facebook/dpr-question_encoder-single-nq-base"
+passage_model = "facebook/dpr-ctx_encoder-single-nq-base"
+
+save_dir = "../saved_models/dpr"
+```
+
+## Initialization
+
+Here we want to initialize our model either with plain language model weights for training from scratch
+or else with pretrained DPR weights for finetuning.
+We follow the [original DPR parameters](https://github.com/facebookresearch/DPR#best-hyperparameter-settings)
+for their max passage length but set max query length to 64 since queries are very rarely longer.
+
+
+```python
+## Initialize DPR model
+
+retriever = DensePassageRetriever(
+ document_store=InMemoryDocumentStore(),
+ query_embedding_model=query_model,
+ passage_embedding_model=passage_model,
+ max_seq_len_query=64,
+ max_seq_len_passage=256,
+)
+```
+
+## Training
+
+Let's start training and save our trained model!
+
+On a V100 GPU, you can fit up to batch size 16 so we set gradient accumulation steps to 8 in order
+to simulate the batch size 128 of the original DPR experiment.
+
+When `embed_title=True`, the document title is prepended to the input text sequence with a `[SEP]` token
+between it and document text.
+
+When training from scratch with the above variables, 1 epoch takes around an hour and we reached the following performance:
+
+```
+loss: 0.046580662854042276
+task_name: text_similarity
+acc: 0.992524064068483
+f1: 0.8804297774366846
+acc_and_f1: 0.9364769207525838
+average_rank: 0.19631619339984652
+report:
+ precision recall f1-score support
+
+hard_negative 0.9961 0.9961 0.9961 201887
+ positive 0.8804 0.8804 0.8804 6515
+
+ accuracy 0.9925 208402
+ macro avg 0.9383 0.9383 0.9383 208402
+ weighted avg 0.9925 0.9925 0.9925 208402
+
+```
+
+
+```python
+# Start training our model and save it when it is finished
+
+retriever.train(
+ data_dir=doc_dir,
+ train_filename=train_filename,
+ dev_filename=dev_filename,
+ test_filename=dev_filename,
+ n_epochs=1,
+ batch_size=16,
+ grad_acc_steps=8,
+ save_dir=save_dir,
+ evaluate_every=3000,
+ embed_title=True,
+ num_positives=1,
+ num_hard_negatives=1,
+)
+```
+
+## Loading
+
+Loading our newly trained model is simple!
+
+
+```python
+reloaded_retriever = DensePassageRetriever.load(load_dir=save_dir, document_store=None)
+```
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
diff --git a/docs/v1.5.0/make.bat b/docs/v1.5.0/make.bat
new file mode 100644
index 0000000000..7d79440912
--- /dev/null
+++ b/docs/v1.5.0/make.bat
@@ -0,0 +1,38 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=_src/
+set BUILDDIR=build
+set SPHINXFLAGS=-a -n -A local=1
+set SPHINXOPTS=%SPHINXFLAGS% %SOURCE%
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS%
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+%SPHINXBUILD% -b %1 %ALLSPHINXOPTS% %BUILDDIR%/%1
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/haystack/json-schemas/haystack-pipeline-1.5.0.schema.json b/haystack/json-schemas/haystack-pipeline-1.5.0.schema.json
new file mode 100644
index 0000000000..4274250133
--- /dev/null
+++ b/haystack/json-schemas/haystack-pipeline-1.5.0.schema.json
@@ -0,0 +1,4514 @@
+{
+ "$schema": "http://json-schema.org/draft-07/schema",
+ "$id": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/haystack-pipeline-1.5.0.schema.json",
+ "title": "Haystack Pipeline",
+ "description": "Haystack Pipeline YAML file describing the nodes of the pipelines. For more info read the docs at: https://haystack.deepset.ai/components/pipelines#yaml-file-definitions",
+ "type": "object",
+ "properties": {
+ "version": {
+ "title": "Version",
+ "description": "Version of the Haystack Pipeline file.",
+ "type": "string",
+ "const": "1.5.0"
+ },
+ "extras": {
+ "title": "Additional properties group",
+ "description": "To be specified only if contains special pipelines (for example, if this is a Ray pipeline)",
+ "type": "string",
+ "enum": [
+ "ray"
+ ]
+ },
+ "components": {
+ "title": "Components",
+ "description": "Component nodes and their configurations, to later be used in the pipelines section. Define here all the building blocks for the pipelines.",
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "$ref": "#/definitions/DeepsetCloudDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/ElasticsearchDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/FAISSDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/GraphDBKnowledgeGraphComponent"
+ },
+ {
+ "$ref": "#/definitions/InMemoryDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/Milvus2DocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/OpenDistroElasticsearchDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/OpenSearchDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/PineconeDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/SQLDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/WeaviateDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/AzureConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/BM25RetrieverComponent"
+ },
+ {
+ "$ref": "#/definitions/CrawlerComponent"
+ },
+ {
+ "$ref": "#/definitions/DensePassageRetrieverComponent"
+ },
+ {
+ "$ref": "#/definitions/Docs2AnswersComponent"
+ },
+ {
+ "$ref": "#/definitions/DocxToTextConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/ElasticsearchFilterOnlyRetrieverComponent"
+ },
+ {
+ "$ref": "#/definitions/ElasticsearchRetrieverComponent"
+ },
+ {
+ "$ref": "#/definitions/EmbeddingRetrieverComponent"
+ },
+ {
+ "$ref": "#/definitions/EntityExtractorComponent"
+ },
+ {
+ "$ref": "#/definitions/EvalAnswersComponent"
+ },
+ {
+ "$ref": "#/definitions/EvalDocumentsComponent"
+ },
+ {
+ "$ref": "#/definitions/FARMReaderComponent"
+ },
+ {
+ "$ref": "#/definitions/FileTypeClassifierComponent"
+ },
+ {
+ "$ref": "#/definitions/FilterRetrieverComponent"
+ },
+ {
+ "$ref": "#/definitions/ImageToTextConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/JoinAnswersComponent"
+ },
+ {
+ "$ref": "#/definitions/JoinDocumentsComponent"
+ },
+ {
+ "$ref": "#/definitions/MarkdownConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/PDFToTextConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/PDFToTextOCRConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/ParsrConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/PreProcessorComponent"
+ },
+ {
+ "$ref": "#/definitions/PseudoLabelGeneratorComponent"
+ },
+ {
+ "$ref": "#/definitions/QuestionGeneratorComponent"
+ },
+ {
+ "$ref": "#/definitions/RAGeneratorComponent"
+ },
+ {
+ "$ref": "#/definitions/RCIReaderComponent"
+ },
+ {
+ "$ref": "#/definitions/RouteDocumentsComponent"
+ },
+ {
+ "$ref": "#/definitions/SentenceTransformersRankerComponent"
+ },
+ {
+ "$ref": "#/definitions/Seq2SeqGeneratorComponent"
+ },
+ {
+ "$ref": "#/definitions/SklearnQueryClassifierComponent"
+ },
+ {
+ "$ref": "#/definitions/TableReaderComponent"
+ },
+ {
+ "$ref": "#/definitions/TableTextRetrieverComponent"
+ },
+ {
+ "$ref": "#/definitions/Text2SparqlRetrieverComponent"
+ },
+ {
+ "$ref": "#/definitions/TextConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/TfidfRetrieverComponent"
+ },
+ {
+ "$ref": "#/definitions/TikaConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/TransformersDocumentClassifierComponent"
+ },
+ {
+ "$ref": "#/definitions/TransformersQueryClassifierComponent"
+ },
+ {
+ "$ref": "#/definitions/TransformersReaderComponent"
+ },
+ {
+ "$ref": "#/definitions/TransformersSummarizerComponent"
+ },
+ {
+ "$ref": "#/definitions/TransformersTranslatorComponent"
+ }
+ ]
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": true
+ },
+ "pipelines": {
+ "title": "Pipelines",
+ "description": "Multiple pipelines can be defined using the components from the same YAML file.",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Name of the pipeline.",
+ "type": "string"
+ },
+ "nodes": {
+ "title": "Nodes",
+ "description": "Nodes to be used by this particular pipeline",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "The name of this particular node in the pipeline. This should be one of the names from the components defined in the same file.",
+ "type": "string"
+ },
+ "inputs": {
+ "title": "Inputs",
+ "description": "Input parameters for this node.",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "replicas": {
+ "title": "replicas",
+ "description": "How many replicas Ray should create for this node (only for Ray pipelines)",
+ "type": "integer"
+ }
+ },
+ "required": [
+ "name",
+ "inputs"
+ ],
+ "additionalProperties": false
+ },
+ "required": [
+ "name",
+ "nodes"
+ ],
+ "additionalProperties": false
+ },
+ "additionalProperties": false
+ },
+ "additionalProperties": false
+ }
+ }
+ },
+ "required": [
+ "version",
+ "components",
+ "pipelines"
+ ],
+ "additionalProperties": false,
+ "oneOf": [
+ {
+ "not": {
+ "required": [
+ "extras"
+ ]
+ },
+ "properties": {
+ "pipelines": {
+ "title": "Pipelines",
+ "items": {
+ "properties": {
+ "nodes": {
+ "items": {
+ "not": {
+ "required": [
+ "replicas"
+ ]
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ {
+ "properties": {
+ "extras": {
+ "enum": [
+ "ray"
+ ]
+ }
+ },
+ "required": [
+ "extras"
+ ]
+ }
+ ],
+ "definitions": {
+ "DeepsetCloudDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "DeepsetCloudDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "api_key": {
+ "title": "Api Key",
+ "type": "string"
+ },
+ "workspace": {
+ "title": "Workspace",
+ "default": "default",
+ "type": "string"
+ },
+ "index": {
+ "title": "Index",
+ "type": "string"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ },
+ "api_endpoint": {
+ "title": "Api Endpoint",
+ "type": "string"
+ },
+ "similarity": {
+ "title": "Similarity",
+ "default": "dot_product",
+ "type": "string"
+ },
+ "return_embedding": {
+ "title": "Return Embedding",
+ "default": false,
+ "type": "boolean"
+ },
+ "label_index": {
+ "title": "Label Index",
+ "default": "default",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "ElasticsearchDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "ElasticsearchDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "host": {
+ "title": "Host",
+ "default": "localhost",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ ]
+ },
+ "port": {
+ "title": "Port",
+ "default": 9200,
+ "anyOf": [
+ {
+ "type": "integer"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "integer"
+ }
+ }
+ ]
+ },
+ "username": {
+ "title": "Username",
+ "default": "",
+ "type": "string"
+ },
+ "password": {
+ "title": "Password",
+ "default": "",
+ "type": "string"
+ },
+ "api_key_id": {
+ "title": "Api Key Id",
+ "type": "string"
+ },
+ "api_key": {
+ "title": "Api Key",
+ "type": "string"
+ },
+ "aws4auth": {
+ "title": "Aws4Auth"
+ },
+ "index": {
+ "title": "Index",
+ "default": "document",
+ "type": "string"
+ },
+ "label_index": {
+ "title": "Label Index",
+ "default": "label",
+ "type": "string"
+ },
+ "search_fields": {
+ "title": "Search Fields",
+ "default": "content",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {}
+ }
+ ]
+ },
+ "content_field": {
+ "title": "Content Field",
+ "default": "content",
+ "type": "string"
+ },
+ "name_field": {
+ "title": "Name Field",
+ "default": "name",
+ "type": "string"
+ },
+ "embedding_field": {
+ "title": "Embedding Field",
+ "default": "embedding",
+ "type": "string"
+ },
+ "embedding_dim": {
+ "title": "Embedding Dim",
+ "default": 768,
+ "type": "integer"
+ },
+ "custom_mapping": {
+ "title": "Custom Mapping",
+ "type": "object"
+ },
+ "excluded_meta_data": {
+ "title": "Excluded Meta Data",
+ "type": "array",
+ "items": {}
+ },
+ "analyzer": {
+ "title": "Analyzer",
+ "default": "standard",
+ "type": "string"
+ },
+ "scheme": {
+ "title": "Scheme",
+ "default": "http",
+ "type": "string"
+ },
+ "ca_certs": {
+ "title": "Ca Certs",
+ "type": "string"
+ },
+ "verify_certs": {
+ "title": "Verify Certs",
+ "default": true,
+ "type": "boolean"
+ },
+ "recreate_index": {
+ "title": "Recreate Index",
+ "default": false,
+ "type": "boolean"
+ },
+ "create_index": {
+ "title": "Create Index",
+ "default": true,
+ "type": "boolean"
+ },
+ "refresh_type": {
+ "title": "Refresh Type",
+ "default": "wait_for",
+ "type": "string"
+ },
+ "similarity": {
+ "title": "Similarity",
+ "default": "dot_product",
+ "type": "string"
+ },
+ "timeout": {
+ "title": "Timeout",
+ "default": 30,
+ "type": "integer"
+ },
+ "return_embedding": {
+ "title": "Return Embedding",
+ "default": false,
+ "type": "boolean"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ },
+ "index_type": {
+ "title": "Index Type",
+ "default": "flat",
+ "type": "string"
+ },
+ "scroll": {
+ "title": "Scroll",
+ "default": "1d",
+ "type": "string"
+ },
+ "skip_missing_embeddings": {
+ "title": "Skip Missing Embeddings",
+ "default": true,
+ "type": "boolean"
+ },
+ "synonyms": {
+ "title": "Synonyms",
+ "type": "array",
+ "items": {}
+ },
+ "synonym_type": {
+ "title": "Synonym Type",
+ "default": "synonym",
+ "type": "string"
+ },
+ "use_system_proxy": {
+ "title": "Use System Proxy",
+ "default": false,
+ "type": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "FAISSDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "FAISSDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "sql_url": {
+ "title": "Sql Url",
+ "default": "sqlite:///faiss_document_store.db",
+ "type": "string"
+ },
+ "vector_dim": {
+ "title": "Vector Dim",
+ "type": "integer"
+ },
+ "embedding_dim": {
+ "title": "Embedding Dim",
+ "default": 768,
+ "type": "integer"
+ },
+ "faiss_index_factory_str": {
+ "title": "Faiss Index Factory Str",
+ "default": "Flat",
+ "type": "string"
+ },
+ "faiss_index": {
+ "title": "Faiss Index",
+ "type": "string",
+ "default": null
+ },
+ "return_embedding": {
+ "title": "Return Embedding",
+ "default": false,
+ "type": "boolean"
+ },
+ "index": {
+ "title": "Index",
+ "default": "document",
+ "type": "string"
+ },
+ "similarity": {
+ "title": "Similarity",
+ "default": "dot_product",
+ "type": "string"
+ },
+ "embedding_field": {
+ "title": "Embedding Field",
+ "default": "embedding",
+ "type": "string"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ },
+ "faiss_index_path": {
+ "title": "Faiss Index Path",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string",
+ "format": "path"
+ }
+ ]
+ },
+ "faiss_config_path": {
+ "title": "Faiss Config Path",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string",
+ "format": "path"
+ }
+ ]
+ },
+ "isolation_level": {
+ "title": "Isolation Level",
+ "type": "string"
+ },
+ "n_links": {
+ "title": "N Links",
+ "default": 64,
+ "type": "integer"
+ },
+ "ef_search": {
+ "title": "Ef Search",
+ "default": 20,
+ "type": "integer"
+ },
+ "ef_construction": {
+ "title": "Ef Construction",
+ "default": 80,
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "GraphDBKnowledgeGraphComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "GraphDBKnowledgeGraph"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "host": {
+ "title": "Host",
+ "default": "localhost",
+ "type": "string"
+ },
+ "port": {
+ "title": "Port",
+ "default": 7200,
+ "type": "integer"
+ },
+ "username": {
+ "title": "Username",
+ "default": "",
+ "type": "string"
+ },
+ "password": {
+ "title": "Password",
+ "default": "",
+ "type": "string"
+ },
+ "index": {
+ "title": "Index",
+ "type": "string"
+ },
+ "prefixes": {
+ "title": "Prefixes",
+ "default": "",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "InMemoryDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "InMemoryDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "index": {
+ "title": "Index",
+ "default": "document",
+ "type": "string"
+ },
+ "label_index": {
+ "title": "Label Index",
+ "default": "label",
+ "type": "string"
+ },
+ "embedding_field": {
+ "title": "Embedding Field",
+ "default": "embedding",
+ "type": "string"
+ },
+ "embedding_dim": {
+ "title": "Embedding Dim",
+ "default": 768,
+ "type": "integer"
+ },
+ "return_embedding": {
+ "title": "Return Embedding",
+ "default": false,
+ "type": "boolean"
+ },
+ "similarity": {
+ "title": "Similarity",
+ "default": "dot_product",
+ "type": "string"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "scoring_batch_size": {
+ "title": "Scoring Batch Size",
+ "default": 500000,
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "Milvus2DocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "Milvus2DocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "sql_url": {
+ "title": "Sql Url",
+ "default": "sqlite:///",
+ "type": "string"
+ },
+ "host": {
+ "title": "Host",
+ "default": "localhost",
+ "type": "string"
+ },
+ "port": {
+ "title": "Port",
+ "default": "19530",
+ "type": "string"
+ },
+ "connection_pool": {
+ "title": "Connection Pool",
+ "default": "SingletonThread",
+ "type": "string"
+ },
+ "index": {
+ "title": "Index",
+ "default": "document",
+ "type": "string"
+ },
+ "vector_dim": {
+ "title": "Vector Dim",
+ "type": "integer"
+ },
+ "embedding_dim": {
+ "title": "Embedding Dim",
+ "default": 768,
+ "type": "integer"
+ },
+ "index_file_size": {
+ "title": "Index File Size",
+ "default": 1024,
+ "type": "integer"
+ },
+ "similarity": {
+ "title": "Similarity",
+ "default": "dot_product",
+ "type": "string"
+ },
+ "index_type": {
+ "title": "Index Type",
+ "default": "IVF_FLAT",
+ "type": "string"
+ },
+ "index_param": {
+ "title": "Index Param",
+ "type": "object"
+ },
+ "search_param": {
+ "title": "Search Param",
+ "type": "object"
+ },
+ "return_embedding": {
+ "title": "Return Embedding",
+ "default": false,
+ "type": "boolean"
+ },
+ "embedding_field": {
+ "title": "Embedding Field",
+ "default": "embedding",
+ "type": "string"
+ },
+ "id_field": {
+ "title": "Id Field",
+ "default": "id",
+ "type": "string"
+ },
+ "custom_fields": {
+ "title": "Custom Fields",
+ "type": "array",
+ "items": {}
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ },
+ "isolation_level": {
+ "title": "Isolation Level",
+ "type": "string"
+ },
+ "consistency_level": {
+ "title": "Consistency Level",
+ "default": 0,
+ "type": "integer"
+ },
+ "recreate_index": {
+ "title": "Recreate Index",
+ "default": false,
+ "type": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "OpenDistroElasticsearchDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "OpenDistroElasticsearchDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "scheme": {
+ "title": "Scheme",
+ "default": "https",
+ "type": "string"
+ },
+ "username": {
+ "title": "Username",
+ "default": "admin",
+ "type": "string"
+ },
+ "password": {
+ "title": "Password",
+ "default": "admin",
+ "type": "string"
+ },
+ "host": {
+ "title": "Host",
+ "default": "localhost",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ ]
+ },
+ "port": {
+ "title": "Port",
+ "default": 9200,
+ "anyOf": [
+ {
+ "type": "integer"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "integer"
+ }
+ }
+ ]
+ },
+ "api_key_id": {
+ "title": "Api Key Id",
+ "type": "string"
+ },
+ "api_key": {
+ "title": "Api Key",
+ "type": "string"
+ },
+ "aws4auth": {
+ "title": "Aws4Auth"
+ },
+ "index": {
+ "title": "Index",
+ "default": "document",
+ "type": "string"
+ },
+ "label_index": {
+ "title": "Label Index",
+ "default": "label",
+ "type": "string"
+ },
+ "search_fields": {
+ "title": "Search Fields",
+ "default": "content",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {}
+ }
+ ]
+ },
+ "content_field": {
+ "title": "Content Field",
+ "default": "content",
+ "type": "string"
+ },
+ "name_field": {
+ "title": "Name Field",
+ "default": "name",
+ "type": "string"
+ },
+ "embedding_field": {
+ "title": "Embedding Field",
+ "default": "embedding",
+ "type": "string"
+ },
+ "embedding_dim": {
+ "title": "Embedding Dim",
+ "default": 768,
+ "type": "integer"
+ },
+ "custom_mapping": {
+ "title": "Custom Mapping",
+ "type": "object"
+ },
+ "excluded_meta_data": {
+ "title": "Excluded Meta Data",
+ "type": "array",
+ "items": {}
+ },
+ "analyzer": {
+ "title": "Analyzer",
+ "default": "standard",
+ "type": "string"
+ },
+ "ca_certs": {
+ "title": "Ca Certs",
+ "type": "string"
+ },
+ "verify_certs": {
+ "title": "Verify Certs",
+ "default": false,
+ "type": "boolean"
+ },
+ "recreate_index": {
+ "title": "Recreate Index",
+ "default": false,
+ "type": "boolean"
+ },
+ "create_index": {
+ "title": "Create Index",
+ "default": true,
+ "type": "boolean"
+ },
+ "refresh_type": {
+ "title": "Refresh Type",
+ "default": "wait_for",
+ "type": "string"
+ },
+ "similarity": {
+ "title": "Similarity",
+ "default": "cosine",
+ "type": "string"
+ },
+ "timeout": {
+ "title": "Timeout",
+ "default": 30,
+ "type": "integer"
+ },
+ "return_embedding": {
+ "title": "Return Embedding",
+ "default": false,
+ "type": "boolean"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ },
+ "index_type": {
+ "title": "Index Type",
+ "default": "flat",
+ "type": "string"
+ },
+ "scroll": {
+ "title": "Scroll",
+ "default": "1d",
+ "type": "string"
+ },
+ "skip_missing_embeddings": {
+ "title": "Skip Missing Embeddings",
+ "default": true,
+ "type": "boolean"
+ },
+ "synonyms": {
+ "title": "Synonyms",
+ "type": "array",
+ "items": {}
+ },
+ "synonym_type": {
+ "title": "Synonym Type",
+ "default": "synonym",
+ "type": "string"
+ },
+ "use_system_proxy": {
+ "title": "Use System Proxy",
+ "default": false,
+ "type": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "OpenSearchDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "OpenSearchDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "scheme": {
+ "title": "Scheme",
+ "default": "https",
+ "type": "string"
+ },
+ "username": {
+ "title": "Username",
+ "default": "admin",
+ "type": "string"
+ },
+ "password": {
+ "title": "Password",
+ "default": "admin",
+ "type": "string"
+ },
+ "host": {
+ "title": "Host",
+ "default": "localhost",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ ]
+ },
+ "port": {
+ "title": "Port",
+ "default": 9200,
+ "anyOf": [
+ {
+ "type": "integer"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "integer"
+ }
+ }
+ ]
+ },
+ "api_key_id": {
+ "title": "Api Key Id",
+ "type": "string"
+ },
+ "api_key": {
+ "title": "Api Key",
+ "type": "string"
+ },
+ "aws4auth": {
+ "title": "Aws4Auth"
+ },
+ "index": {
+ "title": "Index",
+ "default": "document",
+ "type": "string"
+ },
+ "label_index": {
+ "title": "Label Index",
+ "default": "label",
+ "type": "string"
+ },
+ "search_fields": {
+ "title": "Search Fields",
+ "default": "content",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {}
+ }
+ ]
+ },
+ "content_field": {
+ "title": "Content Field",
+ "default": "content",
+ "type": "string"
+ },
+ "name_field": {
+ "title": "Name Field",
+ "default": "name",
+ "type": "string"
+ },
+ "embedding_field": {
+ "title": "Embedding Field",
+ "default": "embedding",
+ "type": "string"
+ },
+ "embedding_dim": {
+ "title": "Embedding Dim",
+ "default": 768,
+ "type": "integer"
+ },
+ "custom_mapping": {
+ "title": "Custom Mapping",
+ "type": "object"
+ },
+ "excluded_meta_data": {
+ "title": "Excluded Meta Data",
+ "type": "array",
+ "items": {}
+ },
+ "analyzer": {
+ "title": "Analyzer",
+ "default": "standard",
+ "type": "string"
+ },
+ "ca_certs": {
+ "title": "Ca Certs",
+ "type": "string"
+ },
+ "verify_certs": {
+ "title": "Verify Certs",
+ "default": false,
+ "type": "boolean"
+ },
+ "recreate_index": {
+ "title": "Recreate Index",
+ "default": false,
+ "type": "boolean"
+ },
+ "create_index": {
+ "title": "Create Index",
+ "default": true,
+ "type": "boolean"
+ },
+ "refresh_type": {
+ "title": "Refresh Type",
+ "default": "wait_for",
+ "type": "string"
+ },
+ "similarity": {
+ "title": "Similarity",
+ "default": "dot_product",
+ "type": "string"
+ },
+ "timeout": {
+ "title": "Timeout",
+ "default": 30,
+ "type": "integer"
+ },
+ "return_embedding": {
+ "title": "Return Embedding",
+ "default": false,
+ "type": "boolean"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ },
+ "index_type": {
+ "title": "Index Type",
+ "default": "flat",
+ "type": "string"
+ },
+ "scroll": {
+ "title": "Scroll",
+ "default": "1d",
+ "type": "string"
+ },
+ "skip_missing_embeddings": {
+ "title": "Skip Missing Embeddings",
+ "default": true,
+ "type": "boolean"
+ },
+ "synonyms": {
+ "title": "Synonyms",
+ "type": "array",
+ "items": {}
+ },
+ "synonym_type": {
+ "title": "Synonym Type",
+ "default": "synonym",
+ "type": "string"
+ },
+ "use_system_proxy": {
+ "title": "Use System Proxy",
+ "default": false,
+ "type": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "PineconeDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "PineconeDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "api_key": {
+ "title": "Api Key",
+ "type": "string"
+ },
+ "environment": {
+ "title": "Environment",
+ "default": "us-west1-gcp",
+ "type": "string"
+ },
+ "sql_url": {
+ "title": "Sql Url",
+ "default": "sqlite:///pinecone_document_store.db",
+ "type": "string"
+ },
+ "pinecone_index": {
+ "title": "Pinecone Index",
+ "type": "string",
+ "default": null
+ },
+ "embedding_dim": {
+ "title": "Embedding Dim",
+ "default": 768,
+ "type": "integer"
+ },
+ "return_embedding": {
+ "title": "Return Embedding",
+ "default": false,
+ "type": "boolean"
+ },
+ "index": {
+ "title": "Index",
+ "default": "document",
+ "type": "string"
+ },
+ "similarity": {
+ "title": "Similarity",
+ "default": "cosine",
+ "type": "string"
+ },
+ "replicas": {
+ "title": "Replicas",
+ "default": 1,
+ "type": "integer"
+ },
+ "shards": {
+ "title": "Shards",
+ "default": 1,
+ "type": "integer"
+ },
+ "embedding_field": {
+ "title": "Embedding Field",
+ "default": "embedding",
+ "type": "string"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ },
+ "recreate_index": {
+ "title": "Recreate Index",
+ "default": false,
+ "type": "boolean"
+ }
+ },
+ "required": [
+ "api_key"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "SQLDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "SQLDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "url": {
+ "title": "Url",
+ "default": "sqlite://",
+ "type": "string"
+ },
+ "index": {
+ "title": "Index",
+ "default": "document",
+ "type": "string"
+ },
+ "label_index": {
+ "title": "Label Index",
+ "default": "label",
+ "type": "string"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ },
+ "check_same_thread": {
+ "title": "Check Same Thread",
+ "default": false,
+ "type": "boolean"
+ },
+ "isolation_level": {
+ "title": "Isolation Level",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "WeaviateDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "WeaviateDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "host": {
+ "title": "Host",
+ "default": "http://localhost",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ ]
+ },
+ "port": {
+ "title": "Port",
+ "default": 8080,
+ "anyOf": [
+ {
+ "type": "integer"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "integer"
+ }
+ }
+ ]
+ },
+ "timeout_config": {
+ "title": "Timeout Config",
+ "default": [
+ 5,
+ 15
+ ],
+ "type": "array",
+ "items": {}
+ },
+ "username": {
+ "title": "Username",
+ "type": "string"
+ },
+ "password": {
+ "title": "Password",
+ "type": "string"
+ },
+ "index": {
+ "title": "Index",
+ "default": "Document",
+ "type": "string"
+ },
+ "embedding_dim": {
+ "title": "Embedding Dim",
+ "default": 768,
+ "type": "integer"
+ },
+ "content_field": {
+ "title": "Content Field",
+ "default": "content",
+ "type": "string"
+ },
+ "name_field": {
+ "title": "Name Field",
+ "default": "name",
+ "type": "string"
+ },
+ "similarity": {
+ "title": "Similarity",
+ "default": "cosine",
+ "type": "string"
+ },
+ "index_type": {
+ "title": "Index Type",
+ "default": "hnsw",
+ "type": "string"
+ },
+ "custom_schema": {
+ "title": "Custom Schema",
+ "type": "object"
+ },
+ "return_embedding": {
+ "title": "Return Embedding",
+ "default": false,
+ "type": "boolean"
+ },
+ "embedding_field": {
+ "title": "Embedding Field",
+ "default": "embedding",
+ "type": "string"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ },
+ "recreate_index": {
+ "title": "Recreate Index",
+ "default": false,
+ "type": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "AzureConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "AzureConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "endpoint": {
+ "title": "Endpoint",
+ "type": "string"
+ },
+ "credential_key": {
+ "title": "Credential Key",
+ "type": "string"
+ },
+ "model_id": {
+ "title": "Model Id",
+ "default": "prebuilt-document",
+ "type": "string"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "save_json": {
+ "title": "Save Json",
+ "default": false,
+ "type": "boolean"
+ },
+ "preceding_context_len": {
+ "title": "Preceding Context Len",
+ "default": 3,
+ "type": "integer"
+ },
+ "following_context_len": {
+ "title": "Following Context Len",
+ "default": 3,
+ "type": "integer"
+ },
+ "merge_multiple_column_headers": {
+ "title": "Merge Multiple Column Headers",
+ "default": true,
+ "type": "boolean"
+ },
+ "id_hash_keys": {
+ "title": "Id Hash Keys",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "required": [
+ "endpoint",
+ "credential_key"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "BM25RetrieverComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "BM25Retriever"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "document_store": {
+ "title": "Document Store",
+ "type": "string"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "all_terms_must_match": {
+ "title": "All Terms Must Match",
+ "default": false,
+ "type": "boolean"
+ },
+ "custom_query": {
+ "title": "Custom Query",
+ "type": "string"
+ },
+ "scale_score": {
+ "title": "Scale Score",
+ "default": true,
+ "type": "boolean"
+ }
+ },
+ "required": [
+ "document_store"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "CrawlerComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "Crawler"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "output_dir": {
+ "title": "Output Dir",
+ "type": "string"
+ },
+ "urls": {
+ "title": "Urls",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "crawler_depth": {
+ "title": "Crawler Depth",
+ "default": 1,
+ "type": "integer"
+ },
+ "filter_urls": {
+ "title": "Filter Urls",
+ "type": "array",
+ "items": {}
+ },
+ "overwrite_existing_files": {
+ "title": "Overwrite Existing Files",
+ "default": true
+ },
+ "id_hash_keys": {
+ "title": "Id Hash Keys",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "required": [
+ "output_dir"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "DensePassageRetrieverComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "DensePassageRetriever"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "document_store": {
+ "title": "Document Store",
+ "type": "string"
+ },
+ "query_embedding_model": {
+ "title": "Query Embedding Model",
+ "default": "facebook/dpr-question_encoder-single-nq-base",
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "path"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "passage_embedding_model": {
+ "title": "Passage Embedding Model",
+ "default": "facebook/dpr-ctx_encoder-single-nq-base",
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "path"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "max_seq_len_query": {
+ "title": "Max Seq Len Query",
+ "default": 64,
+ "type": "integer"
+ },
+ "max_seq_len_passage": {
+ "title": "Max Seq Len Passage",
+ "default": 256,
+ "type": "integer"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "default": 16,
+ "type": "integer"
+ },
+ "embed_title": {
+ "title": "Embed Title",
+ "default": true,
+ "type": "boolean"
+ },
+ "use_fast_tokenizers": {
+ "title": "Use Fast Tokenizers",
+ "default": true,
+ "type": "boolean"
+ },
+ "infer_tokenizer_classes": {
+ "title": "Infer Tokenizer Classes",
+ "default": false,
+ "type": "boolean"
+ },
+ "similarity_function": {
+ "title": "Similarity Function",
+ "default": "dot_product",
+ "type": "string"
+ },
+ "global_loss_buffer_size": {
+ "title": "Global Loss Buffer Size",
+ "default": 150000,
+ "type": "integer"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "devices": {
+ "title": "Devices",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "use_auth_token": {
+ "title": "Use Auth Token",
+ "anyOf": [
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "scale_score": {
+ "title": "Scale Score",
+ "default": true,
+ "type": "boolean"
+ }
+ },
+ "required": [
+ "document_store"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "Docs2AnswersComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "Docs2Answers"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {},
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "DocxToTextConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "DocxToTextConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables",
+ "default": false,
+ "type": "boolean"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "id_hash_keys": {
+ "title": "Id Hash Keys",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "ElasticsearchFilterOnlyRetrieverComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "ElasticsearchFilterOnlyRetriever"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "document_store": {
+ "title": "Document Store",
+ "type": "string"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "all_terms_must_match": {
+ "title": "All Terms Must Match",
+ "default": false,
+ "type": "boolean"
+ },
+ "custom_query": {
+ "title": "Custom Query",
+ "type": "string"
+ }
+ },
+ "required": [
+ "document_store"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "ElasticsearchRetrieverComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "ElasticsearchRetriever"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "document_store": {
+ "title": "Document Store",
+ "type": "string"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "all_terms_must_match": {
+ "title": "All Terms Must Match",
+ "default": false,
+ "type": "boolean"
+ },
+ "custom_query": {
+ "title": "Custom Query",
+ "type": "string"
+ }
+ },
+ "required": [
+ "document_store"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "EmbeddingRetrieverComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "EmbeddingRetriever"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "document_store": {
+ "title": "Document Store",
+ "type": "string"
+ },
+ "embedding_model": {
+ "title": "Embedding Model",
+ "type": "string"
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "default": 32,
+ "type": "integer"
+ },
+ "max_seq_len": {
+ "title": "Max Seq Len",
+ "default": 512,
+ "type": "integer"
+ },
+ "model_format": {
+ "title": "Model Format",
+ "type": "string"
+ },
+ "pooling_strategy": {
+ "title": "Pooling Strategy",
+ "default": "reduce_mean",
+ "type": "string"
+ },
+ "emb_extraction_layer": {
+ "title": "Emb Extraction Layer",
+ "default": -1,
+ "type": "integer"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "devices": {
+ "title": "Devices",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "use_auth_token": {
+ "title": "Use Auth Token",
+ "anyOf": [
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "scale_score": {
+ "title": "Scale Score",
+ "default": true,
+ "type": "boolean"
+ },
+ "embed_meta_fields": {
+ "title": "Embed Meta Fields",
+ "default": [],
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "required": [
+ "document_store",
+ "embedding_model"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "EntityExtractorComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "EntityExtractor"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "dslim/bert-base-NER",
+ "type": "string"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "EvalAnswersComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "EvalAnswers"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "skip_incorrect_retrieval": {
+ "title": "Skip Incorrect Retrieval",
+ "default": true,
+ "type": "boolean"
+ },
+ "open_domain": {
+ "title": "Open Domain",
+ "default": true,
+ "type": "boolean"
+ },
+ "sas_model": {
+ "title": "Sas Model",
+ "type": "string"
+ },
+ "debug": {
+ "title": "Debug",
+ "default": false,
+ "type": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "EvalDocumentsComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "EvalDocuments"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "debug": {
+ "title": "Debug",
+ "default": false,
+ "type": "boolean"
+ },
+ "open_domain": {
+ "title": "Open Domain",
+ "default": true,
+ "type": "boolean"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "FARMReaderComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "FARMReader"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "type": "string"
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "context_window_size": {
+ "title": "Context Window Size",
+ "default": 150,
+ "type": "integer"
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "default": 50,
+ "type": "integer"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "devices": {
+ "title": "Devices",
+ "default": [],
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "no_ans_boost": {
+ "title": "No Ans Boost",
+ "default": 0.0,
+ "type": "number"
+ },
+ "return_no_answer": {
+ "title": "Return No Answer",
+ "default": false,
+ "type": "boolean"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "top_k_per_candidate": {
+ "title": "Top K Per Candidate",
+ "default": 3,
+ "type": "integer"
+ },
+ "top_k_per_sample": {
+ "title": "Top K Per Sample",
+ "default": 1,
+ "type": "integer"
+ },
+ "num_processes": {
+ "title": "Num Processes",
+ "type": "integer"
+ },
+ "max_seq_len": {
+ "title": "Max Seq Len",
+ "default": 256,
+ "type": "integer"
+ },
+ "doc_stride": {
+ "title": "Doc Stride",
+ "default": 128,
+ "type": "integer"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "duplicate_filtering": {
+ "title": "Duplicate Filtering",
+ "default": 0,
+ "type": "integer"
+ },
+ "use_confidence_scores": {
+ "title": "Use Confidence Scores",
+ "default": true,
+ "type": "boolean"
+ },
+ "confidence_threshold": {
+ "title": "Confidence Threshold",
+ "type": "number"
+ },
+ "proxies": {
+ "title": "Proxies",
+ "type": "object",
+ "additionalProperties": {
+ "type": "string"
+ }
+ },
+ "local_files_only": {
+ "title": "Local Files Only",
+ "default": false
+ },
+ "force_download": {
+ "title": "Force Download",
+ "default": false
+ },
+ "use_auth_token": {
+ "title": "Use Auth Token",
+ "anyOf": [
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ "required": [
+ "model_name_or_path"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "FileTypeClassifierComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "FileTypeClassifier"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "supported_types": {
+ "title": "Supported Types",
+ "default": [
+ "txt",
+ "pdf",
+ "md",
+ "docx",
+ "html"
+ ],
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "FilterRetrieverComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "FilterRetriever"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "document_store": {
+ "title": "Document Store",
+ "type": "string"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "all_terms_must_match": {
+ "title": "All Terms Must Match",
+ "default": false,
+ "type": "boolean"
+ },
+ "custom_query": {
+ "title": "Custom Query",
+ "type": "string"
+ },
+ "scale_score": {
+ "title": "Scale Score",
+ "default": true,
+ "type": "boolean"
+ }
+ },
+ "required": [
+ "document_store"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "ImageToTextConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "ImageToTextConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables",
+ "default": false,
+ "type": "boolean"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "default": [
+ "eng"
+ ],
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "id_hash_keys": {
+ "title": "Id Hash Keys",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "JoinAnswersComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "JoinAnswers"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "join_mode": {
+ "title": "Join Mode",
+ "default": "concatenate",
+ "type": "string"
+ },
+ "weights": {
+ "title": "Weights",
+ "type": "array",
+ "items": {
+ "type": "number"
+ }
+ },
+ "top_k_join": {
+ "title": "Top K Join",
+ "type": "integer"
+ },
+ "sort_by_score": {
+ "title": "Sort By Score",
+ "default": true,
+ "type": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "JoinDocumentsComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "JoinDocuments"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "join_mode": {
+ "title": "Join Mode",
+ "default": "concatenate",
+ "type": "string"
+ },
+ "weights": {
+ "title": "Weights",
+ "type": "array",
+ "items": {
+ "type": "number"
+ }
+ },
+ "top_k_join": {
+ "title": "Top K Join",
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "MarkdownConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "MarkdownConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables",
+ "default": false,
+ "type": "boolean"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "id_hash_keys": {
+ "title": "Id Hash Keys",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "PDFToTextConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "PDFToTextConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables",
+ "default": false,
+ "type": "boolean"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "id_hash_keys": {
+ "title": "Id Hash Keys",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "encoding": {
+ "title": "Encoding",
+ "default": "UTF-8",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "PDFToTextOCRConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "PDFToTextOCRConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables",
+ "default": false,
+ "type": "boolean"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "default": [
+ "eng"
+ ],
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "id_hash_keys": {
+ "title": "Id Hash Keys",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "ParsrConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "ParsrConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "parsr_url": {
+ "title": "Parsr Url",
+ "default": "http://localhost:3001",
+ "type": "string"
+ },
+ "extractor": {
+ "title": "Extractor",
+ "default": "pdfminer",
+ "enum": [
+ "pdfminer",
+ "pdfjs"
+ ],
+ "type": "string"
+ },
+ "table_detection_mode": {
+ "title": "Table Detection Mode",
+ "default": "lattice",
+ "enum": [
+ "lattice",
+ "stream"
+ ],
+ "type": "string"
+ },
+ "preceding_context_len": {
+ "title": "Preceding Context Len",
+ "default": 3,
+ "type": "integer"
+ },
+ "following_context_len": {
+ "title": "Following Context Len",
+ "default": 3,
+ "type": "integer"
+ },
+ "remove_page_headers": {
+ "title": "Remove Page Headers",
+ "default": false,
+ "type": "boolean"
+ },
+ "remove_page_footers": {
+ "title": "Remove Page Footers",
+ "default": false,
+ "type": "boolean"
+ },
+ "remove_table_of_contents": {
+ "title": "Remove Table Of Contents",
+ "default": false,
+ "type": "boolean"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "id_hash_keys": {
+ "title": "Id Hash Keys",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "PreProcessorComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "PreProcessor"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "clean_whitespace": {
+ "title": "Clean Whitespace",
+ "default": true,
+ "type": "boolean"
+ },
+ "clean_header_footer": {
+ "title": "Clean Header Footer",
+ "default": false,
+ "type": "boolean"
+ },
+ "clean_empty_lines": {
+ "title": "Clean Empty Lines",
+ "default": true,
+ "type": "boolean"
+ },
+ "remove_substrings": {
+ "title": "Remove Substrings",
+ "default": [],
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "split_by": {
+ "title": "Split By",
+ "default": "word",
+ "type": "string"
+ },
+ "split_length": {
+ "title": "Split Length",
+ "default": 200,
+ "type": "integer"
+ },
+ "split_overlap": {
+ "title": "Split Overlap",
+ "default": 0,
+ "type": "integer"
+ },
+ "split_respect_sentence_boundary": {
+ "title": "Split Respect Sentence Boundary",
+ "default": true,
+ "type": "boolean"
+ },
+ "language": {
+ "title": "Language",
+ "default": "en",
+ "type": "string"
+ },
+ "id_hash_keys": {
+ "title": "Id Hash Keys",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "PseudoLabelGeneratorComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "PseudoLabelGenerator"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "question_producer": {
+ "title": "Question Producer",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "type": "string"
+ }
+ }
+ }
+ ]
+ },
+ "retriever": {
+ "title": "Retriever",
+ "type": "string"
+ },
+ "cross_encoder_model_name_or_path": {
+ "title": "Cross Encoder Model Name Or Path",
+ "default": "cross-encoder/ms-marco-MiniLM-L-6-v2",
+ "type": "string"
+ },
+ "max_questions_per_document": {
+ "title": "Max Questions Per Document",
+ "default": 3,
+ "type": "integer"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 50,
+ "type": "integer"
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "default": 4,
+ "type": "integer"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ }
+ },
+ "required": [
+ "question_producer",
+ "retriever"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "QuestionGeneratorComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "QuestionGenerator"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "valhalla/t5-base-e2e-qg"
+ },
+ "model_version": {
+ "title": "Model Version"
+ },
+ "num_beams": {
+ "title": "Num Beams",
+ "default": 4
+ },
+ "max_length": {
+ "title": "Max Length",
+ "default": 256
+ },
+ "no_repeat_ngram_size": {
+ "title": "No Repeat Ngram Size",
+ "default": 3
+ },
+ "length_penalty": {
+ "title": "Length Penalty",
+ "default": 1.5
+ },
+ "early_stopping": {
+ "title": "Early Stopping",
+ "default": true
+ },
+ "split_length": {
+ "title": "Split Length",
+ "default": 50
+ },
+ "split_overlap": {
+ "title": "Split Overlap",
+ "default": 10
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true
+ },
+ "prompt": {
+ "title": "Prompt",
+ "default": "generate questions:"
+ },
+ "num_queries_per_doc": {
+ "title": "Num Queries Per Doc",
+ "default": 1
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "RAGeneratorComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "RAGenerator"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "facebook/rag-token-nq",
+ "type": "string"
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "retriever": {
+ "title": "Retriever",
+ "type": "string",
+ "default": null
+ },
+ "generator_type": {
+ "title": "Generator Type",
+ "default": "token",
+ "type": "string"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 2,
+ "type": "integer"
+ },
+ "max_length": {
+ "title": "Max Length",
+ "default": 200,
+ "type": "integer"
+ },
+ "min_length": {
+ "title": "Min Length",
+ "default": 2,
+ "type": "integer"
+ },
+ "num_beams": {
+ "title": "Num Beams",
+ "default": 2,
+ "type": "integer"
+ },
+ "embed_title": {
+ "title": "Embed Title",
+ "default": true,
+ "type": "boolean"
+ },
+ "prefix": {
+ "title": "Prefix",
+ "type": "string"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "RCIReaderComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "RCIReader"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "row_model_name_or_path": {
+ "title": "Row Model Name Or Path",
+ "default": "michaelrglass/albert-base-rci-wikisql-row",
+ "type": "string"
+ },
+ "column_model_name_or_path": {
+ "title": "Column Model Name Or Path",
+ "default": "michaelrglass/albert-base-rci-wikisql-col",
+ "type": "string"
+ },
+ "row_model_version": {
+ "title": "Row Model Version",
+ "type": "string"
+ },
+ "column_model_version": {
+ "title": "Column Model Version",
+ "type": "string"
+ },
+ "row_tokenizer": {
+ "title": "Row Tokenizer",
+ "type": "string"
+ },
+ "column_tokenizer": {
+ "title": "Column Tokenizer",
+ "type": "string"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "max_seq_len": {
+ "title": "Max Seq Len",
+ "default": 256,
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "RouteDocumentsComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "RouteDocuments"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "split_by": {
+ "title": "Split By",
+ "default": "content_type",
+ "type": "string"
+ },
+ "metadata_values": {
+ "title": "Metadata Values",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "SentenceTransformersRankerComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "SentenceTransformersRanker"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string",
+ "format": "path"
+ }
+ ]
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "devices": {
+ "title": "Devices",
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "type": "integer"
+ }
+ },
+ "required": [
+ "model_name_or_path"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "Seq2SeqGeneratorComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "Seq2SeqGenerator"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "type": "string"
+ },
+ "input_converter": {
+ "title": "Input Converter",
+ "type": "string",
+ "default": null
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 1,
+ "type": "integer"
+ },
+ "max_length": {
+ "title": "Max Length",
+ "default": 200,
+ "type": "integer"
+ },
+ "min_length": {
+ "title": "Min Length",
+ "default": 2,
+ "type": "integer"
+ },
+ "num_beams": {
+ "title": "Num Beams",
+ "default": 8,
+ "type": "integer"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ }
+ },
+ "required": [
+ "model_name_or_path"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "SklearnQueryClassifierComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "SklearnQueryClassifier"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {}
+ ]
+ },
+ "vectorizer_name_or_path": {
+ "title": "Vectorizer Name Or Path",
+ "default": "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {}
+ ]
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TableReaderComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TableReader"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "google/tapas-base-finetuned-wtq",
+ "type": "string"
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "tokenizer": {
+ "title": "Tokenizer",
+ "type": "string"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "top_k_per_candidate": {
+ "title": "Top K Per Candidate",
+ "default": 3,
+ "type": "integer"
+ },
+ "return_no_answer": {
+ "title": "Return No Answer",
+ "default": false,
+ "type": "boolean"
+ },
+ "max_seq_len": {
+ "title": "Max Seq Len",
+ "default": 256,
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TableTextRetrieverComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TableTextRetriever"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "document_store": {
+ "title": "Document Store",
+ "type": "string"
+ },
+ "query_embedding_model": {
+ "title": "Query Embedding Model",
+ "default": "deepset/bert-small-mm_retrieval-question_encoder",
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "path"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "passage_embedding_model": {
+ "title": "Passage Embedding Model",
+ "default": "deepset/bert-small-mm_retrieval-passage_encoder",
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "path"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "table_embedding_model": {
+ "title": "Table Embedding Model",
+ "default": "deepset/bert-small-mm_retrieval-table_encoder",
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "path"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "max_seq_len_query": {
+ "title": "Max Seq Len Query",
+ "default": 64,
+ "type": "integer"
+ },
+ "max_seq_len_passage": {
+ "title": "Max Seq Len Passage",
+ "default": 256,
+ "type": "integer"
+ },
+ "max_seq_len_table": {
+ "title": "Max Seq Len Table",
+ "default": 256,
+ "type": "integer"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "default": 16,
+ "type": "integer"
+ },
+ "embed_meta_fields": {
+ "title": "Embed Meta Fields",
+ "default": [
+ "name",
+ "section_title",
+ "caption"
+ ],
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "use_fast_tokenizers": {
+ "title": "Use Fast Tokenizers",
+ "default": true,
+ "type": "boolean"
+ },
+ "infer_tokenizer_classes": {
+ "title": "Infer Tokenizer Classes",
+ "default": false,
+ "type": "boolean"
+ },
+ "similarity_function": {
+ "title": "Similarity Function",
+ "default": "dot_product",
+ "type": "string"
+ },
+ "global_loss_buffer_size": {
+ "title": "Global Loss Buffer Size",
+ "default": 150000,
+ "type": "integer"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "devices": {
+ "title": "Devices",
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ "use_auth_token": {
+ "title": "Use Auth Token",
+ "anyOf": [
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "scale_score": {
+ "title": "Scale Score",
+ "default": true,
+ "type": "boolean"
+ }
+ },
+ "required": [
+ "document_store"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "Text2SparqlRetrieverComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "Text2SparqlRetriever"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "knowledge_graph": {
+ "title": "Knowledge Graph"
+ },
+ "model_name_or_path": {
+ "title": "Model Name Or Path"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 1,
+ "type": "integer"
+ }
+ },
+ "required": [
+ "knowledge_graph",
+ "model_name_or_path"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TextConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TextConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables",
+ "default": false,
+ "type": "boolean"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "id_hash_keys": {
+ "title": "Id Hash Keys",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TfidfRetrieverComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TfidfRetriever"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "document_store": {
+ "title": "Document Store",
+ "type": "string"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "auto_fit": {
+ "title": "Auto Fit",
+ "default": true
+ }
+ },
+ "required": [
+ "document_store"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TikaConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TikaConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "tika_url": {
+ "title": "Tika Url",
+ "default": "http://localhost:9998/tika",
+ "type": "string"
+ },
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables",
+ "default": false,
+ "type": "boolean"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "id_hash_keys": {
+ "title": "Id Hash Keys",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TransformersDocumentClassifierComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TransformersDocumentClassifier"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "bhadresh-savani/distilbert-base-uncased-emotion",
+ "type": "string"
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "tokenizer": {
+ "title": "Tokenizer",
+ "type": "string"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "return_all_scores": {
+ "title": "Return All Scores",
+ "default": false,
+ "type": "boolean"
+ },
+ "task": {
+ "title": "Task",
+ "default": "text-classification",
+ "type": "string"
+ },
+ "labels": {
+ "title": "Labels",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "type": "integer"
+ },
+ "classification_field": {
+ "title": "Classification Field",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TransformersQueryClassifierComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TransformersQueryClassifier"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "shahrukhx01/bert-mini-finetune-question-detection",
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "path"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TransformersReaderComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TransformersReader"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "distilbert-base-uncased-distilled-squad",
+ "type": "string"
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "tokenizer": {
+ "title": "Tokenizer",
+ "type": "string"
+ },
+ "context_window_size": {
+ "title": "Context Window Size",
+ "default": 70,
+ "type": "integer"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "top_k_per_candidate": {
+ "title": "Top K Per Candidate",
+ "default": 3,
+ "type": "integer"
+ },
+ "return_no_answers": {
+ "title": "Return No Answers",
+ "default": false,
+ "type": "boolean"
+ },
+ "max_seq_len": {
+ "title": "Max Seq Len",
+ "default": 256,
+ "type": "integer"
+ },
+ "doc_stride": {
+ "title": "Doc Stride",
+ "default": 128,
+ "type": "integer"
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TransformersSummarizerComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TransformersSummarizer"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "google/pegasus-xsum",
+ "type": "string"
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "tokenizer": {
+ "title": "Tokenizer",
+ "type": "string"
+ },
+ "max_length": {
+ "title": "Max Length",
+ "default": 200,
+ "type": "integer"
+ },
+ "min_length": {
+ "title": "Min Length",
+ "default": 5,
+ "type": "integer"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "clean_up_tokenization_spaces": {
+ "title": "Clean Up Tokenization Spaces",
+ "default": true,
+ "type": "boolean"
+ },
+ "separator_for_single_summary": {
+ "title": "Separator For Single Summary",
+ "default": " ",
+ "type": "string"
+ },
+ "generate_single_summary": {
+ "title": "Generate Single Summary",
+ "default": false,
+ "type": "boolean"
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TransformersTranslatorComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TransformersTranslator"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "type": "string"
+ },
+ "tokenizer_name": {
+ "title": "Tokenizer Name",
+ "type": "string"
+ },
+ "max_seq_len": {
+ "title": "Max Seq Len",
+ "type": "integer"
+ },
+ "clean_up_tokenization_spaces": {
+ "title": "Clean Up Tokenization Spaces",
+ "default": true,
+ "type": "boolean"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ }
+ },
+ "required": [
+ "model_name_or_path"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ }
+ }
+}
\ No newline at end of file
diff --git a/haystack/json-schemas/haystack-pipeline.schema.json b/haystack/json-schemas/haystack-pipeline.schema.json
index 920506344b..dfdcf2b8c1 100644
--- a/haystack/json-schemas/haystack-pipeline.schema.json
+++ b/haystack/json-schemas/haystack-pipeline.schema.json
@@ -116,6 +116,34 @@
"$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/haystack-pipeline-1.4.0.schema.json"
}
]
+ },
+ {
+ "allOf": [
+ {
+ "properties": {
+ "version": {
+ "const": "1.5.0"
+ }
+ }
+ },
+ {
+ "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/haystack-pipeline-1.5.0.schema.json"
+ }
+ ]
+ },
+ {
+ "allOf": [
+ {
+ "properties": {
+ "version": {
+ "const": "1.5.1rc0"
+ }
+ }
+ },
+ {
+ "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/haystack-pipeline-1.5.0.schema.json"
+ }
+ ]
}
]
}
\ No newline at end of file