Change return types of indexing pipeline nodes (#2342)
* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Document in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
bogdankostic and github-actions[bot] authored Mar 29, 2022
1 parent a73717b commit 834f8c4
Showing 66 changed files with 875 additions and 415 deletions.
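In practical terms, the changed return types mean that the indexing nodes now exchange `Document` objects instead of dicts, and `id_hash_keys` is supplied to the nodes rather than read from the `Document` primitive. The sketch below illustrates this under the assumption of the Haystack 1.x API from around this release; the file path, metadata values, and parameter choices are placeholders, not part of the diff.

```python
from haystack.nodes import PreProcessor, TextConverter
from haystack.document_stores import InMemoryDocumentStore

# After this change, converters return a List[Document] instead of a list of dicts.
# "data/sample.txt" and the meta values are placeholders for illustration only.
converter = TextConverter()
docs = converter.convert(file_path="data/sample.txt", meta={"source": "sample"})

# The PreProcessor consumes and produces Document objects as well; id_hash_keys is
# passed to the node, so document ids are derived from the content plus the metadata.
preprocessor = PreProcessor(split_by="word", split_length=200, id_hash_keys=["content", "meta"])
processed_docs = preprocessor.process(docs)

# DocumentStores accept Document objects directly when used as indexing nodes.
document_store = InMemoryDocumentStore()
document_store.write_documents(processed_docs)
```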
2 changes: 0 additions & 2 deletions .github/workflows/linux_ci.yml
@@ -298,8 +298,6 @@ jobs:
pip install ui/
- name: Run tests
env:
PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
run: pytest -s ${{ matrix.test-path }}


18 changes: 15 additions & 3 deletions docs/_src/api/api/crawler.md
@@ -27,7 +27,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
#### \_\_init\_\_

```python
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True)
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None)
```

Init object with basic params for crawling (can be overwritten later).
@@ -42,13 +42,17 @@ Init object with basic params for crawling (can be overwritten later).
- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
All URLs not matching at least one of the regular expressions will be dropped.
- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass a list of attribute names such as [`"content"`, `"meta"`]
to this field. In this case the id is generated from the content and the defined metadata.

<a id="crawler.Crawler.crawl"></a>

#### crawl

```python
def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None) -> List[Path]
def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None) -> List[Path]
```

Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -68,6 +72,10 @@ If no parameters are provided to this method, the instance attributes that were
- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
All URLs not matching at least one of the regular expressions will be dropped.
- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass a list of attribute names such as [`"content"`, `"meta"`]
to this field. In this case the id is generated from the content and the defined metadata.

**Returns**:

@@ -78,7 +86,7 @@ List of paths where the crawled webpages got stored
#### run

```python
def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False) -> Tuple[Dict, str]
def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None) -> Tuple[Dict, str]
```

Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -94,6 +102,10 @@ Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
All URLs not matching at least one of the regular expressions will be dropped.
- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
- `return_documents`: Whether to return the content of the crawled JSON files instead of their paths
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass a list of attribute names such as [`"content"`, `"meta"`]
to this field. In this case the id is generated from the content and the defined metadata.

**Returns**:

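As an illustration of the extended crawler signatures above, the sketch below passes `id_hash_keys` when the node is created and uses the `crawl()` and `run()` methods documented in this diff. The URLs and output directory are placeholders, and the behaviour of `return_documents` is assumed from the parameter description rather than spelled out in the diff.

```python
from haystack.nodes import Crawler

# Requires Selenium and a Chrome WebDriver at runtime; URLs and paths are placeholders.
crawler = Crawler(output_dir="crawled_files", crawler_depth=1, id_hash_keys=["content", "meta"])

# crawl() still returns the paths of the stored JSON files; id_hash_keys can also be
# overridden per call.
paths = crawler.crawl(urls=["https://haystack.deepset.ai/overview/intro"])

# As a pipeline node, run() returns a (results, edge-name) tuple; with
# return_documents=True the results are expected to carry the crawled documents
# instead of the file paths.
results, _ = crawler.run(urls=["https://haystack.deepset.ai/overview/intro"], return_documents=True)
```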
14 changes: 11 additions & 3 deletions docs/_src/api/api/document_store.md
@@ -272,7 +272,7 @@ None
#### run

```python
def run(documents: List[dict], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, id_hash_keys: Optional[List[str]] = None)
def run(documents: List[Union[dict, Document]], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, id_hash_keys: Optional[List[str]] = None)
```

Run requests against document stores
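A minimal sketch of the relaxed `run` signature shown above, which now accepts `Document` objects as well as dicts. `InMemoryDocumentStore` stands in for any store here and is an assumption for illustration, not part of the diff.

```python
from haystack.schema import Document
from haystack.document_stores import InMemoryDocumentStore

document_store = InMemoryDocumentStore()

# run() is normally invoked by an indexing pipeline, but it can be called directly;
# the documents list may now mix Document objects and plain dicts.
document_store.run(
    documents=[
        Document(content="Indexing nodes now pass Document objects between each other."),
        {"content": "Plain dicts remain supported for backwards compatibility."},
    ]
)
```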
@@ -4669,7 +4669,7 @@ and filter_utils.py.
#### open\_search\_index\_to\_document\_store

```python
def open_search_index_to_document_store(document_store: "BaseDocumentStore", original_index_name: str, original_content_field: str, original_name_field: Optional[str] = None, included_metadata_fields: Optional[List[str]] = None, excluded_metadata_fields: Optional[List[str]] = None, store_original_ids: bool = True, index: Optional[str] = None, preprocessor: Optional[PreProcessor] = None, batch_size: int = 10_000, host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "admin", password: str = "admin", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, scheme: str = "https", ca_certs: Optional[str] = None, verify_certs: bool = False, timeout: int = 30, use_system_proxy: bool = False) -> "BaseDocumentStore"
def open_search_index_to_document_store(document_store: "BaseDocumentStore", original_index_name: str, original_content_field: str, original_name_field: Optional[str] = None, included_metadata_fields: Optional[List[str]] = None, excluded_metadata_fields: Optional[List[str]] = None, store_original_ids: bool = True, index: Optional[str] = None, preprocessor: Optional[PreProcessor] = None, id_hash_keys: Optional[List[str]] = None, batch_size: int = 10_000, host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "admin", password: str = "admin", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, scheme: str = "https", ca_certs: Optional[str] = None, verify_certs: bool = False, timeout: int = 30, use_system_proxy: bool = False) -> "BaseDocumentStore"
```

This function provides brownfield support of existing OpenSearch indexes by converting each of the records in
@@ -4700,6 +4700,10 @@ all the indexed Documents in the `DocumentStore` will be overwritten in the second run.
- `index`: Name of index in `document_store` to use to store the resulting haystack `Document` objects.
- `preprocessor`: Optional PreProcessor that will be applied on the content field of the original OpenSearch
record.
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass a list of attribute names such as [`"content"`, `"meta"`]
to this field. In this case the id is generated from the content and the defined metadata.
- `batch_size`: Number of records to process at once.
- `host`: URL(s) of OpenSearch nodes.
- `port`: Port(s) of OpenSearch nodes.
@@ -4721,7 +4725,7 @@ You can use the certifi package with `certifi.where()` to find where the CA certs file is located.
#### elasticsearch\_index\_to\_document\_store

```python
def elasticsearch_index_to_document_store(document_store: "BaseDocumentStore", original_index_name: str, original_content_field: str, original_name_field: Optional[str] = None, included_metadata_fields: Optional[List[str]] = None, excluded_metadata_fields: Optional[List[str]] = None, store_original_ids: bool = True, index: Optional[str] = None, preprocessor: Optional[PreProcessor] = None, batch_size: int = 10_000, host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, timeout: int = 30, use_system_proxy: bool = False) -> "BaseDocumentStore"
def elasticsearch_index_to_document_store(document_store: "BaseDocumentStore", original_index_name: str, original_content_field: str, original_name_field: Optional[str] = None, included_metadata_fields: Optional[List[str]] = None, excluded_metadata_fields: Optional[List[str]] = None, store_original_ids: bool = True, index: Optional[str] = None, preprocessor: Optional[PreProcessor] = None, id_hash_keys: Optional[List[str]] = None, batch_size: int = 10_000, host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, timeout: int = 30, use_system_proxy: bool = False) -> "BaseDocumentStore"
```

This function provides brownfield support of existing Elasticsearch indexes by converting each of the records in
@@ -4752,6 +4756,10 @@ all the indexed Documents in the `DocumentStore` will be overwritten in the second run.
- `index`: Name of index in `document_store` to use to store the resulting haystack `Document` objects.
- `preprocessor`: Optional PreProcessor that will be applied on the content field of the original Elasticsearch
record.
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass a list of attribute names such as [`"content"`, `"meta"`]
to this field. In this case the id is generated from the content and the defined metadata.
- `batch_size`: Number of records to process at once.
- `host`: URL(s) of Elasticsearch nodes.
- `port`: Port(s) of Elasticsearch nodes.
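The sketch below illustrates how the new `id_hash_keys` parameter can be passed to the Elasticsearch brownfield helper. The import path, index names, field names, and connection details are assumptions for illustration only; the same pattern applies to `open_search_index_to_document_store`.

```python
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.document_stores.utils import elasticsearch_index_to_document_store
from haystack.nodes import PreProcessor

# Target store for the converted records; the index name is a placeholder and a
# locally running Elasticsearch instance is assumed.
document_store = ElasticsearchDocumentStore(index="haystack_docs")

# Copy records from an existing, non-Haystack index, split them with the PreProcessor,
# and derive document ids from content plus metadata via id_hash_keys.
document_store = elasticsearch_index_to_document_store(
    document_store=document_store,
    original_index_name="legacy_index",      # placeholder
    original_content_field="body",           # placeholder
    preprocessor=PreProcessor(split_length=200),
    id_hash_keys=["content", "meta"],
    host="localhost",
    port=9200,
)
```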
