chore: increase version for 1.8 release (#3109)

* increase version for 1.8 release * ignore missing-timeout for pylint
deepset-ai · Aug 26, 2022 · 4e518cd · 4e518cd
1 parent 3e3ff33
commit 4e518cd
Show file tree

Hide file tree

Showing 114 changed files with 43,222 additions and 2 deletions.
diff --git a/VERSION.txt b/VERSION.txt
@@ -1 +1 @@
-1.7.3rc0
+1.8.0
diff --git a/docs/_src/api/openapi/openapi-1.8.0.json b/docs/_src/api/openapi/openapi-1.8.0.json
diff --git a/docs/_src/api/openapi/openapi.json b/docs/_src/api/openapi/openapi.json
@@ -2,7 +2,7 @@
     "openapi": "3.0.2",
     "info": {
         "title": "Haystack REST API",
-        "version": "1.7.3rc0"
+        "version": "1.8.0"
     },
     "paths": {
         "/initialized": {

diff --git a/docs/v1.8.0/Makefile b/docs/v1.8.0/Makefile
@@ -0,0 +1,25 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+
+# `?=` (rather than `:=`) keeps a value inherited from the environment, which
+# is what the note above promises; a plain assignment would override it.
+SPHINXBUILD ?= sphinx-build
+MAKEINFO    ?= makeinfo
+
+# Root directory for build output and the doctree cache.
+BUILDDIR    := build
+# Documentation source directory (appended to SPHINXOPTS below).
+SOURCE      := _src/
+# Alternative flag set (adds -a -W -n), kept for reference:
+# SPHINXFLAGS := -a -W -n -A local=1 -d $(BUILDDIR)/doctree
+SPHINXFLAGS := -A local=1 -d $(BUILDDIR)/doctree
+# Flags followed by the source directory.
+SPHINXOPTS  := $(SPHINXFLAGS) $(SOURCE)
+
+# Put it first so that "make" without argument is like "make help".
+# Prints Sphinx's make-mode help. The source directory variable in this file is
+# SOURCE (SOURCEDIR is never defined here and would expand to nothing), and
+# SPHINXFLAGS is used instead of SPHINXOPTS because SPHINXOPTS already ends
+# with $(SOURCE), which would repeat the source directory as a stray argument.
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCE)" "$(BUILDDIR)" $(SPHINXFLAGS) $(O)
+
+# help is a command, not a file; Makefile is listed so the catch-all pattern
+# rule is never applied to the makefile itself.
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the
+# "make mode" option.  $(O) is a slot for extra command-line options.
+# Make mode takes its arguments positionally as
+#   -M <builder> <sourcedir> <builddir> [options]
+# so the source and build directories must come before any flags. Sphinx
+# itself writes the output to $(BUILDDIR)/<builder>, so only the build root
+# is passed here.
+%: Makefile
+	$(SPHINXBUILD) -M $@ "$(SOURCE)" "$(BUILDDIR)" $(SPHINXFLAGS) $(O)
diff --git a/docs/v1.8.0/_src/api/Makefile b/docs/v1.8.0/_src/api/Makefile
@@ -0,0 +1,20 @@
+# Sphinx documentation makefile (thin wrapper around sphinx-build make mode)
+#
+
+# Overridable settings. SPHINXOPTS and SPHINXBUILD are assigned with `?=`, so
+# values coming from the environment or the command line take precedence.
+SPHINXOPTS  ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = .
+BUILDDIR = _build
+
+.PHONY: help Makefile
+
+# First ordinary rule in the file, hence the default goal: plain "make"
+# behaves like "make help".
+help:
+	@${SPHINXBUILD} -M help "${SOURCEDIR}" "${BUILDDIR}" ${SPHINXOPTS} ${O}
+
+# Every other goal is forwarded to Sphinx make mode as the builder name.
+# ${O} is meant as a shortcut for ${SPHINXOPTS}.
+%: Makefile
+	@${SPHINXBUILD} -M $@ "${SOURCEDIR}" "${BUILDDIR}" ${SPHINXOPTS} ${O}
diff --git a/docs/v1.8.0/_src/api/_static/floating_sidebar.css b/docs/v1.8.0/_src/api/_static/floating_sidebar.css
@@ -0,0 +1,29 @@
+/* Relatively positioned wrapper, starting at offset 0 with no padding. */
+div.sphinxsidebarwrapper {
+    padding: 0;
+    position: relative;
+    top: 0;
+}
+
+/* Sidebar column: fixed 210px width, floated left, 15px side padding. */
+div.sphinxsidebar {
+    float: left;
+    width: 210px;
+    margin: 0;
+    padding: 0 15px;
+    text-align: left;
+    font-size: 1em;
+}
+
+/* Logo text inside the sidebar. */
+div.sphinxsidebar .logo {
+    text-align: center;
+    font-weight: 300;
+    font-size: 1.8em;
+    color: #0a507a;
+}
+
+/* Vertically centre images in the logo and in download links. */
+div.sphinxsidebar .logo img,
+div.sphinxsidebar .download a img {
+    vertical-align: middle;
+}
diff --git a/docs/v1.8.0/_src/api/_templates/xxlayout.html b/docs/v1.8.0/_src/api/_templates/xxlayout.html
@@ -0,0 +1,46 @@
+{#
+   put the sidebar before the body (sidebar1 renders it, sidebar2 is left
+   empty), then extend the head with a web font, a few style rules and a
+   scroll handler that repositions the sidebar wrapper.
+   Note: the "if not embedded" / "endif" lines inside extrahead are written
+   as Jinja comments, so the style and script are emitted unconditionally.
+#}
+{% block sidebar1 %}{{ sidebar() }}{% endblock %}
+{% block sidebar2 %}{% endblock %}
+
+{% block extrahead %}
+    <link href='https://fonts.googleapis.com/css?family=Open+Sans:300,400,700'
+          rel='stylesheet' type='text/css' />
+{{ super() }}
+{#- if not embedded #}
+    <style type="text/css">
+      table.right { float: left; margin-left: 20px; }
+      table.right td { border: 1px solid #ccc; }
+      {% if pagename == 'index' %}
+      .related { display: none; }
+      {% endif %}
+    </style>
+    <script>
+      // intelligent scrolling of the sidebar content
+      // On every window scroll event, recompute the CSS `top` of the
+      // sidebar wrapper.
+      // NOTE(review): `$` is presumably jQuery and `$u` an underscore.js
+      // alias supplied by the page; neither is defined in this template,
+      // confirm they are loaded before this script runs.
+      $(window).scroll(function() {
+        var sb = $('.sphinxsidebarwrapper');
+        var win = $(window);
+        // sbh: wrapper height; offset: top position of the sidebar element
+        var sbh = sb.height();
+        var offset = $('.sphinxsidebar').position()['top'];
+        // wintop / winbot: top and bottom edge of the visible window
+        var wintop = win.scrollTop();
+        var winbot = wintop + win.innerHeight();
+        // curtop / curbot: current top and bottom edge of the wrapper
+        var curtop = sb.position()['top'];
+        var curbot = curtop + sbh;
+        // does sidebar fit in window?
+        if (sbh < win.innerHeight()) {
+          // yes: easy case -- always keep at the top
+          // (scroll position relative to the sidebar minus 10, never below 0,
+          // capped at document height - wrapper height - 200)
+          sb.css('top', $u.min([$u.max([0, wintop - offset - 10]),
+                                $(document).height() - sbh - 200]));
+        } else {
+          // no: only scroll if top/bottom edge of sidebar is at
+          // top/bottom edge of window
+          if (curtop > wintop && curbot > winbot) {
+            // wrapper top is below the window top: align the top edges
+            sb.css('top', $u.max([wintop - offset - 10, 0]));
+          } else if (curtop < wintop && curbot < winbot) {
+            // wrapper bottom is above the window bottom: align the bottom
+            // edges, with the same cap as in the easy case
+            sb.css('top', $u.min([winbot - sbh - offset - 20,
+                                  $(document).height() - sbh - 200]));
+          }
+        }
+      });
+    </script>
+{#- endif #}
+{% endblock %}
diff --git a/docs/v1.8.0/_src/api/api/crawler.md b/docs/v1.8.0/_src/api/api/crawler.md
@@ -0,0 +1,154 @@
+<a id="crawler"></a>
+
+# Module crawler
+
+<a id="crawler.Crawler"></a>
+
+## Crawler
+
+```python
+class Crawler(BaseComponent)
+```
+
+Crawl texts from a website so that we can use them later in Haystack as a corpus for search / question answering etc.
+
+**Example:**
+```python
+from haystack.nodes.connector import Crawler
+
+crawler = Crawler(output_dir="crawled_files")
+# crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/
+docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"],
+                     filter_urls= ["haystack.deepset.ai/overview/"])
+```
+
+<a id="crawler.Crawler.__init__"></a>
+
+#### Crawler.\_\_init\_\_
+
+```python
+def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None, webdriver_options: Optional[List[str]] = None)
+```
+
+Init object with basic params for crawling (can be overwritten later).
+
+**Arguments**:
+
+- `output_dir`: Path for the directory to store files
+- `urls`: List of http(s) address(es) (can also be supplied later when calling crawl())
+- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options:
+0: Only initial list of urls
+1: Follow links found on the initial URLs (but no further)
+- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
+All URLs not matching at least one of the regular expressions will be dropped.
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+- `extract_hidden_text`: Whether to extract the hidden text contained in page.
+E.g. the text can be inside a span with style="display: none"
+- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on
+dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
+E.g. 2: Crawler will wait 2 seconds before scraping page
+- `crawler_naming_function`: A function mapping the crawled page to a file name.
+By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
+E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
+        This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
+     2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
+        This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
+- `webdriver_options`: A list of options to send to Selenium webdriver. If none is provided,
+Crawler uses, as a default option, a reasonable selection for operating locally, on restricted docker containers,
+and avoids using GPU.
+Crawler always appends the following option: "--headless"
+For example: 1) ["--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", "--single-process"]
+        These are the default options which disable GPU, disable shared memory usage
+        and spawn a single process.
+     2) ["--no-sandbox"]
+        This option disables the sandbox, which is required for running Chrome as root.
+See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.chrome.options) for more details.
+
+<a id="crawler.Crawler.crawl"></a>
+
+#### Crawler.crawl
+
+```python
+def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> List[Path]
+```
+
+Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
+
+file per URL, including text and basic metadata).
+You can optionally specify via `filter_urls` to only crawl URLs that match a certain pattern.
+All parameters are optional here and only meant to overwrite instance attributes at runtime.
+If no parameters are provided to this method, the instance attributes that were passed during __init__ will be used.
+
+**Arguments**:
+
+- `output_dir`: Path for the directory to store files
+- `urls`: List of http addresses or single http address
+- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options:
+0: Only initial list of urls
+1: Follow links found on the initial URLs (but no further)
+- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
+All URLs not matching at least one of the regular expressions will be dropped.
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on
+dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
+E.g. 2: Crawler will wait 2 seconds before scraping page
+- `crawler_naming_function`: A function mapping the crawled page to a file name.
+By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
+E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
+        This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
+     2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
+        This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
+
+**Returns**:
+
+List of paths where the crawled webpages got stored
+
+<a id="crawler.Crawler.run"></a>
+
+#### Crawler.run
+
+```python
+def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]
+```
+
+Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
+
+**Arguments**:
+
+- `output_dir`: Path for the directory to store files
+- `urls`: List of http addresses or single http address
+- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options:
+0: Only initial list of urls
+1: Follow links found on the initial URLs (but no further)
+- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
+All URLs not matching at least one of the regular expressions will be dropped.
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
+- `return_documents`: Whether to return the content of the JSON files
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+- `extract_hidden_text`: Whether to extract the hidden text contained in page.
+E.g. the text can be inside a span with style="display: none"
+- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on
+dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
+E.g. 2: Crawler will wait 2 seconds before scraping page
+- `crawler_naming_function`: A function mapping the crawled page to a file name.
+By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
+E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
+        This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
+     2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
+        This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
+
+**Returns**:
+
+Tuple({"paths": List of filepaths, ...}, Name of output edge)
+