From 6ee9ddc90dedbe0202cfdfa366f821fe587a882d Mon Sep 17 00:00:00 2001 From: Thomas Stadelmann Date: Wed, 16 Feb 2022 17:22:37 +0100 Subject: [PATCH 01/22] pipeline.to_code() with jupyter support --- haystack/pipelines/base.py | 45 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index 760883d9b8..f9bb957afc 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -6,6 +6,7 @@ import inspect import logging import os +import re import traceback import numpy as np import pandas as pd @@ -1040,6 +1041,50 @@ def get_config(self, return_defaults: bool = False) -> dict: config = {"components": list(components.values()), "pipelines": list(pipelines.values()), "version": "0.8"} return config + def to_code(self, pipeline_name="pipeline", create_ipython_cell=True): + code = self._generate_pipeline_code(pipeline_name=pipeline_name) + if create_ipython_cell: + try: + get_ipython().set_next_input(code) #type: ignore + except NameError: + return code + else: + return code + + def _order_component_code(self, depedency: Dict[str, List[str]], keys: Optional[List[str]]=None): + ordered = [] + keys = keys or depedency.keys() #type: ignore + for k in keys: #type: ignore + v = depedency[k] + if len(v) > 0: + ordered += [k for k in self._order_component_code(depedency, v) if k not in ordered] + if k not in ordered: + ordered.append(k) + return ordered + + def _generate_pipeline_code(self, pipeline_name="pipeline"): + config = self.get_config() + code = "" + name_to_variable_name = {c['name']: re.sub(r'(? Date: Wed, 16 Feb 2022 16:37:14 +0000 Subject: [PATCH 02/22] Update Documentation & Code Style --- docs/_src/api/openapi/openapi.json | 2 +- haystack/pipelines/base.py | 29 ++++++++++++++++++----------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/docs/_src/api/openapi/openapi.json b/docs/_src/api/openapi/openapi.json index af6d9ce457..f5fcee5b74 100644 --- a/docs/_src/api/openapi/openapi.json +++ b/docs/_src/api/openapi/openapi.json @@ -278,7 +278,7 @@ "file-upload" ], "summary": "Upload File", - "description": "You can use this endpoint to upload a file for indexing\n(see [http://localhost:3000/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store]).", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", "operationId": "upload_file_file_upload_post", "requestBody": { "content": { diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index f9bb957afc..70df39246a 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -1045,35 +1045,42 @@ def to_code(self, pipeline_name="pipeline", create_ipython_cell=True): code = self._generate_pipeline_code(pipeline_name=pipeline_name) if create_ipython_cell: try: - get_ipython().set_next_input(code) #type: ignore + get_ipython().set_next_input(code) # type: ignore except NameError: return code else: return code - def _order_component_code(self, depedency: Dict[str, List[str]], keys: Optional[List[str]]=None): + def _order_component_code(self, depedency: Dict[str, List[str]], keys: Optional[List[str]] = None): ordered = [] - keys = keys or depedency.keys() #type: ignore - for k in keys: #type: ignore + keys = keys or depedency.keys() # type: ignore + for k in keys: # type: ignore v = depedency[k] if len(v) > 0: ordered += [k for k in 
self._order_component_code(depedency, v) if k not in ordered] - if k not in ordered: + if k not in ordered: ordered.append(k) return ordered def _generate_pipeline_code(self, pipeline_name="pipeline"): config = self.get_config() code = "" - name_to_variable_name = {c['name']: re.sub(r'(? Date: Wed, 16 Feb 2022 18:01:52 +0100 Subject: [PATCH 03/22] add imports --- haystack/pipelines/base.py | 53 ++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index f9bb957afc..64319fab24 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -1,4 +1,5 @@ from __future__ import annotations +import sys from typing import Dict, List, Optional, Any import copy @@ -1041,39 +1042,63 @@ def get_config(self, return_defaults: bool = False) -> dict: config = {"components": list(components.values()), "pipelines": list(pipelines.values()), "version": "0.8"} return config - def to_code(self, pipeline_name="pipeline", create_ipython_cell=True): - code = self._generate_pipeline_code(pipeline_name=pipeline_name) + def to_code(self, pipeline_name: str = "pipeline", create_ipython_cell: bool = True, generate_imports: bool = True): + code = self._generate_pipeline_code(pipeline_name=pipeline_name, generate_imports=generate_imports) if create_ipython_cell: try: - get_ipython().set_next_input(code) #type: ignore + get_ipython().set_next_input(code) # type: ignore except NameError: return code else: return code - def _order_component_code(self, depedency: Dict[str, List[str]], keys: Optional[List[str]]=None): + def _order_component_code(self, depedency: Dict[str, List[str]], keys: Optional[List[str]] = None): ordered = [] - keys = keys or depedency.keys() #type: ignore - for k in keys: #type: ignore + keys = keys or depedency.keys() # type: ignore + for k in keys: # type: ignore v = depedency[k] if len(v) > 0: ordered += [k for k in self._order_component_code(depedency, v) if k not in ordered] - if k not in ordered: + if k not in ordered: ordered.append(k) return ordered - def _generate_pipeline_code(self, pipeline_name="pipeline"): + def _generate_pipeline_code(self, pipeline_name: str = "pipeline", generate_imports: bool = True): config = self.get_config() code = "" - name_to_variable_name = {c['name']: re.sub(r'(? Date: Wed, 16 Feb 2022 19:20:57 +0100 Subject: [PATCH 04/22] refactoring --- haystack/pipelines/base.py | 113 ++++++++++++++++++++++++------------- 1 file changed, 74 insertions(+), 39 deletions(-) diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index 64319fab24..36caab407e 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -40,6 +40,7 @@ ROOT_NODE_TO_PIPELINE_NAME = {"query": "query", "file": "indexing"} +CAMEL_CASE_TO_SNAKE_CASE_REGEX = re.compile(r"(? 
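The `CAMEL_CASE_TO_SNAKE_CASE_REGEX` constant introduced here drives the component-name-to-variable-name conversion used by the code generator. A standalone sketch that is consistent with the variable names asserted by the tests later in this series; the exact pattern literal is an assumption reconstructed from that behaviour:

```python
import re

# Assumed pattern: match the (empty) position before any uppercase letter
# that is not at the start of the string.
CAMEL_CASE_TO_SNAKE_CASE_REGEX = re.compile(r"(?<!^)(?=[A-Z])")

def camel_to_snake_case(name: str) -> str:
    # Insert "_" at every matched position, then lowercase the result.
    return CAMEL_CASE_TO_SNAKE_CASE_REGEX.sub("_", name).lower()

assert camel_to_snake_case("EsRetriever") == "es_retriever"
assert camel_to_snake_case("JoinResults") == "join_results"
assert camel_to_snake_case("ElasticsearchDocumentStore") == "elasticsearch_document_store"
```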
dict: return config def to_code(self, pipeline_name: str = "pipeline", create_ipython_cell: bool = True, generate_imports: bool = True): - code = self._generate_pipeline_code(pipeline_name=pipeline_name, generate_imports=generate_imports) + config = self.get_config() + code = self._generate_code(config=config, pipeline_name=pipeline_name, generate_imports=generate_imports) if create_ipython_cell: try: get_ipython().set_next_input(code) # type: ignore @@ -1052,62 +1054,95 @@ def to_code(self, pipeline_name: str = "pipeline", create_ipython_cell: bool = T else: return code - def _order_component_code(self, depedency: Dict[str, List[str]], keys: Optional[List[str]] = None): + @classmethod + def _order_component_code(cls, depedency: Dict[str, List[str]], keys: Optional[List[str]] = None): ordered = [] keys = keys or depedency.keys() # type: ignore for k in keys: # type: ignore v = depedency[k] if len(v) > 0: - ordered += [k for k in self._order_component_code(depedency, v) if k not in ordered] + ordered += [k for k in cls._order_component_code(depedency, v) if k not in ordered] if k not in ordered: ordered.append(k) return ordered - def _generate_pipeline_code(self, pipeline_name: str = "pipeline", generate_imports: bool = True): - config = self.get_config() - code = "" - if generate_imports: - types = [c["type"] for c in config["components"]] - allowed_imports = ["haystack.nodes", "haystack.document_stores"] - importable_classes = { - name: mod - for mod in allowed_imports - for name, obj in inspect.getmembers(sys.modules[mod]) - if inspect.isclass(obj) - } + @classmethod + def _camel_to_snake_case(cls, input: str) -> str: + return CAMEL_CASE_TO_SNAKE_CASE_REGEX.sub("_", input).lower() - for t in types: - if t in importable_classes: - code += f"from {importable_classes[t]} import {t}\n" - else: - code += f"# from MODULE_NOT_FOUND import {t}\n" - code += "\n" + @classmethod + def _generate_code(cls, config: dict, pipeline_name: str = "pipeline", generate_imports: bool = True): + component_definitions = config["components"] + pipeline_definition = config["pipelines"][0] + code_parts = [] + if generate_imports: + types_to_import = [c["type"] for c in component_definitions] + code_parts.append(cls._generate_imports_code(types_to_import=types_to_import)) - name_to_variable_name = { - c["name"]: re.sub(r"(? 
Date: Wed, 16 Feb 2022 18:23:02 +0000 Subject: [PATCH 05/22] Update Documentation & Code Style --- haystack/pipelines/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index 36caab407e..4c5243a893 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -1079,9 +1079,7 @@ def _generate_code(cls, config: dict, pipeline_name: str = "pipeline", generate_ types_to_import = [c["type"] for c in component_definitions] code_parts.append(cls._generate_imports_code(types_to_import=types_to_import)) - component_variable_names = { - c["name"]: cls._camel_to_snake_case(c["name"]) for c in component_definitions - } + component_variable_names = {c["name"]: cls._camel_to_snake_case(c["name"]) for c in component_definitions} code_parts.append( cls._generate_components_code( component_definitions=component_definitions, component_variable_names=component_variable_names From 6ed77a0b8fd3071e175e1e12ee2ed3e9bf358c7c Mon Sep 17 00:00:00 2001 From: Thomas Stadelmann Date: Thu, 17 Feb 2022 15:27:43 +0100 Subject: [PATCH 06/22] docstrings added and refactoring --- haystack/pipelines/base.py | 246 +++++++++++++++++++++---------------- 1 file changed, 142 insertions(+), 104 deletions(-) diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index 36caab407e..463df75084 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -71,6 +71,36 @@ def get_config(self, return_defaults: bool = False) -> dict: """ raise NotImplementedError + def to_code( + self, pipeline_variable_name: str = "pipeline", create_ipython_cell: bool = True, generate_imports: bool = True + ): + """ + Returns the respecting code to create this pipeline. + If we're running in a notebook environment and create_ipyhton_cell is set to True, + this will automatically create a new notebook cell containing the code instead of returning a string. + + :param pipeline_variable_name: The variable name of the generated pipeline. + Default value is 'pipeline'. + :param create_ipython_cell: If set to True and we're running in a notebook environment, to_code() will automatically create a new notebook cell containing the code. + Otherwise to_code() will return the code as string. + Default value is True. + :param generate_imports: Whether to include the required import statements into the code. + Default value is True. 
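A minimal usage sketch for `to_code()` as it stands after this patch (the store, retriever, and model name are illustrative; outside IPython the method returns the generated code as a string):

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.pipelines import Pipeline

document_store = InMemoryDocumentStore(index="my-index")
retriever = EmbeddingRetriever(
    document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
)
pipeline = Pipeline()
pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])

# With create_ipython_cell=False (or outside a notebook) the generated
# code comes back as a plain string instead of a new notebook cell.
code = pipeline.to_code(pipeline_variable_name="p", create_ipython_cell=False)
print(code)
```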
+ """ + pipeline_config = self.get_config() + code = self._generate_code( + pipeline_config=pipeline_config, + pipeline_variable_name=pipeline_variable_name, + generate_imports=generate_imports, + ) + if create_ipython_cell: + try: + get_ipython().set_next_input(code) # type: ignore + except NameError: + return code + else: + return code + @classmethod def load_from_config( cls, pipeline_config: Dict, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True @@ -338,7 +368,100 @@ def save_to_deepset_cloud( logger.info(f"Pipeline config '{pipeline_config_name}' successfully created.") @classmethod - def _get_pipeline_definition(cls, pipeline_config: Dict, pipeline_name: Optional[str] = None): + def _camel_to_snake_case(cls, input: str) -> str: + return CAMEL_CASE_TO_SNAKE_CASE_REGEX.sub("_", input).lower() + + @classmethod + def _generate_code( + cls, pipeline_config: dict, pipeline_variable_name: str = "pipeline", generate_imports: bool = True + ) -> str: + component_definitions = cls._get_component_definitions( + pipeline_config=pipeline_config, overwrite_with_env_variables=False + ) + component_variable_names = {name: cls._camel_to_snake_case(name) for name in component_definitions.keys()} + pipeline_definition = cls._get_pipeline_definition(pipeline_config=pipeline_config) + + code_parts = [] + if generate_imports: + types_to_import = [component["type"] for component in component_definitions.values()] + imports_code = cls._generate_imports_code(types_to_import=types_to_import) + code_parts.append(imports_code) + + components_code = cls._generate_components_code( + component_definitions=component_definitions, component_variable_names=component_variable_names + ) + pipeline_code = cls._generate_pipeline_code( + pipeline_definition=pipeline_definition, + component_variable_names=component_variable_names, + pipeline_variable_name=pipeline_variable_name, + ) + + code_parts.append(components_code) + code_parts.append(pipeline_code) + code = "\n\n".join(code_parts) + return code + + @classmethod + def _generate_pipeline_code( + cls, pipeline_definition: dict, component_variable_names: Dict[str, str], pipeline_variable_name: str + ) -> str: + code_lines = [f"{pipeline_variable_name} = Pipeline()"] + for node in pipeline_definition["nodes"]: + node_name = node["name"] + component_variable_name = component_variable_names[node_name] + inputs = ", ".join(f'"{name}"' for name in node["inputs"]) + code_lines.append( + f'{pipeline_variable_name}.add_node(component={component_variable_name}, name="{node_name}", inputs=[{inputs}])' + ) + + code = "\n".join(code_lines) + return code + + @classmethod + def _generate_components_code(cls, component_definitions: dict, component_variable_names: Dict[str, str]) -> str: + code = "" + declarations = {} + dependency_map = {} + for name, definition in component_definitions.items(): + variable_name = component_variable_names[name] + class_name = definition["type"] + param_value_dict = { + key: component_variable_names.get(value, f'"{value}"') if type(value) == str else value + for key, value in definition["params"].items() + } + init_args = ", ".join(f"{key}={value}" for key, value in param_value_dict.items()) + declarations[name] = f"{variable_name} = {class_name}({init_args})" + dependency_map[name] = [ + param_value for param_value in definition["params"].values() if param_value in component_variable_names + ] + + ordered_components = cls._order_components(dependency_map=dependency_map) + ordered_declarations = [declarations[component] for 
component in ordered_components] + code = "\n".join(ordered_declarations) + return code + + @classmethod + def _generate_imports_code(cls, types_to_import: List[str]) -> str: + code_lines = [] + allowed_imports = ["haystack.nodes", "haystack.document_stores"] + importable_classes = { + name: mod + for mod in allowed_imports + for name, obj in inspect.getmembers(sys.modules[mod]) + if inspect.isclass(obj) + } + + for t in types_to_import: + if t in importable_classes: + code_lines.append(f"from {importable_classes[t]} import {t}") + else: + code_lines.append(f"# from MODULE_NOT_FOUND import {t}") + + code = "\n".join(code_lines) + return code + + @classmethod + def _get_pipeline_definition(cls, pipeline_config: Dict, pipeline_name: Optional[str] = None) -> dict: """ Get the definition of Pipeline from a given pipeline config. If the config contains more than one Pipeline, then the pipeline_name must be supplied. @@ -360,7 +483,7 @@ def _get_pipeline_definition(cls, pipeline_config: Dict, pipeline_name: Optional return pipeline_definition @classmethod - def _get_component_definitions(cls, pipeline_config: Dict, overwrite_with_env_variables: bool): + def _get_component_definitions(cls, pipeline_config: Dict, overwrite_with_env_variables: bool) -> dict: """ Returns the definitions of all components from a given pipeline config. @@ -380,6 +503,23 @@ def _get_component_definitions(cls, pipeline_config: Dict, overwrite_with_env_va return component_definitions + @classmethod + def _order_components( + cls, dependency_map: Dict[str, List[str]], components_to_order: Optional[List[str]] = None + ) -> List[str]: + ordered_components = [] + if components_to_order is None: + components_to_order = list(dependency_map.keys()) + for component in components_to_order: + dependencies = dependency_map[component] + ordered_dependencies = cls._order_components( + dependency_map=dependency_map, components_to_order=dependencies + ) + ordered_components += [d for d in ordered_dependencies if d not in ordered_components] + if component not in ordered_components: + ordered_components.append(component) + return ordered_components + @classmethod def _overwrite_with_env_variables(cls, definition: dict): """ @@ -1043,108 +1183,6 @@ def get_config(self, return_defaults: bool = False) -> dict: config = {"components": list(components.values()), "pipelines": list(pipelines.values()), "version": "0.8"} return config - def to_code(self, pipeline_name: str = "pipeline", create_ipython_cell: bool = True, generate_imports: bool = True): - config = self.get_config() - code = self._generate_code(config=config, pipeline_name=pipeline_name, generate_imports=generate_imports) - if create_ipython_cell: - try: - get_ipython().set_next_input(code) # type: ignore - except NameError: - return code - else: - return code - - @classmethod - def _order_component_code(cls, depedency: Dict[str, List[str]], keys: Optional[List[str]] = None): - ordered = [] - keys = keys or depedency.keys() # type: ignore - for k in keys: # type: ignore - v = depedency[k] - if len(v) > 0: - ordered += [k for k in cls._order_component_code(depedency, v) if k not in ordered] - if k not in ordered: - ordered.append(k) - return ordered - - @classmethod - def _camel_to_snake_case(cls, input: str) -> str: - return CAMEL_CASE_TO_SNAKE_CASE_REGEX.sub("_", input).lower() - - @classmethod - def _generate_code(cls, config: dict, pipeline_name: str = "pipeline", generate_imports: bool = True): - component_definitions = config["components"] - pipeline_definition = 
config["pipelines"][0] - code_parts = [] - if generate_imports: - types_to_import = [c["type"] for c in component_definitions] - code_parts.append(cls._generate_imports_code(types_to_import=types_to_import)) - - component_variable_names = { - c["name"]: cls._camel_to_snake_case(c["name"]) for c in component_definitions - } - code_parts.append( - cls._generate_components_code( - component_definitions=component_definitions, component_variable_names=component_variable_names - ) - ) - code_parts.append( - cls._generate_pipeline_code( - pipeline_definition=pipeline_definition, - component_variable_names=component_variable_names, - pipeline_name=pipeline_name, - ) - ) - code = "\n".join(code_parts) - return code - - @classmethod - def _generate_pipeline_code( - cls, pipeline_definition: Dict, component_variable_names: Dict[str, str], pipeline_name: str - ): - code = f"{pipeline_name} = Pipeline()\n" - for node in pipeline_definition["nodes"]: - component_variable_name = component_variable_names[node["name"]] - inputs = ", ".join(f'"{name}"' for name in node["inputs"]) - code += f"{pipeline_name}.add_node(component={component_variable_name}, name=\"{node['name']}\", inputs=[{inputs}])\n" - - @classmethod - def _generate_components_code(cls, component_definitions: List[Dict], component_variable_names: Dict[str, str]): - code = "" - declarations = {} - dependencies = {} - for component in component_definitions: - param_value_dict = { - k: component_variable_names.get(v, f'"{v}"') if type(v) == str else v - for k, v in component["params"].items() - } - args = ", ".join(f"{k}={v}" for k, v in param_value_dict.items()) - variable_name = component_variable_names[component["name"]] - declarations[variable_name] = f"{variable_name} = {component['type']}({args})\n" - dependencies[variable_name] = [ - component_variable_names[v] for v in component["params"].values() if v in component_variable_names - ] - for v in cls._order_component_code(dependencies): - code += declarations[v] - return code - - @classmethod - def _generate_imports_code(cls, types_to_import: List[str]): - code = "" - allowed_imports = ["haystack.nodes", "haystack.document_stores"] - importable_classes = { - name: mod - for mod in allowed_imports - for name, obj in inspect.getmembers(sys.modules[mod]) - if inspect.isclass(obj) - } - - for t in types_to_import: - if t in importable_classes: - code += f"from {importable_classes[t]} import {t}\n" - else: - code += f"# from MODULE_NOT_FOUND import {t}\n" - return code - def _format_document_answer(self, document_or_answer: dict): return "\n \t".join([f"{name}: {value}" for name, value in document_or_answer.items()]) From 0c33bc717a0c45ae9b5dea80e53fc733a8e68e0e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 17 Feb 2022 14:34:39 +0000 Subject: [PATCH 07/22] Update Documentation & Code Style --- docs/_src/api/api/pipelines.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md index b68750be81..76cce4558c 100644 --- a/docs/_src/api/api/pipelines.md +++ b/docs/_src/api/api/pipelines.md @@ -37,6 +37,29 @@ Returns a configuration for the Pipeline that can be used with `BasePipeline.loa - `return_defaults`: whether to output parameters that have the default values. 
+ + +#### to\_code + +```python +def to_code(pipeline_variable_name: str = "pipeline", create_ipython_cell: bool = True, generate_imports: bool = True) +``` + +Returns the respecting code to create this pipeline. + +If we're running in a notebook environment and create_ipyhton_cell is set to True, +this will automatically create a new notebook cell containing the code instead of returning a string. + +**Arguments**: + +- `pipeline_variable_name`: The variable name of the generated pipeline. +Default value is 'pipeline'. +- `create_ipython_cell`: If set to True and we're running in a notebook environment, to_code() will automatically create a new notebook cell containing the code. +Otherwise to_code() will return the code as string. +Default value is True. +- `generate_imports`: Whether to include the required import statements into the code. +Default value is True. + #### load\_from\_config From 896616bb4f572db6358ba7820b37753eb0638ed0 Mon Sep 17 00:00:00 2001 From: Thomas Stadelmann Date: Thu, 17 Feb 2022 16:10:02 +0100 Subject: [PATCH 08/22] improve imports code generation --- haystack/pipelines/base.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index 463df75084..0fd667d98a 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -41,6 +41,8 @@ ROOT_NODE_TO_PIPELINE_NAME = {"query": "query", "file": "indexing"} CAMEL_CASE_TO_SNAKE_CASE_REGEX = re.compile(r"(? str: code_lines = [] - allowed_imports = ["haystack.nodes", "haystack.document_stores"] importable_classes = { name: mod - for mod in allowed_imports + for mod in CODE_GEN_ALLOWED_IMPORTS for name, obj in inspect.getmembers(sys.modules[mod]) if inspect.isclass(obj) } + imports_by_module: Dict[str, List[str]] = {} for t in types_to_import: - if t in importable_classes: - code_lines.append(f"from {importable_classes[t]} import {t}") + mod = importable_classes.get(t, MODULE_NOT_FOUND) + if mod in imports_by_module: + imports_by_module[mod].append(t) else: - code_lines.append(f"# from MODULE_NOT_FOUND import {t}") + imports_by_module[mod] = [t] + + for mod in sorted(imports_by_module.keys()): + sorted_types = sorted(set(imports_by_module[mod])) + import_types = ", ".join(sorted_types) + line_prefix = "# " if mod == MODULE_NOT_FOUND else "" + code_lines.append(f"{line_prefix}from {mod} import {import_types}") code = "\n".join(code_lines) return code From b27d73bcb8480a0eb0b885535839cdc93cd4368c Mon Sep 17 00:00:00 2001 From: Thomas Stadelmann Date: Thu, 17 Feb 2022 17:19:08 +0100 Subject: [PATCH 09/22] add comment param --- haystack/pipelines/base.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index 0fd667d98a..76ceb9a548 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -43,6 +43,7 @@ CAMEL_CASE_TO_SNAKE_CASE_REGEX = re.compile(r"(? dict: raise NotImplementedError def to_code( - self, pipeline_variable_name: str = "pipeline", create_ipython_cell: bool = True, generate_imports: bool = True + self, + pipeline_variable_name: str = "pipeline", + create_notebook_cell: bool = True, + generate_imports: bool = True, + comment: Optional[str] = CODE_GEN_DEFAULT_COMMENT, ): """ Returns the respecting code to create this pipeline. @@ -83,19 +88,22 @@ def to_code( :param pipeline_variable_name: The variable name of the generated pipeline. Default value is 'pipeline'. 
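The import-generation change above groups classes by their source module before emitting `from ... import ...` lines. The same idea as a self-contained sketch (the module map and the unknown class name are illustrative):

```python
from typing import Dict, List

MODULE_NOT_FOUND = "MODULE_NOT_FOUND"

def generate_imports_code(types_to_import: List[str], importable_classes: Dict[str, str]) -> str:
    # Group the requested class names by the module they are importable from;
    # unknown classes fall under MODULE_NOT_FOUND and are emitted commented out.
    imports_by_module: Dict[str, List[str]] = {}
    for type_name in types_to_import:
        module = importable_classes.get(type_name, MODULE_NOT_FOUND)
        imports_by_module.setdefault(module, []).append(type_name)

    code_lines = []
    for module in sorted(imports_by_module):
        import_types = ", ".join(sorted(set(imports_by_module[module])))
        line_prefix = "# " if module == MODULE_NOT_FOUND else ""
        code_lines.append(f"{line_prefix}from {module} import {import_types}")
    return "\n".join(code_lines)

print(
    generate_imports_code(
        ["ElasticsearchRetriever", "ElasticsearchDocumentStore", "MyCustomNode"],
        {
            "ElasticsearchRetriever": "haystack.nodes",
            "ElasticsearchDocumentStore": "haystack.document_stores",
        },
    )
)
# # from MODULE_NOT_FOUND import MyCustomNode
# from haystack.document_stores import ElasticsearchDocumentStore
# from haystack.nodes import ElasticsearchRetriever
```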
- :param create_ipython_cell: If set to True and we're running in a notebook environment, to_code() will automatically create a new notebook cell containing the code. + :param create_notebook_cell: If set to True and we're running in a notebook environment, to_code() will automatically create a new notebook cell containing the code. Otherwise to_code() will return the code as string. Default value is True. :param generate_imports: Whether to include the required import statements into the code. Default value is True. + :param comment: Preceding comment to add. + Defaults to "This code has been generated by Pipeline.to_code()" """ pipeline_config = self.get_config() code = self._generate_code( pipeline_config=pipeline_config, pipeline_variable_name=pipeline_variable_name, generate_imports=generate_imports, + comment=comment, ) - if create_ipython_cell: + if create_notebook_cell: try: get_ipython().set_next_input(code) # type: ignore except NameError: @@ -375,7 +383,11 @@ def _camel_to_snake_case(cls, input: str) -> str: @classmethod def _generate_code( - cls, pipeline_config: dict, pipeline_variable_name: str = "pipeline", generate_imports: bool = True + cls, + pipeline_config: dict, + pipeline_variable_name: str = "pipeline", + generate_imports: bool = True, + comment: Optional[str] = None, ) -> str: component_definitions = cls._get_component_definitions( pipeline_config=pipeline_config, overwrite_with_env_variables=False @@ -401,6 +413,11 @@ def _generate_code( code_parts.append(components_code) code_parts.append(pipeline_code) code = "\n\n".join(code_parts) + + if comment: + comment = re.sub(r"^(#\s)?", "# ", comment, flags=re.MULTILINE) + code = "\n".join([comment, code]) + return code @classmethod From f9fea7a9cf69b876d29ab0ec1611f61515fcf135 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 17 Feb 2022 16:28:45 +0000 Subject: [PATCH 10/22] Update Documentation & Code Style --- docs/_src/api/api/pipelines.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md index 76cce4558c..425a2b6a41 100644 --- a/docs/_src/api/api/pipelines.md +++ b/docs/_src/api/api/pipelines.md @@ -42,7 +42,7 @@ Returns a configuration for the Pipeline that can be used with `BasePipeline.loa #### to\_code ```python -def to_code(pipeline_variable_name: str = "pipeline", create_ipython_cell: bool = True, generate_imports: bool = True) +def to_code(pipeline_variable_name: str = "pipeline", create_notebook_cell: bool = True, generate_imports: bool = True, comment: Optional[str] = CODE_GEN_DEFAULT_COMMENT) ``` Returns the respecting code to create this pipeline. @@ -54,11 +54,13 @@ this will automatically create a new notebook cell containing the code instead o - `pipeline_variable_name`: The variable name of the generated pipeline. Default value is 'pipeline'. -- `create_ipython_cell`: If set to True and we're running in a notebook environment, to_code() will automatically create a new notebook cell containing the code. +- `create_notebook_cell`: If set to True and we're running in a notebook environment, to_code() will automatically create a new notebook cell containing the code. Otherwise to_code() will return the code as string. Default value is True. - `generate_imports`: Whether to include the required import statements into the code. Default value is True. +- `comment`: Preceding comment to add. 
+Defaults to "This code has been generated by Pipeline.to_code()" From 62d77a38f7cf44825e3eb1de42283b2f7c5bae60 Mon Sep 17 00:00:00 2001 From: Thomas Stadelmann Date: Fri, 18 Feb 2022 10:20:01 +0100 Subject: [PATCH 11/22] add simple test --- test/test_pipeline.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/test/test_pipeline.py b/test/test_pipeline.py index 75b94c2f86..af9d80438b 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -162,6 +162,24 @@ def test_load_tfidfretriever_yaml(tmp_path): assert prediction["answers"][0].answer == "haystack" +@pytest.mark.elasticsearch +def test_to_code(): + index_pipeline = Pipeline.load_from_yaml( + SAMPLES_PATH / "pipeline" / "test_pipeline.yaml", pipeline_name="indexing_pipeline" + ) + query_pipeline = Pipeline.load_from_yaml( + SAMPLES_PATH / "pipeline" / "test_pipeline.yaml", pipeline_name="query_pipeline" + ) + query_pipeline_code = query_pipeline.to_code(pipeline_variable_name="query_pipeline_from_code") + index_pipeline_code = index_pipeline.to_code(pipeline_variable_name="index_pipeline_from_code") + exec(query_pipeline_code) + exec(index_pipeline_code) + assert locals()["query_pipeline_from_code"] is not None + assert locals()["index_pipeline_from_code"] is not None + assert query_pipeline.get_config() == locals()["query_pipeline_from_code"].get_config() + assert index_pipeline.get_config() == locals()["index_pipeline_from_code"].get_config() + + @pytest.mark.usefixtures(deepset_cloud_fixture.__name__) @responses.activate def test_load_from_deepset_cloud_query(): From cf2248fbfe3a1fee963bc52e8109617b86c2f04c Mon Sep 17 00:00:00 2001 From: Thomas Stadelmann Date: Mon, 21 Feb 2022 20:16:28 +0100 Subject: [PATCH 12/22] add to_notebook_cell() --- haystack/pipelines/base.py | 52 +++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index 76ceb9a548..6c572b0416 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -43,7 +43,7 @@ CAMEL_CASE_TO_SNAKE_CASE_REGEX = re.compile(r"(? dict: def to_code( self, pipeline_variable_name: str = "pipeline", - create_notebook_cell: bool = True, generate_imports: bool = True, - comment: Optional[str] = CODE_GEN_DEFAULT_COMMENT, - ): + add_comment: bool = False, + ) -> str: """ - Returns the respecting code to create this pipeline. - If we're running in a notebook environment and create_ipyhton_cell is set to True, - this will automatically create a new notebook cell containing the code instead of returning a string. + Returns the code to create this pipeline as string. :param pipeline_variable_name: The variable name of the generated pipeline. Default value is 'pipeline'. - :param create_notebook_cell: If set to True and we're running in a notebook environment, to_code() will automatically create a new notebook cell containing the code. - Otherwise to_code() will return the code as string. - Default value is True. :param generate_imports: Whether to include the required import statements into the code. Default value is True. - :param comment: Preceding comment to add. - Defaults to "This code has been generated by Pipeline.to_code()" + :param add_comment: Whether to add a preceding comment that this code has been generated. + Default value is False. 
""" pipeline_config = self.get_config() code = self._generate_code( pipeline_config=pipeline_config, pipeline_variable_name=pipeline_variable_name, generate_imports=generate_imports, - comment=comment, + comment=CODE_GEN_DEFAULT_COMMENT if add_comment else None, ) - if create_notebook_cell: - try: - get_ipython().set_next_input(code) # type: ignore - except NameError: - return code - else: - return code + return code + + def to_notebook_cell( + self, + pipeline_variable_name: str = "pipeline", + generate_imports: bool = True, + add_comment: bool = True, + ): + """ + Creates a new notebook cell with the code to create this pipeline. + + :param pipeline_variable_name: The variable name of the generated pipeline. + Default value is 'pipeline'. + :param generate_imports: Whether to include the required import statements into the code. + Default value is True. + :param add_comment: Whether to add a preceding comment that this code has been generated. + Default value is True. + """ + code = self.to_code( + pipeline_variable_name=pipeline_variable_name, generate_imports=generate_imports, add_comment=add_comment + ) + try: + get_ipython().set_next_input(code) # type: ignore + except NameError: + logger.error("Could not create notebook cell. Make sure you're running in a notebook environment.") @classmethod def load_from_config( From 97634232cae819cc61cadf1020be7eeadc1662ff Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 21 Feb 2022 19:24:28 +0000 Subject: [PATCH 13/22] Update Documentation & Code Style --- docs/_src/api/api/pipelines.md | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md index 425a2b6a41..48bbb7eda8 100644 --- a/docs/_src/api/api/pipelines.md +++ b/docs/_src/api/api/pipelines.md @@ -42,25 +42,38 @@ Returns a configuration for the Pipeline that can be used with `BasePipeline.loa #### to\_code ```python -def to_code(pipeline_variable_name: str = "pipeline", create_notebook_cell: bool = True, generate_imports: bool = True, comment: Optional[str] = CODE_GEN_DEFAULT_COMMENT) +def to_code(pipeline_variable_name: str = "pipeline", generate_imports: bool = True, add_comment: bool = False) -> str ``` -Returns the respecting code to create this pipeline. - -If we're running in a notebook environment and create_ipyhton_cell is set to True, -this will automatically create a new notebook cell containing the code instead of returning a string. +Returns the code to create this pipeline as string. **Arguments**: - `pipeline_variable_name`: The variable name of the generated pipeline. Default value is 'pipeline'. -- `create_notebook_cell`: If set to True and we're running in a notebook environment, to_code() will automatically create a new notebook cell containing the code. -Otherwise to_code() will return the code as string. +- `generate_imports`: Whether to include the required import statements into the code. Default value is True. +- `add_comment`: Whether to add a preceding comment that this code has been generated. +Default value is False. + + + +#### to\_notebook\_cell + +```python +def to_notebook_cell(pipeline_variable_name: str = "pipeline", generate_imports: bool = True, add_comment: bool = True) +``` + +Creates a new notebook cell with the code to create this pipeline. + +**Arguments**: + +- `pipeline_variable_name`: The variable name of the generated pipeline. +Default value is 'pipeline'. 
- `generate_imports`: Whether to include the required import statements into the code. Default value is True. -- `comment`: Preceding comment to add. -Defaults to "This code has been generated by Pipeline.to_code()" +- `add_comment`: Whether to add a preceding comment that this code has been generated. +Default value is True. From 975119429ddeb6f122dd7b930f00256a6940fda0 Mon Sep 17 00:00:00 2001 From: Thomas Stadelmann Date: Tue, 22 Feb 2022 10:20:24 +0100 Subject: [PATCH 14/22] introduce helper classes for code gen and eval report gen --- haystack/pipelines/base.py | 494 +++++++++++++++++++------------------ 1 file changed, 258 insertions(+), 236 deletions(-) diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index 6c572b0416..84c58ffa27 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -90,9 +90,8 @@ def to_code( :param add_comment: Whether to add a preceding comment that this code has been generated. Default value is False. """ - pipeline_config = self.get_config() - code = self._generate_code( - pipeline_config=pipeline_config, + code = _PipelineCodeGen.generate_code( + pipeline=self, pipeline_variable_name=pipeline_variable_name, generate_imports=generate_imports, comment=CODE_GEN_DEFAULT_COMMENT if add_comment else None, @@ -389,115 +388,6 @@ def save_to_deepset_cloud( client.save_pipeline_config(config=config, pipeline_config_name=pipeline_config_name) logger.info(f"Pipeline config '{pipeline_config_name}' successfully created.") - @classmethod - def _camel_to_snake_case(cls, input: str) -> str: - return CAMEL_CASE_TO_SNAKE_CASE_REGEX.sub("_", input).lower() - - @classmethod - def _generate_code( - cls, - pipeline_config: dict, - pipeline_variable_name: str = "pipeline", - generate_imports: bool = True, - comment: Optional[str] = None, - ) -> str: - component_definitions = cls._get_component_definitions( - pipeline_config=pipeline_config, overwrite_with_env_variables=False - ) - component_variable_names = {name: cls._camel_to_snake_case(name) for name in component_definitions.keys()} - pipeline_definition = cls._get_pipeline_definition(pipeline_config=pipeline_config) - - code_parts = [] - if generate_imports: - types_to_import = [component["type"] for component in component_definitions.values()] - imports_code = cls._generate_imports_code(types_to_import=types_to_import) - code_parts.append(imports_code) - - components_code = cls._generate_components_code( - component_definitions=component_definitions, component_variable_names=component_variable_names - ) - pipeline_code = cls._generate_pipeline_code( - pipeline_definition=pipeline_definition, - component_variable_names=component_variable_names, - pipeline_variable_name=pipeline_variable_name, - ) - - code_parts.append(components_code) - code_parts.append(pipeline_code) - code = "\n\n".join(code_parts) - - if comment: - comment = re.sub(r"^(#\s)?", "# ", comment, flags=re.MULTILINE) - code = "\n".join([comment, code]) - - return code - - @classmethod - def _generate_pipeline_code( - cls, pipeline_definition: dict, component_variable_names: Dict[str, str], pipeline_variable_name: str - ) -> str: - code_lines = [f"{pipeline_variable_name} = Pipeline()"] - for node in pipeline_definition["nodes"]: - node_name = node["name"] - component_variable_name = component_variable_names[node_name] - inputs = ", ".join(f'"{name}"' for name in node["inputs"]) - code_lines.append( - f'{pipeline_variable_name}.add_node(component={component_variable_name}, name="{node_name}", inputs=[{inputs}])' - ) - - 
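This patch moves the generator helpers onto `_PipelineCodeGen` without changing their behaviour. The dependency-aware ordering they rely on is the least obvious piece, so here is a standalone sketch of the same depth-first idea with a worked example (the component names are illustrative):

```python
from typing import Dict, List, Optional

def order_components(
    dependency_map: Dict[str, List[str]], components_to_order: Optional[List[str]] = None
) -> List[str]:
    # Depth-first walk: a component's dependencies are emitted before the
    # component itself, and nothing is emitted twice.
    ordered: List[str] = []
    if components_to_order is None:
        components_to_order = list(dependency_map.keys())
    for component in components_to_order:
        for dependency in order_components(dependency_map, dependency_map[component]):
            if dependency not in ordered:
                ordered.append(dependency)
        if component not in ordered:
            ordered.append(component)
    return ordered

# The store has no dependencies and the retriever depends on the store, so the
# generated declarations come out dependency-first:
assert order_components({"DocumentStore": [], "Retriever": ["DocumentStore"]}) == [
    "DocumentStore",
    "Retriever",
]
```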
code = "\n".join(code_lines) - return code - - @classmethod - def _generate_components_code(cls, component_definitions: dict, component_variable_names: Dict[str, str]) -> str: - code = "" - declarations = {} - dependency_map = {} - for name, definition in component_definitions.items(): - variable_name = component_variable_names[name] - class_name = definition["type"] - param_value_dict = { - key: component_variable_names.get(value, f'"{value}"') if type(value) == str else value - for key, value in definition["params"].items() - } - init_args = ", ".join(f"{key}={value}" for key, value in param_value_dict.items()) - declarations[name] = f"{variable_name} = {class_name}({init_args})" - dependency_map[name] = [ - param_value for param_value in definition["params"].values() if param_value in component_variable_names - ] - - ordered_components = cls._order_components(dependency_map=dependency_map) - ordered_declarations = [declarations[component] for component in ordered_components] - code = "\n".join(ordered_declarations) - return code - - @classmethod - def _generate_imports_code(cls, types_to_import: List[str]) -> str: - code_lines = [] - importable_classes = { - name: mod - for mod in CODE_GEN_ALLOWED_IMPORTS - for name, obj in inspect.getmembers(sys.modules[mod]) - if inspect.isclass(obj) - } - - imports_by_module: Dict[str, List[str]] = {} - for t in types_to_import: - mod = importable_classes.get(t, MODULE_NOT_FOUND) - if mod in imports_by_module: - imports_by_module[mod].append(t) - else: - imports_by_module[mod] = [t] - - for mod in sorted(imports_by_module.keys()): - sorted_types = sorted(set(imports_by_module[mod])) - import_types = ", ".join(sorted_types) - line_prefix = "# " if mod == MODULE_NOT_FOUND else "" - code_lines.append(f"{line_prefix}from {mod} import {import_types}") - - code = "\n".join(code_lines) - return code - @classmethod def _get_pipeline_definition(cls, pipeline_config: Dict, pipeline_name: Optional[str] = None) -> dict: """ @@ -541,23 +431,6 @@ def _get_component_definitions(cls, pipeline_config: Dict, overwrite_with_env_va return component_definitions - @classmethod - def _order_components( - cls, dependency_map: Dict[str, List[str]], components_to_order: Optional[List[str]] = None - ) -> List[str]: - ordered_components = [] - if components_to_order is None: - components_to_order = list(dependency_map.keys()) - for component in components_to_order: - dependencies = dependency_map[component] - ordered_dependencies = cls._order_components( - dependency_map=dependency_map, components_to_order=dependencies - ) - ordered_components += [d for d in ordered_dependencies if d not in ordered_components] - if component not in ordered_components: - ordered_components.append(component) - return ordered_components - @classmethod def _overwrite_with_env_variables(cls, definition: dict): """ @@ -1221,85 +1094,6 @@ def get_config(self, return_defaults: bool = False) -> dict: config = {"components": list(components.values()), "pipelines": list(pipelines.values()), "version": "0.8"} return config - def _format_document_answer(self, document_or_answer: dict): - return "\n \t".join([f"{name}: {value}" for name, value in document_or_answer.items()]) - - def _format_wrong_sample(self, query: dict): - metrics = "\n \t".join([f"{name}: {value}" for name, value in query["metrics"].items()]) - documents = "\n\n \t".join([self._format_document_answer(doc) for doc in query.get("documents", [])]) - documents = f"Documents: \n \t{documents}\n" if len(documents) > 0 else "" - answers = "\n\n 
\t".join([self._format_document_answer(answer) for answer in query.get("answers", [])]) - answers = f"Answers: \n \t{answers}\n" if len(answers) > 0 else "" - gold_document_ids = "\n \t".join(query["gold_document_ids"]) - gold_answers = "\n \t".join(query.get("gold_answers", [])) - gold_answers = f"Gold Answers: \n \t{gold_answers}\n" if len(gold_answers) > 0 else "" - s = ( - f"Query: \n \t{query['query']}\n" - f"{gold_answers}" - f"Gold Document Ids: \n \t{gold_document_ids}\n" - f"Metrics: \n \t{metrics}\n" - f"{answers}" - f"{documents}" - f"_______________________________________________________" - ) - return s - - def _format_wrong_samples_node(self, node_name: str, wrong_samples_formatted: str): - s = ( - f" Wrong {node_name} Examples\n" - f"=======================================================\n" - f"{wrong_samples_formatted}\n" - f"=======================================================\n" - ) - return s - - def _format_wrong_samples_report(self, eval_result: EvaluationResult, n_wrong_examples: int = 3): - examples = { - node: eval_result.wrong_examples(node, doc_relevance_col="gold_id_or_answer_match", n=n_wrong_examples) - for node in eval_result.node_results.keys() - } - examples_formatted = { - node: "\n".join([self._format_wrong_sample(example) for example in examples]) - for node, examples in examples.items() - } - - return "\n".join( - [self._format_wrong_samples_node(node, examples) for node, examples in examples_formatted.items()] - ) - - def _format_pipeline_node(self, node: str, calculated_metrics: dict): - node_metrics: dict = {} - for metric_mode in calculated_metrics: - for metric, value in calculated_metrics[metric_mode].get(node, {}).items(): - node_metrics[f"{metric}{metric_mode}"] = value - - node_metrics_formatted = "\n".join( - sorted([f" | {metric}: {value:5.3}" for metric, value in node_metrics.items()]) - ) - node_metrics_formatted = f"{node_metrics_formatted}\n" if len(node_metrics_formatted) > 0 else "" - s = ( - f" {node}\n" - f" |\n" - f"{node_metrics_formatted}" - f" |" - ) - return s - - def _format_pipeline_overview(self, calculated_metrics: dict): - pipeline_overview = "\n".join( - [self._format_pipeline_node(node, calculated_metrics) for node in self.graph.nodes] - ) - s = ( - f"================== Evaluation Report ==================\n" - f"=======================================================\n" - f" Pipeline Overview\n" - f"=======================================================\n" - f"{pipeline_overview}\n" - f" Output\n" - f"=======================================================\n" - ) - return s - def print_eval_report( self, eval_result: EvaluationResult, @@ -1313,36 +1107,10 @@ def print_eval_report( :param n_wrong_examples: The number of worst queries to show. :param metrics_filter: The metrics to show per node. If None all metrics will be shown. 
""" - if any(degree > 1 for node, degree in self.graph.out_degree): - logger.warning("Pipelines with junctions are currently not supported.") - return - - calculated_metrics = { - "": eval_result.calculate_metrics(doc_relevance_col="gold_id_or_answer_match"), - "_top_1": eval_result.calculate_metrics( - doc_relevance_col="gold_id_or_answer_match", simulated_top_k_reader=1 - ), - " upper bound": eval_result.calculate_metrics( - doc_relevance_col="gold_id_or_answer_match", eval_mode="isolated" - ), - } - - if metrics_filter is not None: - for metric_mode in calculated_metrics: - calculated_metrics[metric_mode] = { - node: metrics - if node not in metrics_filter - else {metric: value for metric, value in metrics.items() if metric in metrics_filter[node]} - for node, metrics in calculated_metrics[metric_mode].items() - } - - pipeline_overview = self._format_pipeline_overview(calculated_metrics) - wrong_samples_report = self._format_wrong_samples_report( - eval_result=eval_result, n_wrong_examples=n_wrong_examples + _PipelineEvalReportGen.print_eval_report( + eval_result=eval_result, pipeline=self, n_wrong_examples=n_wrong_examples, metrics_filter=metrics_filter ) - print(f"{pipeline_overview}\n" f"{wrong_samples_report}") - class RayPipeline(Pipeline): """ @@ -1627,3 +1395,257 @@ def __call__(self, *args, **kwargs): Ray calls this method which is then re-directed to the corresponding component's run(). """ return self.node._dispatch_run(*args, **kwargs) + + +class _PipelineCodeGen: + @classmethod + def _camel_to_snake_case(cls, input: str) -> str: + return CAMEL_CASE_TO_SNAKE_CASE_REGEX.sub("_", input).lower() + + @classmethod + def generate_code( + cls, + pipeline: BasePipeline, + pipeline_variable_name: str = "pipeline", + generate_imports: bool = True, + comment: Optional[str] = None, + ) -> str: + pipeline_config = pipeline.get_config() + component_definitions = pipeline._get_component_definitions( + pipeline_config=pipeline_config, overwrite_with_env_variables=False + ) + component_variable_names = {name: cls._camel_to_snake_case(name) for name in component_definitions.keys()} + pipeline_definition = pipeline._get_pipeline_definition(pipeline_config=pipeline_config) + + code_parts = [] + if generate_imports: + types_to_import = [component["type"] for component in component_definitions.values()] + imports_code = cls._generate_imports_code(types_to_import=types_to_import) + code_parts.append(imports_code) + + components_code = cls._generate_components_code( + component_definitions=component_definitions, component_variable_names=component_variable_names + ) + pipeline_code = cls._generate_pipeline_code( + pipeline_definition=pipeline_definition, + component_variable_names=component_variable_names, + pipeline_variable_name=pipeline_variable_name, + ) + + code_parts.append(components_code) + code_parts.append(pipeline_code) + code = "\n\n".join(code_parts) + + if comment: + comment = re.sub(r"^(#\s)?", "# ", comment, flags=re.MULTILINE) + code = "\n".join([comment, code]) + + return code + + @classmethod + def _generate_pipeline_code( + cls, pipeline_definition: dict, component_variable_names: Dict[str, str], pipeline_variable_name: str + ) -> str: + code_lines = [f"{pipeline_variable_name} = Pipeline()"] + for node in pipeline_definition["nodes"]: + node_name = node["name"] + component_variable_name = component_variable_names[node_name] + inputs = ", ".join(f'"{name}"' for name in node["inputs"]) + code_lines.append( + f'{pipeline_variable_name}.add_node(component={component_variable_name}, 
name="{node_name}", inputs=[{inputs}])' + ) + + code = "\n".join(code_lines) + return code + + @classmethod + def _generate_components_code(cls, component_definitions: dict, component_variable_names: Dict[str, str]) -> str: + code = "" + declarations = {} + dependency_map = {} + for name, definition in component_definitions.items(): + variable_name = component_variable_names[name] + class_name = definition["type"] + param_value_dict = { + key: component_variable_names.get(value, f'"{value}"') if type(value) == str else value + for key, value in definition["params"].items() + } + init_args = ", ".join(f"{key}={value}" for key, value in param_value_dict.items()) + declarations[name] = f"{variable_name} = {class_name}({init_args})" + dependency_map[name] = [ + param_value for param_value in definition["params"].values() if param_value in component_variable_names + ] + + ordered_components = cls._order_components(dependency_map=dependency_map) + ordered_declarations = [declarations[component] for component in ordered_components] + code = "\n".join(ordered_declarations) + return code + + @classmethod + def _generate_imports_code(cls, types_to_import: List[str]) -> str: + code_lines = [] + importable_classes = { + name: mod + for mod in CODE_GEN_ALLOWED_IMPORTS + for name, obj in inspect.getmembers(sys.modules[mod]) + if inspect.isclass(obj) + } + + imports_by_module: Dict[str, List[str]] = {} + for t in types_to_import: + mod = importable_classes.get(t, MODULE_NOT_FOUND) + if mod in imports_by_module: + imports_by_module[mod].append(t) + else: + imports_by_module[mod] = [t] + + for mod in sorted(imports_by_module.keys()): + sorted_types = sorted(set(imports_by_module[mod])) + import_types = ", ".join(sorted_types) + line_prefix = "# " if mod == MODULE_NOT_FOUND else "" + code_lines.append(f"{line_prefix}from {mod} import {import_types}") + + code = "\n".join(code_lines) + return code + + @classmethod + def _order_components( + cls, dependency_map: Dict[str, List[str]], components_to_order: Optional[List[str]] = None + ) -> List[str]: + ordered_components = [] + if components_to_order is None: + components_to_order = list(dependency_map.keys()) + for component in components_to_order: + dependencies = dependency_map[component] + ordered_dependencies = cls._order_components( + dependency_map=dependency_map, components_to_order=dependencies + ) + ordered_components += [d for d in ordered_dependencies if d not in ordered_components] + if component not in ordered_components: + ordered_components.append(component) + return ordered_components + + +class _PipelineEvalReportGen: + @classmethod + def print_eval_report( + cls, + eval_result: EvaluationResult, + pipeline: Pipeline, + n_wrong_examples: int = 3, + metrics_filter: Optional[Dict[str, List[str]]] = None, + ): + if any(degree > 1 for node, degree in pipeline.graph.out_degree): + logger.warning("Pipelines with junctions are currently not supported.") + return + + calculated_metrics = { + "": eval_result.calculate_metrics(doc_relevance_col="gold_id_or_answer_match"), + "_top_1": eval_result.calculate_metrics( + doc_relevance_col="gold_id_or_answer_match", simulated_top_k_reader=1 + ), + " upper bound": eval_result.calculate_metrics( + doc_relevance_col="gold_id_or_answer_match", eval_mode="isolated" + ), + } + + if metrics_filter is not None: + for metric_mode in calculated_metrics: + calculated_metrics[metric_mode] = { + node: metrics + if node not in metrics_filter + else {metric: value for metric, value in metrics.items() if metric in 
metrics_filter[node]} + for node, metrics in calculated_metrics[metric_mode].items() + } + + pipeline_overview = cls._format_pipeline_overview(calculated_metrics=calculated_metrics, pipeline=pipeline) + wrong_samples_report = cls._format_wrong_samples_report( + eval_result=eval_result, n_wrong_examples=n_wrong_examples + ) + + print(f"{pipeline_overview}\n" f"{wrong_samples_report}") + + @classmethod + def _format_document_answer(cls, document_or_answer: dict): + return "\n \t".join([f"{name}: {value}" for name, value in document_or_answer.items()]) + + @classmethod + def _format_wrong_sample(cls, query: dict): + metrics = "\n \t".join([f"{name}: {value}" for name, value in query["metrics"].items()]) + documents = "\n\n \t".join([cls._format_document_answer(doc) for doc in query.get("documents", [])]) + documents = f"Documents: \n \t{documents}\n" if len(documents) > 0 else "" + answers = "\n\n \t".join([cls._format_document_answer(answer) for answer in query.get("answers", [])]) + answers = f"Answers: \n \t{answers}\n" if len(answers) > 0 else "" + gold_document_ids = "\n \t".join(query["gold_document_ids"]) + gold_answers = "\n \t".join(query.get("gold_answers", [])) + gold_answers = f"Gold Answers: \n \t{gold_answers}\n" if len(gold_answers) > 0 else "" + s = ( + f"Query: \n \t{query['query']}\n" + f"{gold_answers}" + f"Gold Document Ids: \n \t{gold_document_ids}\n" + f"Metrics: \n \t{metrics}\n" + f"{answers}" + f"{documents}" + f"_______________________________________________________" + ) + return s + + @classmethod + def _format_wrong_samples_node(cls, node_name: str, wrong_samples_formatted: str): + s = ( + f" Wrong {node_name} Examples\n" + f"=======================================================\n" + f"{wrong_samples_formatted}\n" + f"=======================================================\n" + ) + return s + + @classmethod + def _format_wrong_samples_report(cls, eval_result: EvaluationResult, n_wrong_examples: int = 3): + examples = { + node: eval_result.wrong_examples(node, doc_relevance_col="gold_id_or_answer_match", n=n_wrong_examples) + for node in eval_result.node_results.keys() + } + examples_formatted = { + node: "\n".join([cls._format_wrong_sample(example) for example in examples]) + for node, examples in examples.items() + } + + return "\n".join( + [cls._format_wrong_samples_node(node, examples) for node, examples in examples_formatted.items()] + ) + + @classmethod + def _format_pipeline_node(cls, node: str, calculated_metrics: dict): + node_metrics: dict = {} + for metric_mode in calculated_metrics: + for metric, value in calculated_metrics[metric_mode].get(node, {}).items(): + node_metrics[f"{metric}{metric_mode}"] = value + + node_metrics_formatted = "\n".join( + sorted([f" | {metric}: {value:5.3}" for metric, value in node_metrics.items()]) + ) + node_metrics_formatted = f"{node_metrics_formatted}\n" if len(node_metrics_formatted) > 0 else "" + s = ( + f" {node}\n" + f" |\n" + f"{node_metrics_formatted}" + f" |" + ) + return s + + @classmethod + def _format_pipeline_overview(cls, calculated_metrics: dict, pipeline: Pipeline): + pipeline_overview = "\n".join( + [cls._format_pipeline_node(node, calculated_metrics) for node in pipeline.graph.nodes] + ) + s = ( + f"================== Evaluation Report ==================\n" + f"=======================================================\n" + f" Pipeline Overview\n" + f"=======================================================\n" + f"{pipeline_overview}\n" + f" Output\n" + 
f"=======================================================\n" + ) + return s From f20b402a50bffc712ad4ab023d9ec5ca39f93ad9 Mon Sep 17 00:00:00 2001 From: Thomas Stadelmann Date: Tue, 22 Feb 2022 12:06:35 +0100 Subject: [PATCH 15/22] add more tests --- test/test_pipeline.py | 116 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 115 insertions(+), 1 deletion(-) diff --git a/test/test_pipeline.py b/test/test_pipeline.py index af9d80438b..b8ecd020a1 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -8,13 +8,16 @@ from haystack.document_stores.deepsetcloud import DeepsetCloudDocumentStore from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore +from haystack.document_stores.memory import InMemoryDocumentStore +from haystack.nodes.other.join_docs import JoinDocuments from haystack.nodes.retriever.sparse import ElasticsearchRetriever from haystack.pipelines import ( Pipeline, DocumentSearchPipeline, RootNode, + ExtractiveQAPipeline ) -from haystack.pipelines import ExtractiveQAPipeline +from haystack.pipelines.base import _PipelineCodeGen from haystack.nodes import DensePassageRetriever, EmbeddingRetriever from conftest import MOCK_DC, DC_API_ENDPOINT, DC_API_KEY, DC_TEST_INDEX, SAMPLES_PATH, deepset_cloud_fixture @@ -180,6 +183,117 @@ def test_to_code(): assert index_pipeline.get_config() == locals()["index_pipeline_from_code"].get_config() +@pytest.mark.elasticsearch +def test_PipelineCodeGen_simple_dense_pipeline(): + doc_store = InMemoryDocumentStore(index="my-index") + retriever = EmbeddingRetriever(document_store=doc_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2") + pipeline = Pipeline() + pipeline.add_node(component=retriever, name="retri", inputs=["Query"]) + + code = _PipelineCodeGen.generate_code(pipeline=pipeline, pipeline_variable_name="p", generate_imports=False) + assert code == ("in_memory_document_store = InMemoryDocumentStore(index=\"my-index\")\n" + "retri = EmbeddingRetriever(document_store=in_memory_document_store, embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\")\n" + "\n" + "p = Pipeline()\n" + "p.add_node(component=retri, name=\"retri\", inputs=[\"Query\"])") + + +@pytest.mark.elasticsearch +def test_PipelineCodeGen_simple_sparse_pipeline(): + doc_store = ElasticsearchDocumentStore(index="my-index") + retriever = ElasticsearchRetriever(document_store=doc_store, top_k=20) + pipeline = Pipeline() + pipeline.add_node(component=retriever, name="retri", inputs=["Query"]) + + code = _PipelineCodeGen.generate_code(pipeline=pipeline, pipeline_variable_name="p", generate_imports=False) + assert code == ("elasticsearch_document_store = ElasticsearchDocumentStore(index=\"my-index\")\n" + "retri = ElasticsearchRetriever(document_store=elasticsearch_document_store, top_k=20)\n" + "\n" + "p = Pipeline()\n" + "p.add_node(component=retri, name=\"retri\", inputs=[\"Query\"])") + + +@pytest.mark.elasticsearch +def test_PipelineCodeGen_dual_retriever_pipeline(): + es_doc_store = ElasticsearchDocumentStore(index="my-index") + es_retriever = ElasticsearchRetriever(document_store=es_doc_store, top_k=20) + dense_doc_store = InMemoryDocumentStore(index="my-index") + emb_retriever = EmbeddingRetriever(document_store=dense_doc_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2") + p_ensemble = Pipeline() + p_ensemble.add_node(component=es_retriever, name="EsRetriever", inputs=["Query"]) + p_ensemble.add_node(component=emb_retriever, name="EmbeddingRetriever", inputs=["Query"]) + p_ensemble.add_node( + 
component=JoinDocuments(join_mode="merge"), name="JoinResults", inputs=["EsRetriever", "EmbeddingRetriever"] + ) + + code = _PipelineCodeGen.generate_code(pipeline=p_ensemble, pipeline_variable_name="p", generate_imports=False) + assert code == ("elasticsearch_document_store = ElasticsearchDocumentStore(index=\"my-index\")\n" + "es_retriever = ElasticsearchRetriever(document_store=elasticsearch_document_store, top_k=20)\n" + "in_memory_document_store = InMemoryDocumentStore(index=\"my-index\")\n" + "embedding_retriever = EmbeddingRetriever(document_store=in_memory_document_store, embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\")\n" + "join_results = JoinDocuments(join_mode=\"merge\")\n" + "\n" + "p = Pipeline()\n" + "p.add_node(component=es_retriever, name=\"EsRetriever\", inputs=[\"Query\"])\n" + "p.add_node(component=embedding_retriever, name=\"EmbeddingRetriever\", inputs=[\"Query\"])\n" + "p.add_node(component=join_results, name=\"JoinResults\", inputs=[\"EsRetriever\", \"EmbeddingRetriever\"])") + + +@pytest.mark.elasticsearch +def test_PipelineCodeGen_dual_retriever_pipeline_same_docstore(): + es_doc_store = ElasticsearchDocumentStore(index="my-index") + es_retriever = ElasticsearchRetriever(document_store=es_doc_store, top_k=20) + emb_retriever = EmbeddingRetriever(document_store=es_retriever, embedding_model="sentence-transformers/all-MiniLM-L6-v2") + p_ensemble = Pipeline() + p_ensemble.add_node(component=es_retriever, name="EsRetriever", inputs=["Query"]) + p_ensemble.add_node(component=emb_retriever, name="EmbeddingRetriever", inputs=["Query"]) + p_ensemble.add_node( + component=JoinDocuments(join_mode="merge"), name="JoinResults", inputs=["EsRetriever", "EmbeddingRetriever"] + ) + + code = _PipelineCodeGen.generate_code(pipeline=p_ensemble, pipeline_variable_name="p", generate_imports=False) + assert code == ("elasticsearch_document_store = ElasticsearchDocumentStore(index=\"my-index\")\n" + "es_retriever = ElasticsearchRetriever(document_store=elasticsearch_document_store, top_k=20)\n" + "embedding_retriever = EmbeddingRetriever(document_store=elasticsearch_document_store, embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\")\n" + "join_results = JoinDocuments(join_mode=\"merge\")\n" + "\n" + "p = Pipeline()\n" + "p.add_node(component=es_retriever, name=\"EsRetriever\", inputs=[\"Query\"])\n" + "p.add_node(component=embedding_retriever, name=\"EmbeddingRetriever\", inputs=[\"Query\"])\n" + "p.add_node(component=join_results, name=\"JoinResults\", inputs=[\"EsRetriever\", \"EmbeddingRetriever\"])") + + +@pytest.mark.elasticsearch +def test_PipelineCodeGen_imports(): + doc_store = ElasticsearchDocumentStore(index="my-index") + retriever = ElasticsearchRetriever(document_store=doc_store, top_k=20) + pipeline = Pipeline() + pipeline.add_node(component=retriever, name="retri", inputs=["Query"]) + + code = _PipelineCodeGen.generate_code(pipeline=pipeline, pipeline_variable_name="p", generate_imports=True) + assert code == ("from haystack.document_stores import ElasticsearchDocumentStore\n" + "from haystack.nodes import ElasticsearchRetriever\n" + "\n" + "elasticsearch_document_store = ElasticsearchDocumentStore(index=\"my-index\")\n" + "retri = ElasticsearchRetriever(document_store=elasticsearch_document_store, top_k=20)\n" + "\n" + "p = Pipeline()\n" + "p.add_node(component=retri, name=\"retri\", inputs=[\"Query\"])") + + +def test_PipelineCodeGen_order_components(): + dependency_map = { + "a": ["aa", "ab"], + "aa": [], + "ab": ["aba"], + "aba": [], + "b": ["a", 
"c"], + "c": ["a"] + } + ordered = _PipelineCodeGen._order_components(dependency_map=dependency_map) + assert ordered == ["aa", "aba", "ab", "a", "c", "b"] + + @pytest.mark.usefixtures(deepset_cloud_fixture.__name__) @responses.activate def test_load_from_deepset_cloud_query(): From 4f63666957f21514ac2f2bf8e9dc18b43bd8739a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 22 Feb 2022 11:09:04 +0000 Subject: [PATCH 16/22] Update Documentation & Code Style --- test/test_pipeline.py | 108 +++++++++++++++++++++--------------------- 1 file changed, 55 insertions(+), 53 deletions(-) diff --git a/test/test_pipeline.py b/test/test_pipeline.py index b8ecd020a1..07c103410f 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -11,12 +11,7 @@ from haystack.document_stores.memory import InMemoryDocumentStore from haystack.nodes.other.join_docs import JoinDocuments from haystack.nodes.retriever.sparse import ElasticsearchRetriever -from haystack.pipelines import ( - Pipeline, - DocumentSearchPipeline, - RootNode, - ExtractiveQAPipeline -) +from haystack.pipelines import Pipeline, DocumentSearchPipeline, RootNode, ExtractiveQAPipeline from haystack.pipelines.base import _PipelineCodeGen from haystack.nodes import DensePassageRetriever, EmbeddingRetriever @@ -191,11 +186,13 @@ def test_PipelineCodeGen_simple_dense_pipeline(): pipeline.add_node(component=retriever, name="retri", inputs=["Query"]) code = _PipelineCodeGen.generate_code(pipeline=pipeline, pipeline_variable_name="p", generate_imports=False) - assert code == ("in_memory_document_store = InMemoryDocumentStore(index=\"my-index\")\n" - "retri = EmbeddingRetriever(document_store=in_memory_document_store, embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\")\n" - "\n" - "p = Pipeline()\n" - "p.add_node(component=retri, name=\"retri\", inputs=[\"Query\"])") + assert code == ( + 'in_memory_document_store = InMemoryDocumentStore(index="my-index")\n' + 'retri = EmbeddingRetriever(document_store=in_memory_document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2")\n' + "\n" + "p = Pipeline()\n" + 'p.add_node(component=retri, name="retri", inputs=["Query"])' + ) @pytest.mark.elasticsearch @@ -206,11 +203,13 @@ def test_PipelineCodeGen_simple_sparse_pipeline(): pipeline.add_node(component=retriever, name="retri", inputs=["Query"]) code = _PipelineCodeGen.generate_code(pipeline=pipeline, pipeline_variable_name="p", generate_imports=False) - assert code == ("elasticsearch_document_store = ElasticsearchDocumentStore(index=\"my-index\")\n" - "retri = ElasticsearchRetriever(document_store=elasticsearch_document_store, top_k=20)\n" - "\n" - "p = Pipeline()\n" - "p.add_node(component=retri, name=\"retri\", inputs=[\"Query\"])") + assert code == ( + 'elasticsearch_document_store = ElasticsearchDocumentStore(index="my-index")\n' + "retri = ElasticsearchRetriever(document_store=elasticsearch_document_store, top_k=20)\n" + "\n" + "p = Pipeline()\n" + 'p.add_node(component=retri, name="retri", inputs=["Query"])' + ) @pytest.mark.elasticsearch @@ -218,7 +217,9 @@ def test_PipelineCodeGen_dual_retriever_pipeline(): es_doc_store = ElasticsearchDocumentStore(index="my-index") es_retriever = ElasticsearchRetriever(document_store=es_doc_store, top_k=20) dense_doc_store = InMemoryDocumentStore(index="my-index") - emb_retriever = EmbeddingRetriever(document_store=dense_doc_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2") + emb_retriever = EmbeddingRetriever( 
+ document_store=dense_doc_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2" + ) p_ensemble = Pipeline() p_ensemble.add_node(component=es_retriever, name="EsRetriever", inputs=["Query"]) p_ensemble.add_node(component=emb_retriever, name="EmbeddingRetriever", inputs=["Query"]) @@ -227,23 +228,27 @@ def test_PipelineCodeGen_dual_retriever_pipeline(): ) code = _PipelineCodeGen.generate_code(pipeline=p_ensemble, pipeline_variable_name="p", generate_imports=False) - assert code == ("elasticsearch_document_store = ElasticsearchDocumentStore(index=\"my-index\")\n" - "es_retriever = ElasticsearchRetriever(document_store=elasticsearch_document_store, top_k=20)\n" - "in_memory_document_store = InMemoryDocumentStore(index=\"my-index\")\n" - "embedding_retriever = EmbeddingRetriever(document_store=in_memory_document_store, embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\")\n" - "join_results = JoinDocuments(join_mode=\"merge\")\n" - "\n" - "p = Pipeline()\n" - "p.add_node(component=es_retriever, name=\"EsRetriever\", inputs=[\"Query\"])\n" - "p.add_node(component=embedding_retriever, name=\"EmbeddingRetriever\", inputs=[\"Query\"])\n" - "p.add_node(component=join_results, name=\"JoinResults\", inputs=[\"EsRetriever\", \"EmbeddingRetriever\"])") + assert code == ( + 'elasticsearch_document_store = ElasticsearchDocumentStore(index="my-index")\n' + "es_retriever = ElasticsearchRetriever(document_store=elasticsearch_document_store, top_k=20)\n" + 'in_memory_document_store = InMemoryDocumentStore(index="my-index")\n' + 'embedding_retriever = EmbeddingRetriever(document_store=in_memory_document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2")\n' + 'join_results = JoinDocuments(join_mode="merge")\n' + "\n" + "p = Pipeline()\n" + 'p.add_node(component=es_retriever, name="EsRetriever", inputs=["Query"])\n' + 'p.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])\n' + 'p.add_node(component=join_results, name="JoinResults", inputs=["EsRetriever", "EmbeddingRetriever"])' + ) @pytest.mark.elasticsearch def test_PipelineCodeGen_dual_retriever_pipeline_same_docstore(): es_doc_store = ElasticsearchDocumentStore(index="my-index") es_retriever = ElasticsearchRetriever(document_store=es_doc_store, top_k=20) - emb_retriever = EmbeddingRetriever(document_store=es_retriever, embedding_model="sentence-transformers/all-MiniLM-L6-v2") + emb_retriever = EmbeddingRetriever( + document_store=es_retriever, embedding_model="sentence-transformers/all-MiniLM-L6-v2" + ) p_ensemble = Pipeline() p_ensemble.add_node(component=es_retriever, name="EsRetriever", inputs=["Query"]) p_ensemble.add_node(component=emb_retriever, name="EmbeddingRetriever", inputs=["Query"]) @@ -252,15 +257,17 @@ def test_PipelineCodeGen_dual_retriever_pipeline_same_docstore(): ) code = _PipelineCodeGen.generate_code(pipeline=p_ensemble, pipeline_variable_name="p", generate_imports=False) - assert code == ("elasticsearch_document_store = ElasticsearchDocumentStore(index=\"my-index\")\n" - "es_retriever = ElasticsearchRetriever(document_store=elasticsearch_document_store, top_k=20)\n" - "embedding_retriever = EmbeddingRetriever(document_store=elasticsearch_document_store, embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\")\n" - "join_results = JoinDocuments(join_mode=\"merge\")\n" - "\n" - "p = Pipeline()\n" - "p.add_node(component=es_retriever, name=\"EsRetriever\", inputs=[\"Query\"])\n" - "p.add_node(component=embedding_retriever, name=\"EmbeddingRetriever\", inputs=[\"Query\"])\n" 
- "p.add_node(component=join_results, name=\"JoinResults\", inputs=[\"EsRetriever\", \"EmbeddingRetriever\"])") + assert code == ( + 'elasticsearch_document_store = ElasticsearchDocumentStore(index="my-index")\n' + "es_retriever = ElasticsearchRetriever(document_store=elasticsearch_document_store, top_k=20)\n" + 'embedding_retriever = EmbeddingRetriever(document_store=elasticsearch_document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2")\n' + 'join_results = JoinDocuments(join_mode="merge")\n' + "\n" + "p = Pipeline()\n" + 'p.add_node(component=es_retriever, name="EsRetriever", inputs=["Query"])\n' + 'p.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])\n' + 'p.add_node(component=join_results, name="JoinResults", inputs=["EsRetriever", "EmbeddingRetriever"])' + ) @pytest.mark.elasticsearch @@ -271,25 +278,20 @@ def test_PipelineCodeGen_imports(): pipeline.add_node(component=retriever, name="retri", inputs=["Query"]) code = _PipelineCodeGen.generate_code(pipeline=pipeline, pipeline_variable_name="p", generate_imports=True) - assert code == ("from haystack.document_stores import ElasticsearchDocumentStore\n" - "from haystack.nodes import ElasticsearchRetriever\n" - "\n" - "elasticsearch_document_store = ElasticsearchDocumentStore(index=\"my-index\")\n" - "retri = ElasticsearchRetriever(document_store=elasticsearch_document_store, top_k=20)\n" - "\n" - "p = Pipeline()\n" - "p.add_node(component=retri, name=\"retri\", inputs=[\"Query\"])") + assert code == ( + "from haystack.document_stores import ElasticsearchDocumentStore\n" + "from haystack.nodes import ElasticsearchRetriever\n" + "\n" + 'elasticsearch_document_store = ElasticsearchDocumentStore(index="my-index")\n' + "retri = ElasticsearchRetriever(document_store=elasticsearch_document_store, top_k=20)\n" + "\n" + "p = Pipeline()\n" + 'p.add_node(component=retri, name="retri", inputs=["Query"])' + ) def test_PipelineCodeGen_order_components(): - dependency_map = { - "a": ["aa", "ab"], - "aa": [], - "ab": ["aba"], - "aba": [], - "b": ["a", "c"], - "c": ["a"] - } + dependency_map = {"a": ["aa", "ab"], "aa": [], "ab": ["aba"], "aba": [], "b": ["a", "c"], "c": ["a"]} ordered = _PipelineCodeGen._order_components(dependency_map=dependency_map) assert ordered == ["aa", "aba", "ab", "a", "c", "b"] From dec2f96a44ba4451f7ad1d044dbebd94fa0ba5dc Mon Sep 17 00:00:00 2001 From: Thomas Stadelmann Date: Tue, 22 Feb 2022 18:42:55 +0100 Subject: [PATCH 17/22] fix Dict typings --- haystack/pipelines/base.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index 1793aa3476..bc3e37d4c5 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -401,7 +401,7 @@ def save_to_deepset_cloud( logger.info(f"Pipeline config '{pipeline_config_name}' successfully created.") @classmethod - def _get_pipeline_definition(cls, pipeline_config: Dict, pipeline_name: Optional[str] = None) -> dict: + def _get_pipeline_definition(cls, pipeline_config: Dict[str, Any], pipeline_name: Optional[str] = None) -> Dict[str, Any]: """ Get the definition of Pipeline from a given pipeline config. If the config contains more than one Pipeline, then the pipeline_name must be supplied. 
@@ -423,7 +423,7 @@ def _get_pipeline_definition(cls, pipeline_config: Dict, pipeline_name: Optional return pipeline_definition @classmethod - def _get_component_definitions(cls, pipeline_config: Dict, overwrite_with_env_variables: bool) -> dict: + def _get_component_definitions(cls, pipeline_config: Dict[str, Any], overwrite_with_env_variables: bool) -> Dict[str, Any]: """ Returns the definitions of all components from a given pipeline config. @@ -444,7 +444,7 @@ def _get_component_definitions(cls, pipeline_config: Dict, overwrite_with_env_va return component_definitions @classmethod - def _overwrite_with_env_variables(cls, definition: dict): + def _overwrite_with_env_variables(cls, definition: Dict[str, Any]): """ Overwrite the pipeline config with environment variables. For example, to change index name param for an ElasticsearchDocumentStore, an env variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an @@ -1484,7 +1484,7 @@ def generate_code( @classmethod def _generate_pipeline_code( - cls, pipeline_definition: dict, component_variable_names: Dict[str, str], pipeline_variable_name: str + cls, pipeline_definition: Dict[str, Any], component_variable_names: Dict[str, str], pipeline_variable_name: str ) -> str: code_lines = [f"{pipeline_variable_name} = Pipeline()"] for node in pipeline_definition["nodes"]: @@ -1499,7 +1499,7 @@ def _generate_pipeline_code( return code @classmethod - def _generate_components_code(cls, component_definitions: dict, component_variable_names: Dict[str, str]) -> str: + def _generate_components_code(cls, component_definitions: Dict[str, Any], component_variable_names: Dict[str, str]) -> str: code = "" declarations = {} dependency_map = {} From 14c11fb8cf9a9e4d5a5e16fd91c0310bc04edf6d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 22 Feb 2022 18:03:47 +0000 Subject: [PATCH 18/22] Update Documentation & Code Style --- haystack/pipelines/base.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index bc3e37d4c5..8ed8385f6d 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -401,7 +401,9 @@ def save_to_deepset_cloud( logger.info(f"Pipeline config '{pipeline_config_name}' successfully created.") @classmethod - def _get_pipeline_definition(cls, pipeline_config: Dict[str, Any], pipeline_name: Optional[str] = None) -> Dict[str, Any]: + def _get_pipeline_definition( + cls, pipeline_config: Dict[str, Any], pipeline_name: Optional[str] = None + ) -> Dict[str, Any]: """ Get the definition of Pipeline from a given pipeline config. If the config contains more than one Pipeline, then the pipeline_name must be supplied. @@ -423,7 +425,9 @@ def _get_pipeline_definition(cls, pipeline_config: Dict[str, Any], pipeline_name return pipeline_definition @classmethod - def _get_component_definitions(cls, pipeline_config: Dict[str, Any], overwrite_with_env_variables: bool) -> Dict[str, Any]: + def _get_component_definitions( + cls, pipeline_config: Dict[str, Any], overwrite_with_env_variables: bool + ) -> Dict[str, Any]: """ Returns the definitions of all components from a given pipeline config. 
@@ -1499,7 +1503,9 @@ def _generate_pipeline_code( return code @classmethod - def _generate_components_code(cls, component_definitions: Dict[str, Any], component_variable_names: Dict[str, str]) -> str: + def _generate_components_code( + cls, component_definitions: Dict[str, Any], component_variable_names: Dict[str, str] + ) -> str: code = "" declarations = {} dependency_map = {} From 1fd94b9cf5d650fb597c12fe824318f5351a6040 Mon Sep 17 00:00:00 2001 From: Thomas Stadelmann Date: Tue, 22 Feb 2022 23:19:39 +0100 Subject: [PATCH 19/22] validate user input before code gen --- haystack/pipelines/base.py | 76 ++++++++++++++--- test/test_pipeline.py | 170 +++++++++++++++++++++++++++++++++---- 2 files changed, 215 insertions(+), 31 deletions(-) diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index 8ed8385f6d..676cc8c7e1 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -42,7 +42,8 @@ ROOT_NODE_TO_PIPELINE_NAME = {"query": "query", "file": "indexing"} -CAMEL_CASE_TO_SNAKE_CASE_REGEX = re.compile(r"(? dict: :param return_defaults: whether to output parameters that have the default values. """ - nodes = self.graph.nodes - pipeline_name = ROOT_NODE_TO_PIPELINE_NAME[self.root_node.lower()] pipelines: dict = {pipeline_name: {"name": pipeline_name, "type": self.__class__.__name__, "nodes": []}} components = {} - for node in nodes: + for node in self.graph.nodes: if node == self.root_node: continue component_instance = self.graph.nodes.get(node)["component"] @@ -1096,25 +1095,37 @@ def get_config(self, return_defaults: bool = False) -> dict: for component_parent in component_parent_classes: component_signature = {**component_signature, **inspect.signature(component_parent).parameters} - for key, value in component_params.items(): + for param_key, param_value in component_params.items(): # A parameter for a Component could be another Component. For instance, a Retriever has # the DocumentStore as a parameter. # Component configs must be a dict with a "type" key. The "type" keys distinguishes between # other parameters like "custom_mapping" that are dicts. # This currently only checks for the case single-level nesting case, wherein, "a Component has another # Component as a parameter". For deeper nesting cases, this function should be made recursive. 
- if isinstance(value, dict) and "type" in value.keys(): # the parameter is a Component - components[node]["params"][key] = value["type"] - sub_component_signature = inspect.signature(BaseComponent.subclasses[value["type"]]).parameters - params = { + if isinstance(param_value, dict) and "type" in param_value.keys(): # the parameter is a Component + sub_component = param_value + sub_component_type_name = sub_component["type"] + sub_component_signature = inspect.signature( + BaseComponent.subclasses[sub_component_type_name] + ).parameters + sub_component_params = { k: v - for k, v in value["params"].items() + for k, v in sub_component["params"].items() if sub_component_signature[k].default != v or return_defaults is True } - components[value["type"]] = {"name": value["type"], "type": value["type"], "params": params} + + sub_component_name = self._generate_component_name( + type_name=sub_component_type_name, params=sub_component_params, existing_components=components + ) + components[sub_component_name] = { + "name": sub_component_name, + "type": sub_component_type_name, + "params": sub_component_params, + } + components[node]["params"][param_key] = sub_component_name else: - if component_signature[key].default != value or return_defaults is True: - components[node]["params"][key] = value + if component_signature[param_key].default != param_value or return_defaults is True: + components[node]["params"][param_key] = param_value # create the Pipeline definition with how the Component are connected pipelines[pipeline_name]["nodes"].append({"name": node, "inputs": list(self.graph.predecessors(node))}) @@ -1126,6 +1137,22 @@ def get_config(self, return_defaults: bool = False) -> dict: } return config + def _generate_component_name( + self, + type_name: str, + params: Dict[str, Any], + existing_components: Dict[str, Any], + ): + component_name: str = type_name + # add number if there are multiple distinct ones of the same type + while component_name in existing_components and params != existing_components[component_name]["params"]: + occupied_num = 1 + if len(component_name) > len(type_name): + occupied_num = int(component_name[len(type_name) + 1 :]) + new_num = occupied_num + 1 + component_name = f"{type_name}_{new_num}" + return component_name + def print_eval_report( self, eval_result: EvaluationResult, @@ -1446,6 +1473,27 @@ class _PipelineCodeGen: def _camel_to_snake_case(cls, input: str) -> str: return CAMEL_CASE_TO_SNAKE_CASE_REGEX.sub("_", input).lower() + @classmethod + def _validate_user_input(cls, input: str): + if isinstance(input, str) and not VALID_CODE_GEN_INPUT_REGEX.match(input): + raise ValueError(f"'{input}' is not a valid code gen variable name. 
Use word characters only.") + + @classmethod + def _validate_config(cls, pipeline_config: Dict[str, Any]): + for component in pipeline_config["components"]: + cls._validate_user_input(component["name"]) + cls._validate_user_input(component["type"]) + for k, v in component.get("params", {}).items(): + cls._validate_user_input(k) + cls._validate_user_input(v) + for pipeline in pipeline_config["pipelines"]: + cls._validate_user_input(pipeline["name"]) + cls._validate_user_input(pipeline["type"]) + for node in pipeline["nodes"]: + cls._validate_user_input(node["name"]) + for input in node["inputs"]: + cls._validate_user_input(input) + @classmethod def generate_code( cls, @@ -1455,6 +1503,8 @@ def generate_code( comment: Optional[str] = None, ) -> str: pipeline_config = pipeline.get_config() + cls._validate_config(pipeline_config) + component_definitions = pipeline._get_component_definitions( pipeline_config=pipeline_config, overwrite_with_env_variables=False ) diff --git a/test/test_pipeline.py b/test/test_pipeline.py index 17b34c7831..b522843ebe 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -181,23 +181,6 @@ def test_to_code(): assert index_pipeline.get_config() == locals()["index_pipeline_from_code"].get_config() -@pytest.mark.elasticsearch -def test_PipelineCodeGen_simple_dense_pipeline(): - doc_store = InMemoryDocumentStore(index="my-index") - retriever = EmbeddingRetriever(document_store=doc_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2") - pipeline = Pipeline() - pipeline.add_node(component=retriever, name="retri", inputs=["Query"]) - - code = _PipelineCodeGen.generate_code(pipeline=pipeline, pipeline_variable_name="p", generate_imports=False) - assert code == ( - 'in_memory_document_store = InMemoryDocumentStore(index="my-index")\n' - 'retri = EmbeddingRetriever(document_store=in_memory_document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2")\n' - "\n" - "p = Pipeline()\n" - 'p.add_node(component=retri, name="retri", inputs=["Query"])' - ) - - @pytest.mark.elasticsearch def test_PipelineCodeGen_simple_sparse_pipeline(): doc_store = ElasticsearchDocumentStore(index="my-index") @@ -250,7 +233,7 @@ def test_PipelineCodeGen_dual_retriever_pipeline_same_docstore(): es_doc_store = ElasticsearchDocumentStore(index="my-index") es_retriever = ElasticsearchRetriever(document_store=es_doc_store, top_k=20) emb_retriever = EmbeddingRetriever( - document_store=es_retriever, embedding_model="sentence-transformers/all-MiniLM-L6-v2" + document_store=es_doc_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2" ) p_ensemble = Pipeline() p_ensemble.add_node(component=es_retriever, name="EsRetriever", inputs=["Query"]) @@ -273,6 +256,62 @@ def test_PipelineCodeGen_dual_retriever_pipeline_same_docstore(): ) +@pytest.mark.elasticsearch +def test_PipelineCodeGen_dual_retriever_pipeline_different_docstore(): + es_doc_store_a = ElasticsearchDocumentStore(index="my-index-a") + es_doc_store_b = ElasticsearchDocumentStore(index="my-index-b") + es_retriever = ElasticsearchRetriever(document_store=es_doc_store_a, top_k=20) + emb_retriever = EmbeddingRetriever( + document_store=es_doc_store_b, embedding_model="sentence-transformers/all-MiniLM-L6-v2" + ) + p_ensemble = Pipeline() + p_ensemble.add_node(component=es_retriever, name="EsRetriever", inputs=["Query"]) + p_ensemble.add_node(component=emb_retriever, name="EmbeddingRetriever", inputs=["Query"]) + p_ensemble.add_node( + component=JoinDocuments(join_mode="merge"), name="JoinResults", 
inputs=["EsRetriever", "EmbeddingRetriever"] + ) + + code = _PipelineCodeGen.generate_code(pipeline=p_ensemble, pipeline_variable_name="p", generate_imports=False) + assert code == ( + 'elasticsearch_document_store = ElasticsearchDocumentStore(index="my-index-a")\n' + "es_retriever = ElasticsearchRetriever(document_store=elasticsearch_document_store, top_k=20)\n" + 'elasticsearch_document_store_2 = ElasticsearchDocumentStore(index="my-index-b")\n' + 'embedding_retriever = EmbeddingRetriever(document_store=elasticsearch_document_store_2, embedding_model="sentence-transformers/all-MiniLM-L6-v2")\n' + 'join_results = JoinDocuments(join_mode="merge")\n' + "\n" + "p = Pipeline()\n" + 'p.add_node(component=es_retriever, name="EsRetriever", inputs=["Query"])\n' + 'p.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])\n' + 'p.add_node(component=join_results, name="JoinResults", inputs=["EsRetriever", "EmbeddingRetriever"])' + ) + + +@pytest.mark.elasticsearch +def test_PipelineCodeGen_dual_retriever_pipeline_same_type(): + es_doc_store = ElasticsearchDocumentStore(index="my-index") + es_retriever_1 = ElasticsearchRetriever(document_store=es_doc_store, top_k=20) + es_retriever_2 = ElasticsearchRetriever(document_store=es_doc_store, top_k=10) + p_ensemble = Pipeline() + p_ensemble.add_node(component=es_retriever_1, name="EsRetriever1", inputs=["Query"]) + p_ensemble.add_node(component=es_retriever_2, name="EsRetriever2", inputs=["Query"]) + p_ensemble.add_node( + component=JoinDocuments(join_mode="merge"), name="JoinResults", inputs=["EsRetriever1", "EsRetriever2"] + ) + + code = _PipelineCodeGen.generate_code(pipeline=p_ensemble, pipeline_variable_name="p", generate_imports=False) + assert code == ( + 'elasticsearch_document_store = ElasticsearchDocumentStore(index="my-index")\n' + "es_retriever_1 = ElasticsearchRetriever(document_store=elasticsearch_document_store, top_k=20)\n" + "es_retriever_2 = ElasticsearchRetriever(document_store=elasticsearch_document_store)\n" + 'join_results = JoinDocuments(join_mode="merge")\n' + "\n" + "p = Pipeline()\n" + 'p.add_node(component=es_retriever_1, name="EsRetriever1", inputs=["Query"])\n' + 'p.add_node(component=es_retriever_2, name="EsRetriever2", inputs=["Query"])\n' + 'p.add_node(component=join_results, name="JoinResults", inputs=["EsRetriever1", "EsRetriever2"])' + ) + + @pytest.mark.elasticsearch def test_PipelineCodeGen_imports(): doc_store = ElasticsearchDocumentStore(index="my-index") @@ -299,6 +338,101 @@ def test_PipelineCodeGen_order_components(): assert ordered == ["aa", "aba", "ab", "a", "c", "b"] +@pytest.mark.parametrize("input", ["\btest", " test", "%test", "+test"]) +def test_PipelineCodeGen_validate_user_input_invalid(input): + with pytest.raises(ValueError): + _PipelineCodeGen._validate_user_input(input) + + +@pytest.mark.parametrize("input", ["test", "testName", "test_name", "test-name", "test-name1234"]) +def test_PipelineCodeGen_validate_user_input_valid(input): + _PipelineCodeGen._validate_user_input(input) + + +def test_PipelineCodeGen_validate_pipeline_config_invalid_component_name(): + with pytest.raises(ValueError): + _PipelineCodeGen._validate_config({"components": [{"name": "\btest"}]}) + + +def test_PipelineCodeGen_validate_pipeline_config_invalid_component_type(): + with pytest.raises(ValueError): + _PipelineCodeGen._validate_config({"components": [{"name": "test", "type": "\btest"}]}) + + +def test_PipelineCodeGen_validate_pipeline_config_invalid_component_param(): + with 
pytest.raises(ValueError): + _PipelineCodeGen._validate_config( + {"components": [{"name": "test", "type": "test", "params": {"key": "\btest"}}]} + ) + + +def test_PipelineCodeGen_validate_pipeline_config_invalid_component_param_key(): + with pytest.raises(ValueError): + _PipelineCodeGen._validate_config( + {"components": [{"name": "test", "type": "test", "params": {"\btest": "test"}}]} + ) + + +def test_PipelineCodeGen_validate_pipeline_config_invalid_pipeline_name(): + with pytest.raises(ValueError): + _PipelineCodeGen._validate_config( + { + "components": [ + { + "name": "test", + "type": "test", + } + ], + "pipelines": [{"name": "\btest"}], + } + ) + + +def test_PipelineCodeGen_validate_pipeline_config_invalid_pipeline_type(): + with pytest.raises(ValueError): + _PipelineCodeGen._validate_config( + { + "components": [ + { + "name": "test", + "type": "test", + } + ], + "pipelines": [{"name": "test", "type": "\btest"}], + } + ) + + +def test_PipelineCodeGen_validate_pipeline_config_invalid_pipeline_node_name(): + with pytest.raises(ValueError): + _PipelineCodeGen._validate_config( + { + "components": [ + { + "name": "test", + "type": "test", + } + ], + "pipelines": [{"name": "test", "type": "test", "nodes": [{"name": "\btest"}]}], + } + ) + + +def test_PipelineCodeGen_validate_pipeline_config_invalid_pipeline_node_inputs(): + with pytest.raises(ValueError): + _PipelineCodeGen._validate_config( + { + "components": [ + { + "name": "test", + "type": "test", + } + ], + "pipelines": [{"name": "test", "type": "test", "nodes": [{"name": "test", "inputs": ["\btest"]}]}], + } + ) + + @pytest.mark.usefixtures(deepset_cloud_fixture.__name__) @responses.activate def test_load_from_deepset_cloud_query(): From 53abe8a82f596b616e7d2aff7c731788d5176bd6 Mon Sep 17 00:00:00 2001 From: Thomas Stadelmann Date: Wed, 23 Feb 2022 10:02:19 +0100 Subject: [PATCH 20/22] enable urls for to_code() --- haystack/pipelines/base.py | 2 +- test/test_pipeline.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index 676cc8c7e1..48a66b17c8 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -43,7 +43,7 @@ ROOT_NODE_TO_PIPELINE_NAME = {"query": "query", "file": "indexing"} CAMEL_CASE_TO_SNAKE_CASE_REGEX = re.compile(r"(?<=[a-z])(?=[A-Z0-9])") -VALID_CODE_GEN_INPUT_REGEX = re.compile(r"^[-a-zA-Z0-9_/.]+$") +VALID_CODE_GEN_INPUT_REGEX = re.compile(r"^[-a-zA-Z0-9_/.?=:%&$§#@,;()+/*~]+$") MODULE_NOT_FOUND = "MODULE_NOT_FOUND" CODE_GEN_ALLOWED_IMPORTS = ["haystack.document_stores", "haystack.nodes"] CODE_GEN_DEFAULT_COMMENT = "This code has been generated." 
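# Illustrative sketch (editorial, not part of the diff above or below): the
# broadened VALID_CODE_GEN_INPUT_REGEX introduced by this patch is meant to let
# URL-valued parameters pass validation while whitespace still fails. This is a
# standalone check that assumes only the regex as defined above:
import re

VALID_CODE_GEN_INPUT_REGEX = re.compile(r"^[-a-zA-Z0-9_/.?=:%&$§#@,;()+/*~]+$")

assert VALID_CODE_GEN_INPUT_REGEX.match("http://localhost:8000/my-path?param=value#anchor")
assert VALID_CODE_GEN_INPUT_REGEX.match(" test") is None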
diff --git a/test/test_pipeline.py b/test/test_pipeline.py index b522843ebe..715acd335d 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -338,13 +338,13 @@ def test_PipelineCodeGen_order_components(): assert ordered == ["aa", "aba", "ab", "a", "c", "b"] -@pytest.mark.parametrize("input", ["\btest", " test", "%test", "+test"]) +@pytest.mark.parametrize("input", ["\btest", " test", "\ttest", "\ntest"]) def test_PipelineCodeGen_validate_user_input_invalid(input): with pytest.raises(ValueError): _PipelineCodeGen._validate_user_input(input) -@pytest.mark.parametrize("input", ["test", "testName", "test_name", "test-name", "test-name1234"]) +@pytest.mark.parametrize("input", ["test", "testName", "test_name", "test-name", "test-name1234", "http://localhost:8000/my-path?param=value#anchor"]) def test_PipelineCodeGen_validate_user_input_valid(input): _PipelineCodeGen._validate_user_input(input) From f356f92f7ab479d680d0060fd6cef3a76748422c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 23 Feb 2022 09:04:54 +0000 Subject: [PATCH 21/22] Update Documentation & Code Style --- test/test_pipeline.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/test_pipeline.py b/test/test_pipeline.py index 715acd335d..29e0dd48ae 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -344,7 +344,10 @@ def test_PipelineCodeGen_validate_user_input_invalid(input): _PipelineCodeGen._validate_user_input(input) -@pytest.mark.parametrize("input", ["test", "testName", "test_name", "test-name", "test-name1234", "http://localhost:8000/my-path?param=value#anchor"]) +@pytest.mark.parametrize( + "input", + ["test", "testName", "test_name", "test-name", "test-name1234", "http://localhost:8000/my-path?param=value#anchor"], +) def test_PipelineCodeGen_validate_user_input_valid(input): _PipelineCodeGen._validate_user_input(input) From 278358e561f7b4479b469684309f4e5750a5dab8 Mon Sep 17 00:00:00 2001 From: Thomas Stadelmann Date: Wed, 23 Feb 2022 10:24:19 +0100 Subject: [PATCH 22/22] remove all chars except colon from validation regex --- haystack/pipelines/base.py | 2 +- test/test_pipeline.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index 48a66b17c8..028e9acf73 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -43,7 +43,7 @@ ROOT_NODE_TO_PIPELINE_NAME = {"query": "query", "file": "indexing"} CAMEL_CASE_TO_SNAKE_CASE_REGEX = re.compile(r"(?<=[a-z])(?=[A-Z0-9])") -VALID_CODE_GEN_INPUT_REGEX = re.compile(r"^[-a-zA-Z0-9_/.?=:%&$§#@,;()+/*~]+$") +VALID_CODE_GEN_INPUT_REGEX = re.compile(r"^[-a-zA-Z0-9_/.:]+$") MODULE_NOT_FOUND = "MODULE_NOT_FOUND" CODE_GEN_ALLOWED_IMPORTS = ["haystack.document_stores", "haystack.nodes"] CODE_GEN_DEFAULT_COMMENT = "This code has been generated." 
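# Illustrative sketch (editorial, not part of the diff above or below): the
# narrowed regex from this patch keeps word characters plus '-', '/', '.' and
# ':', so plain URLs still validate while query strings, anchors, and call
# syntax are rejected. Standalone check, assuming only the regex defined above:
import re

VALID_CODE_GEN_INPUT_REGEX = re.compile(r"^[-a-zA-Z0-9_/.:]+$")

assert VALID_CODE_GEN_INPUT_REGEX.match("http://localhost:8000/my-path")
assert VALID_CODE_GEN_INPUT_REGEX.match("test()") is None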
diff --git a/test/test_pipeline.py b/test/test_pipeline.py index 29e0dd48ae..9c2c8ec33e 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -338,15 +338,14 @@ def test_PipelineCodeGen_order_components(): assert ordered == ["aa", "aba", "ab", "a", "c", "b"] -@pytest.mark.parametrize("input", ["\btest", " test", "\ttest", "\ntest"]) +@pytest.mark.parametrize("input", ["\btest", " test", "#test", "+test", "\ttest", "\ntest", "test()"]) def test_PipelineCodeGen_validate_user_input_invalid(input): with pytest.raises(ValueError): _PipelineCodeGen._validate_user_input(input) @pytest.mark.parametrize( - "input", - ["test", "testName", "test_name", "test-name", "test-name1234", "http://localhost:8000/my-path?param=value#anchor"], + "input", ["test", "testName", "test_name", "test-name", "test-name1234", "http://localhost:8000/my-path"] ) def test_PipelineCodeGen_validate_user_input_valid(input): _PipelineCodeGen._validate_user_input(input)
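
Illustrative end-to-end sketch (editorial addition, not one of the 22 patches): the snippet below exercises the code generation path this series builds up, using only names that appear in the tests above (_PipelineCodeGen.generate_code, ElasticsearchDocumentStore, ElasticsearchRetriever). A reachable Elasticsearch instance is assumed, and the expected output mirrors the assertion in test_PipelineCodeGen_imports.

from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import ElasticsearchRetriever
from haystack.pipelines import Pipeline
from haystack.pipelines.base import _PipelineCodeGen

# Build a small query pipeline, then emit equivalent Python code for it.
doc_store = ElasticsearchDocumentStore(index="my-index")
retriever = ElasticsearchRetriever(document_store=doc_store, top_k=20)
pipeline = Pipeline()
pipeline.add_node(component=retriever, name="retri", inputs=["Query"])

code = _PipelineCodeGen.generate_code(pipeline=pipeline, pipeline_variable_name="p", generate_imports=True)
print(code)
# Expected output (per test_PipelineCodeGen_imports):
#   from haystack.document_stores import ElasticsearchDocumentStore
#   from haystack.nodes import ElasticsearchRetriever
#
#   elasticsearch_document_store = ElasticsearchDocumentStore(index="my-index")
#   retri = ElasticsearchRetriever(document_store=elasticsearch_document_store, top_k=20)
#
#   p = Pipeline()
#   p.add_node(component=retri, name="retri", inputs=["Query"])

Because generation goes through get_config(), the emitted code contains only non-default parameters: top_k=20 is printed, while a retriever left at the default top_k=10 is not (see test_PipelineCodeGen_dual_retriever_pipeline_same_type above).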