diff --git a/haystack/components/builders/answer_builder.py b/haystack/components/builders/answer_builder.py index b28e99e948..08e420a6e2 100644 --- a/haystack/components/builders/answer_builder.py +++ b/haystack/components/builders/answer_builder.py @@ -10,6 +10,7 @@ class AnswerBuilder: """ Takes a query and the replies a Generator returns as input and parses them into GeneratedAnswer objects. + Optionally, it also takes Documents and metadata from the Generator as inputs to enrich the GeneratedAnswer objects. Usage example: @@ -126,9 +127,10 @@ def run( def _extract_answer_string(reply: str, pattern: Optional[str] = None) -> str: """ Extract the answer string from the generator output using the specified pattern. + If no pattern is specified, the whole string is used as the answer. - :param replies: + :param reply: The output of the Generator. A string. :param pattern: The regular expression pattern to use to extract the answer text from the generator output. diff --git a/haystack/components/builders/dynamic_chat_prompt_builder.py b/haystack/components/builders/dynamic_chat_prompt_builder.py index 2ca28e68e4..06fa5f6124 100644 --- a/haystack/components/builders/dynamic_chat_prompt_builder.py +++ b/haystack/components/builders/dynamic_chat_prompt_builder.py @@ -11,10 +11,12 @@ @component class DynamicChatPromptBuilder: """ - DynamicChatPromptBuilder is designed to construct dynamic prompts from a list of `ChatMessage` instances. It - integrates with Jinja2 templating for dynamic prompt generation. It considers any user or system message in the list - potentially containing a template and renders it with variables provided to the constructor. Additional template - variables can be feed into the component/pipeline `run` method and will be merged before rendering the template. + DynamicChatPromptBuilder is designed to construct dynamic prompts from a list of `ChatMessage` instances. + + It integrates with Jinja2 templating for dynamic prompt generation. It considers any user or system message in the + list that potentially contains a template and renders it with the variables provided to the constructor. Additional + template variables can be fed into the component/pipeline `run` method and will be merged before rendering the + template. Usage example: ```python @@ -92,6 +94,7 @@ def __init__(self, runtime_variables: Optional[List[str]] = None): def run(self, prompt_source: List[ChatMessage], template_variables: Optional[Dict[str, Any]] = None, **kwargs): """ Executes the dynamic prompt building process by processing a list of `ChatMessage` instances. + Any user message or system message is inspected for templates and rendered with the variables provided to the constructor. You can provide additional template variables directly to this method, which are then merged with the variables provided to the constructor. @@ -151,6 +154,7 @@ def run(self, prompt_source: List[ChatMessage], template_variables: Optional[Dic def _validate_template(self, template_text: str, provided_variables: Set[str]): """ Checks if all the required template variables are provided to the pipeline `run` method. + If all the required template variables are provided, returns a Jinja2 template object. Otherwise, raises a ValueError.
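As context for the `_extract_answer_string` docstring above, here is a minimal standalone sketch of pattern-based answer extraction as the docstring describes it — a hypothetical re-implementation for illustration, not the component's actual code:

```python
import re
from typing import Optional


def extract_answer_string(reply: str, pattern: Optional[str] = None) -> str:
    # No pattern specified: the whole reply is used as the answer.
    if pattern is None:
        return reply
    match = re.search(pattern, reply)
    if match is None:
        return ""
    # Prefer the first capture group when the pattern defines one,
    # otherwise fall back to the whole match.
    return match.group(1) if match.lastindex else match.group(0)


print(extract_answer_string("Answer: Paris", r"Answer: (.*)"))  # Paris
print(extract_answer_string("Paris"))  # Paris
```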
diff --git a/haystack/components/builders/dynamic_prompt_builder.py b/haystack/components/builders/dynamic_prompt_builder.py index 3580ecd8b0..dc7185449e 100644 --- a/haystack/components/builders/dynamic_prompt_builder.py +++ b/haystack/components/builders/dynamic_prompt_builder.py @@ -10,8 +10,10 @@ @component class DynamicPromptBuilder: """ - DynamicPromptBuilder is designed to construct dynamic prompts for the pipeline. Users can change the prompt - template at runtime by providing a new template for each pipeline run invocation if needed. + DynamicPromptBuilder is designed to construct dynamic prompts for the pipeline. + + Users can change the prompt template at runtime by providing a new template for each pipeline run invocation + if needed. Usage example: ```python @@ -92,12 +94,15 @@ def __init__(self, runtime_variables: Optional[List[str]] = None): def run(self, prompt_source: str, template_variables: Optional[Dict[str, Any]] = None, **kwargs): """ - Executes the dynamic prompt building process. Depending on the provided type of `prompt_source`, this method - either processes a list of `ChatMessage` instances or a string template. In the case of `ChatMessage` instances, - the last user message is treated as a template and rendered with the resolved pipeline variables and any - additional template variables provided. For a string template, it directly applies the template variables to - render the final prompt. You can provide additional template variables directly to this method, that are then - merged with the variables resolved from the pipeline runtime. + Executes the dynamic prompt building process. + + Depending on the provided type of `prompt_source`, this method either processes a list of `ChatMessage` + instances or a string template. In the case of `ChatMessage` instances, the last user message is treated as a + template and rendered with the resolved pipeline variables and any additional template variables provided. + + For a string template, it directly applies the template variables to render the final prompt. You can provide + additional template variables directly to this method, which are then merged with the variables resolved from + the pipeline runtime. :param prompt_source: A string template. @@ -127,6 +132,7 @@ def run(self, prompt_source: str, template_variables: Optional[Dict[str, Any]] = def _validate_template(self, template_text: str, provided_variables: Set[str]): """ Checks if all the required template variables are provided to the pipeline `run` method. + If all the required template variables are provided, returns a Jinja2 template object. Otherwise, raises a ValueError. diff --git a/haystack/components/builders/prompt_builder.py b/haystack/components/builders/prompt_builder.py index 64b85d76a4..900463bde0 100644 --- a/haystack/components/builders/prompt_builder.py +++ b/haystack/components/builders/prompt_builder.py @@ -40,6 +40,8 @@ def to_dict(self) -> Dict[str, Any]: @component.output_types(prompt=str) def run(self, **kwargs): """ + Renders the prompt template with the provided variables. + :param kwargs: The variables that will be used to render the prompt template.
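A hedged usage sketch of the `run` behavior documented above, assuming `DynamicPromptBuilder` is exported from `haystack.components.builders` and that `documents` is declared as a runtime variable:

```python
from haystack.components.builders import DynamicPromptBuilder

builder = DynamicPromptBuilder(runtime_variables=["documents"])
result = builder.run(
    prompt_source="Context: {{ documents }}. Question: {{ query }}",
    template_variables={"query": "What is Haystack?"},
    documents=["Haystack is an open source framework for LLM applications."],
)
print(result["prompt"])  # the rendered template
```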
diff --git a/haystack/components/caching/cache_checker.py b/haystack/components/caching/cache_checker.py index 9f7f58b732..8d88f765b9 100644 --- a/haystack/components/caching/cache_checker.py +++ b/haystack/components/caching/cache_checker.py @@ -10,8 +10,7 @@ @component class CacheChecker: """ - Checks for the presence of documents in a Document Store based on a specified - field in each document's metadata. + Checks for the presence of documents in a Document Store based on a specified field in each document's metadata. If matching documents are found, they are returned as hits. If not, the items are returned as misses, indicating they are not in the cache. @@ -92,8 +91,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "CacheChecker": @component.output_types(hits=List[Document], misses=List) def run(self, items: List[Any]): """ - Checks if any document associated with the specified cache field - is already present in the store. + Checks if any document associated with the specified cache field is already present in the store. :param items: Values to be checked against the cache field. diff --git a/haystack/components/classifiers/document_language_classifier.py b/haystack/components/classifiers/document_language_classifier.py index 6f0108589f..daa242c4c5 100644 --- a/haystack/components/classifiers/document_language_classifier.py +++ b/haystack/components/classifiers/document_language_classifier.py @@ -50,6 +50,8 @@ class DocumentLanguageClassifier: def __init__(self, languages: Optional[List[str]] = None): """ + Initialize the DocumentLanguageClassifier. + :param languages: A list of languages in ISO code, each corresponding to a different output connection. For supported languages, see the [`langdetect` documentation](https://github.com/Mimino666/langdetect#languages). If not specified, the default is ["en"]. @@ -63,6 +65,7 @@ def __init__(self, languages: Optional[List[str]] = None): def run(self, documents: List[Document]): """ This method classifies the documents' language and adds it to their metadata. + If a Document's text does not match any of the languages specified at initialization, the metadata value "unmatched" will be stored. diff --git a/haystack/components/connectors/openapi_service.py b/haystack/components/connectors/openapi_service.py index f16288b483..6fa2be0552 100644 --- a/haystack/components/connectors/openapi_service.py +++ b/haystack/components/connectors/openapi_service.py @@ -16,6 +16,8 @@ @component class OpenAPIServiceConnector: """ + A component which connects the Haystack framework to OpenAPI services. + The `OpenAPIServiceConnector` component connects the Haystack framework to OpenAPI services, enabling it to call operations as defined in the OpenAPI specification of the service. @@ -77,8 +79,10 @@ def run( service_credentials: Optional[Union[dict, str]] = None, ) -> Dict[str, List[ChatMessage]]: """ - Processes a list of chat messages to invoke a method on an OpenAPI service. It parses the last message in the - list, expecting it to contain an OpenAI function calling descriptor (name & parameters) in JSON format. + Processes a list of chat messages to invoke a method on an OpenAPI service. + + It parses the last message in the list, expecting it to contain an OpenAI function calling descriptor + (name & parameters) in JSON format. :param messages: A list of `ChatMessage` objects containing the messages to be processed. The last message should contain the function invocation payload in OpenAI function calling format. 
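To make the expectation above concrete, a sketch of the kind of last message the connector parses. The descriptor shape shown (a name plus JSON-encoded arguments) is an assumption modeled on the OpenAI function-calling format; the real payload depends on the generator that produced it:

```python
import json

from haystack.dataclasses import ChatMessage

# Hypothetical function-invocation payload carried by the last chat message.
payload = json.dumps([{"name": "getPet", "arguments": json.dumps({"petId": 42})}])
messages = [ChatMessage.from_assistant(payload)]
```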
See the example in the class @@ -148,6 +152,8 @@ def _parse_message(self, message: ChatMessage) -> List[Dict[str, Any]]: def _authenticate_service(self, openapi_service: OpenAPI, credentials: Optional[Union[dict, str]] = None): """ + Authenticates with an OpenAPI service. + Authenticates with the OpenAPI service if required, supporting both single (str) and multiple authentication methods (dict). @@ -201,8 +207,9 @@ def _authenticate_service(self, openapi_service: OpenAPI, credentials: Optional[ def _invoke_method(self, openapi_service: OpenAPI, method_invocation_descriptor: Dict[str, Any]) -> Any: """ - Invokes the specified method on the OpenAPI service. The method name and arguments are passed in the - method_invocation_descriptor. + Invokes the specified method on the OpenAPI service. + + The method name and arguments are passed in the method_invocation_descriptor. :param openapi_service: The OpenAPI service instance. :param method_invocation_descriptor: The method name and arguments to be passed to the method. The payload diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py index 60b41fd5b3..db8003be2e 100644 --- a/haystack/components/converters/azure.py +++ b/haystack/components/converters/azure.py @@ -23,7 +23,8 @@ @component class AzureOCRDocumentConverter: """ - A component for converting files to Documents using Azure's Document Intelligence service. + Convert files to documents using Azure's Document Intelligence service. + Supported file formats are: PDF, JPEG, PNG, BMP, TIFF, DOCX, XLSX, PPTX, and HTML. In order to be able to use this component, you need an active Azure account @@ -170,6 +171,8 @@ def from_dict(cls, data: Dict[str, Any]) -> "AzureOCRDocumentConverter": # pylint: disable=line-too-long def _convert_tables_and_text(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]]) -> List[Document]: """ + Converts the tables and text extracted by Azure's Document Intelligence service into Haystack Documents. + :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult). :param meta: Optional dictionary with metadata that shall be attached to all resulting documents. @@ -188,6 +191,7 @@ def _convert_tables_and_text(self, result: "AnalyzeResult", meta: Optional[Dict[ def _convert_tables(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]]) -> List[Document]: """ Converts the tables extracted by Azure's Document Intelligence service into Haystack Documents. + :param result: The AnalyzeResult Azure object :param meta: Optional dictionary with metadata that shall be attached to all resulting documents. @@ -296,8 +300,10 @@ def _convert_tables(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any] def _convert_to_natural_text(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]]) -> Document: """ - This converts the `AnalyzeResult` object into a single Document. We add "\f" separators between to - differentiate between the text on separate pages. This is the expected format for the PreProcessor. + This converts the `AnalyzeResult` object into a single document. + + We add "\f" separators between pages to differentiate the text on separate pages. This is the expected format + for the PreProcessor.
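The "\f" convention referenced in the rewritten docstrings above is easy to demonstrate: pages live in one content string, and splitting on the form feed character recovers them.

```python
from haystack import Document

# Three pages in a single Document, separated by "\f" as the converter produces them.
doc = Document(content="Page one text\fPage two text\fPage three text")
pages = doc.content.split("\f")
print(len(pages))  # 3
```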
:param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult). @@ -340,8 +346,10 @@ def _convert_to_single_column_text( self, result: "AnalyzeResult", meta: Optional[Dict[str, str]], threshold_y: float = 0.05 ) -> Document: """ - This converts the `AnalyzeResult` object into a single Haystack Document. We add "\f" separators between to - differentiate between the text on separate pages. This is the expected format for the PreProcessor. + This converts the `AnalyzeResult` object into a single Haystack Document. + + We add "\f" separators between pages to differentiate the text on separate pages. This is the expected format + for the PreProcessor. :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult). @@ -427,6 +435,7 @@ def _convert_to_single_column_text( def _collect_table_spans(self, result: "AnalyzeResult") -> Dict: """ Collect the spans of all tables by page number. + :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. :returns: A dictionary with the page number as key and a list of table spans as value. """ @@ -443,6 +452,7 @@ def _check_if_in_table( ) -> bool: """ Check if a line or paragraph is part of a table. + :param tables_on_page: A dictionary with the page number as key and a list of table spans as value. :param line_or_paragraph: The line or paragraph to check. :returns: True if the line or paragraph is part of a table, False otherwise. @@ -457,7 +467,9 @@ def _check_if_in_table( def _hash_dataframe(self, df: pd.DataFrame, desired_samples=5, hash_length=4) -> str: """ - Returns a hash of the DataFrame content. The hash is based on the content of the DataFrame. + Returns a hash of the DataFrame content. + + The hash is based on the content of the DataFrame. :param df: The DataFrame to hash. :param desired_samples: The desired number of samples to hash. :param hash_length: The length of the hash for each sample. diff --git a/haystack/components/converters/openapi_functions.py b/haystack/components/converters/openapi_functions.py index 30403ace4d..e9c6f5392d 100644 --- a/haystack/components/converters/openapi_functions.py +++ b/haystack/components/converters/openapi_functions.py @@ -112,6 +112,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]]) -> Dict[str, Any]: def _openapi_to_functions(self, service_openapi_spec: Dict[str, Any]) -> List[Dict[str, Any]]: """ + OpenAPI to OpenAI function conversion. + Extracts functions from the OpenAPI specification of the service and converts them into a format suitable for OpenAI function calling. @@ -188,6 +190,8 @@ def _parse_property_attributes( self, property_schema: Dict[str, Any], include_attributes: Optional[List[str]] = None ) -> Dict[str, Any]: """ + Parses the attributes of a property schema. + Recursively parses the attributes of a property schema, including nested objects and arrays, and includes specified attributes like description, pattern, etc.
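To accompany the `_parse_property_attributes` summary above, a hedged sketch of such a recursive schema walk — names and defaults are illustrative, not the component's real implementation:

```python
from typing import Any, Dict, List, Optional


def parse_property_attributes(
    schema: Dict[str, Any], include: Optional[List[str]] = None
) -> Dict[str, Any]:
    include = include or ["description"]
    parsed: Dict[str, Any] = {"type": schema.get("type", "object")}
    # Keep only the requested attributes, e.g. description or pattern.
    for attr in include:
        if attr in schema:
            parsed[attr] = schema[attr]
    # Recurse into nested objects and arrays.
    if parsed["type"] == "object" and "properties" in schema:
        parsed["properties"] = {
            name: parse_property_attributes(sub, include)
            for name, sub in schema["properties"].items()
        }
        if "required" in schema:
            parsed["required"] = schema["required"]
    elif parsed["type"] == "array" and "items" in schema:
        parsed["items"] = parse_property_attributes(schema["items"], include)
    return parsed
```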
diff --git a/haystack/components/converters/utils.py b/haystack/components/converters/utils.py index 8666722a7f..30591cd135 100644 --- a/haystack/components/converters/utils.py +++ b/haystack/components/converters/utils.py @@ -7,6 +7,7 @@ def get_bytestream_from_source(source: Union[str, Path, ByteStream]) -> ByteStream: """ Creates a ByteStream object from a source. + :param source: A source to convert to a ByteStream. Can be a string (path to a file), a Path object, or a ByteStream. :return: A ByteStream object. """ @@ -24,6 +25,8 @@ def normalize_metadata( meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], sources_count: int ) -> List[Dict[str, Any]]: """ + Normalize the metadata input for a converter. + Given all the possible value of the meta input for a converter (None, dictionary or list of dicts), makes sure to return a list of dictionaries of the correct length for the converter to use. diff --git a/haystack/components/embedders/hugging_face_api_document_embedder.py b/haystack/components/embedders/hugging_face_api_document_embedder.py index 3f8ebfba04..d6bcb6c52a 100644 --- a/haystack/components/embedders/hugging_face_api_document_embedder.py +++ b/haystack/components/embedders/hugging_face_api_document_embedder.py @@ -19,6 +19,8 @@ @component class HuggingFaceAPIDocumentEmbedder: """ + A component that embeds documents using Hugging Face APIs. + This component can be used to compute Document embeddings using different Hugging Face APIs: - [Free Serverless Inference API]((https://huggingface.co/inference-api) - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints) diff --git a/haystack/components/embedders/hugging_face_api_text_embedder.py b/haystack/components/embedders/hugging_face_api_text_embedder.py index de7c3097b2..7404f28396 100644 --- a/haystack/components/embedders/hugging_face_api_text_embedder.py +++ b/haystack/components/embedders/hugging_face_api_text_embedder.py @@ -16,6 +16,8 @@ @component class HuggingFaceAPITextEmbedder: """ + A component that embeds text using Hugging Face APIs. + This component can be used to embed strings using different Hugging Face APIs: - [Free Serverless Inference API]((https://huggingface.co/inference-api) - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints) diff --git a/haystack/components/evaluators/answer_exact_match.py b/haystack/components/evaluators/answer_exact_match.py index db5e722591..bbab895bd3 100644 --- a/haystack/components/evaluators/answer_exact_match.py +++ b/haystack/components/evaluators/answer_exact_match.py @@ -6,11 +6,13 @@ @component class AnswerExactMatchEvaluator: """ - Evaluator that checks if predicted answers exactly match ground truth answers. + An answer exact match evaluator class. + + The evaluator checks if the predicted answers match any of the ground truth answers exactly. + The result is a number from 0.0 to 1.0 that represents the proportion of predicted answers + that matched one of the ground truth answers. + There can be multiple ground truth answers and multiple predicted answers as input. - Each predicted answer is compared to one ground truth answer. - The final score is a number ranging from 0.0 to 1.0. - It represents the proportion of predicted answers that match their corresponding ground truth answer. Usage example: ```python @@ -33,7 +35,8 @@ class AnswerExactMatchEvaluator: def run(self, ground_truth_answers: List[str], predicted_answers: List[str]) -> Dict[str, Any]: """ Run the AnswerExactMatchEvaluator on the given inputs.
- `ground_truth_answers` and `retrieved_answers` must have the same length. + + The `ground_truth_answers` and `predicted_answers` must have the same length. :param ground_truth_answers: A list of expected answers. diff --git a/haystack/components/evaluators/document_map.py b/haystack/components/evaluators/document_map.py index 303d7c4dfa..d87ad61242 100644 --- a/haystack/components/evaluators/document_map.py +++ b/haystack/components/evaluators/document_map.py @@ -6,6 +6,8 @@ @component class DocumentMAPEvaluator: """ + A Mean Average Precision (MAP) evaluator for documents. + Evaluator that calculates the mean average precision of the retrieved documents, a metric that measures how high retrieved documents are ranked. Each question can have multiple ground truth documents and multiple retrieved documents. @@ -43,6 +45,7 @@ def run( ) -> Dict[str, Any]: """ Run the DocumentMAPEvaluator on the given inputs. + All lists must have the same length. :param ground_truth_documents: :param retrieved_documents: :returns: A dictionary with the following outputs: - `score` - The average of calculated scores. - - `invididual_scores` - A list of numbers from 0.0 to 1.0 that represents how high retrieved documents are ranked. + - `individual_scores` - A list of numbers from 0.0 to 1.0 that represents how high retrieved documents are ranked. """ if len(ground_truth_documents) != len(retrieved_documents): msg = "The length of ground_truth_documents and retrieved_documents must be the same." diff --git a/haystack/components/evaluators/document_recall.py b/haystack/components/evaluators/document_recall.py index 3bd9a767b3..65472bcd25 100644 --- a/haystack/components/evaluators/document_recall.py +++ b/haystack/components/evaluators/document_recall.py @@ -32,6 +32,7 @@ def from_str(string: str) -> "RecallMode": class DocumentRecallEvaluator: """ Evaluator that calculates the Recall score for a list of documents. + Returns both a list of scores for each question and the average. There can be multiple ground truth documents and multiple predicted documents as input. @@ -91,6 +92,7 @@ def run( ) -> Dict[str, Any]: """ Run the DocumentRecallEvaluator on the given inputs. + `ground_truth_documents` and `retrieved_documents` must have the same length. :param ground_truth_documents: diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 34a69e3b11..e035c4073c 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -178,6 +178,8 @@ def run(self, **inputs) -> Dict[str, Any]: def prepare_template(self) -> str: """ + Prepare the prompt template. + Combine instructions, inputs, outputs, and examples into one prompt template with the following format: Instructions: diff --git a/haystack/components/evaluators/sas_evaluator.py b/haystack/components/evaluators/sas_evaluator.py index 6590d25d7d..d7ab26a4df 100644 --- a/haystack/components/evaluators/sas_evaluator.py +++ b/haystack/components/evaluators/sas_evaluator.py @@ -16,6 +16,7 @@ class SASEvaluator: """ SASEvaluator computes the Semantic Answer Similarity (SAS) between a list of predictions and a list of ground truths. + It's usually used in Retrieval Augmented Generation (RAG) pipelines to evaluate the quality of the generated answers. The SAS is computed using a pre-trained model from the Hugging Face model hub.
The model can be either a @@ -132,6 +133,8 @@ def warm_up(self): @component.output_types(score=float, individual_scores=List[float]) def run(self, ground_truth_answers: List[str], predicted_answers: List[str]) -> Dict[str, Any]: """ + SASEvaluator component run method. + Run the SASEvaluator to compute the Semantic Answer Similarity (SAS) between a list of predicted answers and a list of ground truth answers. Both must be list of strings of same length. diff --git a/haystack/components/extractors/named_entity_extractor.py b/haystack/components/extractors/named_entity_extractor.py index 5eb78e075a..1c994fc6b1 100644 --- a/haystack/components/extractors/named_entity_extractor.py +++ b/haystack/components/extractors/named_entity_extractor.py @@ -159,8 +159,7 @@ def warm_up(self): @component.output_types(documents=List[Document]) def run(self, documents: List[Document], batch_size: int = 1) -> Dict[str, Any]: """ - Annotate named entities in each document and store - the annotations in the document's metadata. + Annotate named entities in each document and store the annotations in the document's metadata. :param documents: Documents to process. @@ -227,8 +226,7 @@ def initialized(self) -> bool: @classmethod def get_stored_annotations(cls, document: Document) -> Optional[List[NamedEntityAnnotation]]: """ - Returns the document's named entity annotations stored - in its metadata, if any. + Returns the document's named entity annotations stored in its metadata, if any. :param document: Document whose annotations are to be fetched. @@ -259,16 +257,14 @@ def __init__( @abstractmethod def initialize(self): """ - Initializes the backend. This would usually - entail loading models, pipelines, etc. + Initializes the backend. This would usually entail loading models, pipelines, and so on. """ @property @abstractmethod def initialized(self) -> bool: """ - Returns if the backend has been initialized, i.e, - ready to annotate text. + Returns whether the backend has been initialized, that is, ready to annotate text. """ @abstractmethod @@ -295,6 +291,8 @@ def model_name(self) -> str: @property def device(self) -> ComponentDevice: """ + The device on which the backend's model is loaded. + :returns: The device on which the backend's model is loaded. """ @@ -457,8 +455,7 @@ def model_name(self) -> str: @contextmanager def _select_device(self): """ - Context manager used to run spaCy models on a specific - GPU in a scoped manner. + Context manager used to run spaCy models on a specific GPU in a scoped manner. """ # TODO: This won't restore the active device. diff --git a/haystack/components/fetchers/link_content.py b/haystack/components/fetchers/link_content.py index 2ccf81e6b6..574af5be76 100644 --- a/haystack/components/fetchers/link_content.py +++ b/haystack/components/fetchers/link_content.py @@ -26,6 +26,8 @@ def _text_content_handler(response: Response) -> ByteStream: """ + Handles text content. + :param response: Response object from the request. :return: The extracted text. """ @@ -34,6 +36,8 @@ def _binary_content_handler(response: Response) -> ByteStream: """ + Handles binary content. + :param response: Response object from the request. :return: The extracted binary file-like object. """ @@ -211,6 +215,7 @@ def _get_content_type(self, response: Response): def _switch_user_agent(self, retry_state: RetryCallState) -> None: """ Switches the User-Agent for this LinkContentRetriever to the next one in the list of user agents.
+ Used by tenacity to retry the requests with a different user agent. :param retry_state: The retry state (unused, required by tenacity). diff --git a/haystack/components/generators/azure.py b/haystack/components/generators/azure.py index fbe23de00f..5cd7a1430c 100644 --- a/haystack/components/generators/azure.py +++ b/haystack/components/generators/azure.py @@ -14,8 +14,9 @@ class AzureOpenAIGenerator(OpenAIGenerator): """ - Enables text generation using OpenAI's large language models (LLMs) on Azure. It supports gpt-4 and gpt-3.5-turbo - family of models. + A Generator component that uses OpenAI's large language models (LLMs) on Azure to generate text. + + It supports the gpt-4 and gpt-3.5-turbo families of models. Users can pass any text generation parameters valid for the `openai.ChatCompletion.create` method directly to this component via the `**generation_kwargs` parameter in __init__ or the `**generation_kwargs` @@ -59,6 +60,8 @@ def __init__( generation_kwargs: Optional[Dict[str, Any]] = None, ): """ + Initialize the Azure OpenAI Generator. + :param azure_endpoint: The endpoint of the deployed model, e.g. `https://example-resource.azure.openai.com/` :param api_version: The version of the API to use. Defaults to 2023-05-15 :param azure_deployment: The deployment of the model, usually the model name. diff --git a/haystack/components/generators/chat/azure.py b/haystack/components/generators/chat/azure.py index 6a1e8fb648..e1d3029ee7 100644 --- a/haystack/components/generators/chat/azure.py +++ b/haystack/components/generators/chat/azure.py @@ -14,6 +14,8 @@ class AzureOpenAIChatGenerator(OpenAIChatGenerator): """ + A Chat Generator component that uses the Azure OpenAI API to generate text. + Enables text generation using OpenAI's large language models (LLMs) on Azure. It supports `gpt-4` and `gpt-3.5-turbo` family of models accessed through the chat completions API endpoint. @@ -76,6 +78,8 @@ def __init__( generation_kwargs: Optional[Dict[str, Any]] = None, ): """ + Initialize the Azure OpenAI Chat Generator component. + :param azure_endpoint: The endpoint of the deployed model, e.g. `"https://example-resource.azure.openai.com/"` :param api_version: The version of the API to use. Defaults to 2023-05-15 :param azure_deployment: The deployment of the model, usually the model name. diff --git a/haystack/components/generators/chat/hugging_face_api.py b/haystack/components/generators/chat/hugging_face_api.py index 8cdb8dc664..eac3877aca 100644 --- a/haystack/components/generators/chat/hugging_face_api.py +++ b/haystack/components/generators/chat/hugging_face_api.py @@ -17,6 +17,8 @@ @component class HuggingFaceAPIChatGenerator: """ + A Chat Generator component that uses Hugging Face APIs to generate text. + This component can be used to generate text using different Hugging Face APIs with the ChatMessage format: - [Free Serverless Inference API](https://huggingface.co/inference-api) - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints) diff --git a/haystack/components/generators/chat/hugging_face_local.py b/haystack/components/generators/chat/hugging_face_local.py index dfdb087d16..ebb8612b83 100644 --- a/haystack/components/generators/chat/hugging_face_local.py +++ b/haystack/components/generators/chat/hugging_face_local.py @@ -32,6 +32,8 @@ @component class HuggingFaceLocalChatGenerator: """ + A Chat Generator component that uses models available on Hugging Face Hub to generate chat responses locally.
+ The `HuggingFaceLocalChatGenerator` class is a component designed for generating chat responses using models from Hugging Face's model hub. It is tailored for local runtime text generation tasks and provides a convenient interface for working with chat-based models, such as `HuggingFaceH4/zephyr-7b-beta` or `meta-llama/Llama-2-7b-chat-hf` @@ -78,6 +80,8 @@ def __init__( streaming_callback: Optional[Callable[[StreamingChunk], None]] = None, ): """ + Initializes the HuggingFaceLocalChatGenerator component. + :param model: The name or path of a Hugging Face model for text generation, for example, `mistralai/Mistral-7B-Instruct-v0.2`, `TheBloke/OpenHermes-2.5-Mistral-7B-16k-AWQ`, etc. The important aspect of the model is that it should be a chat model and that it supports ChatML messaging diff --git a/haystack/components/generators/chat/hugging_face_tgi.py b/haystack/components/generators/chat/hugging_face_tgi.py index 9d5fa752bb..0956468639 100644 --- a/haystack/components/generators/chat/hugging_face_tgi.py +++ b/haystack/components/generators/chat/hugging_face_tgi.py @@ -24,6 +24,8 @@ @component class HuggingFaceTGIChatGenerator: """ + A Chat-based text generation component using Hugging Face's Text Generation Inference (TGI) framework. + Enables text generation using HuggingFace Hub hosted chat-based LLMs. This component is designed to seamlessly inference chat-based models deployed on the Text Generation Inference (TGI) backend. @@ -147,6 +149,8 @@ def __init__( def warm_up(self) -> None: """ + Warm up the tokenizer by loading it from the model. + If the url is not provided, check if the model is deployed on the free tier of the HF inference API. Load the tokenizer """ diff --git a/haystack/components/generators/chat/openai.py b/haystack/components/generators/chat/openai.py index d05ed1d8b7..8369892191 100644 --- a/haystack/components/generators/chat/openai.py +++ b/haystack/components/generators/chat/openai.py @@ -17,6 +17,8 @@ @component class OpenAIChatGenerator: """ + A Chat Generator component that uses the OpenAI API to generate text. + Enables text generation using OpenAI's large language models (LLMs). It supports `gpt-4` and `gpt-3.5-turbo` family of models accessed through the chat completions API endpoint. @@ -71,6 +73,8 @@ def __init__( generation_kwargs: Optional[Dict[str, Any]] = None, ): """ + Initializes the OpenAIChatGenerator component. + Creates an instance of OpenAIChatGenerator. Unless specified otherwise in the `model`, this is for OpenAI's GPT-3.5 model. @@ -206,6 +210,7 @@ def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str, def _connect_chunks(self, chunk: Any, chunks: List[StreamingChunk]) -> ChatMessage: """ Connects the streaming chunks into a single ChatMessage. + :param chunk: The last chunk returned by the OpenAI API. :param chunks: The list of all chunks returned by the OpenAI API. """ @@ -256,6 +261,7 @@ def _connect_chunks(self, chunk: Any, chunks: List[StreamingChunk]) -> ChatMessa def _build_message(self, completion: ChatCompletion, choice: Choice) -> ChatMessage: """ Converts the non-streaming response from the OpenAI API to a ChatMessage. + :param completion: The completion returned by the OpenAI API. :param choice: The choice returned by the OpenAI API. :return: The ChatMessage. 
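The `_connect_chunks` and `_build_message` docstrings above describe turning streamed output into one message; a minimal sketch of the underlying idea, assuming only that chunk contents concatenate in order:

```python
from haystack.dataclasses import ChatMessage, StreamingChunk

chunks = [StreamingChunk(content="Hay"), StreamingChunk(content="stack")]
message = ChatMessage.from_assistant("".join(chunk.content for chunk in chunks))
print(message.content)  # Haystack
```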
@@ -287,6 +293,7 @@ def _build_message(self, completion: ChatCompletion, choice: Choice) -> ChatMess def _build_chunk(self, chunk: ChatCompletionChunk) -> StreamingChunk: """ Converts the streaming response chunk from the OpenAI API to a StreamingChunk. + :param chunk: The chunk returned by the OpenAI API. :param choice: The choice returned by the OpenAI API. :return: The StreamingChunk. @@ -311,6 +318,7 @@ def _build_chunk(self, chunk: ChatCompletionChunk) -> StreamingChunk: def _check_finish_reason(self, message: ChatMessage) -> None: """ Check the `finish_reason` returned with the OpenAI completions. + If the `finish_reason` is `length` or `content_filter`, log a warning. :param message: The message returned by the LLM. """ diff --git a/haystack/components/generators/hugging_face_api.py b/haystack/components/generators/hugging_face_api.py index ad1ede4ac0..a6d34431c3 100644 --- a/haystack/components/generators/hugging_face_api.py +++ b/haystack/components/generators/hugging_face_api.py @@ -23,6 +23,8 @@ @component class HuggingFaceAPIGenerator: """ + A Generator component that uses Hugging Face APIs to generate text. + This component can be used to generate text using different Hugging Face APIs: - [Free Serverless Inference API]((https://huggingface.co/inference-api) - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints) @@ -35,7 +37,7 @@ class HuggingFaceAPIGenerator: from haystack.utils import Secret generator = HuggingFaceAPIGenerator(api_type="serverless_inference_api", - api_params={"model": "mistralai/Mistral-7B-v0.1"}, + api_params={"model": "HuggingFaceH4/zephyr-7b-beta"}, token=Secret.from_token("")) result = generator.run(prompt="What's Natural Language Processing?") diff --git a/haystack/components/generators/openai.py b/haystack/components/generators/openai.py index 966b552bdf..d546fda8c7 100644 --- a/haystack/components/generators/openai.py +++ b/haystack/components/generators/openai.py @@ -13,6 +13,8 @@ @component class OpenAIGenerator: """ + Text generation component using OpenAI's large language models (LLMs). + Enables text generation using OpenAI's large language models (LLMs). It supports gpt-4 and gpt-3.5-turbo family of models. @@ -258,6 +260,7 @@ def _build_chunk(self, chunk: Any) -> StreamingChunk: def _check_finish_reason(self, message: ChatMessage) -> None: """ Check the `finish_reason` returned with the OpenAI completions. + If the `finish_reason` is `length`, log a warning to the user. :param message: diff --git a/haystack/components/generators/utils.py b/haystack/components/generators/utils.py index 17e225f1dc..0afb06fe04 100644 --- a/haystack/components/generators/utils.py +++ b/haystack/components/generators/utils.py @@ -7,6 +7,7 @@ def print_streaming_chunk(chunk: StreamingChunk) -> None: """ Default callback function for streaming responses. + Prints the tokens of the first completion to stdout as soon as they are received """ print(chunk.content, flush=True, end="") @@ -15,6 +16,7 @@ def print_streaming_chunk(chunk: StreamingChunk) -> None: def serialize_callback_handler(streaming_callback: Callable[[StreamingChunk], None]) -> str: """ Serializes the streaming callback handler. + :param streaming_callback: The streaming callback handler function :returns: @@ -26,6 +28,7 @@ def serialize_callback_handler(streaming_callback: Callable[[StreamingChunk], No def deserialize_callback_handler(callback_name: str) -> Optional[Callable[[StreamingChunk], None]]: """ Deserializes the streaming callback handler. 
+ :param callback_name: The full path of the streaming callback handler function :returns: diff --git a/haystack/components/joiners/document_joiner.py b/haystack/components/joiners/document_joiner.py index ee432b9e8d..154bfadb68 100644 --- a/haystack/components/joiners/document_joiner.py +++ b/haystack/components/joiners/document_joiner.py @@ -137,6 +137,7 @@ def _merge(self, document_lists): def _reciprocal_rank_fusion(self, document_lists): """ Merge multiple lists of Documents and assign scores based on reciprocal rank fusion. + The constant k is set to 61 (60 was suggested by the original paper, plus 1 as python lists are 0-based and the paper used 1-based ranking). """ diff --git a/haystack/components/others/multiplexer.py b/haystack/components/others/multiplexer.py index 0569349b4c..9eca10cefb 100644 --- a/haystack/components/others/multiplexer.py +++ b/haystack/components/others/multiplexer.py @@ -17,6 +17,8 @@ @component(is_greedy=True) class Multiplexer: """ + A component that receives data from multiple components and distributes it to multiple components. + `Multiplexer` offers the ability to both receive data connections from multiple other components and to distribute it to various other components, enhancing the functionality of complex data processing pipelines. @@ -125,6 +127,8 @@ def from_dict(cls, data: Dict[str, Any]) -> "Multiplexer": def run(self, **kwargs): """ + The run method of the `Multiplexer` component. + Multiplexes the input data from the upstream connected components and distributes it to the downstream connected components. diff --git a/haystack/components/preprocessors/document_cleaner.py b/haystack/components/preprocessors/document_cleaner.py index ab3cbe9c50..4d873116fe 100644 --- a/haystack/components/preprocessors/document_cleaner.py +++ b/haystack/components/preprocessors/document_cleaner.py @@ -12,6 +12,8 @@ @component class DocumentCleaner: """ + Cleans the text in the documents. + Cleans up text documents by removing extra whitespaces, empty lines, specified substrings, regexes, page headers and footers (in this order). @@ -38,6 +40,8 @@ def __init__( remove_regex: Optional[str] = None, ): """ + Initialize the DocumentCleaner. + :param remove_empty_lines: Whether to remove empty lines. :param remove_extra_whitespaces: Whether to remove extra whitespaces. :param remove_repeated_substrings: Whether to remove repeated substrings (headers/footers) from pages. @@ -97,6 +101,7 @@ def run(self, documents: List[Document]): def _remove_empty_lines(self, text: str) -> str: """ Remove empty lines and lines that contain nothing but whitespaces from text. + :param text: Text to clean. :returns: The text without empty lines. """ @@ -107,6 +112,7 @@ def _remove_empty_lines(self, text: str) -> str: def _remove_extra_whitespaces(self, text: str) -> str: """ Remove extra whitespaces from text. + :param text: Text to clean. :returns: The text without extra whitespaces. """ @@ -115,6 +121,7 @@ def _remove_extra_whitespaces(self, text: str) -> str: def _remove_regex(self, text: str, regex: str) -> str: """ Remove substrings that match the specified regex from the text. + :param text: Text to clean. :param regex: Regex to match and replace substrings by "". :returns: The text without the substrings that match the regex. @@ -124,6 +131,7 @@ def _remove_regex(self, text: str, regex: str) -> str: def _remove_substrings(self, text: str, substrings: List[str]) -> str: """ Remove all specified substrings from the text. + :param text: Text to clean.
:param substrings: Substrings to remove. :returns: The text without the specified substrings. @@ -135,6 +143,7 @@ def _remove_substrings(self, text: str, substrings: List[str]) -> str: def _remove_repeated_substrings(self, text: str) -> str: """ Remove any substrings from the text that occur repeatedly on every page. For example headers or footers. + Pages in the text need to be separated by form feed character "\f". :param text: Text to clean. :returns: The text without the repeated substrings. @@ -148,6 +157,7 @@ def _find_and_remove_header_footer( ) -> str: """ Heuristic to find footers and headers across different pages by searching for the longest common string. + Pages in the text need to be separated by form feed character "\f". For headers, we only search in the first n_chars characters (for footer: last n_chars). Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX", @@ -182,6 +192,7 @@ def _find_and_remove_header_footer( def _ngram(self, seq: str, n: int) -> Generator[str, None, None]: """ Return all ngrams of length n from a text sequence. Each ngram consists of n words split by whitespace. + :param seq: The sequence to generate ngrams from. :param n: The length of the ngrams to generate. :returns: A Generator generating all ngrams of length n from the given sequence. @@ -202,6 +213,7 @@ def _ngram(self, seq: str, n: int) -> Generator[str, None, None]: def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]: """ Generates all possible ngrams from a given sequence of text. + Considering all ngram lengths between the minimum and maximum length. :param seq: The sequence to generate ngrams from. @@ -217,6 +229,7 @@ def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]: def _find_longest_common_ngram(self, sequences: List[str], min_ngram: int = 3, max_ngram: int = 30) -> str: """ Find the longest common ngram across a list of text sequences (e.g. start of pages). + Considering all ngram lengths between the minimum and maximum length. Helpful for finding footers, headers etc. Empty sequences are ignored. diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index b76bdde66f..adea7cc3ce 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -23,6 +23,8 @@ def __init__( split_overlap: int = 0, ): """ + Initialize the DocumentSplitter. + :param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ", "sentence" for splitting by ".", "page" for splitting by "\\f" or "passage" for splitting by "\\n\\n". :param split_length: The maximum number of units in each split. @@ -42,6 +44,8 @@ def __init__( @component.output_types(documents=List[Document]) def run(self, documents: List[Document]): """ + Split documents into smaller parts. + Splits documents by the unit expressed in `split_by`, with a length of `split_length` and an overlap of `split_overlap`. diff --git a/haystack/components/preprocessors/text_cleaner.py b/haystack/components/preprocessors/text_cleaner.py index 43d5009e06..3155abcf48 100644 --- a/haystack/components/preprocessors/text_cleaner.py +++ b/haystack/components/preprocessors/text_cleaner.py @@ -8,10 +8,12 @@ @component class TextCleaner: """ - A preprocessor component to clean text data. 
It can remove substrings matching a list of regular expressions, - convert text to lowercase, remove punctuation, and remove numbers. + A PreProcessor component to clean text data. - This is useful to cleanup text data before evaluation. + It can remove substrings matching a list of regular expressions, convert text to lowercase, remove punctuation, + and remove numbers. + + This is useful to clean up text data before evaluation. """ def __init__( @@ -22,6 +24,8 @@ def __init__( remove_numbers: bool = False, ): """ + Initialize the TextCleaner component. + :param remove_regexps: A list of regular expressions. If provided, it removes substrings matching these regular expressions from the text. :param convert_to_lowercase: If True, converts all characters to lowercase. diff --git a/haystack/components/rankers/lost_in_the_middle.py b/haystack/components/rankers/lost_in_the_middle.py index 292ec7788c..1f45045a54 100644 --- a/haystack/components/rankers/lost_in_the_middle.py +++ b/haystack/components/rankers/lost_in_the_middle.py @@ -6,6 +6,8 @@ @component class LostInTheMiddleRanker: """ + A LostInTheMiddle Ranker. + Ranks documents based on the 'lost in the middle' order so that the most relevant documents are either at the beginning or end, while the least relevant are in the middle. @@ -33,6 +35,8 @@ class LostInTheMiddleRanker: def __init__(self, word_count_threshold: Optional[int] = None, top_k: Optional[int] = None): """ + Initialize the LostInTheMiddleRanker. + If 'word_count_threshold' is specified, this ranker includes all documents up until the point where adding another document would exceed the 'word_count_threshold'. The last document that causes the threshold to be breached will be included in the resulting list of documents, but all subsequent documents will be diff --git a/haystack/components/rankers/meta_field.py b/haystack/components/rankers/meta_field.py index d1ba40ba6e..ad2ca1da57 100644 --- a/haystack/components/rankers/meta_field.py +++ b/haystack/components/rankers/meta_field.py @@ -141,6 +141,7 @@ def run( ): """ Ranks a list of Documents based on the selected meta field by: + 1. Sorting the Documents by the meta field in descending or ascending order. 2. Merging the rankings from the previous component and based on the meta field according to ranking mode and weight. @@ -337,8 +338,10 @@ def _merge_rankings( @staticmethod def _calculate_rrf(rank: int, k: int = 61) -> float: """ - Calculates the reciprocal rank fusion. The constant K is set to 61 (60 was suggested by the original paper, - plus 1 as python lists are 0-based and the [paper](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) used 1-based ranking). + Calculates the reciprocal rank fusion. + + The constant K is set to 61 (60 was suggested by the original paper, plus 1 as python lists are 0-based and + the [paper](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) used 1-based ranking). """ return 1 / (k + rank) @@ -346,6 +349,7 @@ def _calculate_rrf(rank: int, k: int = 61) -> float: def _calc_linear_score(rank: int, amount: int) -> float: """ Calculate the meta field score as a linear score between the greatest and the lowest score in the list. 
+ This linear scaling is useful for: - Reducing the effect of outliers - Creating scores that are meaningfully distributed in the range [0,1], diff --git a/haystack/components/rankers/sentence_transformers_diversity.py b/haystack/components/rankers/sentence_transformers_diversity.py index 86915eb657..0e68be2dd7 100644 --- a/haystack/components/rankers/sentence_transformers_diversity.py +++ b/haystack/components/rankers/sentence_transformers_diversity.py @@ -15,6 +15,8 @@ @component class SentenceTransformersDiversityRanker: """ + A Diversity Ranker based on Sentence Transformers. + Implements a document ranking algorithm that orders documents in such a way as to maximize the overall diversity of the documents. diff --git a/haystack/components/readers/extractive.py b/haystack/components/readers/extractive.py index b7aacb9ade..1591bc91d5 100644 --- a/haystack/components/readers/extractive.py +++ b/haystack/components/readers/extractive.py @@ -455,6 +455,8 @@ def deduplicate_by_overlap( self, answers: List[ExtractedAnswer], overlap_threshold: Optional[float] ) -> List[ExtractedAnswer]: """ + De-duplicates overlapping Extractive Answers. + De-duplicates overlapping Extractive Answers from the same document based on how much the spans of the answers overlap. diff --git a/haystack/components/routers/conditional_router.py b/haystack/components/routers/conditional_router.py index 9760eed9ae..a58ebca65a 100644 --- a/haystack/components/routers/conditional_router.py +++ b/haystack/components/routers/conditional_router.py @@ -163,6 +163,8 @@ def from_dict(cls, data: Dict[str, Any]) -> "ConditionalRouter": def run(self, **kwargs): """ + Executes the routing logic. + Executes the routing logic by evaluating the specified boolean condition expressions for each route in the order they are listed. The method directs the flow of data to the output specified in the first route whose `condition` is True. diff --git a/haystack/components/routers/file_type_router.py b/haystack/components/routers/file_type_router.py index 28bed9f386..8c08f47808 100644 --- a/haystack/components/routers/file_type_router.py +++ b/haystack/components/routers/file_type_router.py @@ -13,6 +13,8 @@ @component class FileTypeRouter: """ + Groups a list of data sources by their MIME types. + FileTypeRouter groups a list of data sources (file paths or byte streams) by their MIME types, allowing for flexible routing of files to different components based on their content type. It supports both exact MIME type matching and pattern matching using regular expressions. @@ -50,6 +52,8 @@ class FileTypeRouter: def __init__(self, mime_types: List[str]): """ + Initialize the FileTypeRouter component. + :param mime_types: A list of file mime types to consider when routing files (e.g. `["text/plain", "audio/x-wav", "image/jpeg"]`). """ diff --git a/haystack/components/routers/metadata_router.py b/haystack/components/routers/metadata_router.py index e4a0bdab28..be51594fb3 100644 --- a/haystack/components/routers/metadata_router.py +++ b/haystack/components/routers/metadata_router.py @@ -72,6 +72,8 @@ def __init__(self, rules: Dict[str, Dict]): def run(self, documents: List[Document]): """ + Route the documents. + Route the documents to different edges based on their fields content and the rules specified during initialization. If a document does not match any of the rules, it is routed to a connection named "unmatched". 
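A hedged usage sketch of the routing behavior documented above, with an illustrative rule written in Haystack's metadata filter syntax:

```python
from haystack import Document
from haystack.components.routers import MetadataRouter

router = MetadataRouter(
    rules={
        "en": {
            "operator": "AND",
            "conditions": [{"field": "meta.language", "operator": "==", "value": "en"}],
        }
    }
)
docs = [
    Document(content="Hello", meta={"language": "en"}),
    Document(content="Hallo", meta={"language": "de"}),
]
result = router.run(documents=docs)
# Expected: the English document under "en", the German one under "unmatched".
print({edge: [doc.content for doc in edge_docs] for edge, edge_docs in result.items()})
```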
diff --git a/haystack/components/routers/text_language_router.py b/haystack/components/routers/text_language_router.py index 67bcb932eb..3da5126861 100644 --- a/haystack/components/routers/text_language_router.py +++ b/haystack/components/routers/text_language_router.py @@ -44,6 +44,8 @@ class TextLanguageRouter: def __init__(self, languages: Optional[List[str]] = None): """ + Initialize the TextLanguageRouter component. + :param languages: A list of languages in ISO code, each corresponding to a different output connection. For supported languages, see the [`langdetect` documentation](https://github.com/Mimino666/langdetect#languages). If not specified, the default is `["en"]`. @@ -57,6 +59,7 @@ def __init__(self, languages: Optional[List[str]] = None): def run(self, text: str) -> Dict[str, str]: """ Route the text to one of different output connections based on its language. + If the text does not match any of the languages specified at initialization, it is routed to a connection named "unmatched". diff --git a/haystack/components/routers/zero_shot_text_router.py b/haystack/components/routers/zero_shot_text_router.py index 0f08a8fb49..40ebeecdf1 100644 --- a/haystack/components/routers/zero_shot_text_router.py +++ b/haystack/components/routers/zero_shot_text_router.py @@ -21,6 +21,7 @@ class TransformersZeroShotTextRouter: """ Routes a text input onto different output connections depending on which label it has been categorized into. + This is useful for routing queries to different models in a pipeline depending on their categorization. The set of labels to be used for categorization can be specified. @@ -102,6 +103,8 @@ def __init__( huggingface_pipeline_kwargs: Optional[Dict[str, Any]] = None, ): """ + Initializes the TransformersZeroShotTextRouter. + :param labels: The set of possible class labels to classify each sequence into. Can be a single label, a string of comma-separated labels, or a list of labels. :param multi_label: Whether or not multiple candidate labels can be true. @@ -187,8 +190,9 @@ def from_dict(cls, data: Dict[str, Any]) -> "TransformersZeroShotTextRouter": @component.output_types(documents=Dict[str, str]) def run(self, text: str): """ - Run the TransformersZeroShotTextRouter. This method routes the text to one of the different edges based on which label - it has been categorized into. + Run the TransformersZeroShotTextRouter. + + This method routes the text to one of the different edges based on which label it has been categorized into. :param text: A str to route to one of the different edges. :returns: diff --git a/haystack/components/samplers/top_p.py b/haystack/components/samplers/top_p.py index c1cb5b8cee..5b7ce56454 100644 --- a/haystack/components/samplers/top_p.py +++ b/haystack/components/samplers/top_p.py @@ -56,6 +56,7 @@ def __init__(self, top_p: float = 1.0, score_field: Optional[str] = None): def run(self, documents: List[Document], top_p: Optional[float] = None): """ Filters documents using top-p sampling based on their scores. + If the specified top_p results in no documents being selected (especially in cases of a low top_p value), the method returns the document with the highest similarity score. @@ -113,6 +114,7 @@ def run(self, documents: List[Document], top_p: Optional[float] = None): def _collect_scores(self, documents: List[Document]) -> List[float]: """ Collect the scores from the documents' metadata. + :param documents: List of Documents. :return: List of scores. 
""" diff --git a/haystack/components/validators/json_schema.py b/haystack/components/validators/json_schema.py index 1fc1d06c66..231015bf4a 100644 --- a/haystack/components/validators/json_schema.py +++ b/haystack/components/validators/json_schema.py @@ -77,6 +77,8 @@ def run(self, messages: List[ChatMessage]) -> dict: def __init__(self, json_schema: Optional[Dict[str, Any]] = None, error_template: Optional[str] = None): """ + Initialize the JsonSchemaValidator component. + :param json_schema: A dictionary representing the [JSON schema](https://json-schema.org/) against which the messages' content is validated. :param error_template: A custom template string for formatting the error message in case of validation failure. @@ -186,8 +188,9 @@ def _is_openai_function_calling_schema(self, json_schema: Dict[str, Any]) -> boo def _recursive_json_to_object(self, data: Any) -> Any: """ - Recursively traverses a data structure (dictionary or list), converting any string values - that are valid JSON objects into dictionary objects, and returns a new data structure. + Convert any string values that are valid JSON objects into dictionary objects. + + Returns a new data structure. :param data: The data structure to be traversed. :return: A new data structure with JSON strings converted to dictionary objects. diff --git a/haystack/components/websearch/searchapi.py b/haystack/components/websearch/searchapi.py index 4f30427433..4cb03cd1bd 100644 --- a/haystack/components/websearch/searchapi.py +++ b/haystack/components/websearch/searchapi.py @@ -41,6 +41,8 @@ def __init__( search_params: Optional[Dict[str, Any]] = None, ): """ + Initialize the SearchApiWebSearch component. + :param api_key: API key for the SearchApi API :param top_k: Number of documents to return. :param allowed_domains: List of domains to limit the search to. diff --git a/haystack/components/websearch/serper_dev.py b/haystack/components/websearch/serper_dev.py index 0942625d63..7cd105fa74 100644 --- a/haystack/components/websearch/serper_dev.py +++ b/haystack/components/websearch/serper_dev.py @@ -44,6 +44,8 @@ def __init__( search_params: Optional[Dict[str, Any]] = None, ): """ + Initialize the SerperDevWebSearch component. + :param api_key: API key for the Serper API. :param top_k: Number of documents to return. :param allowed_domains: List of domains to limit the search to. diff --git a/haystack/components/writers/document_writer.py b/haystack/components/writers/document_writer.py index fe34352576..a0ed98c3e0 100644 --- a/haystack/components/writers/document_writer.py +++ b/haystack/components/writers/document_writer.py @@ -50,6 +50,7 @@ def _get_telemetry_data(self) -> Dict[str, Any]: def to_dict(self) -> Dict[str, Any]: """ Serializes the component to a dictionary. + :returns: Dictionary with serialized data. """ diff --git a/haystack/core/component/component.py b/haystack/core/component/component.py index 9e527e3014..80c5e21783 100644 --- a/haystack/core/component/component.py +++ b/haystack/core/component/component.py @@ -93,10 +93,10 @@ @contextmanager def _hook_component_init(callback: Callable): """ - Context manager to set a callback that will be invoked - before a component's constructor is called. The callback - receives the component class and the init parameters (as keyword - arguments) and can modify the init parameters in place. + Context manager to set a callback that will be invoked before a component's constructor is called. 
+ + The callback receives the component class and the init parameters (as keyword arguments) and can modify the init + parameters in place. :param callback: Callback function to invoke. @@ -165,8 +165,7 @@ def positional_to_kwargs(cls_type, args) -> Dict[str, Any]: def __call__(cls, *args, **kwargs): """ - This method is called when clients instantiate a Component and - runs before __new__ and __init__. + This method is called when clients instantiate a Component and runs before __new__ and __init__. """ # This will call __new__ then __init__, giving us back the Component instance pre_init_hook = _COMPONENT_PRE_INIT_CALLBACK.get() @@ -234,6 +233,7 @@ def __call__(cls, *args, **kwargs): def _component_repr(component: Component) -> str: """ All Components override their __repr__ method with this one. + It prints the component name and the input/output sockets. """ result = object.__repr__(component) @@ -325,8 +325,7 @@ def run(self, value_0: str, value_1: Optional[str] = None, **kwargs): def set_output_types(self, instance, **types): """ - Method that specifies the output types when the 'run' method is not decorated - with 'component.output_types'. + Method that specifies the output types when the 'run' method is not decorated with 'component.output_types'. Use as: @@ -364,6 +363,8 @@ def run(self, value: int): def output_types_decorator(run_method): """ + Decorator that sets the output types of the decorated method. + This happens at class creation time, and since we don't have the decorated class available here, we temporarily store the output types as an attribute of the decorated method. The ComponentMeta metaclass will use this data to create @@ -390,9 +391,9 @@ def _component(self, cls, is_greedy: bool = False): def copy_class_namespace(namespace): """ - This is the callback that `typing.new_class` will use - to populate the newly created class. We just copy - the whole namespace from the decorated class. + This is the callback that `typing.new_class` will use to populate the newly created class. + + Simply copy the whole namespace from the decorated class. """ for key, val in dict(cls.__dict__).items(): # __dict__ and __weakref__ are class-bound, we should let Python recreate them. diff --git a/haystack/core/pipeline/draw.py b/haystack/core/pipeline/draw.py index ea920dbc16..c122857a9c 100644 --- a/haystack/core/pipeline/draw.py +++ b/haystack/core/pipeline/draw.py @@ -102,8 +102,8 @@ def _to_mermaid_text(graph: networkx.MultiDiGraph) -> str: """ Converts a Networkx graph into Mermaid syntax. - The output of this function can be used in the documentation with `mermaid` codeblocks, and it will - be automatically rendered. + The output of this function can be used in the documentation with `mermaid` codeblocks and will be + automatically rendered. """ # Copy the graph to avoid modifying the original graph = _prepare_for_drawing(graph.copy()) diff --git a/haystack/core/pipeline/pipeline.py b/haystack/core/pipeline/pipeline.py index 3a6b664a02..b729148180 100644 --- a/haystack/core/pipeline/pipeline.py +++ b/haystack/core/pipeline/pipeline.py @@ -241,10 +241,11 @@ def load( callbacks: Optional[DeserializationCallbacks] = None, ) -> "Pipeline": """ - Creates a `Pipeline` object from the string representation read from the file-like object passed in the `fp` argument. + Creates a `Pipeline` object from a string representation. + + The string representation is read from the file-like object passed in the `fp` argument. 
+ - :param data: - The string representation of the pipeline, can be `str`, `bytes` or `bytearray`. :param fp: A file-like object ready to be read from. :param marshaller: @@ -312,7 +313,7 @@ def connect(self, sender: str, receiver: str) -> "Pipeline": Connects two components together. All components to connect must exist in the pipeline. - If connecting to an component that has several output connections, specify the inputs and output names as + If connecting to a component that has several output connections, specify the inputs and output names as 'component_name.connections_name'. :param sender: @@ -598,9 +599,9 @@ def warm_up(self): def _validate_input(self, data: Dict[str, Any]): """ - Validates input data for the pipeline. + Validates pipeline input data. - Validates that: + Validates that data: * Each Component name actually exists in the Pipeline * Each Component is not missing any input * Each Component has only one input per input socket, if not variadic @@ -1049,7 +1050,10 @@ def run(self, word: str): def _prepare_component_input_data(self, data: Dict[str, Any]) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Any]]: """ - Organizes input data for pipeline components and identifies any inputs that are not matched to any component's input slots. + Prepares input data for pipeline components. + + Organizes input data for pipeline components and identifies any inputs that are not matched to any + component's input slots. This method processes a flat dictionary of input data, where each key-value pair represents an input name and its corresponding value. It distributes these inputs to the appropriate pipeline components based on diff --git a/haystack/core/pipeline/template.py b/haystack/core/pipeline/template.py index 49141eb728..f5d71f9373 100644 --- a/haystack/core/pipeline/template.py +++ b/haystack/core/pipeline/template.py @@ -22,9 +22,10 @@ class PredefinedPipeline(Enum): class PipelineTemplate: """ - The PipelineTemplate class enables the straightforward creation of flexible and configurable pipelines using Jinja2 templated YAML files. + The PipelineTemplate enables the creation of flexible and configurable pipelines. - Specifically designed to simplify the setup of complex data processing pipelines for + The PipelineTemplate class enables the straightforward creation of flexible and configurable pipelines using + Jinja2 templated YAML files. Specifically designed to simplify the setup of complex data processing pipelines for a range of NLP tasks—including question answering, retriever augmented generation (RAG), document indexing, among others - PipelineTemplate empowers users to dynamically generate pipeline configurations from templates and customize components as necessary. Its design philosophy centers on providing an accessible, yet powerful, tool @@ -63,9 +64,9 @@ def __init__(self, template_content: str): """ Initialize a PipelineTemplate. - Besides calling the constructor directly, a set of utility methods is provided - for conveniently create an instance of `PipelineTemplate` from different sources. See `from_string`, - `from_file`, `from_predefined` and `from_url`. + Besides calling the constructor directly, a set of utility methods is provided to conveniently create an + instance of `PipelineTemplate` from different sources. See `from_string`, `from_file`, `from_predefined` + and `from_url`. :param template_content: The raw template source to use in the template. 
""" @@ -106,7 +107,9 @@ def from_file(cls, file_path: Union[Path, str]) -> "PipelineTemplate": @classmethod def from_predefined(cls, predefined_pipeline: PredefinedPipeline) -> "PipelineTemplate": """ - Create a PipelineTemplate from a predefined template. See `PredefinedPipeline` for available options. + Create a PipelineTemplate from a predefined template. + + See `PredefinedPipeline` for available options. :param predefined_pipeline: The predefined pipeline to use. :returns: An instance of `PipelineTemplate `. diff --git a/haystack/core/serialization.py b/haystack/core/serialization.py index df521564e8..52e9efa793 100644 --- a/haystack/core/serialization.py +++ b/haystack/core/serialization.py @@ -123,8 +123,7 @@ def default_to_dict(obj: Any, **init_parameters) -> Dict[str, Any]: """ Utility function to serialize an object to a dictionary. - This is mostly necessary for Components, but it can be used by any object. - + This is mostly necessary for components but can be used by any object. `init_parameters` are parameters passed to the object class `__init__`. They must be defined explicitly as they'll be used when creating a new instance of `obj` with `from_dict`. Omitting them might cause deserialisation @@ -165,7 +164,7 @@ def default_from_dict(cls: Type[object], data: Dict[str, Any]) -> Any: """ Utility function to deserialize a dictionary to an object. - This is mostly necessary for Components but, it can be used by any object. + This is mostly necessary for components but can be used by any object. The function will raise a `DeserializationError` if the `type` field in `data` is missing or it doesn't match the type of `cls`. diff --git a/haystack/dataclasses/sparse_embedding.py b/haystack/dataclasses/sparse_embedding.py index 191f98dbcf..5fbfc2bbd9 100644 --- a/haystack/dataclasses/sparse_embedding.py +++ b/haystack/dataclasses/sparse_embedding.py @@ -8,7 +8,7 @@ class SparseEmbedding: def __init__(self, indices: List[int], values: List[float]): """ - Initialize a sparse embedding. + Initialize a SparseEmbedding object. :param indices: List of indices of non-zero elements in the embedding. :param values: List of values of non-zero elements in the embedding. @@ -22,7 +22,7 @@ def __init__(self, indices: List[int], values: List[float]): def to_dict(self): """ - Convert the sparse embedding to a dictionary. + Convert the SparseEmbedding object to a dictionary. :returns: Serialized sparse embedding. @@ -32,7 +32,7 @@ def to_dict(self): @classmethod def from_dict(cls, sparse_embedding_dict): """ - Deserializes the sparse embedding from a dictionary. + Deserializes the sparse embedding from a dictionary. :param sparse_embedding_dict: Dictionary to deserialize from. 
diff --git a/test/components/generators/chat/test_hugging_face_api.py b/test/components/generators/chat/test_hugging_face_api.py index 0eb48e9bd5..df2b33618b 100644 --- a/test/components/generators/chat/test_hugging_face_api.py +++ b/test/components/generators/chat/test_hugging_face_api.py @@ -126,7 +126,7 @@ def test_init_tgi_no_url(self): def test_to_dict(self, mock_check_valid_model): generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "mistralai/Mistral-7B-v0.1"}, + api_params={"model": "HuggingFaceH4/zephyr-7b-beta"}, token=Secret.from_env_var("ENV_VAR", strict=False), generation_kwargs={"temperature": 0.6}, stop_words=["stop", "words"], @@ -136,14 +136,14 @@ def test_to_dict(self, mock_check_valid_model): init_params = result["init_parameters"] assert init_params["api_type"] == HFGenerationAPIType.SERVERLESS_INFERENCE_API - assert init_params["api_params"] == {"model": "mistralai/Mistral-7B-v0.1"} + assert init_params["api_params"] == {"model": "HuggingFaceH4/zephyr-7b-beta"} assert init_params["token"] == {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"} assert init_params["generation_kwargs"] == {"temperature": 0.6, "stop": ["stop", "words"], "max_tokens": 512} def test_from_dict(self, mock_check_valid_model): generator = HuggingFaceAPIChatGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "mistralai/Mistral-7B-v0.1"}, + api_params={"model": "HuggingFaceH4/zephyr-7b-beta"}, token=Secret.from_env_var("ENV_VAR", strict=False), generation_kwargs={"temperature": 0.6}, stop_words=["stop", "words"], @@ -154,7 +154,7 @@ def test_from_dict(self, mock_check_valid_model): # now deserialize, call from_dict generator_2 = HuggingFaceAPIChatGenerator.from_dict(result) assert generator_2.api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API - assert generator_2.api_params == {"model": "mistralai/Mistral-7B-v0.1"} + assert generator_2.api_params == {"model": "HuggingFaceH4/zephyr-7b-beta"} assert generator_2.token == Secret.from_env_var("ENV_VAR", strict=False) assert generator_2.generation_kwargs == {"temperature": 0.6, "stop": ["stop", "words"], "max_tokens": 512} assert generator_2.streaming_callback is streaming_callback_handler @@ -225,7 +225,6 @@ def mock_iter(self): # Generate text response with streaming callback response = generator.run(chat_messages) - print(response) # check kwargs passed to text_generation _, kwargs = mock_chat_completion.call_args diff --git a/test/components/generators/test_hugging_face_api.py b/test/components/generators/test_hugging_face_api.py index 93d69585c7..8786e7f536 100644 --- a/test/components/generators/test_hugging_face_api.py +++ b/test/components/generators/test_hugging_face_api.py @@ -118,7 +118,7 @@ def test_init_tgi_no_url(self): def test_to_dict(self, mock_check_valid_model): generator = HuggingFaceAPIGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "mistralai/Mistral-7B-v0.1"}, + api_params={"model": "HuggingFaceH4/zephyr-7b-beta"}, token=Secret.from_env_var("ENV_VAR", strict=False), generation_kwargs={"temperature": 0.6}, stop_words=["stop", "words"], @@ -128,7 +128,7 @@ def test_to_dict(self, mock_check_valid_model): init_params = result["init_parameters"] assert init_params["api_type"] == HFGenerationAPIType.SERVERLESS_INFERENCE_API - assert init_params["api_params"] == {"model": "mistralai/Mistral-7B-v0.1"} + assert init_params["api_params"] == {"model": "HuggingFaceH4/zephyr-7b-beta"} 
assert init_params["token"] == {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"} assert init_params["generation_kwargs"] == { "temperature": 0.6, @@ -139,7 +139,7 @@ def test_to_dict(self, mock_check_valid_model): def test_from_dict(self, mock_check_valid_model): generator = HuggingFaceAPIGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "mistralai/Mistral-7B-v0.1"}, + api_params={"model": "HuggingFaceH4/zephyr-7b-beta"}, token=Secret.from_env_var("ENV_VAR", strict=False), generation_kwargs={"temperature": 0.6}, stop_words=["stop", "words"], @@ -150,7 +150,7 @@ def test_from_dict(self, mock_check_valid_model): # now deserialize, call from_dict generator_2 = HuggingFaceAPIGenerator.from_dict(result) assert generator_2.api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API - assert generator_2.api_params == {"model": "mistralai/Mistral-7B-v0.1"} + assert generator_2.api_params == {"model": "HuggingFaceH4/zephyr-7b-beta"} assert generator_2.token == Secret.from_env_var("ENV_VAR", strict=False) assert generator_2.generation_kwargs == { "temperature": 0.6, @@ -164,7 +164,7 @@ def test_generate_text_response_with_valid_prompt_and_generation_parameters( ): generator = HuggingFaceAPIGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "mistralai/Mistral-7B-v0.1"}, + api_params={"model": "HuggingFaceH4/zephyr-7b-beta"}, token=Secret.from_env_var("ENV_VAR", strict=False), generation_kwargs={"temperature": 0.6}, stop_words=["stop", "words"], @@ -194,7 +194,7 @@ def test_generate_text_response_with_valid_prompt_and_generation_parameters( def test_generate_text_with_custom_generation_parameters(self, mock_check_valid_model, mock_text_generation): generator = HuggingFaceAPIGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "mistralai/Mistral-7B-v0.1"} + api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "HuggingFaceH4/zephyr-7b-beta"} ) generation_kwargs = {"temperature": 0.8, "max_new_tokens": 100} @@ -217,9 +217,7 @@ def test_generate_text_with_custom_generation_parameters(self, mock_check_valid_ assert len(response["meta"]) > 0 assert [isinstance(reply, str) for reply in response["replies"]] - def test_generate_text_with_streaming_callback( - self, mock_check_valid_model, mock_auto_tokenizer, mock_text_generation - ): + def test_generate_text_with_streaming_callback(self, mock_check_valid_model, mock_text_generation): streaming_call_count = 0 # Define the streaming callback function @@ -228,10 +226,9 @@ def streaming_callback_fn(chunk: StreamingChunk): streaming_call_count += 1 assert isinstance(chunk, StreamingChunk) - # Create an instance of HuggingFaceRemoteGenerator generator = HuggingFaceAPIGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "mistralai/Mistral-7B-v0.1"}, + api_params={"model": "HuggingFaceH4/zephyr-7b-beta"}, streaming_callback=streaming_callback_fn, ) @@ -282,12 +279,11 @@ def mock_iter(self): def test_run_serverless(self): generator = HuggingFaceAPIGenerator( api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "mistralai/Mistral-7B-v0.1"}, + api_params={"model": "HuggingFaceH4/zephyr-7b-beta"}, generation_kwargs={"max_new_tokens": 20}, ) response = generator.run("How are you?") - # Assert that the response contains the generated replies assert "replies" in response assert isinstance(response["replies"], list)