Skip to content

Commit

Permalink
feat!: Use Secret for passing authentication secrets to components (#…
Browse files Browse the repository at this point in the history
…6887)

* feat!: Use `Secret` for passing authentication secrets to components

* Add comment to clarify type ignore
  • Loading branch information
shadeMe authored Feb 5, 2024
1 parent 393a799 commit 27d1af3
Show file tree
Hide file tree
Showing 52 changed files with 707 additions and 421 deletions.
8 changes: 6 additions & 2 deletions haystack/components/audio/whisper_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from haystack import Document, component, default_from_dict, default_to_dict
from haystack.dataclasses import ByteStream
from haystack.utils import Secret, deserialize_secrets_inplace

logger = logging.getLogger(__name__)

Expand All @@ -24,7 +25,7 @@ class RemoteWhisperTranscriber:

def __init__(
self,
api_key: Optional[str] = None,
api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
model: str = "whisper-1",
api_base_url: Optional[str] = None,
organization: Optional[str] = None,
Expand Down Expand Up @@ -61,6 +62,7 @@ def __init__(
self.organization = organization
self.model = model
self.api_base_url = api_base_url
self.api_key = api_key

# Only response_format = "json" is supported
whisper_params = kwargs
Expand All @@ -71,7 +73,7 @@ def __init__(
)
whisper_params["response_format"] = "json"
self.whisper_params = whisper_params
self.client = OpenAI(api_key=api_key, organization=organization, base_url=api_base_url)
self.client = OpenAI(api_key=api_key.resolve_value(), organization=organization, base_url=api_base_url)

def to_dict(self) -> Dict[str, Any]:
"""
Expand All @@ -81,6 +83,7 @@ def to_dict(self) -> Dict[str, Any]:
"""
return default_to_dict(
self,
api_key=self.api_key.to_dict(),
model=self.model,
organization=self.organization,
api_base_url=self.api_base_url,
Expand All @@ -92,6 +95,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "RemoteWhisperTranscriber":
"""
Deserialize this component from a dictionary.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
return default_from_dict(cls, data)

@component.output_types(documents=List[Document])
Expand Down
3 changes: 2 additions & 1 deletion haystack/components/builders/dynamic_chat_prompt_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,11 @@ class DynamicChatPromptBuilder:
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.dataclasses import ChatMessage
from haystack import Pipeline
from haystack.utils import Secret
# no parameter init, we don't use any runtime template variables
prompt_builder = DynamicChatPromptBuilder()
llm = OpenAIChatGenerator(api_key="<your-api-key>", model="gpt-3.5-turbo")
llm = OpenAIChatGenerator(api_key=Secret.from_token("<your-api-key>"), model="gpt-3.5-turbo")
pipe = Pipeline()
pipe.add_component("prompt_builder", prompt_builder)
Expand Down
3 changes: 2 additions & 1 deletion haystack/components/builders/dynamic_prompt_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@ class DynamicPromptBuilder:
from haystack.components.builders import DynamicPromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack import Pipeline, component, Document
from haystack.utils import Secret
prompt_builder = DynamicPromptBuilder(runtime_variables=["documents"])
llm = OpenAIGenerator(api_key="<your-api-key>", model="gpt-3.5-turbo")
llm = OpenAIGenerator(api_key=Secret.from_token("<your-api-key>"), model="gpt-3.5-turbo")
@component
Expand Down
39 changes: 19 additions & 20 deletions haystack/components/converters/azure.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from pathlib import Path
from typing import List, Union, Dict, Any, Optional
import logging
import os

from haystack.lazy_imports import LazyImport
from haystack import component, Document, default_to_dict
from haystack import component, Document, default_to_dict, default_from_dict
from haystack.dataclasses import ByteStream
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.utils import Secret, deserialize_secrets_inplace

logger = logging.getLogger(__name__)

Expand All @@ -29,42 +29,33 @@ class AzureOCRDocumentConverter:
Usage example:
```python
from haystack.components.converters.azure import AzureOCRDocumentConverter
from haystack.utils import Secret
converter = AzureOCRDocumentConverter()
converter = AzureOCRDocumentConverter(endpoint="<url>", api_key=Secret.from_token("<your-api-key>"))
results = converter.run(sources=["image-based-document.pdf"], meta={"date_added": datetime.now().isoformat()})
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the PDF file.'
```
"""

def __init__(self, endpoint: str, api_key: Optional[str] = None, model_id: str = "prebuilt-read"):
def __init__(
self, endpoint: str, api_key: Secret = Secret.from_env_var("AZURE_AI_API_KEY"), model_id: str = "prebuilt-read"
):
"""
Create an AzureOCRDocumentConverter component.
:param endpoint: The endpoint of your Azure resource.
:param api_key: The key of your Azure resource. It can be
explicitly provided or automatically read from the
environment variable AZURE_AI_API_KEY (recommended).
:param api_key: The key of your Azure resource.
:param model_id: The model ID of the model you want to use. Please refer to [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/choose-model-feature)
for a list of available models. Default: `"prebuilt-read"`.
"""
azure_import.check()

api_key = api_key or os.environ.get("AZURE_AI_API_KEY")
# we check whether api_key is None or an empty string
if not api_key:
msg = (
"AzureOCRDocumentConverter expects an API key. "
"Set the AZURE_AI_API_KEY environment variable (recommended) or pass it explicitly."
)
raise ValueError(msg)

self.document_analysis_client = DocumentAnalysisClient(
endpoint=endpoint, credential=AzureKeyCredential(api_key)
)
self.document_analysis_client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(api_key.resolve_value())) # type: ignore
self.endpoint = endpoint
self.model_id = model_id
self.api_key = api_key

@component.output_types(documents=List[Document], raw_azure_response=List[Dict])
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
Expand Down Expand Up @@ -116,7 +107,15 @@ def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
"""
return default_to_dict(self, endpoint=self.endpoint, model_id=self.model_id)
return default_to_dict(self, api_key=self.api_key.to_dict(), endpoint=self.endpoint, model_id=self.model_id)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "AzureOCRDocumentConverter":
    """
    Build an AzureOCRDocumentConverter from its serialized dictionary form.

    :param data: Dictionary holding the serialized component. It must contain an
        `init_parameters` entry; the `api_key` value inside it is run through
        `deserialize_secrets_inplace`, so `data` is modified by this call.
    :returns: The result of `default_from_dict` for this class and `data`.
    :raises KeyError: If `data` has no `init_parameters` key.
    """
    init_parameters = data["init_parameters"]
    # Restore the secret first; default_from_dict is then called on the same `data`.
    deserialize_secrets_inplace(init_parameters, keys=["api_key"])
    return default_from_dict(cls, data)

@staticmethod
def _convert_azure_result_to_document(result: "AnalyzeResult", file_suffix: Optional[str] = None) -> Document:
Expand Down
33 changes: 19 additions & 14 deletions haystack/components/embedders/azure_document_embedder.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import os
from typing import List, Optional, Dict, Any, Tuple

from openai.lib.azure import AzureADTokenProvider, AzureOpenAI
from openai.lib.azure import AzureOpenAI
from tqdm import tqdm

from haystack import component, Document, default_to_dict
from haystack import component, Document, default_to_dict, default_from_dict
from haystack.utils import Secret, deserialize_secrets_inplace


@component
Expand Down Expand Up @@ -34,9 +35,8 @@ def __init__(
azure_endpoint: Optional[str] = None,
api_version: Optional[str] = "2023-05-15",
azure_deployment: str = "text-embedding-ada-002",
api_key: Optional[str] = None,
azure_ad_token: Optional[str] = None,
azure_ad_token_provider: Optional[AzureADTokenProvider] = None,
api_key: Optional[Secret] = Secret.from_env_var("AZURE_OPENAI_API_KEY", strict=False),
azure_ad_token: Optional[Secret] = Secret.from_env_var("AZURE_OPENAI_AD_TOKEN", strict=False),
organization: Optional[str] = None,
prefix: str = "",
suffix: str = "",
Expand All @@ -53,8 +53,6 @@ def __init__(
:param azure_deployment: The deployment of the model, usually the model name.
:param api_key: The API key to use for authentication.
:param azure_ad_token: Azure Active Directory token, see https://www.microsoft.com/en-us/security/business/identity-access/microsoft-entra-id
:param azure_ad_token_provider: A function that returns an Azure Active Directory token, will be invoked
on every request.
:param organization: The Organization ID, defaults to `None`. See
[production best practices](https://platform.openai.com/docs/guides/production-best-practices/setting-up-your-organization).
:param prefix: A string to add to the beginning of each text.
Expand All @@ -70,6 +68,11 @@ def __init__(
if not azure_endpoint:
raise ValueError("Please provide an Azure endpoint or set the environment variable AZURE_OPENAI_ENDPOINT.")

if api_key is None and azure_ad_token is None:
raise ValueError("Please provide an API key or an Azure Active Directory token.")

self.api_key = api_key
self.azure_ad_token = azure_ad_token
self.api_version = api_version
self.azure_endpoint = azure_endpoint
self.azure_deployment = azure_deployment
Expand All @@ -85,9 +88,8 @@ def __init__(
api_version=api_version,
azure_endpoint=azure_endpoint,
azure_deployment=azure_deployment,
api_key=api_key,
azure_ad_token=azure_ad_token,
azure_ad_token_provider=azure_ad_token_provider,
api_key=api_key.resolve_value() if api_key is not None else None,
azure_ad_token=azure_ad_token.resolve_value() if azure_ad_token is not None else None,
organization=organization,
)

Expand All @@ -98,10 +100,6 @@ def _get_telemetry_data(self) -> Dict[str, Any]:
return {"model": self.azure_deployment}

def to_dict(self) -> Dict[str, Any]:
"""
This method overrides the default serializer in order to avoid leaking the `api_key` value passed
to the constructor.
"""
return default_to_dict(
self,
azure_endpoint=self.azure_endpoint,
Expand All @@ -114,8 +112,15 @@ def to_dict(self) -> Dict[str, Any]:
progress_bar=self.progress_bar,
meta_fields_to_embed=self.meta_fields_to_embed,
embedding_separator=self.embedding_separator,
api_key=self.api_key.to_dict() if self.api_key is not None else None,
azure_ad_token=self.azure_ad_token.to_dict() if self.azure_ad_token is not None else None,
)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "AzureOpenAIDocumentEmbedder":
    """
    Build an AzureOpenAIDocumentEmbedder from its serialized dictionary form.

    :param data: Dictionary holding the serialized component. It must contain an
        `init_parameters` entry; its `api_key` and `azure_ad_token` values are run
        through `deserialize_secrets_inplace`, so `data` is modified by this call.
    :returns: The result of `default_from_dict` for this class and `data`.
    :raises KeyError: If `data` has no `init_parameters` key.
    """
    init_parameters = data["init_parameters"]
    # Both credentials are handled in one pass before default_from_dict sees `data`.
    deserialize_secrets_inplace(init_parameters, keys=["api_key", "azure_ad_token"])
    return default_from_dict(cls, data)

def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]:
"""
Prepare the texts to embed by concatenating the Document text with the metadata fields to embed.
Expand Down
29 changes: 19 additions & 10 deletions haystack/components/embedders/azure_text_embedder.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import os
from typing import List, Optional, Dict, Any

from openai.lib.azure import AzureADTokenProvider, AzureOpenAI
from openai.lib.azure import AzureOpenAI

from haystack import component, default_to_dict, Document
from haystack import component, Document, default_to_dict, default_from_dict
from haystack.utils import Secret, deserialize_secrets_inplace


@component
Expand Down Expand Up @@ -32,9 +33,8 @@ def __init__(
azure_endpoint: Optional[str] = None,
api_version: Optional[str] = "2023-05-15",
azure_deployment: str = "text-embedding-ada-002",
api_key: Optional[str] = None,
azure_ad_token: Optional[str] = None,
azure_ad_token_provider: Optional[AzureADTokenProvider] = None,
api_key: Optional[Secret] = Secret.from_env_var("AZURE_OPENAI_API_KEY", strict=False),
azure_ad_token: Optional[Secret] = Secret.from_env_var("AZURE_OPENAI_AD_TOKEN", strict=False),
organization: Optional[str] = None,
prefix: str = "",
suffix: str = "",
Expand All @@ -47,8 +47,6 @@ def __init__(
:param azure_deployment: The deployment of the model, usually the model name.
:param api_key: The API key to use for authentication.
:param azure_ad_token: Azure Active Directory token, see https://www.microsoft.com/en-us/security/business/identity-access/microsoft-entra-id
:param azure_ad_token_provider: A function that returns an Azure Active Directory token, will be invoked
on every request.
:param organization: The Organization ID, defaults to `None`. See
[production best practices](https://platform.openai.com/docs/guides/production-best-practices/setting-up-your-organization).
:param prefix: A string to add to the beginning of each text.
Expand All @@ -62,6 +60,11 @@ def __init__(
if not azure_endpoint:
raise ValueError("Please provide an Azure endpoint or set the environment variable AZURE_OPENAI_ENDPOINT.")

if api_key is None and azure_ad_token is None:
raise ValueError("Please provide an API key or an Azure Active Directory token.")

self.api_key = api_key
self.azure_ad_token = azure_ad_token
self.api_version = api_version
self.azure_endpoint = azure_endpoint
self.azure_deployment = azure_deployment
Expand All @@ -73,9 +76,8 @@ def __init__(
api_version=api_version,
azure_endpoint=azure_endpoint,
azure_deployment=azure_deployment,
api_key=api_key,
azure_ad_token=azure_ad_token,
azure_ad_token_provider=azure_ad_token_provider,
api_key=api_key.resolve_value() if api_key is not None else None,
azure_ad_token=azure_ad_token.resolve_value() if azure_ad_token is not None else None,
organization=organization,
)

Expand All @@ -98,8 +100,15 @@ def to_dict(self) -> Dict[str, Any]:
api_version=self.api_version,
prefix=self.prefix,
suffix=self.suffix,
api_key=self.api_key.to_dict() if self.api_key is not None else None,
azure_ad_token=self.azure_ad_token.to_dict() if self.azure_ad_token is not None else None,
)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "AzureOpenAITextEmbedder":
    """
    Build an AzureOpenAITextEmbedder from its serialized dictionary form.

    :param data: Dictionary holding the serialized component. It must contain an
        `init_parameters` entry; its `api_key` and `azure_ad_token` values are run
        through `deserialize_secrets_inplace`, so `data` is modified by this call.
    :returns: The result of `default_from_dict` for this class and `data`.
    :raises KeyError: If `data` has no `init_parameters` key.
    """
    init_parameters = data["init_parameters"]
    # Both credentials are handled in one pass before default_from_dict sees `data`.
    deserialize_secrets_inplace(init_parameters, keys=["api_key", "azure_ad_token"])
    return default_from_dict(cls, data)

@component.output_types(embedding=List[float], meta=Dict[str, Any])
def run(self, text: str):
"""Embed a string using AzureOpenAITextEmbedder."""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import List, Optional, Union, Dict
from typing import List, Optional, Dict

from haystack.lazy_imports import LazyImport
from haystack.utils.auth import Secret

with LazyImport(message="Run 'pip install \"sentence-transformers>=2.2.0\"'") as sentence_transformers_import:
from sentence_transformers import SentenceTransformer
Expand All @@ -14,14 +15,12 @@ class _SentenceTransformersEmbeddingBackendFactory:
_instances: Dict[str, "_SentenceTransformersEmbeddingBackend"] = {}

@staticmethod
def get_embedding_backend(model: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None):
embedding_backend_id = f"{model}{device}{use_auth_token}"
def get_embedding_backend(model: str, device: Optional[str] = None, auth_token: Optional[Secret] = None):
embedding_backend_id = f"{model}{device}{auth_token}"

if embedding_backend_id in _SentenceTransformersEmbeddingBackendFactory._instances:
return _SentenceTransformersEmbeddingBackendFactory._instances[embedding_backend_id]
embedding_backend = _SentenceTransformersEmbeddingBackend(
model=model, device=device, use_auth_token=use_auth_token
)
embedding_backend = _SentenceTransformersEmbeddingBackend(model=model, device=device, auth_token=auth_token)
_SentenceTransformersEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend
return embedding_backend

Expand All @@ -31,9 +30,11 @@ class _SentenceTransformersEmbeddingBackend:
Class to manage Sentence Transformers embeddings.
"""

def __init__(self, model: str, device: Optional[str] = None, use_auth_token: Union[bool, str, None] = None):
def __init__(self, model: str, device: Optional[str] = None, auth_token: Optional[Secret] = None):
sentence_transformers_import.check()
self.model = SentenceTransformer(model_name_or_path=model, device=device, use_auth_token=use_auth_token)
self.model = SentenceTransformer(
model_name_or_path=model, device=device, use_auth_token=auth_token.resolve_value() if auth_token else None
)

def embed(self, data: List[str], **kwargs) -> List[List[float]]:
embeddings = self.model.encode(data, **kwargs).tolist()
Expand Down
7 changes: 4 additions & 3 deletions haystack/components/embedders/hf_utils.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,27 @@
from typing import Optional

from haystack.lazy_imports import LazyImport
from haystack.utils.auth import Secret

with LazyImport(message="Run 'pip install transformers'") as transformers_import:
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError


def check_valid_model(model_id: str, token: Optional[str]) -> None:
def check_valid_model(model_id: str, token: Optional[Secret]) -> None:
"""
Check if the provided model ID corresponds to a valid model on HuggingFace Hub.
Also check if the model is an embedding model.
:param model_id: A string representing the HuggingFace model ID.
:param token: An optional string representing the authentication token.
:param token: The optional authentication token.
:raises ValueError: If the model is not found or is not an embedding model.
"""
transformers_import.check()

api = HfApi()
try:
model_info = api.model_info(model_id, token=token)
model_info = api.model_info(model_id, token=token.resolve_value() if token else None)
except RepositoryNotFoundError as e:
raise ValueError(
f"Model {model_id} not found on HuggingFace Hub. Please provide a valid HuggingFace model_id."
Expand Down
Loading

0 comments on commit 27d1af3

Please sign in to comment.